# ORSR scraper: fetches Slovak business-register records from orsr.sk and stores them in MongoDB.
import requests
|
|
import re
|
|
import json
|
|
from bs4 import BeautifulSoup
|
|
from tqdm.auto import tqdm
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
from app.config import settings
|
|
from app.db import connect_db, disconnect_db
|
|
from time import sleep
|
|
|
|
|
|
def scrape_orsr():
    """
    Scrape all "Aktuálny" record links from the ORSR index page configured in
    ``settings["orsr_url"]`` and process them in parallel worker threads.

    The record URLs are split round-robin into ``settings["threads"]`` parts;
    each part is handed to :func:`process_records`, which parses the records
    and stores them in MongoDB.
    """
    # get all links to "Aktuálny" from the orsr url
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    links = [record["href"] for record in soup.find_all("a", string="Aktuálny")]

    # BUG FIX: the original built `worker_ids = range(1, len(records)+1)` and
    # distributed those 1-based *indices* to the workers, but process_records
    # expects the record URLs themselves. Split the URLs round-robin instead.
    n_threads = settings["threads"]
    parts = [links[i::n_threads] for i in range(n_threads)]

    # Cap the pool at the configured thread count so the setting is honoured
    # (the original used the executor's default worker count).
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        for thread_id, part in enumerate(parts, start=1):
            pool.submit(process_records, part, thread_id)
|
|
|
|
|
|
def process_records(records, thread):
    """
    Worker that scrapes a batch of record URLs and bulk-writes the results.

    :param records: list of urls of records to process
    :param thread: thread id of the processing thread (used only to label
        the progress bar)
    """
    # Iterate the URLs directly instead of `range(len(records))`.
    data = [process_record(url) for url in tqdm(records, desc=f"thread {thread}")]

    collection = connect_db()
    try:
        # Skip the write for an empty batch — pymongo's bulk_write raises
        # on an empty operation list.
        if data:
            # NOTE(review): pymongo's Collection.bulk_write expects operation
            # objects (e.g. InsertOne(doc)), not plain dicts — confirm what
            # connect_db returns and what process_record produces.
            collection.bulk_write(data)
    finally:
        # Always release the DB connection, even if the write fails.
        disconnect_db(collection)
|
|
|
|
|
|
def process_record(url):
    """
    Process one record: scrape the page at *url* and parse it into a dict.

    :param url: url of the record
    :return: dictionary of parameters (fields without a parser yet are
        returned as empty strings)
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    # BUG FIX: the original scraped the page but returned nothing, although
    # its docstring promises a dictionary (and process_records bulk-writes
    # the return values). Assemble the same record structure test() builds.
    return {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_obchodneMeno(soup),
        "sidlo": "",
        "ico": "",
        "denZapisu": "",
        "pravnaForma": "",
        "predmetyCinnosti": "",
        "spolocnici": "",
        "vyskaVkladov": "",
        "statutarnyOrgan": "",
        "konanie": "",
        "zakladneImanie": "",
        "aktualizaciaUdajov": "",
        "vypisUdajov": ""
    }
|
|
|
|
|
|
def get_oddiel(soup):
    """Extract the registry section ("Oddiel") from a record page.

    :param soup: BeautifulSoup of a record page
    :return: dict of the form ``{"value": <section text>}``
    """
    # Locate the "Oddiel:" label, then read the value span in the same row.
    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))
    value_span = label.parent.find("span", class_="ra")
    return {"value": value_span.text.strip()}
|
|
|
|
|
|
def get_vlozka(soup):
    """Extract the registry insert number ("Vložka") from a record page.

    :param soup: BeautifulSoup of a record page
    :return: dict of the form ``{"value": <insert-number text>}``
    """
    # Locate the "Vložka" label, then read the value span in the same row.
    label = soup.find("span", class_="tl", string=re.compile("Vložka"))
    value_span = label.parent.find("span", class_="ra")
    return {"value": value_span.text.strip()}
|
|
|
|
|
|
def get_obchodneMeno(soup):
    """
    Parse the "Obchodné meno" (business name) table row of a record page.

    :param soup: BeautifulSoup of a record page
    :return: dict with the active name ("value", "valid_from", "valid_until")
        plus a list of historical names under "old_values"
    """
    data = {}

    # find the table <tr> element of "Obchodné meno:"
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent

    # parse the active name and its validity dates ("ra" spans)
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if not active:
        # BUG FIX: the original unpacked two values into three names here
        # (`value, valid_from, valid_until = "", ""`), raising ValueError
        # whenever no active entry exists.
        value, valid_from, valid_until = "", "", ""
    else:
        value = active[0]
        # Guard against a name that has no accompanying date span.
        valid_from, valid_until = parse_oddo(active[1]) if len(active) > 1 else ("", "")
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})

    # check for older entries ("ro" spans): names and date ranges alternate
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    old_values = [
        {"value": name, "valid_from": dates[0], "valid_until": dates[1]}
        for name, dates in zip(old[::2], map(parse_oddo, old[1::2]))
    ]
    data.update({"old_values": old_values})
    return data
|
|
|
|
|
|
def parse_oddo(text):
    """
    Parse the valid_from and valid_until dates out of a string.

    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"
    :return: tuple (valid_from, valid_until); either part is "" when its
        marker is absent
    """
    def _date_after(marker):
        # The date starts right after the 4-char marker and is 10 chars long.
        pos = text.find(marker)
        return text[pos + 4:pos + 14] if pos > -1 else ""

    return _date_after("od: "), _date_after("do: ")
|
|
|
|
def test():
    """
    Smoke test: fetch one known ORSR record, parse the implemented fields
    and print the assembled record as JSON. Fields without a parser yet are
    left as empty strings.
    """
    url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")

    # Parsed fields first, then the not-yet-implemented placeholders.
    record = {
        "oddiel": get_oddiel(page),
        "vlozka": get_vlozka(page),
        "obchodneMeno": get_obchodneMeno(page),
    }
    placeholders = (
        "sidlo", "ico", "denZapisu", "pravnaForma", "predmetyCinnosti",
        "spolocnici", "vyskaVkladov", "statutarnyOrgan", "konanie",
        "zakladneImanie", "aktualizaciaUdajov", "vypisUdajov",
    )
    record.update({field: "" for field in placeholders})

    print(json.dumps(record, indent=4))

    collection = connect_db()
    #collection.bulk_write(soup)
    disconnect_db(collection)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the single-record smoke test; the full scrape is disabled for now.
    #scrape_orsr()
    test()
|