diff --git a/scraper.py b/scraper.py
index ef01fc5..cb18cb6 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,5 +1,6 @@
 import requests
 import re
+import json
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
 from concurrent.futures import ThreadPoolExecutor
@@ -53,30 +54,82 @@ def process_record(url):
     soup = BeautifulSoup(html.content, "html.parser")
 
+def get_oddiel(soup):
+    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
+    return {"value": oddiel}
+
+
+def get_vlozka(soup):
+    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
+    return {"value": vlozka}
+
+
+def get_obchodneMeno(soup):
+    data = {}
+
+    # find the table element of "Obchodné meno:"
+    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
+
+    # parse the name and date
+    active = meno_tr.find_all("span", class_="ra")
+    active = [x.text.strip() for x in active]
+    if len(active) == 0:
+        # three assignment targets need three values; two would raise ValueError
+        value, valid_from, valid_until = "", "", ""
+    else:
+        value, valid = active[0], active[1]
+        valid_from, valid_until = parse_oddo(valid)
+    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+
+    # check for older entries
+    old = meno_tr.find_all("span", class_="ro")
+    old = [x.text.strip() for x in old]
+    if len(old) == 0:
+        old_values = []
+    else:
+        old_values = [{"value": y[0], "valid_from": y[1][0], "valid_until": y[1][1]} for y in zip(old[::2], list(map(parse_oddo,old[1::2])))]
+    data.update({"old_values": old_values})
+    return data
+
+
+def parse_oddo(text):
+    """
+    Parses the valid_from and valid_until from string
+    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"
+    :return: returns a tuple (valid_from, valid_until)
+    """
+    valid_from, valid_until = "", ""
+
+    if (start_from := text.find("od: ")) > -1:
+        valid_from = text[start_from+4:start_from+14]
+    if (start_until := text.find("do: ")) > -1:
+        valid_until = text[start_until+4:start_until+14]
+
+    return valid_from, valid_until
+
 
 def test():
-    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
+    url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
     html = requests.get(url)
     soup = BeautifulSoup(html.content, "html.parser")
-    '''
     record = {
-        "oddiel": soup.find("span", string=re.compile("Oddiel:")),
-        "vlozka": pass,
-        "obchodneMeno": pass,
-        "sidlo": pass,
-        "ico": pass,
-        "denZapisu": pass,
-        "pravnaForma": pass,
-        "predmetyCinnosti": pass,
-        "spolocnici": pass,
-        "vyskaVkladov": pass,
-        "statutarnyOrgan": pass,
-        "konanie": pass,
-        "zakladneImanie": pass,
-        "aktualizaciaUdajov": pass,
-        "vypisUdajov": pass
+        "oddiel": get_oddiel(soup),
+        "vlozka": get_vlozka(soup),
+        "obchodneMeno": get_obchodneMeno(soup),
+        "sidlo": "",
+        "ico": "",
+        "denZapisu": "",
+        "pravnaForma": "",
+        "predmetyCinnosti": "",
+        "spolocnici": "",
+        "vyskaVkladov": "",
+        "statutarnyOrgan": "",
+        "konanie": "",
+        "zakladneImanie": "",
+        "aktualizaciaUdajov": "",
+        "vypisUdajov": ""
     }
-    '''
+    print(json.dumps(record,indent=4))
     collection = connect_db()
     #collection.bulk_write(soup)
     disconnect_db(collection)
 