From 91fab836bbdc7d54cfd865ff9b42819b86a5e051 Mon Sep 17 00:00:00 2001 From: Oto Imrich Date: Wed, 27 Sep 2023 16:40:59 +0200 Subject: [PATCH] make data extraction more general --- scraper.py | 120 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 18 deletions(-) diff --git a/scraper.py b/scraper.py index cb18cb6..ac0d244 100644 --- a/scraper.py +++ b/scraper.py @@ -68,7 +68,8 @@ def get_obchodneMeno(soup): data = {} # find the table element of "Obchodné meno:" - meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent + #meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent + meno_tr = get_data_td(soup, "Obchodné") # parse the name and date active = meno_tr.find_all("span", class_="ra") @@ -91,6 +92,88 @@ return data +def get_sidlo(soup): + data = {} + + # find the table element of "Sídlo:" + sidlo_tr = get_data_td(soup, "Sídlo") + return data + + +def get_ico(soup): + data = {} + + return data + + +def get_denZapisu(soup): + data = {} + + return data + + +def get_pravnaForma(soup): + data = {} + + return data + + +def get_predmetyCinnosti(soup): + data = {} + + return data + + +def get_spolocnici(soup): + data = {} + + return data + + +def get_vyskaVkladov(soup): + data = {} + + return data + + +def get_statutarnyOrgan(soup): + data = {} + + return data + + +def get_konanie(soup): + data = {} + + return data + + +def get_zakladneImanie(soup): + data = {} + + return data + + +def get_aktualizaciaUdajov(soup): + data = {} + + return data + + +def get_vypisUdajov(soup): + data = {} + + return data + + +def get_data_td(soup, name): + data_td = soup.find("span", class_="tl", string=re.compile(f"{name}")).parent.find_next_sibling("td") + + + return data_td + + + def parse_oddo(text): """ Parses the valid_from and valid_until from string @@ -106,29 +189,30 @@ return valid_from, valid_until + def test(): - url = 
"https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1" + url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1" html = requests.get(url) soup = BeautifulSoup(html.content, "html.parser") record = { - "oddiel": get_oddiel(soup), - "vlozka": get_vlozka(soup), - "obchodneMeno": get_obchodneMeno(soup), - "sidlo": "", - "ico": "", - "denZapisu": "", - "pravnaForma": "", - "predmetyCinnosti": "", - "spolocnici": "", - "vyskaVkladov": "", - "statutarnyOrgan": "", - "konanie": "", - "zakladneImanie": "", - "aktualizaciaUdajov": "", - "vypisUdajov": "" + "oddiel": get_oddiel(soup), + "vlozka": get_vlozka(soup), + "obchodneMeno": get_obchodneMeno(soup), + "sidlo": get_sidlo(soup), + "ico": get_ico(soup), + "denZapisu": get_denZapisu(soup), + "pravnaForma": get_pravnaForma(soup), + "predmetyCinnosti": get_predmetyCinnosti(soup), + "spolocnici": get_spolocnici(soup), + "vyskaVkladov": get_vyskaVkladov(soup), + "statutarnyOrgan": get_statutarnyOrgan(soup), + "konanie": get_konanie(soup), + "zakladneImanie": get_zakladneImanie(soup), + "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), + "vypisUdajov": get_vypisUdajov(soup) } - print(json.dumps(record,indent=4)) + print(json.dumps(record,indent=4,ensure_ascii=False)) collection = connect_db() #collection.bulk_write(soup) disconnect_db(collection)