From 229e4ab01db87c4889390b0d45334143a22ab396 Mon Sep 17 00:00:00 2001 From: Oto Imrich Date: Wed, 27 Sep 2023 21:38:05 +0200 Subject: [PATCH] working for sro, adapt to other --- .gitignore | 2 +- app/config.py | 3 +- config.cfg | 11 ++++ config_base.cfg | 5 +- scraper.py | 164 +++++++++++++++++++++--------------------------- 5 files changed, 88 insertions(+), 97 deletions(-) create mode 100644 config.cfg diff --git a/.gitignore b/.gitignore index 2602141..2875098 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ .venv .idea -.config.cfg \ No newline at end of file +config.cfg \ No newline at end of file diff --git a/app/config.py b/app/config.py index b74e41b..f4ce3de 100644 --- a/app/config.py +++ b/app/config.py @@ -14,6 +14,7 @@ settings = { "mongodb_uri": config.get("DB","MONGODB_URI"), "mongodb_db": config.get("DB","MONGODB_DB"), "mongodb_collection": config.get("DB","MONGODB_COLLECTION"), - "orsr_url": config.get("WEB", "ORSR_URL"), + "base_url": config.get("WEB", "BASE_URL"), + "endpoint": config.get("WEB", "ENDPOINT"), "threads": int(config.get("APP", "THREADS")) } diff --git a/config.cfg b/config.cfg new file mode 100644 index 0000000..92e5548 --- /dev/null +++ b/config.cfg @@ -0,0 +1,11 @@ +[DB] +#MONGODB_URI = mongodb://localhost:27017/softone +#MONGODB_DB = softone +#MONGODB_COLLECTION = orsr + +[WEB] +#BASE_URL = https://www.orsr.sk/ +#ENDPOINT = hladaj_zmeny.asp + +[APP] +THREADS = 1 \ No newline at end of file diff --git a/config_base.cfg b/config_base.cfg index 7ab8b63..dbe56b0 100644 --- a/config_base.cfg +++ b/config_base.cfg @@ -4,7 +4,8 @@ MONGODB_DB = softone MONGODB_COLLECTION = orsr [WEB] -ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp +BASE_URL = https://www.orsr.sk/ +ENDPOINT = hladaj_zmeny.asp [APP] -THREADS = 8 \ No newline at end of file +THREADS = 4 \ No newline at end of file diff --git a/scraper.py b/scraper.py index 3f48f92..f6df6f1 100644 --- a/scraper.py +++ b/scraper.py @@ -4,6 +4,7 @@ import json from bs4 import BeautifulSoup from tqdm.auto import tqdm from concurrent.futures import ThreadPoolExecutor +from pymongo import InsertOne from app.config import settings from app.db import connect_db, disconnect_db @@ -15,18 +16,18 @@ def scrape_orsr(): This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. """ # get all links to "Aktuálny" from the orsr url - html = requests.get(settings["orsr_url"]) + html = requests.get(settings["base_url"]+settings["endpoint"]) soup = BeautifulSoup(html.content, "html.parser") - records = soup.find_all("a", string="Aktuálny") - records = [record["href"] for record in records] + records = soup.find_all("a", string="Úplný") + records = [settings["base_url"]+record["href"] for record in records] # distribute the work in #of threads defined in config - worker_ids = list(range(1, len(records)+1)) - parts = [worker_ids[i::settings["threads"]] for i in range(settings["threads"])] + parts = [records[i::settings["threads"]] for i in range(settings["threads"])] with ThreadPoolExecutor() as t: for thread_id, part in enumerate(parts): t.submit(process_records, part, thread_id+1) + print("records_processed") def process_records(records, thread): @@ -38,7 +39,7 @@ def process_records(records, thread): data = [] for i in tqdm(range(len(records)), desc=f"thread {thread}"): record = process_record(records[i]) - data.append(record) + data.append(InsertOne(record)) collection = connect_db() collection.bulk_write(data) disconnect_db(collection) @@ -53,6 +54,25 @@ def process_record(url): html = requests.get(url) soup = BeautifulSoup(html.content, "html.parser") + record = { + "oddiel": get_oddiel(soup), + "vlozka": get_vlozka(soup), + "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False), + "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False), + "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False), + "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False), + "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False), + "predmetyCinnosti": get_data(soup, "Predmet činnosti"), + "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"), + "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"), + "statutarnyOrgan": get_data(soup, "Štatutárny orgán"), + "konanie": get_data(soup, "Konanie menom"), + "zakladneImanie": get_data(soup, "Základné imanie"), + "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), + "vypisUdajov": get_vypisUdajov(soup) + } + return record + def get_oddiel(soup): oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip() @@ -64,6 +84,16 @@ def get_vlozka(soup): return {"value": vlozka} +def get_aktualizaciaUdajov(soup): + aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip() + return {"value": aktualizacia} + + +def get_vypisUdajov(soup): + vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip() + return {"value": vypis} + + def get_obchodneMeno(soup): data = {} @@ -91,79 +121,6 @@ def get_obchodneMeno(soup): return data -def get_sidlo(soup): - data = {} - - # find the table element of "Sídlo:" - return data - - -def get_ico(soup): - data = {} - - return data - - -def get_denZapisu(soup): - data = {} - - return data - - -def get_pravnaForma(soup): - data = {} - - return data - - -def get_predmetyCinnosti(soup): - data = {} - - return data - - -def get_spolocnici(soup): - data = {} - - return data - - -def get_vyskaVkladov(soup): - data = {} - - return data - - -def get_statutarnyOrgan(soup): - data = {} - - return data - - -def get_konanie(soup): - data = {} - - return data - - -def get_zakladneImanie(soup): - data = {} - - return data - - -def get_aktualizaciaUdajov(soup): - data = {} - - return data - - -def get_vypisUdajov(soup): - data = {} - - return data - - def process_entry(entry, value_type): """ extracts one entry from the table of entries for a given data @@ -181,6 +138,26 @@ def process_entry(entry, value_type): active = True lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f] + + if value_type == "text": + value = ", ".join(lines) + elif value_type == "number": + value = int("".join(lines).replace(" ","")) + elif value_type == "spolocnici": + spolocnik = lines[0] + adresa = ", ".join(lines[1:]) + value = { + "spolocnik": spolocnik, + "adresa": adresa + } + elif value_type == "vklad": + spolocnik = lines[0] + vklad = ", ".join(lines[1:]) + value = { + "spolocnik": spolocnik, + "vklad": vklad + } + valid_from, valid_until = parse_oddo(valid_td.text.strip()) return value, valid_from, valid_until, active @@ -236,26 +213,27 @@ def test(): record = { "oddiel": get_oddiel(soup), "vlozka": get_vlozka(soup), - "obchodneMeno": get_obchodneMeno(soup), - "sidlo": get_sidlo(soup), - "ico": get_ico(soup), - "denZapisu": get_denZapisu(soup), - "pravnaForma": get_pravnaForma(soup), - "predmetyCinnosti": get_predmetyCinnosti(soup), - "spolocnici": get_spolocnici(soup), - "vyskaVkladov": get_vyskaVkladov(soup), - "statutarnyOrgan": get_statutarnyOrgan(soup), - "konanie": get_konanie(soup), - "zakladneImanie": get_zakladneImanie(soup), + "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False), + "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False), + "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False), + "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False), + "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False), + "predmetyCinnosti": get_data(soup, "Predmet činnosti"), + "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"), + "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"), + "statutarnyOrgan": get_data(soup, "Štatutárny orgán"), + "konanie": get_data(soup, "Konanie menom"), + "zakladneImanie": get_data(soup, "Základné imanie"), "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), "vypisUdajov": get_vypisUdajov(soup) } print(json.dumps(record,indent=4,ensure_ascii=False)) collection = connect_db() - #collection.bulk_write(soup) + records = [InsertOne(record)] + collection.bulk_write(records) disconnect_db(collection) if __name__ == "__main__": - #scrape_orsr() - test() + scrape_orsr() + #test()