diff --git a/.gitignore b/.gitignore index 2602141..2875098 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ .venv .idea -.config.cfg \ No newline at end of file +config.cfg \ No newline at end of file diff --git a/app/config.py b/app/config.py index b74e41b..f4ce3de 100644 --- a/app/config.py +++ b/app/config.py @@ -14,6 +14,7 @@ settings = { "mongodb_uri": config.get("DB","MONGODB_URI"), "mongodb_db": config.get("DB","MONGODB_DB"), "mongodb_collection": config.get("DB","MONGODB_COLLECTION"), - "orsr_url": config.get("WEB", "ORSR_URL"), + "base_url": config.get("WEB", "BASE_URL"), + "endpoint": config.get("WEB", "ENDPOINT"), "threads": int(config.get("APP", "THREADS")) } diff --git a/config.cfg b/config.cfg new file mode 100644 index 0000000..92e5548 --- /dev/null +++ b/config.cfg @@ -0,0 +1,11 @@ +[DB] +#MONGODB_URI = mongodb://localhost:27017/softone +#MONGODB_DB = softone +#MONGODB_COLLECTION = orsr + +[WEB] +#BASE_URL = https://www.orsr.sk/ +#ENDPOINT = hladaj_zmeny.asp + +[APP] +THREADS = 1 \ No newline at end of file diff --git a/config_base.cfg b/config_base.cfg index 7ab8b63..dbe56b0 100644 --- a/config_base.cfg +++ b/config_base.cfg @@ -4,7 +4,8 @@ MONGODB_DB = softone MONGODB_COLLECTION = orsr [WEB] -ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp +BASE_URL = https://www.orsr.sk/ +ENDPOINT = hladaj_zmeny.asp [APP] -THREADS = 8 \ No newline at end of file +THREADS = 4 \ No newline at end of file diff --git a/scraper.py b/scraper.py index 3f48f92..f6df6f1 100644 --- a/scraper.py +++ b/scraper.py @@ -4,6 +4,7 @@ import json from bs4 import BeautifulSoup from tqdm.auto import tqdm from concurrent.futures import ThreadPoolExecutor +from pymongo import InsertOne from app.config import settings from app.db import connect_db, disconnect_db @@ -15,18 +16,18 @@ def scrape_orsr(): This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. """ # get all links to "Aktuálny" from the orsr url - html = requests.get(settings["orsr_url"]) + html = requests.get(settings["base_url"]+settings["endpoint"]) soup = BeautifulSoup(html.content, "html.parser") - records = soup.find_all("a", string="Aktuálny") - records = [record["href"] for record in records] + records = soup.find_all("a", string="Úplný") + records = [settings["base_url"]+record["href"] for record in records] # distribute the work in #of threads defined in config - worker_ids = list(range(1, len(records)+1)) - parts = [worker_ids[i::settings["threads"]] for i in range(settings["threads"])] + parts = [records[i::settings["threads"]] for i in range(settings["threads"])] with ThreadPoolExecutor() as t: for thread_id, part in enumerate(parts): t.submit(process_records, part, thread_id+1) + print("records_processed") def process_records(records, thread): @@ -38,7 +39,7 @@ def process_records(records, thread): data = [] for i in tqdm(range(len(records)), desc=f"thread {thread}"): record = process_record(records[i]) - data.append(record) + data.append(InsertOne(record)) collection = connect_db() collection.bulk_write(data) disconnect_db(collection) @@ -53,6 +54,25 @@ def process_record(url): html = requests.get(url) soup = BeautifulSoup(html.content, "html.parser") + record = { + "oddiel": get_oddiel(soup), + "vlozka": get_vlozka(soup), + "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False), + "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False), + "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False), + "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False), + "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False), + "predmetyCinnosti": get_data(soup, "Predmet činnosti"), + "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"), + "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"), + "statutarnyOrgan": get_data(soup, "Štatutárny orgán"), + "konanie": get_data(soup, "Konanie menom"), + "zakladneImanie": get_data(soup, "Základné imanie"), + "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), + "vypisUdajov": get_vypisUdajov(soup) + } + return record + def get_oddiel(soup): oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip() @@ -64,6 +84,16 @@ def get_vlozka(soup): return {"value": vlozka} +def get_aktualizaciaUdajov(soup): + aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip() + return {"value": aktualizacia} + + +def get_vypisUdajov(soup): + vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip() + return {"value": vypis} + + def get_obchodneMeno(soup): data = {} @@ -91,79 +121,6 @@ def get_obchodneMeno(soup): return data -def get_sidlo(soup): - data = {} - - # find the table