From 3460c718d26d7ec272685030cc7e37da00b2331f Mon Sep 17 00:00:00 2001 From: Oto Imrich Date: Thu, 28 Sep 2023 11:53:38 +0200 Subject: [PATCH] working scraper --- config.cfg | 2 +- config_base.cfg | 4 +- scraper.py | 133 +++++++++++++++++++++--------------------------- 3 files changed, 61 insertions(+), 78 deletions(-) diff --git a/config.cfg b/config.cfg index 92e5548..89f7c3c 100644 --- a/config.cfg +++ b/config.cfg @@ -8,4 +8,4 @@ #ENDPOINT = hladaj_zmeny.asp [APP] -THREADS = 1 \ No newline at end of file +THREADS = 32 \ No newline at end of file diff --git a/config_base.cfg b/config_base.cfg index dbe56b0..4c09d27 100644 --- a/config_base.cfg +++ b/config_base.cfg @@ -1,5 +1,5 @@ [DB] -MONGODB_URI = mongodb://localhost:27017/softone +MONGODB_URI = mongodb://localhost:27017 MONGODB_DB = softone MONGODB_COLLECTION = orsr @@ -8,4 +8,4 @@ BASE_URL = https://www.orsr.sk/ ENDPOINT = hladaj_zmeny.asp [APP] -THREADS = 4 \ No newline at end of file +THREADS = 32 \ No newline at end of file diff --git a/scraper.py b/scraper.py index f6df6f1..5fa6e24 100644 --- a/scraper.py +++ b/scraper.py @@ -1,6 +1,7 @@ import requests import re import json +import unicodedata from bs4 import BeautifulSoup from tqdm.auto import tqdm from concurrent.futures import ThreadPoolExecutor @@ -11,6 +12,13 @@ from app.db import connect_db, disconnect_db from time import sleep +single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"] +value_type_dict = { + "IČO:": "number", + "Spoločníci:": "spolocnici", + "Výška vkladu každého spoločníka:": "vklad" +} + def scrape_orsr(): """ This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. @@ -24,10 +32,11 @@ def scrape_orsr(): # distribute the work in #of threads defined in config parts = [records[i::settings["threads"]] for i in range(settings["threads"])] + print(f"Processing {len(records)} records using {settings['threads']} threads:") with ThreadPoolExecutor() as t: for thread_id, part in enumerate(parts): t.submit(process_records, part, thread_id+1) - print("records_processed") + print("All records_processed") def process_records(records, thread): @@ -37,7 +46,8 @@ def process_records(records, thread): :param thread: thread id of processing thread """ data = [] - for i in tqdm(range(len(records)), desc=f"thread {thread}"): + #for i in tqdm(range(len(records)), desc=f"thread {thread}"): + for i in tqdm(range(1), desc=f"thread {thread}"): record = process_record(records[i]) data.append(InsertOne(record)) collection = connect_db() @@ -51,26 +61,12 @@ def process_record(url): :param url: url of the record :return dictionary of parameters """ + url = "https://www.orsr.sk/vypis.asp?ID=471674&SID=4&P=1" html = requests.get(url) soup = BeautifulSoup(html.content, "html.parser") - record = { - "oddiel": get_oddiel(soup), - "vlozka": get_vlozka(soup), - "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False), - "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False), - "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False), - "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False), - "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False), - "predmetyCinnosti": get_data(soup, "Predmet činnosti"), - "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"), - "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"), - "statutarnyOrgan": get_data(soup, "Štatutárny orgán"), - "konanie": get_data(soup, "Konanie menom"), - "zakladneImanie": get_data(soup, "Základné imanie"), - "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), - "vypisUdajov": get_vypisUdajov(soup) - } + record = get_record_data(soup) + return record @@ -94,31 +90,47 @@ def get_vypisUdajov(soup): return {"value": vypis} -def get_obchodneMeno(soup): - data = {} +def get_record_data(soup): + record = { + "oddiel": get_oddiel(soup), + "vlozka": get_vlozka(soup) + } - # find the table element of "Obchodné meno:" - meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent + # find the last table before variable data + entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent + while True: + entry = entry.find_next_sibling("table") + entry_tr = entry.find_all("tr") + entry_tr = [i for i in entry_tr if i.parent == entry_tr[0].parent] - # parse the name and date - active = meno_tr.find_all("span", class_="ra") - active = [x.text.strip() for x in active] - if len(active) == 0: - value, valid_from, valid_until = "", "" - else: - value, valid = active[0], active[1] - valid_from, valid_until = parse_oddo(valid) - data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until}) + if len(entry_tr) > 1: # last table with "Dátum aktualizácie údajov + break - # check for older entries - old = meno_tr.find_all("span", class_="ro") - old = [x.text.strip() for x in old] - if len(old) == 0: - old_values = [] - else: - old_values = [{"value": y[0], "valid_from": y[1][0], "valid_until": y[1][1]} for y in zip(old[::2], list(map(parse_oddo,old[1::2])))] - data.update({"old_values": old_values}) - return data + # get enry name and entry data + entry_container = entry_tr[0].find_all("td") + entry_name = entry_container[0].text.strip() + + allow_multiple_active = True + value_type = "text" + if entry_name in single_value: + allow_multiple_active = False + if (v_type := value_type_dict.get(entry_name)) is not None: + value_type = v_type + entry_name = transform_entry_name(entry_name) + entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active) + record.update({entry_name: entry_data}) + + record.update({ + "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), + "vypisUdajov": get_vypisUdajov(soup) + }) + + return record + + +def transform_entry_name(name): + s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split() + return s[0].lower() + "".join(w.capitalize() for w in s[1:]) def process_entry(entry, value_type): @@ -137,7 +149,7 @@ def process_entry(entry, value_type): if value_td.span.attrs["class"][0] == "ra": active = True - lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f] + lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f] if value_type == "text": value = ", ".join(lines) @@ -163,8 +175,8 @@ def process_entry(entry, value_type): return value, valid_from, valid_until, active -def get_data(soup, name, value_type="text", allow_multiple_active=True): - data_td = soup.find("span", class_="tl", string=re.compile(f"{name}")).parent.find_next_sibling("td") +def get_data(data_td, value_type="text", allow_multiple_active=True): + data_td = data_td data = {} @@ -181,7 +193,8 @@ def get_data(soup, name, value_type="text", allow_multiple_active=True): old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until}) if not allow_multiple_active: - data.update(values[0]) + if len(values) > 0: + data.update(values[0]) else: data.update({"values": values}) data.update({"old_values": old_values}) @@ -205,35 +218,5 @@ def parse_oddo(text): return valid_from, valid_until -def test(): - url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1" - html = requests.get(url) - soup = BeautifulSoup(html.content, "html.parser") - - record = { - "oddiel": get_oddiel(soup), - "vlozka": get_vlozka(soup), - "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False), - "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False), - "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False), - "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False), - "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False), - "predmetyCinnosti": get_data(soup, "Predmet činnosti"), - "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"), - "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"), - "statutarnyOrgan": get_data(soup, "Štatutárny orgán"), - "konanie": get_data(soup, "Konanie menom"), - "zakladneImanie": get_data(soup, "Základné imanie"), - "aktualizaciaUdajov": get_aktualizaciaUdajov(soup), - "vypisUdajov": get_vypisUdajov(soup) - } - print(json.dumps(record,indent=4,ensure_ascii=False)) - collection = connect_db() - records = [InsertOne(record)] - collection.bulk_write(records) - disconnect_db(collection) - - if __name__ == "__main__": scrape_orsr() - #test()