import re
import unicodedata
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from pymongo import InsertOne
from tqdm.auto import tqdm

from app.config import settings
from app.db import connect_db, disconnect_db

# variables for custom data parsing
single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
value_type_dict = {
    "IČO:": "number",
    "Spoločníci:": "spolocnici",
    "Výška vkladu každého spoločníka:": "vklad"
}


def scrape_orsr():
    """
    Main function: scrapes data from the endpoint defined in the config and stores it in MongoDB.
    """
    print("#########################")
    print("Starting ORSR scraper")

    # get all record links from the orsr url
    print("Downloading changed records..")
    url = settings["base_url"] + settings["endpoint"]
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies.update({"http": pr})
        print(f"Found http proxy: {pr}")
    if (pr := settings["https_proxy"]) is not None:
        proxies.update({"https": pr})
        print(f"Found https proxy: {pr}")
    html = requests.get(url, proxies=proxies)
    print("All changed records downloaded.")

    # use bs4 to parse the page
    soup = BeautifulSoup(html.content, "html.parser")

    # choice between "Aktuálny" and "Úplný" record type
    m_type = input("Choose which type of records you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
    if m_type == "1":
        record_type = "Aktuálny"
        print("Record type is 'Aktuálny'")
    else:
        record_type = "Úplný"
        print("Record type is 'Úplný'")

    records = soup.find_all("a", string=record_type)
    # add base_url to href links
    records = [settings["base_url"] + record["href"] for record in records]
    print(f"There were {len(records)} records found.")

    # distribute the work over the number of threads defined in config
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
    print(f"Processing {len(records)} records using {settings['threads']} threads:")
    with ThreadPoolExecutor() as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id + 1)

    print("All records processed")
    print("Closing ORSR Scraper...")
    print("#########################")


def process_records(records, thread):
    """
    Worker for processing records in a thread.
    :param records: list of record URLs to process
    :param thread: id of the processing thread
    """
    data = []
    # progress bar for processing the records
    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
        try:
            record = process_record(records[i])
            data.append(InsertOne(record))
        except Exception as e:
            print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")

    # store processed records in db; bulk_write raises on an empty operation list,
    # so skip the write when no record was parsed successfully
    if data:
        collection = connect_db()
        collection.bulk_write(data)
        disconnect_db(collection)
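# Usage sketch: a single record page can be parsed without the thread pool, which
# helps when debugging the parsing helpers below. The URL is a hypothetical
# placeholder; real links are built in scrape_orsr() from settings["base_url"]
# plus the scraped hrefs.
#
#   record = process_record("https://www.orsr.sk/vypis.asp?ID=12345&SID=2&P=0")
#   print(record)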
def process_record(url):
    """
    Process one record: scrape the URL and parse the data into a dictionary.
    :param url: url of the record
    :return: dictionary of parameters
    """
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies.update({"http": pr})
    if (pr := settings["https_proxy"]) is not None:
        proxies.update({"https": pr})
    html = requests.get(url, proxies=proxies)
    soup = BeautifulSoup(html.content, "html.parser")
    record = get_record_data(soup)
    return record


def get_oddiel(soup):
    """
    Helper function to get "Oddiel".
    :param soup: website data
    :return: dictionary with value: oddiel
    """
    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
    return {"value": oddiel}


def get_vlozka(soup):
    """
    Helper function to get "Vložka".
    :param soup: website data
    :return: dictionary with value: vlozka
    """
    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
    return {"value": vlozka}


def get_aktualizaciaUdajov(soup):
    """
    Helper function to get the date of "Dátum aktualizácie údajov".
    :param soup: website data
    :return: dictionary with value: aktualizacia
    """
    aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
    return {"value": aktualizacia}


def get_vypisUdajov(soup):
    """
    Helper function to get the date of "Dátum výpisu".
    :param soup: website data
    :return: dictionary with value: vypis
    """
    vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
    return {"value": vypis}


def get_data(data_td, value_type="text", allow_multiple_active=True):
    """
    Generic function to retrieve data for one key.
    :param data_td: