import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
from pymongo import InsertOne

from app.config import settings
from app.db import connect_db, disconnect_db


def scrape_orsr():
    """
    Main entry point: scrapes every record linked from the endpoint defined
    in the config and stores the results in MongoDB.
    """
    # collect all links labelled "Aktuálny" (current extract) from the ORSR listing page
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    records = soup.find_all("a", string="Aktuálny")
    # the hrefs are relative, so resolve them against the listing URL
    records = [urljoin(settings["orsr_url"], record["href"]) for record in records]

    # split the record URLs across the number of threads defined in the config
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
    with ThreadPoolExecutor(max_workers=settings["threads"]) as t:
        futures = [t.submit(process_records, part, thread_id + 1)
                   for thread_id, part in enumerate(parts)]
        # re-raise any exception that happened inside a worker
        for future in futures:
            future.result()


def process_records(records, thread):
    """
    Worker for processing records in a thread.

    :param records: list of record URLs to process
    :param thread: id of the processing thread
    """
    data = [process_record(url) for url in tqdm(records, desc=f"thread {thread}")]
    if data:
        collection = connect_db()
        # bulk_write takes operation objects, not plain dicts
        collection.bulk_write([InsertOne(record) for record in data])
        disconnect_db(collection)


def process_record(url):
    """
    Process one record: scrape the url and parse it into a dictionary.

    :param url: url of the record
    :return: dictionary of parameters
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    return {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_obchodneMeno(soup),
        "sidlo": get_sidlo(soup),
        "ico": get_ico(soup),
        "denZapisu": get_denZapisu(soup),
        "pravnaForma": get_pravnaForma(soup),
        "predmetyCinnosti": get_predmetyCinnosti(soup),
        "spolocnici": get_spolocnici(soup),
        "vyskaVkladov": get_vyskaVkladov(soup),
        "statutarnyOrgan": get_statutarnyOrgan(soup),
        "konanie": get_konanie(soup),
        "zakladneImanie": get_zakladneImanie(soup),
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
        "vypisUdajov": get_vypisUdajov(soup),
    }


def get_oddiel(soup):
    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
    return {"value": oddiel}


def get_vlozka(soup):
    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
    return {"value": vlozka}


def get_obchodneMeno(soup):
    data = {}
    # find the table cell holding "Obchodné meno:" (business name)
    meno_td = get_data_td(soup, "Obchodné")
    # parse the current name and its validity dates from the "ra" spans
    active = [x.text.strip() for x in meno_td.find_all("span", class_="ra")]
    if not active:
        value, valid_from, valid_until = "", "", ""
    else:
        value = active[0]
        valid = active[1] if len(active) > 1 else ""
        valid_from, valid_until = parse_oddo(valid)
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # older entries come as "ro" spans in pairs of (name, validity dates)
    old = [x.text.strip() for x in meno_td.find_all("span", class_="ro")]
    old_values = [
        {"value": name, "valid_from": dates[0], "valid_until": dates[1]}
        for name, dates in zip(old[::2], map(parse_oddo, old[1::2]))
    ]
    data.update({"old_values": old_values})
    return data


def get_sidlo(soup):
    data = {}
    # find the table cell holding "Sídlo:" (registered seat); parsing not implemented yet
    sidlo_td = get_data_td(soup, "Sídlo")
    return data


def get_ico(soup):
    # not implemented yet
    return {}


def get_denZapisu(soup):
    # not implemented yet
    return {}


def get_pravnaForma(soup):
    # not implemented yet
    return {}


def get_predmetyCinnosti(soup):
    # not implemented yet
    return {}


def get_spolocnici(soup):
    # not implemented yet
    return {}


def get_vyskaVkladov(soup):
    # not implemented yet
    return {}


def get_statutarnyOrgan(soup):
    # not implemented yet
    return {}


def get_konanie(soup):
    # not implemented yet
    return {}


def get_zakladneImanie(soup):
    # not implemented yet
    return {}


def get_aktualizaciaUdajov(soup):
    # not implemented yet
    return {}


def get_vypisUdajov(soup):
    # not implemented yet
    return {}


def get_data_td(soup, name):
    """Return the <td> that holds the values of the row labelled ``name``."""
    return soup.find("span", class_="tl", string=re.compile(name)).parent.find_next_sibling("td")
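# Illustrative sketch only: the unimplemented getters above can likely follow
# the same pattern as get_obchodneMeno -- locate the row's <td> with
# get_data_td, read the current value from the "ra" spans, and split the
# validity dates with parse_oddo. The markup of each individual row (e.g.
# whether it carries a date span at all, or repeated value/date pairs) is an
# assumption here and should be verified against the live page.
def _example_simple_getter(soup, label):
    td = get_data_td(soup, label)
    spans = [x.text.strip() for x in td.find_all("span", class_="ra")]
    value = spans[0] if spans else ""
    valid_from, valid_until = parse_oddo(spans[1]) if len(spans) > 1 else ("", "")
    return {"value": value, "valid_from": valid_from, "valid_until": valid_until}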
def parse_oddo(text):
    """
    Parses valid_from and valid_until out of a string.

    :param text: od/do dates in the format "(od: DD.MM.YYYY do: DD.MM.YYYY)"
    :return: a tuple (valid_from, valid_until)
    """
    valid_from, valid_until = "", ""
    if (start_from := text.find("od: ")) > -1:
        valid_from = text[start_from + 4:start_from + 14]
    if (start_until := text.find("do: ")) > -1:
        valid_until = text[start_until + 4:start_until + 14]
    return valid_from, valid_until


def test():
    """Scrape a single known record and print it instead of writing to MongoDB."""
    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
    record = process_record(url)
    print(json.dumps(record, indent=4, ensure_ascii=False))


if __name__ == "__main__":
    #scrape_orsr()
    test()
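# Worked examples for parse_oddo, following the format in its docstring:
#   parse_oddo("(od: 01.02.2003 do: 04.05.2006)")  ->  ("01.02.2003", "04.05.2006")
#   parse_oddo("(od: 01.02.2003)")                 ->  ("01.02.2003", "")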