import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from pymongo import InsertOne
from app.config import settings
from app.db import connect_db, disconnect_db
from time import sleep


def scrape_orsr():
    """Scrape every company record from the ORSR endpoint and store it in MongoDB.

    Collects all links labelled "Úplný" (full extract) from the listing page,
    splits the URLs round-robin across the number of threads configured in
    ``settings`` and lets each worker download, parse and persist its share.
    """
    html = requests.get(settings["base_url"] + settings["endpoint"])
    soup = BeautifulSoup(html.content, "html.parser")
    # Each anchor labelled "Úplný" points to one company's full extract page.
    records = soup.find_all("a", string="Úplný")
    records = [settings["base_url"] + record["href"] for record in records]
    # Round-robin split so every thread gets a similar amount of work.
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
    with ThreadPoolExecutor(max_workers=settings["threads"]) as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id + 1)
    print("records_processed")


def process_records(records, thread):
    """Worker: scrape each URL in *records* and bulk-insert the results.

    :param records: list of record URLs to process
    :param thread: thread id (used only for the progress-bar label)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        data.append(InsertOne(process_record(url)))
    # bulk_write raises InvalidOperation on an empty request list, which can
    # happen when there are fewer records than configured threads.
    if not data:
        return
    collection = connect_db()
    try:
        collection.bulk_write(data)
    finally:
        # Always release the connection, even if the bulk write fails.
        disconnect_db(collection)


def process_record(url):
    """Scrape one company record page and parse all of its fields.

    :param url: URL of the record's full extract page
    :return: dictionary of the parsed record fields
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
        "sidlo": get_data(soup, "Sídlo", allow_multiple_active=False),
        "ico": get_data(soup, "IČO", value_type="number", allow_multiple_active=False),
        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
        "konanie": get_data(soup, "Konanie menom"),
        "zakladneImanie": get_data(soup, "Základné imanie"),
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
        "vypisUdajov": get_vypisUdajov(soup),
    }
    return record


def get_oddiel(soup):
    """Extract the registry section ("Oddiel") value."""
    oddiel = (
        soup.find("span", class_="tl", string=re.compile("Oddiel:"))
        .parent.find("span", class_="ra")
        .text.strip()
    )
    return {"value": oddiel}


def get_vlozka(soup):
    """Extract the registry insert number ("Vložka") value."""
    vlozka = (
        soup.find("span", class_="tl", string=re.compile("Vložka"))
        .parent.find("span", class_="ra")
        .text.strip()
    )
    return {"value": vlozka}


def get_aktualizaciaUdajov(soup):
    """Extract the data-update date ("Dátum aktualizácie")."""
    aktualizacia = (
        soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie"))
        .find_next_sibling("td")
        .text.strip()
    )
    return {"value": aktualizacia}


def get_vypisUdajov(soup):
    """Extract the extract date ("Dátum výpisu")."""
    vypis = (
        soup.find("td", class_="tl", string=re.compile("Dátum výpisu"))
        .find_next_sibling("td")
        .text.strip()
    )
    return {"value": vypis}


def get_obchodneMeno(soup):
    """Extract the business name ("Obchodné meno") with its validity dates.

    NOTE(review): currently unused — process_record() goes through the generic
    get_data() instead; kept for reference.
    """
    data = {}
    # Locate the table row containing "Obchodné meno:".
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
    # Active entry: name span followed by the "(od: ... do: ...)" span.
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if len(active) < 2:
        # BUG FIX: the original unpacked two values into three names
        # (ValueError) and would IndexError on a single-span row.
        value, valid_from, valid_until = "", "", ""
    else:
        value, valid = active[0], active[1]
        valid_from, valid_until = parse_oddo(valid)
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # Historical entries alternate name / date spans with class "ro".
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    old_values = [
        {"value": name, "valid_from": dates[0], "valid_until": dates[1]}
        for name, dates in zip(old[::2], map(parse_oddo, old[1::2]))
    ]
    data.update({"old_values": old_values})
    return data


def process_entry(entry, value_type):
    """Extract one entry from the table of entries for a given field.

    :param entry: one ``<table>`` element holding a single entry
    :param value_type: one of "text", "number", "spolocnici", "vklad"
    :return: tuple ``(value, valid_from, valid_until, active)``
    """
    value, valid_from, valid_until, active = None, None, None, False
    value_td, valid_td = entry.find_all("td")
    # Active entries use span class "ra"; historical ones use "ro".
    if value_td.span.attrs["class"][0] == "ra":
        active = True
    # Flatten the cell: <br> separates logical lines, everything else is text.
    lines = [
        f.strip()
        for f in " ".join(
            "\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()
        ).split("\n")
        if f
    ]
    if value_type == "text":
        value = ", ".join(lines)
    elif value_type == "number":
        # e.g. IČO is rendered with grouping spaces ("12 345 678").
        value = int("".join(lines).replace(" ", ""))
    elif value_type == "spolocnici":
        # First line is the partner's name, the rest is the address.
        value = {"spolocnik": lines[0], "adresa": ", ".join(lines[1:])}
    elif value_type == "vklad":
        # First line is the partner's name, the rest is the deposit amount.
        value = {"spolocnik": lines[0], "vklad": ", ".join(lines[1:])}
    valid_from, valid_until = parse_oddo(valid_td.text.strip())
    return value, valid_from, valid_until, active


def get_data(soup, name, value_type="text", allow_multiple_active=True):
    """Extract one labelled field (active and historical entries) from the page.

    :param soup: parsed record page
    :param name: visible label of the field (e.g. "Sídlo")
    :param value_type: value parser selector, see :func:`process_entry`
    :param allow_multiple_active: when False, the single active entry is
        flattened into the returned dict instead of a "values" list
    :return: dict with either flattened value keys or a "values" list,
        always including "old_values"
    """
    # Escape the label so literal text can never be misread as a regex.
    data_td = (
        soup.find("span", class_="tl", string=re.compile(re.escape(name)))
        .parent.find_next_sibling("td")
    )
    data = {}
    values = []
    old_values = []
    for entry in data_td.find_all("table"):
        value, valid_from, valid_until, active = process_entry(entry, value_type)
        if value is None:
            continue
        bucket = values if active else old_values
        bucket.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    if not allow_multiple_active:
        # BUG FIX: guard against a record with no active entry for this field
        # (the original indexed values[0] unconditionally -> IndexError).
        if values:
            data.update(values[0])
    else:
        data.update({"values": values})
    data.update({"old_values": old_values})
    return data


def parse_oddo(text):
    """Parse validity dates from a "(od: DD.MM.YYYY do: DD.MM.YYYY)" string.

    Either date may be absent; missing dates are returned as "".

    :param text: od/do dates in format "(od: DD.MM.YYYY do: DD.MM.YYYY)"
    :return: tuple ``(valid_from, valid_until)``
    """
    valid_from, valid_until = "", ""
    if (start_from := text.find("od: ")) > -1:
        valid_from = text[start_from + 4:start_from + 14]
    if (start_until := text.find("do: ")) > -1:
        valid_until = text[start_until + 4:start_until + 14]
    return valid_from, valid_until


def test():
    """Scrape a single known record, print it and insert it into MongoDB."""
    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
        "sidlo": get_data(soup, "Sídlo", allow_multiple_active=False),
        "ico": get_data(soup, "IČO", value_type="number", allow_multiple_active=False),
        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
        "konanie": get_data(soup, "Konanie menom"),
        "zakladneImanie": get_data(soup, "Základné imanie"),
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
        "vypisUdajov": get_vypisUdajov(soup),
    }
    print(json.dumps(record, indent=4, ensure_ascii=False))
    collection = connect_db()
    try:
        collection.bulk_write([InsertOne(record)])
    finally:
        disconnect_db(collection)


if __name__ == "__main__":
    scrape_orsr()
    # test()