import requests
import re
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from app.config import settings
from app.db import connect_db, disconnect_db
from time import sleep


def scrape_orsr():
    """Scrape the ORSR listing page and fan out record processing to threads.

    Fetches ``settings["orsr_url"]``, collects the ``href`` of every
    "Aktuálny" link, splits those URLs into ``settings["threads"]``
    round-robin partitions and processes each partition in its own
    worker thread.
    """
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    links = soup.find_all("a", string="Aktuálny")
    records = [link["href"] for link in links]
    # NOTE(review): hrefs scraped this way may be relative to orsr_url —
    # verify whether they need to be joined with the base URL before fetching.
    # BUG FIX: partition the record URLs themselves. The original code
    # partitioned a list of 1-based integer indexes (worker_ids) and handed
    # those integers to process_records, which then tried to fetch them
    # as URLs — no record could ever be downloaded.
    n_threads = settings["threads"]
    parts = [records[i::n_threads] for i in range(n_threads)]
    # Size the pool to the configured thread count so every partition
    # actually runs concurrently (the default pool size is unrelated
    # to settings["threads"]).
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        for thread_id, part in enumerate(parts, start=1):
            pool.submit(process_records, part, thread_id)


def process_records(records, thread):
    """Worker: process a batch of record URLs and persist them in one write.

    :param records: list of record URLs to process
    :param thread: worker id (used only to label the progress bar)
    """
    data = []
    # Iterate the URLs directly instead of indexing via range(len(...)).
    for url in tqdm(records, desc=f"thread {thread}"):
        data.append(process_record(url))
    collection = connect_db()
    # NOTE(review): pymongo's bulk_write expects operation objects such as
    # InsertOne, not plain dicts/None — confirm once process_record
    # actually returns parsed data.
    collection.bulk_write(data)
    disconnect_db(collection)


def process_record(url):
    """Process one record: scrape *url* and return its parsed fields.

    :param url: URL of the record detail page
    :return: dictionary of parameters (field extraction not implemented yet)
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    # TODO: extract the record fields from *soup* and return them
    # (see the schema sketch in test()).


def test():
    """Ad-hoc manual test against a single known record URL."""
    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    # Sketch of the record schema still to be extracted (kept as a string
    # literal on purpose — `pass` placeholders are not valid expressions):
    '''
    record = {
        "oddiel": soup.find("span", string=re.compile("Oddiel:")),
        "vlozka": pass,
        "obchodneMeno": pass,
        "sidlo": pass,
        "ico": pass,
        "denZapisu": pass,
        "pravnaForma": pass,
        "predmetyCinnosti": pass,
        "spolocnici": pass,
        "vyskaVkladov": pass,
        "statutarnyOrgan": pass,
        "konanie": pass,
        "zakladneImanie": pass,
        "aktualizaciaUdajov": pass,
        "vypisUdajov": pass
    }
    '''
    collection = connect_db()
    # collection.bulk_write(soup)  # disabled: soup is not a write operation
    disconnect_db(collection)


if __name__ == "__main__":
    # scrape_orsr()
    test()