import json
import re
from concurrent.futures import ThreadPoolExecutor
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from pymongo import InsertOne
from tqdm.auto import tqdm

from app.config import settings
from app.db import connect_db, disconnect_db


def scrape_orsr():
    """
    Main entry point: scrape the records from the endpoint defined in the
    config and store them in MongoDB.
    """
    # Collect all links labelled "Aktuálny" (current extract) from the orsr url.
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    records = soup.find_all("a", string="Aktuálny")
    # The hrefs on the listing page may be relative, so resolve them
    # against the base url before fetching.
    records = [urljoin(settings["orsr_url"], record["href"]) for record in records]

    # Split the record urls round-robin into as many parts as there are
    # threads configured, so each worker gets a roughly equal share.
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]

    with ThreadPoolExecutor(max_workers=settings["threads"]) as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id + 1)


def process_records(records, thread):
    """
    Worker for processing a batch of records in one thread.

    :param records: list of urls of records to process
    :param thread: id of the processing thread (used for the progress bar)
    """
    data = []
    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
        record = process_record(records[i])
        data.append(record)

    collection = connect_db()
    # bulk_write expects write-operation objects, not plain dicts,
    # so wrap each scraped record in a pymongo InsertOne.
    collection.bulk_write([InsertOne(record) for record in data])
    disconnect_db(collection)


def process_record(url):
    """
    Process one record: scrape the url and return the extracted data.

    :param url: url of the record
    :return: dictionary of parameters
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    def get_oddiel(soup):
        oddiel = (
            soup.find("span", class_="tl", string=re.compile("Oddiel:"))
            .parent.find("span", class_="ra")
            .text.strip()
        )
        return {"value": oddiel}

    def get_vlozka(soup):
        vlozka = (
            soup.find("span", class_="tl", string=re.compile("Vložka"))
            .parent.find("span", class_="ra")
            .text.strip()
        )
        return {"value": vlozka}

    def get_obchodneMeno(soup):
        data = {}
        # find the table
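
# --- Illustrative sketch (comments only, not executed) ---------------------
# The helper modules imported above are not part of this file. Their shapes
# below are assumptions inferred from how they are used here: `settings` is a
# dict exposing "orsr_url" and "threads", and `connect_db` is assumed to
# return a pymongo collection whose client `disconnect_db` closes.
#
#   # app/config.py (assumed)
#   settings = {
#       "orsr_url": "...",  # listing page containing the "Aktuálny" links
#       "threads": 4,       # number of worker threads
#   }
#
#   # app/db.py (assumed, using pymongo)
#   from pymongo import MongoClient
#
#   def connect_db():
#       client = MongoClient("mongodb://localhost:27017")
#       return client["orsr"]["records"]
#
#   def disconnect_db(collection):
#       collection.database.client.close()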