multithreading and parsing
This commit is contained in:
87
scraper.py
87
scraper.py
@@ -0,0 +1,87 @@
|
||||
import requests
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm.auto import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from app.config import settings
|
||||
from app.db import connect_db, disconnect_db
|
||||
from time import sleep
|
||||
|
||||
|
||||
def scrape_orsr():
    """
    Scrape company records from the endpoint defined in config and store
    them in MongoDB.

    Collects every "Aktuálny" (current extract) link from the ORSR index
    page, splits the URL list round-robin across the number of worker
    threads configured in ``settings["threads"]``, and hands each slice to
    ``process_records``.
    """
    # Get all links to "Aktuálny" from the orsr url.
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    records = soup.find_all("a", string="Aktuálny")
    records = [record["href"] for record in records]

    # BUG FIX: the original built `parts` from a list of 1-based *indices*
    # (worker_ids) and submitted those, so each worker received ints
    # instead of URLs and process_record() would have called
    # requests.get(<int>).  Partition the record URLs themselves.
    threads = settings["threads"]
    parts = [records[i::threads] for i in range(threads)]

    # Distribute the work over the number of threads defined in config.
    # The `with` block waits for every worker to finish before returning.
    with ThreadPoolExecutor(max_workers=threads) as pool:
        for thread_id, part in enumerate(parts, start=1):
            pool.submit(process_records, part, thread_id)
|
||||
|
||||
|
||||
def process_records(records, thread):
    """
    Worker that scrapes a batch of records in one thread and bulk-writes
    the results to MongoDB.

    :param records: list of record URLs to process
    :param thread: 1-based id of the processing thread (used as the
        progress-bar label only)
    """
    # Iterate the URLs directly instead of indexing via range(len(...)).
    data = [process_record(url) for url in tqdm(records, desc=f"thread {thread}")]

    # An empty bulk_write raises in pymongo; nothing to persist, so bail out.
    if not data:
        return

    collection = connect_db()
    try:
        collection.bulk_write(data)
    finally:
        # Always release the connection, even if the write fails.
        disconnect_db(collection)
|
||||
|
||||
|
||||
def process_record(url):
    """
    Process one record: fetch *url* and parse the returned HTML.

    NOTE(review): the original docstring promised "dictionary of
    parameters", but field extraction is not implemented yet — the
    function currently returns None, so process_records() accumulates
    None values.  Confirm before wiring the result into bulk_write.

    :param url: url of the record
    :return: currently None (building the parsed dictionary is TODO)
    """
    html = requests.get(url)
    # TODO: extract the record fields from `soup` and return them.
    soup = BeautifulSoup(html.content, "html.parser")
|
||||
|
||||
|
||||
def test():
    """
    Manual smoke test: fetch one known ORSR extract, parse it, and
    exercise the database connect/disconnect lifecycle.
    """
    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    # TODO: build the record dict from `soup` once the field parsers
    # exist.  Planned keys: oddiel, vlozka, obchodneMeno, sidlo, ico,
    # denZapisu, pravnaForma, predmetyCinnosti, spolocnici, vyskaVkladov,
    # statutarnyOrgan, konanie, zakladneImanie, aktualizaciaUdajov,
    # vypisUdajov.  (The previous commented-out draft used `pass` as a
    # dict value, which would be a SyntaxError if ever uncommented — it
    # has been removed in favour of this note.)

    collection = connect_db()
    # collection.bulk_write(soup) is left disabled: pymongo's bulk_write
    # expects a list of write operations, not a BeautifulSoup object.
    disconnect_db(collection)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: currently runs the single-record smoke test; switch
    # to scrape_orsr() for the full multithreaded scrape.
    #scrape_orsr()
    test()
|
||||
|
||||
Reference in New Issue
Block a user