comments and README.md

2023-09-28 17:08:50 +02:00
parent 68311135bf
commit 945b9c2195
6 changed files with 298 additions and 58 deletions
--- a/scraper.py
+++ b/scraper.py
@@ -1,6 +1,5 @@
 import requests
 import re
-import json
 import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
@@ -9,9 +8,8 @@ from pymongo import InsertOne

 from app.config import settings
 from app.db import connect_db, disconnect_db
-from time import sleep
-

+# variables for custom data parsing
 single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
 value_type_dict = {
    "IČO:": "number",
@@ -19,6 +17,7 @@ value_type_dict = {
    "Výška vkladu každého spoločníka:": "vklad"
 }

+
 def scrape_orsr():
    """
    This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -26,9 +25,11 @@ def scrape_orsr():
    print("#########################")
    print("Starting ORSR scraper")

-    # get all links to "Aktuálny" from the orsr url
+    # get all record links from the orsr url
    print("Downloading changed records..")
+
    url = settings["base_url"]+settings["endpoint"]
+
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies.update({"http": pr})
@@ -36,10 +37,15 @@ def scrape_orsr():
    if (pr := settings["https_proxy"]) is not None:
        proxies.update({"https": pr})
        print(f"Found https proxy: {pr}")
+
    html = requests.get(url, proxies=proxies)
+
    print("All changed records downloaded.")
+
+    # use bs4 to parse the page
    soup = BeautifulSoup(html.content, "html.parser")

+    # choice between Aktualny and Uplny
    m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
    if m_type == "1":
        record_type = "Aktuálny"
@@ -48,16 +54,21 @@ def scrape_orsr():
        record_type = "Úplný"
        print("Record type is 'Úplný'")
    records = soup.find_all("a", string=record_type)
+
+    # add base_url to href links
    records = [settings["base_url"]+record["href"] for record in records]
+
    print(f"There were {len(records)} records found.")

    # distribute the work in #of threads defined in config
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]

    print(f"Processing {len(records)} records using {settings['threads']} threads:")
+
    with ThreadPoolExecutor() as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id+1)
+
    print("All records_processed")
    print("Closing ORSR Scraper...")
    print("#########################")
@@ -70,13 +81,15 @@ def process_records(records, thread):
    :param thread: thread id of processing thread
    """
    data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # add status bar for processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
        try:
            record = process_record(records[i])
+            data.append(InsertOne(record))
        except Exception as e:
            print(f"When downloading and parsing record {records[i]} following error occured: {e}")
-        data.append(InsertOne(record))
+
+    # store processed records in db
    collection = connect_db()
    collection.bulk_write(data)
    disconnect_db(collection)
@@ -84,7 +97,7 @@ def process_records(records, thread):

 def process_record(url):
    """
-    process one record. Scrape url and store data to mongodb
+    Process one record: scrape the data at the url and parse it into a dictionary
    :param url: url of the record
    :return dictionary of parameters
    """
@@ -102,26 +115,87 @@ def process_record(url):


 def get_oddiel(soup):
+    """
+    Helper function to get Oddiel
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
    return {"value": oddiel}


 def get_vlozka(soup):
+    """
+    Helper function to get Vložka
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
    return {"value": vlozka}


 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov"
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
    aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
    return {"value": aktualizacia}


 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu"
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
    vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
    return {"value": vypis}


+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve data for one key
+    :param data_td: <td>-element containing the data
+    :param value_type: type of value that we want to retrieve. The default is "text"; other values are defined in value_type_dict
+    :param allow_multiple_active: if multiple active values are allowed, then a list of active values is returned, instead of single items
+    :return: dictionary of data for the entry
+    """
+    data_td = data_td
+
+    data = {}
+
+    # lists holding the data for one key in the record
+    values = []
+    old_values = []
+
+    # get multiple entries (as table data)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        if active:
+            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+        else:
+            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+
+    if not allow_multiple_active:
+        if len(values) > 0:
+            data.update(values[0])
+    else:
+        data.update({"values": values})
+    data.update({"old_values": old_values})
+
+    return data
+
+
 def get_record_data(soup):
+    """
+    Retrieve data for one record
+    :param soup: souped-html for the record
+    :return: dictionary with record data
+    """
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@ def get_record_data(soup):

    # find the last table before variable data
    entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+
+    # retrieve all keys for a record. Since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from available data
    while True:
        entry = entry.find_next_sibling("table")
        entry_tr = entry.find_all("tr")
@@ -137,19 +214,23 @@ def get_record_data(soup):
        if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov
            break

-        # get enry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get key name and key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()

+        # check if multiple active allowed and the value_type
        allow_multiple_active = True
        value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
            allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
            value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+
+        key_name = transform_key_name(key_name)
+
+        # reads the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})

    record.update({
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@ def get_record_data(soup):
    return record


-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create camelCase key name
+    :param name: string with input data (from ORSR)
+    :return: camelCase key name
+    """
    s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
    return s[0].lower() + "".join(w.capitalize() for w in s[1:])

@@ -180,8 +266,10 @@ def process_entry(entry, value_type):
    if value_td.span.attrs["class"][0] == "ra":
        active = True

+    # get clean lines from multiline entries
    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

+    # parse data according to value_type
    if value_type == "text":
        value = ", ".join(lines)
    elif value_type == "number":
@@ -206,33 +294,6 @@ def process_entry(entry, value_type):
    return value, valid_from, valid_until, active


-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-
-    data = {}
-
-    values = []
-    old_values = []
-
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-
-    return data
-
-
 def parse_oddo(text):
    """
    Parses the valid_from and valid_until from string