comments and README.md

2023-09-28 17:08:50 +02:00
parent 68311135bf
commit 945b9c2195
6 changed files with 298 additions and 58 deletions
@@ -0,0 +1,89 @@
+# ORSR Scraper
+
+With this application you can get all records in ORSR that were changed on the current day.
+
+The application consists of two parts:
+
+### 1. Scraper: 
+- gets the data of all changed records 
+  - either the "aktuálna" or the "úplná" version
+  - can use a socks5 proxy
+- stores the data in a MongoDB
+
+### 2. Flask app:
+
+- Minimalistic flask app that has two endpoints: 
+  - /detail with parameter ico
+    - returns the JSON data for the record with the given ico
+  - /list
+    - returns a paginated list of records, each reduced to its ico and obchodneMeno
+
+
+## Setup
+### Prerequisites
+You need to have installed/access to:
+- a current Python 3 (3.8 or newer; the code uses the `:=` operator)
+- MongoDB
+- Socks5 proxy (optional) 
+
+The installation of these is out of scope of this README
+
+### 1. Download the app
+Download/clone the application
+
+### 2. venv and requirements
+Open a terminal, cd to the app folder and create a virtual environment:
+```
+cd [appPath]
+python -m venv venv
+```
+Install the requirements from `requirements.txt`:
+```
+venv/bin/pip install -r requirements.txt
+
+for Windows:
+venv\Scripts\pip.exe install -r requirements.txt
+```
+
+### 3. Config File
+There is a default config file "config_base.cfg". 
+For local changes copy this base config file and store it as "config.cfg". The config file has the following structure:
+```
+[DB]
+MONGODB_URI = mongodb://localhost:27017
+MONGODB_DB = softone
+MONGODB_COLLECTION = orsr
+
+[WEB]
+BASE_URL = https://www.orsr.sk/
+ENDPOINT = hladaj_zmeny.asp
+
+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
+[APP]
+THREADS = 8
+```
+
+Set up the connection to MongoDB, the number of threads used for collecting the data and, optionally, the SOCKS5 proxy parameters.
+
+## Run the applications
+### 1. Scraper
+Run the scraper with 
+```
+venv/bin/python scraper.py
+
+for Windows:
+venv\Scripts\python.exe scraper.py
+```
+It will ask you if you want to download the "aktuálny" or "úplný" record.
+### 2. Flask
+Start flask application
+```
+venv/bin/python flaskapp.py
+
+for Windows:
+venv\Scripts\python.exe flaskapp.py
+```
+Now you can get the data from the local test server that usually runs on `http://127.0.0.1:5000`
@@ -1,6 +1,12 @@
 import configparser
 from os import path

+"""
+Reads the config files and stores the config in the settings dictionary.
+This dictionary can then be used in the application to access config values.
+"""
+
+
 # parse base config file
 config = configparser.ConfigParser()
 config_path = path.dirname(path.abspath(__file__))
@@ -1,10 +1,15 @@
 from flask import g
-from werkzeug.local import LocalProxy
 from pymongo import MongoClient
 from .config import settings


 def connect_db():
+    """
+    Creates the connection to the MongoDB using the config file data.
+    This function is used by the flask application as well as by the scraper script
+
+    :return: connection to the collection in the mongodb
+    """
    client = MongoClient(settings["mongodb_uri"])
    db = client[settings["mongodb_db"]]
    collection = db[settings["mongodb_collection"]]
@@ -12,12 +17,17 @@ def connect_db():


 def disconnect_db(conn):
+    """
+    Disconnects open connection
+    :param conn: open db connection
+    """
    conn.database.client.close()


 def get_db():
    """
-    Configuration method to return db instance
+    Get collection instance for flask application. If no instance stored in global variables, then create connection
+    and store it in g
    """
    collection = getattr(g, "_database", None)

@@ -25,7 +35,3 @@ def get_db():
        collection = g._database = connect_db()

    return collection
-
-
-# Use LocalProxy to read the global db instance with just `db`
-db = LocalProxy(get_db)
@@ -1,17 +1,92 @@
 from flask import current_app as app
-from flask import request
-from .db import db
+from flask import request, url_for
+from .db import get_db


@app.route("/detail", methods=["GET"])
 def detail():
-    ico = request.args.get("ico")
-    if "ico" is None:
-        return "missing ico"
+    """
+    GET the detail of one record in json format.
+
+    :args: ico: integer value of IČO
+    :return: json object representing the record with the given ico; an empty
+        object if ico is missing, is not an integer, or matches no record
+    """

-    return f"ICO je {ico}"
+    ico = request.args.get("ico")
+    if ico is None:
+        return {}
+    try:
+        ico = int(ico)
+    except ValueError:
+        return {}
+
+    collection = get_db()
+    data = collection.find_one({"ico.value": ico})
+    # find_one() returns None when no document matches; calling .pop() on it
+    # would raise AttributeError and surface as a 500 response
+    if data is None:
+        return {}
+    # drop the MongoDB-internal _id field before returning the document
+    data.pop("_id", None)
+    return data


@app.route("/list", methods=["GET"])
 def list_data():
-    return "list"
+    """
+    GET a list of ORSR records.
+
+    Pagination is inspired by official MongoDB resources/tutorials
+
+    The results are paginated using the `page` parameter. A missing,
+    non-integer or non-positive `page` falls back to the first page.
+    :return: json object with list of records and links to other pages
+    """
+
+    # type=int makes get() return the default instead of raising ValueError
+    # for values such as ?page=abc; max() guards against a negative skip
+    page = max(request.args.get("page", 1, type=int), 1)
+    per_page = 50  # A const value.
+    collection = get_db()
+
+    # For pagination, we sort by ICO
+    # then skip the number of docs that earlier pages would have displayed,
+    # and then to limit to the fixed page size, ``per_page``.
+    records = collection.find().sort("ico.value").skip(per_page * (page - 1)).limit(per_page)
+
+    records_count = collection.count_documents({})
+
+    # Ceiling division: with e.g. 100 records and 50 per page the last page
+    # is 2, not 3. An empty collection still has a (empty) first page.
+    last_page = max(-(-records_count // per_page), 1)
+
+    links = {
+        "self": {"href": url_for(".list_data", page=page, _external=True)},
+        "last": {
+            "href": url_for(".list_data", page=last_page, _external=True)
+        },
+    }
+    # Add a 'prev' link if it's not on the first page:
+    if page > 1:
+        links["prev"] = {
+            "href": url_for(".list_data", page=page - 1, _external=True)
+        }
+    # Add a 'next' link if it's not on the last page:
+    if page < last_page:
+        links["next"] = {
+            "href": url_for(".list_data", page=page + 1, _external=True)
+        }
+
+    return {
+        # expose only ico and obchodneMeno of each record
+        "records": [transform_for_list(record) for record in records],
+        "_links": links,
+    }
+
+
+def transform_for_list(record_in):
+    """
+    retrieve ico and obchodneMeno from record
+
+    If the record has no active obchodneMeno, the first historical value is
+    used instead; if there is none either, an empty string is returned.
+    Missing keys do not raise, so one incomplete record cannot break a
+    whole list page.
+
+    :param record_in: record with all data
+    :return: dictionary of ico (None if the record has no active ico) and obchodneMeno
+    """
+    name_entry = record_in.get("obchodneMeno") or {}
+    obch_meno = name_entry.get("value")
+    if obch_meno is None:
+        # fall back to the first historical name, if any
+        obch_meno_old = name_entry.get("old_values") or []
+        obch_meno = obch_meno_old[0].get("value", "") if obch_meno_old else ""
+
+    # single-valued entries carry no "value" key when no active value exists
+    ico_entry = record_in.get("ico") or {}
+
+    return {
+        "ico": ico_entry.get("value"),
+        "obchodneMeno": obch_meno,
+    }
@@ -1,9 +1,12 @@
 from flask import Flask
-from app.config import settings


 def create_app():
+    """
+    Create a very simple flask app.

+    :return: Flask application
+    """
    flaskapp = Flask(__name__)

    with flaskapp.app_context():
@@ -1,6 +1,5 @@
 import requests
 import re
-import json
 import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
@@ -9,9 +8,8 @@ from pymongo import InsertOne

 from app.config import settings
 from app.db import connect_db, disconnect_db
-from time import sleep
-

+# variables for custom data parsing
 single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
 value_type_dict = {
    "IČO:": "number",
@@ -19,6 +17,7 @@ value_type_dict = {
    "Výška vkladu každého spoločníka:": "vklad"
 }

+
 def scrape_orsr():
    """
    This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -26,9 +25,11 @@ def scrape_orsr():
    print("#########################")
    print("Starting ORSR scraper")

-    # get all links to "Aktuálny" from the orsr url
+    # get all links to the changed records from the orsr url
    print("Downloading changed records..")
+
    url = settings["base_url"]+settings["endpoint"]
+
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies.update({"http": pr})
@@ -36,10 +37,15 @@ def scrape_orsr():
    if (pr := settings["https_proxy"]) is not None:
        proxies.update({"https": pr})
        print(f"Found https proxy: {pr}")
+
    html = requests.get(url, proxies=proxies)
+
    print("All changed records downloaded.")
+
+    # use bs4 to parse the page
    soup = BeautifulSoup(html.content, "html.parser")

+    # choice between Aktualny and Uplny
    m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
    if m_type == "1":
        record_type = "Aktuálny"
@@ -48,16 +54,21 @@ def scrape_orsr():
        record_type = "Úplný"
        print("Record type is 'Úplný'")
    records = soup.find_all("a", string=record_type)
+
+    # add base_url to href links
    records = [settings["base_url"]+record["href"] for record in records]
+
    print(f"There were {len(records)} records found.")

    # distribute the work in #of threads defined in config
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]

    print(f"Processing {len(records)} records using {settings['threads']} threads:")
+
    with ThreadPoolExecutor() as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id+1)
+
    print("All records_processed")
    print("Closing ORSR Scraper...")
    print("#########################")
@@ -70,13 +81,15 @@ def process_records(records, thread):
    :param thread: thread id of processing thread
    """
    data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # add status bar for processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
        try:
            record = process_record(records[i])
+            data.append(InsertOne(record))
        except Exception as e:
            print(f"When downloading and parsing record {records[i]} following error occured: {e}")
-        data.append(InsertOne(record))
+
+    # store processed records in db
    collection = connect_db()
    collection.bulk_write(data)
    disconnect_db(collection)
@@ -84,7 +97,7 @@ def process_records(records, thread):

 def process_record(url):
    """
-    process one record. Scrape url and store data to mongodb
+    process one record. Scrape the url and parse its data into a dictionary
    :param url: url of the record
    :return dictionary of parameters
    """
@@ -102,26 +115,87 @@ def process_record(url):


 def get_oddiel(soup):
+    """
+    Helper function to get Oddiel
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
    return {"value": oddiel}


 def get_vlozka(soup):
+    """
+    Helper function to get Vložka
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
    return {"value": vlozka}


 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov"
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
    aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
    return {"value": aktualizacia}


 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu"
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
    vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
    return {"value": vypis}


+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve data for one key
+    :param data_td: <td>-element containing the data
+    :param value_type: type of value that we want to retrieve. Default value is "text" other values are defined in value_type_dict
+    :param allow_multiple_active: if True, all active values are returned as a list under "values";
+        if False, only the first active value is merged directly into the result
+    :return: dictionary of data for the entry; always contains "old_values"
+    """
+    # lists holding the active and the historical data for one key in the record
+    values = []
+    old_values = []
+
+    # get multiple entries (as table data)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        item = {"value": value, "valid_from": valid_from, "valid_until": valid_until}
+        (values if active else old_values).append(item)
+
+    data = {}
+    if allow_multiple_active:
+        data["values"] = values
+    elif values:
+        # single-valued key: keep only the first active entry, flattened
+        data.update(values[0])
+    data["old_values"] = old_values
+
+    return data
+
+
 def get_record_data(soup):
+    """
+    Retrieve data for one record
+    :param soup: souped-html for the record
+    :return: dictionary with record data
+    """
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@ def get_record_data(soup):

    # find the last table before variable data
    entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+
+    # retrieve all keys for a record. Since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from available data
    while True:
        entry = entry.find_next_sibling("table")
        entry_tr = entry.find_all("tr")
@@ -137,19 +214,23 @@ def get_record_data(soup):
        if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov
            break

-        # get enry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get key name and key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()

+        # check if multiple active allowed and the value_type
        allow_multiple_active = True
        value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
            allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
            value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+
+        key_name = transform_key_name(key_name)
+
+        # reads the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})

    record.update({
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@ def get_record_data(soup):
    return record


-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create camelCase key name
+    :param name: string with input data (from ORSR)
+    :return: camelCase key name
+    """
    s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
    return s[0].lower() + "".join(w.capitalize() for w in s[1:])

@@ -180,8 +266,10 @@ def process_entry(entry, value_type):
    if value_td.span.attrs["class"][0] == "ra":
        active = True

+    # get clean lines from multiline entries
    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

+    # parse data according to value_type
    if value_type == "text":
        value = ", ".join(lines)
    elif value_type == "number":
@@ -206,33 +294,6 @@ def process_entry(entry, value_type):
    return value, valid_from, valid_until, active


-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-
-    data = {}
-
-    values = []
-    old_values = []
-
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-
-    return data
-
-
 def parse_oddo(text):
    """
    Parses the valid_from and valid_until from string