comments and README.md

2023-09-28 17:08:50 +02:00
parent 68311135bf
commit 945b9c2195
6 changed files with 298 additions and 58 deletions

README.md

@@ -0,0 +1,89 @@
# ORSR Scraper
With this application you can get all records changed in the ORSR (the Slovak business register, orsr.sk) for the current day.
The application consists of two parts:
### 1. Scraper
- downloads the data of all changed records
- in either the "aktuálna" (current) or the "úplná" (full) version
- can use a SOCKS5 proxy
- stores the data in MongoDB
### 2. Flask app
A minimalistic Flask app with two endpoints:
- `/detail` with the parameter `ico`
  - returns the record with the given IČO as JSON
- `/list`
  - returns a paginated list of records (only `ico` and `obchodneMeno`); see the example below
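For illustration, the responses look roughly like this (all values are made up; `/detail` returns the full stored record, `/list` only the two listed fields plus pagination links, per the routes code in this commit):
```
GET /detail?ico=12345678
{"ico": {"value": 12345678}, "obchodneMeno": {"value": "Example s.r.o."}, ...}

GET /list?page=2
{"records": [{"ico": 12345678, "obchodneMeno": "Example s.r.o."}, ...],
 "_links": {"self": {...}, "prev": {...}, "next": {...}, "last": {...}}}
```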
## Setup
### 1. Prerequisites
You need to have installed, or have access to:
- a current Python version
- MongoDB
- a SOCKS5 proxy (optional)

The installation of these is out of scope of this README.
### 2. Download the app
Download or clone the application.
### 3. venv and requirements
Open a terminal, cd into the app folder, and create a virtual environment:
```
cd [appPath]
python -m venv venv
```
Then install the requirements from `requirements.txt`:
```
venv/bin/pip install -r requirements.txt
for Windows:
venv\Scripts\pip.exe install -r requirements.txt
```
### 4. Config file
There is a default config file, `config_base.cfg`.
For local changes, copy this base config file and store it as `config.cfg`. The config file has the following structure:
```
[DB]
MONGODB_URI = mongodb://localhost:27017
MONGODB_DB = softone
MONGODB_COLLECTION = orsr
[WEB]
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp
[PROXY]
#HTTP_PROXY = socks5://user:pass@host:port
#HTTPS_PROXY = socks5://user:pass@host:port
[APP]
THREADS = 8
```
Set up the MongoDB connection, the number of threads used for collecting the data, and optionally the SOCKS5 proxy parameters.
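For example, a local `config.cfg` that enables the proxy might look like this (all values below are illustrative; copy the remaining sections from `config_base.cfg` as needed):
```
[DB]
MONGODB_URI = mongodb://localhost:27017
MONGODB_DB = softone
MONGODB_COLLECTION = orsr

[WEB]
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp

[PROXY]
HTTP_PROXY = socks5://user:pass@127.0.0.1:1080
HTTPS_PROXY = socks5://user:pass@127.0.0.1:1080

[APP]
THREADS = 4
```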
## Run the applications
### 1. Scraper
Run the scraper with:
```
venv/bin/python scraper.py
for Windows:
venv\Scripts\python.exe scraper.py
```
It will ask you whether you want to download the "aktuálny" (current) or the "úplný" (full) version of each record.
### 2. Flask
Start the Flask application:
```
venv/bin/python flaskapp.py
for Windows:
venv\Scripts\python.exe flaskapp.py
```
Now you can get the data from the local test server, which usually runs on `http://127.0.0.1:5000`; see the example below.
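As a quick check, you can query both endpoints from Python (a minimal sketch; the IČO value is made up):
```
import requests

BASE = "http://127.0.0.1:5000"

# first page of the paginated list (50 records per page)
page = requests.get(f"{BASE}/list", params={"page": 1}).json()
print(len(page["records"]), page["_links"])

# detail of a single record; 12345678 is an illustrative IČO
detail = requests.get(f"{BASE}/detail", params={"ico": 12345678}).json()
print(detail)
```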

app/config.py

@@ -1,6 +1,12 @@
 import configparser
 from os import path

+"""
+Reads the config files and stores the config in the settings dictionary.
+This dictionary can then be used in the application to access config values.
+"""

 # parse base config file
 config = configparser.ConfigParser()
 config_path = path.dirname(path.abspath(__file__))
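Elsewhere in the code base this dictionary is consumed directly, e.g. (keys as used by `db.py` and `scraper.py` in this commit):
```
from app.config import settings

uri = settings["mongodb_uri"]   # from [DB] MONGODB_URI
threads = settings["threads"]   # from [APP] THREADS
```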

app/db.py

@@ -1,10 +1,15 @@
 from flask import g
+from werkzeug.local import LocalProxy
 from pymongo import MongoClient

 from .config import settings


 def connect_db():
+    """
+    Creates the connection to the MongoDB using the config file data.
+    This function is used by the flask application as well as by the scraper script.
+    :return: connection to the collection in the mongodb
+    """
     client = MongoClient(settings["mongodb_uri"])
     db = client[settings["mongodb_db"]]
     collection = db[settings["mongodb_collection"]]
@@ -12,12 +17,17 @@ def connect_db():

 def disconnect_db(conn):
+    """
+    Disconnects an open connection.
+    :param conn: open db connection
+    """
     conn.database.client.close()


 def get_db():
     """
-    Configuration method to return db instance
+    Get the collection instance for the flask application. If no instance is
+    stored in the global variables, create the connection and store it in g.
     """
     collection = getattr(g, "_database", None)
@@ -25,7 +35,3 @@ def get_db():
     collection = g._database = connect_db()
     return collection

+# Use LocalProxy to read the global db instance with just `db`
+db = LocalProxy(get_db)
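With the proxy in place, request handlers can use the collection without calling `get_db()` themselves (a minimal sketch; the query value is illustrative):
```
from .db import db

# inside a request context, `db` resolves to the collection via get_db()
record = db.find_one({"ico.value": 12345678})
```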

app/routes.py

@@ -1,17 +1,92 @@
 from flask import current_app as app
-from flask import request
+from flask import request, url_for

-from .db import db
+from .db import get_db


 @app.route("/detail", methods=["GET"])
 def detail():
-    ico = request.args.get("ico")
-    if "ico" is None:
-        return "missing ico"
-    return f"ICO je {ico}"
+    """
+    GET the detail of one record in json format.
+    :args: ico: integer value of the IČO
+    :return: json object representing the record with the given ico
+    """
+    ico = request.args.get("ico")
+    if ico is None:
+        return {}
+    try:
+        ico = int(ico)
+    except ValueError:
+        return {}
+    collection = get_db()
+    data = collection.find_one({"ico.value": ico})
+    if data is None:
+        # no record with this ico
+        return {}
+    data.pop("_id")
+    return data


 @app.route("/list", methods=["GET"])
 def list_data():
-    return "list"
+    """
+    GET a list of ORSR records.
+    Pagination is inspired by official MongoDB resources/tutorials.
+    The results are paginated using the `page` parameter.
+    :return: json object with the list of records and links to other pages
+    """
+    page = int(request.args.get("page", 1))
+    per_page = 50  # a constant page size
+    collection = get_db()
+    # For pagination, we sort by ICO,
+    # then skip the number of docs that earlier pages would have displayed,
+    # and then limit to the fixed page size, ``per_page``.
+    records = collection.find().sort("ico.value").skip(per_page * (page - 1)).limit(per_page)
+    records_count = collection.count_documents({})
+    links = {
+        "self": {"href": url_for(".list_data", page=page, _external=True)},
+        "last": {
+            "href": url_for(
+                ".list_data", page=(records_count // per_page) + 1, _external=True
+            )
+        },
+    }
+    # Add a 'prev' link if it's not on the first page:
+    if page > 1:
+        links["prev"] = {
+            "href": url_for(".list_data", page=page - 1, _external=True)
+        }
+    # Add a 'next' link if it's not on the last page:
+    if page - 1 < records_count // per_page:
+        links["next"] = {
+            "href": url_for(".list_data", page=page + 1, _external=True)
+        }
+    return {
+        "records": [transform_for_list(record) for record in records],  # keep only ico and obchodneMeno
+        "_links": links,
+    }


+def transform_for_list(record_in):
+    """
+    Retrieve the ico and obchodneMeno from a record.
+    :param record_in: record with all data
+    :return: dictionary of ico and obchodneMeno
+    """
+    if (obch_meno := record_in["obchodneMeno"].get("value")) is None:
+        obch_meno_old = record_in["obchodneMeno"].get("old_values")
+        if len(obch_meno_old) == 0:
+            obch_meno = ""
+        else:
+            obch_meno = obch_meno_old[0].get("value", "")
+    record = {
+        "ico": record_in["ico"]["value"],
+        "obchodneMeno": obch_meno
+    }
+    return record
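To make the link arithmetic concrete, here is the same computation on a hypothetical collection of 120 records (an illustration, not part of the app):
```
per_page = 50
records_count = 120  # hypothetical total

last_page = records_count // per_page + 1        # -> 3
for page in (1, 2, 3):
    skip = per_page * (page - 1)                 # 0, 50, 100
    has_next = page - 1 < records_count // per_page
    print(page, skip, has_next)                  # only pages 1 and 2 get a next link
```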

flaskapp.py

@@ -1,9 +1,12 @@
 from flask import Flask

+from app.config import settings


 def create_app():
+    """
+    Create a very simple flask app.
+    :return: Flask application
+    """
     flaskapp = Flask(__name__)

     with flaskapp.app_context():

scraper.py

@@ -1,6 +1,5 @@
 import requests
 import re
-import json
 import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
@@ -9,9 +8,8 @@ from pymongo import InsertOne
 from app.config import settings
 from app.db import connect_db, disconnect_db
-from time import sleep

+# variables for custom data parsing
 single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
 value_type_dict = {
     "IČO:": "number",
@@ -19,6 +17,7 @@ value_type_dict = {
"Výška vkladu každého spoločníka:": "vklad" "Výška vkladu každého spoločníka:": "vklad"
} }
def scrape_orsr(): def scrape_orsr():
""" """
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -26,9 +25,11 @@ def scrape_orsr():
print("#########################") print("#########################")
print("Starting ORSR scraper") print("Starting ORSR scraper")
# get all links to "Aktuálny" from the orsr url # get all links to from the orsr url
print("Downloading changed records..") print("Downloading changed records..")
url = settings["base_url"]+settings["endpoint"] url = settings["base_url"]+settings["endpoint"]
proxies = {} proxies = {}
if (pr := settings["http_proxy"]) is not None: if (pr := settings["http_proxy"]) is not None:
proxies.update({"http": pr}) proxies.update({"http": pr})
@@ -36,10 +37,15 @@ def scrape_orsr():
if (pr := settings["https_proxy"]) is not None: if (pr := settings["https_proxy"]) is not None:
proxies.update({"https": pr}) proxies.update({"https": pr})
print(f"Found https proxy: {pr}") print(f"Found https proxy: {pr}")
html = requests.get(url, proxies=proxies) html = requests.get(url, proxies=proxies)
print("All changed records downloaded.") print("All changed records downloaded.")
# use bs4 to parse the page
soup = BeautifulSoup(html.content, "html.parser") soup = BeautifulSoup(html.content, "html.parser")
# choice between Aktualny and Uplny
m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n") m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
if m_type == "1": if m_type == "1":
record_type = "Aktuálny" record_type = "Aktuálny"
@@ -48,16 +54,21 @@ def scrape_orsr():
record_type = "Úplný" record_type = "Úplný"
print("Record type is 'Úplný'") print("Record type is 'Úplný'")
records = soup.find_all("a", string=record_type) records = soup.find_all("a", string=record_type)
# add base_url to href links
records = [settings["base_url"]+record["href"] for record in records] records = [settings["base_url"]+record["href"] for record in records]
print(f"There were {len(records)} records found.") print(f"There were {len(records)} records found.")
# distribute the work in #of threads defined in config # distribute the work in #of threads defined in config
parts = [records[i::settings["threads"]] for i in range(settings["threads"])] parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
print(f"Processing {len(records)} records using {settings['threads']} threads:") print(f"Processing {len(records)} records using {settings['threads']} threads:")
with ThreadPoolExecutor() as t: with ThreadPoolExecutor() as t:
for thread_id, part in enumerate(parts): for thread_id, part in enumerate(parts):
t.submit(process_records, part, thread_id+1) t.submit(process_records, part, thread_id+1)
print("All records_processed") print("All records_processed")
print("Closing ORSR Scraper...") print("Closing ORSR Scraper...")
print("#########################") print("#########################")
@@ -70,13 +81,15 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # add a status bar for processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
         try:
             record = process_record(records[i])
+            data.append(InsertOne(record))
         except Exception as e:
             print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
-        data.append(InsertOne(record))

+    # store the processed records in the db
     collection = connect_db()
     collection.bulk_write(data)
     disconnect_db(collection)
@@ -84,7 +97,7 @@ def process_records(records, thread):
 def process_record(url):
     """
-    process one record. Scrape url and store data to mongodb
+    process one record. Scrape the url data and parse it into a dictionary
     :param url: url of the record
     :return: dictionary of parameters
     """
@@ -102,26 +115,87 @@ def process_record(url):
 def get_oddiel(soup):
+    """
+    Helper function to get Oddiel.
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
     oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
     return {"value": oddiel}


 def get_vlozka(soup):
+    """
+    Helper function to get Vložka.
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
     vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
     return {"value": vlozka}


 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov".
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
     aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
     return {"value": aktualizacia}


 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu".
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
     vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
     return {"value": vypis}


+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve the data for one key.
+    :param data_td: <td> element containing the data
+    :param value_type: type of value to retrieve; the default is "text", other values are defined in value_type_dict
+    :param allow_multiple_active: if multiple active values are allowed, a list of active values is returned instead of a single item
+    :return: dictionary of data for the entry
+    """
+    data = {}
+    # lists holding the data for one key in the record
+    values = []
+    old_values = []
+    # get multiple entries (as table data)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        if active:
+            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+        else:
+            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+    if not allow_multiple_active:
+        if len(values) > 0:
+            data.update(values[0])
+    else:
+        data.update({"values": values})
+    data.update({"old_values": old_values})
+    return data


 def get_record_data(soup):
+    """
+    Retrieve the data for one record.
+    :param soup: souped html for the record
+    :return: dictionary with record data
+    """
     record = {
         "oddiel": get_oddiel(soup),
         "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@ def get_record_data(soup):
     # find the last table before variable data
     entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent

+    # retrieve all keys for a record. Since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from the available data
     while True:
         entry = entry.find_next_sibling("table")
         entry_tr = entry.find_all("tr")
@@ -137,19 +214,23 @@ def get_record_data(soup):
         if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov"
             break

-        # get entry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get the key name and key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()

+        # check if multiple active values are allowed and determine the value_type
         allow_multiple_active = True
         value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
             allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
             value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+        key_name = transform_key_name(key_name)
+
+        # read the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})

     record.update({
         "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@ def get_record_data(soup):
     return record


-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create a camelCase key name
+    :param name: string with input data (from ORSR)
+    :return: camelCase key name
+    """
     s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
     return s[0].lower() + "".join(w.capitalize() for w in s[1:])
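For example, the normalization strips the diacritics and the trailing colon before camel-casing (illustrative inputs):
```
transform_key_name("Obchodné meno:")  # -> "obchodneMeno"
transform_key_name("Deň zápisu:")     # -> "denZapisu"
```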
@@ -180,8 +266,10 @@ def process_entry(entry, value_type):
     if value_td.span.attrs["class"][0] == "ra":
         active = True

+    # get clean lines from multiline entries
     lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

+    # parse the data according to value_type
     if value_type == "text":
         value = ", ".join(lines)
     elif value_type == "number":
@@ -206,33 +294,6 @@ def process_entry(entry, value_type):
     return value, valid_from, valid_until, active


-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-    data = {}
-    values = []
-    old_values = []
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-    return data


 def parse_oddo(text):
     """
     Parses the valid_from and valid_until from string