From 945b9c2195adad822a9c194de5be3577a723f9d1 Mon Sep 17 00:00:00 2001
From: Oto Imrich
Date: Thu, 28 Sep 2023 17:08:50 +0200
Subject: [PATCH] comments and README.md

---
 README.md     |  89 ++++++++++++++++++++++++++++++
 app/config.py |   6 ++
 app/db.py     |  18 ++++--
 app/routes.py |  89 +++++++++++++++++++++++++++---
 flaskapp.py   |   5 +-
 scraper.py    | 149 +++++++++++++++++++++++++++++++++++---------------
 6 files changed, 298 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index e69de29..70602db 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,89 @@
+# ORSR Scraper
+
+This application fetches all records in the ORSR (the Slovak Business Register) that changed on the current day.
+
+The application consists of two parts:
+
+### 1. Scraper:
+- gets the data of all changed records
+  - either the "aktuálna" or the "úplná" version
+  - can use a SOCKS5 proxy
+- stores the data in MongoDB
+
+### 2. Flask app:
+
+- A minimalistic Flask app with two endpoints:
+  - `/detail` with the parameter `ico`
+    - returns the JSON data for the record with that `ico`
+  - `/list`
+    - returns a paginated list of the records' `ico` and `obchodneMeno`
+
+
+## Setup
+### 1. Prerequisites
+You need to have installed, or have access to:
+- a current Python version
+- MongoDB
+- a SOCKS5 proxy (optional)
+
+Installing these is out of scope of this README.
+
+### 2. Download the app
+Download/clone the application.
+
+### 3. venv and requirements
+Open a terminal, cd to the app folder and create a venv:
+```
+cd [appPath]
+python -m venv venv
+```
+Install the requirements from `requirements.txt`:
+```
+venv/bin/pip install -r requirements.txt
+
+for Windows:
+venv\Scripts\pip.exe install -r requirements.txt
+```
+
+### 4. Config file
+There is a default config file, "config_base.cfg".
+For local changes, copy this base config file and store it as "config.cfg". The config file has the following structure:
+```
+[DB]
+MONGODB_URI = mongodb://localhost:27017
+MONGODB_DB = softone
+MONGODB_COLLECTION = orsr
+
+[WEB]
+BASE_URL = https://www.orsr.sk/
+ENDPOINT = hladaj_zmeny.asp
+
+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
+[APP]
+THREADS = 8
+```
+
+Set up the MongoDB connection, the number of threads used for collecting the data and, optionally, the SOCKS5 proxy parameters.
+
+## Run the applications
+### 1. Scraper
+Run the scraper with
+```
+venv/bin/python scraper.py
+
+for Windows:
+venv\Scripts\python.exe scraper.py
+```
+It will ask you whether you want to download the "aktuálny" or the "úplný" records.
+### 2. Flask
+Start the Flask application with
+```
+venv/bin/python flaskapp.py
+
+for Windows:
+venv\Scripts\python.exe flaskapp.py
+```
+Now you can get the data from the local test server, which usually runs on `http://127.0.0.1:5000`.
\ No newline at end of file
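Once the scraper has filled the database and the Flask app is running, the two endpoints can also be exercised from Python. A minimal sketch using `requests` — the host/port assume Flask's default development server, and the IČO value is only a placeholder:

```python
import requests

BASE = "http://127.0.0.1:5000"

# paginated list of {"ico": ..., "obchodneMeno": ...} entries
page = requests.get(f"{BASE}/list", params={"page": 1}).json()
print(len(page["records"]), page["_links"]["self"])

# full record for a single IČO (placeholder value)
detail = requests.get(f"{BASE}/detail", params={"ico": 12345678}).json()
print(detail.get("obchodneMeno"))
```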
diff --git a/app/config.py b/app/config.py
index fbff49f..81582f1 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,6 +1,12 @@
 import configparser
 from os import path
 
+"""
+Reads the config files and stores the configuration in the settings dictionary.
+This dictionary can then be used throughout the application to access config values.
+"""
+
+
 # parse base config file
 config = configparser.ConfigParser()
 config_path = path.dirname(path.abspath(__file__))
diff --git a/app/db.py b/app/db.py
index 6193b07..c3a8b3d 100644
--- a/app/db.py
+++ b/app/db.py
@@ -1,10 +1,15 @@
 from flask import g
-from werkzeug.local import LocalProxy
 from pymongo import MongoClient
 
 from .config import settings
 
 
 def connect_db():
+    """
+    Creates the connection to MongoDB using the config file data.
+    This function is used by the Flask application as well as by the scraper script.
+
+    :return: connection to the collection in the MongoDB
+    """
     client = MongoClient(settings["mongodb_uri"])
     db = client[settings["mongodb_db"]]
     collection = db[settings["mongodb_collection"]]
@@ -12,12 +17,17 @@
 
 
 def disconnect_db(conn):
+    """
+    Disconnects an open connection.
+    :param conn: open db connection
+    """
     conn.database.client.close()
 
 
 def get_db():
     """
-    Configuration method to return db instance
+    Get the collection instance for the Flask application. If no instance is stored
+    in the application context globals yet, create a connection and store it in ``g``.
     """
 
     collection = getattr(g, "_database", None)
@@ -25,7 +35,3 @@
     if collection is None:
         collection = g._database = connect_db()
 
     return collection
-
-
-# Use LocalProxy to read the global db instance with just `db`
-db = LocalProxy(get_db)
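Nothing in the patch closes the connection that `get_db` caches on `g` when the application context ends. If that ever matters, a teardown hook along these lines could be registered in `create_app` — a sketch only, not part of the patch, assuming `connect_db` keeps returning a collection object as above:

```python
from flask import g


def close_db(exception=None):
    """Close the cached MongoDB client when the app context is torn down."""
    collection = g.pop("_database", None)
    if collection is not None:
        collection.database.client.close()

# registered once inside create_app(), e.g.:
#   flaskapp.teardown_appcontext(close_db)
```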
diff --git a/app/routes.py b/app/routes.py
index dc2db88..4150c48 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -1,17 +1,92 @@
 from flask import current_app as app
-from flask import request
-from .db import db
+from flask import request, url_for
+from .db import get_db
 
 
 @app.route("/detail", methods=["GET"])
 def detail():
-    ico = request.args.get("ico")
-    if "ico" is None:
-        return "missing ico"
+    """
+    GET the detail of one record in JSON format.
+    :param ico: integer value of the IČO (query parameter)
+    :return: JSON object representing the record with this ico
+    """
 
-    return f"ICO je {ico}"
+    ico = request.args.get("ico")
+    if ico is None:
+        return {}
+    try:
+        ico = int(ico)
+    except ValueError:
+        return {}
+
+    collection = get_db()
+    data = collection.find_one({"ico.value": ico})
+    if data is None:
+        return {}
+    data.pop("_id")
+    return data
 
 
 @app.route("/list", methods=["GET"])
 def list_data():
-    return "list"
+    """
+    GET a list of ORSR records.
+
+    The pagination is inspired by official MongoDB resources/tutorials.
+
+    The results are paginated using the `page` parameter.
+    :return: JSON object with the list of records and links to the other pages
+    """
+
+    page = int(request.args.get("page", 1))
+    per_page = 50  # a constant value
+    collection = get_db()
+
+    # For pagination, we sort by IČO,
+    # then skip the number of docs that earlier pages would have displayed,
+    # and then limit to the fixed page size, ``per_page``.
+    records = collection.find().sort("ico.value").skip(per_page * (page - 1)).limit(per_page)
+
+    records_count = collection.count_documents({})
+    last_page = max(1, -(-records_count // per_page))  # ceiling division
+
+    links = {
+        "self": {"href": url_for(".list_data", page=page, _external=True)},
+        "last": {
+            "href": url_for(".list_data", page=last_page, _external=True)
+        },
+    }
+    # Add a 'prev' link if this is not the first page:
+    if page > 1:
+        links["prev"] = {
+            "href": url_for(".list_data", page=page - 1, _external=True)
+        }
+    # Add a 'next' link if this is not the last page:
+    if page < last_page:
+        links["next"] = {
+            "href": url_for(".list_data", page=page + 1, _external=True)
+        }
+
+    return {
+        "records": [transform_for_list(record) for record in records],  # only ico and obchodneMeno
+        "_links": links,
+    }
+
+
+def transform_for_list(record_in):
+    """
+    Retrieve the ico and obchodneMeno from a record.
+    :param record_in: record with all data
+    :return: dictionary of ico and obchodneMeno
+    """
+    if (obch_meno := record_in["obchodneMeno"].get("value")) is None:
+        obch_meno_old = record_in["obchodneMeno"].get("old_values")
+        if len(obch_meno_old) == 0:
+            obch_meno = ""
+        else:
+            obch_meno = obch_meno_old[0].get("value", "")
+
+    record = {
+        "ico": record_in["ico"]["value"],
+        "obchodneMeno": obch_meno
+    }
+    return record
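The `last` and `next` links rely on a ceiling division over the collection size: `-(-records_count // per_page)` rounds up without `math.ceil`, so an exactly-full last page does not produce a trailing empty page. A quick standalone check of the arithmetic, with invented counts:

```python
per_page = 50

for records_count, expected in [(0, 1), (50, 1), (51, 2), (100, 2), (101, 3)]:
    # ceiling division without math.ceil: -(-a // b)
    last_page = max(1, -(-records_count // per_page))
    assert last_page == expected, (records_count, last_page)
```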
@@ -26,9 +25,11 @@ def scrape_orsr(): print("#########################") print("Starting ORSR scraper") - # get all links to "Aktuálny" from the orsr url + # get all links to from the orsr url print("Downloading changed records..") + url = settings["base_url"]+settings["endpoint"] + proxies = {} if (pr := settings["http_proxy"]) is not None: proxies.update({"http": pr}) @@ -36,10 +37,15 @@ def scrape_orsr(): if (pr := settings["https_proxy"]) is not None: proxies.update({"https": pr}) print(f"Found https proxy: {pr}") + html = requests.get(url, proxies=proxies) + print("All changed records downloaded.") + + # use bs4 to parse the page soup = BeautifulSoup(html.content, "html.parser") + # choice between Aktualny and Uplny m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n") if m_type == "1": record_type = "Aktuálny" @@ -48,16 +54,21 @@ def scrape_orsr(): record_type = "Úplný" print("Record type is 'Úplný'") records = soup.find_all("a", string=record_type) + + # add base_url to href links records = [settings["base_url"]+record["href"] for record in records] + print(f"There were {len(records)} records found.") # distribute the work in #of threads defined in config parts = [records[i::settings["threads"]] for i in range(settings["threads"])] print(f"Processing {len(records)} records using {settings['threads']} threads:") + with ThreadPoolExecutor() as t: for thread_id, part in enumerate(parts): t.submit(process_records, part, thread_id+1) + print("All records_processed") print("Closing ORSR Scraper...") print("#########################") @@ -70,13 +81,15 @@ def process_records(records, thread): :param thread: thread id of processing thread """ data = [] - # for i in tqdm(range(len(records)), desc=f"thread {thread}"): - for i in tqdm(range(1), desc=f"thread {thread}"): + # add status bar for processing the records + for i in tqdm(range(len(records)), desc=f"thread {thread}"): try: record = process_record(records[i]) + data.append(InsertOne(record)) except Exception as e: print(f"When downloading and parsing record {records[i]} following error occured: {e}") - data.append(InsertOne(record)) + + # store processed records in db collection = connect_db() collection.bulk_write(data) disconnect_db(collection) @@ -84,7 +97,7 @@ def process_records(records, thread): def process_record(url): """ - process one record. Scrape url and store data to mongodb + process one record. 
@@ -70,13 +81,15 @@
     :param thread: thread id of the processing thread
     """
     data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # show a status bar while processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
         try:
             record = process_record(records[i])
+            data.append(InsertOne(record))
         except Exception as e:
             print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
-    data.append(InsertOne(record))
+
+    # store the processed records in the db
     collection = connect_db()
-    collection.bulk_write(data)
+    if data:  # bulk_write raises on an empty request list
+        collection.bulk_write(data)
     disconnect_db(collection)
 
 
@@ -84,7 +97,7 @@
 def process_record(url):
     """
-    process one record. Scrape url and store data to mongodb
+    Process one record: scrape the url data and parse it into a dictionary.
     :param url: url of the record
     :return: dictionary of parameters
     """
@@ -102,26 +115,87 @@
 def get_oddiel(soup):
+    """
+    Helper function to get the Oddiel.
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
     oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
     return {"value": oddiel}
 
 
 def get_vlozka(soup):
+    """
+    Helper function to get the Vložka.
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
     vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
     return {"value": vlozka}
 
 
 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov".
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
     aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
     return {"value": aktualizacia}
 
 
 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu".
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
     vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
     return {"value": vypis}
 
 
+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve the data for one key.
+    :param data_td: <td> element containing the data
+    :param value_type: type of value to retrieve; the default is "text", other values are defined in value_type_dict
+    :param allow_multiple_active: if multiple active values are allowed, a list of active values is returned instead of a single item
+    :return: dictionary of data for the entry
+    """
+    data = {}
+
+    # lists holding the data for one key of the record
+    values = []
+    old_values = []
+
+    # get the individual entries (one table each)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        if active:
+            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+        else:
+            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+
+    if not allow_multiple_active:
+        if len(values) > 0:
+            data.update(values[0])
+    else:
+        data.update({"values": values})
+    data.update({"old_values": old_values})
+
+    return data
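For orientation, `get_data` therefore produces one of two shapes depending on `allow_multiple_active` — roughly the following, with invented field values:

```python
# allow_multiple_active=True: every active value stays in a list
example_multi = {
    "values": [{"value": "…", "valid_from": "…", "valid_until": "…"}],
    "old_values": [{"value": "…", "valid_from": "…", "valid_until": "…"}],
}

# allow_multiple_active=False (keys in single_value, e.g. "IČO:"):
# the first active value is flattened into the top level
example_single = {
    "value": 12345678,  # invented IČO
    "valid_from": "…",
    "valid_until": "…",
    "old_values": [],
}
```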
 def get_record_data(soup):
+    """
+    Retrieve the data for one record.
+    :param soup: souped html for the record
+    :return: dictionary with the record data
+    """
     record = {
         "oddiel": get_oddiel(soup),
         "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@
     }
 
     # find the last table before the variable data
     entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+
+    # retrieve all keys for the record; since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from the available data
     while True:
         entry = entry.find_next_sibling("table")
         entry_tr = entry.find_all("tr")
 
         if len(entry_tr) > 1:
             # last table, with "Dátum aktualizácie údajov"
             break
 
@@ -137,19 +214,23 @@
-        # get enry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get the key name and the key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()
 
+        # check whether multiple active values are allowed and which value_type to use
         allow_multiple_active = True
         value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
             allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
             value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+
+        key_name = transform_key_name(key_name)
+
+        # read the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})
 
     record.update({
         "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@
     return record
 
 
-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create a camelCase key name.
+    :param name: string with the input data (from ORSR)
+    :return: camelCase key name
+    """
     s = unicodedata.normalize("NFKD", name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
     return s[0].lower() + "".join(w.capitalize() for w in s[1:])
 
@@ -180,8 +266,10 @@
     if value_td.span.attrs["class"][0] == "ra":
         active = True
 
+    # get clean lines from multiline entries
     lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br", "span"])]).split("\n") if f]
 
+    # parse the data according to the value_type
     if value_type == "text":
         value = ", ".join(lines)
     elif value_type == "number":
@@ -206,33 +294,6 @@
     return value, valid_from, valid_until, active
 
 
-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-
-    data = {}
-
-    values = []
-    old_values = []
-
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-
-    return data
-
-
 def parse_oddo(text):
     """
     Parses the valid_from and valid_until from string