comments and README.md

2023-09-28 17:08:50 +02:00
parent 68311135bf
commit 945b9c2195
6 changed files with 298 additions and 58 deletions

README.md

@@ -0,0 +1,89 @@
# ORSR Scraper
With this application you can get all records changed in the ORSR (the Slovak business register, orsr.sk) for the current day.
The application consists of two parts:
### 1. Scraper
- downloads the data of all changed records
- in either the "aktuálna" (current) or the "úplná" (full) version
- can use a SOCKS5 proxy
- stores the data in MongoDB
### 2. Flask app
A minimalistic Flask app with two endpoints:
- `/detail` with the parameter `ico`
  - returns the record with the given IČO as JSON
- `/list`
  - returns a paginated list of records (only `ico` and `obchodneMeno`); see the example below
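For illustration, the responses look roughly like this (all values are made up; `/detail` returns the full stored record, `/list` only the two listed fields plus pagination links, per the routes code in this commit):
```
GET /detail?ico=12345678
{"ico": {"value": 12345678}, "obchodneMeno": {"value": "Example s.r.o."}, ...}

GET /list?page=2
{"records": [{"ico": 12345678, "obchodneMeno": "Example s.r.o."}, ...],
 "_links": {"self": {...}, "prev": {...}, "next": {...}, "last": {...}}}
```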
## Setup
### 1. Prerequisites
You need to have installed, or have access to:
- a current Python version
- MongoDB
- a SOCKS5 proxy (optional)

The installation of these is out of scope of this README.
### 2. Download the app
Download or clone the application.
### 3. venv and requirements
Open a terminal, cd into the app folder, and create a virtual environment:
```
cd [appPath]
python -m venv venv
```
Then install the requirements from `requirements.txt`:
```
venv/bin/pip install -r requirements.txt
for Windows:
venv\Scripts\pip.exe install -r requirements.txt
```
### 4. Config file
There is a default config file, `config_base.cfg`.
For local changes, copy this base config file and store it as `config.cfg`. The config file has the following structure:
```
[DB]
MONGODB_URI = mongodb://localhost:27017
MONGODB_DB = softone
MONGODB_COLLECTION = orsr
[WEB]
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp
[PROXY]
#HTTP_PROXY = socks5://user:pass@host:port
#HTTPS_PROXY = socks5://user:pass@host:port
[APP]
THREADS = 8
```
Set up the MongoDB connection, the number of threads used for collecting the data, and optionally the SOCKS5 proxy parameters.
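For example, a local `config.cfg` that enables the proxy might look like this (all values below are illustrative; copy the remaining sections from `config_base.cfg` as needed):
```
[DB]
MONGODB_URI = mongodb://localhost:27017
MONGODB_DB = softone
MONGODB_COLLECTION = orsr

[WEB]
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp

[PROXY]
HTTP_PROXY = socks5://user:pass@127.0.0.1:1080
HTTPS_PROXY = socks5://user:pass@127.0.0.1:1080

[APP]
THREADS = 4
```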
## Run the applications
### 1. Scraper
Run the scraper with:
```
venv/bin/python scraper.py
for Windows:
venv\Scripts\python.exe scraper.py
```
It will ask you whether you want to download the "aktuálny" (current) or the "úplný" (full) version of each record.
### 2. Flask
Start the Flask application:
```
venv/bin/python flaskapp.py
for Windows:
venv\Scripts\python.exe flaskapp.py
```
Now you can get the data from the local test server, which usually runs on `http://127.0.0.1:5000`; see the example below.
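As a quick check, you can query both endpoints from Python (a minimal sketch; the IČO value is made up):
```
import requests

BASE = "http://127.0.0.1:5000"

# first page of the paginated list (50 records per page)
page = requests.get(f"{BASE}/list", params={"page": 1}).json()
print(len(page["records"]), page["_links"])

# detail of a single record; 12345678 is an illustrative IČO
detail = requests.get(f"{BASE}/detail", params={"ico": 12345678}).json()
print(detail)
```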

app/config.py

@@ -1,6 +1,12 @@
 import configparser
 from os import path

+"""
+Reads the config files and stores the config in the settings dictionary.
+This dictionary can then be used in the application to access config values.
+"""

 # parse base config file
 config = configparser.ConfigParser()
 config_path = path.dirname(path.abspath(__file__))
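Elsewhere in the code base this dictionary is consumed directly, e.g. (keys as used by `db.py` and `scraper.py` in this commit):
```
from app.config import settings

uri = settings["mongodb_uri"]   # from [DB] MONGODB_URI
threads = settings["threads"]   # from [APP] THREADS
```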

app/db.py

@@ -1,10 +1,15 @@
 from flask import g
+from werkzeug.local import LocalProxy
 from pymongo import MongoClient

 from .config import settings


 def connect_db():
+    """
+    Creates the connection to the MongoDB using the config file data.
+    This function is used by the flask application as well as by the scraper script.
+    :return: connection to the collection in the mongodb
+    """
     client = MongoClient(settings["mongodb_uri"])
     db = client[settings["mongodb_db"]]
     collection = db[settings["mongodb_collection"]]
@@ -12,12 +17,17 @@ def connect_db():

 def disconnect_db(conn):
+    """
+    Disconnects an open connection.
+    :param conn: open db connection
+    """
     conn.database.client.close()


 def get_db():
     """
-    Configuration method to return db instance
+    Get the collection instance for the flask application. If no instance is
+    stored in the global variables, create the connection and store it in g.
     """
     collection = getattr(g, "_database", None)
@@ -25,7 +35,3 @@ def get_db():
     collection = g._database = connect_db()
     return collection

+# Use LocalProxy to read the global db instance with just `db`
+db = LocalProxy(get_db)
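With the proxy in place, request handlers can use the collection without calling `get_db()` themselves (a minimal sketch; the query value is illustrative):
```
from .db import db

# inside a request context, `db` resolves to the collection via get_db()
record = db.find_one({"ico.value": 12345678})
```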

app/routes.py

@@ -1,17 +1,92 @@
 from flask import current_app as app
-from flask import request
+from flask import request, url_for

-from .db import db
+from .db import get_db


 @app.route("/detail", methods=["GET"])
 def detail():
-    ico = request.args.get("ico")
-    if "ico" is None:
-        return "missing ico"
-    return f"ICO je {ico}"
+    """
+    GET the detail of one record in json format.
+    :args: ico: integer value of the IČO
+    :return: json object representing the record with the given ico
+    """
+    ico = request.args.get("ico")
+    if ico is None:
+        return {}
+    try:
+        ico = int(ico)
+    except ValueError:
+        return {}
+    collection = get_db()
+    data = collection.find_one({"ico.value": ico})
+    if data is None:
+        # no record with this ico
+        return {}
+    data.pop("_id")
+    return data


 @app.route("/list", methods=["GET"])
 def list_data():
-    return "list"
+    """
+    GET a list of ORSR records.
+    Pagination is inspired by official MongoDB resources/tutorials.
+    The results are paginated using the `page` parameter.
+    :return: json object with the list of records and links to other pages
+    """
+    page = int(request.args.get("page", 1))
+    per_page = 50  # a constant page size
+    collection = get_db()
+    # For pagination, we sort by ICO,
+    # then skip the number of docs that earlier pages would have displayed,
+    # and then limit to the fixed page size, ``per_page``.
+    records = collection.find().sort("ico.value").skip(per_page * (page - 1)).limit(per_page)
+    records_count = collection.count_documents({})
+    links = {
+        "self": {"href": url_for(".list_data", page=page, _external=True)},
+        "last": {
+            "href": url_for(
+                ".list_data", page=(records_count // per_page) + 1, _external=True
+            )
+        },
+    }
+    # Add a 'prev' link if it's not on the first page:
+    if page > 1:
+        links["prev"] = {
+            "href": url_for(".list_data", page=page - 1, _external=True)
+        }
+    # Add a 'next' link if it's not on the last page:
+    if page - 1 < records_count // per_page:
+        links["next"] = {
+            "href": url_for(".list_data", page=page + 1, _external=True)
+        }
+    return {
+        "records": [transform_for_list(record) for record in records],  # keep only ico and obchodneMeno
+        "_links": links,
+    }


+def transform_for_list(record_in):
+    """
+    Retrieve the ico and obchodneMeno from a record.
+    :param record_in: record with all data
+    :return: dictionary of ico and obchodneMeno
+    """
+    if (obch_meno := record_in["obchodneMeno"].get("value")) is None:
+        obch_meno_old = record_in["obchodneMeno"].get("old_values")
+        if len(obch_meno_old) == 0:
+            obch_meno = ""
+        else:
+            obch_meno = obch_meno_old[0].get("value", "")
+    record = {
+        "ico": record_in["ico"]["value"],
+        "obchodneMeno": obch_meno
+    }
+    return record
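To make the link arithmetic concrete, here is the same computation on a hypothetical collection of 120 records (an illustration, not part of the app):
```
per_page = 50
records_count = 120  # hypothetical total

last_page = records_count // per_page + 1        # -> 3
for page in (1, 2, 3):
    skip = per_page * (page - 1)                 # 0, 50, 100
    has_next = page - 1 < records_count // per_page
    print(page, skip, has_next)                  # only pages 1 and 2 get a next link
```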

flaskapp.py

@@ -1,9 +1,12 @@
 from flask import Flask

+from app.config import settings


 def create_app():
+    """
+    Create a very simple flask app.
+    :return: Flask application
+    """
     flaskapp = Flask(__name__)

     with flaskapp.app_context():

scraper.py

@@ -1,6 +1,5 @@
 import requests
 import re
-import json
 import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
@@ -9,9 +8,8 @@ from pymongo import InsertOne
 from app.config import settings
 from app.db import connect_db, disconnect_db
-from time import sleep

+# variables for custom data parsing
 single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
 value_type_dict = {
     "IČO:": "number",
@@ -19,6 +17,7 @@ value_type_dict = {
"Výška vkladu každého spoločníka:": "vklad" "Výška vkladu každého spoločníka:": "vklad"
} }
def scrape_orsr(): def scrape_orsr():
""" """
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -26,9 +25,11 @@ def scrape_orsr():
print("#########################") print("#########################")
print("Starting ORSR scraper") print("Starting ORSR scraper")
# get all links to "Aktuálny" from the orsr url # get all links to from the orsr url
print("Downloading changed records..") print("Downloading changed records..")
url = settings["base_url"]+settings["endpoint"] url = settings["base_url"]+settings["endpoint"]
proxies = {} proxies = {}
if (pr := settings["http_proxy"]) is not None: if (pr := settings["http_proxy"]) is not None:
proxies.update({"http": pr}) proxies.update({"http": pr})
@@ -36,10 +37,15 @@ def scrape_orsr():
if (pr := settings["https_proxy"]) is not None: if (pr := settings["https_proxy"]) is not None:
proxies.update({"https": pr}) proxies.update({"https": pr})
print(f"Found https proxy: {pr}") print(f"Found https proxy: {pr}")
html = requests.get(url, proxies=proxies) html = requests.get(url, proxies=proxies)
print("All changed records downloaded.") print("All changed records downloaded.")
# use bs4 to parse the page
soup = BeautifulSoup(html.content, "html.parser") soup = BeautifulSoup(html.content, "html.parser")
# choice between Aktualny and Uplny
m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n") m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
if m_type == "1": if m_type == "1":
record_type = "Aktuálny" record_type = "Aktuálny"
@@ -48,16 +54,21 @@ def scrape_orsr():
record_type = "Úplný" record_type = "Úplný"
print("Record type is 'Úplný'") print("Record type is 'Úplný'")
records = soup.find_all("a", string=record_type) records = soup.find_all("a", string=record_type)
# add base_url to href links
records = [settings["base_url"]+record["href"] for record in records] records = [settings["base_url"]+record["href"] for record in records]
print(f"There were {len(records)} records found.") print(f"There were {len(records)} records found.")
# distribute the work in #of threads defined in config # distribute the work in #of threads defined in config
parts = [records[i::settings["threads"]] for i in range(settings["threads"])] parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
print(f"Processing {len(records)} records using {settings['threads']} threads:") print(f"Processing {len(records)} records using {settings['threads']} threads:")
with ThreadPoolExecutor() as t: with ThreadPoolExecutor() as t:
for thread_id, part in enumerate(parts): for thread_id, part in enumerate(parts):
t.submit(process_records, part, thread_id+1) t.submit(process_records, part, thread_id+1)
print("All records_processed") print("All records_processed")
print("Closing ORSR Scraper...") print("Closing ORSR Scraper...")
print("#########################") print("#########################")
@@ -70,13 +81,15 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # add a status bar for processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
         try:
             record = process_record(records[i])
+            data.append(InsertOne(record))
         except Exception as e:
             print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
-        data.append(InsertOne(record))

+    # store the processed records in the db
     collection = connect_db()
     collection.bulk_write(data)
     disconnect_db(collection)
@@ -84,7 +97,7 @@ def process_records(records, thread):
 def process_record(url):
     """
-    process one record. Scrape url and store data to mongodb
+    process one record. Scrape the url data and parse it into a dictionary
     :param url: url of the record
     :return: dictionary of parameters
     """
@@ -102,26 +115,87 @@ def process_record(url):
 def get_oddiel(soup):
+    """
+    Helper function to get Oddiel.
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
     oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
     return {"value": oddiel}


 def get_vlozka(soup):
+    """
+    Helper function to get Vložka.
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
     vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
     return {"value": vlozka}


 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov".
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
     aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
     return {"value": aktualizacia}


 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu".
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
     vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
     return {"value": vypis}


+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve the data for one key.
+    :param data_td: <td> element containing the data
+    :param value_type: type of value to retrieve; the default is "text", other values are defined in value_type_dict
+    :param allow_multiple_active: if multiple active values are allowed, a list of active values is returned instead of a single item
+    :return: dictionary of data for the entry
+    """
+    data = {}
+    # lists holding the data for one key in the record
+    values = []
+    old_values = []
+    # get multiple entries (as table data)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        if active:
+            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+        else:
+            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+    if not allow_multiple_active:
+        if len(values) > 0:
+            data.update(values[0])
+    else:
+        data.update({"values": values})
+    data.update({"old_values": old_values})
+    return data


 def get_record_data(soup):
+    """
+    Retrieve the data for one record.
+    :param soup: souped html for the record
+    :return: dictionary with record data
+    """
     record = {
         "oddiel": get_oddiel(soup),
         "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@ def get_record_data(soup):
     # find the last table before variable data
     entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent

+    # retrieve all keys for a record. Since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from the available data
     while True:
         entry = entry.find_next_sibling("table")
         entry_tr = entry.find_all("tr")
@@ -137,19 +214,23 @@ def get_record_data(soup):
         if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov"
             break

-        # get entry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get the key name and key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()

+        # check if multiple active values are allowed and determine the value_type
         allow_multiple_active = True
         value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
             allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
             value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+        key_name = transform_key_name(key_name)
+
+        # read the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})

     record.update({
         "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@ def get_record_data(soup):
     return record


-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create a camelCase key name
+    :param name: string with input data (from ORSR)
+    :return: camelCase key name
+    """
     s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
     return s[0].lower() + "".join(w.capitalize() for w in s[1:])
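For example, the normalization strips the diacritics and the trailing colon before camel-casing (illustrative inputs):
```
transform_key_name("Obchodné meno:")  # -> "obchodneMeno"
transform_key_name("Deň zápisu:")     # -> "denZapisu"
```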
@@ -180,8 +266,10 @@ def process_entry(entry, value_type):
     if value_td.span.attrs["class"][0] == "ra":
         active = True

+    # get clean lines from multiline entries
     lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

+    # parse the data according to value_type
     if value_type == "text":
         value = ", ".join(lines)
     elif value_type == "number":
@@ -206,33 +294,6 @@ def process_entry(entry, value_type):
     return value, valid_from, valid_until, active


-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-    data = {}
-    values = []
-    old_values = []
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-    return data


 def parse_oddo(text):
     """
     Parses the valid_from and valid_until from string