comments and README.md
README.md
@@ -0,0 +1,89 @@

# ORSR Scraper

With this application you can get all records changed in the ORSR (the Slovak business register) for the current day.

The application consists of two parts:

### 1. Scraper

- gets the data of all changed records
- either the "aktuálna" (current) or the "úplná" (full) version
- can use a SOCKS5 proxy
- stores the data in MongoDB

### 2. Flask app

- Minimalistic Flask app that has two endpoints:
  - `/detail` with parameter `ico`
    - returns JSON data for the record with the given IČO
  - `/list`
    - returns a paginated list of records (`ico` and `obchodneMeno`)

## Setup

### 1. Prerequisites

You need to have installed / have access to:

- a current Python version
- MongoDB
- a SOCKS5 proxy (optional)

The installation of these is out of scope of this README.

### 2. Download the app

Download/clone the application.

### 3. venv and requirements

Open a terminal, cd to the app folder and create a venv:

```
cd [appPath]
python -m venv venv
```

Install the requirements from `requirements.txt`:

```
venv/bin/pip install -r requirements.txt

for Windows:
venv\Scripts\pip.exe install -r requirements.txt
```

### 4. Config file

There is a default config file "config_base.cfg".
For local changes, copy this base config file and store it as "config.cfg". The config file has the following structure:

```
[DB]
MONGODB_URI = mongodb://localhost:27017
MONGODB_DB = softone
MONGODB_COLLECTION = orsr

[WEB]
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp

[PROXY]
#HTTP_PROXY = socks5://user:pass@host:port
#HTTPS_PROXY = socks5://user:pass@host:port

[APP]
THREADS = 8
```

Set up the connection to MongoDB, the number of threads used for collecting the data and, optionally, the SOCKS5 proxy parameters.
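The parsing of these files happens in `app/config.py`. As a rough illustration only (not the actual implementation), a two-layer setup like this is typically read with `configparser`, where values in `config.cfg` override the defaults from `config_base.cfg`:

```
# Illustrative sketch only -- the real logic lives in app/config.py.
import configparser

config = configparser.ConfigParser()
# Later files in the list override earlier ones, so a local config.cfg
# (if present) wins over the defaults in config_base.cfg.
config.read(["config_base.cfg", "config.cfg"])

mongodb_uri = config.get("DB", "MONGODB_URI")
threads = config.getint("APP", "THREADS")
http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
```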
## Run the applications

### 1. Scraper

Run the scraper with:

```
venv/bin/python scraper.py

for Windows:
venv\Scripts\python.exe scraper.py
```

It will ask you whether you want to download the "aktuálny" (current) or "úplný" (full) records.

### 2. Flask

Start the Flask application:

```
venv/bin/python flaskapp.py

for Windows:
venv\Scripts\python.exe flaskapp.py
```

Now you can get the data from the local test server, which usually runs on `http://127.0.0.1:5000`.
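As a quick check that everything is wired up, the two endpoints can then be queried, for example with `requests` (a small sketch; the IČO below is only a placeholder):

```
# Usage sketch; 12345678 is a placeholder IČO, not a real record.
import requests

base = "http://127.0.0.1:5000"

# JSON detail of a single record, looked up by IČO
detail = requests.get(f"{base}/detail", params={"ico": 12345678}).json()

# First page of the paginated list (50 records per page, each with ico and obchodneMeno)
listing = requests.get(f"{base}/list", params={"page": 1}).json()
print(len(listing["records"]), listing["_links"])
```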
@@ -1,6 +1,12 @@
 import configparser
 from os import path

+"""
+Reads the config files and stores the config in the settings dictionary.
+This dictionary can then be used in the application to access config values.
+"""
+
+
 # parse base config file
 config = configparser.ConfigParser()
 config_path = path.dirname(path.abspath(__file__))
app/db.py
@@ -1,10 +1,15 @@
 from flask import g
-from werkzeug.local import LocalProxy
 from pymongo import MongoClient
 from .config import settings


 def connect_db():
+    """
+    Creates the connection to the MongoDB using the config file data.
+    This function is used by the flask application as well as by the scraper script.
+
+    :return: connection to the collection in the mongodb
+    """
     client = MongoClient(settings["mongodb_uri"])
     db = client[settings["mongodb_db"]]
     collection = db[settings["mongodb_collection"]]
@@ -12,12 +17,17 @@ def connect_db():


 def disconnect_db(conn):
+    """
+    Disconnects an open connection.
+    :param conn: open db connection
+    """
     conn.database.client.close()


 def get_db():
     """
-    Configuration method to return db instance
+    Get the collection instance for the flask application. If no instance is stored in the global variables,
+    then create a connection and store it in g.
     """
     collection = getattr(g, "_database", None)

@@ -25,7 +35,3 @@ def get_db():
         collection = g._database = connect_db()

     return collection
-
-
-# Use LocalProxy to read the global db instance with just `db`
-db = LocalProxy(get_db)
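Outside of a Flask request context (for example in the scraper), the same helpers are used directly; a minimal sketch, assuming a valid `config.cfg`:

```
# Minimal sketch of using the db helpers outside Flask, as scraper.py does.
from app.db import connect_db, disconnect_db

collection = connect_db()               # pymongo collection built from the config settings
print(collection.count_documents({}))   # e.g. number of stored ORSR records
disconnect_db(collection)               # closes the underlying MongoClient
```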
@@ -1,17 +1,92 @@
 from flask import current_app as app
-from flask import request
-from .db import db
+from flask import request, url_for
+from .db import get_db


 @app.route("/detail", methods=["GET"])
 def detail():
-    ico = request.args.get("ico")
-    if "ico" is None:
-        return "missing ico"
+    """
+    GET the detail of one record in json format
+    :args: ico: integer value of IČO
+    :return: json object representing the record with that ico
+    """

-    return f"ICO je {ico}"
+    ico = request.args.get("ico")
+    if ico is None:
+        return {}
+    try:
+        ico = int(ico)
+    except ValueError:
+        return {}
+
+    collection = get_db()
+    data = collection.find_one({"ico.value": ico})
+    data.pop("_id")
+    return data


 @app.route("/list", methods=["GET"])
 def list_data():
-    return "list"
+    """
+    GET a list of ORSR records.
+
+    Pagination is inspired by official MongoDB resources/tutorials.
+
+    The results are paginated using the `page` parameter.
+    :return: json object with a list of records and links to other pages
+    """
+
+    page = int(request.args.get("page", 1))
+    per_page = 50  # A const value.
+    collection = get_db()
+
+    # For pagination, we sort by ICO,
+    # then skip the number of docs that earlier pages would have displayed,
+    # and then limit to the fixed page size, ``per_page``.
+    records = collection.find().sort("ico.value").skip(per_page * (page - 1)).limit(per_page)
+
+    records_count = collection.count_documents({})
+
+    links = {
+        "self": {"href": url_for(".list_data", page=page, _external=True)},
+        "last": {
+            "href": url_for(
+                ".list_data", page=(records_count // per_page) + 1, _external=True
+            )
+        },
+    }
+    # Add a 'prev' link if it's not on the first page:
+    if page > 1:
+        links["prev"] = {
+            "href": url_for(".list_data", page=page - 1, _external=True)
+        }
+    # Add a 'next' link if it's not on the last page:
+    if page - 1 < records_count // per_page:
+        links["next"] = {
+            "href": url_for(".list_data", page=page + 1, _external=True)
+        }
+
+    return {
+        "records": [transform_for_list(record) for record in records],  # get only ico and obchodneMeno
+        "_links": links,
+    }
+
+
+def transform_for_list(record_in):
+    """
+    Retrieve ico and obchodneMeno from a record
+    :param record_in: record with all data
+    :return: dictionary of ico and obchodneMeno
+    """
+    if (obch_meno := record_in["obchodneMeno"].get("value")) is None:
+        obch_meno_old = record_in["obchodneMeno"].get("old_values")
+        if len(obch_meno_old) == 0:
+            obch_meno = ""
+        else:
+            obch_meno = obch_meno_old[0].get("value", "")
+
+    record = {
+        "ico": record_in["ico"]["value"],
+        "obchodneMeno": obch_meno
+    }
+    return record
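To illustrate the fallback in `transform_for_list`: when a record has no active `obchodneMeno` value, the first stored old value is used instead (placeholder data, shaped like a scraped record):

```
# Placeholder input, not real ORSR data.
record_in = {
    "ico": {"value": 12345678},
    "obchodneMeno": {"old_values": [{"value": "Example s.r.o."}]},
}

transform_for_list(record_in)
# -> {"ico": 12345678, "obchodneMeno": "Example s.r.o."}
```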
@@ -1,9 +1,12 @@
 from flask import Flask
-from app.config import settings


 def create_app():
+    """
+    Create a very simple flask app.
+
+    :return: Flask application
+    """
     flaskapp = Flask(__name__)

     with flaskapp.app_context():
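The rest of this hunk is truncated; presumably the factory is then used by the `flaskapp.py` entry point roughly like this (a hypothetical sketch, not the actual code from the commit):

```
# Hypothetical entry-point sketch; the real startup code is truncated above.
app = create_app()

if __name__ == "__main__":
    # Flask's built-in development server, by default on http://127.0.0.1:5000
    app.run()
```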
scraper.py
@@ -1,6 +1,5 @@
 import requests
 import re
-import json
 import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
@@ -9,9 +8,8 @@ from pymongo import InsertOne

 from app.config import settings
 from app.db import connect_db, disconnect_db
-from time import sleep
-

+# variables for custom data parsing
 single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
 value_type_dict = {
     "IČO:": "number",
@@ -19,6 +17,7 @@ value_type_dict = {
     "Výška vkladu každého spoločníka:": "vklad"
 }

+
 def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -26,9 +25,11 @@ def scrape_orsr():
     print("#########################")
     print("Starting ORSR scraper")

-    # get all links to "Aktuálny" from the orsr url
+    # get all links from the orsr url
     print("Downloading changed records..")
+
     url = settings["base_url"]+settings["endpoint"]
+
     proxies = {}
     if (pr := settings["http_proxy"]) is not None:
         proxies.update({"http": pr})
@@ -36,10 +37,15 @@ def scrape_orsr():
     if (pr := settings["https_proxy"]) is not None:
         proxies.update({"https": pr})
         print(f"Found https proxy: {pr}")

     html = requests.get(url, proxies=proxies)
+
     print("All changed records downloaded.")
+
+    # use bs4 to parse the page
     soup = BeautifulSoup(html.content, "html.parser")
+
+    # choice between Aktualny and Uplny
     m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
     if m_type == "1":
         record_type = "Aktuálny"
@@ -48,16 +54,21 @@ def scrape_orsr():
         record_type = "Úplný"
         print("Record type is 'Úplný'")
     records = soup.find_all("a", string=record_type)
+
+    # add base_url to href links
     records = [settings["base_url"]+record["href"] for record in records]
+
     print(f"There were {len(records)} records found.")

     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
+
     print(f"Processing {len(records)} records using {settings['threads']} threads:")
+
     with ThreadPoolExecutor() as t:
         for thread_id, part in enumerate(parts):
             t.submit(process_records, part, thread_id+1)

     print("All records_processed")
     print("Closing ORSR Scraper...")
     print("#########################")
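The list slicing `records[i::settings["threads"]]` deals the record URLs out round-robin, one slice per worker thread; a toy illustration with three threads:

```
# Toy illustration of the round-robin split used above.
records = ["r0", "r1", "r2", "r3", "r4", "r5", "r6"]
parts = [records[i::3] for i in range(3)]
# parts == [["r0", "r3", "r6"], ["r1", "r4"], ["r2", "r5"]]
```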
@@ -70,13 +81,15 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-    for i in tqdm(range(1), desc=f"thread {thread}"):
+    # add status bar for processing the records
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
         try:
             record = process_record(records[i])
+            data.append(InsertOne(record))
         except Exception as e:
             print(f"When downloading and parsing record {records[i]} following error occured: {e}")
-        data.append(InsertOne(record))
+
+    # store processed records in db
     collection = connect_db()
     collection.bulk_write(data)
     disconnect_db(collection)
@@ -84,7 +97,7 @@ def process_records(records, thread):

 def process_record(url):
     """
-    process one record. Scrape url and store data to mongodb
+    process one record. Scrape the url data and parse it into a dictionary
     :param url: url of the record
     :return dictionary of parameters
     """
@@ -102,26 +115,87 @@ def process_record(url):


 def get_oddiel(soup):
+    """
+    Helper function to get Oddiel
+    :param soup: website data
+    :return: dictionary with value: oddiel
+    """
     oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
     return {"value": oddiel}


 def get_vlozka(soup):
+    """
+    Helper function to get Vložka
+    :param soup: website data
+    :return: dictionary with value: vlozka
+    """
     vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
     return {"value": vlozka}


 def get_aktualizaciaUdajov(soup):
+    """
+    Helper function to get the date of "Dátum aktualizácie údajov"
+    :param soup: website data
+    :return: dictionary with value: aktualizacia
+    """
     aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
     return {"value": aktualizacia}


 def get_vypisUdajov(soup):
+    """
+    Helper function to get the date of "Dátum výpisu"
+    :param soup: website data
+    :return: dictionary with value: vypis
+    """
     vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
     return {"value": vypis}


+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    """
+    Generic function to retrieve data for one key
+    :param data_td: <td>-element containing the data
+    :param value_type: type of value that we want to retrieve. Default value is "text"; other values are defined in value_type_dict
+    :param allow_multiple_active: if multiple active values are allowed, then a list of active values is returned instead of single items
+    :return: dictionary of data for the entry
+    """
+    data_td = data_td
+
+    data = {}
+
+    # lists holding the data for one key in the record
+    values = []
+    old_values = []
+
+    # get multiple entries (as table data)
+    for entry in data_td.find_all("table"):
+        value, valid_from, valid_until, active = process_entry(entry, value_type)
+        if value is None:
+            continue
+        if active:
+            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+        else:
+            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+
+    if not allow_multiple_active:
+        if len(values) > 0:
+            data.update(values[0])
+    else:
+        data.update({"values": values})
+    data.update({"old_values": old_values})
+
+    return data
+
+
 def get_record_data(soup):
+    """
+    Retrieve data for one record
+    :param soup: souped-html for the record
+    :return: dictionary with record data
+    """
     record = {
         "oddiel": get_oddiel(soup),
         "vlozka": get_vlozka(soup)
@@ -129,6 +203,9 @@ def get_record_data(soup):

     # find the last table before variable data
     entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+
+    # retrieve all keys for a record. Since there are multiple different record types with different keys,
+    # the keys of the record are created automatically from the available data
     while True:
         entry = entry.find_next_sibling("table")
         entry_tr = entry.find_all("tr")
@@ -137,19 +214,23 @@ def get_record_data(soup):
         if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov"
             break

-        # get enry name and entry data
-        entry_container = entry_tr[0].find_all("td")
-        entry_name = entry_container[0].text.strip()
+        # get key name and key data
+        key_container = entry_tr[0].find_all("td")
+        key_name = key_container[0].text.strip()

+        # check if multiple active values are allowed and which value_type to use
         allow_multiple_active = True
         value_type = "text"
-        if entry_name in single_value:
+        if key_name in single_value:
             allow_multiple_active = False
-        if (v_type := value_type_dict.get(entry_name)) is not None:
+        if (v_type := value_type_dict.get(key_name)) is not None:
             value_type = v_type
-        entry_name = transform_entry_name(entry_name)
-        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
-        record.update({entry_name: entry_data})
+
+        key_name = transform_key_name(key_name)
+
+        # read the data of the key
+        key_data = get_data(key_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({key_name: key_data})

     record.update({
         "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
@@ -159,7 +240,12 @@ def get_record_data(soup):
     return record


-def transform_entry_name(name):
+def transform_key_name(name):
+    """
+    Helper function to create a camelCase key name
+    :param name: string with input data (from ORSR)
+    :return: camelCase key name
+    """
     s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
     return s[0].lower() + "".join(w.capitalize() for w in s[1:])

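For example, `transform_key_name` strips the diacritics and the trailing colon and produces the camelCase keys that show up in the stored records:

```
transform_key_name("Obchodné meno:")   # -> "obchodneMeno"
transform_key_name("Deň zápisu:")      # -> "denZapisu"
transform_key_name("Sídlo:")           # -> "sidlo"
```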
@@ -180,8 +266,10 @@ def process_entry(entry, value_type):
     if value_td.span.attrs["class"][0] == "ra":
         active = True

+    # get clean lines from multiline entries
    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

+    # parse data according to value_type
    if value_type == "text":
        value = ", ".join(lines)
    elif value_type == "number":
@@ -206,33 +294,6 @@ def process_entry(entry, value_type):
     return value, valid_from, valid_until, active


-def get_data(data_td, value_type="text", allow_multiple_active=True):
-    data_td = data_td
-
-    data = {}
-
-    values = []
-    old_values = []
-
-    for entry in data_td.find_all("table"):
-        value, valid_from, valid_until, active = process_entry(entry, value_type)
-        if value is None:
-            continue
-        if active:
-            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-        else:
-            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-
-    if not allow_multiple_active:
-        if len(values) > 0:
-            data.update(values[0])
-    else:
-        data.update({"values": values})
-    data.update({"old_values": old_values})
-
-    return data
-
-
 def parse_oddo(text):
     """
     Parses the valid_from and valid_until from string