Working for s.r.o. companies; still to be adapted to other legal forms

This commit is contained in:
2023-09-27 21:38:05 +02:00
parent 940f424f05
commit 229e4ab01d
5 changed files with 88 additions and 97 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,3 @@
.venv .venv
.idea .idea
.config.cfg config.cfg

View File

@@ -14,6 +14,7 @@ settings = {
"mongodb_uri": config.get("DB","MONGODB_URI"), "mongodb_uri": config.get("DB","MONGODB_URI"),
"mongodb_db": config.get("DB","MONGODB_DB"), "mongodb_db": config.get("DB","MONGODB_DB"),
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"), "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
"orsr_url": config.get("WEB", "ORSR_URL"), "base_url": config.get("WEB", "BASE_URL"),
"endpoint": config.get("WEB", "ENDPOINT"),
"threads": int(config.get("APP", "THREADS")) "threads": int(config.get("APP", "THREADS"))
} }

11
config.cfg Normal file
View File

@@ -0,0 +1,11 @@
[DB]
#MONGODB_URI = mongodb://localhost:27017/softone
#MONGODB_DB = softone
#MONGODB_COLLECTION = orsr
[WEB]
#BASE_URL = https://www.orsr.sk/
#ENDPOINT = hladaj_zmeny.asp
[APP]
THREADS = 1

View File

@@ -4,7 +4,8 @@ MONGODB_DB = softone
MONGODB_COLLECTION = orsr MONGODB_COLLECTION = orsr
[WEB] [WEB]
ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp
[APP] [APP]
THREADS = 8 THREADS = 4

View File

@@ -4,6 +4,7 @@ import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from tqdm.auto import tqdm from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from pymongo import InsertOne
from app.config import settings from app.config import settings
from app.db import connect_db, disconnect_db from app.db import connect_db, disconnect_db
@@ -15,18 +16,18 @@ def scrape_orsr():
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb. This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
""" """
# get all links to "Aktuálny" from the orsr url # get all links to "Aktuálny" from the orsr url
html = requests.get(settings["orsr_url"]) html = requests.get(settings["base_url"]+settings["endpoint"])
soup = BeautifulSoup(html.content, "html.parser") soup = BeautifulSoup(html.content, "html.parser")
records = soup.find_all("a", string="Aktuálny") records = soup.find_all("a", string="Úplný")
records = [record["href"] for record in records] records = [settings["base_url"]+record["href"] for record in records]
# distribute the work in #of threads defined in config # distribute the work in #of threads defined in config
worker_ids = list(range(1, len(records)+1)) parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
parts = [worker_ids[i::settings["threads"]] for i in range(settings["threads"])]
with ThreadPoolExecutor() as t: with ThreadPoolExecutor() as t:
for thread_id, part in enumerate(parts): for thread_id, part in enumerate(parts):
t.submit(process_records, part, thread_id+1) t.submit(process_records, part, thread_id+1)
print("records_processed")
def process_records(records, thread): def process_records(records, thread):
@@ -38,7 +39,7 @@ def process_records(records, thread):
data = [] data = []
for i in tqdm(range(len(records)), desc=f"thread {thread}"): for i in tqdm(range(len(records)), desc=f"thread {thread}"):
record = process_record(records[i]) record = process_record(records[i])
data.append(record) data.append(InsertOne(record))
collection = connect_db() collection = connect_db()
collection.bulk_write(data) collection.bulk_write(data)
disconnect_db(collection) disconnect_db(collection)
@@ -53,6 +54,25 @@ def process_record(url):
html = requests.get(url) html = requests.get(url)
soup = BeautifulSoup(html.content, "html.parser") soup = BeautifulSoup(html.content, "html.parser")
record = {
"oddiel": get_oddiel(soup),
"vlozka": get_vlozka(soup),
"obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
"sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
"ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
"denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
"pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
"predmetyCinnosti": get_data(soup, "Predmet činnosti"),
"spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
"vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
"statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
"konanie": get_data(soup, "Konanie menom"),
"zakladneImanie": get_data(soup, "Základné imanie"),
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
"vypisUdajov": get_vypisUdajov(soup)
}
return record
def get_oddiel(soup): def get_oddiel(soup):
oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip() oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
@@ -64,6 +84,16 @@ def get_vlozka(soup):
return {"value": vlozka} return {"value": vlozka}
def get_aktualizaciaUdajov(soup):
aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
return {"value": aktualizacia}
def get_vypisUdajov(soup):
vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
return {"value": vypis}
def get_obchodneMeno(soup): def get_obchodneMeno(soup):
data = {} data = {}
@@ -91,79 +121,6 @@ def get_obchodneMeno(soup):
return data return data
def get_sidlo(soup):
data = {}
# find the table <tr> element of "Sídlo:"
return data
def get_ico(soup):
data = {}
return data
def get_denZapisu(soup):
data = {}
return data
def get_pravnaForma(soup):
data = {}
return data
def get_predmetyCinnosti(soup):
data = {}
return data
def get_spolocnici(soup):
data = {}
return data
def get_vyskaVkladov(soup):
data = {}
return data
def get_statutarnyOrgan(soup):
data = {}
return data
def get_konanie(soup):
data = {}
return data
def get_zakladneImanie(soup):
data = {}
return data
def get_aktualizaciaUdajov(soup):
data = {}
return data
def get_vypisUdajov(soup):
data = {}
return data
def process_entry(entry, value_type): def process_entry(entry, value_type):
""" """
extracts one entry from the table of entries for a given data extracts one entry from the table of entries for a given data
@@ -181,6 +138,26 @@ def process_entry(entry, value_type):
active = True active = True
lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f] lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
if value_type == "text":
value = ", ".join(lines)
elif value_type == "number":
value = int("".join(lines).replace(" ",""))
elif value_type == "spolocnici":
spolocnik = lines[0]
adresa = ", ".join(lines[1:])
value = {
"spolocnik": spolocnik,
"adresa": adresa
}
elif value_type == "vklad":
spolocnik = lines[0]
vklad = ", ".join(lines[1:])
value = {
"spolocnik": spolocnik,
"vklad": vklad
}
valid_from, valid_until = parse_oddo(valid_td.text.strip()) valid_from, valid_until = parse_oddo(valid_td.text.strip())
return value, valid_from, valid_until, active return value, valid_from, valid_until, active
@@ -236,26 +213,27 @@ def test():
record = { record = {
"oddiel": get_oddiel(soup), "oddiel": get_oddiel(soup),
"vlozka": get_vlozka(soup), "vlozka": get_vlozka(soup),
"obchodneMeno": get_obchodneMeno(soup), "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
"sidlo": get_sidlo(soup), "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
"ico": get_ico(soup), "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
"denZapisu": get_denZapisu(soup), "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
"pravnaForma": get_pravnaForma(soup), "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
"predmetyCinnosti": get_predmetyCinnosti(soup), "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
"spolocnici": get_spolocnici(soup), "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
"vyskaVkladov": get_vyskaVkladov(soup), "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
"statutarnyOrgan": get_statutarnyOrgan(soup), "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
"konanie": get_konanie(soup), "konanie": get_data(soup, "Konanie menom"),
"zakladneImanie": get_zakladneImanie(soup), "zakladneImanie": get_data(soup, "kladné imanie"),
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup), "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
"vypisUdajov": get_vypisUdajov(soup) "vypisUdajov": get_vypisUdajov(soup)
} }
print(json.dumps(record,indent=4,ensure_ascii=False)) print(json.dumps(record,indent=4,ensure_ascii=False))
collection = connect_db() collection = connect_db()
#collection.bulk_write(soup) records = [InsertOne(record)]
collection.bulk_write(records)
disconnect_db(collection) disconnect_db(collection)
if __name__ == "__main__": if __name__ == "__main__":
#scrape_orsr() scrape_orsr()
test() #test()