working scraper

2023-09-28 11:53:38 +02:00
parent 229e4ab01d
commit 3460c718d2
3 changed files with 61 additions and 78 deletions

View File

@@ -8,4 +8,4 @@
 #ENDPOINT = hladaj_zmeny.asp
 [APP]
-THREADS = 1
+THREADS = 32

View File

@@ -1,5 +1,5 @@
 [DB]
-MONGODB_URI = mongodb://localhost:27017/softone
+MONGODB_URI = mongodb://localhost:27017
 MONGODB_DB = softone
 MONGODB_COLLECTION = orsr
@@ -8,4 +8,4 @@ BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp
 [APP]
-THREADS = 4
+THREADS = 32
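Both files follow the same INI layout, so for reference here is a minimal sketch of how these settings are presumably loaded. The real loader lives in the app's config module, which this commit does not touch; `load_settings` and the returned key names are assumptions:

    import configparser

    def load_settings(path="config.ini"):
        # hypothetical loader; the project reads these keys in its own config module
        cfg = configparser.ConfigParser()
        cfg.read(path)
        return {
            "mongodb_uri": cfg["DB"]["MONGODB_URI"],
            "mongodb_db": cfg["DB"]["MONGODB_DB"],
            "collection": cfg["DB"]["MONGODB_COLLECTION"],
            "threads": cfg.getint("APP", "THREADS"),  # 32 after this commit
        }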

View File

@@ -1,6 +1,7 @@
 import requests
 import re
 import json
+import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
 from concurrent.futures import ThreadPoolExecutor
@@ -11,6 +12,13 @@ from app.db import connect_db, disconnect_db
 from time import sleep

+single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
+value_type_dict = {
+    "IČO:": "number",
+    "Spoločníci:": "spolocnici",
+    "Výška vkladu každého spoločníka:": "vklad"
+}
+
 def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -24,10 +32,11 @@ def scrape_orsr():
     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
+    print(f"Processing {len(records)} records using {settings['threads']} threads:")
     with ThreadPoolExecutor() as t:
         for thread_id, part in enumerate(parts):
             t.submit(process_records, part, thread_id+1)
-    print("records_processed")
+    print("All records_processed")


 def process_records(records, thread):
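The `records[i::settings["threads"]]` slice stripes the record list round-robin across threads, so each thread gets a near-equal share. A quick worked example with 5 records and 2 threads:

    records = ["r0", "r1", "r2", "r3", "r4"]
    threads = 2
    parts = [records[i::threads] for i in range(threads)]
    # parts == [["r0", "r2", "r4"], ["r1", "r3"]]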
@@ -37,7 +46,8 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    #for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
         record = process_record(records[i])
         data.append(InsertOne(record))
     collection = connect_db()
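Each thread buffers one `InsertOne` per record and writes the whole batch in a single `bulk_write` round trip. A self-contained sketch of that pattern, assuming the URI, database, and collection names from the config above (the project's own `connect_db`/`disconnect_db` wrap this):

    from pymongo import MongoClient, InsertOne

    client = MongoClient("mongodb://localhost:27017")
    collection = client["softone"]["orsr"]
    ops = [InsertOne({"ico": {"value": "12345678"}})]  # hypothetical record
    collection.bulk_write(ops)  # one batched round trip instead of N single inserts
    client.close()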
@@ -51,26 +61,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
-    url = "https://www.orsr.sk/vypis.asp?ID=471674&SID=4&P=1"
     html = requests.get(url)
     soup = BeautifulSoup(html.content, "html.parser")
-    record = {
-        "oddiel": get_oddiel(soup),
-        "vlozka": get_vlozka(soup),
-        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
-        "sidlo": get_data(soup, "Sídlo", allow_multiple_active=False),
-        "ico": get_data(soup, "IČO", value_type="number", allow_multiple_active=False),
-        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
-        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
-        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
-        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
-        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
-        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
-        "konanie": get_data(soup, "Konanie menom"),
-        "zakladneImanie": get_data(soup, "Základné imanie"),
-        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
-        "vypisUdajov": get_vypisUdajov(soup)
-    }
+    record = get_record_data(soup)
     return record
@@ -94,31 +90,47 @@ def get_vypisUdajov(soup):
     return {"value": vypis}


-def get_obchodneMeno(soup):
-    data = {}
-    # find the table <tr> element of "Obchodné meno:"
-    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
-    # parse the name and date
-    active = meno_tr.find_all("span", class_="ra")
-    active = [x.text.strip() for x in active]
-    if len(active) == 0:
-        value, valid_from, valid_until = "", ""
-    else:
-        value, valid = active[0], active[1]
-        valid_from, valid_until = parse_oddo(valid)
-    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-    # check for older entries
-    old = meno_tr.find_all("span", class_="ro")
-    old = [x.text.strip() for x in old]
-    if len(old) == 0:
-        old_values = []
-    else:
-        old_values = [{"value": y[0], "valid_from": y[1][0], "valid_until": y[1][1]} for y in zip(old[::2], list(map(parse_oddo, old[1::2])))]
-    data.update({"old_values": old_values})
-    return data
+def get_record_data(soup):
+    record = {
+        "oddiel": get_oddiel(soup),
+        "vlozka": get_vlozka(soup)
+    }
+    # find the last table before the variable data
+    entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+    while True:
+        entry = entry.find_next_sibling("table")
+        entry_tr = entry.find_all("tr")
+        entry_tr = [i for i in entry_tr if i.parent == entry_tr[0].parent]
+        if len(entry_tr) > 1:  # last table, with "Dátum aktualizácie údajov"
+            break
+        # get entry name and entry data
+        entry_container = entry_tr[0].find_all("td")
+        entry_name = entry_container[0].text.strip()
+        allow_multiple_active = True
+        value_type = "text"
+        if entry_name in single_value:
+            allow_multiple_active = False
+        if (v_type := value_type_dict.get(entry_name)) is not None:
+            value_type = v_type
+        entry_name = transform_entry_name(entry_name)
+        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({entry_name: entry_data})
+    record.update({
+        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
+        "vypisUdajov": get_vypisUdajov(soup)
+    })
+    return record
+
+
+def transform_entry_name(name):
+    s = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode().replace(":", "").lower().split()
+    return s[0].lower() + "".join(w.capitalize() for w in s[1:])


 def process_entry(entry, value_type):
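`transform_entry_name` strips diacritics (NFKD normalization followed by an ASCII re-encode), drops the trailing colon, and camelCases the remaining words, so the scraped Slovak labels become the same kind of keys the old hard-coded dict used:

    transform_entry_name("Obchodné meno:")  # -> "obchodneMeno"
    transform_entry_name("Deň zápisu:")     # -> "denZapisu"
    transform_entry_name("IČO:")            # -> "ico"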
@@ -137,7 +149,7 @@ def process_entry(entry, value_type):
     if value_td.span.attrs["class"][0] == "ra":
         active = True
-    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
+    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br", "span"])]).split("\n") if f]

     if value_type == "text":
         value = ", ".join(lines)
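Restricting `find_all` to `["br", "span"]` keeps nested markup from being flattened twice: each `<br>` becomes a line break, each `<span>` contributes its text, and the join/split pass yields one string per visual line. A minimal standalone check of that behavior (the HTML snippet is illustrative):

    from bs4 import BeautifulSoup

    td = BeautifulSoup(
        '<td><span class="ra">Hlavná 1</span><br/><span class="ra">Bratislava</span></td>',
        "html.parser").td
    tokens = ["\n" if x.name == "br" else x.text.strip() for x in td.find_all(["br", "span"])]
    lines = [f.strip() for f in " ".join(tokens).split("\n") if f]
    # lines == ["Hlavná 1", "Bratislava"]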
@@ -163,8 +175,8 @@ def process_entry(entry, value_type):
     return value, valid_from, valid_until, active


-def get_data(soup, name, value_type="text", allow_multiple_active=True):
-    data_td = soup.find("span", class_="tl", string=re.compile(f"{name}")).parent.find_next_sibling("td")
+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    data_td = data_td
     data = {}
@@ -181,7 +193,8 @@ def get_data(soup, name, value_type="text", allow_multiple_active=True):
         old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})

     if not allow_multiple_active:
-        data.update(values[0])
+        if len(values) > 0:
+            data.update(values[0])
     else:
         data.update({"values": values})
     data.update({"old_values": old_values})
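With the length guard, a label that currently has no active value simply keeps its `old_values` history instead of raising `IndexError` on `values[0]`. The two result shapes, with field names taken from the surrounding code (the values shown are illustrative):

    # allow_multiple_active=False, e.g. "Obchodné meno:"
    {"value": "Example s.r.o.", "valid_from": "01.01.2020", "valid_until": "", "old_values": []}

    # allow_multiple_active=True, e.g. "Predmet činnosti:"
    {"values": [{"value": "...", "valid_from": "...", "valid_until": "..."}], "old_values": []}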
@@ -205,35 +218,5 @@ def parse_oddo(text):
     return valid_from, valid_until


-def test():
-    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
-    html = requests.get(url)
-    soup = BeautifulSoup(html.content, "html.parser")
-    record = {
-        "oddiel": get_oddiel(soup),
-        "vlozka": get_vlozka(soup),
-        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
-        "sidlo": get_data(soup, "Sídlo", allow_multiple_active=False),
-        "ico": get_data(soup, "IČO", value_type="number", allow_multiple_active=False),
-        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
-        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
-        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
-        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
-        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
-        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
-        "konanie": get_data(soup, "Konanie menom"),
-        "zakladneImanie": get_data(soup, "Základné imanie"),
-        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
-        "vypisUdajov": get_vypisUdajov(soup)
-    }
-    print(json.dumps(record, indent=4, ensure_ascii=False))
-    collection = connect_db()
-    records = [InsertOne(record)]
-    collection.bulk_write(records)
-    disconnect_db(collection)
-
-
 if __name__ == "__main__":
     scrape_orsr()
-    #test()