import json
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from pymongo import InsertOne
from tqdm.auto import tqdm

from app.config import settings
from app.db import connect_db, disconnect_db

def scrape_orsr():
    """
    Main entry point: scrapes every record linked from the endpoint defined in
    the config and stores the parsed data in MongoDB.
    """
    # collect all links to the "Úplný" (full extract) view from the ORSR listing page
    html = requests.get(settings["base_url"] + settings["endpoint"])
    soup = BeautifulSoup(html.content, "html.parser")
    records = soup.find_all("a", string="Úplný")
    records = [settings["base_url"] + record["href"] for record in records]
    # split the work round-robin across the number of threads defined in the config
    parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
    with ThreadPoolExecutor(max_workers=settings["threads"]) as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id + 1)
    # the context manager blocks until all submitted workers have finished
    print("records processed")

def process_records(records, thread):
    """
    Worker that processes a batch of records in one thread.
    :param records: list of URLs of records to process
    :param thread: id of the processing thread (used for the progress bar)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        record = process_record(url)
        data.append(InsertOne(record))
    if data:  # bulk_write raises InvalidOperation on an empty operation list
        collection = connect_db()
        collection.bulk_write(data)
        disconnect_db(collection)

def process_record(url):
    """
    Process one record: scrape the URL and parse it into a dictionary.
    :param url: URL of the record
    :return: dictionary of parsed parameters
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
        "sidlo": get_data(soup, "Sídlo", allow_multiple_active=False),
        "ico": get_data(soup, "IČO", value_type="number", allow_multiple_active=False),
        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
        "konanie": get_data(soup, "Konanie menom"),
        "zakladneImanie": get_data(soup, "Základné imanie"),
        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
        "vypisUdajov": get_vypisUdajov(soup),
    }
    return record

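# Illustrative shape of one parsed record (values depend on the company; fields
# parsed with allow_multiple_active=False are flattened by get_data):
#   {
#       "oddiel": {"value": "Sro"},
#       "ico": {"value": 12345678, "valid_from": "DD.MM.YYYY", "valid_until": "", "old_values": []},
#       "predmetyCinnosti": {"values": [...], "old_values": [...]},
#       ...
#   }
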
def get_oddiel(soup):
    oddiel = (soup.find("span", class_="tl", string=re.compile("Oddiel:"))
              .parent.find("span", class_="ra").text.strip())
    return {"value": oddiel}

def get_vlozka(soup):
    vlozka = (soup.find("span", class_="tl", string=re.compile("Vložka"))
              .parent.find("span", class_="ra").text.strip())
    return {"value": vlozka}

def get_aktualizaciaUdajov(soup):
    aktualizacia = (soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie"))
                    .find_next_sibling("td").text.strip())
    return {"value": aktualizacia}

def get_vypisUdajov(soup):
    vypis = (soup.find("td", class_="tl", string=re.compile("Dátum výpisu"))
             .find_next_sibling("td").text.strip())
    return {"value": vypis}

def get_obchodneMeno(soup):
    # NOTE: unused; process_record parses this field via get_data instead
    data = {}
    # find the table element of "Obchodné meno:"
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
    # parse the name and its validity dates
    active = meno_tr.find_all("span", class_="ra")
    active = [x.text.strip() for x in active]
    if len(active) == 0:
        value, valid_from, valid_until = "", "", ""
    else:
        value, valid = active[0], active[1]
        valid_from, valid_until = parse_oddo(valid)
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # check for older (inactive) entries
    old = meno_tr.find_all("span", class_="ro")
    old = [x.text.strip() for x in old]
    if len(old) == 0:
        old_values = []
    else:
        # names and date ranges alternate, so pair them up
        old_values = [
            {"value": name, "valid_from": dates[0], "valid_until": dates[1]}
            for name, dates in zip(old[::2], map(parse_oddo, old[1::2]))
        ]
    data.update({"old_values": old_values})
    return data

def process_entry(entry, value_type):
    """
    Extracts one entry from the table of entries for a given field.
    :param entry: one table element of data
    :param value_type: type of the value data
    :return: tuple (value, valid_from, valid_until, active)
    """
    value, valid_from, valid_until, active = None, None, None, False
    value_td, valid_td = entry.find_all("td")
    # an "ra" span marks a currently active entry, "ro" an old one
    if value_td.span.attrs["class"][0] == "ra":
        active = True
    # flatten the cell: <br> tags become line breaks, then split into non-empty lines
    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
    if value_type == "text":
        value = ", ".join(lines)
    elif value_type == "number":
        value = int("".join(lines).replace(" ", ""))
    elif value_type == "spolocnici":
        # first line is the partner's name, the rest is the address
        spolocnik = lines[0]
        adresa = ", ".join(lines[1:])
        value = {"spolocnik": spolocnik, "adresa": adresa}
    elif value_type == "vklad":
        # first line is the partner's name, the rest is the deposit amount
        spolocnik = lines[0]
        vklad = ", ".join(lines[1:])
        value = {"spolocnik": spolocnik, "vklad": vklad}
    valid_from, valid_until = parse_oddo(valid_td.text.strip())
    return value, valid_from, valid_until, active

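# Resulting `value` shapes per value_type (derived from the branches above;
# the concrete strings and number are illustrative):
#   "text"       -> "line one, line two"
#   "number"     -> 12345678                 (spaces stripped before int())
#   "spolocnici" -> {"spolocnik": "<name>", "adresa": "<address>"}
#   "vklad"      -> {"spolocnik": "<name>", "vklad": "<amount>"}
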
def get_data(soup, name, value_type="text", allow_multiple_active=True):
    """
    Finds the field labelled `name` on the page and parses all of its entries.
    :param allow_multiple_active: if False, the single active entry is flattened
        into the returned dict instead of being wrapped in a "values" list
    """
    data_td = soup.find("span", class_="tl", string=re.compile(name)).parent.find_next_sibling("td")
    data = {}
    values = []
    old_values = []
    for entry in data_td.find_all("table"):
        value, valid_from, valid_until, active = process_entry(entry, value_type)
        if value is None:
            continue
        if active:
            values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
        else:
            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    if not allow_multiple_active:
        # single-valued field: flatten the active entry (guard against it being absent)
        if values:
            data.update(values[0])
    else:
        data.update({"values": values})
    data.update({"old_values": old_values})
    return data

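# Return shape of get_data (derived from the logic above):
#   allow_multiple_active=True:
#       {"values": [{"value": ..., "valid_from": ..., "valid_until": ...}, ...],
#        "old_values": [...]}
#   allow_multiple_active=False:
#       {"value": ..., "valid_from": ..., "valid_until": ..., "old_values": [...]}
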
def parse_oddo(text):
    """
    Parses the valid_from and valid_until dates from a string.
    :param text: od/do dates in the format "(od: DD.MM.YYYY do: DD.MM.YYYY)"
    :return: tuple (valid_from, valid_until); missing dates come back as ""
    """
    valid_from, valid_until = "", ""
    if (start_from := text.find("od: ")) > -1:
        valid_from = text[start_from + 4:start_from + 14]
    if (start_until := text.find("do: ")) > -1:
        valid_until = text[start_until + 4:start_until + 14]
    return valid_from, valid_until

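# Examples (dates are illustrative):
#   parse_oddo("(od: 01.02.2020 do: 31.12.2021)")  -> ("01.02.2020", "31.12.2021")
#   parse_oddo("(od: 01.02.2020)")                 -> ("01.02.2020", "")
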
def test():
    """Scrape one known record, print it, and insert it into MongoDB."""
    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
    # reuse process_record instead of duplicating the parsing code inline
    record = process_record(url)
    print(json.dumps(record, indent=4, ensure_ascii=False))
    collection = connect_db()
    collection.bulk_write([InsertOne(record)])
    disconnect_db(collection)

if __name__ == "__main__":
    scrape_orsr()
    # test()