Files
softone_zadanie/scraper.py
2023-09-27 18:11:10 +02:00

261 lines
6.7 KiB
Python

import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from app.config import settings
from app.db import connect_db, disconnect_db
from time import sleep
def scrape_orsr():
    """
    Main entry point: scrape every "Aktuálny" (current) extract linked from
    the ORSR endpoint defined in config and store the results in MongoDB,
    fanning the work out over the configured number of threads.
    """
    # Collect the detail-page URLs of every "Aktuálny" anchor on the index page.
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    anchors = soup.find_all("a", string="Aktuálny")
    records = [anchor["href"] for anchor in anchors]
    # Split the URL list round-robin into settings["threads"] chunks.
    # BUG FIX: the original partitioned a list of 1-based integer ids and
    # submitted those ints to process_records, which expects URLs — workers
    # would have called requests.get() on integers. Partition the URLs.
    n_threads = settings["threads"]
    parts = [records[i::n_threads] for i in range(n_threads)]
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        for thread_id, part in enumerate(parts, start=1):
            executor.submit(process_records, part, thread_id)
def process_records(records, thread):
    """
    Worker that scrapes a batch of records in one thread and stores the
    results in MongoDB.

    :param records: list of record URLs to process
    :param thread: 1-based id of the processing thread (used for the progress bar)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        record = process_record(url)
        # Skip records the scraper could not parse (process_record may
        # return None) so the bulk insert does not blow up on bad entries.
        if record:
            data.append(record)
    collection = connect_db()
    # BUG FIX: pymongo's bulk_write() requires operation objects (InsertOne,
    # UpdateOne, ...); a list of plain dicts must go through insert_many().
    if data:
        collection.insert_many(data)
    disconnect_db(collection)
def process_record(url):
    """
    Process one record: fetch *url* and parse it with BeautifulSoup.

    NOTE(review): as visible here the function only builds the soup and
    never returns anything, so callers receive None; the record-assembly
    code promised by the docstring appears to be missing or truncated —
    confirm against the full file before relying on this.

    :param url: url of the record
    :return dictionary of parameters
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
def get_oddiel(soup):
    """Return {"value": ...} holding the "Oddiel:" (section) field of the extract."""
    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))
    value = label.parent.find("span", class_="ra").text.strip()
    return {"value": value}
def get_vlozka(soup):
    """Return {"value": ...} holding the "Vložka" (insert number) field of the extract."""
    label = soup.find("span", class_="tl", string=re.compile("Vložka"))
    value = label.parent.find("span", class_="ra").text.strip()
    return {"value": value}
def get_obchodneMeno(soup):
    """
    Extract the "Obchodné meno:" (business name) field.

    :param soup: BeautifulSoup of the extract page
    :return: dict with keys "value", "valid_from", "valid_until" and
             "old_values" (a list of dicts with the same three keys)
    """
    data = {}
    # Find the table <tr> element containing the "Obchodné meno:" label.
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
    # "ra" spans hold the currently-active name followed by its validity string.
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if len(active) == 0:
        # BUG FIX: the original unpacked two values into three targets
        # ('value, valid_from, valid_until = "", ""'), raising ValueError
        # whenever no active entry exists.
        value, valid_from, valid_until = "", "", ""
    else:
        value, valid = active[0], active[1]
        valid_from, valid_until = parse_oddo(valid)
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # "ro" spans hold historic names; entries alternate name / validity string.
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    if len(old) == 0:
        old_values = []
    else:
        old_values = [
            {"value": name, "valid_from": oddo[0], "valid_until": oddo[1]}
            for name, oddo in zip(old[::2], map(parse_oddo, old[1::2]))
        ]
    data.update({"old_values": old_values})
    return data
def get_sidlo(soup):
    """Placeholder for the "Sídlo:" (registered seat) field — not implemented yet, returns an empty dict."""
    return {}
def get_ico(soup):
    """Placeholder for the "IČO" (company id number) field — not implemented yet, returns an empty dict."""
    return {}
def get_denZapisu(soup):
    """Placeholder for the "Deň zápisu" (registration date) field — not implemented yet, returns an empty dict."""
    return {}
def get_pravnaForma(soup):
    """Placeholder for the "Právna forma" (legal form) field — not implemented yet, returns an empty dict."""
    return {}
def get_predmetyCinnosti(soup):
    """Placeholder for the "Predmety činnosti" (business activities) field — not implemented yet, returns an empty dict."""
    return {}
def get_spolocnici(soup):
    """Placeholder for the "Spoločníci" (shareholders) field — not implemented yet, returns an empty dict."""
    return {}
def get_vyskaVkladov(soup):
    """Placeholder for the "Výška vkladov" (capital contributions) field — not implemented yet, returns an empty dict."""
    return {}
def get_statutarnyOrgan(soup):
    """Placeholder for the "Štatutárny orgán" (statutory body) field — not implemented yet, returns an empty dict."""
    return {}
def get_konanie(soup):
    """Placeholder for the "Konanie" (acting on behalf) field — not implemented yet, returns an empty dict."""
    return {}
def get_zakladneImanie(soup):
    """Placeholder for the "Základné imanie" (registered capital) field — not implemented yet, returns an empty dict."""
    return {}
def get_aktualizaciaUdajov(soup):
    """Placeholder for the "Aktualizácia údajov" (data last updated) field — not implemented yet, returns an empty dict."""
    return {}
def get_vypisUdajov(soup):
    """Placeholder for the "Výpis údajov" (extract date) field — not implemented yet, returns an empty dict."""
    return {}
def process_entry(entry, value_type):
    """
    Extract one entry from the table of entries for a given datum.

    NOTE(review): only the `active` flag is actually computed here —
    value, valid_from and valid_until are always returned as None (so
    get_data skips every entry) and *value_type* is never read. The value
    parsing logic appears to be missing or truncated; confirm against the
    full file before relying on this.

    :param entry: one table element of data
    :param value_type: type of the value data (currently unused)
    :return: tuple: (value, valid_from, valid_until, active)
    """
    value, valid_from, valid_until, active = None, None, None, False
    value_td, valid_td = entry.find_all("td")
    # Active entries carry the "ra" CSS class on the value span.
    if value_td.span.attrs["class"][0] == "ra":
        active = True
    return value, valid_from, valid_until, active
def get_data(soup, name, value_type="text", allow_multiple_active=True):
    """
    Generic extractor: locate the row labelled *name* and collect its
    active and historic entries.

    :param soup: BeautifulSoup of the extract page
    :param name: label text to search for (used as a regex fragment)
    :param value_type: value type forwarded to process_entry
    :param allow_multiple_active: if False, the single active entry is
        flattened into the top level of the result instead of a "values" list
    :return: dict with "old_values" plus either "values" or the flattened
        single active entry
    """
    label = soup.find("span", class_="tl", string=re.compile(f"{name}"))
    data_td = label.parent.find_next_sibling("td")
    data = {}
    values = []
    old_values = []
    for entry in data_td.find_all("table"):
        value, valid_from, valid_until, active = process_entry(entry, value_type)
        # process_entry signals an unparseable entry with value=None.
        if value is None:
            continue
        item = {"value": value, "valid_from": valid_from, "valid_until": valid_until}
        if active:
            values.append(item)
        else:
            old_values.append(item)
    if not allow_multiple_active:
        # BUG FIX: guard against an empty list — the original indexed
        # values[0] unconditionally and raised IndexError whenever no
        # active entry was parsed.
        if values:
            data.update(values[0])
    else:
        data.update({"values": values})
    data.update({"old_values": old_values})
    return data
def parse_oddo(text):
    """
    Parse the validity dates out of a string in the format
    "(od: DD.MM.YYYY do: DD.MM.YYYY)".

    :param text: the od/do validity string
    :return: tuple (valid_from, valid_until); a missing part comes back as ""
    """
    def _date_after(marker):
        # Each date is exactly 10 characters ("DD.MM.YYYY") after its marker.
        pos = text.find(marker)
        if pos == -1:
            return ""
        start = pos + len(marker)
        return text[start:start + 10]

    return _date_after("od: "), _date_after("do: ")
def test():
    """Scrape one hard-coded extract, pretty-print it as JSON, and exercise the DB connection."""
    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # Map output keys to their extractor functions, then apply each in turn.
    getters = {
        "oddiel": get_oddiel,
        "vlozka": get_vlozka,
        "obchodneMeno": get_obchodneMeno,
        "sidlo": get_sidlo,
        "ico": get_ico,
        "denZapisu": get_denZapisu,
        "pravnaForma": get_pravnaForma,
        "predmetyCinnosti": get_predmetyCinnosti,
        "spolocnici": get_spolocnici,
        "vyskaVkladov": get_vyskaVkladov,
        "statutarnyOrgan": get_statutarnyOrgan,
        "konanie": get_konanie,
        "zakladneImanie": get_zakladneImanie,
        "aktualizaciaUdajov": get_aktualizaciaUdajov,
        "vypisUdajov": get_vypisUdajov,
    }
    record = {key: getter(soup) for key, getter in getters.items()}
    print(json.dumps(record, indent=4, ensure_ascii=False))
    collection = connect_db()
    disconnect_db(collection)
if __name__ == "__main__":
    # Full scrape is disabled; run the single-record smoke test instead.
    #scrape_orsr()
    test()