Files
softone_zadanie/scraper.py
2023-09-27 18:11:10 +02:00

261 lines
6.7 KiB
Python

import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from app.config import settings
from app.db import connect_db, disconnect_db
from time import sleep
def scrape_orsr():
    """
    Main entry point: scrape every "Aktuálny" (current) extract linked from
    the ORSR endpoint defined in config and store the results in MongoDB,
    fanning the work out over the configured number of threads.
    """
    # Collect the detail-page URLs of every "Aktuálny" anchor on the index page.
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    anchors = soup.find_all("a", string="Aktuálny")
    records = [anchor["href"] for anchor in anchors]
    # Split the URL list round-robin into settings["threads"] chunks.
    # BUG FIX: the original partitioned a list of 1-based integer ids and
    # submitted those ints to process_records, which expects URLs — workers
    # would have called requests.get() on integers. Partition the URLs.
    n_threads = settings["threads"]
    parts = [records[i::n_threads] for i in range(n_threads)]
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        for thread_id, part in enumerate(parts, start=1):
            executor.submit(process_records, part, thread_id)
def process_records(records, thread):
    """
    Worker that scrapes a batch of records in one thread and stores the
    results in MongoDB.

    :param records: list of record URLs to process
    :param thread: 1-based id of the processing thread (used for the progress bar)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        record = process_record(url)
        # Skip records the scraper could not parse (process_record may
        # return None) so the bulk insert does not blow up on bad entries.
        if record:
            data.append(record)
    collection = connect_db()
    # BUG FIX: pymongo's bulk_write() requires operation objects (InsertOne,
    # UpdateOne, ...); a list of plain dicts must go through insert_many().
    if data:
        collection.insert_many(data)
    disconnect_db(collection)
def process_record(url):
    """
    Process one record: fetch *url* and parse it with BeautifulSoup.

    NOTE(review): as visible here the function only builds the soup and
    never returns anything, so callers receive None; the record-assembly
    code promised by the docstring appears to be missing or truncated —
    confirm against the full file before relying on this.

    :param url: url of the record
    :return dictionary of parameters
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
def get_oddiel(soup):
    """Return {"value": ...} holding the "Oddiel:" (section) field of the extract."""
    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))
    value = label.parent.find("span", class_="ra").text.strip()
    return {"value": value}
def get_vlozka(soup):
    """Return {"value": ...} holding the "Vložka" (insert number) field of the extract."""
    label = soup.find("span", class_="tl", string=re.compile("Vložka"))
    value = label.parent.find("span", class_="ra").text.strip()
    return {"value": value}
def get_obchodneMeno(soup):
    """
    Extract the "Obchodné meno:" (business name) field.

    :param soup: BeautifulSoup of the extract page
    :return: dict with keys "value", "valid_from", "valid_until" and
             "old_values" (a list of dicts with the same three keys)
    """
    data = {}
    # Find the table <tr> element containing the "Obchodné meno:" label.
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
    # "ra" spans hold the currently-active name followed by its validity string.
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if len(active) == 0:
        # BUG FIX: the original unpacked two values into three targets
        # ('value, valid_from, valid_until = "", ""'), raising ValueError
        # whenever no active entry exists.
        value, valid_from, valid_until = "", "", ""
    else:
        value, valid = active[0], active[1]
        valid_from, valid_until = parse_oddo(valid)
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # "ro" spans hold historic names; entries alternate name / validity string.
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    if len(old) == 0:
        old_values = []
    else:
        old_values = [
            {"value": name, "valid_from": oddo[0], "valid_until": oddo[1]}
            for name, oddo in zip(old[::2], map(parse_oddo, old[1::2]))
        ]
    data.update({"old_values": old_values})
    return data
def get_sidlo(soup):
    """Placeholder for the "Sídlo:" (registered seat) field — not implemented yet, returns an empty dict."""
    return {}
def get_ico(soup):
    """Placeholder for the "IČO" (company id number) field — not implemented yet, returns an empty dict."""
    return {}
def get_denZapisu(soup):
    """Placeholder for the "Deň zápisu" (registration date) field — not implemented yet, returns an empty dict."""
    return {}
def get_pravnaForma(soup):
    """Placeholder for the "Právna forma" (legal form) field — not implemented yet, returns an empty dict."""
    return {}
def get_predmetyCinnosti(soup):
    """Placeholder for the "Predmety činnosti" (business activities) field — not implemented yet, returns an empty dict."""
    return {}
def get_spolocnici(soup):
    """Placeholder for the "Spoločníci" (shareholders) field — not implemented yet, returns an empty dict."""
    return {}
def get_vyskaVkladov(soup):
    """Placeholder for the "Výška vkladov" (capital contributions) field — not implemented yet, returns an empty dict."""
    return {}
def get_statutarnyOrgan(soup):
    """Placeholder for the "Štatutárny orgán" (statutory body) field — not implemented yet, returns an empty dict."""
    return {}
def get_konanie(soup):
    """Placeholder for the "Konanie" (acting on behalf) field — not implemented yet, returns an empty dict."""
    return {}
def get_zakladneImanie(soup):
    """Placeholder for the "Základné imanie" (registered capital) field — not implemented yet, returns an empty dict."""
    return {}
def get_aktualizaciaUdajov(soup):
    """Placeholder for the "Aktualizácia údajov" (data last updated) field — not implemented yet, returns an empty dict."""
    return {}
def get_vypisUdajov(soup):
    """Placeholder for the "Výpis údajov" (extract date) field — not implemented yet, returns an empty dict."""
    return {}
def process_entry(entry, value_type):
    """
    Extract one entry from the table of entries for a given datum.

    NOTE(review): only the `active` flag is actually computed here —
    value, valid_from and valid_until are always returned as None (so
    get_data skips every entry) and *value_type* is never read. The value
    parsing logic appears to be missing or truncated; confirm against the
    full file before relying on this.

    :param entry: one table element of data
    :param value_type: type of the value data (currently unused)
    :return: tuple: (value, valid_from, valid_until, active)
    """
    value, valid_from, valid_until, active = None, None, None, False
    value_td, valid_td = entry.find_all("td")
    # Active entries carry the "ra" CSS class on the value span.
    if value_td.span.attrs["class"][0] == "ra":
        active = True
    return value, valid_from, valid_until, active
def get_data(soup, name, value_type="text", allow_multiple_active=True):
    """
    Generic extractor: locate the row labelled *name* and collect its
    active and historic entries.

    :param soup: BeautifulSoup of the extract page
    :param name: label text to search for (used as a regex fragment)
    :param value_type: value type forwarded to process_entry
    :param allow_multiple_active: if False, the single active entry is
        flattened into the top level of the result instead of a "values" list
    :return: dict with "old_values" plus either "values" or the flattened
        single active entry
    """
    label = soup.find("span", class_="tl", string=re.compile(f"{name}"))
    data_td = label.parent.find_next_sibling("td")
    data = {}
    values = []
    old_values = []
    for entry in data_td.find_all("table"):
        value, valid_from, valid_until, active = process_entry(entry, value_type)
        # process_entry signals an unparseable entry with value=None.
        if value is None:
            continue
        item = {"value": value, "valid_from": valid_from, "valid_until": valid_until}
        if active:
            values.append(item)
        else:
            old_values.append(item)
    if not allow_multiple_active:
        # BUG FIX: guard against an empty list — the original indexed
        # values[0] unconditionally and raised IndexError whenever no
        # active entry was parsed.
        if values:
            data.update(values[0])
    else:
        data.update({"values": values})
    data.update({"old_values": old_values})
    return data
def parse_oddo(text):
    """
    Parse the validity dates out of a string in the format
    "(od: DD.MM.YYYY do: DD.MM.YYYY)".

    :param text: the od/do validity string
    :return: tuple (valid_from, valid_until); a missing part comes back as ""
    """
    def _date_after(marker):
        # Each date is exactly 10 characters ("DD.MM.YYYY") after its marker.
        pos = text.find(marker)
        if pos == -1:
            return ""
        start = pos + len(marker)
        return text[start:start + 10]

    return _date_after("od: "), _date_after("do: ")
def test():
    """Scrape one hard-coded extract, pretty-print it as JSON, and exercise the DB connection."""
    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # Map output keys to their extractor functions, then apply each in turn.
    getters = {
        "oddiel": get_oddiel,
        "vlozka": get_vlozka,
        "obchodneMeno": get_obchodneMeno,
        "sidlo": get_sidlo,
        "ico": get_ico,
        "denZapisu": get_denZapisu,
        "pravnaForma": get_pravnaForma,
        "predmetyCinnosti": get_predmetyCinnosti,
        "spolocnici": get_spolocnici,
        "vyskaVkladov": get_vyskaVkladov,
        "statutarnyOrgan": get_statutarnyOrgan,
        "konanie": get_konanie,
        "zakladneImanie": get_zakladneImanie,
        "aktualizaciaUdajov": get_aktualizaciaUdajov,
        "vypisUdajov": get_vypisUdajov,
    }
    record = {key: getter(soup) for key, getter in getters.items()}
    print(json.dumps(record, indent=4, ensure_ascii=False))
    collection = connect_db()
    disconnect_db(collection)
if __name__ == "__main__":
    # Full scrape is disabled; run the single-record smoke test instead.
    #scrape_orsr()
    test()