From b34aa10521ad56aec9126d8e68be03dc601bf86b Mon Sep 17 00:00:00 2001 From: Oto Imrich Date: Wed, 27 Sep 2023 18:11:10 +0200 Subject: [PATCH] parsing one entry --- scraper.py | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/scraper.py b/scraper.py index ac0d244..f672671 100644 --- a/scraper.py +++ b/scraper.py @@ -68,8 +68,7 @@ def get_obchodneMeno(soup): data = {} # find the table element of "Obchodné meno:" - #meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent - meno_tr = get_data_td(soup, "Obchodné") + meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent # parse the name and date active = meno_tr.find_all("span", class_="ra") @@ -96,7 +95,6 @@ def get_sidlo(soup): data = {} # find the table element of "Sídlo:" - sidlo_tr = get_data_td(soup, "Sídlo") return data @@ -166,12 +164,51 @@ def get_vypisUdajov(soup): return data -def get_data(soup, name): +def process_entry(entry, value_type): + """ + extracts one entry from the table of entries for a given data + :param entry: one table element of data + :param value_type: type of the value data + :return: tuple: (value, valid_from, valid_until, active) + + """ + value, valid_from, valid_until, active = None, None, None, False + + value_td, valid_td = entry.find_all("td") + + # Check if active entry + if value_td.span.attrs["class"][0] == "ra": + active = True + + + + return value, valid_from, valid_until, active + + +def get_data(soup, name, value_type="text", allow_multiple_active=True): data_td = soup.find("span", class_="tl", string=re.compile(f"{name}")).parent.find_next_sibling("td") + data = {} - return [] + values = [] + old_values = [] + for entry in data_td.find_all("table"): + value, valid_from, valid_until, active = process_entry(entry, value_type) + if value is None: + continue + if active: + values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until}) + else: + old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until}) + + if not allow_multiple_active: + data.update(values[0]) + else: + data.update({"values": values}) + data.update({"old_values": old_values}) + + return data def parse_oddo(text):