make data extraction more general

This commit is contained in:
2023-09-27 16:40:59 +02:00
parent a7a60ee89a
commit 91fab836bb

View File

@@ -68,7 +68,8 @@ def get_obchodneMeno(soup):
data = {} data = {}
# find the table <tr> element of "Obchodné meno:" # find the table <tr> element of "Obchodné meno:"
meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent #meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
meno_tr = get_data_td(soup, "Obchodné")
# parse the name and date # parse the name and date
active = meno_tr.find_all("span", class_="ra") active = meno_tr.find_all("span", class_="ra")
@@ -91,6 +92,88 @@ def get_obchodneMeno(soup):
return data return data
def get_sidlo(soup):
    """Extract the "Sídlo" (registered office) row from the parsed page.

    NOTE(review): stub — `sidlo_tr` is looked up but never parsed, and an
    empty dict is always returned.
    NOTE(review): `get_data_td` is not defined in this chunk; the helper
    defined below is named `get_data` — confirm which name is intended.
    """
    data = {}
    # find the table <tr> element of "Sídlo:"
    sidlo_tr = get_data_td(soup, "Sídlo")
    return data
def get_ico(soup):
    """Placeholder extractor for the record's "ico" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_denZapisu(soup):
    """Placeholder extractor for the record's "denZapisu" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_pravnaForma(soup):
    """Placeholder extractor for the record's "pravnaForma" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_predmetyCinnosti(soup):
    """Placeholder extractor for the record's "predmetyCinnosti" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_spolocnici(soup):
    """Placeholder extractor for the record's "spolocnici" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_vyskaVkladov(soup):
    """Placeholder extractor for the record's "vyskaVkladov" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_statutarnyOrgan(soup):
    """Placeholder extractor for the record's "statutarnyOrgan" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_konanie(soup):
    """Placeholder extractor for the record's "konanie" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_zakladneImanie(soup):
    """Placeholder extractor for the record's "zakladneImanie" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_aktualizaciaUdajov(soup):
    """Placeholder extractor for the record's "aktualizaciaUdajov" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_vypisUdajov(soup):
    """Placeholder extractor for the record's "vypisUdajov" field.

    Not implemented yet: returns an empty dict regardless of input.
    """
    return {}
def get_data(soup, name):
    """Locate the data ``<td>`` of the labelled table row matching *name*.

    Finds the ``<span class="tl">`` whose text matches *name* (treated as a
    regular expression, so pass a plain label substring such as "Obchodné"),
    then returns the next sibling ``<td>`` of that span's parent cell — the
    cell that holds the row's actual data.

    NOTE(review): the call sites in this file use the name ``get_data_td``
    (e.g. ``get_data_td(soup, "Obchodné")``) while this helper is named
    ``get_data`` — confirm which name is intended.

    Args:
        soup: parsed page (BeautifulSoup document) of a company extract.
        name: label text (regex pattern) identifying the row.

    Returns:
        The data ``<td>`` element for the row.

    Raises:
        AttributeError: if no matching label span exists (``find`` → None).
    """
    label_span = soup.find("span", class_="tl", string=re.compile(name))
    # Bug fix: the located cell was previously computed and then discarded
    # (a constant [] was returned).  Callers use the result as an element
    # (e.g. calling .find_all(...) on it), so return the <td> itself.
    return label_span.parent.find_next_sibling("td")
def parse_oddo(text): def parse_oddo(text):
""" """
Parses the valid_from and valid_until from string Parses the valid_from and valid_until from string
@@ -106,8 +189,9 @@ def parse_oddo(text):
return valid_from, valid_until return valid_from, valid_until
def test(): def test():
url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1" url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
html = requests.get(url) html = requests.get(url)
soup = BeautifulSoup(html.content, "html.parser") soup = BeautifulSoup(html.content, "html.parser")
@@ -115,20 +199,20 @@ def test():
"oddiel": get_oddiel(soup), "oddiel": get_oddiel(soup),
"vlozka": get_vlozka(soup), "vlozka": get_vlozka(soup),
"obchodneMeno": get_obchodneMeno(soup), "obchodneMeno": get_obchodneMeno(soup),
"sidlo": "", "sidlo": get_sidlo(soup),
"ico": "", "ico": get_ico(soup),
"denZapisu": "", "denZapisu": get_denZapisu(soup),
"pravnaForma": "", "pravnaForma": get_pravnaForma(soup),
"predmetyCinnosti": "", "predmetyCinnosti": get_predmetyCinnosti(soup),
"spolocnici": "", "spolocnici": get_spolocnici(soup),
"vyskaVkladov": "", "vyskaVkladov": get_vyskaVkladov(soup),
"statutarnyOrgan": "", "statutarnyOrgan": get_statutarnyOrgan(soup),
"konanie": "", "konanie": get_konanie(soup),
"zakladneImanie": "", "zakladneImanie": get_zakladneImanie(soup),
"aktualizaciaUdajov": "", "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
"vypisUdajov": "" "vypisUdajov": get_vypisUdajov(soup)
} }
print(json.dumps(record,indent=4)) print(json.dumps(record,indent=4,ensure_ascii=False))
collection = connect_db() collection = connect_db()
#collection.bulk_write(soup) #collection.bulk_write(soup)
disconnect_db(collection) disconnect_db(collection)