make data extraction more general
This commit is contained in:
114
scraper.py
114
scraper.py
@@ -68,7 +68,8 @@ def get_obchodneMeno(soup):
|
|||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
# find the table <tr> element of "Obchodné meno:"
|
# find the table <tr> element of "Obchodné meno:"
|
||||||
meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
|
#meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
|
||||||
|
meno_tr = get_data_td(soup, "Obchodné")
|
||||||
|
|
||||||
# parse the name and date
|
# parse the name and date
|
||||||
active = meno_tr.find_all("span", class_="ra")
|
active = meno_tr.find_all("span", class_="ra")
|
||||||
@@ -91,6 +92,88 @@ def get_obchodneMeno(soup):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def get_sidlo(soup):
    """Placeholder extractor for the ``sidlo`` field of a record.

    Args:
        soup: Parsed page that is handed on to ``get_data_td``.

    Returns:
        dict: Always a new empty dictionary; the located section is not
        parsed yet.
    """
    # Look up the "Sídlo" section; the result is not used so far, but the
    # lookup is kept so the function behaves exactly as before.
    _sidlo_cell = get_data_td(soup, "Sídlo")
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_ico(soup):
    """Placeholder extractor for the ``ico`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_denZapisu(soup):
    """Placeholder extractor for the ``denZapisu`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_pravnaForma(soup):
    """Placeholder extractor for the ``pravnaForma`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_predmetyCinnosti(soup):
    """Placeholder extractor for the ``predmetyCinnosti`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_spolocnici(soup):
    """Placeholder extractor for the ``spolocnici`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_vyskaVkladov(soup):
    """Placeholder extractor for the ``vyskaVkladov`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_statutarnyOrgan(soup):
    """Placeholder extractor for the ``statutarnyOrgan`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_konanie(soup):
    """Placeholder extractor for the ``konanie`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_zakladneImanie(soup):
    """Placeholder extractor for the ``zakladneImanie`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_aktualizaciaUdajov(soup):
    """Placeholder extractor for the ``aktualizaciaUdajov`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_vypisUdajov(soup):
    """Placeholder extractor for the ``vypisUdajov`` field of a record.

    Args:
        soup: Parsed page; currently ignored.

    Returns:
        dict: Always a new empty dictionary, since no parsing is
        implemented yet.
    """
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_td(soup, name):
    """Return the ``<td>`` that follows the label cell matching *name*.

    The label is the first ``<span class="tl">`` whose string matches the
    regular expression *name*; the value cell is the next ``<td>`` sibling
    of that span's parent element.

    Args:
        soup: Parsed page (or any element) offering a BeautifulSoup-style
            ``find`` method.
        name: Regular-expression pattern searched for in the label text,
            e.g. ``"Obchodné"``.

    Returns:
        The sibling ``<td>`` element, or ``None`` when no matching label
        (or no following ``<td>``) exists.
    """
    label_span = soup.find("span", class_="tl", string=re.compile(name))
    if label_span is None:
        # Section missing on this page: report "not found" instead of
        # failing with AttributeError on ``None.parent``.
        return None
    return label_span.parent.find_next_sibling("td")


# The helper was originally defined under this name while its callers use
# ``get_data_td``; keep the old name working as an alias.
get_data = get_data_td
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_oddo(text):
|
def parse_oddo(text):
|
||||||
"""
|
"""
|
||||||
Parses the valid_from and valid_until from string
|
Parses the valid_from and valid_until from string
|
||||||
@@ -106,8 +189,9 @@ def parse_oddo(text):
|
|||||||
|
|
||||||
return valid_from, valid_until
|
return valid_from, valid_until
|
||||||
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
|
url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
|
||||||
html = requests.get(url)
|
html = requests.get(url)
|
||||||
soup = BeautifulSoup(html.content, "html.parser")
|
soup = BeautifulSoup(html.content, "html.parser")
|
||||||
|
|
||||||
@@ -115,20 +199,20 @@ def test():
|
|||||||
"oddiel": get_oddiel(soup),
|
"oddiel": get_oddiel(soup),
|
||||||
"vlozka": get_vlozka(soup),
|
"vlozka": get_vlozka(soup),
|
||||||
"obchodneMeno": get_obchodneMeno(soup),
|
"obchodneMeno": get_obchodneMeno(soup),
|
||||||
"sidlo": "",
|
"sidlo": get_sidlo(soup),
|
||||||
"ico": "",
|
"ico": get_ico(soup),
|
||||||
"denZapisu": "",
|
"denZapisu": get_denZapisu(soup),
|
||||||
"pravnaForma": "",
|
"pravnaForma": get_pravnaForma(soup),
|
||||||
"predmetyCinnosti": "",
|
"predmetyCinnosti": get_predmetyCinnosti(soup),
|
||||||
"spolocnici": "",
|
"spolocnici": get_spolocnici(soup),
|
||||||
"vyskaVkladov": "",
|
"vyskaVkladov": get_vyskaVkladov(soup),
|
||||||
"statutarnyOrgan": "",
|
"statutarnyOrgan": get_statutarnyOrgan(soup),
|
||||||
"konanie": "",
|
"konanie": get_konanie(soup),
|
||||||
"zakladneImanie": "",
|
"zakladneImanie": get_zakladneImanie(soup),
|
||||||
"aktualizaciaUdajov": "",
|
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
|
||||||
"vypisUdajov": ""
|
"vypisUdajov": get_vypisUdajov(soup)
|
||||||
}
|
}
|
||||||
print(json.dumps(record,indent=4))
|
print(json.dumps(record,indent=4,ensure_ascii=False))
|
||||||
collection = connect_db()
|
collection = connect_db()
|
||||||
#collection.bulk_write(soup)
|
#collection.bulk_write(soup)
|
||||||
disconnect_db(collection)
|
disconnect_db(collection)
|
||||||
|
|||||||
Reference in New Issue
Block a user