parse first values

This commit is contained in:
2023-09-27 15:41:33 +02:00
parent e453cc2c6c
commit a7a60ee89a

View File

@@ -1,5 +1,6 @@
import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
@@ -53,30 +54,81 @@ def process_record(url):
soup = BeautifulSoup(html.content, "html.parser")
def get_oddiel(soup):
    """Extract the 'Oddiel:' (register section) value from a record page.

    :param soup: BeautifulSoup of an ORSR record page
    :return: dict {"value": <section text>}
    """
    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))
    section = label.parent.find("span", class_="ra").text.strip()
    return {"value": section}
def get_vlozka(soup):
    """Extract the 'Vložka' (insert number) value from a record page.

    :param soup: BeautifulSoup of an ORSR record page
    :return: dict {"value": <insert-number text>}
    """
    label = soup.find("span", class_="tl", string=re.compile("Vložka"))
    insert_no = label.parent.find("span", class_="ra").text.strip()
    return {"value": insert_no}
def get_obchodneMeno(soup):
    """Parse the business name ("Obchodné meno") section of a record page.

    :param soup: BeautifulSoup of an ORSR record page
    :return: dict with keys "value", "valid_from", "valid_until" for the
             current name, and "old_values" — a list of dicts of the same
             shape for historical names.
    """
    data = {}
    # find the table <tr> element of "Obchodné meno:"
    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
    # current entry: name span followed by its "(od: ... do: ...)" span
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if len(active) >= 2:
        value = active[0]
        valid_from, valid_until = parse_oddo(active[1])
    elif len(active) == 1:
        # name present but no validity string — original raised IndexError here
        value, valid_from, valid_until = active[0], "", ""
    else:
        # BUG FIX: original unpacked two values ("" , "") into three names,
        # raising ValueError whenever no active entry exists
        value, valid_from, valid_until = "", "", ""
    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
    # historical entries alternate: name, validity, name, validity, ...
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    old_values = [
        {"value": name, "valid_from": oddo[0], "valid_until": oddo[1]}
        for name, oddo in zip(old[::2], map(parse_oddo, old[1::2]))
    ]
    data.update({"old_values": old_values})
    return data
def parse_oddo(text):
    """Parse the valid_from and valid_until dates out of a validity string.

    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"
    :return: tuple (valid_from, valid_until); a missing part yields ""
    """
    def _extract(marker):
        # take the 10 characters (DD.MM.YYYY) that follow the marker, if any
        idx = text.find(marker)
        if idx == -1:
            return ""
        start = idx + len(marker)
        return text[start:start + 10]

    return _extract("od: "), _extract("do: ")
def test():
    """Smoke-test the parsers against a live ORSR record page.

    NOTE(review): performs a real HTTP request and opens a DB connection;
    this is an integration smoke test, not a unit test.
    """
    # url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"  # alternate sample record
    url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    # BUG FIX: the dict below was wrapped in a triple-quoted string, so
    # `record` was undefined at the json.dumps() call (NameError); the old
    # placeholder entries with `pass` values were also left in, which would
    # be a SyntaxError if the string were ever removed.
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_obchodneMeno(soup),
        "sidlo": "",
        "ico": "",
        "denZapisu": "",
        "pravnaForma": "",
        "predmetyCinnosti": "",
        "spolocnici": "",
        "vyskaVkladov": "",
        "statutarnyOrgan": "",
        "konanie": "",
        "zakladneImanie": "",
        "aktualizaciaUdajov": "",
        "vypisUdajov": ""
    }
    print(json.dumps(record, indent=4))
    collection = connect_db()
    # collection.bulk_write(soup)  # TODO: persist parsed records
    disconnect_db(collection)