parse first values

2023-09-27 15:41:33 +02:00
parent e453cc2c6c
commit a7a60ee89a


@@ -1,5 +1,6 @@
 import requests
 import re
+import json
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
 from concurrent.futures import ThreadPoolExecutor
@@ -53,30 +54,81 @@ def process_record(url):
     soup = BeautifulSoup(html.content, "html.parser")
+
+def get_oddiel(soup):
+    oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
+    return {"value": oddiel}
+
+def get_vlozka(soup):
+    vlozka = soup.find("span", class_="tl", string=re.compile("Vložka")).parent.find("span", class_="ra").text.strip()
+    return {"value": vlozka}
+
+def get_obchodneMeno(soup):
+    data = {}
+    # find the table <tr> element of "Obchodné meno:"
+    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
+    # parse the current name and its validity date
+    active = meno_tr.find_all("span", class_="ra")
+    active = [x.text.strip() for x in active]
+    if len(active) == 0:
+        value, valid_from, valid_until = "", "", ""
+    else:
+        value, valid = active[0], active[1]
+        valid_from, valid_until = parse_oddo(valid)
+    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+    # check for older entries; spans alternate: name, validity string, name, ...
+    old = meno_tr.find_all("span", class_="ro")
+    old = [x.text.strip() for x in old]
+    if len(old) == 0:
+        old_values = []
+    else:
+        old_values = [{"value": y[0], "valid_from": y[1][0], "valid_until": y[1][1]} for y in zip(old[::2], map(parse_oddo, old[1::2]))]
+    data.update({"old_values": old_values})
+    return data
+
+def parse_oddo(text):
+    """
+    Parses valid_from and valid_until out of a string
+    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"
+    :return: a tuple (valid_from, valid_until)
+    """
+    valid_from, valid_until = "", ""
+    if (start_from := text.find("od: ")) > -1:
+        valid_from = text[start_from+4:start_from+14]
+    if (start_until := text.find("do: ")) > -1:
+        valid_until = text[start_until+4:start_until+14]
+    return valid_from, valid_until
+
 def test():
-    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
+    url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
     html = requests.get(url)
     soup = BeautifulSoup(html.content, "html.parser")
-    '''
     record = {
-        "oddiel": soup.find("span", string=re.compile("Oddiel:")),
-        "vlozka": pass,
-        "obchodneMeno": pass,
-        "sidlo": pass,
-        "ico": pass,
-        "denZapisu": pass,
-        "pravnaForma": pass,
-        "predmetyCinnosti": pass,
-        "spolocnici": pass,
-        "vyskaVkladov": pass,
-        "statutarnyOrgan": pass,
-        "konanie": pass,
-        "zakladneImanie": pass,
-        "aktualizaciaUdajov": pass,
-        "vypisUdajov": pass
+        "oddiel": get_oddiel(soup),
+        "vlozka": get_vlozka(soup),
+        "obchodneMeno": get_obchodneMeno(soup),
+        "sidlo": "",
+        "ico": "",
+        "denZapisu": "",
+        "pravnaForma": "",
+        "predmetyCinnosti": "",
+        "spolocnici": "",
+        "vyskaVkladov": "",
+        "statutarnyOrgan": "",
+        "konanie": "",
+        "zakladneImanie": "",
+        "aktualizaciaUdajov": "",
+        "vypisUdajov": ""
     }
-    '''
+    print(json.dumps(record,indent=4))
     collection = connect_db()
     #collection.bulk_write(soup)
     disconnect_db(collection)
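
For reference, a standalone sketch of how the new parse_oddo helper behaves; it is not part of the commit. The function is repeated verbatim so the snippet runs on its own, and the date strings below are made up for illustration. The other new helpers build on it: get_oddiel and get_vlozka each return {"value": ...}, and get_obchodneMeno returns {"value": ..., "valid_from": ..., "valid_until": ..., "old_values": [...]}.

# Standalone sketch (not part of the commit): parse_oddo copied from above,
# exercised on made-up ORSR-style validity strings.
def parse_oddo(text):
    valid_from, valid_until = "", ""
    if (start_from := text.find("od: ")) > -1:
        valid_from = text[start_from+4:start_from+14]
    if (start_until := text.find("do: ")) > -1:
        valid_until = text[start_until+4:start_until+14]
    return valid_from, valid_until

print(parse_oddo("(od: 01.02.2015 do: 31.12.2019)"))  # ('01.02.2015', '31.12.2019')
print(parse_oddo("(od: 01.02.2015)"))                 # ('01.02.2015', '')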