parse_ first values
This commit is contained in:
88
scraper.py
88
scraper.py
@@ -1,5 +1,6 @@
|
|||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
@@ -53,30 +54,81 @@ def process_record(url):
|
|||||||
soup = BeautifulSoup(html.content, "html.parser")
|
soup = BeautifulSoup(html.content, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def get_oddiel(soup):
    """
    Extracts the value of the "Oddiel:" field from a parsed record page
    :param soup: BeautifulSoup tree of the record page
    :return: dict {"value": <stripped text>}; the value is "" when the field is not on the page
    """
    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))
    # the value is the first <span class="ra"> under the label's parent element
    cell = label.parent.find("span", class_="ra") if label is not None else None
    oddiel = cell.text.strip() if cell is not None else ""
    return {"value": oddiel}
|
||||||
|
|
||||||
|
|
||||||
|
def get_vlozka(soup):
    """
    Extracts the value of the "Vložka" field from a parsed record page
    :param soup: BeautifulSoup tree of the record page
    :return: dict {"value": <stripped text>}; the value is "" when the field is not on the page
    """
    label = soup.find("span", class_="tl", string=re.compile("Vložka"))
    # the value is the first <span class="ra"> under the label's parent element
    cell = label.parent.find("span", class_="ra") if label is not None else None
    vlozka = cell.text.strip() if cell is not None else ""
    return {"value": vlozka}
|
||||||
|
|
||||||
|
|
||||||
|
def get_obchodneMeno(soup):
    """
    Extracts the active and the older "Obchodné meno" entries from a parsed record page
    :param soup: BeautifulSoup tree of the record page
    :return: dict with keys "value", "valid_from", "valid_until" (active entry, "" when missing)
             and "old_values" (list of dicts with the same three keys, [] when there are none)
    """
    data = {"value": "", "valid_from": "", "valid_until": "", "old_values": []}

    # find the table <tr> element of "Obchodné meno:"
    label = soup.find("span", class_="tl", string=re.compile("Obchodné"))
    if label is None:
        # field not present on this page: return the empty record instead of crashing
        return data
    meno_tr = label.parent.parent

    # parse the name and date of the active entry (spans of class "ra")
    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
    if active:
        data["value"] = active[0]
        # the date span may be absent; both dates then stay ""
        if len(active) > 1:
            data["valid_from"], data["valid_until"] = parse_oddo(active[1])

    # check for older entries (spans of class "ro");
    # they are read pairwise: even positions as names, odd positions as date text
    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
    data["old_values"] = [
        {"value": name, "valid_from": valid_from, "valid_until": valid_until}
        for name, (valid_from, valid_until) in zip(old[::2], map(parse_oddo, old[1::2]))
    ]
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def parse_oddo(text):
    """
    Parses the valid_from and valid_until from string
    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"; either part may be absent
    :return: returns a tuple (valid_from, valid_until); a date that is not found is returned as ""
    """
    # capture the date itself instead of slicing a fixed 10 characters, so dates
    # without zero padding or a non-breaking space after the colon still parse
    date = r"(\d{1,2}\.\d{1,2}\.\d{4})"
    found_from = re.search(r"od:\s*" + date, text)
    found_until = re.search(r"do:\s*" + date, text)

    valid_from = found_from.group(1) if found_from else ""
    valid_until = found_until.group(1) if found_until else ""
    return valid_from, valid_until
|
||||||
|
|
||||||
def test():
    """
    Manual smoke test: downloads one record page, parses the fields implemented so far,
    prints the resulting record as JSON and opens/closes the database connection
    """
    url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
    # without a timeout requests waits forever on an unresponsive server
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, "html.parser")

    # fields that are not parsed yet are kept as "" placeholders
    record = {
        "oddiel": get_oddiel(soup),
        "vlozka": get_vlozka(soup),
        "obchodneMeno": get_obchodneMeno(soup),
        "sidlo": "",
        "ico": "",
        "denZapisu": "",
        "pravnaForma": "",
        "predmetyCinnosti": "",
        "spolocnici": "",
        "vyskaVkladov": "",
        "statutarnyOrgan": "",
        "konanie": "",
        "zakladneImanie": "",
        "aktualizaciaUdajov": "",
        "vypisUdajov": "",
    }
    # ensure_ascii=False keeps diacritics readable instead of \uXXXX escapes
    print(json.dumps(record, indent=4, ensure_ascii=False))

    collection = connect_db()
    # TODO: write the parsed record to the collection (bulk_write is not wired up yet)
    disconnect_db(collection)
|
||||||
|
|||||||
Reference in New Issue
Block a user