diff --git a/scraper.py b/scraper.py
index ef01fc5..cb18cb6 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,5 +1,6 @@
import requests
import re
+import json
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
@@ -53,30 +54,81 @@ def process_record(url):
soup = BeautifulSoup(html.content, "html.parser")
+def get_oddiel(soup):
+    label = soup.find("span", class_="tl", string=re.compile("Oddiel:"))  # "Section:" label
+    return {"value": label.parent.find("span", class_="ra").text.strip()}
+
+
+def get_vlozka(soup):
+    label = soup.find("span", class_="tl", string=re.compile("Vložka"))  # "Insert No." label
+    return {"value": label.parent.find("span", class_="ra").text.strip()}
+
+
+def get_obchodneMeno(soup):
+    """Extract the company name ("Obchodné meno") and its validity dates."""
+    data = {}
+
+    # find the table element of "Obchodné meno:"
+    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
+
+    # parse the current name and its "(od: ... do: ...)" validity interval
+    active = [x.text.strip() for x in meno_tr.find_all("span", class_="ra")]
+    if len(active) < 2:
+        # no active entry (or its date span is missing) -> empty fields
+        value, valid_from, valid_until = "", "", ""
+    else:
+        value = active[0]
+        valid_from, valid_until = parse_oddo(active[1])
+    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
+
+    # historic entries come as alternating name/date "ro" spans
+    old = [x.text.strip() for x in meno_tr.find_all("span", class_="ro")]
+    old_values = [
+        {"value": name, "valid_from": dates[0], "valid_until": dates[1]}
+        for name, dates in zip(old[::2], map(parse_oddo, old[1::2]))
+    ]
+
+    data.update({"old_values": old_values})
+    return data
+
+
+def parse_oddo(text):
+    """
+    Extract the validity dates from a string.
+
+    :param text: od_do_dates in format: "(od: DD.MM.YYYY do: DD.MM.YYYY)"
+    :return: a tuple (valid_from, valid_until); "" for a missing part
+    """
+
+    def _date_after(marker):
+        # the date is exactly 10 chars (DD.MM.YYYY) right after the 4-char marker
+        pos = text.find(marker)
+        return "" if pos == -1 else text[pos + 4:pos + 14]
+
+    return _date_after("od: "), _date_after("do: ")
+
def test():
- url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
+ url = "https://www.orsr.sk/vypis.asp?ID=670947&SID=2&P=1"
html = requests.get(url)
soup = BeautifulSoup(html.content, "html.parser")
- '''
record = {
- "oddiel": soup.find("span", string=re.compile("Oddiel:")),
- "vlozka": pass,
- "obchodneMeno": pass,
- "sidlo": pass,
- "ico": pass,
- "denZapisu": pass,
- "pravnaForma": pass,
- "predmetyCinnosti": pass,
- "spolocnici": pass,
- "vyskaVkladov": pass,
- "statutarnyOrgan": pass,
- "konanie": pass,
- "zakladneImanie": pass,
- "aktualizaciaUdajov": pass,
- "vypisUdajov": pass
+ "oddiel": get_oddiel(soup),
+ "vlozka": get_vlozka(soup),
+ "obchodneMeno": get_obchodneMeno(soup),
+ "sidlo": "",
+ "ico": "",
+ "denZapisu": "",
+ "pravnaForma": "",
+ "predmetyCinnosti": "",
+ "spolocnici": "",
+ "vyskaVkladov": "",
+ "statutarnyOrgan": "",
+ "konanie": "",
+ "zakladneImanie": "",
+ "aktualizaciaUdajov": "",
+ "vypisUdajov": ""
}
- '''
+ print(json.dumps(record,indent=4))
collection = connect_db()
#collection.bulk_write(soup)
disconnect_db(collection)