working scraper
@@ -8,4 +8,4 @@
 #ENDPOINT = hladaj_zmeny.asp

 [APP]
-THREADS = 1
+THREADS = 32
@@ -1,5 +1,5 @@
 [DB]
-MONGODB_URI = mongodb://localhost:27017/softone
+MONGODB_URI = mongodb://localhost:27017
 MONGODB_DB = softone
 MONGODB_COLLECTION = orsr

@@ -8,4 +8,4 @@ BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp

 [APP]
-THREADS = 4
+THREADS = 32
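The [DB] and [APP] keys above are what scraper.py consumes through its settings dict and connect_db(). A minimal sketch of how such a file could be loaded with configparser; the file name config.ini, the load_settings helper, and the section holding BASE_URL/ENDPOINT are assumptions for illustration, not part of this commit:

    # Hypothetical loader for the config shown above (file name and section names are assumptions).
    from configparser import ConfigParser

    def load_settings(path="config.ini"):
        cfg = ConfigParser()
        cfg.read(path)
        return {
            "mongodb_uri": cfg.get("DB", "MONGODB_URI"),
            "mongodb_db": cfg.get("DB", "MONGODB_DB"),
            "mongodb_collection": cfg.get("DB", "MONGODB_COLLECTION"),
            "base_url": cfg.get("SCRAPER", "BASE_URL"),   # section name not shown in the diff; assumed
            "endpoint": cfg.get("SCRAPER", "ENDPOINT"),   # section name not shown in the diff; assumed
            "threads": cfg.getint("APP", "THREADS"),
        }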
scraper.py
@@ -1,6 +1,7 @@
 import requests
 import re
 import json
+import unicodedata
 from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
 from concurrent.futures import ThreadPoolExecutor
@@ -11,6 +12,13 @@ from app.db import connect_db, disconnect_db
 from time import sleep


+single_value = ["Obchodné meno:", "Sídlo:", "IČO:", "Deň zápisu:", "Právna forma:"]
+value_type_dict = {
+    "IČO:": "number",
+    "Spoločníci:": "spolocnici",
+    "Výška vkladu každého spoločníka:": "vklad"
+}
+
 def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
@@ -24,10 +32,11 @@ def scrape_orsr():
     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]

+    print(f"Processing {len(records)} records using {settings['threads']} threads:")
     with ThreadPoolExecutor() as t:
         for thread_id, part in enumerate(parts):
             t.submit(process_records, part, thread_id+1)
-    print("records_processed")
+    print("All records_processed")


 def process_records(records, thread):
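The records[i::settings["threads"]] slicing above splits the record list round-robin into one chunk per thread. A standalone sketch of the same pattern, with illustrative names:

    # Round-robin split of a work list into n roughly equal parts, as in scrape_orsr().
    from concurrent.futures import ThreadPoolExecutor

    def split_round_robin(items, n):
        # items[i::n] takes every n-th element starting at offset i
        return [items[i::n] for i in range(n)]

    def demo(records, n_threads=4):
        parts = split_round_robin(records, n_threads)
        with ThreadPoolExecutor() as pool:
            for thread_id, part in enumerate(parts):
                pool.submit(print, f"thread {thread_id + 1}: {len(part)} records")

One thing to keep in mind with bare submit() as used here: an exception raised inside a worker is stored on its Future and never surfaces unless the futures are collected and result() is called on them.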
@@ -37,7 +46,8 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    #for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
         record = process_record(records[i])
         data.append(InsertOne(record))
     collection = connect_db()
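process_records collects pymongo InsertOne operations into a list and, judging by the removed test() helper further down, writes them with bulk_write on the collection returned by connect_db. A minimal standalone sketch of that pattern; the connection values mirror the [DB] section above and the helper name bulk_insert is made up:

    # Bulk insert pattern used by process_records (connection details are assumptions).
    from pymongo import MongoClient, InsertOne

    def bulk_insert(records, uri="mongodb://localhost:27017", db="softone", coll="orsr"):
        client = MongoClient(uri)
        collection = client[db][coll]
        ops = [InsertOne(r) for r in records]
        if ops:  # bulk_write raises InvalidOperation on an empty operation list
            collection.bulk_write(ops)
        client.close()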
@@ -51,26 +61,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
+    url = "https://www.orsr.sk/vypis.asp?ID=471674&SID=4&P=1"
     html = requests.get(url)
     soup = BeautifulSoup(html.content, "html.parser")

-    record = {
-        "oddiel": get_oddiel(soup),
-        "vlozka": get_vlozka(soup),
-        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
-        "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
-        "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
-        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
-        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
-        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
-        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
-        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
-        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
-        "konanie": get_data(soup, "Konanie menom"),
-        "zakladneImanie": get_data(soup, "Základné imanie"),
-        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
-        "vypisUdajov": get_vypisUdajov(soup)
-    }
+    record = get_record_data(soup)
     return record


@@ -94,31 +90,47 @@ def get_vypisUdajov(soup):
     return {"value": vypis}


-def get_obchodneMeno(soup):
-    data = {}
-
-    # find the table <tr> element of "Obchodné meno:"
-    meno_tr = soup.find("span", class_="tl", string=re.compile("Obchodné")).parent.parent
-
-    # parse the name and date
-    active = meno_tr.find_all("span", class_="ra")
-    active = [x.text.strip() for x in active]
-    if len(active) == 0:
-        value, valid_from, valid_until = "", ""
-    else:
-        value, valid = active[0], active[1]
-        valid_from, valid_until = parse_oddo(valid)
-    data.update({"value": value, "valid_from": valid_from, "valid_until": valid_until})
-
-    # check for older entries
-    old = meno_tr.find_all("span", class_="ro")
-    old = [x.text.strip() for x in old]
-    if len(old) == 0:
-        old_values = []
-    else:
-        old_values = [{"value": y[0], "valid_from": y[1][0], "valid_until": y[1][1]} for y in zip(old[::2], list(map(parse_oddo,old[1::2])))]
-    data.update({"old_values": old_values})
-    return data
+def get_record_data(soup):
+    record = {
+        "oddiel": get_oddiel(soup),
+        "vlozka": get_vlozka(soup)
+    }
+
+    # find the last table before variable data
+    entry = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.parent.parent
+    while True:
+        entry = entry.find_next_sibling("table")
+        entry_tr = entry.find_all("tr")
+        entry_tr = [i for i in entry_tr if i.parent == entry_tr[0].parent]
+
+        if len(entry_tr) > 1:  # last table with "Dátum aktualizácie údajov"
+            break
+
+        # get entry name and entry data
+        entry_container = entry_tr[0].find_all("td")
+        entry_name = entry_container[0].text.strip()
+        allow_multiple_active = True
+        value_type = "text"
+        if entry_name in single_value:
+            allow_multiple_active = False
+        if (v_type := value_type_dict.get(entry_name)) is not None:
+            value_type = v_type
+        entry_name = transform_entry_name(entry_name)
+        entry_data = get_data(entry_container[1], value_type=value_type, allow_multiple_active=allow_multiple_active)
+        record.update({entry_name: entry_data})
+
+    record.update({
+        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
+        "vypisUdajov": get_vypisUdajov(soup)
+    })
+
+    return record
+
+
+def transform_entry_name(name):
+    s = unicodedata.normalize("NFKD",name).encode('ascii', 'ignore').decode().replace(":", "").lower().split()
+    return s[0].lower() + "".join(w.capitalize() for w in s[1:])


 def process_entry(entry, value_type):
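transform_entry_name strips diacritics (NFKD normalization plus an ASCII re-encode), drops the trailing colon, and camelCases the remaining words, so the Slovak table headings become record keys similar to the ones the old hard-coded dict used. Expected outputs, derived directly from the code above:

    # Expected behaviour of transform_entry_name on the ORSR headings:
    transform_entry_name("Obchodné meno:")                    # -> "obchodneMeno"
    transform_entry_name("IČO:")                              # -> "ico"
    transform_entry_name("Deň zápisu:")                       # -> "denZapisu"
    transform_entry_name("Výška vkladu každého spoločníka:")  # -> "vyskaVkladuKazdehoSpolocnika"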
@@ -137,7 +149,7 @@ def process_entry(entry, value_type):
     if value_td.span.attrs["class"][0] == "ra":
         active = True

-    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
+    lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all(["br","span"])]).split("\n") if f]

     if value_type == "text":
         value = ", ".join(lines)
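The updated lines comprehension maps each <br> in the cell to a newline and each <span> to its stripped text, then splits on those newlines; limiting find_all to ["br", "span"] keeps nested tags from contributing their text twice. A self-contained illustration with an invented HTML fragment:

    # Illustration of the <br>-to-newline splitting used in process_entry (sample HTML is made up).
    from bs4 import BeautifulSoup

    html = '<td><span class="ra">Mlynské nivy 1</span><br/><span class="ra">Bratislava 821 09</span></td>'
    value_td = BeautifulSoup(html, "html.parser").td

    lines = [f.strip()
             for f in " ".join(["\n" if x.name == "br" else x.text.strip()
                                for x in value_td.find_all(["br", "span"])]).split("\n")
             if f]
    print(lines)  # ['Mlynské nivy 1', 'Bratislava 821 09']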
@@ -163,8 +175,8 @@ def process_entry(entry, value_type):
     return value, valid_from, valid_until, active


-def get_data(soup, name, value_type="text", allow_multiple_active=True):
-    data_td = soup.find("span", class_="tl", string=re.compile(f"{name}")).parent.find_next_sibling("td")
+def get_data(data_td, value_type="text", allow_multiple_active=True):
+    data_td = data_td

     data = {}

@@ -181,7 +193,8 @@ def get_data(soup, name, value_type="text", allow_multiple_active=True):
            old_values.append({"value": value, "valid_from": valid_from, "valid_until": valid_until})

     if not allow_multiple_active:
-        data.update(values[0])
+        if len(values) > 0:
+            data.update(values[0])
     else:
         data.update({"values": values})
     data.update({"old_values": old_values})
@@ -205,35 +218,5 @@ def parse_oddo(text):
     return valid_from, valid_until


-def test():
-    url = "https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"
-    html = requests.get(url)
-    soup = BeautifulSoup(html.content, "html.parser")
-
-    record = {
-        "oddiel": get_oddiel(soup),
-        "vlozka": get_vlozka(soup),
-        "obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
-        "sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
-        "ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
-        "denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
-        "pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
-        "predmetyCinnosti": get_data(soup, "Predmet činnosti"),
-        "spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
-        "vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
-        "statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
-        "konanie": get_data(soup, "Konanie menom"),
-        "zakladneImanie": get_data(soup, "Základné imanie"),
-        "aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
-        "vypisUdajov": get_vypisUdajov(soup)
-    }
-    print(json.dumps(record,indent=4,ensure_ascii=False))
-    collection = connect_db()
-    records = [InsertOne(record)]
-    collection.bulk_write(records)
-    disconnect_db(collection)
-
-
 if __name__ == "__main__":
     scrape_orsr()
-    #test()
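With test() gone, a single excerpt can still be inspected by hand through process_record and json.dumps. A rough sketch, not part of the commit, keeping in mind that process_record currently overrides its url argument with the hard-coded vypis.asp address:

    # Ad-hoc check of a single ORSR excerpt, roughly replacing the removed test() helper.
    def smoke_test(url="https://www.orsr.sk/vypis.asp?ID=12388&SID=8&P=1"):
        record = process_record(url)  # note: process_record currently overrides url internally
        print(json.dumps(record, indent=4, ensure_ascii=False))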