working for sro, adapt to other
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,3 +1,3 @@
|
|||||||
.venv
|
.venv
|
||||||
.idea
|
.idea
|
||||||
.config.cfg
|
config.cfg
|
||||||
@@ -14,6 +14,7 @@ settings = {
|
|||||||
"mongodb_uri": config.get("DB","MONGODB_URI"),
|
"mongodb_uri": config.get("DB","MONGODB_URI"),
|
||||||
"mongodb_db": config.get("DB","MONGODB_DB"),
|
"mongodb_db": config.get("DB","MONGODB_DB"),
|
||||||
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
|
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
|
||||||
"orsr_url": config.get("WEB", "ORSR_URL"),
|
"base_url": config.get("WEB", "BASE_URL"),
|
||||||
|
"endpoint": config.get("WEB", "ENDPOINT"),
|
||||||
"threads": int(config.get("APP", "THREADS"))
|
"threads": int(config.get("APP", "THREADS"))
|
||||||
}
|
}
|
||||||
|
|||||||
11
config.cfg
Normal file
11
config.cfg
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
[DB]
|
||||||
|
#MONGODB_URI = mongodb://localhost:27017/softone
|
||||||
|
#MONGODB_DB = softone
|
||||||
|
#MONGODB_COLLECTION = orsr
|
||||||
|
|
||||||
|
[WEB]
|
||||||
|
#BASE_URL = https://www.orsr.sk/
|
||||||
|
#ENDPOINT = hladaj_zmeny.asp
|
||||||
|
|
||||||
|
[APP]
|
||||||
|
THREADS = 1
|
||||||
@@ -4,7 +4,8 @@ MONGODB_DB = softone
|
|||||||
MONGODB_COLLECTION = orsr
|
MONGODB_COLLECTION = orsr
|
||||||
|
|
||||||
[WEB]
|
[WEB]
|
||||||
ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp
|
BASE_URL = https://www.orsr.sk/
|
||||||
|
ENDPOINT = hladaj_zmeny.asp
|
||||||
|
|
||||||
[APP]
|
[APP]
|
||||||
THREADS = 8
|
THREADS = 4
|
||||||
164
scraper.py
164
scraper.py
@@ -4,6 +4,7 @@ import json
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from pymongo import InsertOne
|
||||||
|
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.db import connect_db, disconnect_db
|
from app.db import connect_db, disconnect_db
|
||||||
@@ -15,18 +16,18 @@ def scrape_orsr():
|
|||||||
This is the main function: it scrapes data from the endpoint defined in the config and stores it in MongoDB.
|
This is the main function: it scrapes data from the endpoint defined in the config and stores it in MongoDB.
|
||||||
"""
|
"""
|
||||||
# get all links to "Aktuálny" from the orsr url
|
# get all links labelled "Úplný" from the base_url + endpoint page
|
||||||
html = requests.get(settings["orsr_url"])
|
html = requests.get(settings["base_url"]+settings["endpoint"])
|
||||||
soup = BeautifulSoup(html.content, "html.parser")
|
soup = BeautifulSoup(html.content, "html.parser")
|
||||||
records = soup.find_all("a", string="Aktuálny")
|
records = soup.find_all("a", string="Úplný")
|
||||||
records = [record["href"] for record in records]
|
records = [settings["base_url"]+record["href"] for record in records]
|
||||||
|
|
||||||
# distribute the work across the number of threads defined in config
|
# distribute the work across the number of threads defined in config
|
||||||
worker_ids = list(range(1, len(records)+1))
|
parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
|
||||||
parts = [worker_ids[i::settings["threads"]] for i in range(settings["threads"])]
|
|
||||||
|
|
||||||
with ThreadPoolExecutor() as t:
|
with ThreadPoolExecutor() as t:
|
||||||
for thread_id, part in enumerate(parts):
|
for thread_id, part in enumerate(parts):
|
||||||
t.submit(process_records, part, thread_id+1)
|
t.submit(process_records, part, thread_id+1)
|
||||||
|
print("records_processed")
|
||||||
|
|
||||||
|
|
||||||
def process_records(records, thread):
|
def process_records(records, thread):
|
||||||
@@ -38,7 +39,7 @@ def process_records(records, thread):
|
|||||||
data = []
|
data = []
|
||||||
for i in tqdm(range(len(records)), desc=f"thread {thread}"):
|
for i in tqdm(range(len(records)), desc=f"thread {thread}"):
|
||||||
record = process_record(records[i])
|
record = process_record(records[i])
|
||||||
data.append(record)
|
data.append(InsertOne(record))
|
||||||
collection = connect_db()
|
collection = connect_db()
|
||||||
collection.bulk_write(data)
|
collection.bulk_write(data)
|
||||||
disconnect_db(collection)
|
disconnect_db(collection)
|
||||||
@@ -53,6 +54,25 @@ def process_record(url):
|
|||||||
html = requests.get(url)
|
html = requests.get(url)
|
||||||
soup = BeautifulSoup(html.content, "html.parser")
|
soup = BeautifulSoup(html.content, "html.parser")
|
||||||
|
|
||||||
|
record = {
|
||||||
|
"oddiel": get_oddiel(soup),
|
||||||
|
"vlozka": get_vlozka(soup),
|
||||||
|
"obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
|
||||||
|
"sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
|
||||||
|
"ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
|
||||||
|
"denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
|
||||||
|
"pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
|
||||||
|
"predmetyCinnosti": get_data(soup, "Predmet činnosti"),
|
||||||
|
"spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
|
||||||
|
"vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
|
||||||
|
"statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
|
||||||
|
"konanie": get_data(soup, "Konanie menom"),
|
||||||
|
"zakladneImanie": get_data(soup, "Základné imanie"),
|
||||||
|
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
|
||||||
|
"vypisUdajov": get_vypisUdajov(soup)
|
||||||
|
}
|
||||||
|
return record
|
||||||
|
|
||||||
|
|
||||||
def get_oddiel(soup):
|
def get_oddiel(soup):
|
||||||
oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
|
oddiel = soup.find("span", class_="tl", string=re.compile("Oddiel:")).parent.find("span", class_="ra").text.strip()
|
||||||
@@ -64,6 +84,16 @@ def get_vlozka(soup):
|
|||||||
return {"value": vlozka}
|
return {"value": vlozka}
|
||||||
|
|
||||||
|
|
||||||
|
def get_aktualizaciaUdajov(soup):
|
||||||
|
aktualizacia = soup.find("td", class_="tl", string=re.compile("Dátum aktualizácie")).find_next_sibling("td").text.strip()
|
||||||
|
return {"value": aktualizacia}
|
||||||
|
|
||||||
|
|
||||||
|
def get_vypisUdajov(soup):
|
||||||
|
vypis = soup.find("td", class_="tl", string=re.compile("Dátum výpisu")).find_next_sibling("td").text.strip()
|
||||||
|
return {"value": vypis}
|
||||||
|
|
||||||
|
|
||||||
def get_obchodneMeno(soup):
|
def get_obchodneMeno(soup):
|
||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
@@ -91,79 +121,6 @@ def get_obchodneMeno(soup):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def get_sidlo(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
# find the table <tr> element of "Sídlo:"
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_ico(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_denZapisu(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_pravnaForma(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_predmetyCinnosti(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_spolocnici(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_vyskaVkladov(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_statutarnyOrgan(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_konanie(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_zakladneImanie(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_aktualizaciaUdajov(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_vypisUdajov(soup):
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def process_entry(entry, value_type):
|
def process_entry(entry, value_type):
|
||||||
"""
|
"""
|
||||||
extracts one entry from the table of entries for a given data item
|
extracts one entry from the table of entries for a given data item
|
||||||
@@ -181,6 +138,26 @@ def process_entry(entry, value_type):
|
|||||||
active = True
|
active = True
|
||||||
|
|
||||||
lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
|
lines = [f.strip() for f in " ".join(["\n" if x.name == "br" else x.text.strip() for x in value_td.find_all()]).split("\n") if f]
|
||||||
|
|
||||||
|
if value_type == "text":
|
||||||
|
value = ", ".join(lines)
|
||||||
|
elif value_type == "number":
|
||||||
|
value = int("".join(lines).replace(" ",""))
|
||||||
|
elif value_type == "spolocnici":
|
||||||
|
spolocnik = lines[0]
|
||||||
|
adresa = ", ".join(lines[1:])
|
||||||
|
value = {
|
||||||
|
"spolocnik": spolocnik,
|
||||||
|
"adresa": adresa
|
||||||
|
}
|
||||||
|
elif value_type == "vklad":
|
||||||
|
spolocnik = lines[0]
|
||||||
|
vklad = ", ".join(lines[1:])
|
||||||
|
value = {
|
||||||
|
"spolocnik": spolocnik,
|
||||||
|
"vklad": vklad
|
||||||
|
}
|
||||||
|
|
||||||
valid_from, valid_until = parse_oddo(valid_td.text.strip())
|
valid_from, valid_until = parse_oddo(valid_td.text.strip())
|
||||||
|
|
||||||
return value, valid_from, valid_until, active
|
return value, valid_from, valid_until, active
|
||||||
@@ -236,26 +213,27 @@ def test():
|
|||||||
record = {
|
record = {
|
||||||
"oddiel": get_oddiel(soup),
|
"oddiel": get_oddiel(soup),
|
||||||
"vlozka": get_vlozka(soup),
|
"vlozka": get_vlozka(soup),
|
||||||
"obchodneMeno": get_obchodneMeno(soup),
|
"obchodneMeno": get_data(soup, "Obchodné meno", allow_multiple_active=False),
|
||||||
"sidlo": get_sidlo(soup),
|
"sidlo": get_data(soup,"Sídlo", allow_multiple_active=False),
|
||||||
"ico": get_ico(soup),
|
"ico": get_data(soup, "IČO", value_type="number" , allow_multiple_active=False),
|
||||||
"denZapisu": get_denZapisu(soup),
|
"denZapisu": get_data(soup, "Deň zápisu", allow_multiple_active=False),
|
||||||
"pravnaForma": get_pravnaForma(soup),
|
"pravnaForma": get_data(soup, "Právna forma", allow_multiple_active=False),
|
||||||
"predmetyCinnosti": get_predmetyCinnosti(soup),
|
"predmetyCinnosti": get_data(soup, "Predmet činnosti"),
|
||||||
"spolocnici": get_spolocnici(soup),
|
"spolocnici": get_data(soup, "Spoločníci", value_type="spolocnici"),
|
||||||
"vyskaVkladov": get_vyskaVkladov(soup),
|
"vyskaVkladov": get_data(soup, "Výška vkladu", value_type="vklad"),
|
||||||
"statutarnyOrgan": get_statutarnyOrgan(soup),
|
"statutarnyOrgan": get_data(soup, "Štatutárny orgán"),
|
||||||
"konanie": get_konanie(soup),
|
"konanie": get_data(soup, "Konanie menom"),
|
||||||
"zakladneImanie": get_zakladneImanie(soup),
|
"zakladneImanie": get_data(soup, "Základné imanie"),
|
||||||
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
|
"aktualizaciaUdajov": get_aktualizaciaUdajov(soup),
|
||||||
"vypisUdajov": get_vypisUdajov(soup)
|
"vypisUdajov": get_vypisUdajov(soup)
|
||||||
}
|
}
|
||||||
print(json.dumps(record,indent=4,ensure_ascii=False))
|
print(json.dumps(record,indent=4,ensure_ascii=False))
|
||||||
collection = connect_db()
|
collection = connect_db()
|
||||||
#collection.bulk_write(soup)
|
records = [InsertOne(record)]
|
||||||
|
collection.bulk_write(records)
|
||||||
disconnect_db(collection)
|
disconnect_db(collection)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
#scrape_orsr()
|
scrape_orsr()
|
||||||
test()
|
#test()
|
||||||
|
|||||||
Reference in New Issue
Block a user