added proxies, choice of record type, some console output

2023-09-28 14:45:03 +02:00
parent ffc71712c8
commit 68311135bf
3 changed files with 45 additions and 6 deletions


@@ -16,5 +16,7 @@ settings = {
     "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
     "base_url": config.get("WEB", "BASE_URL"),
     "endpoint": config.get("WEB", "ENDPOINT"),
-    "threads": int(config.get("APP", "THREADS"))
+    "threads": int(config.get("APP", "THREADS")),
+    "http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
+    "https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
 }
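The two new settings feed straight into the proxies mapping that requests expects: one entry per URL scheme, and an empty mapping means a direct connection. A minimal, standalone sketch of that mapping (the proxy URL below is a placeholder, not a value from this repository; socks5:// URLs additionally need the optional PySocks extra, requests[socks]):

import requests

# Placeholder proxy URL for illustration only; real values come from config.ini.
proxies = {
    "http": "socks5://user:pass@host:1080",
    "https": "socks5://user:pass@host:1080",
}

# requests routes each request through the proxy whose key matches the URL scheme;
# passing an empty dict keeps the connection direct.
response = requests.get("https://www.orsr.sk/hladaj_zmeny.asp", proxies=proxies)
print(response.status_code)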


@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
 BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp
+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
 [APP]
 THREADS = 8
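The commented-out keys above work together with the fallback=None arguments in the settings dict: configparser returns the fallback when the option, or the whole [PROXY] section, is missing, so leaving the lines commented simply disables proxying. A short sketch of that behaviour, assuming a config.ini shaped like the one in this commit:

import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# With HTTP_PROXY/HTTPS_PROXY commented out (or the [PROXY] section missing),
# fallback=None is returned instead of NoOptionError/NoSectionError being raised.
http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
https_proxy = config.get("PROXY", "HTTPS_PROXY", fallback=None)
print(http_proxy, https_proxy)  # None None while nothing is configured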


@@ -23,11 +23,33 @@ def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
     """
+    print("#########################")
+    print("Starting ORSR scraper")
     # get all links to "Aktuálny" from the orsr url
-    html = requests.get(settings["base_url"]+settings["endpoint"])
+    print("Downloading changed records..")
+    url = settings["base_url"]+settings["endpoint"]
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+        print(f"Found http proxy: {pr}")
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+        print(f"Found https proxy: {pr}")
+    html = requests.get(url, proxies=proxies)
+    print("All changed records downloaded.")
     soup = BeautifulSoup(html.content, "html.parser")
-    records = soup.find_all("a", string="Úplný")
+    m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
+    if m_type == "1":
+        record_type = "Aktuálny"
+        print("Record type is 'Aktuálny'")
+    else:
+        record_type = "Úplný"
+        print("Record type is 'Úplný'")
+    records = soup.find_all("a", string=record_type)
     records = [settings["base_url"]+record["href"] for record in records]
+    print(f"There were {len(records)} records found.")
     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
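The last context line splits the record URLs across workers with extended slicing: element i ends up in bucket i % threads, so the lists stay balanced without any extra bookkeeping. A tiny standalone illustration with made-up values:

# Round-robin split: records[i::threads] picks every threads-th element starting at i.
records = ["r0", "r1", "r2", "r3", "r4", "r5", "r6"]
threads = 3
parts = [records[i::threads] for i in range(threads)]
print(parts)  # [['r0', 'r3', 'r6'], ['r1', 'r4'], ['r2', 'r5']]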
@@ -37,6 +59,8 @@ def scrape_orsr():
     for thread_id, part in enumerate(parts):
         t.submit(process_records, part, thread_id+1)
     print("All records_processed")
+    print("Closing ORSR Scraper...")
+    print("#########################")


 def process_records(records, thread):
@@ -46,8 +70,12 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-        record = process_record(records[i])
+    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
+        try:
+            record = process_record(records[i])
+        except Exception as e:
+            print(f"When downloading and parsing record {records[i]} following error occured: {e}")
         data.append(InsertOne(record))
     collection = connect_db()
     collection.bulk_write(data)
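As committed, the loop only covers range(1), and when process_record() raises, execution still reaches data.append(InsertOne(record)) with record either unbound or left over from the previous iteration. A hedged sketch of the same loop over all records that skips failed ones instead; it is meant to slot into process_records() and reuses the records, thread and data names defined there (an alternative sketch, not the committed code):

for i in tqdm(range(len(records)), desc=f"thread {thread}"):
    try:
        record = process_record(records[i])
    except Exception as e:
        print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
        continue  # skip the failed record instead of appending stale data
    data.append(InsertOne(record))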
@@ -60,7 +88,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
-    html = requests.get(url)
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+    html = requests.get(url, proxies=proxies)
     soup = BeautifulSoup(html.content, "html.parser")
     record = get_record_data(soup)
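The proxy-mapping construction now appears verbatim in both scrape_orsr() and process_record(). A small helper could keep the two call sites in sync; a sketch under the assumption that settings is the module-level dict shown above (build_proxies is a hypothetical name, not part of this commit):

def build_proxies(settings):
    # Build the per-scheme proxy mapping for requests from the settings dict.
    # Returns an empty dict (direct connection) when no proxy is configured.
    proxies = {}
    if (pr := settings.get("http_proxy")) is not None:
        proxies["http"] = pr
    if (pr := settings.get("https_proxy")) is not None:
        proxies["https"] = pr
    return proxies

# Usage in either function:
# html = requests.get(url, proxies=build_proxies(settings))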