diff --git a/app/config.py b/app/config.py
index f4ce3de..fbff49f 100644
--- a/app/config.py
+++ b/app/config.py
@@ -16,5 +16,7 @@ settings = {
     "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
     "base_url": config.get("WEB", "BASE_URL"),
     "endpoint": config.get("WEB", "ENDPOINT"),
-    "threads": int(config.get("APP", "THREADS"))
+    "threads": int(config.get("APP", "THREADS")),
+    "http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
+    "https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
 }
diff --git a/config_base.cfg b/config_base.cfg
index 9dc4091..7675d93 100644
--- a/config_base.cfg
+++ b/config_base.cfg
@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
 BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp
 
+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
 [APP]
 THREADS = 8
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
index a39c4fb..579b76e 100644
--- a/scraper.py
+++ b/scraper.py
@@ -23,11 +23,33 @@ def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
     """
+    print("#########################")
+    print("Starting ORSR scraper")
+
     # get all links to "Aktuálny" from the orsr url
-    html = requests.get(settings["base_url"]+settings["endpoint"])
+    print("Downloading changed records...")
+    url = settings["base_url"]+settings["endpoint"]
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+        print(f"Found http proxy: {pr}")
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+        print(f"Found https proxy: {pr}")
+    html = requests.get(url, proxies=proxies)
+    print("All changed records downloaded.")
     soup = BeautifulSoup(html.content, "html.parser")
-    records = soup.find_all("a", string="Úplný")
+
+    m_type = input("Choose which type of records you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
+    if m_type == "1":
+        record_type = "Aktuálny"
+        print("Record type is 'Aktuálny'")
+    else:
+        record_type = "Úplný"
+        print("Record type is 'Úplný'")
+    records = soup.find_all("a", string=record_type)
     records = [settings["base_url"]+record["href"] for record in records]
+    print(f"There were {len(records)} records found.")
 
     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
@@ -37,6 +59,8 @@ def scrape_orsr():
         for thread_id, part in enumerate(parts):
             t.submit(process_records, part, thread_id+1)
     print("All records_processed")
+    print("Closing ORSR Scraper...")
+    print("#########################")
 
 
 def process_records(records, thread):
@@ -46,8 +70,12 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-        record = process_record(records[i])
+    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+        try:
+            record = process_record(records[i])
+        except Exception as e:
+            print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
+            continue
         data.append(InsertOne(record))
     collection = connect_db()
     collection.bulk_write(data)
@@ -60,7 +88,12 @@ def process_record(url):
    :param url: url of the record
    :return dictionary of parameters
    """
-    html = requests.get(url)
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+    html = requests.get(url, proxies=proxies)
     soup = BeautifulSoup(html.content, "html.parser")
     record = get_record_data(soup)