added proxies, choice of record type, some output

This commit is contained in:
2023-09-28 14:45:03 +02:00
parent ffc71712c8
commit 68311135bf
3 changed files with 45 additions and 6 deletions

View File

@@ -16,5 +16,7 @@ settings = {
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
"base_url": config.get("WEB", "BASE_URL"),
"endpoint": config.get("WEB", "ENDPOINT"),
"threads": int(config.get("APP", "THREADS"))
"threads": int(config.get("APP", "THREADS")),
"http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
"https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
}
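
The new options rely on configparser's fallback argument, so a commented-out or missing proxy entry simply resolves to None instead of raising NoOptionError (or NoSectionError if the whole [PROXY] section were absent). A minimal sketch of that behaviour, assuming the settings file is named config.ini:

from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")

# With fallback=None these return None when the option is commented out or
# missing, so later code can simply test "is not None".
http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
https_proxy = config.get("PROXY", "HTTPS_PROXY", fallback=None)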

View File

@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
 BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp

+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
 [APP]
 THREADS = 8
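
If the commented-out entries are enabled, note that requests only understands socks5:// proxy URLs when the optional SOCKS extra (PySocks) is installed, e.g. pip install "requests[socks]". A hedged sketch of how such values are consumed; the proxy URL and fetch() here are placeholders, not code from this repository:

import requests

# Placeholder values mirroring the commented-out config entries above.
proxies = {
    "http": "socks5://user:pass@host:port",
    "https": "socks5://user:pass@host:port",
}

def fetch(url):
    # requests picks the proxy whose key matches the URL scheme;
    # socks5:// schemes need the PySocks-backed extra installed.
    return requests.get(url, proxies=proxies)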

View File

@@ -23,11 +23,33 @@ def scrape_orsr():
"""
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
"""
print("#########################")
print("Starting ORSR scraper")
# get all links to "Aktuálny" from the orsr url
html = requests.get(settings["base_url"]+settings["endpoint"])
print("Downloading changed records..")
url = settings["base_url"]+settings["endpoint"]
proxies = {}
if (pr := settings["http_proxy"]) is not None:
proxies.update({"http": pr})
print(f"Found http proxy: {pr}")
if (pr := settings["https_proxy"]) is not None:
proxies.update({"https": pr})
print(f"Found https proxy: {pr}")
html = requests.get(url, proxies=proxies)
print("All changed records downloaded.")
soup = BeautifulSoup(html.content, "html.parser")
records = soup.find_all("a", string="Úplný")
m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
if m_type == "1":
record_type = "Aktuálny"
print("Record type is 'Aktuálny'")
else:
record_type = "Úplný"
print("Record type is 'Úplný'")
records = soup.find_all("a", string=record_type)
records = [settings["base_url"]+record["href"] for record in records]
print(f"There were {len(records)} records found.")
# distribute the work in #of threads defined in config
parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
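
The work split in the line above uses striped slicing: records[i::threads] takes every threads-th record starting at offset i, dealing the URLs out round-robin into roughly equal parts that are handed to threads in the next hunk. A tiny self-contained illustration with made-up values:

records = ["r0", "r1", "r2", "r3", "r4", "r5", "r6"]
threads = 3

# Every threads-th element starting at offset i.
parts = [records[i::threads] for i in range(threads)]
print(parts)  # [['r0', 'r3', 'r6'], ['r1', 'r4'], ['r2', 'r5']]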
@@ -37,6 +59,8 @@ def scrape_orsr():
     for thread_id, part in enumerate(parts):
         t.submit(process_records, part, thread_id+1)
     print("All records processed")
+    print("Closing ORSR Scraper...")
+    print("#########################")


 def process_records(records, thread):
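
The submit() calls above follow the usual concurrent.futures pattern; the executor setup itself is outside this hunk, so the with-block and max_workers value in the sketch below are assumptions, not the commit's code. One caveat of fire-and-forget submit(): an exception raised inside the worker is stored on the returned Future and stays silent unless .result() is checked.

from concurrent.futures import ThreadPoolExecutor

def process_records(records, thread):
    # Stand-in for the real process_records.
    print(f"thread {thread}: {len(records)} records")

parts = [["a", "b"], ["c"], ["d", "e"]]

# Each part is scheduled on the pool; leaving the with-block waits for all
# submitted calls to finish before execution continues.
with ThreadPoolExecutor(max_workers=len(parts)) as t:
    for thread_id, part in enumerate(parts):
        t.submit(process_records, part, thread_id + 1)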
@@ -46,8 +70,12 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-        record = process_record(records[i])
+    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
+        try:
+            record = process_record(records[i])
+        except Exception as e:
+            print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
         data.append(InsertOne(record))
     collection = connect_db()
     collection.bulk_write(data)
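
One thing worth flagging in this hunk: when process_record() raises, the except branch only prints, yet data.append(InsertOne(record)) still runs with a stale record (or an undefined one on the first iteration), and pymongo's bulk_write() rejects an empty operation list. A more defensive variant might look like the sketch below; it reuses process_record() and connect_db() from this module, and process_records_safe is a hypothetical name, not what the commit does:

from pymongo import InsertOne

def process_records_safe(records, thread):
    # Skip records that fail to download or parse and only write when there
    # is at least one success, so bulk_write never receives an empty list.
    data = []
    for url in records:
        try:
            record = process_record(url)
        except Exception as e:
            print(f"thread {thread}: failed to process {url}: {e}")
            continue
        data.append(InsertOne(record))
    if data:
        connect_db().bulk_write(data)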
@@ -60,7 +88,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
-    html = requests.get(url)
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+    html = requests.get(url, proxies=proxies)
     soup = BeautifulSoup(html.content, "html.parser")
     record = get_record_data(soup)
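
The proxy-dict construction now appears verbatim in both scrape_orsr() and process_record(). Purely as a suggestion (not part of this commit), a small helper such as the hypothetical build_proxies() below could centralize it; note that the walrus operator used in these branches requires Python 3.8+.

def build_proxies():
    # Hypothetical helper: collect whichever proxy settings are configured
    # into the mapping that requests expects.
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies["http"] = pr
    if (pr := settings["https_proxy"]) is not None:
        proxies["https"] = pr
    return proxies

# Usage in either function:
# html = requests.get(url, proxies=build_proxies())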