added proxies, choice of record type, some output

This commit is contained in:
2023-09-28 14:45:03 +02:00
parent ffc71712c8
commit 68311135bf
3 changed files with 45 additions and 6 deletions

View File

@@ -16,5 +16,7 @@ settings = {
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
"base_url": config.get("WEB", "BASE_URL"),
"endpoint": config.get("WEB", "ENDPOINT"),
"threads": int(config.get("APP", "THREADS"))
"threads": int(config.get("APP", "THREADS")),
"http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
"https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
}
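
The new options rely on configparser's fallback argument, so a commented-out or missing proxy entry simply resolves to None instead of raising NoOptionError (or NoSectionError if the whole [PROXY] section were absent). A minimal sketch of that behaviour, assuming the settings file is named config.ini:

from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")

# With fallback=None these return None when the option is commented out or
# missing, so later code can simply test "is not None".
http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
https_proxy = config.get("PROXY", "HTTPS_PROXY", fallback=None)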

View File

@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
 BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp

+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
 [APP]
 THREADS = 8
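
If the commented-out entries are enabled, note that requests only understands socks5:// proxy URLs when the optional SOCKS extra (PySocks) is installed, e.g. pip install "requests[socks]". A hedged sketch of how such values are consumed; the proxy URL and fetch() here are placeholders, not code from this repository:

import requests

# Placeholder values mirroring the commented-out config entries above.
proxies = {
    "http": "socks5://user:pass@host:port",
    "https": "socks5://user:pass@host:port",
}

def fetch(url):
    # requests picks the proxy whose key matches the URL scheme;
    # socks5:// schemes need the PySocks-backed extra installed.
    return requests.get(url, proxies=proxies)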

View File

@@ -23,11 +23,33 @@ def scrape_orsr():
"""
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
"""
print("#########################")
print("Starting ORSR scraper")
# get all links to "Aktuálny" from the orsr url
html = requests.get(settings["base_url"]+settings["endpoint"])
print("Downloading changed records..")
url = settings["base_url"]+settings["endpoint"]
proxies = {}
if (pr := settings["http_proxy"]) is not None:
proxies.update({"http": pr})
print(f"Found http proxy: {pr}")
if (pr := settings["https_proxy"]) is not None:
proxies.update({"https": pr})
print(f"Found https proxy: {pr}")
html = requests.get(url, proxies=proxies)
print("All changed records downloaded.")
soup = BeautifulSoup(html.content, "html.parser")
records = soup.find_all("a", string="Úplný")
m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
if m_type == "1":
record_type = "Aktuálny"
print("Record type is 'Aktuálny'")
else:
record_type = "Úplný"
print("Record type is 'Úplný'")
records = soup.find_all("a", string=record_type)
records = [settings["base_url"]+record["href"] for record in records]
print(f"There were {len(records)} records found.")
# distribute the work in #of threads defined in config
parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
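
The work split in the line above uses striped slicing: records[i::threads] takes every threads-th record starting at offset i, dealing the URLs out round-robin into roughly equal parts that are handed to threads in the next hunk. A tiny self-contained illustration with made-up values:

records = ["r0", "r1", "r2", "r3", "r4", "r5", "r6"]
threads = 3

# Every threads-th element starting at offset i.
parts = [records[i::threads] for i in range(threads)]
print(parts)  # [['r0', 'r3', 'r6'], ['r1', 'r4'], ['r2', 'r5']]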
@@ -37,6 +59,8 @@ def scrape_orsr():
     for thread_id, part in enumerate(parts):
         t.submit(process_records, part, thread_id+1)
     print("All records processed")
+    print("Closing ORSR Scraper...")
+    print("#########################")


 def process_records(records, thread):
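
The submit() calls above follow the usual concurrent.futures pattern; the executor setup itself is outside this hunk, so the with-block and max_workers value in the sketch below are assumptions, not the commit's code. One caveat of fire-and-forget submit(): an exception raised inside the worker is stored on the returned Future and stays silent unless .result() is checked.

from concurrent.futures import ThreadPoolExecutor

def process_records(records, thread):
    # Stand-in for the real process_records.
    print(f"thread {thread}: {len(records)} records")

parts = [["a", "b"], ["c"], ["d", "e"]]

# Each part is scheduled on the pool; leaving the with-block waits for all
# submitted calls to finish before execution continues.
with ThreadPoolExecutor(max_workers=len(parts)) as t:
    for thread_id, part in enumerate(parts):
        t.submit(process_records, part, thread_id + 1)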
@@ -46,8 +70,12 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-        record = process_record(records[i])
+    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
+        try:
+            record = process_record(records[i])
+        except Exception as e:
+            print(f"When downloading and parsing record {records[i]} the following error occurred: {e}")
         data.append(InsertOne(record))
     collection = connect_db()
     collection.bulk_write(data)
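
One thing worth flagging in this hunk: when process_record() raises, the except branch only prints, yet data.append(InsertOne(record)) still runs with a stale record (or an undefined one on the first iteration), and pymongo's bulk_write() rejects an empty operation list. A more defensive variant might look like the sketch below; it reuses process_record() and connect_db() from this module, and process_records_safe is a hypothetical name, not what the commit does:

from pymongo import InsertOne

def process_records_safe(records, thread):
    # Skip records that fail to download or parse and only write when there
    # is at least one success, so bulk_write never receives an empty list.
    data = []
    for url in records:
        try:
            record = process_record(url)
        except Exception as e:
            print(f"thread {thread}: failed to process {url}: {e}")
            continue
        data.append(InsertOne(record))
    if data:
        connect_db().bulk_write(data)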
@@ -60,7 +88,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
-    html = requests.get(url)
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+    html = requests.get(url, proxies=proxies)
     soup = BeautifulSoup(html.content, "html.parser")
     record = get_record_data(soup)
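
The proxy-dict construction now appears verbatim in both scrape_orsr() and process_record(). Purely as a suggestion (not part of this commit), a small helper such as the hypothetical build_proxies() below could centralize it; note that the walrus operator used in these branches requires Python 3.8+.

def build_proxies():
    # Hypothetical helper: collect whichever proxy settings are configured
    # into the mapping that requests expects.
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies["http"] = pr
    if (pr := settings["https_proxy"]) is not None:
        proxies["https"] = pr
    return proxies

# Usage in either function:
# html = requests.get(url, proxies=build_proxies())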