added proxies, choice for type of record, some output
@@ -16,5 +16,7 @@ settings = {
    "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
    "base_url": config.get("WEB", "BASE_URL"),
    "endpoint": config.get("WEB", "ENDPOINT"),
    "threads": int(config.get("APP", "THREADS"))
    "threads": int(config.get("APP", "THREADS")),
    "http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
    "https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
}
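The two new settings lean on configparser's fallback argument: if the [PROXY] section or a key under it is missing, the lookup returns None instead of raising NoSectionError/NoOptionError. A minimal standalone sketch of that behaviour, not part of the commit ("config.ini" is only an illustrative file name):

import configparser

config = configparser.ConfigParser()
config.read("config.ini")  # illustrative file name

# With fallback=None a missing section or key quietly yields None
http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
https_proxy = config.get("PROXY", "HTTPS_PROXY", fallback=None)

if http_proxy is None and https_proxy is None:
    print("No proxies configured, connecting directly")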
@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
BASE_URL = https://www.orsr.sk/
ENDPOINT = hladaj_zmeny.asp

[PROXY]
#HTTP_PROXY = socks5://user:pass@host:port
#HTTPS_PROXY = socks5://user:pass@host:port

[APP]
THREADS = 8
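One caveat on the commented-out examples above: requests only understands socks5:// proxy URLs when SOCKS support is installed (pip install "requests[socks]", which pulls in PySocks); plain http:// or https:// proxy URLs need no extra dependency. A sketch, outside the commit, of the mapping the scraper ends up passing, using an illustrative local SOCKS endpoint:

import requests

proxies = {
    # 127.0.0.1:1080 is only an illustrative local SOCKS endpoint
    "http": "socks5://127.0.0.1:1080",
    "https": "socks5://127.0.0.1:1080",
}

response = requests.get("https://www.orsr.sk/hladaj_zmeny.asp", proxies=proxies, timeout=30)
print(response.status_code)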
scraper.py (41 changed lines)
@@ -23,11 +23,33 @@ def scrape_orsr():
"""
This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
"""
print("#########################")
print("Starting ORSR scraper")

# get all links to "Aktuálny" from the orsr url
html = requests.get(settings["base_url"]+settings["endpoint"])
print("Downloading changed records..")
url = settings["base_url"]+settings["endpoint"]
proxies = {}
if (pr := settings["http_proxy"]) is not None:
    proxies.update({"http": pr})
    print(f"Found http proxy: {pr}")
if (pr := settings["https_proxy"]) is not None:
    proxies.update({"https": pr})
    print(f"Found https proxy: {pr}")
html = requests.get(url, proxies=proxies)
print("All changed records downloaded.")
soup = BeautifulSoup(html.content, "html.parser")
records = soup.find_all("a", string="Úplný")

m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
if m_type == "1":
    record_type = "Aktuálny"
    print("Record type is 'Aktuálny'")
else:
    record_type = "Úplný"
    print("Record type is 'Úplný'")
records = soup.find_all("a", string=record_type)
records = [settings["base_url"]+record["href"] for record in records]
print(f"There were {len(records)} records found.")

# distribute the work in #of threads defined in config
parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
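The parts comprehension at the end of this hunk splits the record list round-robin using step slicing, so chunk sizes differ by at most one and a thread with nothing to do simply receives an empty list. A self-contained sketch of the same pattern, not part of the commit:

def split_round_robin(items, n):
    """Deal items into n lists using step slicing, as in parts = [records[i::threads] ...]."""
    return [items[i::n] for i in range(n)]

records = [f"record_{i}" for i in range(10)]
print(split_round_robin(records, 3))
# [['record_0', 'record_3', 'record_6', 'record_9'],
#  ['record_1', 'record_4', 'record_7'],
#  ['record_2', 'record_5', 'record_8']]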
@@ -37,6 +59,8 @@ def scrape_orsr():
for thread_id, part in enumerate(parts):
    t.submit(process_records, part, thread_id+1)
print("All records_processed")
print("Closing ORSR Scraper...")
print("#########################")


def process_records(records, thread):
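The executor t is not shown in this diff; assuming it is a concurrent.futures.ThreadPoolExecutor (an assumption, not confirmed by the hunk), the submit-per-part pattern looks like the sketch below. Exceptions raised inside a worker only surface if the returned future's result() is inspected.

from concurrent.futures import ThreadPoolExecutor

def process_records(records, thread):
    # stand-in for the real worker defined later in the diff
    print(f"thread {thread}: {len(records)} records")

parts = [["a", "d"], ["b", "e"], ["c"]]

with ThreadPoolExecutor(max_workers=len(parts)) as t:
    futures = [t.submit(process_records, part, thread_id + 1)
               for thread_id, part in enumerate(parts)]
    for f in futures:
        f.result()  # re-raises any worker exception instead of silently dropping it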
@@ -46,8 +70,12 @@ def process_records(records, thread):
:param thread: thread id of processing thread
"""
data = []
for i in tqdm(range(len(records)), desc=f"thread {thread}"):
# for i in tqdm(range(len(records)), desc=f"thread {thread}"):
for i in tqdm(range(1), desc=f"thread {thread}"):
    try:
        record = process_record(records[i])
    except Exception as e:
        print(f"When downloading and parsing record {records[i]} following error occured: {e}")
    data.append(InsertOne(record))
collection = connect_db()
collection.bulk_write(data)
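Two hedged observations on this hunk (indentation is not visible in this capture, so the nesting is an assumption): if data.append(InsertOne(record)) sits outside the try block, a failed process_record still appends the previous iteration's record, or hits a NameError on the first pass; and pymongo's bulk_write rejects an empty operation list with InvalidOperation. A defensive sketch of the loop, reusing process_record and connect_db from the diff, might look like:

from pymongo import InsertOne
from tqdm import tqdm

def process_records_defensive(records, thread):
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        try:
            record = process_record(url)
        except Exception as e:
            print(f"When downloading and parsing record {url} the following error occurred: {e}")
            continue  # skip the failed record instead of appending stale data
        data.append(InsertOne(record))
    if data:  # bulk_write([]) would raise InvalidOperation
        connect_db().bulk_write(data)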
@@ -60,7 +88,12 @@ def process_record(url):
:param url: url of the record
:return dictionary of parameters
"""
html = requests.get(url)
proxies = {}
if (pr := settings["http_proxy"]) is not None:
    proxies.update({"http": pr})
if (pr := settings["https_proxy"]) is not None:
    proxies.update({"https": pr})
html = requests.get(url, proxies=proxies)
soup = BeautifulSoup(html.content, "html.parser")

record = get_record_data(soup)
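The walrus-operator proxy lookup is now duplicated in scrape_orsr and process_record. A possible follow-up, sketched here only (build_proxies is a hypothetical helper name, not in the repository), would centralise it; passing an empty mapping when nothing is configured changes nothing, exactly as in the committed code.

def build_proxies(settings):
    """Hypothetical helper collecting the configured proxies into one mapping."""
    proxies = {}
    if (pr := settings["http_proxy"]) is not None:
        proxies["http"] = pr
    if (pr := settings["https_proxy"]) is not None:
        proxies["https"] = pr
    return proxies

# usage sketch:
# html = requests.get(url, proxies=build_proxies(settings))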