added proxies, choice for type of record, some output
@@ -16,5 +16,7 @@ settings = {
     "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
     "base_url": config.get("WEB", "BASE_URL"),
     "endpoint": config.get("WEB", "ENDPOINT"),
-    "threads": int(config.get("APP", "THREADS"))
+    "threads": int(config.get("APP", "THREADS")),
+    "http_proxy": config.get("PROXY", "HTTP_PROXY", fallback=None),
+    "https_proxy": config.get("PROXY", "HTTPS_PROXY", fallback=None)
 }
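A side note on the fallback=None reads above: configparser's get() returns the fallback not only when the option is missing but also when the whole section is absent, so the scraper still starts against a config file that has no [PROXY] section at all. A minimal standalone sketch (the inline config string is an assumption for illustration):

    import configparser

    config = configparser.ConfigParser()
    # Assume a config file with no [PROXY] section at all.
    config.read_string("[APP]\nTHREADS = 8\n")

    # Without fallback this would raise NoSectionError; with it we get None.
    http_proxy = config.get("PROXY", "HTTP_PROXY", fallback=None)
    print(http_proxy)  # -> None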
@@ -7,5 +7,9 @@ MONGODB_COLLECTION = orsr
 BASE_URL = https://www.orsr.sk/
 ENDPOINT = hladaj_zmeny.asp
 
+[PROXY]
+#HTTP_PROXY = socks5://user:pass@host:port
+#HTTPS_PROXY = socks5://user:pass@host:port
+
 [APP]
 THREADS = 8
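One caveat on the commented-out defaults: socks5:// proxy URLs only work in requests when the optional PySocks dependency is installed (pip install "requests[socks]"); a plain HTTP proxy would use an http:// URL instead. For reference, the mapping requests expects looks like this (placeholder credentials taken from the config above):

    import requests

    # Keys are the URL scheme of the outgoing request; values are proxy URLs.
    # socks5:// values require the PySocks extra: pip install "requests[socks]"
    proxies = {
        "http": "socks5://user:pass@host:port",
        "https": "socks5://user:pass@host:port",
    }
    # response = requests.get("https://www.orsr.sk/hladaj_zmeny.asp", proxies=proxies)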
scraper.py
@@ -23,11 +23,33 @@ def scrape_orsr():
     """
     This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.
     """
+    print("#########################")
+    print("Starting ORSR scraper")
+
     # get all links to "Aktuálny" from the orsr url
-    html = requests.get(settings["base_url"]+settings["endpoint"])
+    print("Downloading changed records..")
+    url = settings["base_url"]+settings["endpoint"]
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+        print(f"Found http proxy: {pr}")
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+        print(f"Found https proxy: {pr}")
+    html = requests.get(url, proxies=proxies)
+    print("All changed records downloaded.")
     soup = BeautifulSoup(html.content, "html.parser")
-    records = soup.find_all("a", string="Úplný")
+
+    m_type = input("Choose which type of records do you want to download:\n[1] 'Aktuálne'\n[2] 'Úplné' (default)\n")
+    if m_type == "1":
+        record_type = "Aktuálny"
+        print("Record type is 'Aktuálny'")
+    else:
+        record_type = "Úplný"
+        print("Record type is 'Úplný'")
+    records = soup.find_all("a", string=record_type)
     records = [settings["base_url"]+record["href"] for record in records]
+    print(f"There were {len(records)} records found.")
 
     # distribute the work in #of threads defined in config
     parts = [records[i::settings["threads"]] for i in range(settings["threads"])]
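The parts line above distributes the record URLs round-robin using extended slices: records[i::n] takes every n-th element starting at offset i, so the chunks differ in length by at most one. A quick illustration with toy data:

    # Round-robin split: thread i gets elements i, i+threads, i+2*threads, ...
    records = ["r0", "r1", "r2", "r3", "r4", "r5", "r6"]
    threads = 3
    parts = [records[i::threads] for i in range(threads)]
    print(parts)  # [['r0', 'r3', 'r6'], ['r1', 'r4'], ['r2', 'r5']]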
@@ -37,6 +59,8 @@ def scrape_orsr():
     for thread_id, part in enumerate(parts):
         t.submit(process_records, part, thread_id+1)
     print("All records_processed")
+    print("Closing ORSR Scraper...")
+    print("#########################")
 
 
 def process_records(records, thread):
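For context, the t.submit calls are the concurrent.futures executor API; a minimal sketch of the surrounding pattern (the with-block and worker count are assumptions, this hunk does not show how t is created):

    from concurrent.futures import ThreadPoolExecutor

    def process_records(records, thread):
        print(f"thread {thread}: {len(records)} records")

    parts = [["a", "b"], ["c"]]
    # Leaving the with-block waits for all submitted tasks to finish.
    with ThreadPoolExecutor(max_workers=len(parts)) as t:
        for thread_id, part in enumerate(parts):
            t.submit(process_records, part, thread_id + 1)
    print("All records_processed")

Note that an exception inside a submitted task is only surfaced when the returned Future's result() is read, which may be why this commit adds its own try/except inside process_records.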
@@ -46,8 +70,12 @@ def process_records(records, thread):
     :param thread: thread id of processing thread
     """
     data = []
-    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
-        record = process_record(records[i])
+    # for i in tqdm(range(len(records)), desc=f"thread {thread}"):
+    for i in tqdm(range(1), desc=f"thread {thread}"):
+        try:
+            record = process_record(records[i])
+        except Exception as e:
+            print(f"When downloading and parsing record {records[i]} following error occured: {e}")
         data.append(InsertOne(record))
     collection = connect_db()
     collection.bulk_write(data)
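Two things stand out in this hunk: range(1) looks like a leftover debug limit (each thread processes only its first record), and the new except branch falls through to data.append, so a failed download either re-appends the previous record or raises NameError on the first iteration; pymongo's bulk_write also raises InvalidOperation when handed an empty request list. A sketch of a loop that skips failures instead (not the committed code; process_record and connect_db are the module's existing helpers):

    from pymongo import InsertOne
    from tqdm import tqdm

    data = []
    for i in tqdm(range(len(records)), desc=f"thread {thread}"):
        try:
            record = process_record(records[i])
        except Exception as e:
            print(f"When downloading and parsing record {records[i]} following error occured: {e}")
            continue  # skip the failed record instead of appending stale data
        data.append(InsertOne(record))

    if data:  # bulk_write raises InvalidOperation on an empty request list
        collection = connect_db()
        collection.bulk_write(data)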
@@ -60,7 +88,12 @@ def process_record(url):
     :param url: url of the record
     :return dictionary of parameters
     """
-    html = requests.get(url)
+    proxies = {}
+    if (pr := settings["http_proxy"]) is not None:
+        proxies.update({"http": pr})
+    if (pr := settings["https_proxy"]) is not None:
+        proxies.update({"https": pr})
+    html = requests.get(url, proxies=proxies)
     soup = BeautifulSoup(html.content, "html.parser")
 
     record = get_record_data(soup)
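Since the same proxy-dict construction now appears in both scrape_orsr and process_record, a small helper would remove the duplication; build_proxies below is a hypothetical name, not part of this commit:

    def build_proxies(settings):
        # Build the requests-style proxies mapping from the optional settings.
        proxies = {}
        if (pr := settings["http_proxy"]) is not None:
            proxies["http"] = pr
        if (pr := settings["https_proxy"]) is not None:
            proxies["https"] = pr
        return proxies

    # Both call sites would then become:
    # html = requests.get(url, proxies=build_proxies(settings))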