multithreading and parsing
This commit is contained in:
87
scraper.py
87
scraper.py
@@ -0,0 +1,87 @@
|
||||
import requests
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm.auto import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from app.config import settings
|
||||
from app.db import connect_db, disconnect_db
|
||||
from time import sleep
|
||||
|
||||
|
||||
def scrape_orsr():
    """
    Scrape company records from the endpoint defined in config and store
    them in MongoDB.

    Collects every "Aktuálny" (current extract) link from the ORSR index
    page, splits the URL list round-robin across the number of worker
    threads configured in ``settings["threads"]``, and hands each slice to
    ``process_records``.
    """
    # Get all links to "Aktuálny" from the orsr url.
    html = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(html.content, "html.parser")
    records = soup.find_all("a", string="Aktuálny")
    records = [record["href"] for record in records]

    # BUG FIX: the original built `parts` from a list of 1-based *indices*
    # (worker_ids) and submitted those, so each worker received ints
    # instead of URLs and process_record() would have called
    # requests.get(<int>).  Partition the record URLs themselves.
    threads = settings["threads"]
    parts = [records[i::threads] for i in range(threads)]

    # Distribute the work over the number of threads defined in config.
    # The `with` block waits for every worker to finish before returning.
    with ThreadPoolExecutor(max_workers=threads) as pool:
        for thread_id, part in enumerate(parts, start=1):
            pool.submit(process_records, part, thread_id)
|
||||
|
||||
|
||||
def process_records(records, thread):
    """
    Worker that scrapes a batch of records in one thread and bulk-writes
    the results to MongoDB.

    :param records: list of record URLs to process
    :param thread: 1-based id of the processing thread (used as the
        progress-bar label only)
    """
    # Iterate the URLs directly instead of indexing via range(len(...)).
    data = [process_record(url) for url in tqdm(records, desc=f"thread {thread}")]

    # An empty bulk_write raises in pymongo; nothing to persist, so bail out.
    if not data:
        return

    collection = connect_db()
    try:
        collection.bulk_write(data)
    finally:
        # Always release the connection, even if the write fails.
        disconnect_db(collection)
|
||||
|
||||
|
||||
def process_record(url):
    """
    Process one record: fetch *url* and parse the returned HTML.

    NOTE(review): the original docstring promised "dictionary of
    parameters", but field extraction is not implemented yet — the
    function currently returns None, so process_records() accumulates
    None values.  Confirm before wiring the result into bulk_write.

    :param url: url of the record
    :return: currently None (building the parsed dictionary is TODO)
    """
    html = requests.get(url)
    # TODO: extract the record fields from `soup` and return them.
    soup = BeautifulSoup(html.content, "html.parser")
|
||||
|
||||
|
||||
def test():
    """
    Manual smoke test: fetch one known ORSR extract, parse it, and
    exercise the database connect/disconnect lifecycle.
    """
    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")

    # TODO: build the record dict from `soup` once the field parsers
    # exist.  Planned keys: oddiel, vlozka, obchodneMeno, sidlo, ico,
    # denZapisu, pravnaForma, predmetyCinnosti, spolocnici, vyskaVkladov,
    # statutarnyOrgan, konanie, zakladneImanie, aktualizaciaUdajov,
    # vypisUdajov.  (The previous commented-out draft used `pass` as a
    # dict value, which would be a SyntaxError if ever uncommented — it
    # has been removed in favour of this note.)

    collection = connect_db()
    # collection.bulk_write(soup) is left disabled: pymongo's bulk_write
    # expects a list of write operations, not a BeautifulSoup object.
    disconnect_db(collection)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: currently runs the single-record smoke test; switch
    # to scrape_orsr() for the full multithreaded scrape.
    #scrape_orsr()
    test()
|
||||
|
||||
Reference in New Issue
Block a user