multithreading and parsing1

This commit is contained in:
2023-09-27 13:02:30 +02:00
parent 00d7456f89
commit e453cc2c6c
7 changed files with 127 additions and 14 deletions

View File

@@ -11,5 +11,9 @@ if path.exists((cfg_pth := path.join(config_path, "../config.cfg"))):
config.read(cfg_pth) config.read(cfg_pth)
settings = { settings = {
"mongodb_uri": config.get("DB","MONGODB_URI") "mongodb_uri": config.get("DB","MONGODB_URI"),
} "mongodb_db": config.get("DB","MONGODB_DB"),
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
"orsr_url": config.get("WEB", "ORSR_URL"),
"threads": int(config.get("APP", "THREADS"))
}

View File

@@ -1,18 +1,30 @@
from flask import current_app, g from flask import g
from werkzeug.local import LocalProxy from werkzeug.local import LocalProxy
from flask_pymongo import PyMongo from pymongo import MongoClient
from .config import settings
def connect_db():
    """
    Open a new MongoDB client and return the configured collection.
    The connection URI, database name and collection name are all read from `settings`.
    :return: collection object selected by settings["mongodb_db"] / settings["mongodb_collection"]
    """
    mongo_client = MongoClient(settings["mongodb_uri"])
    return mongo_client[settings["mongodb_db"]][settings["mongodb_collection"]]
def disconnect_db(conn):
    """
    Close the client that owns the given collection handle.
    :param conn: collection object whose `.database.client` should be closed
    """
    owning_client = conn.database.client
    owning_client.close()
def get_db():
    """
    Return the collection cached on flask's `g`, connecting on first use.
    The handle is stored under `g._database`, so later calls that see the same `g` reuse it.
    :return: collection object produced by connect_db
    """
    cached = getattr(g, "_database", None)
    if cached is not None:
        return cached
    # nothing cached yet: connect once and remember the handle on `g`
    fresh = connect_db()
    g._database = fresh
    return fresh
# Use LocalProxy to read the global db instance with just `db` # Use LocalProxy to read the global db instance with just `db`

View File

@@ -5,10 +5,10 @@ from .db import db
@app.route("/detail", methods=["GET"])
def detail():
    """
    Handle GET /detail.
    Reads the `ico` query parameter and echoes it back.
    :return: "missing ico" when the parameter is absent, otherwise "ICO je <ico>"
    """
    ico = request.args.get("ico")
    # Test the looked-up value, not the string literal "ico": `"ico" is None`
    # is always False, which made the missing-parameter branch unreachable.
    if ico is None:
        return "missing ico"
    return f"ICO je {ico}"

View File

@@ -1,2 +1,10 @@
[DB] [DB]
MONGODB_URI = "mongodb://localhost:27017/softone" MONGODB_URI = mongodb://localhost:27017/softone
MONGODB_DB = softone
MONGODB_COLLECTION = orsr
[WEB]
ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp
[APP]
THREADS = 8

View File

@@ -15,6 +15,5 @@ def create_app():
if __name__ == "__main__":
    app = create_app()
    # Flask config keys are case-sensitive and upper-case, so the former
    # app.config["Debug"] = True never enabled debug mode. Passing debug=True
    # to run() switches on the debugger and reloader for local development.
    app.run(debug=True)

View File

@@ -1,2 +1,5 @@
requests~=2.31.0
beautifulsoup4~=4.12.2
Flask~=2.3.3 Flask~=2.3.3
Flask-PyMongo~=2.3.0 pymongo~=4.5.0
tqdm~=4.66.1

View File

@@ -0,0 +1,87 @@
import requests
import re
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from app.config import settings
from app.db import connect_db, disconnect_db
from time import sleep
def scrape_orsr():
    """
    This is the main function that scrapes data from endpoint defined in config and stores it in mongodb.

    It downloads the listing page at settings["orsr_url"], collects the href of
    every "Aktuálny" link, splits those URLs round-robin into settings["threads"]
    parts and hands each non-empty part to process_records on a thread pool.
    :raises requests.RequestException: when the listing page cannot be fetched
    :raises Exception: whatever a worker raised is re-raised here instead of being lost
    """
    # local import so this block stays self-contained
    from urllib.parse import urljoin

    # get all links to "Aktuálny" from the orsr url
    listing = requests.get(settings["orsr_url"], timeout=30)
    listing.raise_for_status()
    soup = BeautifulSoup(listing.content, "html.parser")
    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the listing URL, so workers always receive fetchable URLs
    records = [
        urljoin(settings["orsr_url"], link["href"])
        for link in soup.find_all("a", string="Aktuálny", href=True)
    ]
    if not records:
        return

    # distribute the work in #of threads defined in config
    # (slice the URLs themselves - the previous version sliced a list of
    # integer ids, so workers never saw a URL)
    threads = max(1, settings["threads"])
    parts = [records[i::threads] for i in range(threads)]
    with ThreadPoolExecutor(max_workers=threads) as pool:
        futures = [
            pool.submit(process_records, part, thread_id)
            for thread_id, part in enumerate(parts, start=1)
            if part
        ]
        # result() re-raises any exception from the worker; without it
        # failures inside the pool are silently discarded
        for future in futures:
            future.result()
def process_records(records, thread):
    """
    worker for processing records in a thread
    :param records: list of urls of records to process
    :param thread: thread id of processing thread (only used to label the progress bar)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        record = process_record(url)
        # skip records that produced nothing; None is not a storable document
        if record is not None:
            data.append(record)
    if not data:
        # nothing to store, and an empty batch is rejected by the driver
        return
    collection = connect_db()
    try:
        # insert_many takes plain documents; bulk_write expects write-operation
        # objects (InsertOne, UpdateOne, ...) and fails on dictionaries
        collection.insert_many(data)
    finally:
        # always release the client, even when the insert fails
        disconnect_db(collection)
def process_record(url):
    """
    process one record. Download the page at url and parse it with BeautifulSoup.
    Field extraction is not implemented yet, so nothing is stored and no data is returned.
    :param url: url of the record
    :return: None for now (TODO: dictionary of parameters once parsing is implemented)
    :raises requests.RequestException: when the request fails or times out
    """
    # a timeout keeps one stalled connection from blocking its worker forever
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, "html.parser")
    # TODO: build the record dictionary from `soup` and return it
    return None
def test():
    """
    Manual smoke test: download one hard-coded record page, parse it,
    then open and close a database connection without writing anything.
    """
    sample_url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
    response = requests.get(sample_url)
    soup = BeautifulSoup(response.content, "html.parser")
    # Draft of the target record layout, kept as an inert string literal.
    '''
record = {
"oddiel": soup.find("span", string=re.compile("Oddiel:")),
"vlozka": pass,
"obchodneMeno": pass,
"sidlo": pass,
"ico": pass,
"denZapisu": pass,
"pravnaForma": pass,
"predmetyCinnosti": pass,
"spolocnici": pass,
"vyskaVkladov": pass,
"statutarnyOrgan": pass,
"konanie": pass,
"zakladneImanie": pass,
"aktualizaciaUdajov": pass,
"vypisUdajov": pass
}
'''
    handle = connect_db()
    # writing is disabled for now: handle.bulk_write(soup)
    disconnect_db(handle)
if __name__ == "__main__":
    # The full scrape is currently disabled; running this module as a script
    # only executes the single-record smoke test.
    #scrape_orsr()
    test()