multithreading and parsing1
This commit is contained in:
@@ -11,5 +11,9 @@ if path.exists((cfg_pth := path.join(config_path, "../config.cfg"))):
|
|||||||
config.read(cfg_pth)
|
config.read(cfg_pth)
|
||||||
|
|
||||||
settings = {
|
settings = {
|
||||||
"mongodb_uri": config.get("DB","MONGODB_URI")
|
"mongodb_uri": config.get("DB","MONGODB_URI"),
|
||||||
|
"mongodb_db": config.get("DB","MONGODB_DB"),
|
||||||
|
"mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
|
||||||
|
"orsr_url": config.get("WEB", "ORSR_URL"),
|
||||||
|
"threads": int(config.get("APP", "THREADS"))
|
||||||
}
|
}
|
||||||
24
app/db.py
24
app/db.py
@@ -1,18 +1,30 @@
|
|||||||
from flask import current_app, g
|
from flask import g
|
||||||
from werkzeug.local import LocalProxy
|
from werkzeug.local import LocalProxy
|
||||||
from flask_pymongo import PyMongo
|
from pymongo import MongoClient
|
||||||
|
from .config import settings
|
||||||
|
|
||||||
|
|
||||||
|
def connect_db():
    """
    Open a MongoDB client and return the configured collection.

    The connection URI, database name and collection name are read from
    ``settings`` ("mongodb_uri", "mongodb_db", "mongodb_collection").

    :return: the collection object looked up on the new client
    """
    mongo = MongoClient(settings["mongodb_uri"])
    database = mongo[settings["mongodb_db"]]
    return database[settings["mongodb_collection"]]
|
||||||
|
|
||||||
|
|
||||||
|
def disconnect_db(conn):
    """
    Close the client that owns the given collection.

    :param conn: collection as returned by ``connect_db``; its
        ``database.client`` attribute chain is followed to reach the client
    """
    owning_client = conn.database.client
    owning_client.close()
|
||||||
|
|
||||||
|
|
||||||
def get_db():
    """
    Return the collection cached on ``g``, connecting on first use.

    The collection is stored as ``g._database``; when that attribute is
    missing or ``None`` a new connection is made through ``connect_db`` and
    cached there.
    """
    cached = getattr(g, "_database", None)
    if cached is not None:
        return cached

    fresh = connect_db()
    g._database = fresh
    return fresh
|
||||||
|
|
||||||
|
|
||||||
# Use LocalProxy to read the global db instance with just `db`
|
# Use LocalProxy to read the global db instance with just `db`
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ from .db import db
|
|||||||
|
|
||||||
@app.route("/detail", methods=["GET"])
def detail():
    """
    Handle GET /detail and echo the ``ico`` query parameter.

    :return: "missing ico" when the request has no ``ico`` query parameter,
        otherwise a string containing the supplied value
    """
    ico = request.args.get("ico")
    # Test the looked-up value, not the literal "ico": a string literal is
    # never None, so the missing-parameter branch would otherwise be dead.
    if ico is None:
        return "missing ico"
    return f"ICO je {ico}"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,2 +1,10 @@
|
|||||||
[DB]
|
[DB]
|
||||||
MONGODB_URI = "mongodb://localhost:27017/softone"
|
MONGODB_URI = mongodb://localhost:27017/softone
|
||||||
|
MONGODB_DB = softone
|
||||||
|
MONGODB_COLLECTION = orsr
|
||||||
|
|
||||||
|
[WEB]
|
||||||
|
ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp
|
||||||
|
|
||||||
|
[APP]
|
||||||
|
THREADS = 8
|
||||||
@@ -15,6 +15,5 @@ def create_app():
|
|||||||
if __name__ == "__main__":
    # Direct execution: build the application and start the server.
    app = create_app()
    # NOTE(review): Flask's documented debug switch is the upper-case
    # "DEBUG" key; this mixed-case key is presumably meant to be that one.
    # Confirm the intent before changing it.
    app.config.update({"Debug": True})
    app.run()
|
||||||
|
|||||||
@@ -1,2 +1,5 @@
|
|||||||
|
requests~=2.31.0
|
||||||
|
beautifulsoup4~=4.12.2
|
||||||
Flask~=2.3.3
|
Flask~=2.3.3
|
||||||
Flask-PyMongo~=2.3.0
|
pymongo~=4.5.0
|
||||||
|
tqdm~=4.66.1
|
||||||
87
scraper.py
87
scraper.py
@@ -0,0 +1,87 @@
|
|||||||
|
import requests
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tqdm.auto import tqdm
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.db import connect_db, disconnect_db
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_orsr():
    """
    Scrape the listing page defined in config and process every record.

    Fetches ``settings["orsr_url"]``, collects the targets of all links
    whose text is "Aktuálny", splits those URLs round-robin into
    ``settings["threads"]`` parts and hands each non-empty part to
    ``process_records`` on a thread pool of the same size.

    Any exception raised inside a worker is re-raised here.
    """
    from urllib.parse import urljoin

    # get all links to "Aktuálny" from the orsr url
    response = requests.get(settings["orsr_url"])
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.find_all("a", string="Aktuálny")
    # hrefs may be relative to the listing page; urljoin leaves absolute
    # URLs untouched, so this is safe either way
    records = [urljoin(settings["orsr_url"], link["href"]) for link in links]

    # distribute the URLs themselves (not their indices) over the number of
    # threads defined in config; a value below 1 would make the slice step
    # invalid, so fall back to a single worker
    threads = max(1, settings["threads"])
    parts = [records[i::threads] for i in range(threads)]

    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [
            executor.submit(process_records, part, thread_id)
            for thread_id, part in enumerate(parts, start=1)
            if part  # fewer records than threads leaves some parts empty
        ]
        # submit() stores worker exceptions on the future; result() surfaces
        # them instead of letting them disappear silently
        for future in futures:
            future.result()
|
||||||
|
|
||||||
|
|
||||||
|
def process_records(records, thread):
    """
    Worker for processing records in a thread.

    Each URL is passed to ``process_record``; the non-``None`` results are
    stored in MongoDB with a single insert.

    :param records: list of urls of records to process
    :param thread: thread id of processing thread (used as progress label)
    """
    data = []
    for url in tqdm(records, desc=f"thread {thread}"):
        record = process_record(url)
        # skip records that produced no document; None cannot be inserted
        if record is not None:
            data.append(record)

    # inserting an empty batch raises, and needs no connection anyway
    if not data:
        return

    collection = connect_db()
    try:
        # bulk_write expects operation objects (InsertOne, ...); plain
        # documents are stored with insert_many
        collection.insert_many(data)
    finally:
        # release the client even when the insert fails
        disconnect_db(collection)
|
||||||
|
|
||||||
|
|
||||||
|
def process_record(url):
    """
    Fetch one record page and parse its HTML.

    NOTE(review): the parsed page is not yet turned into a record; the
    function currently falls off the end and so returns ``None``. It is
    presumably meant to return a dictionary of parameters. Confirm and
    complete the extraction.

    :param url: url of the record
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """
    Manual smoke check: fetch one fixed record page, parse it, then open
    and close a database connection. Nothing is written to the database.
    """
    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
    page = BeautifulSoup(requests.get(url).content, "html.parser")

    # Draft of the record layout (not executed yet):
    #
    # record = {
    #     "oddiel": soup.find("span", string=re.compile("Oddiel:")),
    #     "vlozka": pass,
    #     "obchodneMeno": pass,
    #     "sidlo": pass,
    #     "ico": pass,
    #     "denZapisu": pass,
    #     "pravnaForma": pass,
    #     "predmetyCinnosti": pass,
    #     "spolocnici": pass,
    #     "vyskaVkladov": pass,
    #     "statutarnyOrgan": pass,
    #     "konanie": pass,
    #     "zakladneImanie": pass,
    #     "aktualizaciaUdajov": pass,
    #     "vypisUdajov": pass
    # }

    connection = connect_db()
    # collection.bulk_write(soup)
    disconnect_db(connection)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # The full scrape is switched off for now; only the manual check runs.
    # scrape_orsr()
    test()
|
||||||
|
|||||||
Reference in New Issue
Block a user