diff --git a/app/config.py b/app/config.py
index a4c46ff..b74e41b 100644
--- a/app/config.py
+++ b/app/config.py
@@ -11,5 +11,9 @@ if path.exists((cfg_pth := path.join(config_path, "../config.cfg"))):
     config.read(cfg_pth)
 
 settings = {
-    "mongodb_uri": config.get("DB","MONGODB_URI")
-}
\ No newline at end of file
+    "mongodb_uri": config.get("DB","MONGODB_URI"),
+    "mongodb_db": config.get("DB","MONGODB_DB"),
+    "mongodb_collection": config.get("DB","MONGODB_COLLECTION"),
+    "orsr_url": config.get("WEB", "ORSR_URL"),
+    "threads": int(config.get("APP", "THREADS"))
+}
diff --git a/app/db.py b/app/db.py
index cef6a01..6193b07 100644
--- a/app/db.py
+++ b/app/db.py
@@ -1,18 +1,32 @@
-from flask import current_app, g
+from flask import g
 from werkzeug.local import LocalProxy
-from flask_pymongo import PyMongo
+from pymongo import MongoClient
+from .config import settings
+
+
+def connect_db():
+    """Open a MongoDB client and return the configured collection."""
+    client = MongoClient(settings["mongodb_uri"])
+    db = client[settings["mongodb_db"]]
+    collection = db[settings["mongodb_collection"]]
+    return collection
+
+
+def disconnect_db(conn):
+    """Close the client that owns the collection returned by connect_db."""
+    conn.database.client.close()
 
 
 def get_db():
     """
     Configuration method to return db instance
     """
-    db = getattr(g, "_database", None)
+    collection = getattr(g, "_database", None)
 
-    if db is None:
-        db = g._database = PyMongo(current_app).db
+    if collection is None:
+        collection = g._database = connect_db()
 
-    return db
+    return collection
 
 
 # Use LocalProxy to read the global db instance with just `db`
diff --git a/app/routes.py b/app/routes.py
index c274302..dc2db88 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -5,10 +5,10 @@ from .db import db
 
 @app.route("/detail", methods=["GET"])
 def detail():
-    if "ico" not in request.args:
+    ico = request.args.get("ico")
+    if ico is None:
         return "missing ico"
-    else:
-        ico = request.args["ico"]
+
     return f"ICO je {ico}"
 
 
diff --git a/config_base.cfg b/config_base.cfg
index c4a89ec..7ab8b63 100644
--- a/config_base.cfg
+++ b/config_base.cfg
@@ -1,2 +1,10 @@
 [DB]
-MONGODB_URI = "mongodb://localhost:27017/softone"
\ No newline at end of file
+MONGODB_URI = mongodb://localhost:27017/softone
+MONGODB_DB = softone
+MONGODB_COLLECTION = orsr
+
+[WEB]
+ORSR_URL = http://www.orsr.sk/hladaj_zmeny.asp
+
+[APP]
+THREADS = 8
\ No newline at end of file
diff --git a/flaskapp.py b/flaskapp.py
index ddda1a3..21e0492 100644
--- a/flaskapp.py
+++ b/flaskapp.py
@@ -15,6 +15,5 @@ def create_app():
 if __name__ == "__main__":
     app = create_app()
     app.config["Debug"] = True
-    app.config["MONGO_URI"] = settings["mongodb_uri"]
     app.run()
 
diff --git a/requirements.txt b/requirements.txt
index c7138a7..9bd61fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
+requests~=2.31.0
+beautifulsoup4~=4.12.2
 Flask~=2.3.3
-Flask-PyMongo~=2.3.0
\ No newline at end of file
+pymongo~=4.5.0
+tqdm~=4.66.1
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
index e69de29..ef01fc5 100644
--- a/scraper.py
+++ b/scraper.py
@@ -0,0 +1,111 @@
+import requests
+import re
+from bs4 import BeautifulSoup
+from tqdm.auto import tqdm
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urljoin
+
+from app.config import settings
+from app.db import connect_db, disconnect_db
+from time import sleep
+
+
+def scrape_orsr():
+    """
+    Scrape the endpoint defined in config and store the records in mongodb.
+
+    Collects the href of every "Aktuálny" link, splits the urls between
+    settings["threads"] workers and re-raises the first worker failure.
+    """
+    # get all links to "Aktuálny" from the orsr url
+    html = requests.get(settings["orsr_url"])
+    soup = BeautifulSoup(html.content, "html.parser")
+    links = soup.find_all("a", string="Aktuálny")
+    # urljoin leaves absolute hrefs untouched and resolves relative ones
+    records = [urljoin(settings["orsr_url"], link["href"]) for link in links]
+
+    # hand out the urls themselves round-robin; max_workers needs at least 1
+    threads = max(1, settings["threads"])
+    parts = [records[i::threads] for i in range(threads)]
+
+    with ThreadPoolExecutor(max_workers=threads) as t:
+        futures = [
+            t.submit(process_records, part, thread_id)
+            for thread_id, part in enumerate(parts, start=1)
+            if part
+        ]
+
+    # result() re-raises an exception that would otherwise be dropped
+    for future in futures:
+        future.result()
+
+
+def process_records(records, thread):
+    """
+    worker for processing records in a thread
+    :param records: list of urls of records to process
+    :param thread: thread id of processing thread
+    """
+    data = []
+    for url in tqdm(records, desc=f"thread {thread}"):
+        record = process_record(url)
+        # process_record yields None until a page is parsed into a document
+        if record is not None:
+            data.append(record)
+    if not data:
+        # insert_many rejects an empty list, so there is nothing to store
+        return
+    collection = connect_db()
+    try:
+        # plain documents go through insert_many; bulk_write only accepts
+        # pymongo operation objects such as InsertOne
+        collection.insert_many(data)
+    finally:
+        disconnect_db(collection)
+
+
+def process_record(url):
+    """
+    process one record. Scrape url and return its data
+    :param url: url of the record
+    :return: dictionary of parameters (None until parsing is implemented)
+    """
+    html = requests.get(url)
+    soup = BeautifulSoup(html.content, "html.parser")
+    # TODO: extract the record fields from soup and return them as a dict
+    return None
+
+
+def test():
+    """Manual check: fetch one sample record and open/close the db connection."""
+    url = "https://www.orsr.sk/vypis.asp?ID=648444&SID=9&P=0"
+    html = requests.get(url)
+    soup = BeautifulSoup(html.content, "html.parser")
+
+    '''
+    record = {
+        "oddiel": soup.find("span", string=re.compile("Oddiel:")),
+        "vlozka": pass,
+        "obchodneMeno": pass,
+        "sidlo": pass,
+        "ico": pass,
+        "denZapisu": pass,
+        "pravnaForma": pass,
+        "predmetyCinnosti": pass,
+        "spolocnici": pass,
+        "vyskaVkladov": pass,
+        "statutarnyOrgan": pass,
+        "konanie": pass,
+        "zakladneImanie": pass,
+        "aktualizaciaUdajov": pass,
+        "vypisUdajov": pass
+    }
+    '''
+    collection = connect_db()
+    #collection.bulk_write(soup)
+    disconnect_db(collection)
+
+
+if __name__ == "__main__":
+    #scrape_orsr()
+    test()