From bc6689be3c7ad886443ded8e2b0a374c9fee3c83 Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Wed, 3 Jan 2024 16:03:21 +0800 Subject: [PATCH 1/6] report-harvester: crawl data from twse, and store into db --- docker-compose.yml | 2 + services/report-harvester/requirements.txt | 2 + services/report-harvester/src/main.py | 50 ++++++++++++++++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 54af36a..2f47688 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,8 @@ services: report-harvester: build: ./services/report-harvester restart: always + env_file: + - ./config/.env networks: - tradewise-net depends_on: diff --git a/services/report-harvester/requirements.txt b/services/report-harvester/requirements.txt index e197c57..b7d9146 100644 --- a/services/report-harvester/requirements.txt +++ b/services/report-harvester/requirements.txt @@ -1,3 +1,5 @@ requests==2.31.0 APScheduler==3.10.4 pytz==2023.3.post1 +pymongo==4.6.1 +lxml==5.0.0 diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py index 34545d2..f7175f5 100644 --- a/services/report-harvester/src/main.py +++ b/services/report-harvester/src/main.py @@ -22,6 +22,15 @@ import requests from apscheduler.schedulers.background import BackgroundScheduler from requests.exceptions import HTTPError +from pymongo import MongoClient +from lxml import etree + +# Load environment variables +MONGO_USER = os.getenv("MONGO_INITDB_ROOT_USERNAME") +MONGO_PASSWORD = os.getenv("MONGO_INITDB_ROOT_PASSWORD") +MONGO_DB = os.getenv("MONGO_DATABASE") +client = MongoClient(f"mongodb://{MONGO_USER}:{MONGO_PASSWORD}@database:27017/") +db = client[MONGO_DB] # Set up logging log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() @@ -96,10 +105,45 @@ def store_financial_report(report_type, post_data): logging.error(f"Request error when storing data: {e}") return {"status_code": 500, "message": "Internal Server Error"} - def retrieve_ticker_symbols(): - # [TODO] Implement logic to retrieve the list of companies from the database - return ["2330", "2331"] + download_company_info() + collection = db['company'] + docs = collection.find({},{'symbol': 1}) + symbols = [doc['symbol']for doc in docs] + return symbols + +def download_company_info(): + base_url = 'https://isin.twse.com.tw/isin/class_main.jsp?owncode=&stockname=&isincode=&market=1&issuetype=1&industry_code=&Page=1&chklike=Y' + try: + response = requests.get(base_url) + response.raise_for_status() + + except HTTPError as http_err: + logging.error(f"HTTP error occurred while crawling: {http_err}") + raise + + except Exception as err: + logging.error(f"Error occurred while crawling: {err}") + raise + + listed_companies_data = response.text + root = etree.HTML(listed_companies_data) + + symbol_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券代號') + symbol_column_index = len(root.xpath(symbol_column_locator)) + 1 + name_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券名稱') + name_column_index = len(root.xpath(name_column_locator)) + 1 + row_locator = '//tr'.format(base_url) + rows = root.xpath(row_locator) + + results = [] + for row in rows: + symbol_company = { + 'symbol': row.xpath('.//td[{}]'.format(symbol_column_index))[0].text, + 'company': row.xpath('.//td[{}]'.format(name_column_index))[0].text + } + results.append(symbol_company) + db['company'].insert_many(results) def retrieve_financial_report_version_table(ticker_symbol, report_type): base_url = "http://database-api" From 17f4fa3a6c1e28cbf8581e5f28429ab2e20405ca Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Thu, 4 Jan 2024 00:57:48 +0800 Subject: [PATCH 2/6] report-harvester: Remove header, and check record is duplicate when inserting --- services/report-harvester/src/main.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py index f7175f5..5137f64 100644 --- a/services/report-harvester/src/main.py +++ b/services/report-harvester/src/main.py @@ -133,17 +133,22 @@ def download_company_info(): symbol_column_index = len(root.xpath(symbol_column_locator)) + 1 name_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券名稱') name_column_index = len(root.xpath(name_column_locator)) + 1 - row_locator = '//tr'.format(base_url) + row_locator = '//tr[position()>1]' rows = root.xpath(row_locator) results = [] + collection = db['company'] for row in rows: - symbol_company = { - 'symbol': row.xpath('.//td[{}]'.format(symbol_column_index))[0].text, - 'company': row.xpath('.//td[{}]'.format(name_column_index))[0].text - } - results.append(symbol_company) - db['company'].insert_many(results) + symbol = row.xpath('.//td[{}]'.format(symbol_column_index))[0].text + company = row.xpath('.//td[{}]'.format(name_column_index))[0].text + if collection.count_documents({'symbol': symbol}, limit = 1) == 0: + symbol_company = { + 'symbol': symbol, + 'company': company + } + results.append(symbol_company) + if len(results) != 0: + collection.insert_many(results) def retrieve_financial_report_version_table(ticker_symbol, report_type): base_url = "http://database-api" From 7289aa651166fce8ec1252b282929022eaa1c32d Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Mon, 8 Jan 2024 23:38:00 +0800 Subject: [PATCH 3/6] Extract function to corresponding modules. --- docker-compose.yml | 2 - services/database-api/src/main.py | 9 ++- services/mops-crawler/requirements.txt | 1 + services/mops-crawler/src/main.py | 37 ++++++++++++ services/report-harvester/requirements.txt | 2 - services/report-harvester/src/main.py | 68 ++++++++-------------- 6 files changed, 69 insertions(+), 50 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2f47688..54af36a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,8 +23,6 @@ services: report-harvester: build: ./services/report-harvester restart: always - env_file: - - ./config/.env networks: - tradewise-net depends_on: diff --git a/services/database-api/src/main.py b/services/database-api/src/main.py index 58aa0eb..cbd0f7e 100644 --- a/services/database-api/src/main.py +++ b/services/database-api/src/main.py @@ -21,7 +21,7 @@ log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, HTTPException, status, Request from pydantic import BaseModel from pymongo import MongoClient @@ -117,3 +117,10 @@ async def delete_report(ticker_symbol: str, report_type: str, year: int, season: if result.deleted_count: return {"message": f"{report_type.capitalize()} report deleted"} raise HTTPException(status_code=404, detail=f"{report_type.capitalize()} report not found") + +@app.post("/synchronize_company_table") +async def synchronize_company_table(request: Request): + collection = db['company'] + collection.drop() + companies = await request.json() + collection.insert_many(companies) \ No newline at end of file diff --git a/services/mops-crawler/requirements.txt b/services/mops-crawler/requirements.txt index 5b29f4a..d480515 100644 --- a/services/mops-crawler/requirements.txt +++ b/services/mops-crawler/requirements.txt @@ -5,3 +5,4 @@ lxml==4.9.3 requests==2.31.0 fastapi==0.105.0 uvicorn==0.25.0 +lxml==5.0.0 diff --git a/services/mops-crawler/src/main.py b/services/mops-crawler/src/main.py index 00e0faa..4a5e0d5 100644 --- a/services/mops-crawler/src/main.py +++ b/services/mops-crawler/src/main.py @@ -23,6 +23,7 @@ import requests from fastapi import FastAPI, HTTPException from pydantic import BaseModel +from lxml import etree # Set up logging log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() @@ -193,3 +194,39 @@ def get_financial_report(report_type: str, ticker_symbol: str, year: int, season # If everything went well, return the sanitized data return result["data"] + +@app.get('/get_all_companies') +def download_company_info(): + base_url = 'https://isin.twse.com.tw/isin/class_main.jsp?owncode=&stockname=&isincode=&market=1&issuetype=1&industry_code=&Page=1&chklike=Y' + try: + response = requests.get(base_url) + response.raise_for_status() + + except HTTPError as http_err: + logging.error(f"HTTP error occurred while crawling: {http_err}") + raise + + except Exception as err: + logging.error(f"Error occurred while crawling: {err}") + raise + + listed_companies_data = response.text + root = etree.HTML(listed_companies_data) + + symbol_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券代號') + symbol_column_index = len(root.xpath(symbol_column_locator)) + 1 + name_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券名稱') + name_column_index = len(root.xpath(name_column_locator)) + 1 + row_locator = '//tr[position()>1]' + rows = root.xpath(row_locator) + + results = [] + for row in rows: + symbol = row.xpath('.//td[{}]'.format(symbol_column_index))[0].text + company = row.xpath('.//td[{}]'.format(name_column_index))[0].text + symbol_company = { + 'symbol': symbol, + 'company': company + } + results.append(symbol_company) + return results \ No newline at end of file diff --git a/services/report-harvester/requirements.txt b/services/report-harvester/requirements.txt index b7d9146..e197c57 100644 --- a/services/report-harvester/requirements.txt +++ b/services/report-harvester/requirements.txt @@ -1,5 +1,3 @@ requests==2.31.0 APScheduler==3.10.4 pytz==2023.3.post1 -pymongo==4.6.1 -lxml==5.0.0 diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py index 5137f64..fe1cb96 100644 --- a/services/report-harvester/src/main.py +++ b/services/report-harvester/src/main.py @@ -22,15 +22,6 @@ import requests from apscheduler.schedulers.background import BackgroundScheduler from requests.exceptions import HTTPError -from pymongo import MongoClient -from lxml import etree - -# Load environment variables -MONGO_USER = os.getenv("MONGO_INITDB_ROOT_USERNAME") -MONGO_PASSWORD = os.getenv("MONGO_INITDB_ROOT_PASSWORD") -MONGO_DB = os.getenv("MONGO_DATABASE") -client = MongoClient(f"mongodb://{MONGO_USER}:{MONGO_PASSWORD}@database:27017/") -db = client[MONGO_DB] # Set up logging log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() @@ -106,49 +97,36 @@ def store_financial_report(report_type, post_data): return {"status_code": 500, "message": "Internal Server Error"} def retrieve_ticker_symbols(): - download_company_info() - collection = db['company'] - docs = collection.find({},{'symbol': 1}) - symbols = [doc['symbol']for doc in docs] - return symbols - -def download_company_info(): - base_url = 'https://isin.twse.com.tw/isin/class_main.jsp?owncode=&stockname=&isincode=&market=1&issuetype=1&industry_code=&Page=1&chklike=Y' + companies = get_companies_by_crawler() + synchronize_company(companies) + +def get_companies_by_crawler() -> dict: + base_url = 'http://mops-crawler' + url = f'{base_url}/get_all_companies' try: - response = requests.get(base_url) + response = requests.get(url) response.raise_for_status() - + return response.text except HTTPError as http_err: - logging.error(f"HTTP error occurred while crawling: {http_err}") + logging.error(f"HTTP error occurred while retrieving report version table: {http_err}") raise - except Exception as err: - logging.error(f"Error occurred while crawling: {err}") + logging.error(f"Error occurred while retrieving report version table: {err}") raise - listed_companies_data = response.text - root = etree.HTML(listed_companies_data) - - symbol_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券代號') - symbol_column_index = len(root.xpath(symbol_column_locator)) + 1 - name_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券名稱') - name_column_index = len(root.xpath(name_column_locator)) + 1 - row_locator = '//tr[position()>1]' - rows = root.xpath(row_locator) - - results = [] - collection = db['company'] - for row in rows: - symbol = row.xpath('.//td[{}]'.format(symbol_column_index))[0].text - company = row.xpath('.//td[{}]'.format(name_column_index))[0].text - if collection.count_documents({'symbol': symbol}, limit = 1) == 0: - symbol_company = { - 'symbol': symbol, - 'company': company - } - results.append(symbol_company) - if len(results) != 0: - collection.insert_many(results) +def synchronize_company(companies: dict): + base_url = 'http://database-api' + url = f'{base_url}/synchronize_company_table' + + try: + response = requests.post(url, json=companies) + response.raise_for_status() + except HTTPError as http_err: + logging.error(f"HTTP error occurred while synchronizing company table: {http_err}") + raise + except Exception as err: + logging.error(f"Error occurred while synchronizing company table: {err}") + raise def retrieve_financial_report_version_table(ticker_symbol, report_type): base_url = "http://database-api" From 20118d507f942dbf954f8fe76c45c7aa9b794e05 Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Tue, 9 Jan 2024 01:31:09 +0800 Subject: [PATCH 4/6] Fix synchronize_company_table --- services/database-api/src/main.py | 2 +- services/mops-crawler/requirements.txt | 1 - services/mops-crawler/src/main.py | 8 ++------ services/report-harvester/src/main.py | 8 +++++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/services/database-api/src/main.py b/services/database-api/src/main.py index cbd0f7e..e67c138 100644 --- a/services/database-api/src/main.py +++ b/services/database-api/src/main.py @@ -123,4 +123,4 @@ async def synchronize_company_table(request: Request): collection = db['company'] collection.drop() companies = await request.json() - collection.insert_many(companies) \ No newline at end of file + collection.insert_many(companies) diff --git a/services/mops-crawler/requirements.txt b/services/mops-crawler/requirements.txt index d480515..5b29f4a 100644 --- a/services/mops-crawler/requirements.txt +++ b/services/mops-crawler/requirements.txt @@ -5,4 +5,3 @@ lxml==4.9.3 requests==2.31.0 fastapi==0.105.0 uvicorn==0.25.0 -lxml==5.0.0 diff --git a/services/mops-crawler/src/main.py b/services/mops-crawler/src/main.py index 4a5e0d5..af53dc4 100644 --- a/services/mops-crawler/src/main.py +++ b/services/mops-crawler/src/main.py @@ -202,12 +202,8 @@ def download_company_info(): response = requests.get(base_url) response.raise_for_status() - except HTTPError as http_err: - logging.error(f"HTTP error occurred while crawling: {http_err}") - raise - except Exception as err: - logging.error(f"Error occurred while crawling: {err}") + logging.error(f"HTTP error occurred while crawling: {err}") raise listed_companies_data = response.text @@ -229,4 +225,4 @@ def download_company_info(): 'company': company } results.append(symbol_company) - return results \ No newline at end of file + return results diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py index fe1cb96..55eb86c 100644 --- a/services/report-harvester/src/main.py +++ b/services/report-harvester/src/main.py @@ -99,14 +99,16 @@ def store_financial_report(report_type, post_data): def retrieve_ticker_symbols(): companies = get_companies_by_crawler() synchronize_company(companies) + symbols = [company['symbol'] for company in companies] + return symbols -def get_companies_by_crawler() -> dict: +def get_companies_by_crawler() -> list: base_url = 'http://mops-crawler' url = f'{base_url}/get_all_companies' try: response = requests.get(url) response.raise_for_status() - return response.text + return response.json() except HTTPError as http_err: logging.error(f"HTTP error occurred while retrieving report version table: {http_err}") raise @@ -114,7 +116,7 @@ def get_companies_by_crawler() -> dict: logging.error(f"Error occurred while retrieving report version table: {err}") raise -def synchronize_company(companies: dict): +def synchronize_company(companies: list): base_url = 'http://database-api' url = f'{base_url}/synchronize_company_table' From 81440384adca62105fb55bef6c047b89be07ea4a Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Wed, 10 Jan 2024 22:21:10 +0800 Subject: [PATCH 5/6] isort --- services/database-api/src/main.py | 2 +- services/mops-crawler/src/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/database-api/src/main.py b/services/database-api/src/main.py index e67c138..bc67afe 100644 --- a/services/database-api/src/main.py +++ b/services/database-api/src/main.py @@ -21,7 +21,7 @@ log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") -from fastapi import FastAPI, HTTPException, status, Request +from fastapi import FastAPI, HTTPException, Request, status from pydantic import BaseModel from pymongo import MongoClient diff --git a/services/mops-crawler/src/main.py b/services/mops-crawler/src/main.py index af53dc4..0aeefc6 100644 --- a/services/mops-crawler/src/main.py +++ b/services/mops-crawler/src/main.py @@ -22,8 +22,8 @@ import pandas import requests from fastapi import FastAPI, HTTPException -from pydantic import BaseModel from lxml import etree +from pydantic import BaseModel # Set up logging log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper() From 81d5979d05358484f92d7843d67a67a429657b11 Mon Sep 17 00:00:00 2001 From: GreenNewbieeeee Date: Wed, 10 Jan 2024 22:32:56 +0800 Subject: [PATCH 6/6] Revise error message --- services/report-harvester/src/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py index 55eb86c..fd218ea 100644 --- a/services/report-harvester/src/main.py +++ b/services/report-harvester/src/main.py @@ -110,10 +110,10 @@ def get_companies_by_crawler() -> list: response.raise_for_status() return response.json() except HTTPError as http_err: - logging.error(f"HTTP error occurred while retrieving report version table: {http_err}") + logging.error(f"HTTP error occurred while retrieving company table from crawler: {http_err}") raise except Exception as err: - logging.error(f"Error occurred while retrieving report version table: {err}") + logging.error(f"Error occurred while retrieving company table from crawler: {err}") raise def synchronize_company(companies: list):