diff --git a/services/database-api/src/main.py b/services/database-api/src/main.py
index 58aa0eb..bc67afe 100644
--- a/services/database-api/src/main.py
+++ b/services/database-api/src/main.py
@@ -21,7 +21,7 @@ log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper()
 
 logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
 
-from fastapi import FastAPI, HTTPException, status
+from fastapi import FastAPI, HTTPException, Request, status
 from pydantic import BaseModel
 from pymongo import MongoClient
 
@@ -117,3 +117,11 @@ async def delete_report(ticker_symbol: str, report_type: str, year: int, season:
     if result.deleted_count:
         return {"message": f"{report_type.capitalize()} report deleted"}
     raise HTTPException(status_code=404, detail=f"{report_type.capitalize()} report not found")
+
+@app.post("/synchronize_company_table")
+async def synchronize_company_table(request: Request):
+    companies = await request.json()
+    collection = db['company']
+    collection.drop()
+    if companies:
+        collection.insert_many(companies)
diff --git a/services/mops-crawler/src/main.py b/services/mops-crawler/src/main.py
index 00e0faa..0aeefc6 100644
--- a/services/mops-crawler/src/main.py
+++ b/services/mops-crawler/src/main.py
@@ -22,6 +22,7 @@ import pandas
 import requests
 
 from fastapi import FastAPI, HTTPException
+from lxml import etree
 from pydantic import BaseModel
 
 # Set up logging
@@ -193,3 +194,35 @@ def get_financial_report(report_type: str, ticker_symbol: str, year: int, season
 
     # If everything went well, return the sanitized data
     return result["data"]
+
+@app.get('/get_all_companies')
+def download_company_info():
+    base_url = 'https://isin.twse.com.tw/isin/class_main.jsp?owncode=&stockname=&isincode=&market=1&issuetype=1&industry_code=&Page=1&chklike=Y'
+    try:
+        response = requests.get(base_url, timeout=30)
+        response.raise_for_status()
+
+    except Exception as err:
+        logging.error(f"HTTP error occurred while crawling: {err}")
+        raise
+
+    listed_companies_data = response.text
+    root = etree.HTML(listed_companies_data)
+
+    symbol_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券代號')
+    symbol_column_index = len(root.xpath(symbol_column_locator)) + 1
+    name_column_locator = '//tr//*[normalize-space()=\'{}\']/preceding-sibling::*'.format('有價證券名稱')
+    name_column_index = len(root.xpath(name_column_locator)) + 1
+    row_locator = '//tr[position()>1]'
+    rows = root.xpath(row_locator)
+
+    results = []
+    for row in rows:
+        symbol = row.xpath('.//td[{}]'.format(symbol_column_index))[0].text
+        company = row.xpath('.//td[{}]'.format(name_column_index))[0].text
+        symbol_company = {
+            'symbol': symbol,
+            'company': company
+        }
+        results.append(symbol_company)
+    return results
diff --git a/services/report-harvester/src/main.py b/services/report-harvester/src/main.py
index 34545d2..fd218ea 100644
--- a/services/report-harvester/src/main.py
+++ b/services/report-harvester/src/main.py
@@ -96,10 +96,39 @@ def store_financial_report(report_type, post_data):
         logging.error(f"Request error when storing data: {e}")
         return {"status_code": 500, "message": "Internal Server Error"}
 
-
 def retrieve_ticker_symbols():
-    # [TODO] Implement logic to retrieve the list of companies from the database
-    return ["2330", "2331"]
+    companies = get_companies_by_crawler()
+    synchronize_company(companies)
+    symbols = [company['symbol'] for company in companies]
+    return symbols
+
+def get_companies_by_crawler() -> list:
+    base_url = 'http://mops-crawler'
+    url = f'{base_url}/get_all_companies'
+    try:
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        return response.json()
+    except HTTPError as http_err:
+        logging.error(f"HTTP error occurred while retrieving company table from crawler: {http_err}")
+        raise
+    except Exception as err:
+        logging.error(f"Error occurred while retrieving company table from crawler: {err}")
+        raise
+
+def synchronize_company(companies: list):
+    base_url = 'http://database-api'
+    url = f'{base_url}/synchronize_company_table'
+
+    try:
+        response = requests.post(url, json=companies, timeout=30)
+        response.raise_for_status()
+    except HTTPError as http_err:
+        logging.error(f"HTTP error occurred while synchronizing company table: {http_err}")
+        raise
+    except Exception as err:
+        logging.error(f"Error occurred while synchronizing company table: {err}")
+        raise
 
 def retrieve_financial_report_version_table(ticker_symbol, report_type):
     base_url = "http://database-api"