Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion services/database-api/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
log_level = os.environ.get("LOG_LEVEL", "DEBUG").upper()
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")

from fastapi import FastAPI, HTTPException, status
from fastapi import FastAPI, HTTPException, Request, status
from pydantic import BaseModel
from pymongo import MongoClient

Expand Down Expand Up @@ -117,3 +117,10 @@ async def delete_report(ticker_symbol: str, report_type: str, year: int, season:
if result.deleted_count:
return {"message": f"{report_type.capitalize()} report deleted"}
raise HTTPException(status_code=404, detail=f"{report_type.capitalize()} report not found")

@app.post("/synchronize_company_table")
async def synchronize_company_table(request: Request):
    """Replace the entire 'company' collection with the JSON list in the request body.

    The body is expected to be a JSON array of company documents
    (e.g. [{"symbol": ..., "company": ...}, ...]).

    Returns a small summary dict with the number of documents inserted.
    """
    # Parse the payload BEFORE touching the collection: previously the
    # collection was dropped first, so a malformed body destroyed the
    # existing data and left the table empty.
    companies = await request.json()
    collection = db['company']
    collection.drop()
    # insert_many raises pymongo.errors.InvalidOperation on an empty list,
    # so only insert when there is something to insert.
    if companies:
        collection.insert_many(companies)
    return {"message": f"Synchronized {len(companies)} companies"}
33 changes: 33 additions & 0 deletions services/mops-crawler/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import pandas
import requests
from fastapi import FastAPI, HTTPException
from lxml import etree
from pydantic import BaseModel

# Set up logging
Expand Down Expand Up @@ -193,3 +194,35 @@ def get_financial_report(report_type: str, ticker_symbol: str, year: int, season

# If everything went well, return the sanitized data
return result["data"]

@app.get('/get_all_companies')
def download_company_info():
    """Scrape the TWSE ISIN listing page and return the listed companies.

    Returns a list of dicts shaped like {'symbol': ..., 'company': ...},
    one per table row. Logs and re-raises any HTTP failure.
    """
    page_url = ('https://isin.twse.com.tw/isin/class_main.jsp?owncode=&stockname='
                '&isincode=&market=1&issuetype=1&industry_code=&Page=1&chklike=Y')
    try:
        page = requests.get(page_url)
        page.raise_for_status()

    except Exception as err:
        logging.error(f"HTTP error occurred while crawling: {err}")
        raise

    root = etree.HTML(page.text)

    def _column_index(header):
        # 1-based <td> position of the column titled `header`: count the
        # sibling cells that precede the matching header cell, plus one.
        locator = "//tr//*[normalize-space()='{}']/preceding-sibling::*".format(header)
        return len(root.xpath(locator)) + 1

    symbol_idx = _column_index('有價證券代號')
    name_idx = _column_index('有價證券名稱')

    # position()>1 skips the header row of the table.
    return [
        {
            'symbol': row.xpath('.//td[{}]'.format(symbol_idx))[0].text,
            'company': row.xpath('.//td[{}]'.format(name_idx))[0].text,
        }
        for row in root.xpath('//tr[position()>1]')
    ]
35 changes: 32 additions & 3 deletions services/report-harvester/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,39 @@ def store_financial_report(report_type, post_data):
logging.error(f"Request error when storing data: {e}")
return {"status_code": 500, "message": "Internal Server Error"}


def retrieve_ticker_symbols():
    """Return the list of ticker symbols for all listed companies.

    Pulls the current company table from the mops-crawler service,
    pushes it to the database API so the stored table stays in sync,
    then extracts the 'symbol' field from each entry.
    """
    company_table = get_companies_by_crawler()
    synchronize_company(company_table)
    return [entry['symbol'] for entry in company_table]

def get_companies_by_crawler() -> list:
    """Fetch the full company list from the mops-crawler service.

    Returns the decoded JSON payload (a list of company dicts).
    Any failure — HTTP status error, connection problem, or a body
    that is not valid JSON — is logged and re-raised.
    """
    endpoint = 'http://mops-crawler/get_all_companies'
    try:
        # .json() stays inside the try so decode failures are logged too.
        reply = requests.get(endpoint)
        reply.raise_for_status()
        return reply.json()
    except HTTPError as http_err:
        logging.error(f"HTTP error occurred while retrieving company table from crawler: {http_err}")
        raise
    except Exception as err:
        logging.error(f"Error occurred while retrieving company table from crawler: {err}")
        raise

def synchronize_company(companies: list):
    """Push the given company list to the database API.

    POSTs the list to the /synchronize_company_table endpoint, which
    replaces the stored company table wholesale. Logs and re-raises
    any HTTP or connection failure.
    """
    endpoint = 'http://database-api/synchronize_company_table'
    try:
        reply = requests.post(endpoint, json=companies)
        reply.raise_for_status()
    except HTTPError as http_err:
        logging.error(f"HTTP error occurred while synchronizing company table: {http_err}")
        raise
    except Exception as err:
        logging.error(f"Error occurred while synchronizing company table: {err}")
        raise

def retrieve_financial_report_version_table(ticker_symbol, report_type):
base_url = "http://database-api"
Expand Down