Skip to content

Commit 93cdbe6

Browse files
committed
Include OpenAPI crawled CSV in database build pipeline
1 parent 569faec commit 93cdbe6

2 files changed

Lines changed: 79 additions & 1 deletion

File tree

database/issue_status_report.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pipelines/build_databases.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,38 @@ def parse_roc_ym(raw: str) -> tuple[str, int, int, str] | None:
7070
return digits, ad_year, month, month_key
7171

7272

73+
def parse_any_ym(raw: str) -> tuple[str, int, int, str] | None:
74+
s = (raw or "").strip()
75+
if not s:
76+
return None
77+
78+
if "/" in s:
79+
parts = s.split("/")
80+
if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
81+
y = int(parts[0])
82+
m = int(parts[1])
83+
if y >= 1900 and 1 <= m <= 12:
84+
token = f"{y:04d}{m:02d}"
85+
return token, y, m, f"{y:04d}-{m:02d}"
86+
87+
digits = "".join(ch for ch in s if ch.isdigit())
88+
if not digits:
89+
return None
90+
91+
if len(digits) == 6:
92+
y = int(digits[:4])
93+
m = int(digits[-2:])
94+
if y >= 1900 and 1 <= m <= 12:
95+
return digits, y, m, f"{y:04d}-{m:02d}"
96+
97+
if len(digits) == 4:
98+
y = int(digits)
99+
if y >= 1900:
100+
return digits, y, 1, f"{y:04d}-01"
101+
102+
return parse_roc_ym(raw)
103+
104+
73105
def parse_value(raw: str) -> float | None:
74106
val = raw.strip().replace(",", "")
75107
if not val:
@@ -115,6 +147,50 @@ def read_facts(data_dir: Path) -> list[FactRow]:
115147
return rows
116148

117149

150+
def read_openapi_facts(openapi_dir: Path) -> list[FactRow]:
    """Build fact rows from every ``*.csv`` file directly under *openapi_dir*.

    Files are visited in sorted path order. Each file becomes a dataset named
    ``OPENAPI_<file stem>``. For every record whose ``年月`` cell is accepted
    by ``parse_any_ym``, one ``FactRow`` is produced per non-blank metric
    cell; records with an unparseable ``年月`` are skipped.

    Args:
        openapi_dir: Directory holding the crawled CSV files.

    Returns:
        The collected rows; an empty list when *openapi_dir* does not exist.
    """
    facts: list[FactRow] = []
    if not openapi_dir.exists():
        return facts

    # Columns that are never emitted as metrics.
    non_metric_columns = {"年月", "公告日期", "TRANS_DATE"}

    for path in sorted(openapi_dir.glob("*.csv")):
        dataset_name = f"OPENAPI_{path.stem}"
        # utf-8-sig drops a leading BOM so it cannot leak into the first
        # header name.
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            reader = csv.DictReader(handle)
            header = reader.fieldnames
            if header is None:
                # No header row could be read from this file.
                continue

            metrics = [name for name in header if name not in non_metric_columns]
            for record in reader:
                period = parse_any_ym(record.get("年月", ""))
                if period is None:
                    continue
                token, ad_year, month, month_key = period

                for metric in metrics:
                    # A missing cell comes back as None; treat it as blank.
                    cell = (record.get(metric) or "").strip()
                    if not cell:
                        continue
                    fact = FactRow(
                        dataset=dataset_name,
                        roc_ym=token,
                        ad_year=ad_year,
                        month=month,
                        month_key=month_key,
                        institution="總計",
                        institution_type="OpenAPI",
                        item_zh=metric,
                        item_en=metric,
                        value_raw=cell,
                        value_num=parse_value(cell),
                        source_file=str(path),
                    )
                    facts.append(fact)
    return facts
192+
193+
118194
def create_sqlite(sqlite_path: Path, rows: Iterable[FactRow]) -> None:
119195
sqlite_path.parent.mkdir(parents=True, exist_ok=True)
120196
conn = sqlite3.connect(sqlite_path)
@@ -394,8 +470,10 @@ def main() -> None:
394470
args = parse_args()
395471
base_path = Path(args.base_path).resolve()
396472
data_dir = base_path / "data"
473+
openapi_dir = base_path / "rawdata" / "openapi"
397474

398475
rows = read_facts(data_dir)
476+
rows.extend(read_openapi_facts(openapi_dir))
399477
if not rows:
400478
raise SystemExit(f"No CSV rows found under {data_dir}")
401479

0 commit comments

Comments
 (0)