diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e5764a7..5533387 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,6 +19,7 @@ ## Testing - [ ] Tests pass (coverage โ‰ฅ 91%) - [ ] Manually tested +- [ ] `make docker-integration` passed locally *(required when touching `Dockerfile`, `entrypoint.sh`, `docker-compose.yml`, or `packages/parser-core/`)* ## Checklist - [ ] Code follows project style diff --git a/.gitignore b/.gitignore index 7b6fe98..b8515f1 100644 --- a/.gitignore +++ b/.gitignore @@ -249,3 +249,7 @@ tmp/ # GSD planning artifacts .planning/ + +# Integration test snapshot โ€” personal to each developer's local input/ PDFs. +# Never commit updates; the copy on main is kept only as a reference baseline. +packages/parser-core/tests/integration/snapshots/output_snapshot.json diff --git a/Makefile b/Makefile index 967523c..ea4c95c 100644 --- a/Makefile +++ b/Makefile @@ -360,7 +360,7 @@ show-retention-status: ## Show data retention status @python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')" # Docker build modes -.PHONY: docker-local docker-remote docker-build docker-pull +.PHONY: docker-local docker-remote docker-build docker-pull docker-integration docker-local: ## Build and run from local code @echo "๐Ÿ”จ Building from local code..." @@ -379,6 +379,24 @@ docker-build: ## Build local image without running @cp .env.local .env docker-compose build +docker-integration: ## Run Docker integration test against input/ โ€” compares output to local snapshot + @echo "๐Ÿงช Running Docker integration test..." 
+ @[ -d input ] && [ -n "$$(find input -name '*.pdf' 2>/dev/null | head -1)" ] || { echo "❌ No PDFs found in input/ — add statements first"; exit 1; } + @mkdir -p /tmp/docker-integration-output + @cp .env.local .env + @docker-compose build -q + @docker run --rm \ + -v "$$(pwd)/input:/app/input:ro" \ + -v "/tmp/docker-integration-output:/app/output" \ + -e EXIT_AFTER_PROCESSING=true \ + bankstatementsprocessor:latest + @python3 packages/parser-core/tests/integration/docker_snapshot.py \ + /tmp/docker-integration-output \ + packages/parser-core/tests/integration/snapshots/output_snapshot.json \ + $$([ "$(UPDATE)" = "1" ] && echo "--update" || echo "") + @rm -rf /tmp/docker-integration-output + @echo "✅ Docker integration test passed" + docker-pull: ## Pull remote image without running @cp .env.remote .env docker-compose pull diff --git a/packages/parser-core/tests/integration/docker_snapshot.py b/packages/parser-core/tests/integration/docker_snapshot.py new file mode 100644 index 0000000..f06e0e9 --- /dev/null +++ b/packages/parser-core/tests/integration/docker_snapshot.py @@ -0,0 +1,140 @@ +"""Docker integration snapshot helper. + +Reads the container's output directory, builds the same snapshot structure +as test_output_snapshot.py, then either updates the baseline or compares +against it. + +Usage: + # Compare against existing snapshot: + python3 docker_snapshot.py <output_dir> <snapshot_file> + + # Update (or create) the snapshot baseline: + python3 docker_snapshot.py <output_dir> <snapshot_file> --update +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + + +def _build_snapshot(output_dir: Path) -> dict: + """Collect comparable metrics from the output directory. + + Mirrors the logic in test_output_snapshot._build_snapshot(). 
+ """ + snapshot: dict = {"files": {}} + + for path in sorted(output_dir.iterdir()): + if path.name.startswith(".") or path.is_dir(): + continue + + entry: dict = {"size_bytes": path.stat().st_size} + + if path.suffix == ".json": + try: + data = json.loads(path.read_text(encoding="utf-8")) + if isinstance(data, list): + entry["record_count"] = len(data) + elif isinstance(data, dict): + entry["keys"] = sorted(data.keys()) + except json.JSONDecodeError: + pass + + if path.suffix == ".csv": + lines = path.read_text(encoding="utf-8").splitlines() + entry["row_count"] = len([line for line in lines if line.strip()]) - 1 + + snapshot["files"][path.name] = entry + + csv_files = [ + k for k in snapshot["files"] if k.endswith(".csv") and "duplicate" not in k + ] + snapshot["summary"] = { + "total_files": len(snapshot["files"]), + "csv_outputs": len(csv_files), + "output_filenames": sorted(snapshot["files"].keys()), + } + + return snapshot + + +def main() -> None: + if len(sys.argv) < 3: + print( + "Usage: docker_snapshot.py [--update]", + file=sys.stderr, + ) + sys.exit(1) + + output_dir = Path(sys.argv[1]) + snapshot_file = Path(sys.argv[2]) + update = "--update" in sys.argv + + if not output_dir.exists(): + print(f"โŒ Output directory not found: {output_dir}", file=sys.stderr) + sys.exit(1) + + current = _build_snapshot(output_dir) + + if update: + snapshot_file.parent.mkdir(parents=True, exist_ok=True) + snapshot_file.write_text( + json.dumps(current, indent=2, sort_keys=True), encoding="utf-8" + ) + print(f"โœ… Snapshot updated: {snapshot_file}") + return + + if not snapshot_file.exists(): + print( + f"โŒ No snapshot found at {snapshot_file}.\n" + "Run with UPDATE=1 to create your baseline:\n" + " make docker-integration UPDATE=1", + file=sys.stderr, + ) + sys.exit(1) + + baseline = json.loads(snapshot_file.read_text(encoding="utf-8")) + diffs = [] + + # Compare per-file metrics + base_files = set(baseline.get("summary", {}).get("output_filenames", [])) + curr_files = 
set(current.get("summary", {}).get("output_filenames", [])) + for added in sorted(curr_files - base_files): + diffs.append(f" new output file: {added}") + for removed in sorted(base_files - curr_files): + diffs.append(f" removed output file: {removed}") + + for fname in sorted(base_files & curr_files): + base_entry = baseline["files"].get(fname, {}) + curr_entry = current["files"].get(fname, {}) + for metric in ("row_count", "record_count"): + bv = base_entry.get(metric) + cv = curr_entry.get(metric) + if bv is not None and bv != cv: + diffs.append(f" {fname}.{metric}: {bv} โ†’ {cv}") + + if diffs: + diff_text = "\n".join(diffs) + print( + f"โŒ Snapshot mismatch โ€” {len(diffs)} change(s) detected:\n" + f"{diff_text}\n\n" + "If intentional, re-run with UPDATE=1 to accept:\n" + " make docker-integration UPDATE=1", + file=sys.stderr, + ) + sys.exit(1) + + total = sum( + e.get("record_count", e.get("row_count", 0)) + for e in current["files"].values() + if "record_count" in e or "row_count" in e + ) + print( + f"โœ… Snapshot matches baseline ({total} records across {len(curr_files)} files)" + ) + + +if __name__ == "__main__": + main() diff --git a/packages/parser-core/tests/integration/test_output_snapshot.py b/packages/parser-core/tests/integration/test_output_snapshot.py index 7a8cfc7..cf00482 100644 --- a/packages/parser-core/tests/integration/test_output_snapshot.py +++ b/packages/parser-core/tests/integration/test_output_snapshot.py @@ -1,17 +1,21 @@ """Integration snapshot test for end-to-end output validation. Runs the full processing pipeline against the real input/ directory and -compares key output metrics against a committed snapshot baseline. +compares key output metrics against a local snapshot baseline. -Usage: - # Run the integration test (skipped by default): - pytest -m integration +The snapshot is personal to each developer's machine and input PDFs โ€” +it is gitignored and never committed. 
Run with --snapshot-update once +to create your baseline, then re-run as you make changes to catch +regressions. - # Update the snapshot baseline (first run or after intentional change): +Usage: + # Create or refresh your local snapshot baseline: pytest -m integration --snapshot-update -The snapshot file is committed to source control so changes are visible in -code review. Input/output folders are gitignored and never committed. + # Validate current output against your baseline: + pytest -m integration + +Input/output folders and the snapshot file are gitignored and never committed. """ from __future__ import annotations