Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
## Testing
- [ ] Tests pass (coverage ≥ 91%)
- [ ] Manually tested
- [ ] `make docker-integration` passed locally *(required when touching `Dockerfile`, `entrypoint.sh`, `docker-compose.yml`, or `packages/parser-core/`)*

## Checklist
- [ ] Code follows project style
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,7 @@ tmp/

# GSD planning artifacts
.planning/

# Integration test snapshot — personal to each developer's local input/ PDFs.
# Never commit updates; the copy on main is kept only as a reference baseline.
packages/parser-core/tests/integration/snapshots/output_snapshot.json
20 changes: 19 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ show-retention-status: ## Show data retention status
@python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')"

# Docker build modes
.PHONY: docker-local docker-remote docker-build docker-pull
.PHONY: docker-local docker-remote docker-build docker-pull docker-integration

docker-local: ## Build and run from local code
@echo "🔨 Building from local code..."
Expand All @@ -379,6 +379,24 @@ docker-build: ## Build local image without running
@cp .env.local .env
docker-compose build

docker-integration: ## Run Docker integration test against input/ — compares output to local snapshot
	@echo "🧪 Running Docker integration test..."
	@[ -d input ] && [ -n "$$(find input -name '*.pdf' 2>/dev/null | head -1)" ] || { echo "❌ No PDFs found in input/ — add statements first"; exit 1; }
	@# Start from a clean output dir: the cleanup at the end of this recipe only
	@# runs on success, so a previously FAILED run leaves stale files in this
	@# shared /tmp path that would pollute the next snapshot comparison.
	@rm -rf /tmp/docker-integration-output
	@mkdir -p /tmp/docker-integration-output
	@cp .env.local .env
	@docker-compose build -q
	@docker run --rm \
		-v "$$(pwd)/input:/app/input:ro" \
		-v "/tmp/docker-integration-output:/app/output" \
		-e EXIT_AFTER_PROCESSING=true \
		bankstatementsprocessor:latest
	@python3 packages/parser-core/tests/integration/docker_snapshot.py \
		/tmp/docker-integration-output \
		packages/parser-core/tests/integration/snapshots/output_snapshot.json \
		$$([ "$(UPDATE)" = "1" ] && echo "--update" || echo "")
	@rm -rf /tmp/docker-integration-output
	@echo "✅ Docker integration test passed"

docker-pull: ## Pull remote image without running
@cp .env.remote .env
docker-compose pull
140 changes: 140 additions & 0 deletions packages/parser-core/tests/integration/docker_snapshot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Docker integration snapshot helper.

Reads the container's output directory, builds the same snapshot structure
as test_output_snapshot.py, then either updates the baseline or compares
against it.

Usage:
# Compare against existing snapshot:
python3 docker_snapshot.py <output_dir> <snapshot_file>

# Update (or create) the snapshot baseline:
python3 docker_snapshot.py <output_dir> <snapshot_file> --update
"""

from __future__ import annotations

import json
import sys
from pathlib import Path


def _build_snapshot(output_dir: Path) -> dict:
"""Collect comparable metrics from the output directory.

Mirrors the logic in test_output_snapshot._build_snapshot().
"""
snapshot: dict = {"files": {}}

for path in sorted(output_dir.iterdir()):
if path.name.startswith(".") or path.is_dir():
continue

entry: dict = {"size_bytes": path.stat().st_size}

if path.suffix == ".json":
try:
data = json.loads(path.read_text(encoding="utf-8"))
if isinstance(data, list):
entry["record_count"] = len(data)
elif isinstance(data, dict):
entry["keys"] = sorted(data.keys())
except json.JSONDecodeError:
pass

if path.suffix == ".csv":
lines = path.read_text(encoding="utf-8").splitlines()
entry["row_count"] = len([line for line in lines if line.strip()]) - 1

snapshot["files"][path.name] = entry

csv_files = [
k for k in snapshot["files"] if k.endswith(".csv") and "duplicate" not in k
]
snapshot["summary"] = {
"total_files": len(snapshot["files"]),
"csv_outputs": len(csv_files),
"output_filenames": sorted(snapshot["files"].keys()),
}

return snapshot


def _compare_snapshots(baseline: dict, current: dict) -> list[str]:
    """Return human-readable differences between two snapshots.

    Compares the set of output filenames, plus the row/record counts of
    files present in both snapshots. Other metrics (size_bytes, JSON
    keys) are not compared. An empty list means the snapshots match.
    """
    diffs: list[str] = []

    base_files = set(baseline.get("summary", {}).get("output_filenames", []))
    curr_files = set(current.get("summary", {}).get("output_filenames", []))
    for added in sorted(curr_files - base_files):
        diffs.append(f" new output file: {added}")
    for removed in sorted(base_files - curr_files):
        diffs.append(f" removed output file: {removed}")

    for fname in sorted(base_files & curr_files):
        base_entry = baseline["files"].get(fname, {})
        curr_entry = current["files"].get(fname, {})
        for metric in ("row_count", "record_count"):
            bv = base_entry.get(metric)
            cv = curr_entry.get(metric)
            # Only flag metrics the baseline actually recorded.
            if bv is not None and bv != cv:
                diffs.append(f" {fname}.{metric}: {bv} → {cv}")

    return diffs


def main() -> None:
    """CLI entry point: update or verify the Docker output snapshot.

    Usage: docker_snapshot.py <output_dir> <snapshot_file> [--update]

    Exits with status 1 on a usage error, a missing output directory,
    a missing baseline (when comparing), or a snapshot mismatch.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: docker_snapshot.py <output_dir> <snapshot_file> [--update]",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = Path(sys.argv[1])
    snapshot_file = Path(sys.argv[2])
    update = "--update" in sys.argv

    if not output_dir.exists():
        print(f"❌ Output directory not found: {output_dir}", file=sys.stderr)
        sys.exit(1)

    current = _build_snapshot(output_dir)

    if update:
        # Create parent dirs on the first run; overwrite any existing baseline.
        snapshot_file.parent.mkdir(parents=True, exist_ok=True)
        snapshot_file.write_text(
            json.dumps(current, indent=2, sort_keys=True), encoding="utf-8"
        )
        print(f"✅ Snapshot updated: {snapshot_file}")
        return

    if not snapshot_file.exists():
        print(
            f"❌ No snapshot found at {snapshot_file}.\n"
            "Run with UPDATE=1 to create your baseline:\n"
            " make docker-integration UPDATE=1",
            file=sys.stderr,
        )
        sys.exit(1)

    baseline = json.loads(snapshot_file.read_text(encoding="utf-8"))
    diffs = _compare_snapshots(baseline, current)

    if diffs:
        diff_text = "\n".join(diffs)
        print(
            f"❌ Snapshot mismatch — {len(diffs)} change(s) detected:\n"
            f"{diff_text}\n\n"
            "If intentional, re-run with UPDATE=1 to accept:\n"
            " make docker-integration UPDATE=1",
            file=sys.stderr,
        )
        sys.exit(1)

    # Summary line: prefer record_count (JSON) and fall back to row_count
    # (CSV); files with neither metric are excluded from the total.
    total = sum(
        e.get("record_count", e.get("row_count", 0))
        for e in current["files"].values()
        if "record_count" in e or "row_count" in e
    )
    file_count = len(current.get("summary", {}).get("output_filenames", []))
    print(
        f"✅ Snapshot matches baseline ({total} records across {file_count} files)"
    )


# Script entry point — invoked by the Makefile's docker-integration target.
if __name__ == "__main__":
    main()
18 changes: 11 additions & 7 deletions packages/parser-core/tests/integration/test_output_snapshot.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
"""Integration snapshot test for end-to-end output validation.

Runs the full processing pipeline against the real input/ directory and
compares key output metrics against a committed snapshot baseline.
compares key output metrics against a local snapshot baseline.

Usage:
# Run the integration test (skipped by default):
pytest -m integration
The snapshot is personal to each developer's machine and input PDFs —
it is gitignored and never committed. Run with --snapshot-update once
to create your baseline, then re-run as you make changes to catch
regressions.

# Update the snapshot baseline (first run or after intentional change):
Usage:
# Create or refresh your local snapshot baseline:
pytest -m integration --snapshot-update

The snapshot file is committed to source control so changes are visible in
code review. Input/output folders are gitignored and never committed.
# Validate current output against your baseline:
pytest -m integration

Input/output folders and the snapshot file are gitignored and never committed.
"""

from __future__ import annotations
Expand Down
Loading