diff --git a/.env.example b/.env.example index ce4d72c..842c80a 100644 --- a/.env.example +++ b/.env.example @@ -18,29 +18,24 @@ API_PORT=8000 LOG_LEVEL=info APP_ENV=dev JWT_SECRET_KEY=change-me-generate-with-openssl-rand-hex-32 -AUTH_COOKIE_NAME=bracc_session -AUTH_COOKIE_SECURE=false -AUTH_COOKIE_SAMESITE=lax -TRUST_PROXY_HEADERS=false INVITE_CODE= CORS_ORIGINS=http://localhost:3000 PRODUCT_TIER=community PATTERNS_ENABLED=false -PUBLIC_MODE=true +PUBLIC_MODE=false PUBLIC_ALLOW_PERSON=false PUBLIC_ALLOW_ENTITY_LOOKUP=false PUBLIC_ALLOW_INVESTIGATIONS=false PATTERN_SPLIT_THRESHOLD_VALUE=80000 PATTERN_SPLIT_MIN_COUNT=3 -PATTERN_SHARE_THRESHOLD=0.60 +PATTERN_SHARE_THRESHOLD=0.6 PATTERN_SRP_MIN_ORGS=5 PATTERN_INEXIG_MIN_RECURRENCE=3 PATTERN_MAX_EVIDENCE_REFS=50 -SHARE_TOKEN_TTL_HOURS=168 # Frontend (dev only — production uses Caddy reverse proxy with relative paths) VITE_API_URL=http://localhost:8000 -VITE_PUBLIC_MODE=true +VITE_PUBLIC_MODE=false VITE_PATTERNS_ENABLED=false # Optional: Google Cloud (for Base dos Dados / TSE BigQuery) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index d6257a9..b026312 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,5 +1,5 @@ blank_issues_enabled: false contact_links: - name: Security vulnerability report - url: https://github.com/World-Open-Graph/br-acc/security/advisories/new + url: https://github.com/brunoclz/world-transparency-graph/security/advisories/new about: Use GitHub Security Advisories for private vulnerability disclosure. diff --git a/.github/claude-automerge-policy.json b/.github/claude-automerge-policy.json index 94212a1..e917665 100644 --- a/.github/claude-automerge-policy.json +++ b/.github/claude-automerge-policy.json @@ -10,8 +10,8 @@ "README.md", "CONTRIBUTING.md", "frontend/src/**", - "api/src/icarus/queries/**", - "api/src/icarus/models/**", + "api/src/bracc/queries/**", + "api/src/bracc/models/**", "api/tests/**", "etl/tests/**", "frontend/src/**/*.test.*" diff --git a/.github/workflows/publish-release.yml b/.github/workflows/publish-release.yml index 3cbab87..1ef4336 100644 --- a/.github/workflows/publish-release.yml +++ b/.github/workflows/publish-release.yml @@ -23,26 +23,6 @@ on: description: "Release title (EN)" required: true type: string - highlights_pt: - description: "PT highlights (separate bullets with |)" - required: true - type: string - highlights_en: - description: "EN highlights (separate bullets with |)" - required: true - type: string - patterns_included: - description: "Comma-separated pattern IDs included in this release (use 'none' if not applicable)" - required: true - type: string - technical_changes_pt: - description: "PT technical changes (separate bullets with |)" - required: true - type: string - technical_changes_en: - description: "EN technical changes (separate bullets with |)" - required: true - type: string permissions: contents: write @@ -124,116 +104,63 @@ jobs: COMPARE_URL: ${{ steps.validate.outputs.compare_url }} TITLE_PT: ${{ inputs.title_pt }} TITLE_EN: ${{ inputs.title_en }} - HIGHLIGHTS_PT: ${{ inputs.highlights_pt }} - HIGHLIGHTS_EN: ${{ inputs.highlights_en }} - PATTERNS_INCLUDED: ${{ inputs.patterns_included }} - TECHNICAL_CHANGES_PT: ${{ inputs.technical_changes_pt }} - TECHNICAL_CHANGES_EN: ${{ inputs.technical_changes_en }} run: | set -euo pipefail DATE_UTC="$(date -u +"%Y-%m-%d")" export DATE_UTC - python - <<'PY' - import json - import os - from textwrap import dedent - - def split_pipe(raw: str) -> 
list[str]:
-        normalized = raw.replace("\r\n", "\n").replace("\n", "|")
-        return [item.strip(" -\t") for item in normalized.split("|") if item.strip()]
+          cat > release_notes.md <<NOTES
+          ## PT-BR
-
-    def split_csv(raw: str) -> list[str]:
-        value = raw.strip()
-        if value.lower() in {"none", "n/a", "na", "-"}:
-            return []
-        return [item.strip() for item in value.split(",") if item.strip()]
+
+          ${TITLE_PT}
-
-    def bullets(items: list[str], fallback: str) -> str:
-        if not items:
-            return f"- {fallback}"
-        return "\n".join(f"- {item}" for item in items)
+
+          ### Escopo
+          - Release publicada por marco.
+          - Mudanças detalhadas por categorias no histórico desta versão.
-
-    highlights_pt = split_pipe(os.environ["HIGHLIGHTS_PT"])
-    highlights_en = split_pipe(os.environ["HIGHLIGHTS_EN"])
-    technical_changes_pt = split_pipe(os.environ["TECHNICAL_CHANGES_PT"])
-    technical_changes_en = split_pipe(os.environ["TECHNICAL_CHANGES_EN"])
-    patterns = split_csv(os.environ["PATTERNS_INCLUDED"])
+
+          ### Integridade pública
+          Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
-
-    release_notes = dedent(
-        f"""
-        ## PT-BR
+
+          ## EN
-
-        {os.environ["TITLE_PT"]}
+
+          ${TITLE_EN}
-
-        ### Escopo
-        - Release publicada por marco.
-        - Mudanças listadas de forma específica para facilitar auditoria pública.
+
+          ### Scope
+          - Milestone-based release publication.
+          - Detailed changes grouped by category in this version history.
-
-        ### Destaques
-        {bullets(highlights_pt, "Sem destaques declarados.")}
+
+          ### Public integrity
+          Signals and patterns reflect co-occurrence in public records and are not legal proof.
-
-        ### Padrões incluídos
-        {bullets(patterns, "Sem novos padrões nesta release.")}
+
+          ## Compatibility
-
-        ### Mudanças técnicas
-        {bullets(technical_changes_pt, "Sem mudanças técnicas declaradas.")}
+
+          - Breaking changes: declare explicitly when applicable.
+          - Migration required: declare explicitly when applicable.
-
-        ### Integridade pública
-        Os sinais e padrões refletem coocorrências em bases públicas e não constituem prova legal.
+
+          ## Compare
-
-        ## EN
+
+          ${COMPARE_URL}
-
-        {os.environ["TITLE_EN"]}
+
+          ## Metadata
-
-        ### Scope
-        - Milestone-based release publication.
-        - Changes are listed explicitly for public traceability.
+
+          - Version: ${VERSION}
+          - Target SHA: ${TARGET_SHA}
+          - Previous tag: ${PREVIOUS_TAG}
+          - Date (UTC): ${DATE_UTC}
+          NOTES
-
-        ### Highlights
-        {bullets(highlights_en, "No highlights declared.")}
-
-        ### Included patterns
-        {bullets(patterns, "No new patterns in this release.")}
-
-        ### Technical changes
-        {bullets(technical_changes_en, "No technical changes declared.")}
-
-        ### Public integrity
-        Signals and patterns reflect co-occurrence in public records and are not legal proof.
-
-        ## Compatibility
-
-        - Breaking changes: none declared.
-        - Migration required: no.
- - ## Compare - - {os.environ.get("COMPARE_URL", "")} - - ## Metadata - - - Version: {os.environ["VERSION"]} - - Target SHA: {os.environ["TARGET_SHA"]} - - Previous tag: {os.environ["PREVIOUS_TAG"]} - - Date (UTC): {os.environ.get("DATE_UTC", "")} - """ - ).strip() + "\n" - - with open("release_notes.md", "w", encoding="utf-8") as fh: - fh.write(release_notes) + python - <<'PY' + import json + import os payload = { "version": os.environ["VERSION"], "date": os.environ.get("DATE_UTC", ""), - "highlights_pt": highlights_pt, - "highlights_en": highlights_en, + "highlights_pt": [os.environ["TITLE_PT"]], + "highlights_en": [os.environ["TITLE_EN"]], "api_changes": [], "data_changes": [], "privacy_compliance_changes": [], - "patterns_included": patterns, - "technical_changes_pt": technical_changes_pt, - "technical_changes_en": technical_changes_en, "breaking_changes": False, "migration_required": False, "compare_url": os.environ.get("COMPARE_URL", ""), diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 3e02a4c..af879d5 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -6,18 +6,10 @@ on: pull_request: branches: [main] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: read - jobs: gitleaks: name: Gitleaks runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 with: @@ -37,7 +29,6 @@ jobs: bandit: name: Bandit (Python) runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -57,7 +48,6 @@ jobs: pip-audit: name: Pip Audit (Python deps) runs-on: ubuntu-latest - timeout-minutes: 20 steps: - uses: actions/checkout@v4 @@ -69,14 +59,6 @@ jobs: with: python-version: "3.12" - - name: Cache uv - uses: actions/cache@v4 - with: - path: ~/.cache/uv - key: ${{ runner.os }}-uv-security-${{ hashFiles('api/uv.lock', 'etl/uv.lock') }} - restore-keys: | - ${{ runner.os }}-uv-security- - - name: Export lock-compatible requirement sets run: | cd api @@ -93,7 +75,6 @@ jobs: public-privacy-gate: name: Public Privacy Gate runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -107,7 +88,6 @@ jobs: compliance-pack-gate: name: Compliance Pack Gate runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -120,9 +100,8 @@ jobs: public-boundary-gate: name: Public Boundary Gate - if: vars.PUBLIC_BOUNDARY_GATE_ENABLED == 'true' + if: github.repository == 'brunoclz/world-transparency-graph' runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -136,7 +115,6 @@ jobs: internal-instruction-boundary: name: Internal Instruction Boundary runs-on: ubuntu-latest - timeout-minutes: 15 steps: - uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index d1565ed..03039bc 100644 --- a/.gitignore +++ b/.gitignore @@ -75,7 +75,6 @@ scripts/audit-prompts/ # Local report artifacts in repository root /*.pdf /*.html -gitleaks-report*.json # Playwright MCP cache .playwright-mcp/ @@ -91,7 +90,7 @@ data/tse/ # Local MCP runtime config (keep example only) .mcp.json -# Internal assistant instructions (must never be published) +# Internal assistant instruction files (must never be published) CLAUDE.md AGENTS.md AGENTS*.md diff --git a/Makefile b/Makefile index 994a40d..23ff510 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,125 @@ -.PHONY: dev stop seed bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive 
bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics check neutrality
+.PHONY: dev stop api etl frontend lint type-check test test-api test-etl test-frontend test-integration-api test-integration-etl test-integration check neutrality seed clean download-cnpj download-cnpj-all download-tse download-transparencia download-sanctions download-all etl-cnpj etl-cnpj-dev etl-cnpj-stream etl-tse etl-tse-dev etl-transparencia etl-transparencia-dev etl-sanctions etl-all link-persons bootstrap-demo bootstrap-full bootstrap-all bootstrap-all-noninteractive bootstrap-all-report check-public-claims check-source-urls check-pipeline-contracts check-pipeline-inputs generate-pipeline-status generate-source-summary generate-reference-metrics
+
+# ── Development ─────────────────────────────────────────
 dev:
-	docker compose -f infra/docker-compose.yml up -d
+	docker compose up -d

 stop:
-	docker compose -f infra/docker-compose.yml down
+	docker compose down
+
+# ── API ─────────────────────────────────────────────────
+api:
+	cd api && uv run uvicorn bracc.main:app --reload --host 0.0.0.0 --port 8000
+
+# ── ETL ─────────────────────────────────────────────────
+etl:
+	cd etl && uv run bracc-etl --help

 seed:
 	bash infra/scripts/seed-dev.sh

+# ── CNPJ Data ──────────────────────────────────────────
+download-cnpj:
+	cd etl && uv run python scripts/download_cnpj.py --reference-only
+	cd etl && uv run python scripts/download_cnpj.py --files 1
+
+download-cnpj-all:
+	cd etl && uv run python scripts/download_cnpj.py --files 10
+
+etl-cnpj:
+	cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
+
+etl-cnpj-dev:
+	cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
+
+etl-cnpj-stream:
+	cd etl && uv run bracc-etl run --source cnpj --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --streaming
+
+# ── TSE Data ──────────────────────────────────────────
+download-tse:
+	cd etl && uv run python scripts/download_tse.py --years 2024
+
+etl-tse:
+	cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
+
+etl-tse-dev:
+	cd etl && uv run bracc-etl run --source tse --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
+
+# ── Transparencia Data ────────────────────────────────
+download-transparencia:
+	cd etl && uv run python scripts/download_transparencia.py --year 2025
+
+etl-transparencia:
+	cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
+
+etl-transparencia-dev:
+	cd etl && uv run bracc-etl run --source transparencia --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data --limit 10000
+
+# ── Sanctions Data ────────────────────────────────────
+download-sanctions:
+	cd etl && uv run python scripts/download_sanctions.py
+
+etl-sanctions:
+	cd etl && uv run bracc-etl run --source sanctions --neo4j-password "$${NEO4J_PASSWORD}" --data-dir ../data
+
+# ── All Data ──────────────────────────────────────────
+download-all: download-cnpj download-tse download-transparencia download-sanctions
+
+etl-all: etl-cnpj etl-tse etl-transparencia etl-sanctions
+
+# ── Entity Resolution ────────────────────────────────────
+link-persons:
+	docker compose exec neo4j cypher-shell -u neo4j -p "$${NEO4J_PASSWORD}" -f /scripts/link_persons.cypher
+
+# ── Frontend ────────────────────────────────────────────
+frontend:
+	cd frontend && npm run dev
+
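As a usage sketch, the per-source download and load targets above compose into the same flow from plain Python; the CLI flags simply mirror the recipes, and NEO4J_PASSWORD is assumed to be exported already (the $$ in the recipes defers expansion from make to the shell):

    # Hypothetical driver script mirroring download-cnpj followed by etl-cnpj-dev.
    import os
    import subprocess

    def run(args: list[str], cwd: str = "etl") -> None:
        # check=True mirrors make's stop-on-first-error behavior.
        subprocess.run(args, cwd=cwd, check=True)

    run(["uv", "run", "python", "scripts/download_cnpj.py", "--reference-only"])
    run(["uv", "run", "python", "scripts/download_cnpj.py", "--files", "1"])
    run([
        "uv", "run", "bracc-etl", "run",
        "--source", "cnpj",
        "--neo4j-password", os.environ["NEO4J_PASSWORD"],
        "--data-dir", "../data",
        "--limit", "10000",  # dev-sized load, as in etl-cnpj-dev
    ])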
+# ── Quality ─────────────────────────────────────────────
+lint:
+	cd api && uv run ruff check src/ tests/
+	cd etl && uv run ruff check src/ tests/
+	cd frontend && npm run lint
+
+type-check:
+	cd api && uv run mypy src/
+	cd etl && uv run mypy src/
+	cd frontend && npm run type-check
+
+test-api:
+	cd api && uv run pytest
+
+test-etl:
+	cd etl && uv run pytest
+
+test-frontend:
+	cd frontend && npm test
+
+test: test-api test-etl test-frontend
+
+# ── Integration tests ─────────────────────────────────
+test-integration-api:
+	cd api && uv run pytest -m integration
+
+test-integration-etl:
+	cd etl && uv run pytest -m integration
+
+test-integration: test-integration-api test-integration-etl
+
+# ── Full check (run before commit) ─────────────────────
+check: lint type-check test
+	@echo "All checks passed."
+
+# ── Neutrality audit ───────────────────────────────────
+neutrality:
+	@! grep -rn \
+		"suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty\|CRITICAL\|HIGH.*severity\|MEDIUM.*severity\|LOW.*severity" \
+		api/src/ etl/src/ frontend/src/ \
+		--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
+		|| (echo "NEUTRALITY VIOLATION FOUND" && exit 1)
+	@echo "Neutrality check passed."
+
+# ── Bootstrap ───────────────────────────────────────────
 bootstrap-demo:
 	bash scripts/bootstrap_public_demo.sh --profile demo

@@ -24,6 +135,7 @@ bootstrap-all-noninteractive:
 bootstrap-all-report:
 	python3 scripts/run_bootstrap_all.py --repo-root . --report-latest

+# ── Quality checks ──────────────────────────────────────
 check-public-claims:
 	python3 scripts/check_public_claims.py --repo-root .

@@ -36,22 +148,20 @@ check-pipeline-contracts:
 check-pipeline-inputs:
 	python3 scripts/check_pipeline_inputs.py

+# ── Generators ──────────────────────────────────────────
 generate-pipeline-status:
-	python3 scripts/generate_pipeline_status.py --registry-path docs/source_registry_br_v1.csv --output docs/pipeline_status.md
+	python3 scripts/generate_pipeline_status.py

 generate-source-summary:
-	python3 scripts/generate_data_sources_summary.py --registry-path docs/source_registry_br_v1.csv --docs-path docs/data-sources.md
+	python3 scripts/generate_data_sources_summary.py

 generate-reference-metrics:
-	python3 scripts/generate_reference_metrics.py --json-output audit-results/public-trust/latest/neo4j-reference-metrics.json --doc-output docs/reference_metrics.md
-
-check:
-	cd api && bash ../scripts/ci/python_quality.sh
-	cd etl && bash ../scripts/ci/python_quality.sh
-	cd frontend && bash ../scripts/ci/frontend_quality.sh
-
-neutrality:
-	@! grep -rn "suspicious\|corrupt\|criminal\|fraudulent\|illegal\|guilty" \
-		api/src/ etl/src/ frontend/src/ \
-		--include="*.py" --include="*.ts" --include="*.tsx" --include="*.json" \
-		|| (echo "NEUTRALITY VIOLATION: banned words found in source" && exit 1)
+	python3 scripts/generate_reference_metrics.py
+
+# ── Cleanup ─────────────────────────────────────────────
+clean:
+	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
+	find . -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null || true
+	find . -type d -name .mypy_cache -exec rm -rf {} + 2>/dev/null || true
+	find . -type d -name .ruff_cache -exec rm -rf {} + 2>/dev/null || true
+	rm -rf frontend/dist
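The neutrality target above relies on grep's BRE alternation (the escaped pipes) and is case-sensitive, so "suspicious" only matches lowercase while "CRITICAL" only matches uppercase. A minimal Python sketch of the same scan, for running it outside make; the roots and suffixes mirror the target, and the behavioral details are assumptions rather than a contract:

    import re
    import sys
    from pathlib import Path

    # Same alternation as the grep pattern, kept case-sensitive on purpose.
    BANNED = re.compile(
        r"suspicious|corrupt|criminal|fraudulent|illegal|guilty"
        r"|CRITICAL|HIGH.*severity|MEDIUM.*severity|LOW.*severity"
    )
    SUFFIXES = {".py", ".ts", ".tsx", ".json"}

    violations = []
    for root in ("api/src", "etl/src", "frontend/src"):
        for path in Path(root).rglob("*"):
            if path.is_file() and path.suffix in SUFFIXES:
                for lineno, line in enumerate(path.read_text(errors="ignore").splitlines(), 1):
                    if BANNED.search(line):
                        violations.append(f"{path}:{lineno}: {line.strip()}")

    if violations:
        print("NEUTRALITY VIOLATION FOUND")
        print("\n".join(violations))
        sys.exit(1)
    print("Neutrality check passed.")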
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 9933c58..a606287 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "bracc-api"
 version = "0.1.0"
-description = "BRACC API — Brazilian public data anti-corruption graph tool"
+description = "BR-ACC API — Brazilian public data anti-corruption graph tool"
 requires-python = ">=3.12"
 license = "AGPL-3.0-or-later"
 dependencies = [
diff --git a/api/src/bracc/config.py b/api/src/bracc/config.py
index 02a7e80..76bc98e 100644
--- a/api/src/bracc/config.py
+++ b/api/src/bracc/config.py
@@ -1,5 +1,6 @@
 from typing import Literal

+from pydantic import Field
 from pydantic_settings import BaseSettings

@@ -17,14 +18,15 @@ class Settings(BaseSettings):
     jwt_secret_key: str = "change-me-in-production"
     jwt_algorithm: str = "HS256"
     jwt_expire_minutes: int = 1440
-    auth_cookie_name: str = "bracc_session"
-    auth_cookie_secure: bool = False
-    auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
-    trust_proxy_headers: bool = False
     rate_limit_anon: str = "60/minute"
     rate_limit_auth: str = "300/minute"
     invite_code: str = ""
     cors_origins: str = "http://localhost:3000"
+    auth_cookie_name: str = "bracc_session"
+    auth_cookie_secure: bool = False
+    auth_cookie_samesite: Literal["lax", "strict", "none"] = "lax"
+    trust_proxy_headers: bool = False
+    share_token_ttl_hours: int = 168  # 7 days
     product_tier: str = "community"
     patterns_enabled: bool = False
     public_mode: bool = False
@@ -37,7 +39,16 @@
     pattern_srp_min_orgs: int = 5
     pattern_inexig_min_recurrence: int = 3
     pattern_max_evidence_refs: int = 50
-    share_token_ttl_hours: int = 168
+
+    # Pattern hardening defaults (decision-complete contract)
+    pattern_temporal_window_years: int = Field(default=4, ge=1, le=20)
+    pattern_min_contract_value: float = Field(default=100000.0, ge=0)
+    pattern_min_contract_count: int = Field(default=2, ge=1)
+    pattern_min_debt_value: float = Field(default=50000.0, ge=0)
+    pattern_same_as_min_confidence: float = Field(default=0.85, ge=0, le=1)
+    pattern_pep_min_confidence: float = Field(default=0.85, ge=0, le=1)
+    pattern_min_recurrence: int = Field(default=2, ge=1)
+    pattern_min_discrepancy_ratio: float = Field(default=0.30, ge=0, le=1)

     model_config = {"env_prefix": "", "env_file": ".env"}
diff --git a/api/src/bracc/dependencies.py b/api/src/bracc/dependencies.py
index 9f4b1a1..6f6db6e 100644
--- a/api/src/bracc/dependencies.py
+++ b/api/src/bracc/dependencies.py
@@ -35,7 +35,12 @@ async def close_driver() -> None:

 async def get_driver(request: Request) -> AsyncDriver:
-    driver: AsyncDriver = request.app.state.neo4j_driver
+    driver: AsyncDriver | None = getattr(request.app.state, "neo4j_driver", None)
+    if driver is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Database connection not available",
+        )
     return driver
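The Field bounds added to Settings above mean a malformed override fails at application startup rather than deep inside a query. A minimal sketch of that behavior, using a standalone model so it runs without the app; the field names mirror config.py, and the env-var spelling relies on pydantic-settings' default case-insensitive mapping:

    from pydantic import Field, ValidationError
    from pydantic_settings import BaseSettings

    class PatternSettings(BaseSettings):
        pattern_same_as_min_confidence: float = Field(default=0.85, ge=0, le=1)
        pattern_temporal_window_years: int = Field(default=4, ge=1, le=20)

    try:
        # Equivalent to exporting PATTERN_SAME_AS_MIN_CONFIDENCE=1.7 before startup.
        PatternSettings(pattern_same_as_min_confidence=1.7)
    except ValidationError as exc:
        # Reports "less than or equal to 1" instead of silently accepting the value.
        print(exc.errors()[0]["msg"])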
title="BR-ACC API", description="Brazilian public data graph analysis tool", version="0.1.0", lifespan=lifespan, @@ -85,5 +85,5 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: @app.get("/health") -async def health(request: Request) -> dict[str, str]: - return {"status": "ok", "version": request.app.version} +async def health() -> dict[str, str]: + return {"status": "ok"} diff --git a/api/src/bracc/middleware/cpf_masking.py b/api/src/bracc/middleware/cpf_masking.py index 8bb4c1d..ec4c79b 100644 --- a/api/src/bracc/middleware/cpf_masking.py +++ b/api/src/bracc/middleware/cpf_masking.py @@ -53,7 +53,7 @@ def _is_pep_record(record: dict[str, Any]) -> bool: for field in ("role", "cargo"): value = record.get(field) - if isinstance(value, str) and value.strip().lower() in PEP_ROLES: + if isinstance(value, str) and any(kw in value.strip().lower() for kw in PEP_ROLES): return True return False diff --git a/api/src/bracc/middleware/rate_limit.py b/api/src/bracc/middleware/rate_limit.py index b8caa8e..087b322 100644 --- a/api/src/bracc/middleware/rate_limit.py +++ b/api/src/bracc/middleware/rate_limit.py @@ -6,37 +6,20 @@ from bracc.services.auth_service import decode_access_token -def _extract_token(request: Request) -> str | None: +def _get_rate_limit_key(request: Request) -> str: + """Extract user_id from JWT (Bearer or cookie) for rate limiting, fallback to IP.""" auth = request.headers.get("authorization", "") if auth.startswith("Bearer "): - return auth[7:].strip() + token = auth[7:] + user_id = decode_access_token(token) + if user_id: + return f"user:{user_id}" cookie_token = request.cookies.get(settings.auth_cookie_name) if isinstance(cookie_token, str) and cookie_token.strip(): - return cookie_token.strip() - return None - - -def _resolve_client_ip(request: Request) -> str: - if settings.trust_proxy_headers: - forwarded = request.headers.get("x-forwarded-for", "") - if forwarded: - first_hop = forwarded.split(",", 1)[0].strip() - if first_hop: - return first_hop - real_ip = request.headers.get("x-real-ip", "").strip() - if real_ip: - return real_ip - return get_remote_address(request) - - -def _get_rate_limit_key(request: Request) -> str: - """Extract user_id from JWT for rate limiting, fallback to IP.""" - token = _extract_token(request) - if token: - user_id = decode_access_token(token) + user_id = decode_access_token(cookie_token.strip()) if user_id: return f"user:{user_id}" - return _resolve_client_ip(request) + return get_remote_address(request) limiter = Limiter( diff --git a/api/src/bracc/queries/entity_connections.cypher b/api/src/bracc/queries/entity_connections.cypher index 772e651..15f4093 100644 --- a/api/src/bracc/queries/entity_connections.cypher +++ b/api/src/bracc/queries/entity_connections.cypher @@ -1,27 +1,15 @@ -MATCH (center) WHERE elementId(center) = $entity_id +MATCH (center) +WHERE elementId(center) = $entity_id AND (center:Person OR center:Partner OR center:Company OR center:Contract OR center:Sanction OR center:Election OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education OR center:Convenio OR center:LaborStats OR center:PublicOffice) -WITH center, - CASE - WHEN coalesce($include_probable, false) THEN - "SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS" - ELSE - 
"SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS" - END AS relationship_filter -CALL apoc.path.subgraphAll(center, { - relationshipFilter: relationship_filter, - labelFilter: "-User|-Investigation|-Annotation|-Tag", - maxLevel: $depth, - limit: 200 -}) -YIELD nodes, relationships -WITH center, nodes, relationships -UNWIND relationships AS r -WITH center, - startNode(r) AS src, - endNode(r) AS tgt, - r +OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLE_SAME_AS*1..4]-(connected) +WHERE length(p) <= $depth + AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag)) +WITH center, p +UNWIND CASE WHEN p IS NULL THEN [] ELSE relationships(p) END AS r +WITH DISTINCT center, r, startNode(r) AS src, endNode(r) AS tgt +WHERE coalesce($include_probable, false) OR type(r) <> "POSSIBLE_SAME_AS" RETURN center AS e, r, CASE WHEN elementId(src) = elementId(center) THEN tgt ELSE src END AS connected, diff --git a/api/src/bracc/queries/graph_expand.cypher b/api/src/bracc/queries/graph_expand.cypher index 733a293..b807d2a 100644 --- a/api/src/bracc/queries/graph_expand.cypher +++ b/api/src/bracc/queries/graph_expand.cypher @@ -1,14 +1,21 @@ -MATCH (center) WHERE elementId(center) = $entity_id +MATCH (center) +WHERE elementId(center) = $entity_id AND (center:Person OR center:Company OR center:Contract OR center:Sanction OR center:Election OR center:Amendment OR center:Finance OR center:Embargo OR center:Health OR center:Education OR center:Convenio OR center:LaborStats OR center:PublicOffice OR center:OffshoreEntity OR center:OffshoreOfficer OR center:GlobalPEP OR center:CVMProceeding OR center:Expense) -CALL apoc.path.subgraphAll(center, { - relationshipFilter: "SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU", - labelFilter: $label_filter, - maxLevel: $depth, - limit: 200 -}) -YIELD nodes, relationships -RETURN nodes, relationships, elementId(center) AS center_id \ No newline at end of file +OPTIONAL MATCH p=(center)-[:SOCIO_DE|DOOU|CANDIDATO_EM|VENCEU|AUTOR_EMENDA|SANCIONADA|OPERA_UNIDADE|DEVE|RECEBEU_EMPRESTIMO|EMBARGADA|MANTEDORA_DE|BENEFICIOU|GEROU_CONVENIO|SAME_AS|POSSIBLY_SAME_AS|OFFICER_OF|INTERMEDIARY_OF|GLOBAL_PEP_MATCH|CVM_SANCIONADA|GASTOU|FORNECEU*1..4]-(n) +WHERE length(p) <= $depth + AND all(x IN nodes(p) WHERE NOT (x:User OR x:Investigation OR x:Annotation OR x:Tag)) +WITH center, collect(p) AS paths +WITH center, + reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes, + reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels +UNWIND raw_nodes AS n +WITH center, collect(DISTINCT n) AS nodes, raw_rels +UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r +WITH center, nodes, collect(DISTINCT r) AS rels +RETURN nodes, + [x IN rels WHERE x IS NOT NULL] AS relationships, + elementId(center) AS center_id diff --git a/api/src/bracc/queries/investigation_by_token.cypher b/api/src/bracc/queries/investigation_by_token.cypher index ba9ff3d..6065b57 100644 --- a/api/src/bracc/queries/investigation_by_token.cypher +++ 
b/api/src/bracc/queries/investigation_by_token.cypher @@ -1,6 +1,4 @@ -MATCH (i:Investigation) -WHERE i.share_token = $token - AND (i.share_expires_at IS NULL OR i.share_expires_at > datetime()) +MATCH (i:Investigation {share_token: $token}) OPTIONAL MATCH (i)-[:INCLUDES]->(e) WITH i, collect(coalesce(e.cpf, e.cnpj, e.contract_id, e.sanction_id, e.amendment_id, e.cnes_code, e.finance_id, e.embargo_id, e.school_id, e.convenio_id, e.stats_id, elementId(e))) AS eids RETURN i.id AS id, @@ -9,5 +7,4 @@ RETURN i.id AS id, i.created_at AS created_at, i.updated_at AS updated_at, i.share_token AS share_token, - i.share_expires_at AS share_expires_at, [x IN eids WHERE x IS NOT NULL] AS entity_ids diff --git a/api/src/bracc/queries/investigation_create.cypher b/api/src/bracc/queries/investigation_create.cypher index b9bf3f3..a583340 100644 --- a/api/src/bracc/queries/investigation_create.cypher +++ b/api/src/bracc/queries/investigation_create.cypher @@ -4,8 +4,7 @@ CREATE (i:Investigation { description: $description, created_at: datetime(), updated_at: datetime(), - share_token: null, - share_expires_at: null + share_token: null }) WITH i MATCH (u:User {id: $user_id}) @@ -16,5 +15,4 @@ RETURN i.id AS id, i.created_at AS created_at, i.updated_at AS updated_at, i.share_token AS share_token, - i.share_expires_at AS share_expires_at, [] AS entity_ids diff --git a/api/src/bracc/queries/investigation_get.cypher b/api/src/bracc/queries/investigation_get.cypher index 25a54bd..1599b20 100644 --- a/api/src/bracc/queries/investigation_get.cypher +++ b/api/src/bracc/queries/investigation_get.cypher @@ -7,5 +7,4 @@ RETURN i.id AS id, i.created_at AS created_at, i.updated_at AS updated_at, i.share_token AS share_token, - i.share_expires_at AS share_expires_at, [x IN eids WHERE x IS NOT NULL] AS entity_ids diff --git a/api/src/bracc/queries/investigation_list.cypher b/api/src/bracc/queries/investigation_list.cypher index 0fe6e48..9310ad6 100644 --- a/api/src/bracc/queries/investigation_list.cypher +++ b/api/src/bracc/queries/investigation_list.cypher @@ -13,5 +13,4 @@ RETURN total, i.created_at AS created_at, i.updated_at AS updated_at, i.share_token AS share_token, - i.share_expires_at AS share_expires_at, [x IN eids WHERE x IS NOT NULL] AS entity_ids diff --git a/api/src/bracc/queries/investigation_share.cypher b/api/src/bracc/queries/investigation_share.cypher index 1115e83..ffb0594 100644 --- a/api/src/bracc/queries/investigation_share.cypher +++ b/api/src/bracc/queries/investigation_share.cypher @@ -1,7 +1,5 @@ MATCH (u:User {id: $user_id})-[:OWNS]->(i:Investigation {id: $id}) SET i.share_token = $share_token, - i.share_expires_at = $share_expires_at, i.updated_at = datetime() RETURN i.id AS id, - i.share_token AS share_token, - i.share_expires_at AS share_expires_at + i.share_token AS share_token diff --git a/api/src/bracc/queries/investigation_update.cypher b/api/src/bracc/queries/investigation_update.cypher index c6cfa05..540fc17 100644 --- a/api/src/bracc/queries/investigation_update.cypher +++ b/api/src/bracc/queries/investigation_update.cypher @@ -11,5 +11,4 @@ RETURN i.id AS id, i.created_at AS created_at, i.updated_at AS updated_at, i.share_token AS share_token, - i.share_expires_at AS share_expires_at, [x IN eids WHERE x IS NOT NULL] AS entity_ids diff --git a/api/src/bracc/queries/node_degree.cypher b/api/src/bracc/queries/node_degree.cypher index c751492..f16f291 100644 --- a/api/src/bracc/queries/node_degree.cypher +++ b/api/src/bracc/queries/node_degree.cypher @@ -1,5 +1,6 @@ -MATCH (n) WHERE 
elementId(n) = $entity_id +MATCH (n) +WHERE elementId(n) = $entity_id AND (n:Person OR n:Company OR n:Contract OR n:Sanction OR n:Election OR n:Amendment OR n:Finance OR n:Embargo OR n:Health OR n:Education OR n:Convenio OR n:LaborStats OR n:PublicOffice) -RETURN apoc.node.degree(n) AS degree \ No newline at end of file +RETURN COUNT { (n)--() } AS degree diff --git a/api/src/bracc/queries/public_graph_company.cypher b/api/src/bracc/queries/public_graph_company.cypher index 7ae6e16..3358f65 100644 --- a/api/src/bracc/queries/public_graph_company.cypher +++ b/api/src/bracc/queries/public_graph_company.cypher @@ -2,11 +2,31 @@ MATCH (center:Company) WHERE elementId(center) = $company_id OR center.cnpj = $company_identifier OR center.cnpj = $company_identifier_formatted -CALL apoc.path.subgraphAll(center, { - relationshipFilter: "SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU", - labelFilter: "+Company|+Contract|+Sanction|+Finance|+Amendment|+Convenio|+Bid|+MunicipalContract|+MunicipalBid|-Person|-Partner|-User|-Investigation|-Annotation|-Tag", - maxLevel: $depth, - limit: 200 -}) -YIELD nodes, relationships -RETURN nodes, relationships, elementId(center) AS center_id +OPTIONAL MATCH p=(center)-[:SOCIO_DE|VENCEU|SANCIONADA|DEVE|RECEBEU_EMPRESTIMO|BENEFICIOU|GEROU_CONVENIO|MUNICIPAL_VENCEU|MUNICIPAL_LICITOU*1..4]-(n) +WHERE length(p) <= $depth + AND all( + x IN nodes(p) + WHERE NOT ( + "Person" IN labels(x) + OR "Partner" IN labels(x) + OR "User" IN labels(x) + OR "Investigation" IN labels(x) + OR "Annotation" IN labels(x) + OR "Tag" IN labels(x) + ) + ) + AND ( + n:Company OR n:Contract OR n:Sanction OR n:Finance OR n:Amendment OR n:Convenio + OR n:Bid OR n:MunicipalContract OR n:MunicipalBid OR n IS NULL + ) +WITH center, collect(p) AS paths +WITH center, + reduce(ns = [center], p IN paths | ns + CASE WHEN p IS NULL THEN [] ELSE nodes(p) END) AS raw_nodes, + reduce(rs = [], p IN paths | rs + CASE WHEN p IS NULL THEN [] ELSE relationships(p) END) AS raw_rels +UNWIND raw_nodes AS n +WITH center, collect(DISTINCT n) AS nodes, raw_rels +UNWIND CASE WHEN size(raw_rels) = 0 THEN [NULL] ELSE raw_rels END AS r +WITH center, nodes, collect(DISTINCT r) AS rels +RETURN nodes, + [x IN rels WHERE x IS NOT NULL] AS relationships, + elementId(center) AS center_id diff --git a/api/src/bracc/queries/schema_init.cypher b/api/src/bracc/queries/schema_init.cypher index f19611b..8ea17e9 100644 --- a/api/src/bracc/queries/schema_init.cypher +++ b/api/src/bracc/queries/schema_init.cypher @@ -1,4 +1,4 @@ -// BRACC Neo4j Schema — Constraints and Indexes +// BR-ACC Neo4j Schema — Constraints and Indexes // Applied on database initialization // ── Uniqueness Constraints ────────────────────────────── diff --git a/api/src/bracc/routers/baseline.py b/api/src/bracc/routers/baseline.py index 2951ca1..8928ba1 100644 --- a/api/src/bracc/routers/baseline.py +++ b/api/src/bracc/routers/baseline.py @@ -6,6 +6,7 @@ from bracc.dependencies import get_session from bracc.models.baseline import BaselineResponse from bracc.services.baseline_service import BASELINE_QUERIES, run_all_baselines, run_baseline +from bracc.services.public_guard import enforce_entity_lookup_enabled router = APIRouter(prefix="/api/v1/baseline", tags=["baseline"]) @@ -16,6 +17,7 @@ async def get_baseline_for_entity( session: Annotated[AsyncSession, Depends(get_session)], dimension: Annotated[str | None, Query()] = None, ) -> BaselineResponse: + enforce_entity_lookup_enabled() if dimension: if 
dimension not in BASELINE_QUERIES: available = list(BASELINE_QUERIES.keys()) diff --git a/api/src/bracc/routers/entity.py b/api/src/bracc/routers/entity.py index e425086..d13e6e8 100644 --- a/api/src/bracc/routers/entity.py +++ b/api/src/bracc/routers/entity.py @@ -182,7 +182,7 @@ async def get_entity_timeline( date=event_date, label=str(label), entity_type=entity_type, - properties=sanitize_props(props), + properties=sanitize_public_properties(sanitize_props(props)), sources=[SourceAttribution(database="neo4j_graph")], )) diff --git a/api/src/bracc/routers/investigation.py b/api/src/bracc/routers/investigation.py index d7a0589..ea07581 100644 --- a/api/src/bracc/routers/investigation.py +++ b/api/src/bracc/routers/investigation.py @@ -311,7 +311,7 @@ async def export_investigation_pdf( cpf_val = node.get("cpf") if cpf_val and isinstance(cpf_val, str): role = str(node.get("role", node.get("cargo", ""))).lower() - is_pep = role in PEP_ROLES + is_pep = any(kw in role for kw in PEP_ROLES) if not is_pep: if "." in document and "-" in document: document = mask_formatted_cpf(document) diff --git a/api/src/bracc/routers/meta.py b/api/src/bracc/routers/meta.py index a3e9422..24155eb 100644 --- a/api/src/bracc/routers/meta.py +++ b/api/src/bracc/routers/meta.py @@ -6,6 +6,7 @@ from bracc.dependencies import get_session from bracc.services.neo4j_service import execute_query_single +from bracc.services.public_guard import should_hide_person_entities from bracc.services.source_registry import load_source_registry, source_registry_summary router = APIRouter(prefix="/api/v1/meta", tags=["meta"]) @@ -40,7 +41,9 @@ async def database_stats( result = { "total_nodes": record["total_nodes"] if record else 0, "total_relationships": record["total_relationships"] if record else 0, - "person_count": record["person_count"] if record else 0, + "person_count": ( + 0 if should_hide_person_entities() else (record["person_count"] if record else 0) + ), "company_count": record["company_count"] if record else 0, "health_count": record["health_count"] if record else 0, "finance_count": record["finance_count"] if record else 0, diff --git a/api/src/bracc/routers/public.py b/api/src/bracc/routers/public.py index 81b8695..716bddf 100644 --- a/api/src/bracc/routers/public.py +++ b/api/src/bracc/routers/public.py @@ -57,12 +57,6 @@ async def public_meta( return { "product": "World Transparency Graph", "mode": "public_safe", - "dataset_scope": { - "local_default": "demo_local", - "ingestion_mode": "byo_ingestion", - "reference_metrics": "reference_production_snapshot", - }, - "metrics_as_of_utc": "2026-03-01T23:05:00Z", "total_nodes": record["total_nodes"] if record else 0, "total_relationships": record["total_relationships"] if record else 0, "company_count": record["company_count"] if record else 0, diff --git a/api/src/bracc/routers/search.py b/api/src/bracc/routers/search.py index 953f436..78a4771 100644 --- a/api/src/bracc/routers/search.py +++ b/api/src/bracc/routers/search.py @@ -61,9 +61,9 @@ async def search_entities( { "query": _escape_lucene(q), "entity_type": type_filter, - "hide_person_entities": hide_person_entities, "skip": skip, "limit": size, + "hide_person_entities": hide_person_entities, }, ) total_record = await execute_query_single( diff --git a/api/tests/integration/conftest.py b/api/tests/integration/conftest.py index 177c091..e221791 100644 --- a/api/tests/integration/conftest.py +++ b/api/tests/integration/conftest.py @@ -9,6 +9,17 @@ from bracc.main import app +def _iter_cypher_statements(path: Path) 
-> list[str]: + # Strip comment-only lines before splitting to avoid dropping statements + # that are preceded by section headers. + filtered_lines = [ + line for line in path.read_text().splitlines() + if line.strip() and not line.strip().startswith("//") + ] + text = "\n".join(filtered_lines) + return [stmt.strip() for stmt in text.split(";") if stmt.strip()] + + @pytest.fixture(scope="session") def neo4j_container() -> Neo4jContainer: # type: ignore[misc] """Start a Neo4j container for integration tests.""" @@ -25,32 +36,43 @@ def neo4j_uri(neo4j_container: Neo4jContainer) -> str: @pytest.fixture(scope="session") def neo4j_auth(neo4j_container: Neo4jContainer) -> tuple[str, str]: - return ("neo4j", neo4j_container.NEO4J_ADMIN_PASSWORD) + # testcontainers.neo4j API changed: older versions exposed NEO4J_ADMIN_PASSWORD, + # newer versions expose username/password attributes. + username = getattr(neo4j_container, "username", "neo4j") + password = getattr( + neo4j_container, + "password", + getattr(neo4j_container, "NEO4J_ADMIN_PASSWORD", None), + ) + if password is None: + msg = "Could not resolve Neo4j testcontainer password" + raise RuntimeError(msg) + return (username, password) -@pytest.fixture(scope="session") +@pytest.fixture async def neo4j_driver( neo4j_uri: str, neo4j_auth: tuple[str, str] ) -> AsyncIterator[AsyncDriver]: + # Function-scoped driver avoids loop affinity issues between async tests. driver = AsyncGraphDatabase.driver(neo4j_uri, auth=neo4j_auth) + async with driver.session() as session: + # Keep tests deterministic across function scope by resetting test data. + await session.run("MATCH (n) DETACH DELETE n") # Apply schema schema_path = Path(__file__).parent.parent.parent.parent / "infra" / "neo4j" / "init.cypher" if schema_path.exists(): async with driver.session() as session: - for statement in schema_path.read_text().split(";"): - stmt = statement.strip() - if stmt and not stmt.startswith("//"): - await session.run(stmt) + for stmt in _iter_cypher_statements(schema_path): + await session.run(stmt) # Seed dev data seed_path = ( Path(__file__).parent.parent.parent.parent / "infra" / "scripts" / "seed-dev.cypher" ) if seed_path.exists(): async with driver.session() as session: - for statement in seed_path.read_text().split(";"): - stmt = statement.strip() - if stmt and not stmt.startswith("//"): - await session.run(stmt) + for stmt in _iter_cypher_statements(seed_path): + await session.run(stmt) yield driver await driver.close() diff --git a/api/tests/unit/test_auth.py b/api/tests/unit/test_auth.py index db51e5b..c532b58 100644 --- a/api/tests/unit/test_auth.py +++ b/api/tests/unit/test_auth.py @@ -34,7 +34,11 @@ def _setup_mock_session(driver: MagicMock, records: list[MagicMock]) -> AsyncMoc @pytest.mark.anyio -async def test_register_success(client: AsyncClient) -> None: +async def test_register_success(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None: + from bracc.config import settings + + monkeypatch.setattr(settings, "invite_code", "") + record = _mock_record({ "id": "user-uuid", "email": "test@example.com", @@ -56,19 +60,15 @@ async def test_register_success(client: AsyncClient) -> None: @pytest.mark.anyio -async def test_register_bad_invite(client: AsyncClient) -> None: +async def test_register_bad_invite(client: AsyncClient, monkeypatch: pytest.MonkeyPatch) -> None: from bracc.config import settings - original = settings.invite_code - try: - settings.invite_code = "secret-code" - response = await client.post( - "/api/v1/auth/register", - 
json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"}, - ) - assert response.status_code == 403 - finally: - settings.invite_code = original + monkeypatch.setattr(settings, "invite_code", "secret-code") + response = await client.post( + "/api/v1/auth/register", + json={"email": "test@example.com", "password": "password123", "invite_code": "wrong"}, + ) + assert response.status_code == 403 @pytest.mark.anyio @@ -155,16 +155,25 @@ async def test_me_invalid_token(client: AsyncClient) -> None: @pytest.mark.anyio -async def test_register_duplicate_email(client: AsyncClient) -> None: +async def test_register_duplicate_email( + client: AsyncClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + from neo4j.exceptions import ConstraintError + + from bracc.config import settings from bracc.main import app + monkeypatch.setattr(settings, "invite_code", "") + driver = app.state.neo4j_driver mock_session = AsyncMock() - mock_session.run = AsyncMock(side_effect=Exception("Constraint violation")) + mock_session.run = AsyncMock(side_effect=ConstraintError("Node already exists")) driver.session.return_value.__aenter__ = AsyncMock(return_value=mock_session) - with pytest.raises(Exception, match="Constraint violation"): - await client.post( - "/api/v1/auth/register", - json={"email": "duplicate@example.com", "password": "password123"}, - ) + response = await client.post( + "/api/v1/auth/register", + json={"email": "duplicate@example.com", "password": "password123"}, + ) + assert response.status_code == 409 + assert response.json()["detail"] == "Email already registered" diff --git a/api/tests/unit/test_auth_service.py b/api/tests/unit/test_auth_service.py index 3071b4d..1cc6997 100644 --- a/api/tests/unit/test_auth_service.py +++ b/api/tests/unit/test_auth_service.py @@ -61,7 +61,9 @@ def test_decode_access_token_invalid() -> None: @pytest.mark.anyio -async def test_register_user_success() -> None: +async def test_register_user_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "invite_code", "") + mock_record = _mock_record({ "id": "user-uuid", "email": "test@example.com", @@ -80,15 +82,11 @@ async def test_register_user_success() -> None: @pytest.mark.anyio -async def test_register_user_bad_invite() -> None: - original = settings.invite_code - try: - settings.invite_code = "secret-code" - session = AsyncMock() - with pytest.raises(ValueError, match="Invalid invite code"): - await register_user(session, "test@example.com", "password123", "wrong-code") - finally: - settings.invite_code = original +async def test_register_user_bad_invite(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "invite_code", "secret-code") + session = AsyncMock() + with pytest.raises(ValueError, match="Invalid invite code"): + await register_user(session, "test@example.com", "password123", "wrong-code") @pytest.mark.anyio diff --git a/api/tests/unit/test_cpf_masking.py b/api/tests/unit/test_cpf_masking.py index 271df24..12721cb 100644 --- a/api/tests/unit/test_cpf_masking.py +++ b/api/tests/unit/test_cpf_masking.py @@ -68,6 +68,28 @@ def test_political_role(self, role: str) -> None: def test_cargo_field(self) -> None: assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado"}) + @pytest.mark.parametrize( + "role", + [ + "Deputado Federal", + "deputado federal", + "DEPUTADO FEDERAL", + "Senador da Republica", + "senadora da republica", + "Vereador Suplente", + "Ministro de Estado", + "Governadora do Estado de Sao Paulo", + 
"Presidente da Republica", + ], + ) + def test_compound_role_detected_as_pep(self, role: str) -> None: + """Compound PEP roles like 'deputado federal' must be detected via substring match.""" + assert _is_pep_record({"name": "X", "cpf": "11111111111", "role": role}) + + def test_compound_cargo_detected_as_pep(self) -> None: + """Compound PEP cargo like 'Deputado Federal' must be detected via substring match.""" + assert _is_pep_record({"name": "X", "cpf": "11111111111", "cargo": "Deputado Federal"}) + def test_non_pep_role(self) -> None: assert not _is_pep_record({"name": "X", "cpf": "11111111111", "role": "assessor"}) @@ -99,6 +121,18 @@ def test_deeply_nested(self) -> None: data = {"a": {"b": {"c": [{"cpf": "33333333333", "is_pep": True}]}}} assert "33333333333" in _collect_pep_cpfs(data) + def test_compound_role_collected(self) -> None: + """Compound roles like 'Deputado Federal' must be recognized in the walk.""" + data = { + "results": [ + {"cpf": "11111111111", "role": "Deputado Federal"}, + {"cpf": "22222222222", "role": "assessor parlamentar"}, + ] + } + peps = _collect_pep_cpfs(data) + assert "11111111111" in peps + assert "22222222222" not in peps + # --------------------------------------------------------------------------- # Unit tests for mask_cpfs_in_json @@ -205,4 +239,4 @@ async def test_health_not_masked(client: AsyncClient) -> None: """Non-CPF JSON responses pass through unchanged.""" resp = await client.get("/health") assert resp.status_code == 200 - assert resp.json()["status"] == "ok" and "version" in resp.json() + assert resp.json() == {"status": "ok"} diff --git a/api/tests/unit/test_health.py b/api/tests/unit/test_health.py index d250005..320e27b 100644 --- a/api/tests/unit/test_health.py +++ b/api/tests/unit/test_health.py @@ -8,9 +8,7 @@ async def test_health_returns_ok(client: AsyncClient) -> None: response = await client.get("/health") assert response.status_code == 200 - data = response.json() - assert data["status"] == "ok" - assert "version" in data + assert response.json() == {"status": "ok"} assert response.headers["x-content-type-options"] == "nosniff" assert response.headers["x-frame-options"] == "DENY" assert response.headers["referrer-policy"] == "no-referrer" diff --git a/api/tests/unit/test_patterns.py b/api/tests/unit/test_patterns.py deleted file mode 100644 index f87d026..0000000 --- a/api/tests/unit/test_patterns.py +++ /dev/null @@ -1,120 +0,0 @@ -from unittest.mock import AsyncMock, patch - -import pytest -from httpx import AsyncClient - -from bracc.config import settings -from bracc.models.pattern import PATTERN_METADATA -from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES -from bracc.services.neo4j_service import CypherLoader - - -@pytest.fixture(autouse=True) -def _enable_patterns(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(settings, "patterns_enabled", True) - - -def test_all_community_patterns_have_metadata() -> None: - for pattern_id in COMMUNITY_PATTERN_IDS: - assert pattern_id in PATTERN_METADATA, f"Missing metadata for {pattern_id}" - - -def test_all_community_patterns_have_query_files() -> None: - for query_name in COMMUNITY_PATTERN_QUERIES.values(): - try: - CypherLoader.load(query_name) - except FileNotFoundError: - pytest.fail(f"Missing .cypher file for query {query_name}.cypher") - finally: - CypherLoader.clear_cache() - - -def test_pattern_metadata_has_required_fields() -> None: - for pid, meta in PATTERN_METADATA.items(): - assert "name_pt" in meta, f"{pid} 
missing name_pt" - assert "name_en" in meta, f"{pid} missing name_en" - assert "desc_pt" in meta, f"{pid} missing desc_pt" - assert "desc_en" in meta, f"{pid} missing desc_en" - - -@pytest.mark.anyio -async def test_list_patterns_endpoint(client: AsyncClient) -> None: - response = await client.get("/api/v1/patterns/") - assert response.status_code == 200 - data = response.json() - assert "patterns" in data - assert len(data["patterns"]) == 8 - - ids = {row["id"] for row in data["patterns"]} - assert ids == set(COMMUNITY_PATTERN_IDS) - - -@pytest.mark.anyio -async def test_patterns_endpoint_returns_503_when_disabled( - client: AsyncClient, - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr(settings, "patterns_enabled", False) - response = await client.get("/api/v1/patterns/") - assert response.status_code == 503 - assert "temporarily unavailable" in response.json()["detail"] - - -@pytest.mark.anyio -async def test_invalid_pattern_returns_404(client: AsyncClient) -> None: - response = await client.get("/api/v1/patterns/test-id/nonexistent_pattern") - assert response.status_code == 404 - assert "Pattern not found" in response.json()["detail"] - - -@pytest.mark.anyio -async def test_patterns_endpoint_forwards_include_probable(client: AsyncClient) -> None: - with patch("bracc.routers.patterns.run_all_patterns", new_callable=AsyncMock) as mock_run_all: - mock_run_all.return_value = [] - response = await client.get("/api/v1/patterns/test-id?include_probable=true") - - assert response.status_code == 200 - mock_run_all.assert_awaited_once() - _driver, entity_id, _lang = mock_run_all.await_args.args - assert entity_id == "test-id" - assert mock_run_all.await_args.kwargs["include_probable"] is True - - -@pytest.mark.anyio -async def test_specific_pattern_endpoint_forwards_include_probable(client: AsyncClient) -> None: - with patch("bracc.routers.patterns.run_pattern", new_callable=AsyncMock) as mock_run_one: - mock_run_one.return_value = [] - response = await client.get( - "/api/v1/patterns/test-id/debtor_contracts?include_probable=true", - ) - - assert response.status_code == 200 - mock_run_one.assert_awaited_once() - _session, pattern_name, entity_id, _lang = mock_run_one.await_args.args - assert pattern_name == "debtor_contracts" - assert entity_id == "test-id" - assert mock_run_one.await_args.kwargs["include_probable"] is True - - -def test_community_queries_use_bind_params() -> None: - for query_name in COMMUNITY_PATTERN_QUERIES.values(): - try: - cypher = CypherLoader.load(query_name) - finally: - CypherLoader.clear_cache() - assert "$company_id" in cypher, f"{query_name}.cypher missing $company_id" - assert "$company_identifier" in cypher, f"{query_name}.cypher missing $company_identifier" - assert "$company_identifier_formatted" in cypher, ( - f"{query_name}.cypher missing $company_identifier_formatted" - ) - assert "${" not in cypher, f"{query_name}.cypher uses unsafe string interpolation" - - -def test_no_banned_words_in_pattern_metadata() -> None: - banned = {"suspicious", "corrupt", "criminal", "fraudulent", "illegal", "guilty"} - for pid, meta in PATTERN_METADATA.items(): - for key, value in meta.items(): - for word in banned: - assert word not in value.lower(), ( - f"Banned word '{word}' in {pid}.{key}: {value}" - ) diff --git a/api/tests/unit/test_patterns_new.py b/api/tests/unit/test_patterns_new.py deleted file mode 100644 index 0b8f919..0000000 --- a/api/tests/unit/test_patterns_new.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Community public-safe pattern registry and 
query contract tests.""" - -import pytest - -from bracc.models.pattern import PATTERN_METADATA -from bracc.services.intelligence_provider import COMMUNITY_PATTERN_IDS, COMMUNITY_PATTERN_QUERIES -from bracc.services.neo4j_service import CypherLoader - - -def test_community_pattern_registry_exact_ids() -> None: - assert len(COMMUNITY_PATTERN_IDS) == 8 - assert set(COMMUNITY_PATTERN_IDS) == { - "sanctioned_still_receiving", - "amendment_beneficiary_contracts", - "split_contracts_below_threshold", - "contract_concentration", - "embargoed_receiving", - "debtor_contracts", - "srp_multi_org_hitchhiking", - "inexigibility_recurrence", - } - - -def test_community_pattern_query_mapping_is_complete() -> None: - assert set(COMMUNITY_PATTERN_QUERIES.keys()) == set(COMMUNITY_PATTERN_IDS) - for query_name in COMMUNITY_PATTERN_QUERIES.values(): - assert query_name.startswith("public_pattern_") - - -@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values()) -def test_public_pattern_query_files_load(query_name: str) -> None: - try: - CypherLoader.load(query_name) - finally: - CypherLoader.clear_cache() - - -@pytest.mark.parametrize("query_name", COMMUNITY_PATTERN_QUERIES.values()) -def test_public_pattern_query_required_return_aliases(query_name: str) -> None: - try: - cypher = CypherLoader.load(query_name) - finally: - CypherLoader.clear_cache() - - for required_alias in ( - " AS pattern_id", - " AS risk_signal", - " AS amount_total", - " AS window_start", - " AS window_end", - " AS evidence_refs", - " AS evidence_count", - ): - assert required_alias in cypher, f"{query_name}.cypher missing alias: {required_alias}" - - -@pytest.mark.parametrize("pattern_id", COMMUNITY_PATTERN_IDS) -def test_community_pattern_metadata_is_present(pattern_id: str) -> None: - meta = PATTERN_METADATA.get(pattern_id) - assert meta is not None - assert meta.get("name_pt") - assert meta.get("name_en") - assert meta.get("desc_pt") - assert meta.get("desc_en") - - -def test_threshold_params_used_in_threshold_patterns() -> None: - query_params = { - "public_pattern_split_contracts_below_threshold": "$pattern_split_threshold_value", - "public_pattern_contract_concentration": "$pattern_share_threshold", - "public_pattern_srp_multi_org_hitchhiking": "$pattern_srp_min_orgs", - "public_pattern_inexigibility_recurrence": "$pattern_inexig_min_recurrence", - } - for query_name, required_param in query_params.items(): - try: - cypher = CypherLoader.load(query_name) - finally: - CypherLoader.clear_cache() - assert required_param in cypher, f"{query_name}.cypher missing {required_param}" diff --git a/api/tests/unit/test_public_mode.py b/api/tests/unit/test_public_mode.py index 72506ab..71c85a4 100644 --- a/api/tests/unit/test_public_mode.py +++ b/api/tests/unit/test_public_mode.py @@ -225,6 +225,135 @@ async def test_public_graph_company_filters_person_nodes(client: AsyncClient) -> assert len(payload["edges"]) == 0 +@pytest.mark.anyio +async def test_baseline_disabled_in_public_mode( + client: AsyncClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(settings, "public_mode", True) + monkeypatch.setattr(settings, "public_allow_entity_lookup", False) + response = await client.get("/api/v1/baseline/test-id") + assert response.status_code == 403 + assert "disabled in public mode" in response.json()["detail"] + + +@pytest.mark.anyio +async def test_stats_hides_person_count_in_public_mode( + client: AsyncClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(settings, "public_mode", True) 
+ monkeypatch.setattr(settings, "public_allow_person", False) + # Clear stats cache to ensure fresh computation + import bracc.routers.meta as meta_mod + monkeypatch.setattr(meta_mod, "_stats_cache", None) + + fake_record = { + "total_nodes": 100, + "total_relationships": 200, + "person_count": 999, + "company_count": 50, + "health_count": 10, + "finance_count": 5, + "contract_count": 20, + "sanction_count": 3, + "election_count": 7, + "amendment_count": 4, + "embargo_count": 2, + "education_count": 6, + "convenio_count": 8, + "laborstats_count": 9, + "offshore_entity_count": 1, + "offshore_officer_count": 2, + "global_pep_count": 3, + "cvm_proceeding_count": 4, + "expense_count": 11, + "pep_record_count": 12, + "expulsion_count": 13, + "leniency_count": 14, + "international_sanction_count": 15, + "gov_card_expense_count": 16, + "gov_travel_count": 17, + "bid_count": 18, + "fund_count": 19, + "dou_act_count": 20, + "tax_waiver_count": 21, + "municipal_finance_count": 22, + "declared_asset_count": 23, + "party_membership_count": 24, + "barred_ngo_count": 25, + "bcb_penalty_count": 26, + "labor_movement_count": 27, + "legal_case_count": 28, + "judicial_case_count": 29, + "source_document_count": 30, + "ingestion_run_count": 31, + "temporal_violation_count": 32, + "cpi_count": 33, + "inquiry_requirement_count": 34, + "inquiry_session_count": 35, + "municipal_bid_count": 36, + "municipal_contract_count": 37, + "municipal_gazette_act_count": 38, + } + with patch( + "bracc.routers.meta.execute_query_single", + new_callable=AsyncMock, + return_value=fake_record, + ), patch( + "bracc.routers.meta.load_source_registry", + return_value=[], + ), patch( + "bracc.routers.meta.source_registry_summary", + return_value={ + "universe_v1_sources": 0, + "implemented_sources": 0, + "loaded_sources": 0, + "healthy_sources": 0, + "stale_sources": 0, + "blocked_external_sources": 0, + "quality_fail_sources": 0, + "discovered_uningested_sources": 0, + }, + ): + response = await client.get("/api/v1/meta/stats") + + assert response.status_code == 200 + payload = response.json() + assert payload["person_count"] == 0 + assert payload["company_count"] == 50 # non-person counts preserved + + +@pytest.mark.anyio +async def test_timeline_sanitizes_properties_in_public_mode( + client: AsyncClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(settings, "public_mode", True) + monkeypatch.setattr(settings, "public_allow_entity_lookup", True) + mock_records = [ + { + "lbls": ["Contract"], + "props": {"type": "licitacao", "cpf": "12345678900", "value": 50000.0}, + "event_date": "2024-01-15", + "id": "evt-1", + }, + ] + with patch( + "bracc.routers.entity.execute_query", + new_callable=AsyncMock, + return_value=mock_records, + ): + response = await client.get("/api/v1/entity/test-id/timeline") + + assert response.status_code == 200 + payload = response.json() + assert len(payload["events"]) == 1 + event_props = payload["events"][0]["properties"] + assert "cpf" not in event_props + assert event_props["value"] == 50000.0 + + @pytest.mark.anyio async def test_investigations_disabled_in_public_mode( client: AsyncClient, diff --git a/api/tests/unit/test_rate_limit.py b/api/tests/unit/test_rate_limit.py index dc6a781..f401a52 100644 --- a/api/tests/unit/test_rate_limit.py +++ b/api/tests/unit/test_rate_limit.py @@ -1,24 +1,15 @@ from unittest.mock import MagicMock -from bracc.config import settings from bracc.middleware.rate_limit import _get_rate_limit_key, limiter from bracc.services.auth_service import 
create_access_token -def _make_request( - auth_header: str | None = None, - client_ip: str = "127.0.0.1", - cookie_token: str | None = None, - x_forwarded_for: str | None = None, -) -> MagicMock: +def _make_request(auth_header: str | None = None, client_ip: str = "127.0.0.1") -> MagicMock: request = MagicMock() headers: dict[str, str] = {} if auth_header: headers["authorization"] = auth_header - if x_forwarded_for: - headers["x-forwarded-for"] = x_forwarded_for request.headers = headers - request.cookies = {settings.auth_cookie_name: cookie_token} if cookie_token else {} request.client = MagicMock() request.client.host = client_ip return request @@ -43,23 +34,5 @@ def test_key_func_invalid_token_fallback() -> None: assert key == "10.0.0.1" -def test_key_func_extracts_user_from_cookie_token() -> None: - token = create_access_token("cookie-user-1") - request = _make_request(cookie_token=token) - key = _get_rate_limit_key(request) - assert key == "user:cookie-user-1" - - -def test_key_func_uses_forwarded_ip_when_enabled() -> None: - original = settings.trust_proxy_headers - try: - settings.trust_proxy_headers = True - request = _make_request(client_ip="127.0.0.1", x_forwarded_for="203.0.113.9, 10.0.0.4") - key = _get_rate_limit_key(request) - assert key == "203.0.113.9" - finally: - settings.trust_proxy_headers = original - - def test_limiter_instance_exists() -> None: assert limiter is not None diff --git a/api/tests/unit/test_search.py b/api/tests/unit/test_search.py index 494171c..bdc9fe8 100644 --- a/api/tests/unit/test_search.py +++ b/api/tests/unit/test_search.py @@ -1,21 +1,6 @@ import pytest from httpx import AsyncClient -from bracc.routers.search import _escape_lucene - - -def test_escape_lucene_cnpj() -> None: - assert _escape_lucene("00.000.000/0001-00") == "00.000.000\\/0001\\-00" - - -def test_escape_lucene_plain_text() -> None: - assert _escape_lucene("silva construcoes") == "silva construcoes" - - -def test_escape_lucene_all_special_chars() -> None: - for ch in r'+-&|!(){}[]^"~*?:\/': - assert f"\\{ch}" in _escape_lucene(ch) - @pytest.mark.anyio async def test_search_rejects_short_query(client: AsyncClient) -> None: diff --git a/api/uv.lock b/api/uv.lock index 1cae4a8..ddb9dbc 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -103,6 +103,56 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" }, ] +[[package]] +name = "bracc-api" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "bcrypt" }, + { name = "fastapi" }, + { name = "jinja2" }, + { name = "neo4j" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "pyjwt", extra = ["crypto"] }, + { name = "python-multipart" }, + { name = "slowapi" }, + { name = "uvicorn", extra = ["standard"] }, + { name = "weasyprint" }, +] + +[package.optional-dependencies] +dev = [ + { name = "httpx" }, + { name = "mypy" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "ruff" }, + { name = "testcontainers", extra = ["neo4j"] }, +] + +[package.metadata] +requires-dist = [ + { name = "bcrypt", specifier = ">=4.0.0" }, + { name = "fastapi", specifier = ">=0.115.0" }, + { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" }, + { name = "jinja2", specifier = ">=3.1.0" }, + { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" }, + { name = "neo4j", specifier = ">=5.27.0" }, + { name = "pydantic", specifier = ">=2.10.0" }, + { name = "pydantic-settings", specifier = ">=2.7.0" }, + { name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" }, + { name = "python-multipart", specifier = ">=0.0.18" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "slowapi", specifier = ">=0.1.9" }, + { name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" }, + { name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" }, + { name = "weasyprint", specifier = ">=62.0" }, +] +provides-extras = ["dev"] + [[package]] name = "brotli" version = "1.2.0" @@ -523,56 +573,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[[package]] -name = "bracc-api" -version = "0.1.0" -source = { editable = "." 
} -dependencies = [ - { name = "bcrypt" }, - { name = "fastapi" }, - { name = "jinja2" }, - { name = "neo4j" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pyjwt", extra = ["crypto"] }, - { name = "python-multipart" }, - { name = "slowapi" }, - { name = "uvicorn", extra = ["standard"] }, - { name = "weasyprint" }, -] - -[package.optional-dependencies] -dev = [ - { name = "httpx" }, - { name = "mypy" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, - { name = "ruff" }, - { name = "testcontainers", extra = ["neo4j"] }, -] - -[package.metadata] -requires-dist = [ - { name = "bcrypt", specifier = ">=4.0.0" }, - { name = "fastapi", specifier = ">=0.115.0" }, - { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" }, - { name = "jinja2", specifier = ">=3.1.0" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" }, - { name = "neo4j", specifier = ">=5.27.0" }, - { name = "pydantic", specifier = ">=2.10.0" }, - { name = "pydantic-settings", specifier = ">=2.7.0" }, - { name = "pyjwt", extras = ["crypto"], specifier = ">=2.9.0" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, - { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" }, - { name = "python-multipart", specifier = ">=0.0.18" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, - { name = "slowapi", specifier = ">=0.1.9" }, - { name = "testcontainers", extras = ["neo4j"], marker = "extra == 'dev'", specifier = ">=4.0" }, - { name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" }, - { name = "weasyprint", specifier = ">=62.0" }, -] -provides-extras = ["dev"] - [[package]] name = "idna" version = "3.11" diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/cnpj/extracted/.gitkeep b/data/cnpj/extracted/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/cnpj/raw/.gitkeep b/data/cnpj/raw/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/cnpj/reference/.gitkeep b/data/cnpj/reference/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/brand/bracc-header.png b/docs/brand/bracc-header.png deleted file mode 100644 index 7cbd43b..0000000 Binary files a/docs/brand/bracc-header.png and /dev/null differ diff --git a/docs/brand/wtg-header.png b/docs/brand/wtg-header.png deleted file mode 100644 index 5e2fa56..0000000 Binary files a/docs/brand/wtg-header.png and /dev/null differ diff --git a/docs/data-sources.md b/docs/data-sources.md index 8c1ea34..cd35c72 100644 --- a/docs/data-sources.md +++ b/docs/data-sources.md @@ -1,24 +1,13 @@ -# BRACC Data Source Catalog +# ICARUS Data Source Catalog - -**Generated from `docs/source_registry_br_v1.csv` (as-of UTC: 2026-03-01T23:05:00Z)** - -- Universe v1 sources: 108 -- Implemented pipelines: 45 -- Loaded sources (load_state=loaded): 36 -- Partial sources (load_state=partial): 8 -- Not loaded sources (load_state=not_loaded): 64 -- Status counts: loaded=36, partial=5, stale=3, blocked_external=1, not_built=63 - - -Catalog note: counts and status labels are generated from the public registry (`docs/source_registry_br_v1.csv`). -This document includes reference production inventory context and backlog discovery; it is not a guarantee that every listed source is currently loaded in your local environment. +**38 loaded | 3 pipelines pending data | 60+ not yet built** +Last updated: 2026-02-26 --- -## 1. 
Reference Production Snapshot (Loaded/Implemented Inventory) +## 1. LOADED (38 sources) -The table below is a timestamped reference snapshot and should be interpreted together with the generated summary block above. +All sources below have working ETL pipelines in `etl/src/icarus_etl/pipelines/` and are loaded into production Neo4j. | # | Source | Pipeline | Nodes Created | Rels Created | Notes | |---|--------|----------|---------------|--------------|-------| diff --git a/docs/demo/dataset-contract.md b/docs/demo/dataset-contract.md deleted file mode 100644 index d03eda7..0000000 --- a/docs/demo/dataset-contract.md +++ /dev/null @@ -1,29 +0,0 @@ -# Demo Dataset Contract (WTG Open) - -## Objective -Provide a reproducible, public-safe demo graph with synthetic records only. - -## Safety rules -- Synthetic data only. No real CPF, no real personal names, no real personal addresses. -- Company identifiers may use synthetic CNPJ-like values reserved for demonstration. -- Demo graph cannot include `Person` or `Partner` labels. -- Demo exports must never include private or operational metadata. - -## Required files -- `data/demo/synthetic_graph.json` -- `data/demo/README.md` -- `scripts/generate_demo_dataset.py` - -## JSON schema (minimum) -- `nodes[]`: `{id, label, type, properties}` -- `edges[]`: `{id, source, target, type, properties}` -- `meta`: `{generated_at_utc, generator_version, source: "synthetic"}` - -## Acceptance checks -- No field name contains `cpf`, `doc_partial`, or `doc_raw`. -- No node label equals `Person` or `Partner`. -- CI privacy gate passes. - -## Runtime target -- Dedicated demo Neo4j instance (non-production). -- Public API served with `PUBLIC_MODE=true`. diff --git a/docs/release/community_announcement_template.md b/docs/release/community_announcement_template.md index f4ec9a7..18379cc 100644 --- a/docs/release/community_announcement_template.md +++ b/docs/release/community_announcement_template.md @@ -14,7 +14,6 @@ Resumo: Release notes: {release_url} Observação de integridade: os sinais refletem coocorrências em bases públicas e não constituem prova legal. -Divulgação obrigatória: o repositório público entrega engine + demo + fluxo BYO-data; métricas de escala são snapshots de referência com timestamp. ## Short post (EN) @@ -28,7 +27,6 @@ Summary: Release notes: {release_url} Integrity note: signals reflect co-occurrence in public records and are not legal proof. -Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; production-scale metrics are timestamped reference snapshots. 
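The demo dataset contract removed above fixed a minimal JSON shape (`nodes[]`, `edges[]`, `meta`) plus privacy acceptance checks for `data/demo/synthetic_graph.json`. A minimal sketch of a validator enforcing those checks, assuming plain dict input — names here are illustrative, not the repo's actual `scripts/generate_demo_dataset.py`:

```python
import json
from datetime import datetime, timezone

# Illustrative constants taken from the contract's acceptance checks.
FORBIDDEN_FIELDS = {"cpf", "doc_partial", "doc_raw"}
FORBIDDEN_LABELS = {"Person", "Partner"}


def validate_demo_graph(graph: dict) -> list[str]:
    """Return contract violations; an empty list means the graph passes."""
    errors: list[str] = []
    for node in graph.get("nodes", []):
        if node.get("label") in FORBIDDEN_LABELS:
            errors.append(f"forbidden label on node {node.get('id')}")
        if FORBIDDEN_FIELDS & set(node.get("properties", {})):
            errors.append(f"forbidden field on node {node.get('id')}")
    for edge in graph.get("edges", []):
        if FORBIDDEN_FIELDS & set(edge.get("properties", {})):
            errors.append(f"forbidden field on edge {edge.get('id')}")
    if graph.get("meta", {}).get("source") != "synthetic":
        errors.append("meta.source must be 'synthetic'")
    return errors


# Smallest graph satisfying the contract: one synthetic company, no people.
demo = {
    "nodes": [
        {"id": "n1", "label": "Company", "type": "company",
         "properties": {"name": "Synthetic Co", "cnpj": "00000000000100"}},
    ],
    "edges": [],
    "meta": {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "generator_version": "0.0.1",
        "source": "synthetic",
    },
}
assert validate_demo_graph(demo) == []
print(json.dumps(demo, indent=2))
```

A check along these lines mirrors what the CI privacy gate is described as enforcing: no `Person`/`Partner` labels and no `cpf`-like field names anywhere in the demo export.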
## Discord/Telegram long form (PT+EN) @@ -44,11 +42,6 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p **Compatibilidade** - {pt_compat} -**Reproducibility Reality Check** -- Funciona agora: {pt_works_now} -- Requer ingestão de dados: {pt_requires_ingestion} -- Não incluído por padrão: {pt_not_included} - **Link** - {release_url} @@ -64,10 +57,5 @@ Mandatory disclosure: the public repo ships engine + demo + BYO-data workflow; p **Compatibility** - {en_compat} -**Reproducibility Reality Check** -- Works now: {en_works_now} -- Requires data ingestion: {en_requires_ingestion} -- Not included by default: {en_not_included} - **Link** - {release_url} diff --git a/docs/release/public_boundary_matrix.csv b/docs/release/public_boundary_matrix.csv index 9f4ff1a..e200cad 100644 --- a/docs/release/public_boundary_matrix.csv +++ b/docs/release/public_boundary_matrix.csv @@ -7,8 +7,8 @@ docs/**,PUBLIC with review,Keep public documentation and legal pack,include revi .github/workflows/**,PUBLIC,CI and security transparency,include scripts/**,PUBLIC with review,Keep public utilities and gates,include reviewed subset data/demo/**,PUBLIC,Synthetic demo dataset only,include -api/src/bracc/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude -api/src/bracc/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude +api/src/icarus/services/pattern_service.py,REMOVE_FROM_PUBLIC,Pattern engine disabled pending validation,exclude +api/src/icarus/queries/pattern_*.cypher,REMOVE_FROM_PUBLIC,Pattern query engine disabled pending validation,exclude scripts/auto_finalize_pncp_backfill.sh,REMOVE_FROM_PUBLIC,Production operational script with server-specific assumptions,exclude docs/shadow_rollout_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude docs/ingestion_priority_runbook.md,REMOVE_FROM_PUBLIC,Production operational runbook details,exclude diff --git a/docs/release/public_repo_release_checklist.md b/docs/release/public_repo_release_checklist.md index 73c4f6c..8799cf8 100644 --- a/docs/release/public_repo_release_checklist.md +++ b/docs/release/public_repo_release_checklist.md @@ -1,56 +1,78 @@ -# Public Repo Release Checklist — `World-Open-Graph/br-acc` - -## 1) Pre-release gate - -1. Confirm target merge commit exists on `main`. -2. Confirm CI + Security + Public gates are green on that commit. -3. Confirm PR is merged with exactly one release label. - -## 2) Public boundary checks +# Public Repo Release Checklist — World Transparency Graph +## 1) Prepare sanitized snapshot ```bash -python scripts/check_public_privacy.py --repo-root . -python scripts/check_compliance_pack.py --repo-root . -python scripts/check_open_core_boundary.py --repo-root . +bash scripts/prepare_public_snapshot.sh /Users/brunoclz/CORRUPTOS /tmp/world-transparency-graph-public ``` -Expected: all `PASS`. - -## 3) Snapshot hygiene (optional verification) - +## 2) Initialize clean-history repo from snapshot ```bash -bash scripts/prepare_public_snapshot.sh . /tmp/br-acc-public -python /tmp/br-acc-public/scripts/check_public_privacy.py --repo-root /tmp/br-acc-public -python /tmp/br-acc-public/scripts/check_compliance_pack.py --repo-root /tmp/br-acc-public -python /tmp/br-acc-public/scripts/check_open_core_boundary.py --repo-root /tmp/br-acc-public +cd /tmp/world-transparency-graph-public +git init +git add . +git commit -m "Initial public release (WTG)" ``` -Expected in snapshot: - -- No `CLAUDE.md`. 
-- No `AGENTS.md` or `AGENTS*.md`. -- No private operational runbooks outside public scope. - -## 4) Publish release (manual workflow) +## 3) Create GitHub repository (manual) +- Owner: `brunoclz` +- Name: `world-transparency-graph` +- Visibility: Public +- Do not auto-add README/License (already present) -In GitHub Actions, run **Publish Release** with: - -- `version`: SemVer tag (e.g. `v0.3.0`, `v0.3.1-rc.1`) -- `target_sha`: merge commit on `main` -- `prerelease`: `false` (stable) or `true` (RC) -- `title_pt`: release title PT-BR -- `title_en`: release title EN - -## 5) Verify outputs - -1. Tag exists in repository. -2. Release page published under `/releases`. -3. Notes include PT+EN and non-accusatory disclaimer. -4. `release_manifest.json` asset is attached. -5. Compare link is valid (`previous_tag...new_tag`). - -## 6) Community communication +## 4) Push initial release +```bash +git branch -M main +git remote add origin https://github.com/brunoclz/world-transparency-graph.git +git push -u origin main +``` -1. Use `docs/release/community_announcement_template.md`. -2. Publish short PT+EN summary with release URL. -3. Keep wording factual: “signals/co-occurrence”, never accusatory language. +## 5) Configure branch protection (GitHub UI) +Require all checks: +- `API (Python)` +- `ETL (Python)` +- `Frontend (TypeScript)` +- `Neutrality Audit` +- `Gitleaks` +- `Bandit (Python)` +- `Pip Audit (Python deps)` +- `Public Privacy Gate` +- `Compliance Pack Gate` +- `Public Boundary Gate` + +## 6) Configure environment defaults +- Set public deployment environment vars: + - `PRODUCT_TIER=community` + - `PUBLIC_MODE=true` + - `PUBLIC_ALLOW_PERSON=false` + - `PUBLIC_ALLOW_ENTITY_LOOKUP=false` + - `PUBLIC_ALLOW_INVESTIGATIONS=false` + - `PATTERNS_ENABLED=false` + - `VITE_PUBLIC_MODE=true` + - `VITE_PATTERNS_ENABLED=false` + +## 7) Final checks before launch +- `python scripts/check_public_privacy.py --repo-root .` => `PASS` +- `python scripts/check_compliance_pack.py --repo-root .` => `PASS` +- `python scripts/check_open_core_boundary.py --repo-root .` => `PASS` +- Confirm no internal runbooks in public repo +- Confirm demo data is synthetic (`data/demo/synthetic_graph.json`) +- Confirm all legal docs exist in root: + - `ETHICS.md` + - `LGPD.md` + - `PRIVACY.md` + - `TERMS.md` + - `DISCLAIMER.md` + - `SECURITY.md` + - `ABUSE_RESPONSE.md` + +## 8) Launch communication split +- Publish product announcement as **WTG** +- Publish movement announcement as **BRCC** +- Mention methodology limits and non-accusatory policy + +## 9) Release system bootstrap +- Ensure `.github/release.yml` exists for auto-notes categories. +- Ensure `.github/release-drafter.yml` + workflow are active. +- Ensure `publish-release.yml` workflow is present and dispatchable. +- Ensure release label taxonomy is documented and applied to PRs. +- Publish first policy-compliant tag from this stream (`v0.3.0`). diff --git a/docs/release/release_policy.md b/docs/release/release_policy.md index 7f9bfc3..de6ca83 100644 --- a/docs/release/release_policy.md +++ b/docs/release/release_policy.md @@ -48,11 +48,10 @@ A release can only be published from a commit on `main` where all required gates Every release must include PT-BR and EN sections with: 1. Scope summary. -2. Notable changes (explicit bullet points). -3. Included pattern IDs when release contains pattern/signal changes. -4. Compatibility/breaking notes. -5. Privacy/compliance notes when applicable. -6. Non-accusatory disclaimer. +2. Notable changes. +3. 
Compatibility/breaking notes. +4. Privacy/compliance notes when applicable. +5. Non-accusatory disclaimer. ## Artifacts diff --git a/docs/release/release_runbook.md b/docs/release/release_runbook.md index 33ca74b..491f11a 100644 --- a/docs/release/release_runbook.md +++ b/docs/release/release_runbook.md @@ -37,19 +37,6 @@ For validation cycles use RC: - `prerelease`: `true` for RC, `false` for stable - `title_pt`: short PT-BR title - `title_en`: short EN title -- `highlights_pt`: PT highlights separated by `|` -- `highlights_en`: EN highlights separated by `|` -- `patterns_included`: comma-separated pattern IDs (use `none` when not applicable) -- `technical_changes_pt`: PT technical changes separated by `|` -- `technical_changes_en`: EN technical changes separated by `|` - -Example inputs for a pattern release: - -- `highlights_pt`: `Port de 8 padrões públicos factuais | Padronização de payload público` -- `highlights_en`: `Port of 8 factual public-safe patterns | Public payload standardization` -- `patterns_included`: `sanctioned_still_receiving,amendment_beneficiary_contracts,split_contracts_below_threshold,contract_concentration,embargoed_receiving,debtor_contracts,srp_multi_org_hitchhiking,inexigibility_recurrence` -- `technical_changes_pt`: `Provider community de 4 para 8 padrões | ETL criou relação Contract-REFERENTE_A-Bid` -- `technical_changes_en`: `Community provider expanded from 4 to 8 patterns | ETL created Contract-REFERENTE_A-Bid linkage` ## 4) Workflow validations performed @@ -65,7 +52,7 @@ The workflow blocks publication when: On success the workflow: 1. Creates and pushes an annotated tag. -2. Creates GitHub Release (PT+EN notes) with explicit highlights, patterns, and technical changes. +2. Creates GitHub Release (PT+EN notes). 3. Uploads `release_manifest.json` asset. ## 6) Post-release checklist @@ -73,7 +60,6 @@ On success the workflow: 1. Open the release page and confirm: - version tag is correct, - PT+EN notes are present, -- included patterns are explicitly listed (or marked as none), - non-accusatory disclaimer line is present, - `release_manifest.json` is attached. 2. Share release link in community channels. diff --git a/docs/source_onboarding_contract.md b/docs/source_onboarding_contract.md deleted file mode 100644 index c58602c..0000000 --- a/docs/source_onboarding_contract.md +++ /dev/null @@ -1,67 +0,0 @@ -# Source Onboarding Contract (Brazil Coverage v1) - -This contract is mandatory for every new source before `shadow -> promote`. - -## 1. Source Identity -- `source_id`: -- `name`: -- `category`: -- `tier`: -- `owner_agent`: -- `primary_url`: -- `access_mode` (`file|api|bigquery|web`): -- `public_access_mode` (`open|open_with_rate_limit|registration|credentialed_public`): -- `discovery_status` (`discovered|discovered_uningested|monitored|unreachable`): -- `last_seen_url`: -- `cadence_expected`: -- `cadence_observed`: -- `quality_status` (`healthy|stale|quality_fail|blocked_external|not_built|partial|loaded`): - -## 2. Access and Legal -- Credential required: -- Secret name/path: -- License or usage restriction: -- LGPD/privacy considerations: -- `blocked_external` criteria: - -## 3. Data Contract -- Downloader script: `etl/scripts/download_.py` -- Canonical output files: -- Manifest file: -- Manifest mandatory fields (`run_id`, `source_id`, `window_start`, `window_end`, `rows`, `error`, `checksum`, `retrieved_at_utc`): -- Update cadence: -- Expected row volume: -- Partition/window strategy: - -## 4. 
Graph Contract -- Node labels introduced: -- Relationship types introduced: -- Natural key(s) per node: -- Merge key strategy: -- Relationship quality tier (`strong|probable`): -- Provenance fields (`method`, `confidence`, `source_ref`, `run_id`): - -## 5. Index and Constraint Contract -- Required uniqueness constraints: -- Required date indexes: -- Required lookup indexes: -- Required fulltext indexes (if text-heavy): - -## 6. Quality Gates (Hard Stop/Go) -- Identity integrity preserved (`Person.cpf` masked = 0, 14-digit = 0): -- Freshness SLA threshold: -- Temporal sanity (`<= now + 365d`): -- Null/duplicate key thresholds: -- Mandatory non-zero nodes/rels: - -## 7. Operational Flow -- Shadow load command: -- Gate runner commands: -- API smoke checks: -- Promote command: -- Rollback command: - -## 8. Acceptance -- Evidence bundle path in `audit-results/`: -- Final status: `resolved | resolved_full | blocked_external | quality_fail` -- Reviewer sign-off: diff --git a/docs/source_registry_br_v1.csv b/docs/source_registry_br_v1.csv index 1789a8e..2e941f2 100644 --- a/docs/source_registry_br_v1.csv +++ b/docs/source_registry_br_v1.csv @@ -1,109 +1,109 @@ -source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status,last_verified_utc,verification_status -cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error -tse,TSE elections and donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness 
backfill,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,ok -tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error -rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -camara,Camara CEAP expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok -senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -cpgf,CPGF gov card 
expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/contratacoes/publicacao,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error -renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok -tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=bens,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/api/3/action/package_search?q=filiacao,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -caged,CAGED labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale,2026-03-01T23:11:31.444615+00:00,transient_error -eu_sanctions,EU 
sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,transient_error -world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok -mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy,2026-03-01T23:11:31.444615+00:00,ok -querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial,2026-03-01T23:11:31.444615+00:00,ok -datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -cnciai_improbidade,CNIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/datajud/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok 
-anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -aneel_concessions,ANEEL concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation 
alerts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.gov.br/icmbio/pt-br,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -siga_brasil,SIGA Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,auth_or_rate_limited -tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://tcers.tc.br/fiscalizado/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_go,TCE 
Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_al,TCE 
Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error -state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,ok -state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built,2026-03-01T23:11:31.444615+00:00,transient_error +source_id,name,category,tier,status,implementation_state,load_state,frequency,in_universe_v1,primary_url,pipeline_id,owner_agent,access_mode,notes,public_access_mode,discovery_status,last_seen_url,cadence_expected,cadence_observed,quality_status +cnpj,Receita Federal CNPJ,identity,P0,loaded,implemented,loaded,monthly,true,https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/,cnpj,Agent A,file,http://dadosabertos.rfb.gov.br,,monitored,,,,healthy +tse,TSE elections and 
donations,electoral,P0,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/,tse,Agent E,file,Core electoral data loaded,,monitored,,,,healthy +transparencia,Portal da Transparencia contracts,contracts,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados,transparencia,Agent C,file,Federal contracts and servants,,monitored,,,,healthy +sanctions,CEIS CNEP sanctions,sanctions,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/sancoes/consulta,sanctions,Agent C,file,Administrative sanctions,,monitored,,,,healthy +pep_cgu,CGU PEP list,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/pep,pep_cgu,Agent A,file,PEP baseline,,monitored,,,,healthy +bndes,BNDES financings,finance,P1,loaded,implemented,loaded,monthly,true,https://www.bndes.gov.br/wps/portal/site/home/transparencia/dados,bndes,Agent G,file,Loan relationships,,monitored,,,,healthy +pgfn,PGFN divida ativa,fiscal,P0,loaded,implemented,loaded,monthly,true,https://www.regularize.pgfn.gov.br/dados-abertos,pgfn,Agent C,file,Debt risk core,,monitored,,,,healthy +ibama,IBAMA embargos,environment,P1,loaded,implemented,loaded,monthly,true,https://servicos.ibama.gov.br/ctf/publico/areasembargadas/,ibama,Agent F,file,Environmental enforcement,,monitored,,,,healthy +comprasnet,ComprasNet contracts,contracts,P0,stale,implemented,partial,monthly,true,https://dados.gov.br/dados/conjuntos-dados/comprasnet-contratos,comprasnet,Agent C,file,Needs freshness backfill,,monitored,,,,stale +tcu,TCU sanctions,audit,P1,loaded,implemented,loaded,monthly,true,https://contas.tcu.gov.br/ords/f?p=INIDONEAS:INIDONEAS,tcu,Agent C,file,Inidoneidade sanctions,,monitored,,,,healthy +transferegov,TransfereGov emendas e convenios,transfers,P0,loaded,implemented,loaded,monthly,true,https://www.transferegov.sistema.gov.br/portal/download-de-dados,transferegov,Agent C,file,Transfer relationships,,monitored,,,,healthy +rais,RAIS aggregated labor,labor,P1,loaded,implemented,loaded,annual,true,https://basedosdados.org/dataset/br-me-rais,rais,Agent H,bigquery,Aggregate mode only,,monitored,,,,healthy +inep,INEP school census,education,P2,loaded,implemented,loaded,annual,true,https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar,inep,Agent H,file,Education coverage,,monitored,,,,healthy +dou,Diario Oficial da Uniao,gazette,P0,loaded,implemented,loaded,daily,true,https://www.in.gov.br/leiturajornal,dou,Agent E,bigquery,National acts ingestion,,monitored,,,,healthy +datasus,DATASUS CNES,health,P1,loaded,implemented,loaded,monthly,true,https://opendatasus.saude.gov.br/,datasus,Agent H,file,Health establishments,,monitored,,,,healthy +icij,ICIJ offshore leaks,offshore,P1,loaded,implemented,loaded,yearly,true,https://offshoreleaks.icij.org/pages/database,icij,Agent G,file,Offshore entities and officers,,monitored,,,,healthy +opensanctions,OpenSanctions global PEP,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.opensanctions.org/datasets/peps/,opensanctions,Agent G,file,Global PEP matching,,monitored,,,,healthy +cvm,CVM proceedings,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/,cvm,Agent G,file,Proceedings loaded,,monitored,,,,healthy +cvm_funds,CVM fund registry,market,P1,loaded,implemented,loaded,monthly,true,https://dados.cvm.gov.br/dados/FI/,cvm_funds,Agent G,file,Fund baseline,,monitored,,,,healthy +camara,Camara CEAP 
expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.camara.leg.br/,camara,Agent E,api,Expense reimbursement,,monitored,,,,healthy +camara_inquiries,Camara inquiries and requirements,legislative,P0,partial,implemented,partial,daily,true,https://dadosabertos.camara.leg.br/,camara_inquiries,Agent E,api,Sessions still low,,monitored,,,,partial +senado,Senado CEAPS expenses,legislative,P1,loaded,implemented,loaded,monthly,true,https://www12.senado.leg.br/dados-abertos,senado,Agent E,api,Expense data loaded,,monitored,,,,healthy +ceaf,CEAF expelled servants,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/ceaf,ceaf,Agent A,file,Expulsion evidence,,monitored,,,,healthy +cepim,CEPIM barred NGOs,integrity,P1,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cepim,cepim,Agent A,file,NGO restrictions,,monitored,,,,healthy +cpgf,CPGF gov card expenses,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/cpgf,cpgf,Agent H,file,Masked CPF source,,monitored,,,,healthy +leniency,Acordos de leniencia,integrity,P0,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/acordos-leniencia,leniency,Agent A,file,High signal low volume,,monitored,,,,healthy +ofac,OFAC sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://home.treasury.gov/policy-issues/financial-sanctions/sdn-list-data-files,ofac,Agent G,file,International sanctions,,monitored,,,,healthy +holdings,Brasil IO holdings,ownership,P1,loaded,implemented,loaded,monthly,true,https://brasil.io/dataset/socios-brasil/holding/,holdings,Agent G,file,Ownership enrichment,,monitored,,,,healthy +viagens,Viagens a servico,spending,P2,loaded,implemented,loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/viagens,viagens,Agent H,file,Travel spend baseline,,monitored,,,,healthy +siop,SIOP emendas,budget,P0,partial,implemented,partial,annual,true,https://www.siop.planejamento.gov.br/,siop,Agent C,api,Author linkage limited,,monitored,,,,partial +pncp,PNCP bids and contracts,contracts,P0,stale,implemented,partial,monthly,true,https://pncp.gov.br/api/consulta/v1/,pncp,Agent C,api,Freshness SLA pending,,monitored,,,,stale +renuncias,Renuncias fiscais,fiscal,P1,loaded,implemented,loaded,annual,true,https://www.gov.br/receitafederal/pt-br/acesso-a-informacao/dados-abertos,renuncias,Agent G,file,Tax waiver baseline,,monitored,,,,healthy +siconfi,SICONFI municipal finance,fiscal,P1,partial,implemented,partial,annual,true,https://apidatalake.tesouro.gov.br/docs/siconfi/,siconfi,Agent C,api,No CNPJ direct links,,monitored,,,,partial +tse_bens,TSE candidate assets,electoral,P1,loaded,implemented,loaded,biennial,true,https://dadosabertos.tse.jus.br/dataset/bens-candidato,tse_bens,Agent E,file,Patrimony baseline,,monitored,,,,healthy +tse_filiados,TSE party memberships,electoral,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.tse.jus.br/dataset/filiados-partidos,tse_filiados,Agent E,file,Party network,,monitored,,,,healthy +bcb,BCB penalties,finance,P1,loaded,implemented,loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/penalidades,bcb,Agent G,file,Bank penalties loaded,,monitored,,,,healthy +stf,STF court data,judiciary,P1,loaded,implemented,loaded,monthly,true,https://basedosdados.org/dataset/br-stf-corte-aberta,stf,Agent D,bigquery,Supreme court coverage,,monitored,,,,healthy +caged,CAGED 
labor movements,labor,P1,stale,implemented,partial,monthly,true,https://ftp.mtps.gov.br/pdet/microdados/NOVO%20CAGED/,caged,Agent H,file,Aggregate-only implementation,,monitored,,,,stale +eu_sanctions,EU sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions,eu_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy +un_sanctions,UN sanctions,sanctions,P1,loaded,implemented,loaded,monthly,true,https://scsanctions.un.org/resources/xml/en/consolidated.xml,un_sanctions,Agent G,file,International sanctions,,monitored,,,,healthy +world_bank,World Bank debarment,sanctions,P1,loaded,implemented,loaded,monthly,true,https://www.worldbank.org/en/projects-operations/procurement/debarred-firms,world_bank,Agent G,file,International sanctions,,monitored,,,,healthy +senado_cpis,Senado CPIs,legislative,P0,partial,implemented,partial,yearly,true,https://www12.senado.leg.br/dados-abertos,senado_cpis,Agent E,api,Needs richer sessions and requirements,,monitored,,,,partial +mides,MiDES municipal procurement,municipal,P0,loaded,implemented,loaded,daily,true,https://basedosdados.org/dataset/world-wb-mides,mides,Agent H,bigquery,Operational after access fix,,monitored,,,,healthy +querido_diario,Querido Diario gazettes,municipal,P1,partial,implemented,partial,daily,true,https://queridodiario.ok.org.br/api,querido_diario,Agent H,api,Text availability gap,,monitored,,,,partial +datajud,CNJ DataJud,judiciary,P0,blocked_external,implemented,not_loaded,monthly,true,https://api-publica.datajud.cnj.jus.br/,datajud,Agent D,api,Credentials not fully operational in prod,,monitored,,,,blocked_external +bolsa_familia_bpc,Bolsa Familia and BPC,social,P3,not_built,not_implemented,not_loaded,monthly,true,https://portaldatransparencia.gov.br/download-de-dados/bolsa-familia-pagamentos,,Agent H,file,High volume masked identities,,discovered_uningested,,,,not_built +estban,BCB ESTBAN balances,finance,P3,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/estban,,Agent G,file,Banking aggregates,,discovered_uningested,,,,not_built +if_data,BCB IF data indicators,finance,P3,not_built,not_implemented,not_loaded,quarterly,true,https://dadosabertos.bcb.gov.br/dataset/if-data,,Agent G,file,Institution KPIs,,discovered_uningested,,,,not_built +bcb_liquidacao,BCB bank liquidation acts,finance,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.bcb.gov.br/dataset/intervencao-e-liquidacao,,Agent G,file,Regulatory actions,,discovered_uningested,,,,not_built +stj_dados_abertos,STJ open data,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.stj.jus.br/,,Agent D,api,Superior court decisions,,discovered_uningested,,,,not_built +cnciai_improbidade,CNIAI improbidade,judiciary,P1,not_built,not_implemented,not_loaded,monthly,true,https://www.cnj.jus.br/sistemas/cnciai/,,Agent D,api,Misconduct convictions,,discovered_uningested,,,,not_built +carf_tax_appeals,CARF tax appeals,judiciary,P2,not_built,not_implemented,not_loaded,monthly,true,https://carf.economia.gov.br/dados-abertos,,Agent D,file,Tax litigation,,discovered_uningested,,,,not_built +anp_royalties,ANP royalties and fuel,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anp,,Agent F,api,Oil and gas royalties,,discovered_uningested,,,,not_built +aneel_concessions,ANEEL 
concessions,regulatory,P2,not_built,not_implemented,not_loaded,monthly,true,https://dadosabertos.aneel.gov.br/,,Agent F,api,Energy concessions,,discovered_uningested,,,,not_built +anm_mining_rights,ANM mining rights,regulatory,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anm,,Agent F,api,Mining rights and permits,,discovered_uningested,,,,not_built +antt_transport_concessions,ANTT concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antt,,Agent F,api,Transport concessions,,discovered_uningested,,,,not_built +ans_health_plans,ANS operators,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ans,,Agent H,api,Health insurance operators,,discovered_uningested,,,,not_built +anvisa_registrations,ANVISA products,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anvisa,,Agent H,api,Regulatory registrations,,discovered_uningested,,,,not_built +anac_aviation_concessions,ANAC concessions,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anac,,Agent F,api,Aviation contracts,,discovered_uningested,,,,not_built +antaq_port_contracts,ANTAQ contracts,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/antaq,,Agent F,api,Port concessions,,discovered_uningested,,,,not_built +ana_water_grants,ANA water grants,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/ana,,Agent F,api,Water use rights,,discovered_uningested,,,,not_built +anatel_telecom_licenses,ANATEL licenses,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/anatel,,Agent G,api,Telecom operators,,discovered_uningested,,,,not_built +susep_insurance_market,SUSEP insurance market,regulatory,P3,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/susep,,Agent G,file,Insurance entities,,discovered_uningested,,,,not_built +cvm_full_ownership_chain,CVM ownership chains,market,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.cvm.gov.br/,,Agent G,file,Shareholder graph expansion,,discovered_uningested,,,,not_built +receita_dirbi,Receita DIRBI,tax,P1,not_built,not_implemented,not_loaded,monthly,true,https://dados.gov.br/dados/conjuntos-dados/declaracao-dirbi,,Agent G,file,Tax benefit declarations,,discovered_uningested,,,,not_built +mapbiomas_alertas,MapBiomas Alerta,environment,P1,not_built,not_implemented,not_loaded,monthly,true,https://alerta.mapbiomas.org/api,,Agent F,api,Deforestation alerts,,discovered_uningested,,,,not_built +sicar_rural_registry,SiCAR rural registry,environment,P1,not_built,not_implemented,not_loaded,quarterly,true,https://www.car.gov.br/publico/municipios/downloads,,Agent F,file,Property boundaries and owners,,discovered_uningested,,,,not_built +icmbio_cnuc,ICMBio CNUC units,environment,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.icmbio.gov.br/portal/faunabrasileira/cadastro-nacional-de-unidades-de-conservacao,,Agent F,file,Protected areas,,discovered_uningested,,,,not_built +tesouro_emendas,Tesouro emendas,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www.tesourotransparente.gov.br/,,Agent C,file,Budget execution,,discovered_uningested,,,,not_built +siga_brasil,SIGA 
Brasil,budget,P0,not_built,not_implemented,not_loaded,monthly,true,https://www12.senado.leg.br/orcamento/sigabrasil,,Agent C,file,Federal budget traces,,discovered_uningested,,,,not_built +camara_votes_bills,Camara votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://dadosabertos.camara.leg.br/api/v2,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built +senado_votes_bills,Senado votes and bills,legislative,P1,not_built,not_implemented,not_loaded,daily,true,https://legis.senado.leg.br/dadosabertos,,Agent E,api,Legislative behavior,,discovered_uningested,,,,not_built +interpol_red_notices,Interpol red notices,international,P2,not_built,not_implemented,not_loaded,weekly,true,https://www.interpol.int/How-we-work/Notices/Red-Notices,,Agent G,api,Requires key,,discovered_uningested,,,,not_built +tce_sp,TCE Sao Paulo,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://transparencia.tce.sp.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built +tce_pe,TCE Pernambuco,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://sistemas.tce.pe.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built +tce_rj,TCE Rio de Janeiro,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://dados.tce.rj.gov.br/,,Agent H,api,State audit procurement,,discovered_uningested,,,,not_built +tce_rs,TCE Rio Grande do Sul,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.rs.gov.br/,,Agent H,file,State audit procurement,,discovered_uningested,,,,not_built +tce_mg,TCE Minas Gerais,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mg.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_ba,TCE Bahia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ba.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_ce,TCE Ceara,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ce.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_go,TCE Goias,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://portal.tce.go.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_pr,TCE Parana,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www1.tce.pr.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_sc,TCE Santa Catarina,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcesc.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_es,TCE Espirito Santo,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcees.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_mt,TCE Mato Grosso,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.mt.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_ms,TCE Mato Grosso do Sul,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ms.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_am,TCE Amazonas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.am.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_pa,TCE Para,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcepa.tc.br/,,Agent H,web,State audit 
procurement,,discovered_uningested,,,,not_built +tce_ro,TCE Rondonia,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ro.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_rr,TCE Roraima,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcerr.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_ap,TCE Amapa,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.ap.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_to,TCE Tocantins,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceto.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_ma,TCE Maranhao,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tcema.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_pi,TCE Piaui,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.pi.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_rn,TCE Rio Grande do Norte,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.rn.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_pb,TCE Paraiba,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://tce.pb.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_al,TCE Alagoas,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tceal.tc.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +tce_se,TCE Sergipe,state,P3,not_built,not_implemented,not_loaded,monthly,true,https://www.tce.se.gov.br/,,Agent H,web,State audit procurement,,discovered_uningested,,,,not_built +state_portal_sp,Sao Paulo transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sp.gov.br/,,Agent H,api,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_mg,Minas Gerais transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.mg.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_ba,Bahia transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ba.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_ce,Ceara transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.ce.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_go,Goias transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.go.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_pr,Parana transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pr.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_sc,Santa Catarina transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.sc.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_rs,Rio Grande do Sul transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rs.gov.br/,,Agent H,web,State 
expenses and contracts,,discovered_uningested,,,,not_built +state_portal_pe,Pernambuco transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.pe.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built +state_portal_rj,Rio de Janeiro transparency portal,state,P2,not_built,not_implemented,not_loaded,monthly,true,https://www.transparencia.rj.gov.br/,,Agent H,web,State expenses and contracts,,discovered_uningested,,,,not_built diff --git a/etl/pyproject.toml b/etl/pyproject.toml index f54aa99..ff31967 100644 --- a/etl/pyproject.toml +++ b/etl/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "bracc-etl" version = "0.1.0" -description = "BRACC ETL — Data ingestion pipelines for Brazilian public data" +description = "BR-ACC ETL — Data ingestion pipelines for Brazilian public data" requires-python = ">=3.12" license = "AGPL-3.0-or-later" dependencies = [ @@ -9,10 +9,11 @@ dependencies = [ "pandas>=2.2.0", "httpx>=0.28.0", "click>=8.1.0", - "defusedxml>=0.7.1", "pydantic>=2.10.0", "pydantic-settings>=2.7.0", "pypdf>=5.2.0", + "defusedxml>=0.7.0", + "pandera>=0.21.0", ] [project.optional-dependencies] diff --git a/etl/scripts/_download_utils.py b/etl/scripts/_download_utils.py index ed0a67f..14261ea 100644 --- a/etl/scripts/_download_utils.py +++ b/etl/scripts/_download_utils.py @@ -3,8 +3,6 @@ from __future__ import annotations import logging -import shutil -import stat import zipfile from pathlib import Path @@ -38,12 +36,21 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool: response.raise_for_status() + # If we requested a range but server returned full content (200 vs 206), + # start fresh to avoid corruption + if start_byte > 0 and response.status_code != 206: + logger.warning( + "Server ignored Range header for %s, restarting download", + dest.name, + ) + start_byte = 0 + total = response.headers.get("content-length") total_mb = f"{int(total) / 1e6:.1f} MB" if total else "unknown size" logger.info("Downloading %s (%s)...", dest.name, total_mb) - mode = "ab" if start_byte > 0 else "wb" - downloaded = start_byte + mode = "ab" if start_byte > 0 and response.status_code == 206 else "wb" + downloaded = start_byte if mode == "ab" else 0 with open(partial, mode) as f: for chunk in response.iter_bytes(chunk_size=65_536): f.write(chunk) @@ -58,24 +65,49 @@ def download_file(url: str, dest: Path, *, timeout: int = 600) -> bool: return False -def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]: - """Extract ZIP and return list of extracted files. +def safe_extract_zip( + zip_path: Path, + output_dir: Path, + *, + max_total_bytes: int = 50 * 1024**3, # 50GB default (CNPJ zips are huge) +) -> list[Path]: + """Safely extract ZIP with path traversal and bomb guards. Deletes corrupted ZIPs for re-download. 
""" try: with zipfile.ZipFile(zip_path, "r") as zf: - extracted = safe_extract_zip(zf, output_dir) - logger.info("Extracted %d files from %s", len(extracted), zip_path.name) - return extracted + # Check for path traversal + resolved_output = output_dir.resolve() + for info in zf.infolist(): + target = (output_dir / info.filename).resolve() + if not target.is_relative_to(resolved_output): + raise ValueError( + f"Path traversal detected in {zip_path.name}: {info.filename}" + ) + + # Check total uncompressed size (zip bomb guard) + total_size = sum(info.file_size for info in zf.infolist()) + if total_size > max_total_bytes: + raise ValueError( + f"ZIP bomb guard: {zip_path.name} would extract to " + f"{total_size / 1e9:.1f}GB (limit: {max_total_bytes / 1e9:.1f}GB)" + ) + + names = zf.namelist() + zf.extractall(output_dir) + + logger.info("Extracted %d files from %s", len(names), zip_path.name) + return [output_dir / n for n in names] except zipfile.BadZipFile: logger.warning("Bad ZIP file: %s — deleting for re-download", zip_path.name) zip_path.unlink() return [] - except ValueError as exc: - logger.warning("Unsafe ZIP file %s: %s — deleting", zip_path.name, exc) - zip_path.unlink(missing_ok=True) - return [] + + +def extract_zip(zip_path: Path, output_dir: Path) -> list[Path]: + """Extract ZIP and return list of extracted files.""" + return safe_extract_zip(zip_path, output_dir) def validate_csv( @@ -111,60 +143,3 @@ def validate_csv( except Exception as e: logger.warning("Validation failed for %s: %s", path.name, e) return False - - -def safe_extract_zip( - archive: zipfile.ZipFile, - output_dir: Path, - *, - max_members: int = 50_000, - max_uncompressed_bytes: int = 5_000_000_000, -) -> list[Path]: - """Safely extract a ZIP archive. - - Blocks path traversal, symlinks, and oversized archives. - """ - output_root = output_dir.resolve() - infos = archive.infolist() - if len(infos) > max_members: - msg = f"ZIP has too many entries ({len(infos)} > {max_members})" - raise ValueError(msg) - - extracted: list[Path] = [] - uncompressed_total = 0 - for info in infos: - member_name = info.filename.replace("\\", "/") - if not member_name: - continue - - # Reject symlink entries. - mode = info.external_attr >> 16 - if stat.S_ISLNK(mode): - msg = f"ZIP contains symlink entry: {member_name}" - raise ValueError(msg) - - target = (output_dir / member_name).resolve() - try: - target.relative_to(output_root) - except ValueError as exc: - msg = f"Path traversal detected: {member_name}" - raise ValueError(msg) from exc - - if info.is_dir(): - target.mkdir(parents=True, exist_ok=True) - continue - - uncompressed_total += info.file_size - if uncompressed_total > max_uncompressed_bytes: - msg = ( - f"ZIP exceeds max extracted size " - f"({uncompressed_total} > {max_uncompressed_bytes})" - ) - raise ValueError(msg) - - target.parent.mkdir(parents=True, exist_ok=True) - with archive.open(info, "r") as source, target.open("wb") as destination: - shutil.copyfileobj(source, destination) - extracted.append(target) - - return extracted diff --git a/etl/scripts/download_caged.py b/etl/scripts/download_caged.py index 329943a..3d62382 100644 --- a/etl/scripts/download_caged.py +++ b/etl/scripts/download_caged.py @@ -5,9 +5,9 @@ resumability and memory management on large datasets. 
Usage: - python etl/scripts/download_caged.py --billing-project bracc-corruptos - python etl/scripts/download_caged.py --billing-project bracc-corruptos --start-year 2024 - python etl/scripts/download_caged.py --billing-project bracc-corruptos --skip-existing + python etl/scripts/download_caged.py --billing-project icarus-corruptos + python etl/scripts/download_caged.py --billing-project icarus-corruptos --start-year 2024 + python etl/scripts/download_caged.py --billing-project icarus-corruptos --skip-existing """ from __future__ import annotations diff --git a/etl/scripts/download_camara_inquiries.py b/etl/scripts/download_camara_inquiries.py index 02813c8..691c244 100644 --- a/etl/scripts/download_camara_inquiries.py +++ b/etl/scripts/download_camara_inquiries.py @@ -413,7 +413,7 @@ def _write_manifest( ) @click.option( "--billing-project", - default="bracc-corruptos", + default="icarus-corruptos", help="GCP billing project for BQ mode.", ) @click.option( diff --git a/etl/scripts/download_cnpj.py b/etl/scripts/download_cnpj.py index aff1d58..62dabd2 100644 --- a/etl/scripts/download_cnpj.py +++ b/etl/scripts/download_cnpj.py @@ -6,15 +6,21 @@ python etl/scripts/download_cnpj.py --reference-only # reference tables only (tiny) python etl/scripts/download_cnpj.py --files 1 # just first file of each type python etl/scripts/download_cnpj.py --types Empresas # specific type only + python etl/scripts/download_cnpj.py --release 2026-03 # pin to specific monthly release """ from __future__ import annotations +import hashlib +import json import logging +import os import sys +from datetime import datetime, timezone from pathlib import Path import click +import httpx sys.path.insert(0, str(Path(__file__).parent)) from _download_utils import download_file, extract_zip, validate_csv @@ -22,7 +28,13 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger(__name__) -BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/" +# Receita Federal Nextcloud (primary since Jan 2026) +NEXTCLOUD_BASE = "https://arquivos.receitafederal.gov.br/s/{token}/download?path=%2F&files=" +KNOWN_TOKENS = ["gn672Ad4CF8N6TK", "YggdBLfdninEJX9"] + +# Legacy URLs (dadosabertos.rfb.gov.br decommissioned Jan 2026) +LEGACY_NEW_BASE_PATTERN = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/{year_month}/" +LEGACY_BASE_URL = "https://dadosabertos.rfb.gov.br/CNPJ/" MAIN_TYPES = ["Empresas", "Socios", "Estabelecimentos"] REFERENCE_FILES = [ @@ -48,6 +60,126 @@ } +def _previous_month(year: int, month: int) -> tuple[int, int]: + """Return (year, month) for the previous month.""" + if month == 1: + return year - 1, 12 + return year, month - 1 + + +def _check_url_accessible(url: str, timeout: int = 30) -> bool: + """Send HTTP HEAD to verify a URL is accessible (2xx).""" + try: + resp = httpx.head(url, follow_redirects=True, timeout=timeout) + return resp.status_code < 400 + except httpx.HTTPError: + return False + + +def _check_nextcloud_token(token: str, timeout: int = 30) -> bool: + """Verify a Nextcloud share token is valid via HEAD request.""" + share_url = f"https://arquivos.receitafederal.gov.br/s/{token}" + try: + resp = httpx.head(share_url, follow_redirects=True, timeout=timeout) + return resp.status_code < 400 + except httpx.HTTPError: + return False + + +def resolve_rf_release(year_month: str | None = None) -> str: + """Resolve the Receita Federal CNPJ release URL. + + Strategy: + 1. Try Nextcloud share (primary since Jan 2026): + a. 
Check CNPJ_SHARE_TOKEN env var first. + b. Then try each known token. + 2. Fall back to legacy dadosabertos.rfb.gov.br paths. + 3. Raise RuntimeError if nothing works (fail-closed). + + Returns the resolved base URL. For Nextcloud, files are fetched via + ``{base_url}{filename}``. + """ + now = datetime.now(timezone.utc) + + # --- Nextcloud (primary) --- + tokens_to_try: list[str] = [] + + env_token = os.environ.get("CNPJ_SHARE_TOKEN") + if env_token: + tokens_to_try.append(env_token) + + for t in KNOWN_TOKENS: + if t not in tokens_to_try: + tokens_to_try.append(t) + + for token in tokens_to_try: + logger.info("Probing Nextcloud token: %s...", token[:6]) + if _check_nextcloud_token(token): + base_url = NEXTCLOUD_BASE.format(token=token) + logger.info("Resolved CNPJ via Nextcloud (token %s...)", token[:6]) + return base_url + + # --- Legacy dadosabertos.rfb.gov.br --- + if year_month is not None: + candidates = [year_month] + else: + current = f"{now.year:04d}-{now.month:02d}" + prev_y, prev_m = _previous_month(now.year, now.month) + previous = f"{prev_y:04d}-{prev_m:02d}" + candidates = [current, previous] + + for ym in candidates: + url = LEGACY_NEW_BASE_PATTERN.format(year_month=ym) + logger.info("Probing legacy release URL: %s", url) + if _check_url_accessible(url): + logger.info("Resolved CNPJ release (legacy new path): %s", url) + return url + + logger.info("Trying legacy flat URL: %s", LEGACY_BASE_URL) + if _check_url_accessible(LEGACY_BASE_URL): + logger.info("Resolved CNPJ release (legacy flat): %s", LEGACY_BASE_URL) + return LEGACY_BASE_URL + + tried = ", ".join(candidates) + raise RuntimeError( + f"Could not resolve CNPJ release. Tried Nextcloud tokens, " + f"legacy months [{tried}], and legacy flat path. " + "Receita Federal portal may be down or the URL structure has changed." 
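+        # Fail-closed: no silent fallback to a stale mirror. Operators can
+        # set CNPJ_SHARE_TOKEN to prefer a specific Nextcloud share, or pass
+        # --release YYYY-MM to pin the legacy month probe; Nextcloud tokens
+        # are always tried before any legacy path.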
+ ) + + +def _write_manifest( + output_dir: Path, + base_url: str, + resolved_release: str, + file_results: list[dict], + started_at: str, +) -> Path: + """Write download manifest JSON after download completes.""" + finished_at = datetime.now(timezone.utc).isoformat() + + # Compute an aggregate checksum over all successful file names + sizes + hasher = hashlib.sha256() + for fr in sorted(file_results, key=lambda x: x["name"]): + hasher.update(f"{fr['name']}:{fr['size_bytes']}:{fr['status']}".encode()) + checksum = f"sha256:{hasher.hexdigest()}" + + manifest = { + "source": "receita_federal_cnpj", + "resolved_release": resolved_release, + "base_url": base_url, + "files": file_results, + "started_at": started_at, + "finished_at": finished_at, + "checksum": checksum, + } + + manifest_path = output_dir / "download_manifest.json" + manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8") + logger.info("Manifest written: %s", manifest_path) + return manifest_path + + @click.command() @click.option("--output-dir", default="./data/cnpj", help="Base output directory") @click.option("--files", type=int, default=10, help="Number of files per type (0-9)") @@ -56,6 +188,7 @@ @click.option("--skip-existing/--no-skip-existing", default=True, help="Skip already downloaded files") @click.option("--skip-extract", is_flag=True, help="Skip extraction after download") @click.option("--timeout", type=int, default=600, help="Download timeout in seconds") +@click.option("--release", default=None, help="Pin to specific monthly release (YYYY-MM format)") def main( output_dir: str, files: int, @@ -64,8 +197,20 @@ def main( skip_existing: bool, skip_extract: bool, timeout: int, + release: str | None, ) -> None: """Download and extract CNPJ data from Receita Federal.""" + started_at = datetime.now(timezone.utc).isoformat() + + base_url = resolve_rf_release(release) + # Extract the release identifier from the resolved URL + resolved_release = release or "legacy" + if "arquivos.receitafederal.gov.br" in base_url: + resolved_release = "nextcloud" + elif "/dados_abertos_cnpj/" in base_url: + # Extract YYYY-MM from URL + resolved_release = base_url.rstrip("/").rsplit("/", 1)[-1] + base = Path(output_dir) raw_dir = base / "raw" extract_dir = base / "extracted" @@ -73,14 +218,26 @@ def main( for d in [raw_dir, extract_dir, ref_dir]: d.mkdir(parents=True, exist_ok=True) + file_results: list[dict] = [] + # --- Reference tables (always download, they're tiny) --- logger.info("=== Reference tables ===") for filename in REFERENCE_FILES: dest = raw_dir / filename if skip_existing and dest.exists(): logger.info("Skipping (exists): %s", filename) + file_results.append({ + "name": filename, + "status": "skipped", + "size_bytes": dest.stat().st_size, + }) else: - download_file(f"{BASE_URL}{filename}", dest, timeout=timeout) + success = download_file(f"{base_url}{filename}", dest, timeout=timeout) + file_results.append({ + "name": filename, + "status": "ok" if success else "failed", + "size_bytes": dest.stat().st_size if dest.exists() else 0, + }) if not skip_extract and dest.exists(): extracted = extract_zip(dest, ref_dir) @@ -90,7 +247,8 @@ def main( validate_csv(f, expected_cols=expected) if reference_only: - logger.info("Reference-only mode — done.") + logger.info("Reference-only mode -- done.") + _write_manifest(base, base_url, resolved_release, file_results, started_at) return # --- Main data files --- @@ -102,10 +260,25 @@ def main( dest = raw_dir / filename if skip_existing and 
dest.exists(): logger.info("Skipping (exists): %s", filename) + file_results.append({ + "name": filename, + "status": "skipped", + "size_bytes": dest.stat().st_size, + }) else: - success = download_file(f"{BASE_URL}{filename}", dest, timeout=timeout) + success = download_file(f"{base_url}{filename}", dest, timeout=timeout) if not success: + file_results.append({ + "name": filename, + "status": "failed", + "size_bytes": 0, + }) continue + file_results.append({ + "name": filename, + "status": "ok", + "size_bytes": dest.stat().st_size if dest.exists() else 0, + }) if not skip_extract and dest.exists(): extracted = extract_zip(dest, extract_dir) @@ -120,6 +293,7 @@ def main( logger.info("=== Download complete ===") _print_summary(raw_dir, extract_dir, ref_dir) + _write_manifest(base, base_url, resolved_release, file_results, started_at) def _print_summary(raw_dir: Path, extract_dir: Path, ref_dir: Path) -> None: diff --git a/etl/scripts/download_cnpj_bq.py b/etl/scripts/download_cnpj_bq.py index 8357bb3..d685db5 100644 --- a/etl/scripts/download_cnpj_bq.py +++ b/etl/scripts/download_cnpj_bq.py @@ -10,8 +10,8 @@ - download_manifest.json Usage: - python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos - python etl/scripts/download_cnpj_bq.py --billing-project bracc-corruptos --tables socios + python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos + python etl/scripts/download_cnpj_bq.py --billing-project icarus-corruptos --tables socios """ from __future__ import annotations @@ -105,6 +105,44 @@ PAGE_SIZE = 100_000 +def _run_bigquery_precheck( + *, + billing_project: str, + source_project: str, + source_dataset: str, + snapshot_start: str | None, +) -> None: + """Run explicit auth/ACL prechecks before starting large table downloads.""" + from google.cloud import bigquery + + client = bigquery.Client(project=billing_project) + logger.info("Running BigQuery precheck: SELECT 1") + list(client.query("SELECT 1 AS ok").result()) + + socios_table = f"{source_project}.{source_dataset}.socios" + if snapshot_start: + precheck_sql = ( + f"SELECT COUNT(1) AS n FROM `{socios_table}` " + "WHERE data >= @snapshot_start" + ) + query_params = [ + bigquery.ScalarQueryParameter("snapshot_start", "DATE", snapshot_start), + ] + else: + precheck_sql = f"SELECT COUNT(1) AS n FROM `{socios_table}`" + query_params = [] + + logger.info("Running BigQuery precheck: %s", precheck_sql) + rows = list( + client.query( + precheck_sql, + job_config=bigquery.QueryJobConfig(query_parameters=query_params), + ).result(), + ) + check_value = rows[0].n if rows else 0 + logger.info("BigQuery precheck OK: socios_count=%s", check_value) + + def _sha256_file(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as f: @@ -292,6 +330,19 @@ def main( ) source_project, source_dataset = dataset.split(".", 1) + try: + _run_bigquery_precheck( + billing_project=billing_project, + source_project=source_project, + source_dataset=source_dataset, + snapshot_start=snapshot_start, + ) + except Exception as exc: + raise click.ClickException( + "BigQuery precheck failed. 
Configure a non-interactive service account " + "(GOOGLE_APPLICATION_CREDENTIALS) with dataset ACL and billing access.", + ) from exc + selected = list(tables) if tables else list(TABLES.keys()) run_id = f"cnpj-bq-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:8]}" logger.info( diff --git a/etl/scripts/download_dou.py b/etl/scripts/download_dou.py index 4d85096..4a0759f 100644 --- a/etl/scripts/download_dou.py +++ b/etl/scripts/download_dou.py @@ -22,7 +22,6 @@ import click import httpx -from _download_utils import safe_extract_zip logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s" @@ -90,15 +89,24 @@ def _download_zip( xml_count = 0 try: + resolved_dir = section_dir.resolve() with zipfile.ZipFile(BytesIO(resp.content)) as zf: - extracted = safe_extract_zip(zf, section_dir) - xml_count = sum(1 for path in extracted if path.suffix.lower() == ".xml") + for member in zf.namelist(): + # Path traversal guard + target = (section_dir / member).resolve() + if not target.is_relative_to(resolved_dir): + logger.warning( + "Path traversal detected in %s: %s — skipping", + zip_name, + member, + ) + continue + if member.lower().endswith(".xml"): + zf.extract(member, section_dir) + xml_count += 1 except zipfile.BadZipFile: logger.warning("Bad ZIP file: %s", zip_name) return 0 - except ValueError as exc: - logger.warning("Unsafe ZIP file %s: %s", zip_name, exc) - return 0 if xml_count > 0: marker.write_text(str(xml_count)) diff --git a/etl/scripts/download_mides.py b/etl/scripts/download_mides.py index 8a51a4e..0617568 100644 --- a/etl/scripts/download_mides.py +++ b/etl/scripts/download_mides.py @@ -71,7 +71,7 @@ def _write_manifest(out_dir: Path, tables: list[dict[str, Any]]) -> Path: @click.command() -@click.option("--billing-project", default="bracc-corruptos", help="GCP billing project") +@click.option("--billing-project", default="icarus-corruptos", help="GCP billing project") @click.option( "--dataset", default=WORLD_WB_DATASET, diff --git a/etl/scripts/download_pncp.py b/etl/scripts/download_pncp.py index 4ac59c5..061536e 100644 --- a/etl/scripts/download_pncp.py +++ b/etl/scripts/download_pncp.py @@ -439,7 +439,7 @@ def main( client = httpx.Client( timeout=timeout, follow_redirects=True, - headers={"User-Agent": "BRACC-ETL/1.0 (public data research)"}, + headers={"User-Agent": "BR-ACC-ETL/1.0 (public data research)"}, ) total_records = 0 diff --git a/etl/scripts/download_renuncias.py b/etl/scripts/download_renuncias.py index fae7bc2..2fbef69 100644 --- a/etl/scripts/download_renuncias.py +++ b/etl/scripts/download_renuncias.py @@ -8,10 +8,12 @@ import argparse import logging -import zipfile +import sys from pathlib import Path import httpx + +sys.path.insert(0, str(Path(__file__).parent)) from _download_utils import safe_extract_zip logger = logging.getLogger(__name__) @@ -34,15 +36,14 @@ def download_year(output_dir: Path, year: int) -> None: url, follow_redirects=True, timeout=300, - headers={"User-Agent": "BRACC-ETL/1.0"}, + headers={"User-Agent": "BR-ACC-ETL/1.0"}, ) response.raise_for_status() dest_zip.write_bytes(response.content) logger.info("Downloaded: %s (%d bytes)", dest_zip.name, len(response.content)) - with zipfile.ZipFile(dest_zip, "r") as zf: - extracted = safe_extract_zip(zf, output_dir) - logger.info("Extracted %d files", len(extracted)) + extracted = safe_extract_zip(dest_zip, output_dir) + logger.info("Extracted %d files", len(extracted)) except httpx.HTTPError: logger.warning("Failed to download renuncias for %d", year) 
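# A minimal usage sketch of how the updated download helpers compose,
# assuming the repo root as the working directory; the archive name
# "Empresas0.zip" and the output paths are illustrative, not taken
# from the patch above.
import sys
from pathlib import Path

sys.path.insert(0, "etl/scripts")
from _download_utils import download_file, safe_extract_zip
from download_cnpj import resolve_rf_release

# Probes Nextcloud shares first, then legacy monthly/flat paths, and
# raises RuntimeError if nothing resolves (fail-closed).
base_url = resolve_rf_release()
dest = Path("./data/cnpj/raw/Empresas0.zip")  # hypothetical file name
if download_file(f"{base_url}Empresas0.zip", dest, timeout=600):
    # download_file resumes partial downloads via Range (restarting when the
    # server ignores the header); safe_extract_zip blocks path traversal,
    # caps total uncompressed size, and deletes corrupt archives so the
    # next run re-downloads them.
    extracted = safe_extract_zip(dest, Path("./data/cnpj/extracted"))
    print(f"Extracted {len(extracted)} entries")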
diff --git a/etl/scripts/download_senado_cpis.py b/etl/scripts/download_senado_cpis.py index d16832a..2a4570f 100644 --- a/etl/scripts/download_senado_cpis.py +++ b/etl/scripts/download_senado_cpis.py @@ -16,13 +16,13 @@ import json import logging import re +import defusedxml.ElementTree as ET from datetime import UTC, datetime from pathlib import Path from typing import Any import click import httpx -from defusedxml import ElementTree as ET from download_senado_cpi_archive import fetch_archive_historical logger = logging.getLogger(__name__) diff --git a/etl/scripts/download_siconfi.py b/etl/scripts/download_siconfi.py index c105c3b..6ce1687 100644 --- a/etl/scripts/download_siconfi.py +++ b/etl/scripts/download_siconfi.py @@ -44,7 +44,7 @@ def get_all_entities() -> list[dict]: url, params={"offset": offset, "limit": limit}, timeout=60, - headers={"User-Agent": "BRACC-ETL/1.0"}, + headers={"User-Agent": "BR-ACC-ETL/1.0"}, ) response.raise_for_status() data = response.json() @@ -125,7 +125,7 @@ def download_year( header_written = partial.exists() and partial.stat().st_size > 0 with ( - httpx.Client(headers={"User-Agent": "BRACC-ETL/1.0"}) as client, + httpx.Client(headers={"User-Agent": "BR-ACC-ETL/1.0"}) as client, open(partial, "a", newline="", encoding="utf-8") as f, ): writer: csv.DictWriter | None = None diff --git a/etl/scripts/download_stf.py b/etl/scripts/download_stf.py index ddb13f3..99e4048 100644 --- a/etl/scripts/download_stf.py +++ b/etl/scripts/download_stf.py @@ -5,9 +5,9 @@ Requires `google-cloud-bigquery` and an authenticated GCP project. Usage: - python etl/scripts/download_stf.py --billing-project bracc-corruptos - python etl/scripts/download_stf.py --billing-project bracc-corruptos --skip-existing - python etl/scripts/download_stf.py --billing-project bracc-corruptos --output-dir ./data/stf + python etl/scripts/download_stf.py --billing-project icarus-corruptos + python etl/scripts/download_stf.py --billing-project icarus-corruptos --skip-existing + python etl/scripts/download_stf.py --billing-project icarus-corruptos --output-dir ./data/stf """ from __future__ import annotations diff --git a/etl/scripts/download_tse_bens.py b/etl/scripts/download_tse_bens.py index 24cbf87..954e0c1 100644 --- a/etl/scripts/download_tse_bens.py +++ b/etl/scripts/download_tse_bens.py @@ -5,9 +5,9 @@ Requires `google-cloud-bigquery` and an authenticated GCP project. Usage: - python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos - python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --start-year 2018 - python etl/scripts/download_tse_bens.py --billing-project bracc-corruptos --skip-existing + python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos + python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --start-year 2018 + python etl/scripts/download_tse_bens.py --billing-project icarus-corruptos --skip-existing """ from __future__ import annotations diff --git a/etl/scripts/download_tse_filiados.py b/etl/scripts/download_tse_filiados.py index 7c26323..f8d2531 100644 --- a/etl/scripts/download_tse_filiados.py +++ b/etl/scripts/download_tse_filiados.py @@ -7,9 +7,9 @@ Requires `google-cloud-bigquery` and an authenticated GCP project. 
Usage: - python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos - python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --skip-existing - python etl/scripts/download_tse_filiados.py --billing-project bracc-corruptos --all-statuses + python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos + python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --skip-existing + python etl/scripts/download_tse_filiados.py --billing-project icarus-corruptos --all-statuses """ from __future__ import annotations diff --git a/etl/scripts/download_un_sanctions.py b/etl/scripts/download_un_sanctions.py index 9ec5af1..ddfe443 100644 --- a/etl/scripts/download_un_sanctions.py +++ b/etl/scripts/download_un_sanctions.py @@ -14,10 +14,10 @@ import json import logging import sys +import defusedxml.ElementTree as ET from pathlib import Path import click -from defusedxml import ElementTree as ET # Allow imports from scripts/ directory sys.path.insert(0, str(Path(__file__).parent)) diff --git a/etl/src/bracc_etl/base.py b/etl/src/bracc_etl/base.py index 2309a6a..effd05e 100644 --- a/etl/src/bracc_etl/base.py +++ b/etl/src/bracc_etl/base.py @@ -21,12 +21,16 @@ def __init__( limit: int | None = None, chunk_size: int = 50_000, neo4j_database: str | None = None, + history: bool = False, ) -> None: self.driver = driver self.data_dir = data_dir self.limit = limit self.chunk_size = chunk_size self.neo4j_database = neo4j_database or os.getenv("NEO4J_DATABASE", "neo4j") + self.history = history + self.rows_in: int = 0 + self.rows_loaded: int = 0 source_key = getattr(self, "source_id", getattr(self, "name", "unknown_source")) self.run_id = f"{source_key}_{datetime.now(tz=UTC).strftime('%Y%m%d%H%M%S')}" @@ -87,8 +91,8 @@ def _upsert_ingestion_run( " r.started_at = coalesce($started_at, r.started_at), " " r.finished_at = coalesce($finished_at, r.finished_at), " " r.error = coalesce($error, r.error), " - " r.rows_in = coalesce(r.rows_in, 0), " - " r.rows_loaded = coalesce(r.rows_loaded, 0)" + " r.rows_in = $rows_in, " + " r.rows_loaded = $rows_loaded" ) run_id = getattr(self, "run_id", f"{source_id}_manual") params = { @@ -98,6 +102,8 @@ def _upsert_ingestion_run( "started_at": started_at, "finished_at": finished_at, "error": error, + "rows_in": self.rows_in, + "rows_loaded": self.rows_loaded, } try: with self.driver.session(database=self.neo4j_database) as session: diff --git a/etl/src/bracc_etl/entity_resolution/config.py b/etl/src/bracc_etl/entity_resolution/config.py index 6ff0518..34f65a5 100644 --- a/etl/src/bracc_etl/entity_resolution/config.py +++ b/etl/src/bracc_etl/entity_resolution/config.py @@ -13,7 +13,7 @@ def get_person_settings() -> dict[str, Any]: """ try: import splink.comparison_library as cl # type: ignore[import-not-found] - from splink import SettingsCreator + from splink import SettingsCreator # type: ignore[import-not-found,unused-ignore] except ImportError as exc: raise ImportError( "splink is required for entity resolution. 
" diff --git a/etl/src/bracc_etl/pipelines/bcb.py b/etl/src/bracc_etl/pipelines/bcb.py index a52b790..bbb59d2 100644 --- a/etl/src/bracc_etl/pipelines/bcb.py +++ b/etl/src/bracc_etl/pipelines/bcb.py @@ -51,8 +51,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.penalties: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/bndes.py b/etl/src/bracc_etl/pipelines/bndes.py index ac9fbf4..49cb86a 100644 --- a/etl/src/bracc_etl/pipelines/bndes.py +++ b/etl/src/bracc_etl/pipelines/bndes.py @@ -33,8 +33,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.finances: list[dict[str, Any]] = [] self.relationships: list[dict[str, Any]] = [] @@ -51,8 +52,15 @@ def _parse_value(self, value: str) -> float: def extract(self) -> None: bndes_dir = Path(self.data_dir) / "bndes" + if not bndes_dir.exists(): + logger.warning("[%s] Data directory not found: %s", self.name, bndes_dir) + return + csv_path = bndes_dir / "operacoes-nao-automaticas.csv" + if not csv_path.exists(): + logger.warning("[%s] CSV file not found: %s", self.name, csv_path) + return self._raw = pd.read_csv( - bndes_dir / "operacoes-nao-automaticas.csv", + csv_path, dtype=str, delimiter=";", encoding="latin-1", diff --git a/etl/src/bracc_etl/pipelines/caged.py b/etl/src/bracc_etl/pipelines/caged.py index 38f46b1..f0d47eb 100644 --- a/etl/src/bracc_etl/pipelines/caged.py +++ b/etl/src/bracc_etl/pipelines/caged.py @@ -88,8 +88,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._csv_files: list[Path] = [] def extract(self) -> None: diff --git a/etl/src/bracc_etl/pipelines/camara.py b/etl/src/bracc_etl/pipelines/camara.py index 68d8c4e..4050a36 100644 --- a/etl/src/bracc_etl/pipelines/camara.py +++ b/etl/src/bracc_etl/pipelines/camara.py @@ -60,8 +60,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.expenses: list[dict[str, Any]] = [] self.deputies: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/camara_inquiries.py b/etl/src/bracc_etl/pipelines/camara_inquiries.py index 30e611f..8966910 100644 --- a/etl/src/bracc_etl/pipelines/camara_inquiries.py +++ b/etl/src/bracc_etl/pipelines/camara_inquiries.py @@ -66,8 +66,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_inquiries: pd.DataFrame = 
pd.DataFrame() self._raw_requirements: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/ceaf.py b/etl/src/bracc_etl/pipelines/ceaf.py index d83a4f2..1895ba0 100644 --- a/etl/src/bracc_etl/pipelines/ceaf.py +++ b/etl/src/bracc_etl/pipelines/ceaf.py @@ -31,8 +31,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.expulsions: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/cepim.py b/etl/src/bracc_etl/pipelines/cepim.py index bac6a1a..fc7a914 100644 --- a/etl/src/bracc_etl/pipelines/cepim.py +++ b/etl/src/bracc_etl/pipelines/cepim.py @@ -37,8 +37,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.ngos: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/cnpj.py b/etl/src/bracc_etl/pipelines/cnpj.py index 5eb713e..ce9d3e9 100644 --- a/etl/src/bracc_etl/pipelines/cnpj.py +++ b/etl/src/bracc_etl/pipelines/cnpj.py @@ -216,9 +216,11 @@ def __init__( limit: int | None = None, chunk_size: int = 50_000, history: bool = False, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) - self.history = history + super().__init__( + driver, data_dir, limit=limit, chunk_size=chunk_size, history=history, **kwargs, + ) self.run_id = f"cnpj-{datetime.now(UTC).strftime('%Y%m%d%H%M%S')}" self._raw_empresas: pd.DataFrame = pd.DataFrame() self._raw_socios: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/comprasnet.py b/etl/src/bracc_etl/pipelines/comprasnet.py index bd6386c..373b33b 100644 --- a/etl/src/bracc_etl/pipelines/comprasnet.py +++ b/etl/src/bracc_etl/pipelines/comprasnet.py @@ -63,7 +63,7 @@ class ComprasnetPipeline(Pipeline): """ETL pipeline for PNCP federal procurement contracts.""" name = "comprasnet" - source_id = "pncp" + source_id = "comprasnet" def __init__( self, @@ -71,8 +71,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self.contracts: list[dict[str, Any]] = [] def extract(self) -> None: diff --git a/etl/src/bracc_etl/pipelines/cpgf.py b/etl/src/bracc_etl/pipelines/cpgf.py index e757b71..122e341 100644 --- a/etl/src/bracc_etl/pipelines/cpgf.py +++ b/etl/src/bracc_etl/pipelines/cpgf.py @@ -84,8 +84,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.expenses: list[dict[str, Any]] = [] self.cardholders: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/cvm.py b/etl/src/bracc_etl/pipelines/cvm.py index 7593a95..8f45cc4 100644 --- 
a/etl/src/bracc_etl/pipelines/cvm.py +++ b/etl/src/bracc_etl/pipelines/cvm.py @@ -38,8 +38,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_processos: pd.DataFrame = pd.DataFrame() self._raw_acusados: pd.DataFrame = pd.DataFrame() self.proceedings: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/cvm_funds.py b/etl/src/bracc_etl/pipelines/cvm_funds.py index 4c205bd..dbb320d 100644 --- a/etl/src/bracc_etl/pipelines/cvm_funds.py +++ b/etl/src/bracc_etl/pipelines/cvm_funds.py @@ -43,8 +43,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.funds: list[dict[str, Any]] = [] self.admin_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/datajud.py b/etl/src/bracc_etl/pipelines/datajud.py index 3805a2f..3410525 100644 --- a/etl/src/bracc_etl/pipelines/datajud.py +++ b/etl/src/bracc_etl/pipelines/datajud.py @@ -50,8 +50,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_cases: pd.DataFrame = pd.DataFrame() self._raw_parties: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/datasus.py b/etl/src/bracc_etl/pipelines/datasus.py index 6f4f7a1..a4a9269 100644 --- a/etl/src/bracc_etl/pipelines/datasus.py +++ b/etl/src/bracc_etl/pipelines/datasus.py @@ -29,8 +29,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.facilities: list[dict[str, Any]] = [] self.company_links: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/dou.py b/etl/src/bracc_etl/pipelines/dou.py index 9e83cee..cded552 100644 --- a/etl/src/bracc_etl/pipelines/dou.py +++ b/etl/src/bracc_etl/pipelines/dou.py @@ -17,7 +17,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from defusedxml import ElementTree # type: ignore[import-untyped] +from defusedxml.ElementTree import ParseError as _XmlParseError # type: ignore[import-untyped] +from defusedxml.ElementTree import ( + parse as _safe_xml_parse, # type: ignore[import-untyped,unused-ignore] +) from bracc_etl.base import Pipeline from bracc_etl.loader import Neo4jBatchLoader @@ -141,8 +144,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_acts: list[dict[str, str]] = [] self.acts: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = [] @@ -227,8 +231,8 @@ def _extract_xml(self, xml_files: list[Path]) -> None: """Extract acts from 
Imprensa Nacional XML dumps.""" for f in xml_files: try: - tree = ElementTree.parse(f) # noqa: S314 - except ElementTree.ParseError: + tree = _safe_xml_parse(f) + except _XmlParseError: logger.warning("[dou] Failed to parse XML: %s", f.name) continue diff --git a/etl/src/bracc_etl/pipelines/eu_sanctions.py b/etl/src/bracc_etl/pipelines/eu_sanctions.py index 2bb60a7..5d3b11e 100644 --- a/etl/src/bracc_etl/pipelines/eu_sanctions.py +++ b/etl/src/bracc_etl/pipelines/eu_sanctions.py @@ -76,8 +76,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.sanctions: list[dict[str, Any]] = [] self.person_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/holdings.py b/etl/src/bracc_etl/pipelines/holdings.py index 801b776..e9afd90 100644 --- a/etl/src/bracc_etl/pipelines/holdings.py +++ b/etl/src/bracc_etl/pipelines/holdings.py @@ -36,8 +36,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.holding_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/ibama.py b/etl/src/bracc_etl/pipelines/ibama.py index be1f1c6..3256d34 100644 --- a/etl/src/bracc_etl/pipelines/ibama.py +++ b/etl/src/bracc_etl/pipelines/ibama.py @@ -40,8 +40,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.embargoes: list[dict[str, Any]] = [] self.companies: list[dict[str, Any]] = [] @@ -65,7 +66,13 @@ def _primary_biome(self, value: str) -> str: def extract(self) -> None: ibama_dir = Path(self.data_dir) / "ibama" + if not ibama_dir.exists(): + logger.warning("[%s] Data directory not found: %s", self.name, ibama_dir) + return csv_path = ibama_dir / "areas_embargadas.csv" + if not csv_path.exists(): + logger.warning("[%s] CSV file not found: %s", self.name, csv_path) + return logger.info("[ibama] Reading %s", csv_path) self._raw = pd.read_csv( csv_path, diff --git a/etl/src/bracc_etl/pipelines/icij.py b/etl/src/bracc_etl/pipelines/icij.py index b025f1e..e1fede6 100644 --- a/etl/src/bracc_etl/pipelines/icij.py +++ b/etl/src/bracc_etl/pipelines/icij.py @@ -42,8 +42,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._entities_raw: pd.DataFrame = pd.DataFrame() self._officers_raw: pd.DataFrame = pd.DataFrame() self._intermediaries_raw: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/inep.py b/etl/src/bracc_etl/pipelines/inep.py index fd0d1c4..0ff9d35 100644 --- a/etl/src/bracc_etl/pipelines/inep.py +++ b/etl/src/bracc_etl/pipelines/inep.py @@ -42,8 +42,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: 
int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self.schools: list[dict[str, Any]] = [] self.school_company_links: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/leniency.py b/etl/src/bracc_etl/pipelines/leniency.py index 6076664..6b4573d 100644 --- a/etl/src/bracc_etl/pipelines/leniency.py +++ b/etl/src/bracc_etl/pipelines/leniency.py @@ -31,8 +31,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.agreements: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/mides.py b/etl/src/bracc_etl/pipelines/mides.py index 62033c3..7a9f520 100644 --- a/etl/src/bracc_etl/pipelines/mides.py +++ b/etl/src/bracc_etl/pipelines/mides.py @@ -74,8 +74,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_bids: pd.DataFrame = pd.DataFrame() self._raw_contracts: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/ofac.py b/etl/src/bracc_etl/pipelines/ofac.py index 4b64a62..da7b3a2 100644 --- a/etl/src/bracc_etl/pipelines/ofac.py +++ b/etl/src/bracc_etl/pipelines/ofac.py @@ -63,8 +63,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.sanctions: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/opensanctions.py b/etl/src/bracc_etl/pipelines/opensanctions.py index fa76dff..4b51290 100644 --- a/etl/src/bracc_etl/pipelines/opensanctions.py +++ b/etl/src/bracc_etl/pipelines/opensanctions.py @@ -81,8 +81,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_entities: list[dict[str, Any]] = [] self.global_peps: list[dict[str, Any]] = [] self.pep_match_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/pep_cgu.py b/etl/src/bracc_etl/pipelines/pep_cgu.py index b50ffb3..141f665 100644 --- a/etl/src/bracc_etl/pipelines/pep_cgu.py +++ b/etl/src/bracc_etl/pipelines/pep_cgu.py @@ -84,8 +84,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.pep_records: list[dict[str, Any]] = [] self.person_links: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/pgfn.py b/etl/src/bracc_etl/pipelines/pgfn.py index 62f6eeb..2d0bf09 100644 --- 
a/etl/src/bracc_etl/pipelines/pgfn.py +++ b/etl/src/bracc_etl/pipelines/pgfn.py @@ -38,8 +38,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._csv_files: list[Path] = [] self.finances: list[dict[str, Any]] = [] self.relationships: list[dict[str, Any]] = [] @@ -56,10 +57,13 @@ def _parse_value(self, value: str) -> float: def extract(self) -> None: pgfn_dir = Path(self.data_dir) / "pgfn" + if not pgfn_dir.exists(): + logger.warning("[%s] Data directory not found: %s", self.name, pgfn_dir) + return self._csv_files = sorted(pgfn_dir.glob("arquivo_lai_SIDA_*_*.csv")) if not self._csv_files: - msg = f"No PGFN CSV files found in {pgfn_dir}" - raise FileNotFoundError(msg) + logger.warning("[%s] No PGFN CSV files found in %s", self.name, pgfn_dir) + return logger.info("[pgfn] Found %d CSV files to process", len(self._csv_files)) def transform(self) -> None: diff --git a/etl/src/bracc_etl/pipelines/pncp.py b/etl/src/bracc_etl/pipelines/pncp.py index 845c29c..37757f6 100644 --- a/etl/src/bracc_etl/pipelines/pncp.py +++ b/etl/src/bracc_etl/pipelines/pncp.py @@ -68,8 +68,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_records: list[dict[str, Any]] = [] self.bids: list[dict[str, Any]] = [] self.coverage_start: str = "" @@ -97,8 +98,8 @@ def _infer_coverage( self.coverage_end = str(manifest.get("coverage_end", "")).strip() self.coverage_complete = bool(manifest.get("coverage_complete", False)) return - except Exception: - logger.warning("Invalid PNCP coverage manifest: %s", manifest_path) + except Exception as exc: + logger.warning("Invalid PNCP coverage manifest %s: %s", manifest_path, exc) dates: list[str] = [] for rec in records: @@ -135,8 +136,12 @@ def extract(self) -> None: all_records: list[dict[str, Any]] = [] for f in json_files: - raw = f.read_text(encoding="utf-8") - payload = json.loads(raw, strict=False) + try: + raw = f.read_text(encoding="utf-8") + payload = json.loads(raw, strict=False) + except (json.JSONDecodeError, OSError) as exc: + logger.warning("Failed to parse JSON from %s: %s", f, exc) + continue # Handle both wrapped (API response) and flat (list) formats if isinstance(payload, dict) and "data" in payload: diff --git a/etl/src/bracc_etl/pipelines/querido_diario.py b/etl/src/bracc_etl/pipelines/querido_diario.py index 3bb3762..846e0f0 100644 --- a/etl/src/bracc_etl/pipelines/querido_diario.py +++ b/etl/src/bracc_etl/pipelines/querido_diario.py @@ -64,8 +64,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_acts: list[dict[str, str]] = [] self.acts: list[dict[str, Any]] = [] self.company_mentions: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/rais.py b/etl/src/bracc_etl/pipelines/rais.py index 92945ec..5d84a35 100644 --- a/etl/src/bracc_etl/pipelines/rais.py +++ b/etl/src/bracc_etl/pipelines/rais.py @@ -45,8 +45,9 @@ def __init__( data_dir: str = 
"./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self.labor_stats: list[dict[str, Any]] = [] def extract(self) -> None: diff --git a/etl/src/bracc_etl/pipelines/renuncias.py b/etl/src/bracc_etl/pipelines/renuncias.py index ef4b948..639810a 100644 --- a/etl/src/bracc_etl/pipelines/renuncias.py +++ b/etl/src/bracc_etl/pipelines/renuncias.py @@ -47,8 +47,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self.waivers: list[dict[str, Any]] = [] self.company_rels: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/sanctions.py b/etl/src/bracc_etl/pipelines/sanctions.py index c8b6d72..c1ac5e5 100644 --- a/etl/src/bracc_etl/pipelines/sanctions.py +++ b/etl/src/bracc_etl/pipelines/sanctions.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging from pathlib import Path from typing import TYPE_CHECKING, Any @@ -19,6 +20,8 @@ strip_document, ) +logger = logging.getLogger(__name__) + class SanctionsPipeline(Pipeline): """ETL pipeline for CEIS/CNEP sanctions data.""" @@ -32,8 +35,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_ceis: pd.DataFrame = pd.DataFrame() self._raw_cnep: pd.DataFrame = pd.DataFrame() self.sanctions: list[dict[str, Any]] = [] @@ -41,17 +45,19 @@ def __init__( def extract(self) -> None: sanctions_dir = Path(self.data_dir) / "sanctions" + if not sanctions_dir.exists(): + logger.warning("[%s] Data directory not found: %s", self.name, sanctions_dir) + return + ceis_path = sanctions_dir / "ceis.csv" + cnep_path = sanctions_dir / "cnep.csv" + if not ceis_path.exists() or not cnep_path.exists(): + logger.warning("[%s] Required CSV files not found in %s", self.name, sanctions_dir) + return self._raw_ceis = pd.read_csv( - sanctions_dir / "ceis.csv", - dtype=str, - encoding="latin-1", - keep_default_na=False, + ceis_path, dtype=str, encoding="latin-1", keep_default_na=False, ) self._raw_cnep = pd.read_csv( - sanctions_dir / "cnep.csv", - dtype=str, - encoding="latin-1", - keep_default_na=False, + cnep_path, dtype=str, encoding="latin-1", keep_default_na=False, ) def _process_rows( diff --git a/etl/src/bracc_etl/pipelines/senado.py b/etl/src/bracc_etl/pipelines/senado.py index fd16117..5c55181 100644 --- a/etl/src/bracc_etl/pipelines/senado.py +++ b/etl/src/bracc_etl/pipelines/senado.py @@ -64,8 +64,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw: pd.DataFrame = pd.DataFrame() self._senator_lookup: dict[str, dict[str, str]] = {} self.expenses: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/senado_cpis.py b/etl/src/bracc_etl/pipelines/senado_cpis.py index 680d3fb..9b1e953 100644 --- 
diff --git a/etl/src/bracc_etl/pipelines/senado_cpis.py b/etl/src/bracc_etl/pipelines/senado_cpis.py
index 680d3fb..9b1e953 100644
--- a/etl/src/bracc_etl/pipelines/senado_cpis.py
+++ b/etl/src/bracc_etl/pipelines/senado_cpis.py
@@ -108,8 +108,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw_inquiries: pd.DataFrame = pd.DataFrame()
         self._raw: pd.DataFrame = pd.DataFrame()
diff --git a/etl/src/bracc_etl/pipelines/siconfi.py b/etl/src/bracc_etl/pipelines/siconfi.py
index 472ce2f..128d8a8 100644
--- a/etl/src/bracc_etl/pipelines/siconfi.py
+++ b/etl/src/bracc_etl/pipelines/siconfi.py
@@ -39,8 +39,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: list[dict[str, Any]] = []
         self.finances: list[dict[str, Any]] = []
         self.municipality_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/siop.py b/etl/src/bracc_etl/pipelines/siop.py
index c33b0d0..cb9508b 100644
--- a/etl/src/bracc_etl/pipelines/siop.py
+++ b/etl/src/bracc_etl/pipelines/siop.py
@@ -67,8 +67,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.amendments: list[dict[str, Any]] = []
         self.authors: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/stf.py b/etl/src/bracc_etl/pipelines/stf.py
index 3969574..193ba8b 100644
--- a/etl/src/bracc_etl/pipelines/stf.py
+++ b/etl/src/bracc_etl/pipelines/stf.py
@@ -39,8 +39,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.cases: list[dict[str, Any]] = []
         self.rapporteur_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/tcu.py b/etl/src/bracc_etl/pipelines/tcu.py
index 1b34397..007db8e 100644
--- a/etl/src/bracc_etl/pipelines/tcu.py
+++ b/etl/src/bracc_etl/pipelines/tcu.py
@@ -42,8 +42,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw_inabilitados: pd.DataFrame = pd.DataFrame()
         self._raw_inidoneos: pd.DataFrame = pd.DataFrame()
         self._raw_irregulares: pd.DataFrame = pd.DataFrame()
pd.DataFrame = pd.DataFrame() self._raw_favorecidos: pd.DataFrame = pd.DataFrame() self._raw_convenios: pd.DataFrame = pd.DataFrame() diff --git a/etl/src/bracc_etl/pipelines/transparencia.py b/etl/src/bracc_etl/pipelines/transparencia.py index 0f9c4ee..5d67d4c 100644 --- a/etl/src/bracc_etl/pipelines/transparencia.py +++ b/etl/src/bracc_etl/pipelines/transparencia.py @@ -1,6 +1,7 @@ from __future__ import annotations import hashlib +import logging import re from pathlib import Path from typing import TYPE_CHECKING, Any @@ -21,6 +22,8 @@ strip_document, ) +logger = logging.getLogger(__name__) + # Classified contracts (Polícia Federal etc.) use this sentinel CNPJ. _SIGILOSO_CNPJ = "-11" @@ -78,8 +81,9 @@ def __init__( data_dir: str = "./data", limit: int | None = None, chunk_size: int = 50_000, + **kwargs: Any, ) -> None: - super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size) + super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs) self._raw_contratos: pd.DataFrame = pd.DataFrame() self._raw_servidores: pd.DataFrame = pd.DataFrame() self._raw_emendas: pd.DataFrame = pd.DataFrame() @@ -89,24 +93,30 @@ def __init__( def extract(self) -> None: src_dir = Path(self.data_dir) / "transparencia" - self._raw_contratos = pd.read_csv( - src_dir / "contratos.csv", - dtype=str, - keep_default_na=False, - encoding="utf-8", - ) - self._raw_servidores = pd.read_csv( - src_dir / "servidores.csv", - dtype=str, - keep_default_na=False, - encoding="utf-8", - ) - self._raw_emendas = pd.read_csv( - src_dir / "emendas.csv", - dtype=str, - keep_default_na=False, - encoding="utf-8", - ) + if not src_dir.exists(): + logger.warning("[%s] Data directory not found: %s", self.name, src_dir) + return + contratos_path = src_dir / "contratos.csv" + servidores_path = src_dir / "servidores.csv" + emendas_path = src_dir / "emendas.csv" + if not contratos_path.exists(): + logger.warning("[%s] contratos.csv not found in %s", self.name, src_dir) + else: + self._raw_contratos = pd.read_csv( + contratos_path, dtype=str, keep_default_na=False, encoding="utf-8", + ) + if not servidores_path.exists(): + logger.warning("[%s] servidores.csv not found in %s", self.name, src_dir) + else: + self._raw_servidores = pd.read_csv( + servidores_path, dtype=str, keep_default_na=False, encoding="utf-8", + ) + if not emendas_path.exists(): + logger.warning("[%s] emendas.csv not found in %s", self.name, src_dir) + else: + self._raw_emendas = pd.read_csv( + emendas_path, dtype=str, keep_default_na=False, encoding="utf-8", + ) def transform(self) -> None: contracts: list[dict[str, Any]] = [] diff --git a/etl/src/bracc_etl/pipelines/tse.py b/etl/src/bracc_etl/pipelines/tse.py index b59de31..c539052 100644 --- a/etl/src/bracc_etl/pipelines/tse.py +++ b/etl/src/bracc_etl/pipelines/tse.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging from pathlib import Path from typing import TYPE_CHECKING, Any @@ -18,6 +19,8 @@ strip_document, ) +logger = logging.getLogger(__name__) + # TSE 2024 masks ALL candidate CPFs as "-4". After strip_document → "4", # format_cpf → "4" — every candidate MERGEs into one ghost node. # We use SQ_CANDIDATO (unique sequential ID per candidate per election) instead. 
@@ -36,20 +39,33 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self.candidates: list[dict[str, Any]] = []
         self.donations: list[dict[str, Any]] = []
         self.elections: list[dict[str, Any]] = []
 
     def extract(self) -> None:
         tse_dir = Path(self.data_dir) / "tse"
+        if not tse_dir.exists():
+            logger.warning("[%s] Data directory not found: %s", self.name, tse_dir)
+            self._raw_candidatos = pd.DataFrame()
+            self._raw_doacoes = pd.DataFrame()
+            return
+        candidatos_path = tse_dir / "candidatos.csv"
+        doacoes_path = tse_dir / "doacoes.csv"
+        if not candidatos_path.exists() or not doacoes_path.exists():
+            logger.warning("[%s] Required CSV files not found in %s", self.name, tse_dir)
+            self._raw_candidatos = pd.DataFrame()
+            self._raw_doacoes = pd.DataFrame()
+            return
         self._raw_candidatos = pd.read_csv(
-            tse_dir / "candidatos.csv", encoding="latin-1", dtype=str,
+            candidatos_path, encoding="latin-1", dtype=str,
             nrows=self.limit,
         )
         self._raw_doacoes = pd.read_csv(
-            tse_dir / "doacoes.csv", encoding="latin-1", dtype=str,
+            doacoes_path, encoding="latin-1", dtype=str,
             nrows=self.limit,
         )
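The comment at the top of tse.py above is the reason candidates MERGE on SQ_CANDIDATO rather than CPF. A toy illustration of the ghost-node collapse (sample values hypothetical; the real matching happens in Cypher MERGE statements):

# Toy illustration of the ghost-node collapse described in tse.py's comment.
def merge_key(candidate: dict[str, str], use_sq: bool) -> str:
    # "-4" is the TSE 2024 CPF mask; stripping non-digits leaves "4" for everyone.
    masked_cpf = "".join(ch for ch in candidate["cpf"] if ch.isdigit())
    return candidate["sq_candidato"] if use_sq else masked_cpf


candidates = [
    {"sq_candidato": "250001", "cpf": "-4", "name": "A"},
    {"sq_candidato": "250002", "cpf": "-4", "name": "B"},
]

assert len({merge_key(c, use_sq=False) for c in candidates}) == 1  # one ghost node
assert len({merge_key(c, use_sq=True) for c in candidates}) == 2   # distinct nodes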
diff --git a/etl/src/bracc_etl/pipelines/tse_bens.py b/etl/src/bracc_etl/pipelines/tse_bens.py
index 6806759..ac1dee1 100644
--- a/etl/src/bracc_etl/pipelines/tse_bens.py
+++ b/etl/src/bracc_etl/pipelines/tse_bens.py
@@ -51,8 +51,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.assets: list[dict[str, Any]] = []
         self.person_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/tse_filiados.py b/etl/src/bracc_etl/pipelines/tse_filiados.py
index 12a18ef..dd154cc 100644
--- a/etl/src/bracc_etl/pipelines/tse_filiados.py
+++ b/etl/src/bracc_etl/pipelines/tse_filiados.py
@@ -50,8 +50,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.memberships: list[dict[str, Any]] = []
         self.person_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/un_sanctions.py b/etl/src/bracc_etl/pipelines/un_sanctions.py
index 7218a15..9b382a4 100644
--- a/etl/src/bracc_etl/pipelines/un_sanctions.py
+++ b/etl/src/bracc_etl/pipelines/un_sanctions.py
@@ -44,8 +44,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: list[dict[str, Any]] = []
         self.sanctions: list[dict[str, Any]] = []
         self.person_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/viagens.py b/etl/src/bracc_etl/pipelines/viagens.py
index d3e5abd..cdc7cfe 100644
--- a/etl/src/bracc_etl/pipelines/viagens.py
+++ b/etl/src/bracc_etl/pipelines/viagens.py
@@ -104,8 +104,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.travels: list[dict[str, Any]] = []
         self.person_rels: list[dict[str, Any]] = []
diff --git a/etl/src/bracc_etl/pipelines/world_bank.py b/etl/src/bracc_etl/pipelines/world_bank.py
index 0e94a47..746e50d 100644
--- a/etl/src/bracc_etl/pipelines/world_bank.py
+++ b/etl/src/bracc_etl/pipelines/world_bank.py
@@ -42,8 +42,9 @@ def __init__(
         data_dir: str = "./data",
         limit: int | None = None,
         chunk_size: int = 50_000,
+        **kwargs: Any,
     ) -> None:
-        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size)
+        super().__init__(driver, data_dir, limit=limit, chunk_size=chunk_size, **kwargs)
         self._raw: pd.DataFrame = pd.DataFrame()
         self.sanctions: list[dict[str, Any]] = []
+ """ + from datetime import UTC, datetime + + import httpx + + # --- Nextcloud (primary) --- + nextcloud_dl = "https://arquivos.receitafederal.gov.br/s/{token}/download?path=%2F&files=" + tokens: list[str] = [] + env_token = os.environ.get("CNPJ_SHARE_TOKEN") + if env_token: + tokens.append(env_token) + tokens.extend(["gn672Ad4CF8N6TK", "YggdBLfdninEJX9"]) + + for token in tokens: + share_url = f"https://arquivos.receitafederal.gov.br/s/{token}" + try: + resp = httpx.head(share_url, follow_redirects=True, timeout=30) + if resp.status_code < 400: + return nextcloud_dl.format(token=token) + except httpx.HTTPError: + pass + + # --- Legacy dadosabertos (fallback) --- + new_base = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/{ym}/" + legacy_url = "https://dadosabertos.rfb.gov.br/CNPJ/" + + now = datetime.now(UTC) + if year_month is not None: + candidates = [year_month] else: - pipeline.run() + current = f"{now.year:04d}-{now.month:02d}" + prev_m = now.month - 1 if now.month > 1 else 12 + prev_y = now.year if now.month > 1 else now.year - 1 + candidates = [current, f"{prev_y:04d}-{prev_m:02d}"] + + for ym in candidates: + url = new_base.format(ym=ym) + try: + resp = httpx.head(url, follow_redirects=True, timeout=30) + if resp.status_code < 400: + return url + except httpx.HTTPError: + pass - run_post_load_hooks( - driver=driver, - source=source, - neo4j_database=neo4j_database, - linking_tier=linking_tier, - ) + try: + resp = httpx.head(legacy_url, follow_redirects=True, timeout=30) + if resp.status_code < 400: + return legacy_url + except httpx.HTTPError: + pass - driver.close() + tried = ", ".join(candidates) + msg = f"Could not resolve CNPJ release. Tried Nextcloud tokens, months [{tried}], and legacy." + raise RuntimeError(msg) @cli.command() @click.option("--output-dir", default="./data/cnpj", help="Output directory") @click.option("--files", type=int, default=10, help="Number of files per type (0-9)") @click.option("--skip-existing/--no-skip-existing", default=True) -def download(output_dir: str, files: int, skip_existing: bool) -> None: +@click.option("--release", default=None, help="Pin to specific monthly release (YYYY-MM)") +def download(output_dir: str, files: int, skip_existing: bool, release: str | None) -> None: """Download CNPJ data from Receita Federal.""" - import shutil - import stat import zipfile from pathlib import Path @@ -194,56 +246,13 @@ def download(output_dir: str, files: int, skip_existing: bool) -> None: logger = logging.getLogger(__name__) - base_url = "https://dadosabertos.rfb.gov.br/CNPJ/" + base_url = _resolve_rf_release_inline(release) + logger.info("Using CNPJ release URL: %s", base_url) file_types = ["Empresas", "Socios", "Estabelecimentos"] out = Path(output_dir) out.mkdir(parents=True, exist_ok=True) - def _safe_extract_zip( - archive: zipfile.ZipFile, - output_root: Path, - *, - max_members: int = 50_000, - max_uncompressed_bytes: int = 5_000_000_000, - ) -> None: - base = output_root.resolve() - infos = archive.infolist() - if len(infos) > max_members: - raise click.ClickException( - f"Unsafe ZIP archive: too many entries ({len(infos)} > {max_members})", - ) - - uncompressed_total = 0 - for info in infos: - if not info.filename: - continue - member = info.filename.replace("\\", "/") - mode = info.external_attr >> 16 - if stat.S_ISLNK(mode): - raise click.ClickException(f"Unsafe ZIP member (symlink): {member}") - - target = (output_root / member).resolve() - try: - target.relative_to(base) - except ValueError as exc: - raise 
click.ClickException(f"Unsafe ZIP member path: {member}") from exc - - if info.is_dir(): - target.mkdir(parents=True, exist_ok=True) - continue - - uncompressed_total += info.file_size - if uncompressed_total > max_uncompressed_bytes: - raise click.ClickException( - "Unsafe ZIP archive: exceeds max extracted size " - f"({uncompressed_total} > {max_uncompressed_bytes})", - ) - - target.parent.mkdir(parents=True, exist_ok=True) - with archive.open(info, "r") as source, target.open("wb") as destination: - shutil.copyfileobj(source, destination) - for file_type in file_types: for i in range(min(files, 10)): filename = f"{file_type}{i}.zip" @@ -264,17 +273,84 @@ def _safe_extract_zip( logger.info("Extracting %s...", dest.name) with zipfile.ZipFile(dest, "r") as zf: - _safe_extract_zip(zf, out) + # Path traversal guard + out_resolved = out.resolve() + safe = True + for info in zf.infolist(): + target = (out / info.filename).resolve() + if not target.is_relative_to(out_resolved): + logger.warning( + "Path traversal in %s: %s — skipping archive", + dest.name, + info.filename, + ) + safe = False + break + if not safe: + continue + # Zip bomb guard (50 GB limit for CNPJ data) + total = sum(i.file_size for i in zf.infolist()) + if total > 50 * 1024**3: + logger.warning( + "Uncompressed size too large: %s (%.1f GB) — skipping", + dest.name, + total / 1e9, + ) + continue + zf.extractall(out) except httpx.HTTPError: logger.warning("Failed to download %s (may not exist)", filename) @cli.command() -def sources() -> None: +@click.option("--status", "show_status", is_flag=True, help="Show ingestion status from Neo4j") +@click.option("--neo4j-uri", default="bolt://localhost:7687", help="Neo4j URI") +@click.option("--neo4j-user", default="neo4j") +@click.option("--neo4j-password", default=None) +def sources(show_status: bool, neo4j_uri: str, neo4j_user: str, neo4j_password: str | None) -> None: """List available data sources.""" - click.echo("Available pipelines:") - for name in sorted(PIPELINES): - click.echo(f" {name}") + if not show_status: + click.echo("Available pipelines:") + for name in sorted(PIPELINES): + click.echo(f" {name}") + return + + if not neo4j_password: + neo4j_password = os.environ.get("NEO4J_PASSWORD", "") + if not neo4j_password: + raise click.ClickException( + "--neo4j-password or NEO4J_PASSWORD env var required for --status" + ) + + driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password)) + try: + with driver.session() as session: + result = session.run( + "MATCH (r:IngestionRun) " + "WITH r ORDER BY r.started_at DESC " + "WITH r.source_id AS sid, collect(r)[0] AS latest " + "RETURN latest ORDER BY sid" + ) + runs = {r["latest"]["source_id"]: dict(r["latest"]) for r in result} + + click.echo( + f"{'Source':<20} {'Status':<15} {'Rows In':>10} {'Loaded':>10} " + f"{'Started':<20} {'Finished':<20}" + ) + click.echo("-" * 100) + + for name in sorted(PIPELINES): + run = runs.get(name, {}) + click.echo( + f"{name:<20} " + f"{run.get('status', '-'):<15} " + f"{run.get('rows_in', 0):>10,} " + f"{run.get('rows_loaded', 0):>10,} " + f"{str(run.get('started_at', '-')):<20} " + f"{str(run.get('finished_at', '-')):<20}" + ) + finally: + driver.close() if __name__ == "__main__": diff --git a/etl/src/bracc_etl/schemas/__init__.py b/etl/src/bracc_etl/schemas/__init__.py new file mode 100644 index 0000000..5a7315b --- /dev/null +++ b/etl/src/bracc_etl/schemas/__init__.py @@ -0,0 +1,5 @@ +"""Pandera DataFrameSchema definitions for ETL data quality validation.""" + +from 
diff --git a/etl/src/bracc_etl/schemas/__init__.py b/etl/src/bracc_etl/schemas/__init__.py
new file mode 100644
index 0000000..5a7315b
--- /dev/null
+++ b/etl/src/bracc_etl/schemas/__init__.py
@@ -0,0 +1,5 @@
+"""Pandera DataFrameSchema definitions for ETL data quality validation."""
+
+from bracc_etl.schemas.validator import validate_dataframe, validate_dataframe_sampled
+
+__all__ = ["validate_dataframe", "validate_dataframe_sampled"]
diff --git a/etl/src/bracc_etl/schemas/cnpj.py b/etl/src/bracc_etl/schemas/cnpj.py
new file mode 100644
index 0000000..879e9c6
--- /dev/null
+++ b/etl/src/bracc_etl/schemas/cnpj.py
@@ -0,0 +1,129 @@
+"""Pandera schemas for CNPJ (Receita Federal Company Registry) pipeline.
+
+Validates the core DataFrames produced by CNPJPipeline.transform():
+- empresas: Company nodes (cnpj, razao_social, capital_social, uf, etc.)
+- socios (PF strong): Person nodes keyed by CPF
+- socios (PF partial): Partner nodes keyed by partner_id hash
+plus the SOCIO_DE relationship rows covering all partner variants.
+
+Column definitions derived from cnpj.py _transform_empresas_rf/simple
+and _transform_socios_rf/simple output dictionaries.
+"""
+
+import pandera.pandas as pa
+
+# ------------------------------------------------------------------
+# Empresas (Company nodes)
+# Output columns: cnpj, razao_social, natureza_juridica, cnae_principal,
+# capital_social, uf, municipio, porte_empresa
+# ------------------------------------------------------------------
+empresas_schema = pa.DataFrameSchema(
+    columns={
+        "cnpj": pa.Column(
+            str,
+            nullable=True,
+            coerce=True,
+            checks=[
+                # Formatted CNPJ: XX.XXX.XXX/XXXX-XX (18 chars) or raw digits
+                pa.Check.str_matches(
+                    r"^(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{8,14})$",
+                    error="CNPJ must be formatted (XX.XXX.XXX/XXXX-XX) or 8-14 digits",
+                ),
+            ],
+        ),
+        "razao_social": pa.Column(str, nullable=True, coerce=True),
+        "natureza_juridica": pa.Column(str, nullable=True, coerce=True),
+        "cnae_principal": pa.Column(str, nullable=True, coerce=True),
+        "capital_social": pa.Column(float, nullable=True, coerce=True, checks=[
+            pa.Check.ge(0, error="capital_social must be >= 0"),
+        ]),
+        "uf": pa.Column(str, nullable=True, coerce=True, checks=[
+            # Brazilian UF: 2 uppercase letters or empty
+            pa.Check.str_matches(
+                r"^([A-Z]{2})?$",
+                error="UF must be 2 uppercase letters or empty",
+            ),
+        ]),
+        "municipio": pa.Column(str, nullable=True, coerce=True),
+        "porte_empresa": pa.Column(str, nullable=True, coerce=True),
+    },
+    coerce=True,
+    strict=False,  # Allow extra columns
+)
+
+
+# ------------------------------------------------------------------
+# Socios PF (Person nodes with strong CPF identity)
+# Output columns: name, cpf, tipo_socio
+# ------------------------------------------------------------------
+socios_pf_schema = pa.DataFrameSchema(
+    columns={
+        "name": pa.Column(str, nullable=True, coerce=True),
+        "cpf": pa.Column(
+            str,
+            nullable=True,
+            coerce=True,
+            checks=[
+                # Formatted CPF: XXX.XXX.XXX-XX (14 chars) or raw 11 digits
+                pa.Check.str_matches(
+                    r"^(\d{3}\.\d{3}\.\d{3}-\d{2}|\d{11})$",
+                    error="CPF must be formatted (XXX.XXX.XXX-XX) or 11 digits",
+                ),
+            ],
+        ),
+        "tipo_socio": pa.Column(str, nullable=True, coerce=True),
+    },
+    coerce=True,
+    strict=False,
+)
+
+
+# ------------------------------------------------------------------
+# Socios PF partial (Partner nodes with masked/invalid docs)
+# Output columns: partner_id, name, doc_raw, doc_digits, doc_partial,
+# doc_type, tipo_socio, identity_quality, source
+# ------------------------------------------------------------------
+socios_partial_schema = pa.DataFrameSchema(
+    columns={
+        "partner_id": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_length(min_value=1, error="partner_id must not be empty"),
+        ]),
+        "name": pa.Column(str, nullable=True, coerce=True),
+        "doc_raw": pa.Column(str, nullable=True, coerce=True),
+        "doc_digits": pa.Column(str, nullable=True, coerce=True),
+        "doc_partial": pa.Column(str, nullable=True, coerce=True),
+        "doc_type": pa.Column(str, nullable=True, coerce=True),
+        "tipo_socio": pa.Column(str, nullable=True, coerce=True),
+        "identity_quality": pa.Column(str, nullable=True, coerce=True, checks=[
+            pa.Check.isin(
+                ["partial", "unknown", ""],
+                error="identity_quality must be partial, unknown, or empty",
+            ),
+        ]),
+        "source": pa.Column(str, nullable=True, coerce=True),
+    },
+    coerce=True,
+    strict=False,
+)
+
+
+# ------------------------------------------------------------------
+# SOCIO_DE relationships (all variants: PF, partial, PJ)
+# Output columns: source_key, target_key, tipo_socio, qualificacao,
+# data_entrada, snapshot_date
+# ------------------------------------------------------------------
+socio_relationship_schema = pa.DataFrameSchema(
+    columns={
+        "source_key": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_length(min_value=1, error="source_key must not be empty"),
+        ]),
+        "target_key": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_length(min_value=1, error="target_key must not be empty"),
+        ]),
+        "tipo_socio": pa.Column(str, nullable=True, coerce=True),
+        "qualificacao": pa.Column(str, nullable=True, coerce=True),
+        "data_entrada": pa.Column(str, nullable=True, coerce=True),
+        "snapshot_date": pa.Column(str, nullable=True, coerce=True),
+    },
+    coerce=True,
+    strict=False,
+)
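The document checks accept both punctuated and raw-digit forms. A quick sanity check of the CNPJ pattern used in empresas_schema (sample values hypothetical):

import re

# The same pattern used in empresas_schema's CNPJ check.
CNPJ_RE = re.compile(r"^(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{8,14})$")

assert CNPJ_RE.match("12.345.678/0001-95")   # formatted, 18 chars
assert CNPJ_RE.match("12345678000195")       # raw 14 digits
assert CNPJ_RE.match("12345678")             # 8-digit CNPJ root also passes
assert not CNPJ_RE.match("12.345.678/0001")  # truncated form is rejected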
+""" + +import pandera.pandas as pa + +# ------------------------------------------------------------------ +# Acts (DOUAct nodes) +# Output keys: act_id, title, act_type, date, section, agency, +# category, text_excerpt, url, source +# ------------------------------------------------------------------ +acts_schema = pa.DataFrameSchema( + columns={ + "act_id": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="act_id must not be empty"), + ]), + "title": pa.Column(str, nullable=True, coerce=True), + "act_type": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.isin( + ["nomeacao", "exoneracao", "contrato", "penalidade", "outro"], + error="act_type must be one of the classified types", + ), + ]), + "date": pa.Column(str, nullable=True, coerce=True), + "section": pa.Column(str, nullable=True, coerce=True), + "agency": pa.Column(str, nullable=True, coerce=True), + "category": pa.Column(str, nullable=True, coerce=True), + "text_excerpt": pa.Column(str, nullable=True, coerce=True, checks=[ + pa.Check.str_length(max_value=500, error="text_excerpt must be <= 500 chars"), + ]), + "url": pa.Column(str, nullable=True, coerce=True, checks=[ + pa.Check.str_matches( + r"^https?://", + error="url must start with http:// or https://", + ), + ]), + "source": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.isin(["imprensa_nacional"], error="source must be 'imprensa_nacional'"), + ]), + }, + coerce=True, + strict=False, +) + + +# ------------------------------------------------------------------ +# Person relationships (PUBLICOU: Person -> DOUAct) +# Output keys: source_key (CPF), target_key (act_id) +# ------------------------------------------------------------------ +person_rels_schema = pa.DataFrameSchema( + columns={ + "source_key": pa.Column( + str, + nullable=False, + coerce=True, + checks=[ + pa.Check.str_matches( + r"^(\d{3}\.\d{3}\.\d{3}-\d{2}|\d{11})$", + error="source_key must be a formatted CPF", + ), + ], + ), + "target_key": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="target_key must not be empty"), + ]), + }, + coerce=True, + strict=False, +) + + +# ------------------------------------------------------------------ +# Company relationships (MENCIONOU: Company -> DOUAct) +# Output keys: source_key (CNPJ), target_key (act_id) +# ------------------------------------------------------------------ +company_rels_schema = pa.DataFrameSchema( + columns={ + "source_key": pa.Column( + str, + nullable=False, + coerce=True, + checks=[ + pa.Check.str_matches( + r"^(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{14})$", + error="source_key must be a formatted CNPJ", + ), + ], + ), + "target_key": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="target_key must not be empty"), + ]), + }, + coerce=True, + strict=False, +) diff --git a/etl/src/bracc_etl/schemas/pgfn.py b/etl/src/bracc_etl/schemas/pgfn.py new file mode 100644 index 0000000..3a6bd69 --- /dev/null +++ b/etl/src/bracc_etl/schemas/pgfn.py @@ -0,0 +1,80 @@ +"""Pandera schemas for PGFN (Tax Debt / Divida Ativa) pipeline. + +Validates the two entity lists produced by PgfnPipeline.transform(): +- finances: Finance nodes (finance_id, type, inscription_number, value, etc.) +- relationships: DEVE relationships (source_key=CNPJ, target_key=finance_id) + +Column definitions derived from pgfn.py transform() output dictionaries. 
diff --git a/etl/src/bracc_etl/schemas/pgfn.py b/etl/src/bracc_etl/schemas/pgfn.py
new file mode 100644
index 0000000..3a6bd69
--- /dev/null
+++ b/etl/src/bracc_etl/schemas/pgfn.py
@@ -0,0 +1,80 @@
+"""Pandera schemas for PGFN (Tax Debt / Divida Ativa) pipeline.
+
+Validates the two entity lists produced by PgfnPipeline.transform():
+- finances: Finance nodes (finance_id, type, inscription_number, value, etc.)
+- relationships: DEVE relationships (source_key=CNPJ, target_key=finance_id)
+
+Column definitions derived from pgfn.py transform() output dictionaries.
+Only company (PJ) debtors with PRINCIPAL debtor type are loaded; person
+records are pre-filtered due to LGPD CPF masking by PGFN.
+"""
+
+import pandera.pandas as pa
+
+# ------------------------------------------------------------------
+# Finances (Finance nodes)
+# Output keys: finance_id, type, inscription_number, value, date,
+# situation, revenue_type, court_action, source
+# ------------------------------------------------------------------
+finances_schema = pa.DataFrameSchema(
+    columns={
+        "finance_id": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_matches(
+                r"^pgfn_\S+$",
+                error="finance_id must start with 'pgfn_' followed by inscription number",
+            ),
+        ]),
+        "type": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.isin(["divida_ativa"], error="type must be 'divida_ativa'"),
+        ]),
+        "inscription_number": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_length(min_value=1, error="inscription_number must not be empty"),
+        ]),
+        "value": pa.Column(float, nullable=True, coerce=True, checks=[
+            pa.Check.ge(0, error="value must be >= 0"),
+        ]),
+        "date": pa.Column(str, nullable=True, coerce=True),
+        "situation": pa.Column(str, nullable=True, coerce=True),
+        "revenue_type": pa.Column(str, nullable=True, coerce=True),
+        "court_action": pa.Column(str, nullable=True, coerce=True),
+        "source": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.isin(["pgfn"], error="source must be 'pgfn'"),
+        ]),
+    },
+    coerce=True,
+    strict=False,
+)
+
+
+# ------------------------------------------------------------------
+# DEVE relationships (Company -> Finance)
+# Output keys: source_key, target_key, value, date, company_name
+# ------------------------------------------------------------------
+deve_relationship_schema = pa.DataFrameSchema(
+    columns={
+        "source_key": pa.Column(
+            str,
+            nullable=False,
+            coerce=True,
+            checks=[
+                pa.Check.str_matches(
+                    r"^(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{14})$",
+                    error="source_key must be a formatted CNPJ",
+                ),
+            ],
+        ),
+        "target_key": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_matches(
+                r"^pgfn_\S+$",
+                error="target_key must be a pgfn_ finance_id",
+            ),
+        ]),
+        "value": pa.Column(float, nullable=True, coerce=True, checks=[
+            pa.Check.ge(0, error="value must be >= 0"),
+        ]),
+        "date": pa.Column(str, nullable=True, coerce=True),
+        "company_name": pa.Column(str, nullable=True, coerce=True),
+    },
+    coerce=True,
+    strict=False,
+)
+""" + +import pandera.pandas as pa + +# ------------------------------------------------------------------ +# Contracts (Contract nodes + Company VENCEU relationship) +# Output keys: contract_id, object, value, contracting_org, date, cnpj, razao_social +# ------------------------------------------------------------------ +contracts_schema = pa.DataFrameSchema( + columns={ + "contract_id": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="contract_id must not be empty"), + ]), + "object": pa.Column(str, nullable=True, coerce=True), + "value": pa.Column(float, nullable=True, coerce=True, checks=[ + pa.Check.ge(0, error="value must be >= 0"), + ]), + "contracting_org": pa.Column(str, nullable=True, coerce=True), + "date": pa.Column(str, nullable=True, coerce=True), + "cnpj": pa.Column( + str, + nullable=True, + coerce=True, + checks=[ + pa.Check.str_matches( + r"^(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{14})$", + error="CNPJ must be formatted (XX.XXX.XXX/XXXX-XX) or 14 digits", + ), + ], + ), + "razao_social": pa.Column(str, nullable=True, coerce=True), + }, + coerce=True, + strict=False, +) + + +# ------------------------------------------------------------------ +# Offices (PublicOffice nodes + Person RECEBEU_SALARIO relationship) +# Output keys: office_id, servidor_id, cpf_partial, name, org, salary +# ------------------------------------------------------------------ +offices_schema = pa.DataFrameSchema( + columns={ + "office_id": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="office_id must not be empty"), + ]), + "servidor_id": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="servidor_id must not be empty"), + ]), + # cpf_partial: 6 middle digits from LGPD-masked CPF, or None + "cpf_partial": pa.Column(str, nullable=True, coerce=True), + "name": pa.Column(str, nullable=True, coerce=True), + "org": pa.Column(str, nullable=True, coerce=True), + "salary": pa.Column(float, nullable=True, coerce=True, checks=[ + pa.Check.ge(0, error="salary must be >= 0"), + ]), + }, + coerce=True, + strict=False, +) + + +# ------------------------------------------------------------------ +# Amendments (Amendment nodes + Person AUTOR_EMENDA relationship) +# Output keys: amendment_id, author_key, name, object, value +# ------------------------------------------------------------------ +amendments_schema = pa.DataFrameSchema( + columns={ + "amendment_id": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="amendment_id must not be empty"), + ]), + "author_key": pa.Column(str, nullable=False, coerce=True, checks=[ + pa.Check.str_length(min_value=1, error="author_key must not be empty"), + ]), + "name": pa.Column(str, nullable=True, coerce=True), + "object": pa.Column(str, nullable=True, coerce=True), + "value": pa.Column(float, nullable=True, coerce=True, checks=[ + pa.Check.ge(0, error="value must be >= 0"), + ]), + }, + coerce=True, + strict=False, +) diff --git a/etl/src/bracc_etl/schemas/tse.py b/etl/src/bracc_etl/schemas/tse.py new file mode 100644 index 0000000..4103cbe --- /dev/null +++ b/etl/src/bracc_etl/schemas/tse.py @@ -0,0 +1,104 @@ +"""Pandera schemas for TSE (Electoral Donations) pipeline. 
diff --git a/etl/src/bracc_etl/schemas/tse.py b/etl/src/bracc_etl/schemas/tse.py
new file mode 100644
index 0000000..4103cbe
--- /dev/null
+++ b/etl/src/bracc_etl/schemas/tse.py
@@ -0,0 +1,104 @@
+"""Pandera schemas for TSE (Electoral Donations) pipeline.
+
+Validates the three entity lists produced by TSEPipeline.transform():
+- candidates: Person nodes (sq_candidato, name, cpf, partido, uf)
+- elections: Election nodes (year, cargo, uf, municipio, candidate_sq)
+- donations: DOOU relationships (candidate_sq, donor_doc, valor, year, etc.)
+
+Column definitions derived from tse.py _transform_candidates and
+_transform_donations output dictionaries.
+
+Note: TSE 2024 masks ALL candidate CPFs as "-4". After stripping,
+candidates without real CPFs omit the 'cpf' key entirely. The cpf
+column is therefore nullable.
+"""
+
+import pandera.pandas as pa
+
+# ------------------------------------------------------------------
+# Candidates (Person nodes)
+# Output keys: sq_candidato, name, partido, uf, cpf (optional)
+# ------------------------------------------------------------------
+candidates_schema = pa.DataFrameSchema(
+    columns={
+        "sq_candidato": pa.Column(str, nullable=False, coerce=True, checks=[
+            pa.Check.str_length(min_value=1, error="sq_candidato must not be empty"),
+        ]),
+        "name": pa.Column(str, nullable=True, coerce=True),
+        "partido": pa.Column(str, nullable=True, coerce=True),
+        "uf": pa.Column(str, nullable=True, coerce=True, checks=[
+            pa.Check.str_matches(
+                r"^[A-Z]{2}$",
+                error="UF must be 2 uppercase letters",
+            ),
+        ]),
+        # cpf is optional — absent for masked candidates (TSE sentinel "-4")
+        "cpf": pa.Column(
+            str,
+            nullable=True,
+            coerce=True,
+            required=False,
+            checks=[
+                pa.Check.str_matches(
+                    r"^(\d{3}\.\d{3}\.\d{3}-\d{2}|\d{11})$",
+                    error="CPF must be formatted or 11 digits",
+                ),
+            ],
+        ),
+    },
+    coerce=True,
+    strict=False,
+)
+
+
+# ------------------------------------------------------------------
+# Elections (Election nodes)
+# Output keys: year, cargo, uf, municipio, candidate_sq
+# ------------------------------------------------------------------
+elections_schema = pa.DataFrameSchema(
+    columns={
+        "year": pa.Column(int, nullable=False, coerce=True, checks=[
+            pa.Check.in_range(1945, 2030, error="year must be between 1945 and 2030"),
+        ]),
+        "cargo": pa.Column(str, nullable=True, coerce=True),
+        "uf": pa.Column(str, nullable=True, coerce=True),
+        "municipio": pa.Column(str, nullable=True, coerce=True),
+        "candidate_sq": pa.Column(str, nullable=False, coerce=True),
+    },
+    coerce=True,
+    strict=False,
+)
+
+
+# ------------------------------------------------------------------
+# Donations (DOOU relationships)
+# Output keys: candidate_sq, donor_doc, donor_name, donor_is_company,
+# valor, year
+# ------------------------------------------------------------------
+donations_schema = pa.DataFrameSchema(
+    columns={
+        "candidate_sq": pa.Column(str, nullable=False, coerce=True),
+        "donor_doc": pa.Column(
+            str,
+            nullable=True,
+            coerce=True,
+            checks=[
+                # Formatted CPF or CNPJ (11 or 14 digits, with or without punctuation)
+                pa.Check.str_matches(
+                    r"^(\d{3}\.\d{3}\.\d{3}-\d{2}|\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{11}|\d{14})$",
+                    error="donor_doc must be formatted CPF or CNPJ",
+                ),
+            ],
+        ),
+        "donor_name": pa.Column(str, nullable=True, coerce=True),
+        "donor_is_company": pa.Column(bool, nullable=False, coerce=True),
+        "valor": pa.Column(float, nullable=False, coerce=True, checks=[
+            pa.Check.ge(0, error="valor must be >= 0"),
+        ]),
+        "year": pa.Column(int, nullable=False, coerce=True, checks=[
+            pa.Check.in_range(1945, 2030, error="year must be between 1945 and 2030"),
+        ]),
+    },
+    coerce=True,
+    strict=False,
+)
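The schema modules above validate with pandera. A small smoke test of candidates_schema under lazy validation (sample rows hypothetical; lazy=True collects every violation instead of stopping at the first):

import pandas as pd
import pandera as pa

from bracc_etl.schemas.tse import candidates_schema  # module added above

df = pd.DataFrame({
    "sq_candidato": ["250001", "250002"],
    "name": ["FULANO DE TAL", "BELTRANA DA SILVA"],  # hypothetical rows
    "partido": ["XYZ", "ABC"],
    "uf": ["SP", "rj"],  # lowercase UF violates ^[A-Z]{2}$
})

try:
    candidates_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases[["column", "failure_case"]])  # one row: uf / "rj"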
diff --git a/etl/src/bracc_etl/schemas/validator.py b/etl/src/bracc_etl/schemas/validator.py
new file mode 100644
index 0000000..7f70ede
--- /dev/null
+++ b/etl/src/bracc_etl/schemas/validator.py
@@ -0,0 +1,80 @@
+"""Schema validation utility with configurable strictness."""
+
+import logging
+import os
+from typing import Any, cast
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+def _get_validation_mode() -> str:
+    """Get validation mode from env: 'warn' (default), 'strict', or 'off'."""
+    return os.environ.get("BRACC_SCHEMA_VALIDATION", "warn").lower()
+
+
+def validate_dataframe(
+    df: pd.DataFrame,
+    schema: Any,  # pa.DataFrameSchema
+    source_name: str,
+) -> pd.DataFrame:
+    """Validate a DataFrame against a Pandera schema.
+
+    Behavior controlled by BRACC_SCHEMA_VALIDATION env var:
+    - 'off': skip validation entirely
+    - 'warn': validate, log warnings, return original df
+    - 'strict': validate, raise on failure
+    """
+    mode = _get_validation_mode()
+    if mode == "off":
+        return df
+
+    try:
+        import pandera as pa
+
+        validated = schema.validate(df, lazy=True)
+        logger.info("[%s] Schema validation passed: %d rows OK", source_name, len(df))
+        return cast("pd.DataFrame", validated)
+    except pa.errors.SchemaErrors as exc:
+        n_failures = len(exc.failure_cases)
+        logger.warning(
+            "[%s] Schema validation: %d failures in %d rows",
+            source_name,
+            n_failures,
+            len(df),
+        )
+        for _, row in exc.failure_cases.head(10).iterrows():
+            logger.warning(
+                "  %s: column=%s check=%s",
+                source_name,
+                row.get("column"),
+                row.get("check"),
+            )
+
+        if mode == "strict":
+            raise
+        return df  # warn mode: return original
+    except ImportError:
+        logger.warning("[%s] pandera not installed, skipping validation", source_name)
+        return df
+
+
+def validate_dataframe_sampled(
+    df: pd.DataFrame,
+    schema: Any,
+    source_name: str,
+    sample_size: int = 10_000,
+) -> pd.DataFrame:
+    """Validate a random sample of a large DataFrame (e.g., CNPJ).
+
+    For DataFrames larger than sample_size, validates only a random sample
+    to keep validation fast on multi-million-row datasets. Always returns
+    the full original DataFrame.
+    """
+    if len(df) <= sample_size:
+        return validate_dataframe(df, schema, source_name)
+
+    sample = df.sample(n=sample_size, random_state=42)
+    validate_dataframe(sample, schema, f"{source_name}[sample={sample_size}]")
+    return df  # Always return full df
diff --git a/etl/src/bracc_etl/transforms/date_formatting.py b/etl/src/bracc_etl/transforms/date_formatting.py
index 3afca17..0776ec0 100644
--- a/etl/src/bracc_etl/transforms/date_formatting.py
+++ b/etl/src/bracc_etl/transforms/date_formatting.py
@@ -1,10 +1,16 @@
+import logging
+
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 
 def parse_date(value: str) -> str:
     """Parse a date string to ISO format (YYYY-MM-DD) or empty string.
 
     Handles: DD/MM/YYYY, DD/MM/YYYY HH:MM:SS, YYYY-MM-DD, YYYYMMDD.
+    Returns empty string when all format attempts fail (prevents garbage
+    dates from reaching Neo4j).
     """
     value = value.strip()
     if not value:
@@ -14,4 +20,5 @@ def parse_date(value: str) -> str:
             return str(pd.to_datetime(value, format=fmt).strftime("%Y-%m-%d"))
         except ValueError:
             continue
-    return value
+    logger.debug("Could not parse date: %r", value)
+    return ""
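The parse_date change means unparseable input now maps to an empty string rather than passing through unchanged. A short check of the contract, assuming the import path shown in this diff:

from bracc_etl.transforms.date_formatting import parse_date

assert parse_date("31/12/2024") == "2024-12-31"
assert parse_date("31/12/2024 14:30:00") == "2024-12-31"
assert parse_date("20241231") == "2024-12-31"
assert parse_date("invalid") == ""  # previously returned "invalid" unchanged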
diff --git a/etl/tests/integration/test_link_persons.py b/etl/tests/integration/test_link_persons.py
deleted file mode 100644
index 1c90101..0000000
--- a/etl/tests/integration/test_link_persons.py
+++ /dev/null
@@ -1,307 +0,0 @@
-"""Integration tests for link_persons.cypher.
-
-Runs against a real Neo4j testcontainer to verify SAME_AS relationships
-are created with correct confidence, method, and uniqueness guards.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-import pytest
-
-if TYPE_CHECKING:
-    from neo4j import Driver
-
-LINK_SCRIPT = (
-    Path(__file__).parent.parent.parent.parent
-    / "scripts"
-    / "link_persons.cypher"
-)
-
-
-def _parse_phases() -> dict[int, str]:
-    """Parse link_persons.cypher into phase number → Cypher blocks."""
-    text = LINK_SCRIPT.read_text()
-    blocks: dict[int, str] = {}
-    current_phase: int | None = None
-    lines: list[str] = []
-    for line in text.splitlines():
-        if line.startswith("// ── Phase "):
-            if current_phase is not None:
-                blocks[current_phase] = "\n".join(lines)
-            phase_str = line.split("Phase ")[1].split(":")[0]
-            current_phase = int(phase_str)
-            lines = []
-        else:
-            lines.append(line)
-    if current_phase is not None:
-        blocks[current_phase] = "\n".join(lines)
-    return blocks
-
-
-PHASE_BLOCKS = _parse_phases()
-
-
-def _strip_comments(cypher: str) -> str:
-    """Remove // comment lines from a Cypher block."""
-    return "\n".join(
-        line for line in cypher.splitlines()
-        if not line.strip().startswith("//")
-    )
-
-
-def _run_cypher(driver: Driver, cypher: str) -> None:
-    """Run one or more semicolon-separated Cypher statements."""
-    with driver.session() as session:
-        for stmt in cypher.split(";"):
-            stmt = _strip_comments(stmt).strip()
-            if stmt:
-                session.run(stmt).consume()
-
-
-def _run_phases(driver: Driver, phases: list[int]) -> None:
-    """Run specific phases from link_persons.cypher."""
-    for phase in phases:
-        _run_cypher(driver, PHASE_BLOCKS[phase])
-
-
-def _clear_db(driver: Driver) -> None:
-    _run_cypher(driver, "MATCH (n) DETACH DELETE n")
-
-
-def _setup(driver: Driver, *statements: str) -> None:
-    """Run setup Cypher statements, each in its own auto-commit tx."""
-    for stmt in statements:
-        _run_cypher(driver, stmt)
-
-
-def _count_same_as(
-    driver: Driver, method: str | None = None,
-) -> int:
-    """Count SAME_AS relationships, optionally filtered by method."""
-    q = "MATCH ()-[r:SAME_AS]->() "
-    if method:
-        q += f"WHERE r.method = '{method}' "
-    q += "RETURN count(r) AS cnt"
-    with driver.session() as session:
-        result = session.run(q)
-        record = result.single()
-        return record["cnt"] if record else 0
-
-
-# ── Phase 4 tests ──────────────────────────────────────────────────
-
-
-@pytest.mark.integration
-def test_phase4_partial_cpf_name_match(neo4j_driver: Driver) -> None:
-    """Phase 4 is disabled: partial CPF matching must not create SAME_AS."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {"
-        "cpf: '005602', cpf_partial: '005602', "
-        "name: 'JOSE DIAS TOFFOLI'})",
-    )
-
-    _run_phases(neo4j_driver, [4])
-
-    assert _count_same_as(neo4j_driver) == 0
-
-
-@pytest.mark.integration
-def test_phase4_no_match_different_name(neo4j_driver: Driver) -> None:
-    """Same cpf_partial/cpf_middle6 but different names -> no match."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {"
-        "cpf: '005602', cpf_partial: '005602', "
-        "name: 'MARIA DA SILVA'})",
-    )
-
-    _run_phases(neo4j_driver, [4])
-
-    assert _count_same_as(neo4j_driver) == 0
-
-
-@pytest.mark.integration
-def test_phase4_no_duplicate_if_already_linked(
-    neo4j_driver: Driver,
-) -> None:
-    """Running disabled Phase 4 twice must still create no SAME_AS."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {"
-        "cpf: '005602', cpf_partial: '005602', "
-        "name: 'JOSE DIAS TOFFOLI'})",
-    )
-
-    _run_phases(neo4j_driver, [4])
-    _run_phases(neo4j_driver, [4])  # idempotent
-
-    assert _count_same_as(neo4j_driver) == 0
-
-
-# ── Phase 5 tests ──────────────────────────────────────────────────
-
-
-@pytest.mark.integration
-def test_phase5_unique_name_match(neo4j_driver: Driver) -> None:
-    """Unique-name servidor (blank cpf_partial) matches unique person."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {name: 'JOSE DIAS TOFFOLI'})"
-        "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'classified_1'})",
-    )
-
-    _run_phases(neo4j_driver, [5])
-
-    with neo4j_driver.session() as s:
-        result = s.run(
-            "MATCH ()-[r:SAME_AS]->() "
-            "RETURN r.confidence AS conf, r.method AS method"
-        )
-        records = list(result)
-        assert len(records) == 1
-        assert records[0]["conf"] == 0.85
-        assert records[0]["method"] == "unique_name_match_servidor"
-
-
-@pytest.mark.integration
-def test_phase5_common_name_servidor_side_no_match(
-    neo4j_driver: Driver,
-) -> None:
-    """Two servidores with same name and blank cpf_partial -> no match."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '111.222.333-44', name: 'JOSE DA SILVA'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {name: 'JOSE DA SILVA'})"
-        "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'classified_a'})",
-        "CREATE (:Person {name: 'JOSE DA SILVA'})"
-        "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'classified_b'})",
-    )
-
-    _run_phases(neo4j_driver, [5])
-
-    assert _count_same_as(neo4j_driver) == 0
-
-
-@pytest.mark.integration
-def test_phase5_common_name_person_side_no_match(
-    neo4j_driver: Driver,
-) -> None:
-    """Unique servidor but two full-CPF persons share the name -> no match."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '111.222.333-44', name: 'MARIA OLIVEIRA'})",
-        "CREATE (:Person {cpf: '555.666.777-88', name: 'MARIA OLIVEIRA'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {name: 'MARIA OLIVEIRA'})"
-        "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'classified_x'})",
-    )
-
-    _run_phases(neo4j_driver, [5])
-
-    assert _count_same_as(neo4j_driver) == 0
-
-
-@pytest.mark.integration
-def test_phase5_requires_recebeu_salario(neo4j_driver: Driver) -> None:
-    """Person without RECEBEU_SALARIO should not match in Phase 5."""
-    _clear_db(neo4j_driver)
-    _setup(
-        neo4j_driver,
-        "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})",
-    )
-    _run_phases(neo4j_driver, [0])
-    _setup(
-        neo4j_driver,
-        # Amendment author — no RECEBEU_SALARIO relationship
-        "CREATE (:Person {name: 'JOSE DIAS TOFFOLI', author_key: 'toffoli'})",
-    )
-
-    _run_phases(neo4j_driver, [5])
-
-    assert _count_same_as(neo4j_driver, "unique_name_match_servidor") == 0
-
-
-@pytest.mark.integration
-def test_phase5_no_duplicate_if_already_linked(
-    neo4j_driver: Driver,
-) -> None:
-    """Running Phase 5 twice should not create duplicate SAME_AS."""
duplicate SAME_AS.""" - _clear_db(neo4j_driver) - _setup( - neo4j_driver, - "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})", - ) - _run_phases(neo4j_driver, [0]) - _setup( - neo4j_driver, - "CREATE (:Person {name: 'JOSE DIAS TOFFOLI'})" - "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'classified_1'})", - ) - - _run_phases(neo4j_driver, [5]) - _run_phases(neo4j_driver, [5]) # idempotent - - assert _count_same_as(neo4j_driver) == 1 - - -@pytest.mark.integration -def test_phase5_servidor_with_cpf_partial_skipped( - neo4j_driver: Driver, -) -> None: - """Servidor with cpf_partial IS NOT NULL should not match in Phase 5. - - Phase 5 only handles blank-CPF servidores (cpf_partial IS NULL). - Partial-CPF matching is intentionally disabled. - """ - _clear_db(neo4j_driver) - _setup( - neo4j_driver, - "CREATE (:Person {cpf: '026.005.602-20', name: 'JOSE DIAS TOFFOLI'})", - ) - _run_phases(neo4j_driver, [0]) - _setup( - neo4j_driver, - "CREATE (:Person {name: 'JOSE DIAS TOFFOLI', cpf_partial: '005602'})" - "-[:RECEBEU_SALARIO]->(:PublicOffice {cpf: 'partial_1'})", - ) - - _run_phases(neo4j_driver, [5]) - - assert _count_same_as(neo4j_driver, "unique_name_match_servidor") == 0 diff --git a/etl/tests/test_base.py b/etl/tests/test_base.py index bbc64de..ee21673 100644 --- a/etl/tests/test_base.py +++ b/etl/tests/test_base.py @@ -8,8 +8,7 @@ class DummyPipeline(Pipeline): source_id = "test" def __init__(self) -> None: - self.driver = MagicMock() - self.data_dir = "./data" + super().__init__(driver=MagicMock(), data_dir="./data") self.extracted = False self.transformed = False self.loaded = False diff --git a/etl/tests/test_comprasnet_pipeline.py b/etl/tests/test_comprasnet_pipeline.py index 0134aef..b8665e3 100644 --- a/etl/tests/test_comprasnet_pipeline.py +++ b/etl/tests/test_comprasnet_pipeline.py @@ -23,7 +23,7 @@ def _extract_from_fixtures(pipeline: ComprasnetPipeline) -> None: def test_pipeline_name_and_source_id() -> None: pipeline = _make_pipeline() assert pipeline.name == "comprasnet" - assert pipeline.source_id == "pncp" + assert pipeline.source_id == "comprasnet" def test_transform_produces_correct_contracts() -> None: diff --git a/etl/tests/test_date_formatting.py b/etl/tests/test_date_formatting.py index 265f2d9..17cee75 100644 --- a/etl/tests/test_date_formatting.py +++ b/etl/tests/test_date_formatting.py @@ -14,7 +14,7 @@ ("31/12/2024 14:30:00", "2024-12-31"), ("", ""), (" ", ""), - ("invalid", "invalid"), + ("invalid", ""), ("01/01/2000", "2000-01-01"), ], ) diff --git a/etl/tests/test_download_cnpj.py b/etl/tests/test_download_cnpj.py new file mode 100644 index 0000000..eb3d9e4 --- /dev/null +++ b/etl/tests/test_download_cnpj.py @@ -0,0 +1,210 @@ +"""Tests for etl/scripts/download_cnpj.py — release resolution and manifest.""" + +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +if TYPE_CHECKING: + from types import ModuleType + +import httpx +import pytest + + +def _load_script_module() -> ModuleType: + """Load download_cnpj.py as a module without running it.""" + scripts_dir = Path(__file__).resolve().parents[1] / "scripts" + script_path = scripts_dir / "download_cnpj.py" + spec = importlib.util.spec_from_file_location("download_cnpj", script_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +@pytest.fixture() +def mod() -> 
ModuleType: + return _load_script_module() + + +# ---- resolve_rf_release tests ---- + + +def test_resolve_rf_release_nextcloud_success(mod: ModuleType) -> None: + """When Nextcloud token probe returns 200, use Nextcloud URL.""" + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + resp = MagicMock() + resp.status_code = 200 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {}, clear=False), + ): + result = mod.resolve_rf_release() + + assert "arquivos.receitafederal.gov.br" in result + assert "download?path=" in result + + +def test_resolve_rf_release_nextcloud_env_token_priority(mod: ModuleType) -> None: + """CNPJ_SHARE_TOKEN env var is tried before known tokens.""" + probed_urls: list[str] = [] + + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + probed_urls.append(url) + resp = MagicMock() + resp.status_code = 200 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {"CNPJ_SHARE_TOKEN": "customToken123"}, clear=False), + ): + result = mod.resolve_rf_release() + + # First probe should use the env token + assert "customToken123" in probed_urls[0] + assert "customToken123" in result + + +def test_resolve_rf_release_fallback_to_legacy_when_nextcloud_down(mod: ModuleType) -> None: + """When all Nextcloud tokens fail, fall back to legacy paths.""" + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + resp = MagicMock() + if "arquivos.receitafederal.gov.br" in url: + resp.status_code = 404 + elif "dados_abertos_cnpj" in url: + resp.status_code = 200 + else: + resp.status_code = 404 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {}, clear=False), + ): + result = mod.resolve_rf_release() + + assert "dados_abertos_cnpj" in result + + +def test_resolve_rf_release_explicit_override_legacy(mod: ModuleType) -> None: + """When year_month is provided and Nextcloud down, use legacy with that month.""" + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + resp = MagicMock() + if "arquivos.receitafederal.gov.br" in url: + resp.status_code = 404 + else: + resp.status_code = 200 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {}, clear=False), + ): + result = mod.resolve_rf_release("2026-01") + + assert "2026-01" in result + assert result == "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/2026-01/" + + +def test_resolve_rf_release_all_fail_raises(mod: ModuleType) -> None: + """When all candidates (Nextcloud + legacy) return 404, raise RuntimeError.""" + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + resp = MagicMock() + resp.status_code = 404 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {}, clear=False), + pytest.raises(RuntimeError, match="Could not resolve CNPJ release"), + ): + mod.resolve_rf_release() + + +def test_resolve_rf_release_legacy_flat_fallback(mod: ModuleType) -> None: + """When Nextcloud + legacy new paths fail, fall back to legacy flat URL.""" + call_urls: list[str] = [] + + def _fake_head(url: str, **kwargs) -> MagicMock: # type: ignore[no-untyped-def] + call_urls.append(url) + resp = MagicMock() + # Everything fails except legacy flat URL + if url == "https://dadosabertos.rfb.gov.br/CNPJ/": + resp.status_code = 200 + else: + 
resp.status_code = 404 + return resp + + with ( + patch.object(httpx, "head", side_effect=_fake_head), + patch.dict("os.environ", {}, clear=False), + ): + result = mod.resolve_rf_release() + + assert result == "https://dadosabertos.rfb.gov.br/CNPJ/" + # Should have tried Nextcloud tokens + legacy new paths + legacy flat + assert len(call_urls) >= 5 # 2 Nextcloud + 2 monthly + 1 flat + + +# ---- manifest test ---- + + +def test_manifest_written_after_download(mod: ModuleType, tmp_path: Path) -> None: + """Verify download_manifest.json is created with expected structure.""" + from click.testing import CliRunner + + # Patch resolve_rf_release to avoid HTTP calls + def _fake_resolve(year_month: str | None = None) -> str: + return "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/2026-03/" + + # Patch download_file to simulate successful downloads + def _fake_download(url: str, dest: Path, **kwargs) -> bool: # type: ignore[no-untyped-def] + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(b"fake-zip-content") + return True + + # Patch extract_zip to no-op + def _fake_extract(zip_path: Path, output_dir: Path) -> list[Path]: + return [] + + with ( + patch.object(mod, "resolve_rf_release", side_effect=_fake_resolve), + patch.object(mod, "download_file", side_effect=_fake_download), + patch.object(mod, "extract_zip", side_effect=_fake_extract), + ): + runner = CliRunner() + result = runner.invoke( + mod.main, + [ + "--output-dir", str(tmp_path), + "--files", "1", + "--skip-extract", + ], + ) + + assert result.exit_code == 0, result.output + + manifest_path = tmp_path / "download_manifest.json" + assert manifest_path.exists(), f"Manifest not found. Output:\n{result.output}" + + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + assert manifest["source"] == "receita_federal_cnpj" + assert manifest["resolved_release"] == "2026-03" + assert manifest["base_url"] == "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj/2026-03/" + assert "checksum" in manifest + assert manifest["checksum"].startswith("sha256:") + assert "started_at" in manifest + assert "finished_at" in manifest + + # Should have reference files + main files (1 per type = 3 main + 6 reference) + assert len(manifest["files"]) == 9 + statuses = {f["status"] for f in manifest["files"]} + assert statuses <= {"ok", "skipped", "failed"} diff --git a/etl/tests/test_download_cnpj_bq.py b/etl/tests/test_download_cnpj_bq.py index dea7294..c390fdb 100644 --- a/etl/tests/test_download_cnpj_bq.py +++ b/etl/tests/test_download_cnpj_bq.py @@ -47,12 +47,13 @@ def _fake_download(*args, **kwargs): # type: ignore[no-untyped-def] } monkeypatch.setattr(module, "_download_table", _fake_download) + monkeypatch.setattr(module, "_run_bigquery_precheck", lambda **kw: None) runner = CliRunner() result = runner.invoke( module.main, [ "--billing-project", - "bracc-corruptos", + "icarus-corruptos", "--output-dir", str(tmp_path), "--dataset", @@ -94,12 +95,13 @@ def _fail_on_socios(*args, **kwargs): # type: ignore[no-untyped-def] } monkeypatch.setattr(module, "_download_table", _fail_on_socios) + monkeypatch.setattr(module, "_run_bigquery_precheck", lambda **kw: None) runner = CliRunner() result = runner.invoke( module.main, [ "--billing-project", - "bracc-corruptos", + "icarus-corruptos", "--output-dir", str(tmp_path), "--dataset", diff --git a/etl/uv.lock b/etl/uv.lock index 800f331..37f031f 100644 --- a/etl/uv.lock +++ b/etl/uv.lock @@ -70,6 +70,7 @@ dependencies = [ { name = "httpx" }, { name = "neo4j" }, { name = 
"pandas" }, + { name = "pandera" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pypdf" }, @@ -107,7 +108,7 @@ dev = [ requires-dist = [ { name = "click", specifier = ">=8.1.0" }, { name = "db-dtypes", marker = "extra == 'bigquery'", specifier = ">=1.3.0" }, - { name = "defusedxml", specifier = ">=0.7.1" }, + { name = "defusedxml", specifier = ">=0.7.0" }, { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.25.0" }, { name = "google-cloud-bigquery-storage", marker = "extra == 'bigquery'", specifier = ">=2.27.0" }, { name = "httpx", specifier = ">=0.28.0" }, @@ -115,6 +116,7 @@ requires-dist = [ { name = "neo4j", specifier = ">=5.27.0" }, { name = "pandas", specifier = ">=2.2.0" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.2.0" }, + { name = "pandera", specifier = ">=0.21.0" }, { name = "pyarrow", marker = "extra == 'bigquery'", specifier = ">=17.0.0" }, { name = "pydantic", specifier = ">=2.10.0" }, { name = "pydantic-settings", specifier = ">=2.7.0" }, @@ -1014,6 +1016,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/2f/f91e4eee21585ff548e83358332d5632ee49f6b2dcd96cb5dca4e0468951/pandas_stubs-3.0.0.260204-py3-none-any.whl", hash = "sha256:5ab9e4d55a6e2752e9720828564af40d48c4f709e6a2c69b743014a6fcb6c241", size = 168540, upload-time = "2026-02-04T15:17:15.615Z" }, ] +[[package]] +name = "pandera" +version = "0.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "pydantic" }, + { name = "typeguard" }, + { name = "typing-extensions" }, + { name = "typing-inspect" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/ee/8e0d40dad2c0947b933fc9c0959b2c17cc3419ccdf50df683216f37a3f96/pandera-0.29.0.tar.gz", hash = "sha256:06bc4fc1e4ff02534dd44482a9bc704fb2e58fe3fbb11be906aa714f7f5ec801", size = 575324, upload-time = "2026-01-29T02:49:36.891Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/7b/03299e4ccc5e3cfb0f9e234207ac43ef08b3ba6c4c2882c890e550ceadba/pandera-0.29.0-py3-none-any.whl", hash = "sha256:b3b25d6c00d7c100fbab96aff0e81e52d3dae543a880d24135cca705fa97c516", size = 295876, upload-time = "2026-01-29T02:49:34.812Z" }, +] + [[package]] name = "pathspec" version = "1.0.4" @@ -1538,6 +1556,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/24/99/4772b8e00a136f3e01236de33b0efda31ee7077203ba5967fcc76da94d65/texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917", size = 10768, upload-time = "2023-10-03T09:48:10.434Z" }, ] +[[package]] +name = "typeguard" +version = "4.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2b/e8/66e25efcc18542d58706ce4e50415710593721aae26e794ab1dec34fb66f/typeguard-4.5.1.tar.gz", hash = "sha256:f6f8ecbbc819c9bc749983cc67c02391e16a9b43b8b27f15dc70ed7c4a007274", size = 80121, upload-time = "2026-02-19T16:09:03.392Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/88/b55b3117287a8540b76dbdd87733808d4d01c8067a3b339408c250bb3600/typeguard-4.5.1-py3-none-any.whl", hash = "sha256:44d2bf329d49a244110a090b55f5f91aa82d9a9834ebfd30bcc73651e4a8cc40", size = 36745, upload-time = "2026-02-19T16:09:01.6Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -1547,6 +1577,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mypy-extensions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/74/1789779d91f1961fa9438e9a8710cdae6bd138c80d7303996933d117264a/typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78", size = 13825, upload-time = "2023-05-24T20:25:47.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", size = 8827, upload-time = "2023-05-24T20:25:45.287Z" }, +] + [[package]] name = "typing-inspection" version = "0.4.2" diff --git a/frontend/index.html b/frontend/index.html index 11780c0..783b12e 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -4,7 +4,7 @@ - BRACC + BR-ACC diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 52db820..5fcbabc 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1675,9 +1675,9 @@ "license": "MIT" }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.59.0.tgz", - "integrity": "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.58.0.tgz", + "integrity": "sha512-mr0tmS/4FoVk1cnaeN244A/wjvGDNItZKR8hRhnmCzygyRXYtKF5jVDSIILR1U97CTzAYmbgIj/Dukg62ggG5w==", "cpu": [ "arm" ], @@ -1689,9 +1689,9 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.59.0.tgz", - "integrity": "sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.58.0.tgz", + "integrity": "sha512-+s++dbp+/RTte62mQD9wLSbiMTV+xr/PeRJEc/sFZFSBRlHPNPVaf5FXlzAL77Mr8FtSfQqCN+I598M8U41ccQ==", "cpu": [ "arm64" ], @@ -1703,9 +1703,9 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.59.0.tgz", - "integrity": "sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.58.0.tgz", + "integrity": "sha512-MFWBwTcYs0jZbINQBXHfSrpSQJq3IUOakcKPzfeSznONop14Pxuqa0Kg19GD0rNBMPQI2tFtu3UzapZpH0Uc1Q==", "cpu": [ "arm64" ], @@ -1717,9 +1717,9 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.59.0.tgz", - "integrity": 
"sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.58.0.tgz", + "integrity": "sha512-yiKJY7pj9c9JwzuKYLFaDZw5gma3fI9bkPEIyofvVfsPqjCWPglSHdpdwXpKGvDeYDms3Qal8qGMEHZ1M/4Udg==", "cpu": [ "x64" ], @@ -1731,9 +1731,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.59.0.tgz", - "integrity": "sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.58.0.tgz", + "integrity": "sha512-x97kCoBh5MOevpn/CNK9W1x8BEzO238541BGWBc315uOlN0AD/ifZ1msg+ZQB05Ux+VF6EcYqpiagfLJ8U3LvQ==", "cpu": [ "arm64" ], @@ -1745,9 +1745,9 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.59.0.tgz", - "integrity": "sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.58.0.tgz", + "integrity": "sha512-Aa8jPoZ6IQAG2eIrcXPpjRcMjROMFxCt1UYPZZtCxRV68WkuSigYtQ/7Zwrcr2IvtNJo7T2JfDXyMLxq5L4Jlg==", "cpu": [ "x64" ], @@ -1759,9 +1759,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.59.0.tgz", - "integrity": "sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.58.0.tgz", + "integrity": "sha512-Ob8YgT5kD/lSIYW2Rcngs5kNB/44Q2RzBSPz9brf2WEtcGR7/f/E9HeHn1wYaAwKBni+bdXEwgHvUd0x12lQSA==", "cpu": [ "arm" ], @@ -1773,9 +1773,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.59.0.tgz", - "integrity": "sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.58.0.tgz", + "integrity": "sha512-K+RI5oP1ceqoadvNt1FecL17Qtw/n9BgRSzxif3rTL2QlIu88ccvY+Y9nnHe/cmT5zbH9+bpiJuG1mGHRVwF4Q==", "cpu": [ "arm" ], @@ -1787,9 +1787,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.59.0.tgz", - "integrity": "sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.58.0.tgz", + "integrity": "sha512-T+17JAsCKUjmbopcKepJjHWHXSjeW7O5PL7lEFaeQmiVyw4kkc5/lyYKzrv6ElWRX/MrEWfPiJWqbTvfIvjM1Q==", "cpu": [ "arm64" ], @@ -1801,9 +1801,9 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.59.0.tgz", - "integrity": 
"sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.58.0.tgz", + "integrity": "sha512-cCePktb9+6R9itIJdeCFF9txPU7pQeEHB5AbHu/MKsfH/k70ZtOeq1k4YAtBv9Z7mmKI5/wOLYjQ+B9QdxR6LA==", "cpu": [ "arm64" ], @@ -1815,9 +1815,9 @@ ] }, "node_modules/@rollup/rollup-linux-loong64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.59.0.tgz", - "integrity": "sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.58.0.tgz", + "integrity": "sha512-iekUaLkfliAsDl4/xSdoCJ1gnnIXvoNz85C8U8+ZxknM5pBStfZjeXgB8lXobDQvvPRCN8FPmmuTtH+z95HTmg==", "cpu": [ "loong64" ], @@ -1829,9 +1829,9 @@ ] }, "node_modules/@rollup/rollup-linux-loong64-musl": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.59.0.tgz", - "integrity": "sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.58.0.tgz", + "integrity": "sha512-68ofRgJNl/jYJbxFjCKE7IwhbfxOl1muPN4KbIqAIe32lm22KmU7E8OPvyy68HTNkI2iV/c8y2kSPSm2mW/Q9Q==", "cpu": [ "loong64" ], @@ -1843,9 +1843,9 @@ ] }, "node_modules/@rollup/rollup-linux-ppc64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.59.0.tgz", - "integrity": "sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.58.0.tgz", + "integrity": "sha512-dpz8vT0i+JqUKuSNPCP5SYyIV2Lh0sNL1+FhM7eLC457d5B9/BC3kDPp5BBftMmTNsBarcPcoz5UGSsnCiw4XQ==", "cpu": [ "ppc64" ], @@ -1857,9 +1857,9 @@ ] }, "node_modules/@rollup/rollup-linux-ppc64-musl": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.59.0.tgz", - "integrity": "sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.58.0.tgz", + "integrity": "sha512-4gdkkf9UJ7tafnweBCR/mk4jf3Jfl0cKX9Np80t5i78kjIH0ZdezUv/JDI2VtruE5lunfACqftJ8dIMGN4oHew==", "cpu": [ "ppc64" ], @@ -1871,9 +1871,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.59.0.tgz", - "integrity": "sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.58.0.tgz", + "integrity": "sha512-YFS4vPnOkDTD/JriUeeZurFYoJhPf9GQQEF/v4lltp3mVcBmnsAdjEWhr2cjUCZzZNzxCG0HZOvJU44UGHSdzw==", "cpu": [ "riscv64" ], @@ -1885,9 +1885,9 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.59.0", - "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.59.0.tgz", - "integrity": "sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.58.0.tgz", + "integrity": "sha512-x2xgZlFne+QVNKV8b4wwaCS8pwq3y14zedZ5DqLzjdRITvreBk//4Knbcvm7+lWmms9V9qFp60MtUd0/t/PXPw==", "cpu": [ "riscv64" ], @@ -1899,9 +1899,9 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.59.0.tgz", - "integrity": "sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.58.0.tgz", + "integrity": "sha512-jIhrujyn4UnWF8S+DHSkAkDEO3hLX0cjzxJZPLF80xFyzyUIYgSMRcYQ3+uqEoyDD2beGq7Dj7edi8OnJcS/hg==", "cpu": [ "s390x" ], @@ -1913,9 +1913,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.59.0.tgz", - "integrity": "sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.58.0.tgz", + "integrity": "sha512-+410Srdoh78MKSJxTQ+hZ/Mx+ajd6RjjPwBPNd0R3J9FtL6ZA0GqiiyNjCO9In0IzZkCNrpGymSfn+kgyPQocg==", "cpu": [ "x64" ], @@ -1927,9 +1927,9 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.59.0.tgz", - "integrity": "sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.58.0.tgz", + "integrity": "sha512-ZjMyby5SICi227y1MTR3VYBpFTdZs823Rs/hpakufleBoufoOIB6jtm9FEoxn/cgO7l6PM2rCEl5Kre5vX0QrQ==", "cpu": [ "x64" ], @@ -1941,9 +1941,9 @@ ] }, "node_modules/@rollup/rollup-openbsd-x64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.59.0.tgz", - "integrity": "sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.58.0.tgz", + "integrity": "sha512-ds4iwfYkSQ0k1nb8LTcyXw//ToHOnNTJtceySpL3fa7tc/AsE+UpUFphW126A6fKBGJD5dhRvg8zw1rvoGFxmw==", "cpu": [ "x64" ], @@ -1955,9 +1955,9 @@ ] }, "node_modules/@rollup/rollup-openharmony-arm64": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.59.0.tgz", - "integrity": "sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.58.0.tgz", + "integrity": "sha512-fd/zpJniln4ICdPkjWFhZYeY/bpnaN9pGa6ko+5WD38I0tTqk9lXMgXZg09MNdhpARngmxiCg0B0XUamNw/5BQ==", "cpu": [ "arm64" ], @@ -1969,9 +1969,9 @@ ] }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.59.0", - "resolved": 
"https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.59.0.tgz", - "integrity": "sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.58.0.tgz", + "integrity": "sha512-YpG8dUOip7DCz3nr/JUfPbIUo+2d/dy++5bFzgi4ugOGBIox+qMbbqt/JoORwvI/C9Kn2tz6+Bieoqd5+B1CjA==", "cpu": [ "arm64" ], @@ -1983,9 +1983,9 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.59.0.tgz", - "integrity": "sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.58.0.tgz", + "integrity": "sha512-b9DI8jpFQVh4hIXFr0/+N/TzLdpBIoPzjt0Rt4xJbW3mzguV3mduR9cNgiuFcuL/TeORejJhCWiAXe3E/6PxWA==", "cpu": [ "ia32" ], @@ -1997,9 +1997,9 @@ ] }, "node_modules/@rollup/rollup-win32-x64-gnu": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.59.0.tgz", - "integrity": "sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.58.0.tgz", + "integrity": "sha512-CSrVpmoRJFN06LL9xhkitkwUcTZtIotYAF5p6XOR2zW0Zz5mzb3IPpcoPhB02frzMHFNo1reQ9xSF5fFm3hUsQ==", "cpu": [ "x64" ], @@ -2011,9 +2011,9 @@ ] }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.59.0.tgz", - "integrity": "sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.58.0.tgz", + "integrity": "sha512-QFsBgQNTnh5K0t/sBsjJLq24YVqEIVkGpfN2VHsnN90soZyhaiA9UUHufcctVNL4ypJY0wrwad0wslx2KJQ1/w==", "cpu": [ "x64" ], @@ -2413,24 +2413,37 @@ "typescript": ">=4.8.4 <6.0.0" } }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/balanced-match": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.3.tgz", + "integrity": "sha512-1pHv8LX9CpKut1Zp4EXey7Z8OfH11ONNH6Dhi2WDUt31VVZFXZzKwXcysBgqSumFCmR+0dqjMK5v5JiFHzi0+g==", + "dev": true, + "license": "MIT", + "engines": { + "node": "20 || >=22" + } + }, "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", - "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.2.tgz", + "integrity": "sha512-Pdk8c9poy+YhOgVWw1JNN22/HcivgKWwpxKq04M/jTmHyCZn12WPJebZxdjSa5TmBqISrUSgNYU3eRORljfCCw==", "dev": true, "license": "MIT", "dependencies": { - "balanced-match": "^1.0.0" + "balanced-match": "^4.0.2" + }, + "engines": { + "node": "20 || >=22" } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { - "version": "9.0.9", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", - 
"integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", + "version": "9.0.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.6.tgz", + "integrity": "sha512-kQAVowdR33euIqeA0+VZTDqU+qo1IeVY+hrKYtZMio3Pg0P0vuh/kwRylLUddJhB6pf3q/botcOvRtx4IN1wqQ==", "dev": true, "license": "ISC", "dependencies": { - "brace-expansion": "^2.0.2" + "brace-expansion": "^5.0.2" }, "engines": { "node": ">=16 || 14 >=14.17" @@ -4537,9 +4550,9 @@ } }, "node_modules/minimatch": { - "version": "3.1.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", - "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.3.tgz", + "integrity": "sha512-M2GCs7Vk83NxkUyQV1bkABc4yxgz9kILhHImZiBPAZ9ybuvCb0/H7lEl5XvIg3g+9d4eNotkZA5IWwYl0tibaA==", "dev": true, "license": "ISC", "dependencies": { @@ -5067,9 +5080,9 @@ } }, "node_modules/rollup": { - "version": "4.59.0", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.59.0.tgz", - "integrity": "sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==", + "version": "4.58.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.58.0.tgz", + "integrity": "sha512-wbT0mBmWbIvvq8NeEYWWvevvxnOyhKChir47S66WCxw1SXqhw7ssIYejnQEVt7XYQpsj2y8F9PM+Cr3SNEa0gw==", "dev": true, "license": "MIT", "dependencies": { @@ -5083,31 +5096,31 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.59.0", - "@rollup/rollup-android-arm64": "4.59.0", - "@rollup/rollup-darwin-arm64": "4.59.0", - "@rollup/rollup-darwin-x64": "4.59.0", - "@rollup/rollup-freebsd-arm64": "4.59.0", - "@rollup/rollup-freebsd-x64": "4.59.0", - "@rollup/rollup-linux-arm-gnueabihf": "4.59.0", - "@rollup/rollup-linux-arm-musleabihf": "4.59.0", - "@rollup/rollup-linux-arm64-gnu": "4.59.0", - "@rollup/rollup-linux-arm64-musl": "4.59.0", - "@rollup/rollup-linux-loong64-gnu": "4.59.0", - "@rollup/rollup-linux-loong64-musl": "4.59.0", - "@rollup/rollup-linux-ppc64-gnu": "4.59.0", - "@rollup/rollup-linux-ppc64-musl": "4.59.0", - "@rollup/rollup-linux-riscv64-gnu": "4.59.0", - "@rollup/rollup-linux-riscv64-musl": "4.59.0", - "@rollup/rollup-linux-s390x-gnu": "4.59.0", - "@rollup/rollup-linux-x64-gnu": "4.59.0", - "@rollup/rollup-linux-x64-musl": "4.59.0", - "@rollup/rollup-openbsd-x64": "4.59.0", - "@rollup/rollup-openharmony-arm64": "4.59.0", - "@rollup/rollup-win32-arm64-msvc": "4.59.0", - "@rollup/rollup-win32-ia32-msvc": "4.59.0", - "@rollup/rollup-win32-x64-gnu": "4.59.0", - "@rollup/rollup-win32-x64-msvc": "4.59.0", + "@rollup/rollup-android-arm-eabi": "4.58.0", + "@rollup/rollup-android-arm64": "4.58.0", + "@rollup/rollup-darwin-arm64": "4.58.0", + "@rollup/rollup-darwin-x64": "4.58.0", + "@rollup/rollup-freebsd-arm64": "4.58.0", + "@rollup/rollup-freebsd-x64": "4.58.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.58.0", + "@rollup/rollup-linux-arm-musleabihf": "4.58.0", + "@rollup/rollup-linux-arm64-gnu": "4.58.0", + "@rollup/rollup-linux-arm64-musl": "4.58.0", + "@rollup/rollup-linux-loong64-gnu": "4.58.0", + "@rollup/rollup-linux-loong64-musl": "4.58.0", + "@rollup/rollup-linux-ppc64-gnu": "4.58.0", + "@rollup/rollup-linux-ppc64-musl": "4.58.0", + "@rollup/rollup-linux-riscv64-gnu": "4.58.0", + "@rollup/rollup-linux-riscv64-musl": "4.58.0", + "@rollup/rollup-linux-s390x-gnu": "4.58.0", + 
"@rollup/rollup-linux-x64-gnu": "4.58.0", + "@rollup/rollup-linux-x64-musl": "4.58.0", + "@rollup/rollup-openbsd-x64": "4.58.0", + "@rollup/rollup-openharmony-arm64": "4.58.0", + "@rollup/rollup-win32-arm64-msvc": "4.58.0", + "@rollup/rollup-win32-ia32-msvc": "4.58.0", + "@rollup/rollup-win32-x64-gnu": "4.58.0", + "@rollup/rollup-win32-x64-msvc": "4.58.0", "fsevents": "~2.3.2" } }, diff --git a/frontend/src/App.test.tsx b/frontend/src/App.test.tsx index d8472cd..4ddde49 100644 --- a/frontend/src/App.test.tsx +++ b/frontend/src/App.test.tsx @@ -1,4 +1,4 @@ -import { render, screen } from "@testing-library/react"; +import { act, render, screen, waitFor } from "@testing-library/react"; import { MemoryRouter } from "react-router"; import { describe, expect, it, vi } from "vitest"; @@ -22,24 +22,37 @@ vi.mock("./stores/auth", () => ({ ), })); +// Keep App route test deterministic without Landing async effects. +vi.mock("./pages/Landing", () => ({ + Landing: () =>
<div>BR-ACC</div>,
+}));
+
 import { App } from "./App";

 describe("App", () => {
-  it("renders the landing page with title", () => {
-    render(
-      <MemoryRouter>
-        <App />
-      </MemoryRouter>,
-    );
-    expect(screen.getAllByText("BRACC").length).toBeGreaterThan(0);
+  it("renders the landing page with title", async () => {
+    await act(async () => {
+      render(
+        <MemoryRouter>
+          <App />
+        </MemoryRouter>,
+      );
+    });
+    await waitFor(() => {
+      expect(screen.getAllByText("BR-ACC").length).toBeGreaterThan(0);
+    });
   });

-  it("renders login page at /login", () => {
-    render(
-      <MemoryRouter initialEntries={["/login"]}>
-        <App />
-      </MemoryRouter>,
-    );
-    expect(screen.getByLabelText(/e-mail/i)).toBeInTheDocument();
+  it("renders login page at /login", async () => {
+    await act(async () => {
+      render(
+        <MemoryRouter initialEntries={["/login"]}>
+          <App />
+        </MemoryRouter>,
+      );
+    });
+    await waitFor(() => {
+      expect(screen.getByLabelText(/e-mail/i)).toBeInTheDocument();
+    });
   });
 });
diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts
index a0ba6b3..a7ea25d 100644
--- a/frontend/src/api/client.ts
+++ b/frontend/src/api/client.ts
@@ -12,14 +12,13 @@ export class ApiError extends Error {
 export async function apiFetch<T>(path: string, init?: RequestInit): Promise<T> {
   const url = `${API_BASE}${path}`;
-  const headers = new Headers(init?.headers);
-  if (!headers.has("content-type")) {
-    headers.set("Content-Type", "application/json");
-  }
   const response = await fetch(url, {
     credentials: "include",
     ...init,
-    headers,
+    headers: {
+      "Content-Type": "application/json",
+      ...init?.headers,
+    },
   });

   if (!response.ok) {
@@ -33,6 +32,24 @@ export async function apiFetch<T>(path: string, init?: RequestInit): Promise<T>
   return response.json() as Promise<T>;
 }

+async function apiFetchBlob(path: string): Promise<Blob> {
+  const url = `${API_BASE}${path}`;
+  const response = await fetch(url, { credentials: "include" });
+
+  if (!response.ok) {
+    let detail = response.statusText;
+    try {
+      const err = await response.json();
+      detail = err.detail || detail;
+    } catch {
+      // response wasn't JSON
+    }
+    throw new ApiError(response.status, detail);
+  }
+
+  return response.blob();
+}
+
 export interface SourceAttribution {
   database: string;
   record_id?: string | null;
@@ -201,7 +218,7 @@ export interface Investigation {
   updated_at: string;
   entity_ids: string[];
   share_token: string | null;
-  share_expires_at?: string | null;
+  share_expires_at: string | null;
 }

 export interface InvestigationListResponse {
@@ -342,19 +359,22 @@ export function getSharedInvestigation(token: string): Promise<Investigation> {
 export function generateShareLink(
   investigationId: string,
-): Promise<{ share_token: string; share_expires_at?: string | null }> {
-  return apiFetch<{ share_token: string; share_expires_at?: string | null }>(
+): Promise<{ share_token: string; share_expires_at: string }> {
+  return apiFetch<{ share_token: string; share_expires_at: string }>(
     `/api/v1/investigations/${encodeURIComponent(investigationId)}/share`,
     { method: "POST" },
   );
 }

+export function revokeShareLink(investigationId: string): Promise<void> {
+  return apiFetch(
+    `/api/v1/investigations/${encodeURIComponent(investigationId)}/share`,
+    { method: "DELETE" },
+  );
+}
+
 export function exportInvestigation(investigationId: string): Promise<Blob> {
-  const url = `${API_BASE}/api/v1/investigations/${encodeURIComponent(investigationId)}/export`;
-  return fetch(url, { credentials: "include" }).then((res) => {
-    if (!res.ok) throw new ApiError(res.status, `API error: ${res.statusText}`);
-    return res.blob();
-  });
+  return apiFetchBlob(`/api/v1/investigations/${encodeURIComponent(investigationId)}/export`);
 }

 // --- Stats ---
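Reviewer note on the `apiFetch` header change above: spreading `init?.headers` into an object literal only works when callers pass a plain record. A `Headers` instance has no enumerable own properties (it would spread to nothing), and a `[key, value][]` array would spread to numeric keys. A minimal normalization sketch, assuming callers may pass any `HeadersInit`; the `toHeaderRecord` helper is illustrative and not part of this diff:

```typescript
// Illustrative only — not in the diff. Normalizes any HeadersInit into a
// plain record so it can be safely spread after the JSON default.
function toHeaderRecord(headers?: HeadersInit): Record<string, string> {
  if (!headers) return {};
  if (headers instanceof Headers) {
    // Headers instances have no enumerable own properties; {...headers} yields {}.
    return Object.fromEntries(headers.entries());
  }
  if (Array.isArray(headers)) {
    // [key, value][] form.
    return Object.fromEntries(headers);
  }
  return headers; // already a plain Record<string, string>
}

// Inside apiFetch, the headers option could then read:
//   headers: { "Content-Type": "application/json", ...toHeaderRecord(init?.headers) },
```

As written, the diff implicitly assumes every call site passes a plain object, which appears to hold for the current codebase; the sketch is only relevant if that assumption changes.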
@@ -444,9 +464,7 @@ export function exportInvestigationPDF(
   lang = "pt",
 ): Promise<Blob> {
   const params = new URLSearchParams({ lang });
-  const url = `${API_BASE}/api/v1/investigations/${encodeURIComponent(investigationId)}/export/pdf?${params}`;
-  return fetch(url, { credentials: "include" }).then((res) => {
-    if (!res.ok) throw new ApiError(res.status, `API error: ${res.statusText}`);
-    return res.blob();
-  });
+  return apiFetchBlob(
+    `/api/v1/investigations/${encodeURIComponent(investigationId)}/export/pdf?${params}`,
+  );
 }
diff --git a/frontend/src/components/common/AppShell.tsx b/frontend/src/components/common/AppShell.tsx
index 852249e..3b46cfe 100644
--- a/frontend/src/components/common/AppShell.tsx
+++ b/frontend/src/components/common/AppShell.tsx
@@ -144,7 +144,7 @@ export function AppShell() {
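For reviewers of the blob-returning helpers consolidated above: the intended browser-side consumption is the standard object-URL download pattern. A minimal sketch under that assumption; `downloadBlob`, `handleExportPdf`, the import path, and the filename are illustrative names, not code from this diff:

```typescript
// Illustrative consumer of the blob-based export API added in client.ts.
// Assumes a browser environment; none of these names exist in the diff itself.
import { exportInvestigationPDF } from "./api/client";

function downloadBlob(blob: Blob, filename: string): void {
  const url = URL.createObjectURL(blob);
  const anchor = document.createElement("a");
  anchor.href = url;
  anchor.download = filename;
  anchor.click();
  // Release the object URL once the click has dispatched the download.
  URL.revokeObjectURL(url);
}

export async function handleExportPdf(investigationId: string): Promise<void> {
  const blob = await exportInvestigationPDF(investigationId, "pt");
  downloadBlob(blob, `investigation-${investigationId}.pdf`);
}
```

Centralizing the error handling in `apiFetchBlob` means callers like this sketch get a typed `ApiError` on failure instead of a raw `fetch` rejection, matching the JSON path's behavior.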