diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 66a9994e..00c0e2ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,6 +40,25 @@ jobs: sudo curl -sL https://github.com/ysugimoto/falco/releases/latest/download/falco-linux-amd64 -o /usr/local/bin/falco sudo chmod +x /usr/local/bin/falco + - name: Install gitleaks + # Same curl-binary-to-PATH pattern as falco above. Version pinned so + # a detector-rule change doesn't suddenly fail an unrelated PR; bump + # deliberately when wanted. Mirrors `.pre-commit-config.yaml`. + run: | + GITLEAKS_VERSION=8.30.1 + sudo curl -sSfL "https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}/gitleaks_${GITLEAKS_VERSION}_linux_x64.tar.gz" \ + | sudo tar -xz -C /usr/local/bin gitleaks + sudo chmod +x /usr/local/bin/gitleaks + gitleaks version + + - name: Secret scan (gitleaks) + # Scans the fetched git history against the .gitleaks.toml allowlist (full-history coverage requires `fetch-depth: 0` on the checkout step; a shallow clone only scans the commits it fetched). + # `--exit-code 1` is the default; explicit for clarity. Anything + # the allowlist doesn't cover fails the build with a redacted + # diagnostic — see CONTRIBUTING.md / AGENTS.md for the + # suppression playbook. + run: gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1 + - name: Install terraform # Required by tests/utils/test_terraform_gen.py — runs `terraform fmt` # against generator output and `validate` when TERRAFORM_VALIDATE=1. diff --git a/.gitignore b/.gitignore index 1e2088e1..33202558 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ setup-state.json *.duckdb *.duckdb.wal /configs/* +# Security: the SSH known_hosts pin IS source-controlled — it's the +# trust anchor for the reverse-tunnel host-key check. Override the +# blanket configs/* ignore. +!/configs/ssh_known_hosts /data/* /data/system/* __pycache__/ @@ -36,6 +40,43 @@ node_modules/ /.antigravitycli +# Local Swival tool state (REPL history, per-session audit dirs). Per-user. 
+.swival/ +/audit-findings/ + +# Ad-hoc debug screenshots dropped at the repo root by browser-driven +# verification sessions. Intentional docs/assets/*.png are tracked; these +# root-level ones are throwaway. +/test_*.png + # Ad-hoc lint/test output dumps frontend/*_out.txt frontend/*_output.txt + +# Session-scoring fixture extracts contain real prod IPs / UAs / payloads. +# Reproducible via scripts/scoring/extract_traces.py against local data. +tests/fixtures/scoring/ + +# Trained matrix.json carries real customer route names. Regenerable via +# scripts/scoring/train.py against a fresh trace extract. +compute/scorer/matrix.json + +# Rust build artifacts. +compute/scorer/target/ +compute/scorer/bin/ +compute/scorer/pkg/ + +# Per-deployment secrets: AES cookie keys, deploy-time IDs the service files +# might reference. NEVER commit. +.scoring/ +.aider* + +# Ad-hoc working directory for local profiling — HAR captures, per-page JSON +# summaries, query trace dumps. The reusable harness scripts (profile.js, +# split_per_page.py) live here for now; treat the whole tree as throwaway. +/scratch/ + +# Local-only VS Code config (file-watcher / Pylance excludes for the +# regenerating .next + cache trees). Personal to each contributor's editor +# setup — not promoted to the repo by default. +.vscode/ diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 00000000..f09b2c4a --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,61 @@ +# gitleaks configuration — extends the built-in detector set with this +# repo's allowlist for tracked test fixtures, Rust lockfile checksums, +# and the public SSH host key. 
+# +# Run locally: make secret-scan +# Pre-commit: installed via .pre-commit-config.yaml +# CI: invoked by .github/workflows/ci.yml +# +# Suppression mechanisms in increasing scope: +# - inline `#gitleaks:allow` on the offending line +# - .gitleaksignore — fingerprint list for one-off historical findings +# - this file's [allowlist] paths — for whole files / directories + +[extend] +# Inherit gitleaks' built-in ruleset (~100 detectors: AWS, GCP, Azure, +# GitHub, GitLab, Slack, Stripe, Twilio, Mailgun, Square, PyPI, npm, +# generic-api-key, private-key, etc.). The default config also +# allowlists npm/yarn/pnpm/poetry/go.mod/go.sum/node_modules/venv — +# we add the gaps below. +useDefault = true + +[allowlist] +description = "fastly-log-analytics — tracked test fixtures, Rust lockfile, public keys" + +# Path-based allowlist. Regex matched against the file's path relative +# to the repo root. Only entries for TRACKED files matter for the +# default git-history scan; the .next/configs/data entries below also +# keep ad-hoc `gitleaks detect --no-git` working-tree runs clean. +paths = [ + # Rust dependency lockfile — sha256 checksums look like generic + # API keys to gitleaks. Not covered by the built-in lockfile allowlist. + '''^compute/scorer/Cargo\.lock$''', + + # Tracked, intentional test fixtures + '''^compute/scorer/fixtures/local-dictionary\.json$''', # placeholder AES key = 0x00..0x1f + '''^compute/scorer/src/cookie\.rs$''', # test-mode constants + '''^tests/scoring/.*\.py$''', # cookie/scoring test fixtures + '''^tests/repositories/test_alerts\.py$''', # zeros Slack webhook fixture + '''^tests/utils/test_sql_validator\.py$''', # blocked-function NAMES (e.g. "AWS_SECRET_ACCESS_KEY") + + # Public SSH host key for localhost.run — sharing is the entire point + # (trust anchor for the reverse-tunnel host-key check). + '''^configs/ssh_known_hosts$''', + + # Documentation: release notes and runbooks may reference example + # tokens / credentials in prose. 
+ '''^docs/''', + '''^CHANGELOG\.md$''', + '''^AGENTS\.md$''', + + # Working-tree-only artifacts (all gitignored; matter only for + # ad-hoc `--no-git` runs). gitleaks uses Go's RE2 engine, which + # doesn't support negative lookahead, so we list the per-service + # config filename pattern explicitly rather than "everything under + # configs/ except ssh_known_hosts". + '''^frontend/\.next/''', # Next.js build cache + '''^configs/.*\.json(\.bak.*)?$''', # real per-service Fastly configs (gitignored) + '''^data/''', # real SSH share key, share DB, runtime data + '''.*/__pycache__/''', # Python bytecode + '''\.pyc$''', +] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34317f91..5a150d76 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,6 +26,20 @@ repos: - id: check-merge-conflict - id: debug-statements + # Secret scanner. Blocks commits that introduce credentials / API keys / + # private keys / tokens. Configured via .gitleaks.toml at repo root + # (extends gitleaks' default ruleset with this repo's allowlist for + # tracked test fixtures and Rust lockfile checksums). Re-run with + # `make secret-scan` locally; CI runs the same invocation. + # + # If a legitimate placeholder trips the scanner, suppress with: + # - inline `#gitleaks:allow` on the line, OR + # - add the file/path glob to .gitleaks.toml [allowlist] paths + - repo: https://github.com/gitleaks/gitleaks + rev: v8.30.1 + hooks: + - id: gitleaks + # Regenerate the committed OpenAPI snapshot + typed frontend client # whenever the FastAPI surface or the generator script changes. 
If the # regenerated files differ from the staged version, pre-commit fails @@ -40,3 +54,9 @@ repos: language: system pass_filenames: false entry: bash -c 'cd frontend && npm run --silent gen:types' + - id: typecheck-frontend + name: Typecheck frontend + files: ^frontend/.*\.(ts|tsx)$ + language: system + pass_filenames: false + entry: bash -c 'cd frontend && npx tsc --noEmit' diff --git a/AGENTS.md b/AGENTS.md index 86234af6..605de0e6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -225,6 +225,19 @@ Removes the FOS logging endpoint from the Fastly service, the CDN VCL service, t ## Frontend Patterns +> **REQUIRED READING before any frontend work:** +> [`frontend/node_modules/next/dist/docs/`](frontend/node_modules/next/dist/docs/) +> — the Next.js 16 App Router docs are vendored locally. Read the relevant +> sections (loading.tsx, prefetching, streaming, instant-navigation, caching, +> linking-and-navigating) BEFORE proposing or implementing changes to +> components / pages / hooks. **Click-feel bugs are almost always a Next +> conventions violation that the docs would have flagged.** Past failures +> from skipping this: shipping pages without `loading.tsx`, blocking +> layouts on uncached data, per-instance `setInterval` storms, missing +> `signal` cancellation, polling intervals tuned for "live feel" not +> backend cost. The conventions section below distills the rules but +> defer to the docs for any pattern not listed. + **Stack:** Next.js 16 app router, React 19, TanStack Query v5, Zustand, shadcn/ui, Recharts, openapi-fetch. **Type-safe client:** @@ -254,6 +267,100 @@ A global middleware in [frontend/lib/api.ts](frontend/lib/api.ts) checks `respon 9. **`empty_schema_response(runner)`** in [_base.py](backend/repositories/_base.py) — return this when a repo function hits a service with no logs. 10. **`origin_latency_us_expr(actual_cols)`** in `_base.py` — don't hand-roll the `COALESCE("ottfb", "ttfb" * 1000000.0)` fragment. 
+### Next.js navigation + loading conventions (READ BEFORE TOUCHING FRONTEND) + +Distilled from `frontend/node_modules/next/dist/docs/` — these are the +rules to follow so click-to-render feels instant. Failure modes I've shipped +before and you should not repeat: + +**1. Every navigable route MUST have a `loading.tsx`.** Without it, dynamic +routes (all our `'use client'` pages) get NO prefetched fallback — the +browser sits on the previous page until the destination's JS is ready and +its useQueries have settled. With it, Next.js renders the skeleton the +instant the user clicks. Use a variant from +[components/skeletons/PageSkeleton.tsx](frontend/components/skeletons/PageSkeleton.tsx) +— don't hand-roll Array.from + Skeleton inline. + +**2. Layouts MUST NOT block on uncached data.** If `app/layout.tsx` or any +shared layout awaits a fetch / accesses cookies / etc. before rendering +children, **`loading.tsx` will not show a fallback at all** — Next.js waits +for the layout to settle first. The previous fix to `AppLayout` removed an +`isLoading ? fallback : children` gate that was doing exactly this; any +new layout-level data must use `useQuery` with `staleTime` so re-renders +are cheap, and the layout must never short-circuit children behind a +loading boolean. + +**3. Cancel in-flight queries on every route change.** AppLayout's +`useEffect([pathname])` calls `queryClient.cancelQueries({ type: 'active' })` +so the old page's leftover polls (e.g. SystemHealthCard's 10s health-snapshot +poll) don't compete with the new page's mount work. **Always thread `signal` +through queryFns** so cancellation actually aborts the network request — +this hasn't been done universally yet, but new queryFns should follow: +```typescript +queryFn: async ({ signal }) => { + const { data } = await client.GET(..., { signal }) + return data +} +``` + +**4. Poll intervals must respect backend cost.** Default is 10s+. 
The +SystemHealthCard fix bumped a 2s poll to 10s because the endpoint took 1-1.7s +under load — at 2s polling that was constant backend pressure. If real-time +updates matter, add a manual Refresh button; don't poll faster than 5s. +Always set `refetchIntervalInBackground: false` so background tabs don't +keep hammering. + +**5. NEVER spawn per-instance `setInterval` for visible-tick state.** If +multiple components need a 1Hz "now" value (countdowns, "X seconds ago" +displays), they share the single +[useNowMs](frontend/hooks/useNowSeconds.ts) hook — one `setInterval` for +the whole tree. Past offenders: SystemJobBox (10 instances × 1s tick on +/admin), CronScheduleBox (5+ on /logs), useElapsedTime (per-consumer +ticker). All now consume `useNowMs`. If a new component needs a ticker, +use this hook; do not roll your own. + +**6. Async buttons need IMMEDIATE feedback.** Every button whose `onClick` +does async work must render a spinner ++ a pending label (`Stopping…`, `Saving…`, `Severing…`) while pending. +`disabled={busy}` ALONE looks dead. Pattern lives in +[ExcludeRegexCard](frontend/components/SessionScoring/ExcludeRegexCard.tsx); +share-dashboard buttons follow the same shape after the recent fix. + +**7. Prefetch behavior:** + - Static routes → full route prefetched on Link viewport entry + - Dynamic routes (all our `'use client'` pages) → **partially prefetched + only if `loading.tsx` exists** (covers the shell to the loading + boundary). Without loading.tsx, NO prefetch happens. + - Prefetching on `Link` is the default; use `prefetch={false}` only + in dense lists (infinite-scroll tables) where the link cardinality + would balloon the prefetch traffic. + - **Hover-prefetch data, not just bundle:** when a Link target needs an + API call to render meaningfully, add `onMouseEnter` that calls + `queryClient.prefetchQuery(...)`. 
Example: the Admin → Share Dashboard + link in [admin/page.tsx](frontend/app/admin/page.tsx#L791) warms the + share-status query so the destination renders real content + immediately instead of skeleton-then-swap. + +**8. Wrap `router.replace()` inside effects in `startTransition`.** A +synchronous `router.replace()` inside `useEffect` causes a render cascade +that blocks paint. Examples: +[useUrlServiceSync](frontend/hooks/useUrlServiceSync.ts), +[AppLayout redirect block](frontend/components/AppLayout.tsx#L163). All +existing call sites are wrapped; new ones must follow. + +**9. React Query defaults are set in +[QueryProvider](frontend/components/QueryProvider.tsx):** `staleTime: 30s`, +`gcTime: 5min`, `refetchOnWindowFocus: false`. Don't override per-query +unless you need to — and when you do, document why. + +**10. When a click feels slow, MEASURE before guessing.** I have a working +playwright reproducer at `/tmp/nav-perf-test2.mjs` that times each phase +of a click (URL change, DOM ready, network idle, individual API requests). +Run it against the live tunnel (`localhost:3001`) BEFORE proposing a fix. +Click-feedback bugs are almost always about: (a) polls running while +navigation is in flight, (b) heavy useQuery fan-out on mount, (c) layout +re-renders triggered by store subscriptions. The trace shows which. + ### Removed modules — don't recreate - `backend/utils/audit_helpers.py` (referenced the long-removed DuckDB `_ingested_files` table) @@ -361,8 +468,8 @@ The tunnel exposes the same FastAPI app to the public internet. Middleware class ### 21. `sync_data` orphan-cleanup vs local-compaction outputs Local compaction writes merged rollups to three places: `/data/daily/`, `/data/weekly/`, and `/data/timestamp_hour=*/compacted_*.parquet`. None of these are tracked by the iceberg snapshot, so they are NOT in `cloud_files`/`active_paths`. 
The orphan-cleanup loop in [backend/core/iceberg.py](backend/core/iceberg.py) `sync_data()` walks the cache and deletes anything not in `active_paths`; without explicit allow-rules it nukes every compacted output, and the [`local_compacted_files` registry](backend/core/metadata_db.py) then blocks re-download of the source files — silently dropping rows from the view (production: 1.65M → 302K on 2026-05-31, then 1.66M → 1.62M on 2026-06-01 from the per-partition `compacted_*` variant). The fix is two-pronged: orphan-cleanup restricts its walk to `timestamp_hour=*` dirs AND skips `compacted_*.parquet` filenames. **If you add a new local-only output pattern, add it to both the dir skip and the file skip.** Integration coverage in [tests/core/test_local_compaction.py](tests/core/test_local_compaction.py)::`test_compaction_outputs_survive_iceberg_sync_orphan_cleanup` exercises the round-trip with real `compact_local_partitions` + real `sync_data`. -### 22. `unattended-upgrades` OOMs the production VM -The single-tenant 16 GB e2-standard-4 deploy runs backend + frontend + caddy at a steady-state working set around 10-13 GB. The Debian/Ubuntu nightly `apt-daily-upgrade.timer` forks a transient 1-2 GB downloader on top of that, and on 2026-06-01 it triggered an OOM kill that wedged the kernel (sshd died; needed `gcloud compute instances reset`). `~/restart.sh` on the VM re-asserts `systemctl mask apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service` on every restart so a re-image / apt-reinstall can't silently re-enable them. Trade-off: no automatic security patching — patch manually on a planned maintenance window with the backend container stopped. **If you bump the VM to a class with more RAM (e.g. `e2-custom-4-32768`), you may safely re-enable upgrades.** See `restart.sh` for the canonical incantation. +### 22. 
`unattended-upgrades` can OOM a memory-tight VM +A 16 GB Linux VM running backend + frontend + caddy holds a steady-state working set in the 10-13 GB range. The Debian/Ubuntu nightly `apt-daily-upgrade.timer` forks a transient 1-2 GB downloader on top of that, which can trip an OOM kill that wedges the kernel (sshd dies; needs a VM reset). The mitigation is to `systemctl mask apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service` on the host and re-assert it on every restart so a re-image / apt-reinstall can't silently re-enable them. Trade-off: no automatic security patching — patch manually on a planned maintenance window with the backend container stopped. **If you provision a VM with more RAM, you may safely re-enable upgrades.** ## AI Agent Directives @@ -385,6 +492,18 @@ These apply to every change, regardless of scope. 10. **Keep Python imports at module level.** Conditional mid-function imports trigger `UnboundLocalError` (Trap #2). 11. **Run `ruff format` before committing** (or rely on `make ci`). +### Secrets & sensitive data + +12. **Scan for committed secrets BEFORE every commit.** The repo has a `secret-scan` Makefile target (gitleaks) that's wired into both `make ci` and the pre-commit hook (`.pre-commit-config.yaml`). Either run pre-commit (`uv run pre-commit run --all-files`) or `make secret-scan` before pushing. CI also runs it (`.github/workflows/ci.yml`) and will fail the build, but catching it locally is faster. +13. **Allowlist suppression order** when a legitimate placeholder trips the scanner: + - **Inline** (single line): append `# gitleaks:allow` to the offending line. Cheapest for a one-off test fixture. + - **Fingerprint** (one-off historical): add the finding's `{file}:{rule-id}:{commit}:{secret-hash}` line to `.gitleaksignore` at repo root. + - **Path** (entire file or directory): add a regex to the `[allowlist] paths` array in `.gitleaks.toml`. Use this when adding a new directory of test fixtures. +14. 
**Never commit a real credential to suppress the scanner.** Keeping real credentials out of the tree is the entire point of the gate. If secret-looking material legitimately needs to live in the tree (e.g. an SSH public host key used as a trust anchor), document why in a comment adjacent to the allowlist entry and explain why exposure is intentional. +15. **Never put real customer values in code, scripts, tests, or docs.** This includes Fastly service IDs (use a placeholder such as `YOUR_SERVICE_ID` or `${FASTLY_SERVICE_ID:?}` env vars in scripts), bucket names, real domains, real IPs (Fastly edge ranges are fine — they're published), real email addresses (use `you@example.com`), or screenshots that show the above. Test fixtures use placeholders (`TestLogSvcABC123`, `FAKE_TOKEN`, `"FROM_CONFIG"`). Real deployment values come from env vars / per-host config that's gitignored. +16. **Files that must never be committed** (covered by `.gitignore` — verify before any new directory of generated content lands): + - `.env` (real env), everything under `configs/` except the pinned `configs/ssh_known_hosts`, `data/system/` (real SSH key + share DB), `.scoring/` (per-deployment AES keys), `tests/fixtures/scoring/` (real prod traces). The `.gitleaks.toml` allowlist also covers the `configs/*.json` and `data/` entries so a working-tree (`--no-git`) scan stays clean for ad-hoc local runs. + ### Provisioning Wizard 12. The token entered in step 2 must be threaded to any API call needing Fastly credentials (including the NGWAF workspace fetch). Don't rely on stored-config fallback alone. diff --git a/CHANGELOG.md b/CHANGELOG.md index 26cb882e..a912a4b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,116 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1.0] - 2026-06-03 + +Edge session scoring. 
Every request is classified in real-time at the edge by a Fastly Compute service that runs an L1 (cookie compliance + timing rules) + L2 (PageRank-trained transition matrix) scorer, returning a combined 0-100 score that lands in DuckDB for analyst review. Operators can label sessions, watch live ROC-AUC, retrain the matrix, roll back to a prior matrix, rotate the AES cookie key, and push a hard enforcement threshold that rejects flagged requests at the edge with an operator-chosen HTTP status code (default 429). + +### Highlights + +- **Edge scoring** — Fastly Compute scorer + 6-snippet VCL preflight pattern (recv/pass/fetch/deliver/miss/enforce), AES-GCM-encrypted session cookie carrying rotating sid + transition state, `fastly.ddos_detected` gate so Compute is bypassed under L7 attack. +- **Admin UI** at `/admin/session-scoring` — StatusPanel with live AUC against accumulated labels, ScoringHealthCard with fire rate / score distribution / top reasons / matrix-staleness alert, ThresholdSlider with counterfactual flag/pass preview + precision/recall + commit-threshold persistence, RocPrCurves with ROC + Precision-Recall plots, TopFlaggedTable + LabelsTab with click-to-view-events per sid, RetrainButton (DuckDB traces → train.py → publish matrix to FOS), SinceHoursPicker driving all six cards on one shared time window. +- **Labels CRUD** — POST/PATCH/DELETE per-sid labels (good/bad/neutral) feed `evaluate_from_persisted_scores` to compute live ROC-AUC. Min-samples gate (≥3 per class) prevents noisy display. +- **ROC + PR curves** + per-reason AUC breakdown (split by L1/L2 rule: cookie-missing, impossibly-fast, robotic-consistency, rare-transition, low-transition-prob). +- **Composite `/scoring/dashboard`** endpoint collapses the 8 per-card requests into one in-flight-collapsed payload; the existing per-card endpoints stay mounted for back-compat. 
+- **`edge_score_reason` virtual field** — CSV-split via DuckDB `unnest(string_split(...))`, top-N cards + click-to-filter same as NGWAF signals. +- **FOS matrix persistence** — `enable_scoring` publishes the trained matrix to FOS; backend auto-fetches on startup (no more per-host scp). +- **Matrix version history + rollback** — every publish snapshots the prior matrix to `iceberg/meta/scoring_matrix_history/{version}.json`; new `/scoring/matrix-versions` lists them and `/scoring/matrix-versions/{v}/restore?confirm=true` copies a historical matrix back. AUC reflects the rollback immediately; Wasm at edge keeps the embedded matrix until `deploy_wasm.sh` re-runs (deploy_hint surfaced). +- **Threshold enforcement (live blocking)** — operator commits a threshold, scorer reads it from `scoring_config` ConfigStore, emits `X-Edge-Score-Enforce: 1` when score≥threshold, the new `Session Scoring - Enforce` VCL snippet rejects those requests on the post-scoring restart. Effective at the edge within seconds. Confirm-dialog-gated PUT endpoint + LIVE warning chip in the slider UI. The response code defaults to 429 (Too Many Requests) and is operator-overridable per-service via a new `Enforce response code` selector (403 / 429 / 451 / 503; backend accepts any 4xx/5xx) — picks land via a focused `update_enforce_status_code` orchestrator that swaps only the enforce snippet (~5–10s end-to-end vs. the full enable_scoring flow). Audit-logged as `scoring_enforce_status_code_changed`. +- **URL exclusion regex override** — operator-tunable per-service regex for "which URLs bypass the scorer". Defaults to the built-in static-asset extension list; the new `ExcludeRegexCard` on the Session Scoring page accepts a custom regex (e.g. exclude `/healthz`, exclude entire path prefixes, scope scoring to specific traffic). 
The PUT endpoint validates input through three layers before any VCL ships: (1) input policy — length cap, no quote / control chars, must compile under Python's `re`; (2) [falco](https://github.com/ysugimoto/falco) static analysis on the assembled recv snippet (catches regex+VCL composition errors that slip past Python's compiler); (3) Fastly's own VCL compiler at activate time. A focused `update_recv_exclusion_regex` orchestrator clones the active version, swaps only the recv snippet, and activates — ~5–15s end-to-end vs. the full enable_scoring flow. Confirm-dialog-gated. Audit-logged as `scoring_exclude_regex_changed`. Falco shipped in the backend Docker image; production sets `SCORING_REQUIRE_FALCO=1` so a missing binary fails closed instead of degrading to input-policy-only. +- **AES key rotation** — `POST /scoring/rotate-key` mints a fresh 32-byte key, moves the prior to `previous_key_hex` (grace slot — Rust cookie codec falls back to it so in-flight cookies keep decoding through one rotation cycle). +- **Cookie lifecycle bounds** — `SESSION_IDLE_EXPIRE_S` (30 min) + `SESSION_HARD_CAP_S` (24h) in the Rust scorer mint a fresh sid when either threshold is exceeded. Stolen cookies can't replay beyond their window; long-running sessions stop biasing the L1 variance estimator. +- **Per-reason AUC breakdown UI** — `PerReasonAucCard` renders AUC split by which L1/L2 rule fired (cookie-missing, impossibly-fast, robotic-consistency, rare-transition, low-transition-prob). +- **Operator audit log** — new `scoring_audit` table + `/scoring/audit` endpoint records every scoring_enabled, scoring_disabled, threshold_committed/cleared/enforced, matrix_retrained/restored, key_rotated event with actor + timestamp + details. Per-host, never mirrored via state_sync. 
+ +### Reliability + +- **Cron-progress reliability** — `end_progress` auto-emits `done` when the last event isn't terminal; `list_active_runs` triple-guards (last-event filter + 5-min staleness + DB-status cross-check via `get_cron_run_status`); `reap_zombie_runs` called from every cron-tick cleanup. Fixed a production incident where 382 stale "sync" entries piled up on the System Health card. +- **state_sync merge guards** — `import_admin_state` no longer overwrites scoring `custom_fields` with stale FOS payloads (root cause of a production data-loss incident); sibling fixes in `cli.handle_update_logs`, `provision.write_service_config`, and `api_service_log_fields_set` close every "remote-overwrites-code-managed-state" path. +- **Defense-in-depth** — `enable_scoring` rollback + `disable_scoring` final-save reload cfg right before writing to close the 30-120s race window where concurrent writers got clobbered. +- **Per-key in-flight collapse** in `_cached` so the dashboard's 8-card mount no longer queues queries behind one global lock. + +### Performance + +- `security/top-bots` consolidated UA + NGWAF onto one temp table (was 2 independent Iceberg scans per dashboard mount). +- `dashboard/raw` uses `get_source_extent` for cached steady-state extent. +- `usage/prefill` cached-status fast path skips DuckDB hop when the sync cron has populated it. +- `get_enriched_services` 60s TTL cache on the recursive cache-dir `scandir` (was 200-1500ms per `/api/bootstrap`). +- `loading.tsx` Suspense skeletons + dynamic imports (LabelsTab, ChoroplethMap) cut admin-page click lag. + +### Cleanup + +- Dropped dead `@daypicker/react` dep + dead `frontend/components/ui/calendar.tsx`. +- Collapsed 7-site `cleanup_progress + reap` boilerplate into `cleanup_progress_and_reap()` helper. +- Refactored `security.py`'s ad-hoc temp-table to use the existing `QueryRunner.temp_table()` context manager. 
+- Narrowed `get_cron_run_status` exception scope to `sqlite3.Error` with DEBUG log so future triage isn't flying blind. + +### Security + +Capability-focused hardening across the FastAPI backend, Fastly VCL, Next.js frontend, and Rust scorer. All changes deployed and verified. + +- **Trust-boundary normalisation**: + - uvicorn runs with `--proxy-headers --forwarded-allow-ips=127.0.0.1` so `request.client.host` is the real client IP via Caddy's authoritative XFF rewrite. + - `is_request_remote()` reads `request.client.host` instead of the forgeable Host header; in-app leftmost-XFF parsing is gone. + - Caddyfile gates `Fastly-Client-IP → X-Forwarded-For` rewrite on `remote_ip` matching Fastly edge ranges. Startup assertion on `TRUSTED_PROXY_IPS` / `UVICORN_FORWARDED_ALLOW_IPS` + integration test prevent silent regression. + - Next.js `/admin` middleware gates on the Caddy-injected `X-Proxied-By-Caddy: true` marker instead of the forgeable Host header. +- **Destructive-op auth**: + - `/api/provision/teardown` validates a caller-supplied Fastly token via `/tokens/self` for the `global` scope before any destructive op; never falls back to server-stored credentials. Frontend TeardownDialog prompts admin for the token. + - `/api/provision/ngwaf-workspaces` token-gated (constant-time stored-key match OR validated `global`-scope token); NGWAF workspace mutation enforces analyst-session scope. +- **DuckDB user-SQL safety**: + - New `backend/utils/sql_validator.py` enforces a statement-type whitelist + recursive parse-tree walker with catalog blocklist (`duckdb_*` / `pg_*` prefixes, `information_schema` / `pg_catalog` / `system` schemas, non-`main` catalogs) + function denylist (`read_csv` / `read_parquet` / `iceberg_scan` / `glob` / `lsdir` / `getenv` / `current_setting` / `duckdb_secrets` / postgres / sqlite / mysql scanners) + fail-closed parse + audit logging + perf budget. 
Replaces a regex-based blocklist that missed `read_csv_auto`, `information_schema`, `duckdb_secrets`, `INSTALL/LOAD`, and `getenv`. + - `escape_sql_literal` helper applied at four ingest call sites; characterisation tests cover the PoC payload + multi-byte UTF-8 + backslash + empty + long-with-many-quotes. + - `time_range` validated via `dateutil.isoparse` before SQL interpolation. + - `get_con` / `get_meta_con` dropped the auto-query-param `read_only` flag. +- **VCL header & cache discipline**: + - `vcl_recv` preamble unsets every internal `x-of-*` / `x-fos-edge-data` / `x-is-cluster-fetch` / `X-Edge-*` header on the inbound request. + - Origin-metric VCL fields: numeric regex gates + `json.escape` on string values (log-injection). + - VCL ua/referer keeps its `substr` cap. + - Fastly `vcl_hash` now keys on the full `req.url` (path + query), not just `req.url.path` — closes cross-query cache poisoning. Auth `key` querystring is already stripped earlier so no secrets leak into cache keys. +- **Cross-tenant scope enforcement**: + - `/api/alerts/*` and `/api/views/*` enforce analyst-session scope on every read and mutation; pre-flight scope check on PATCH / DELETE via new `get_alert_by_id` / `get_view_by_id` helpers so unauthorised mutations never land. + - `/api/sources`, `/api/log-fields/catalog`, NGWAF workspace listing — analyst-scope filtering. + - Cache-layer audit confirmed every per-tenant cache (`session_scoring._cached`, iceberg, bot_sources) includes `service_id` in the key. +- **Path-traversal cages**: + - `/api/download` path traversal: `realpath` + `commonpath` cage. + - Cache cleanup rejects bucket separators + `realpath` cage. + - `service_id` alphanumeric/dash/underscore validation in path helpers. +- **Secret & data hygiene**: + - `claim_token` TOCTOU → atomic UPDATE with rowcount check. + - `share_db` quarantine narrowed to actual SQLite corruption signatures (was wiping the DB on transient `OperationalError`). 
+ - Email-enumeration timing equalised via dummy scrypt on miss. + - `validate_session` re-syncs `pii_policy` / window / `service_ids` on every call so admin permission edits take effect immediately. + - `_StaticAssetLimiter` bounded at 10 k tracked IPs. + - `logging-settings/update` moved GET → POST/PATCH (CSRF). + - `query_errors` decorator logs traceback server-side, never in the response body; sweep fixture asserts no `trace` key leaks from any route. +- **SSH host-key pinning**: `configs/ssh_known_hosts` pinned, source-controlled, and gitignore-excepted; tunnel manager refuses to start when the file is missing (fail-safe; no TOFU fallback). +- **Scorer signal tightening**: Python + Rust parity — `L1_SCORE_COOKIE_TAMPERED = 100` (was capped at 75 with missing/expired); `L1_ROBOTIC_DWELL_LOW_S 0.5 → 0.20` (closes the 0.20s–0.50s robotic-bot threshold gap). A sliding-window mean is a tracked follow-up (needs cookie-schema v3); until it lands, partial mitigations — `SESSION_IDLE_EXPIRE_S=30 min` + `SESSION_HARD_CAP_S=24h` + session-max scoring — bound the practical attack window. + +### Tests + +- 3070 backend tests +- 65 scorer Rust tests (+8) +- 265 frontend vitest tests (+13) +- `make ci` green: lint + format + mypy + pytest + vcl-test + verify-deps + typecheck-frontend + test-frontend + osv. + +### Infrastructure + +- Backend Docker image: `python:3.12-slim-bullseye` → `python:3.12-slim-bookworm` (cuts CVE-laden Debian 11 base; remaining 13 high CVEs are deep-dependency / OpenSSL CVEs every major Python base inherits). Frontend image's api-schema stage bumped to match. +- Backend image now ships [`falco`](https://github.com/ysugimoto/falco) v2.3.0 (Fastly VCL static analyser) — required by the scoring-recv-snippet validator. 
+- **Secret scanning** — [`gitleaks`](https://github.com/gitleaks/gitleaks) v8.30.1 wired in three places: `.pre-commit-config.yaml` (blocks accidentally-staged credentials at commit time), `make secret-scan` Makefile target chained into `make ci`, and a dedicated step in `.github/workflows/ci.yml` (fails the build on any non-allowlisted finding). Configuration in `.gitleaks.toml` extends the built-in ruleset and adds path allowlists for tracked test fixtures, Rust lockfile checksums, the public SSH host key, and (for working-tree-only scans) the gitignored real-config / `.next/` / `data/system/` directories. Verified clean against the full branch history. Policy + suppression playbook documented in **AGENTS.md** §Secrets. +- **CDN cache-key hardening** — `backend/core/fastly/utils.py` `vcl_recv` now runs `querystring.filter_except` to drop all non-S3-API query parameters (caller-injected tracking params, marketing UTMs, session IDs) BEFORE the cache lookup, followed by `querystring.sort` to canonicalise the remaining param order. Composes with the `vcl_hash` fix: untrusted params can no longer fracture the cache OR leak the auth `key` into the cache key. +- Dependency freshness sweep on all four ecosystems: + - **Python:** `aiohttp 3.13.5 → 3.14.0`, `cfn-lint 1.51.2 → 1.51.4`, `distlib 0.4.0 → 0.4.1`, `filelock 3.29.0 → 3.29.1`, `idna 3.17 → 3.18`, `joserfc 1.6.8 → 1.7.0`. + - **Frontend:** `@tanstack/react-query 5.100.14 → 5.101.0` (+ devtools), `@types/react 19.2.15 → 19.2.16`, `eslint-config-next 16.2.6 → 16.2.7`, `next 16.2.6 → 16.2.7`, `react/react-dom 19.2.6 → 19.2.7`. + - **Rust:** `bitflags 2.11.1 → 2.12.1`. + - **Deferred (major bumps reserved for 1.2):** TypeScript 5.9 → 6.0 (compiler-API breaking changes); Fastly Rust SDK 0.11 → 0.12 (Compute@Edge API changes); jsdom / eslint / vitest where we're already ahead of the npm "latest" tag. + +### Known limitations + +- Rate limiting at the edge is NOT included. 
The DDoS gate (`fastly.ddos_detected`) handles attack-scale traffic by bypassing Compute; sustained-low-rate abuse is left to the operator's existing WAF/NGWAF policies. A future rate-limiting feature is tracked separately. +- When a matrix is rolled back via the UI, the edge Wasm continues to use its embedded matrix until `scripts/scoring/deploy_wasm.sh` re-runs. The Restore endpoint returns a `deploy_hint` with the exact command. See `docs/session_scoring_runbook.md`. + +[1.1.0]: https://github.com/fastly/fastly-log-analytics/releases/tag/v1.1.0 + ## [1.0.0] - 2026-06-01 Initial public release. Self-hosted dashboard for searching, filtering, and visualizing request-level Fastly logs streamed to Fastly Object Storage. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 922c88b2..d9e67b1d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,6 +8,18 @@ Contributions are welcome — bug reports, feature requests, and pull requests a - Keep pull requests focused. One feature or fix per PR. - Make sure the project builds and runs before submitting. +## Rust scorer prerequisites + +The session scoring Compute@Edge service (`compute/scorer/`) requires: + +- Rust 1.90+ (pinned in `compute/scorer/rust-toolchain.toml`) +- `wasm32-wasip1` target: `rustup target add wasm32-wasip1` +- [viceroy](https://github.com/fastly/Viceroy) (Fastly's local Compute runtime) — optional, only needed for running the scorer locally +- The scorer is rebuilt and deployed via: + ``` + scripts/scoring/deploy_wasm.sh --service-id SERVICE_ID --token FASTLY_API_TOKEN + ``` + ## License This project is licensed under the [Apache License 2.0](LICENSE). By submitting a pull request, you agree that your contribution will be licensed under the same terms. diff --git a/Caddyfile b/Caddyfile index f6d202bc..bb149eb2 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,15 +1,30 @@ # Caddyfile — plain HTTP origin behind Fastly, hardened. # # Fastly terminates TLS at the edge and proxies to this VM on port 80.
-# GCP firewall locks port 80 to Fastly's IP ranges. Backend and frontend -# bind to 127.0.0.1 only (host network mode) — Caddy reaches them on -# loopback, nothing else can. +# Backend and frontend bind to 127.0.0.1 only (host network mode) — Caddy +# reaches them on loopback, nothing else can. +# +# Trust topology (security #013/#029/#032/#034 and extra E1): +# The site block uses request_header to rewrite X-Forwarded-For +# from the Fastly-Client-IP header. That trust is only valid for +# requests that actually came through Fastly — anyone connecting +# directly to port 80 can set Fastly-Client-IP to whatever they want. +# The @from_fastly_v4 remote_ip matcher gates the header rewrite on the +# TCP peer being inside Fastly's published edge ranges. Direct callers +# skip that override, so request.client.host in uvicorn comes +# from their real (untrusted) peer IP and the IP-based gates kick in. # # Routing: # /api/* → backend directly (preserves Host header so the backend's # DNS-rebinding gate matches the registered public_endpoint; # peer = 127.0.0.1 from Caddy in host net mode). # else → Next.js frontend. +# +# Note on edge IP list maintenance: +# The Fastly CIDRs below are the published v4 ranges as of 2026-06-03 +# (https://api.fastly.com/public-ip-list). When Fastly adds a new edge +# range, refresh this list. A stale list means legitimate traffic from a +# new POP is treated as direct (untrusted) until the list is updated and Caddy reloaded. { # No auto-HTTPS — Fastly handles TLS termination at the edge. @@ -60,19 +75,48 @@ } } + # Defense in depth (extra E1): replace any client-supplied X-Forwarded-For + # with Caddy's authoritative view of the TCP peer. Then, only when the + # TCP peer is a Fastly edge IP, override with Fastly-Client-IP. + # + # Non-Fastly direct caller: XFF = their real peer IP. uvicorn (with + # --proxy-headers --forwarded-allow-ips=127.0.0.1) sees Caddy at the + # loopback peer and trusts XFF, so request.client.host = the real + # attacker IP.
Backend's DNS-rebinding and remote-host checks then + # fire correctly instead of misclassifying as admin. + # Fastly-edge caller: the second directive overrides XFF with the + # client IP that Fastly's edge recorded and attached. + request_header X-Forwarded-For {http.request.remote.host} + + # Caddy-injected internal proxy marker (security #032): the frontend + # middleware blocks /admin requests when this header is present, while + # direct SSH-tunnel admin connections (which bypass Caddy) have no + # such header and reach the admin surface. Set unconditionally — there + # is no legitimate reason for a client to send this header itself. + request_header X-Proxied-By-Caddy "true" + + # Named matcher: TCP peer is an actual Fastly edge IP. + @from_fastly_v4 { + remote_ip 23.235.32.0/20 43.249.72.0/22 103.244.50.0/24 103.245.222.0/23 103.245.224.0/24 104.156.80.0/20 140.248.64.0/18 140.248.128.0/17 146.75.0.0/17 151.101.0.0/16 157.52.64.0/18 167.82.0.0/17 167.82.128.0/20 167.82.160.0/20 167.82.224.0/20 172.111.64.0/18 185.31.16.0/22 199.27.72.0/21 199.232.0.0/16 + } + + # When AND ONLY WHEN the request came from a Fastly edge, propagate the + # authoritative Fastly-Client-IP as X-Forwarded-For. Requests bypassing + # Fastly retain the {http.request.remote.host} XFF set above (their real TCP peer), + # so a direct port-80 attacker cannot spoof their source IP regardless + # of what Fastly-Client-IP value they send. + request_header @from_fastly_v4 X-Forwarded-For {http.request.header.Fastly-Client-IP} + + # API → backend (preserve Host so backend's DNS-rebinding gate matches the # registered public_endpoint). @api path /api/* reverse_proxy @api 127.0.0.1:8000 { flush_interval -1 - # Replace X-Forwarded-For with Fastly's authoritative client-IP header. - header_up X-Forwarded-For {http.request.header.Fastly-Client-IP} } # Everything else → Next.js frontend.
reverse_proxy 127.0.0.1:3000 { flush_interval -1 - header_up X-Forwarded-For {http.request.header.Fastly-Client-IP} } # Detailed access log: JSON format with every request's client IP, host, diff --git a/Makefile b/Makefile index 1223acc8..fb9c995c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: test lint format typecheck ci install install-hooks clean gen-types verify-deps +.PHONY: test lint format typecheck ci install install-hooks clean gen-types verify-deps secret-scan osv outdated # Prevent a VIRTUAL_ENV from another project leaking into uv commands unexport VIRTUAL_ENV @@ -57,6 +57,28 @@ outdated: osv: uv run python scripts/check_osv.py +# Secret scanner — gitleaks, configured via .gitleaks.toml at repo root. +# Scans git history by default (no `--no-git`), so any committed credential +# trips the gate even if later removed. Use `gitleaks detect --no-git` +# locally to also scan the working tree (catches secrets in untracked / +# unstaged files before you accidentally `git add` them). +# +# Suppression mechanisms in increasing scope: +# - inline `#gitleaks:allow` on the offending line +# - .gitleaksignore — fingerprint list for one-off historical findings +# - .gitleaks.toml [allowlist] paths — for whole files / directories +# +# Skips cleanly with a loud warning if the binary isn't on PATH. Production +# CI installs it via curl in .github/workflows/ci.yml (same pattern as falco). +secret-scan: + @if command -v gitleaks > /dev/null; then \ + gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1; \ + else \ + echo "⚠️ Skipping secret-scan: gitleaks not on PATH."; \ + echo " Install: brew install gitleaks (or see https://github.com/gitleaks/gitleaks#installing)"; \ + echo " Pre-commit + CI install it automatically — local dev is recommended."; \ + fi + # Verify package.json + package-lock.json resolve cleanly under `npm ci`. 
# Local `make ci` previously used the already-installed node_modules and # silently tolerated peer-dep conflicts that would break GitHub Actions @@ -71,7 +93,19 @@ vcl-test: echo "Skipping VCL tests: falco linter not found in PATH"; \ fi -ci: lint format-check typecheck test vcl-test verify-deps typecheck-frontend test-frontend osv outdated +# Run the underlying targets in parallel with a -j2 cap. Backend pytest +# (~26s) and frontend vitest (~35s) are the two long poles; running them +# concurrently saves ~25-30s wall vs. sequential, and the -j2 cap keeps +# them from oversubscribing the box (both invocations already parallelise +# internally via pytest-xdist / vitest workers). +# +# Order matters here — make's scheduler picks leftmost-available targets +# first, so the slow ones (`test`, `test-frontend`) are listed first to +# claim the two parallel slots immediately. Lighter checks fill in as +# slots free up. +ci: + @$(MAKE) -j2 test test-frontend typecheck-frontend lint format-check typecheck vcl-test verify-deps secret-scan osv + clean: find . 
-type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true diff --git a/README.md b/README.md index d4d36c93..2000e9a5 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ You'll need: - **Docker** (recommended) — or Python 3.10+ and Node.js 24+ for a manual install - *Optional:* a Fastly API token with the **Billing** permission to power the [Usage & Cost page](docs/features.md#usage--cost-page) - *Optional:* [`falco`](https://github.com/ysugimoto/falco) to validate VCL during provisioning (highly recommended; the app degrades gracefully without it) +- *Optional:* **Rust 1.90+** with the `wasm32-wasip1` target (`rustup target add wasm32-wasip1`) — only needed if you plan to rebuild the [Session Scoring](docs/session_scoring_runbook.md) Compute Wasm scorer from source --- @@ -94,6 +95,7 @@ You run the application as a central web-accessible server (either on a dedicate - **Log field configuration** — built-in field groups (HTTP, network, geo, TLS, NGWAF) plus custom VCL expressions - **Alerts** — threshold-based, webhook-delivered - **Live dashboard sharing** — three modes (SSH tunnel, your own hostname, your own IP) with per-analyst passcode invites, IP allowlisting, and instant revoke +- **Session scoring** — edge-computed 0-100 risk score per request combining cookie/timing signals with a PageRank transition matrix, with live threshold enforcement, audit logging, key rotation, and matrix version history. See the [runbook](docs/session_scoring_runbook.md) and [feature reference](docs/features.md) See [docs/features.md](docs/features.md) for the full feature reference. 
diff --git a/backend/Dockerfile b/backend/Dockerfile index d5e3f209..d417d2ee 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,5 +1,5 @@ # --- Build Stage --- -FROM python:3.12-slim-bullseye AS builder +FROM python:3.12-slim-bookworm AS builder # Install build dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -23,7 +23,7 @@ COPY pyproject.toml uv.lock ./ RUN uv sync --no-dev --frozen --no-install-project # --- Production Stage --- -FROM python:3.12-slim-bullseye AS runner +FROM python:3.12-slim-bookworm AS runner # Set environment variables ENV PYTHONUNBUFFERED=1 \ @@ -34,11 +34,34 @@ ENV PYTHONUNBUFFERED=1 \ WORKDIR /app -# Install runtime dependencies (curl for healthcheck) +# Install runtime dependencies (curl for healthcheck, tar for falco install). RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ + tar \ && rm -rf /var/lib/apt/lists/* +# Falco — Fastly VCL static analyser (github.com/ysugimoto/falco). +# Used by backend.utils.vcl_validator to lint the scoring recv snippet +# before publishing a custom URL-exclusion regex. Required in +# production: a missing binary triggers the validator's hard-fail +# branch when SCORING_REQUIRE_FALCO=1 is set on the backend service. +# +# Pin to a specific release so a future falco-side change doesn't +# surprise us; bump when you actively want a new version. 
+ARG FALCO_VERSION=2.3.0 +RUN set -eux; \ + arch="$(dpkg --print-architecture)"; \ + case "$arch" in \ + amd64|arm64) falco_arch="$arch" ;; \ + *) echo "unsupported arch: $arch" >&2; exit 1 ;; \ + esac; \ + url="https://github.com/ysugimoto/falco/releases/download/v${FALCO_VERSION}/falco-linux-${falco_arch}.tar.gz"; \ + curl -fsSL -o /tmp/falco.tar.gz "$url"; \ + tar -xzf /tmp/falco.tar.gz -C /usr/local/bin/; \ + chmod +x /usr/local/bin/falco; \ + rm /tmp/falco.tar.gz; \ + falco --version + # Copy the virtual environment from the builder COPY --from=builder /app/.venv /app/.venv @@ -48,6 +71,15 @@ COPY backend/ ./backend/ COPY pyproject.toml README.md uv.lock ./ COPY scripts/generate_openapi.py scripts/ +# Include the default empty scoring matrix. The trained matrix.json is +# a build artifact (gitignored, produced by scripts/scoring/train.py) +# so we can't bake it in at image build time. The backend's _load_matrix() +# prefers matrix.json if present (operator dropped it in via a volume +# mount or post-build copy) and falls back to matrix.default.json so +# the /scoring/evaluation endpoint returns a meaningful "no signal" +# response instead of erroring out. +COPY compute/scorer/matrix.default.json ./compute/scorer/matrix.default.json + # Expose the backend port EXPOSE 8000 diff --git a/backend/config.py b/backend/config.py index 11644fd5..2d4e72ea 100644 --- a/backend/config.py +++ b/backend/config.py @@ -28,6 +28,7 @@ import json import os +import re import sys import tempfile import threading @@ -76,11 +77,46 @@ def _ensure_dirs(): _ensured_dirs.add(d) +_SERVICE_ID_RE = re.compile(r"^[A-Za-z0-9_-]+$") +_SERVICE_ID_MAX_LEN = 64 + + +def _validate_service_id(service_id: str) -> str: + """Security: defense in depth against path traversal in any helper + that builds a path from ``service_id``. 
+ + Real Fastly service IDs are opaque 22-char alphanumeric strings, but the + test suite and a handful of legacy provisioning paths use hyphenated + IDs like ``svc-1`` / ``test-service-id``. The regex therefore accepts + ``[A-Za-z0-9_-]+`` — every character allowed is safe inside a filename + and contains no path-separator / dot / null-byte. Without this, + ``service_id="/etc/passwd"`` or ``service_id="../../tmp/x"`` would + compose with ``pathlib`` semantics — absolute paths discard the base + entirely, relative ``..`` traverses out, and ``\\x00`` truncates on + some kernels. + + Length cap (64) is well above the longest legitimate Fastly ID (22) + and bounds memory in error-logging paths. + """ + if not isinstance(service_id, str): + raise ValueError(f"invalid service_id type {type(service_id).__name__}: must be str (security)") + if not service_id or len(service_id) > _SERVICE_ID_MAX_LEN: + raise ValueError( + f"invalid service_id length {len(service_id) if service_id else 0}: " + f"1..{_SERVICE_ID_MAX_LEN} characters required (security)" + ) + if not _SERVICE_ID_RE.match(service_id): + raise ValueError(f"invalid service_id {service_id!r}: must be alphanumeric / dash / underscore (security)") + return service_id + + def config_path(service_id: str) -> Path: + _validate_service_id(service_id) return CONFIGS_DIR / f"{service_id}.json" def duckdb_path(service_id: str) -> str: + _validate_service_id(service_id) return str(SERVICES_DATA_DIR / f"{service_id}.duckdb") @@ -91,8 +127,18 @@ def load_config(service_id: str) -> dict | None: result (e.g. update_status) won't poison the cache. The on-disk file is revalidated via st_mtime_ns, so external edits and save_config writes are picked up on the next call without explicit invalidation. + + Returns ``None`` (not a raised exception) for invalid service IDs — + several call sites pass unsanitized input (e.g., a stale URL param, + an iteration over a stale config list) and rely on the None response + to mean "no config". 
Security's validation in ``config_path`` is + still what blocks the actual path-traversal attack; this just makes + the helper friendlier at call sites that don't pre-validate. """ - path = config_path(service_id) + try: + path = config_path(service_id) + except ValueError: + return None try: mtime_ns = path.stat().st_mtime_ns except FileNotFoundError: diff --git a/backend/core/data_migrations.py b/backend/core/data_migrations.py new file mode 100644 index 00000000..db9a1438 --- /dev/null +++ b/backend/core/data_migrations.py @@ -0,0 +1,155 @@ +"""Data-migration framework for per-service one-time setup tasks. + +Background — why a second migration system? + ``backend.core.sqlite_migrations`` already exists for SCHEMA changes + (CREATE TABLE / ADD COLUMN) on the per-service metadata.db. Those run + synchronously inside ``_init_schema``, must be transactional, and are + cheap — a fresh DB has the latest ``_SCHEMA`` and migrations are + no-ops on it. + + Data migrations are different: long-running, non-transactional setup + work that touches state OUTSIDE the metadata.db (e.g. the rollups + parquet files under ``/rollups/``). The rollups initial + backfill on a service with months of data can take many minutes; we + cannot block FastAPI startup behind it (containerised deploys kill + the boot loop on healthcheck timeout). + +Design: + * ``MIGRATIONS: list[Migration]`` — ordered registry, append-only. The + list order IS the run order. + * A row in the per-service ``applied_data_migrations`` table marks a + migration as done. Failed migrations leave NO row and retry on the + next boot. + * ``run_pending(service_id, source)`` diffs the registry against the + table, spawns ONE daemon thread per service to run the unapplied + migrations in sequence. Across services they parallelise. + * Each migration is a pure function ``(service_id, source) -> str | None``. + The return string is recorded in the ``notes`` column for audit. 
+ Exceptions bubble up to the runner, which logs + skips the row write. + +Adding a migration: + 1. Write an idempotent function ``def _migrate_(service_id, + source) -> str | None:`` somewhere appropriate (typically in the + module that owns the affected data — e.g. rollups migration lives + in ``backend.core.rollups``). + 2. Append ``Migration(...)`` to ``MIGRATIONS`` below with a stable + date-prefixed name (``"YYYY-MM-DD_short_description"``). + 3. The next service-boot picks it up automatically. No manual run- + once script needed. + +What this is NOT: + * Not a schema migration tool — use ``sqlite_migrations.py`` for DDL. + * Not a transactional system — individual migrations should write + their own progress markers (per-field stamps, etc.) so a crash + mid-run can be detected and partial work resumed on next attempt. +""" + +from __future__ import annotations + +import logging +import threading +import time +from collections.abc import Callable +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class Migration: + name: str + description: str + fn: Callable[[str, dict], str | None] + + +def _rollups_initial_backfill(service_id: str, source: dict) -> str | None: + """Build the initial hourly top-N rollups for the dashboard. + + Idempotent: ``ensure_field_backfills`` checks per-field markers in + ``/rollups/backfill_markers.json`` and only re-runs the COPY + for fields without a marker. Safe to retry after a crash. + """ + from backend.core import rollups + + rollups.ensure_field_backfills(service_id, source) + return "rollups: ensure_field_backfills complete" + + +# Ordered registry. Append-only — never remove or reorder entries. +# Names must be globally unique and stable; the DB matches by name. 
+MIGRATIONS: list[Migration] = [ + Migration( + name="2026-06-04_rollups_initial_backfill", + description="Build initial hourly top-N rollups for dashboard top-N queries", + fn=_rollups_initial_backfill, + ), +] + + +def list_pending(service_id: str) -> list[Migration]: + """Return registered migrations that haven't been applied to this service.""" + from backend.core import metadata_db + + applied = metadata_db.list_applied_data_migrations(service_id) + return [m for m in MIGRATIONS if m.name not in applied] + + +def run_pending(service_id: str, source: dict) -> None: + """Spawn a daemon thread that runs pending data migrations sequentially. + + Returns immediately — does not block the caller. Per-service threads + are independent, so several services with pending migrations apply + in parallel; within a single service the migrations run in registry + order. + """ + pending = list_pending(service_id) + if not pending: + return + names = [m.name for m in pending] + logger.info("[migrations] service %s: %d pending — %s", service_id, len(pending), names) + t = threading.Thread( + target=_run_sequence, + args=(service_id, source, pending), + daemon=True, + name=f"data-migrations-{service_id}", + ) + t.start() + + +def _run_sequence(service_id: str, source: dict, migrations: list[Migration]) -> None: + from backend.core import metadata_db + + for mig in migrations: + t0 = time.time() + logger.info("[migrations] %s/%s: starting — %s", service_id, mig.name, mig.description) + try: + notes = mig.fn(service_id, source) + except Exception as e: + logger.exception( + "[migrations] %s/%s: FAILED after %.2fs — will retry next startup: %s", + service_id, + mig.name, + time.time() - t0, + e, + ) + # Important: do NOT record this migration as applied. Returning + # here also halts the sequence — a later migration that depends + # on a failed predecessor must not be allowed to run. 
+ return + duration = time.time() - t0 + try: + metadata_db.record_applied_data_migration( + service_id, mig.name, duration_s=duration, status="success", notes=notes + ) + except Exception as e: + # Recording failed but the migration itself succeeded. Next boot + # will re-run it; the migration is idempotent so this is safe, + # just wasted work. Loud warning so we can spot the divergence. + logger.warning( + "[migrations] %s/%s: applied but COULD NOT RECORD (will re-run next boot): %s", + service_id, + mig.name, + e, + ) + continue + logger.info("[migrations] %s/%s: applied in %.2fs", service_id, mig.name, duration) diff --git a/backend/core/duckdb.py b/backend/core/duckdb.py index 2db7c2c7..cdeefff3 100644 --- a/backend/core/duckdb.py +++ b/backend/core/duckdb.py @@ -38,19 +38,7 @@ _ORPHAN_THRESHOLD_MINS = 5 -def _safe_iso(dt) -> str | None: - """Normalise a DuckDB datetime or string to an ISO-8601 string ending in Z.""" - if dt is None: - return None - if hasattr(dt, "isoformat"): - s = dt.isoformat() - # DuckDB TIMESTAMP is timezone-naive but always represents UTC. - # Append Z so JavaScript parses it as UTC instead of local time. - if not s.endswith("Z") and "+" not in s and s.count("-") <= 2: - s += "Z" - return s - return str(dt) - +from backend.utils.date_utils import safe_iso as _safe_iso # noqa: E402 # Cached per-process constants — computed once, reused on every connection open. _cached_n_threads: int | None = None @@ -258,28 +246,46 @@ def _configure_fos(con: duckdb.DuckDBPyConnection, source: dict): # nested in CREATE SECRET, so the keys go in as a literal SQL # fragment. Keys are a hardcoded set, never user input. hdr_map_sql = "MAP {" + ", ".join(f"'{k}': ?" 
for k in headers) + "}" - with _fos_proxy_secret_lock: - con.execute( - f""" - CREATE OR REPLACE SECRET fos_proxy ( - TYPE S3, - KEY_ID ?, - SECRET ?, - REGION ?, - ENDPOINT ?, - USE_SSL false, - URL_STYLE 'path', - EXTRA_HTTP_HEADERS {hdr_map_sql} - ) - """, - [ - source["access_key_id"], - source["secret_access_key"], - source["region"], - proxy_ep, - *headers.values(), - ], + create_secret_sql = f""" + CREATE OR REPLACE SECRET fos_proxy ( + TYPE S3, + KEY_ID ?, + SECRET ?, + REGION ?, + ENDPOINT ?, + USE_SSL false, + URL_STYLE 'path', + EXTRA_HTTP_HEADERS {hdr_map_sql} ) + """ + secret_params = [ + source["access_key_id"], + source["secret_access_key"], + source["region"], + proxy_ep, + *headers.values(), + ] + with _fos_proxy_secret_lock: + # _load_httpfs above runs INSTALL/LOAD httpfs, which starts an implicit + # transaction with a catalog snapshot taken BEFORE we acquired the lock. + # If another thread committed its own CREATE OR REPLACE SECRET while we + # were waiting, our stale snapshot trips a write-write conflict even + # though only one thread is inside this critical section. Rolling back + # discards the stale snapshot so CREATE OR REPLACE sees current catalog + # state. The retry handles the rare case where the rollback itself + # races with another commit (e.g. a third thread queued behind us). 
+ for attempt in range(3): + try: + con.rollback() + except Exception: + pass + try: + con.execute(create_secret_sql, secret_params) + break + except Exception as e: + if "write-write conflict" in str(e).lower() and attempt < 2: + continue + raise try: con.execute("SET http_timeout=60;") con.execute("SET http_retries=5;") @@ -787,14 +793,22 @@ def get_connection( global _cached_n_threads, _cached_mem_limit_gb if _cached_n_threads is None: _cached_n_threads = min(multiprocessing.cpu_count(), 8) - if _cached_mem_limit_gb is None: - try: - _total_ram = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") - _cached_mem_limit_gb = max(1, int(_total_ram * 0.6 / (1024**3))) - except (AttributeError, ValueError): - _cached_mem_limit_gb = 4 con.execute(f"SET threads = {_cached_n_threads};") - con.execute(f"SET memory_limit = '{_cached_mem_limit_gb}GB';") + # CRITICAL: only auto-derive memory_limit when DUCKDB_MEMORY_LIMIT is + # UNSET. Pre-fix, the env-based ``SET max_memory`` at line 762 was + # silently overridden here by ``SET memory_limit`` (they're aliases + # in DuckDB — the second SET wins). Container env DUCKDB_MEMORY_LIMIT=8GB + # was clobbered by ~60% of physical RAM (~9-10GB on the 16GB VM), + # leaving only ~6GB headroom for Python + pyiceberg + aiohttp + OS + + # frontend + caddy — recurring host OOM-kills followed. 
+ if not os.getenv("DUCKDB_MEMORY_LIMIT"): + if _cached_mem_limit_gb is None: + try: + _total_ram = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") + _cached_mem_limit_gb = max(1, int(_total_ram * 0.6 / (1024**3))) + except (AttributeError, ValueError): + _cached_mem_limit_gb = 4 + con.execute(f"SET memory_limit = '{_cached_mem_limit_gb}GB';") con.execute("SET checkpoint_threshold = '512MB';") # ALWAYS update the view to ensure local buffer files @@ -938,9 +952,18 @@ def log_cron_run( service_id = source["name"] cfg = svcconfig.load_config(service_id) or {} prov = cfg.get("provisioning", {}) - cron_key = "cron_sync" if task == "sync" else "cron_compact" - cron_cfg = prov.get(cron_key, {}) - log_enabled = cron_cfg.get("log_enabled", True) + # Map each cron task to the cfg block whose log_enabled flag governs it. + # Tasks not in the map always log — the prior ``"cron_sync" if task == + # "sync" else "cron_compact"`` ternary silently coupled metadata_cleanup, + # optimize, expire, full_sync, gap_heal, alerts, ngwaf_sync, etc. to + # cron_compact's log_enabled. Setting cron_compact.log_enabled=false on + # a service would suppress success rows for every task except sync. + _TASK_TO_CRON_KEY = { + "sync": "cron_sync", + "local_compact": "cron_compact", + } + cron_key = _TASK_TO_CRON_KEY.get(task) + log_enabled = prov.get(cron_key, {}).get("log_enabled", True) if cron_key else True if status == "success" and corrupt_rows and corrupt_rows > 0: status = "partial_success" @@ -1117,162 +1140,166 @@ def get_sync_status( if m: latest_ingested_file_at = f"{m.group(1)} {m.group(2).replace('-', ':').replace('.', ':')}" - table_exists = ( - con.execute("SELECT 1 FROM information_schema.tables WHERE table_name = ?", [table_name]).fetchone() is not None - ) - # The iceberg view is always the source of truth for row counts. 
# We fetch row counts and time extents if the table exists, even if skip_fos=True, # because these are derived from local metadata (Iceberg manifests) and are # relatively cheap. This allows the UI to auto-range correctly even during # lightweight status polls. + # + # The split-path query inside the try block reads parquet DIRECTLY via + # read_parquet() and doesn't need the iceberg view to exist in the + # current connection. + # This matters because sync-status opens a fresh RO connection that + # doesn't yet have the per-session view; without this, every sync- + # status poll fell through to ingested_files.row_count (which sums + # raw FOS line counts BEFORE the timestamp filter and consistently + # over-reports ~2-3×). latest_log_at = None earliest_log_at = None local_rows = local_rows_ingested - if table_exists: - try: - # Fetch row count and time extents. The view is built with - # read_parquet('cache//data/**/*.parquet') UNION ALL - # read_parquet([buffer_paths]) — DuckDB opens every parquet - # footer (~150 µs × 1.7 k data files = ~155 ms warm) plus the - # cheap buffer side. Split the query: cache the data-side - # count/min/max keyed by a data-dir mtime fingerprint (only - # changes on commit/optimize), run the buffer side fresh each - # call (~1 ms for <100 files), then merge. Cache hits go from - # ~240 ms full-view query down to ~1 ms (data cached + buffer - # query + fingerprint stat). 
- stats = None - data_fp = _data_stats_fingerprint(src) - cache_key = src["name"] - if data_fp is not None: - try: - with _data_stats_cache_lock: - cached = _data_stats_cache.get(cache_key) - if cached is not None and cached[0] == data_fp: - d_count, d_min, d_max = cached[1], cached[2], cached[3] - else: - data_glob = os.path.join(_cache_dir(src), "data", "**", "*.parquet") - d_row = con.execute( - "SELECT count(*), min(timestamp), max(timestamp) " - f"FROM read_parquet('{data_glob}', union_by_name=true, hive_partitioning=false)" - ).fetchone() - d_count = (d_row[0] or 0) if d_row else 0 - d_min = d_row[1] if d_row else None - d_max = d_row[2] if d_row else None - with _data_stats_cache_lock: - _data_stats_cache[cache_key] = (data_fp, d_count, d_min, d_max) - - from backend.core import iceberg as _ice - - buf_paths = [p for p in _ice.buffer_files(src) if os.path.isfile(p)] - if buf_paths: - paths_sql = ", ".join(f"'{p}'" for p in buf_paths) - b_row = con.execute( - "SELECT count(*), min(timestamp), max(timestamp) " - f"FROM read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)" - ).fetchone() - b_count = (b_row[0] or 0) if b_row else 0 - b_min = b_row[1] if b_row else None - b_max = b_row[2] if b_row else None - else: - b_count, b_min, b_max = 0, None, None - - mins = [m for m in (d_min, b_min) if m is not None] - maxs = [m for m in (d_max, b_max) if m is not None] - stats = ( - d_count + b_count, - min(mins) if mins else None, - max(maxs) if maxs else None, - ) - except Exception as split_err: - # Bust the data cache so we don't pin a half-built result. + try: + # Fetch row count and time extents. The view is built with + # read_parquet('cache//data/**/*.parquet') UNION ALL + # read_parquet([buffer_paths]) — DuckDB opens every parquet + # footer (~150 µs × 1.7 k data files = ~155 ms warm) plus the + # cheap buffer side. 
Split the query: cache the data-side + # count/min/max keyed by a data-dir mtime fingerprint (only + # changes on commit/optimize), run the buffer side fresh each + # call (~1 ms for <100 files), then merge. Cache hits go from + # ~240 ms full-view query down to ~1 ms (data cached + buffer + # query + fingerprint stat). + stats = None + data_fp = _data_stats_fingerprint(src) + cache_key = src["name"] + if data_fp is not None: + try: + with _data_stats_cache_lock: + cached = _data_stats_cache.get(cache_key) + if cached is not None and cached[0] == data_fp: + d_count, d_min, d_max = cached[1], cached[2], cached[3] + else: + data_glob = os.path.join(_cache_dir(src), "data", "**", "*.parquet") + d_row = con.execute( + "SELECT count(*), min(timestamp), max(timestamp) " + f"FROM read_parquet('{data_glob}', union_by_name=true, hive_partitioning=false)" + ).fetchone() + d_count = (d_row[0] or 0) if d_row else 0 + d_min = d_row[1] if d_row else None + d_max = d_row[2] if d_row else None with _data_stats_cache_lock: - _data_stats_cache.pop(cache_key, None) - # Stale-cache failure modes ("No files found", missing - # catalog entries) must flow to the outer view-rebuild - # handler below — the cure is the same. Re-raise here - # rather than swallowing, so the existing recovery path - # still triggers clear_source_caches+update_iceberg_view. 
- err_str = str(split_err) - if ( - "No files found" in err_str - or "Catalog Error: Table with name" in err_str - or "does not exist" in err_str - or "No such file or directory" in err_str - ): - raise - logger.debug("[sync-status] split-stats query failed, falling back to view: %s", split_err) - - if stats is None: + _data_stats_cache[cache_key] = (data_fp, d_count, d_min, d_max) + + from backend.core import iceberg as _ice + + buf_paths = [p for p in _ice.buffer_files(src) if os.path.isfile(p)] + if buf_paths: + paths_sql = ", ".join(f"'{p}'" for p in buf_paths) + b_row = con.execute( + "SELECT count(*), min(timestamp), max(timestamp) " + f"FROM read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)" + ).fetchone() + b_count = (b_row[0] or 0) if b_row else 0 + b_min = b_row[1] if b_row else None + b_max = b_row[2] if b_row else None + else: + b_count, b_min, b_max = 0, None, None + + mins = [m for m in (d_min, b_min) if m is not None] + maxs = [m for m in (d_max, b_max) if m is not None] + stats = ( + d_count + b_count, + min(mins) if mins else None, + max(maxs) if maxs else None, + ) + except Exception as split_err: + # Bust the data cache so we don't pin a half-built result. + with _data_stats_cache_lock: + _data_stats_cache.pop(cache_key, None) + # Stale-cache failure modes ("No files found", missing + # catalog entries) must flow to the outer view-rebuild + # handler below — the cure is the same. Re-raise here + # rather than swallowing, so the existing recovery path + # still triggers clear_source_caches+update_iceberg_view. 
+ err_str = str(split_err) + if ( + "No files found" in err_str + or "Catalog Error: Table with name" in err_str + or "does not exist" in err_str + or "No such file or directory" in err_str + ): + raise + logger.debug("[sync-status] split-stats query failed, falling back to view: %s", split_err) + + if stats is None: + stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone() + if stats: + view_rows = stats[0] if stats[0] is not None else 0 + # When the view returns a real (non-zero) count, trust it + # as the source of truth — it reflects the rows actually + # queryable in Iceberg. ingested_files.row_count records + # the raw JSON line count from each FOS file BEFORE the + # `WHERE timestamp IS NOT NULL` filter and any time-range + # filter, and never reflects post-compaction dedup, so it + # consistently over-reports. Only fall back when the view + # itself is empty (the "WHERE false" transient-failure + # fallback) — there we degrade to the metadata sum so the + # header doesn't read 0 while we have data on disk. + if view_rows > 0: + local_rows = view_rows + earliest_log_at = stats[1] + latest_log_at = stats[2] + else: + local_rows = local_rows_ingested + except Exception as e: + if ( + "No files found" in str(e) + or "Catalog Error: Table with name" in str(e) + or "does not exist" in str(e) + or "No such file or directory" in str(e) + ): + try: + from backend.core import iceberg + + # Bust the cached view SQL FIRST. Without this, when ingest + # is mid-commit and holding the per-service lock, + # update_iceberg_view falls back to executing the cached + # SQL — which is exactly the stale SQL that referenced + # the missing parquet, looping us right back into the same + # error. Clearing the cache forces a real rebuild on the + # next view-update window (possibly the next poll). + # + # ``keep_snapshot_cache=True``: do NOT also wipe the + # snapshot/path cache. 
If we wipe both, then a transient + # catalog-load failure (FOS rate limit, network blip) + # causes update_iceberg_view to fall through to its + # empty-view branch — "WHERE false" — which then sticks + # in _view_cache and shows the user "Total Logs: 0" + # despite millions of rows being in the table. + iceberg.clear_source_caches(src.get("name", "default"), keep_snapshot_cache=True) + iceberg.update_iceberg_view(con, src) stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone() - if stats: - view_rows = stats[0] if stats[0] is not None else 0 - # When the view returns a real (non-zero) count, trust it - # as the source of truth — it reflects the rows actually - # queryable in Iceberg. ingested_files.row_count records - # the raw JSON line count from each FOS file BEFORE the - # `WHERE timestamp IS NOT NULL` filter and any time-range - # filter, and never reflects post-compaction dedup, so it - # consistently over-reports. Only fall back when the view - # itself is empty (the "WHERE false" transient-failure - # fallback) — there we degrade to the metadata sum so the - # header doesn't read 0 while we have data on disk. - if view_rows > 0: - local_rows = view_rows + if stats: + local_rows = stats[0] if stats[0] is not None else 0 earliest_log_at = stats[1] latest_log_at = stats[2] - else: - local_rows = local_rows_ingested - except Exception as e: - if ( - "No files found" in str(e) - or "Catalog Error: Table with name" in str(e) - or "does not exist" in str(e) - or "No such file or directory" in str(e) - ): - try: - from backend.core import iceberg - - # Bust the cached view SQL FIRST. Without this, when ingest - # is mid-commit and holding the per-service lock, - # update_iceberg_view falls back to executing the cached - # SQL — which is exactly the stale SQL that referenced - # the missing parquet, looping us right back into the same - # error. 
Clearing the cache forces a real rebuild on the - # next view-update window (possibly the next poll). - # - # ``keep_snapshot_cache=True``: do NOT also wipe the - # snapshot/path cache. If we wipe both, then a transient - # catalog-load failure (FOS rate limit, network blip) - # causes update_iceberg_view to fall through to its - # empty-view branch — "WHERE false" — which then sticks - # in _view_cache and shows the user "Total Logs: 0" - # despite millions of rows being in the table. - iceberg.clear_source_caches(src.get("name", "default"), keep_snapshot_cache=True) - iceberg.update_iceberg_view(con, src) - stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone() - if stats: - local_rows = stats[0] if stats[0] is not None else 0 - earliest_log_at = stats[1] - latest_log_at = stats[2] - except Exception as retry_e: - # The fallback to ``local_rows_ingested`` below is the - # designed degradation path — when the cache is mid- - # rebuild and we couldn't acquire the lock, ``local_rows`` - # still reflects the row count we tracked at ingest time. - # Demoted from print/warning to debug because the cascade - # spams stderr on every sync-status poll until ingest - # releases the lock; the bust above breaks the loop on - # the next attempt regardless. - logger.debug("[sync-status] log stats unavailable mid-rebuild: %s", retry_e) - local_rows = local_rows_ingested - else: - # Unexpected exception — this one is worth keeping as a - # warning since it doesn't match any of the known "stale - # cache" patterns above and the fallback may hide real bugs. - logger.warning("[sync-status] Failed to get log stats from view: %s", e) + except Exception as retry_e: + # The fallback to ``local_rows_ingested`` below is the + # designed degradation path — when the cache is mid- + # rebuild and we couldn't acquire the lock, ``local_rows`` + # still reflects the row count we tracked at ingest time. 
+ # Demoted from print/warning to debug because the cascade + # spams stderr on every sync-status poll until ingest + # releases the lock; the bust above breaks the loop on + # the next attempt regardless. + logger.debug("[sync-status] log stats unavailable mid-rebuild: %s", retry_e) local_rows = local_rows_ingested + else: + # Unexpected exception — this one is worth keeping as a + # warning since it doesn't match any of the known "stale + # cache" patterns above and the fallback may hide real bugs. + logger.warning("[sync-status] Failed to get log stats from view: %s", e) + local_rows = local_rows_ingested # Latest available filename mirrors latest_file_name since FOS LIST is # not consulted here (comment above explains why). Reuse the summary's diff --git a/backend/core/duckdb_pool.py b/backend/core/duckdb_pool.py new file mode 100644 index 00000000..f8a90797 --- /dev/null +++ b/backend/core/duckdb_pool.py @@ -0,0 +1,386 @@ +"""Per-service DuckDB connection pool. + +Each API request previously opened a fresh DuckDB connection, ran ~10 PRAGMAs, +configured S3 + iceberg, and called ``update_iceberg_view`` to bind the per- +service view onto the new connection. Steady-state cost was ~50ms of setup +plus another ~45ms on first-query overhead — paid by every request. + +This module caches read-only connections in a per-service pool. A request +checks out a fully-configured connection, runs its queries, then returns it. +The view binding is re-validated on checkout via the existing fast-path +fingerprint (``_view_cache``); a cache hit is a few-µs dict lookup, so the +hot path checkout is genuinely cheap. + +The pool is opt-in via ``DUCKDB_CONNECTION_POOL`` env var (default on); set +to ``"0"`` to disable and fall back to the always-fresh-connection path. +Exists primarily so tests and ops have an emergency switch if a pooling +regression slips through. + +Lifecycle: + * Pool is created lazily on first checkout for a service. 
+ * Idle connections are stored in a LIFO queue (recently-used first, so the + OS page cache stays hot on the file descriptors that are currently warm). + * Pool size is bounded by ``max_size`` (default 8 per service). When the + pool is empty and ``in_use < max_size``, the next checkout creates a new + connection. When ``in_use == max_size``, waiters block on a Condition. + * If a request returns a connection that errored mid-query, the connection + is discarded (closed) rather than returned to the pool — the next + checkout creates a fresh one. + * On checkin, we DROP any temp tables the request created (sweep against + ``information_schema``) so a long-lived pool connection doesn't slowly + accumulate state across requests. A leaked temp table from a prior + request would otherwise show up as ``CATALOG ENTRY ALREADY EXISTS`` if a + later request happened to pick the same uuid (improbable, but + deterministic at scale). + +Concurrency: + * Multiple connections to the same DuckDB file on the same process are safe + — they share the in-memory database state. + * Read-only + read-only across pool connections is fine. + * Read-only pool + one read-write writer (ingest) is the project's existing + contract; ``get_connection`` already handles ``DBBusyError`` retries. + +Failure handling: + * If view rebind fails on checkout, we discard the connection and try a + fresh one. After ``max_retries`` consecutive failures we surface + ``DBBusyError`` to the caller (which becomes a 503 in deps.py). 
+""" + +from __future__ import annotations + +import logging +import os +import queue +import threading +import time +from contextlib import contextmanager + +import duckdb + +logger = logging.getLogger(__name__) + + +def _pool_enabled() -> bool: + return os.getenv("DUCKDB_CONNECTION_POOL", "1").lower() not in ("0", "false", "no", "off") + + +def _pool_max_size() -> int: + raw = os.getenv("DUCKDB_POOL_MAX_SIZE", "8") + try: + return max(1, int(raw)) + except (TypeError, ValueError): + return 8 + + +# Per-connection state tracking. DuckDB connection objects are slotted +# C types — they don't accept arbitrary attribute assignment — so we +# keep our metadata in a module-level dict keyed by id(con). Entries are +# cleared when the connection is closed/discarded. +# +# Fingerprint = id() of the ``_view_cache`` tuple at the time the view +# was last bound to this connection. The tuple is replaced (not mutated) +# when the cache rotates, so identity is a sufficient fresh-check. +_conn_state: dict[int, dict] = {} +_conn_state_lock = threading.Lock() + + +def _set_conn_state(con: duckdb.DuckDBPyConnection, **kv) -> None: + with _conn_state_lock: + state = _conn_state.setdefault(id(con), {}) + state.update(kv) + + +def _get_conn_state(con: duckdb.DuckDBPyConnection, key: str, default=None): + with _conn_state_lock: + return _conn_state.get(id(con), {}).get(key, default) + + +def _forget_conn(con: duckdb.DuckDBPyConnection) -> None: + with _conn_state_lock: + _conn_state.pop(id(con), None) + + +def _safe_buffer_mtime(src: dict | None) -> float | None: + """Return mtime of the service's buffer dir, or None if it can't be read. + + Used as part of the pool's checkout fingerprint so that the sync cron + removing buffer parquet files (without touching ``_view_cache``) still + invalidates pooled connections. Any add/remove inside the dir bumps the + dir's own mtime — so a single stat is enough. 
+ """ + if src is None: + return None + try: + from backend.core.iceberg import _buffer_dir + + path = _buffer_dir(src) + return os.path.getmtime(path) + except Exception: + return None + + +class _Pool: + """Per-service pool. Not exposed directly — use ``checkout_connection``.""" + + def __init__(self, service_key: str, max_size: int): + self.service_key = service_key + self.max_size = max_size + # LIFO so the most-recently-used connection (warmest in any OS / DuckDB + # internal caches) is the next checkout. + self._idle: queue.LifoQueue = queue.LifoQueue(maxsize=max_size) + self._lock = threading.Lock() + # ``in_use`` is the count of connections currently checked out plus + # connections idle in the queue. Bounded by ``max_size``. + self._in_use = 0 + self._cond = threading.Condition(self._lock) + # Cumulative counters for diagnostics — exposed via ``stats()``. + self._created_total = 0 + self._reused_total = 0 + self._discarded_total = 0 + + def acquire(self, src: dict, max_wait: float) -> duckdb.DuckDBPyConnection: + deadline = time.monotonic() + max_wait + with self._cond: + while True: + # Fast path: idle connection available + try: + con = self._idle.get_nowait() + self._reused_total += 1 + return self._prepare_checkout(con, src) + except queue.Empty: + pass + + # Capacity available: build a new one outside the lock + if self._in_use < self.max_size: + self._in_use += 1 + self._created_total += 1 + break # fall through to the unlocked build path + + # Saturated: wait for a return + remaining = deadline - time.monotonic() + if remaining <= 0: + raise _PoolBusy( + f"pool for {self.service_key} saturated at {self.max_size}" + ) + self._cond.wait(timeout=remaining) + + # Outside lock: build fresh. _in_use was already incremented; if the + # build raises we MUST decrement and notify a waiter, hence the try. 
+ try: + from backend.core.duckdb import get_connection + + con = get_connection(source=src, read_only=True, max_wait=max_wait) + _set_conn_state(con, service_key=self.service_key) + self._stamp_fingerprint(con, src) + return con + except Exception: + with self._cond: + self._in_use -= 1 + self._cond.notify() + raise + + def release(self, con: duckdb.DuckDBPyConnection, *, errored: bool = False) -> None: + """Return a connection to the pool. Pass ``errored=True`` to discard + instead — the next checkout will build fresh.""" + if errored: + self._discard(con) + return + try: + self._cleanup_temp_tables(con) + except Exception as e: + # Cleanup failure means the connection is in unknown state — discard. + logger.debug("[pool] %s: cleanup failed, discarding: %s", self.service_key, e) + self._discard(con) + return + with self._cond: + try: + self._idle.put_nowait(con) + self._cond.notify() + return + except queue.Full: + # Pool already at max idle (shouldn't happen given in_use cap, + # but defensive). Close this one and free the slot. + pass + # Outside lock: close + try: + con.close() + except Exception: + pass + with self._cond: + self._in_use -= 1 + self._cond.notify() + + def _discard(self, con: duckdb.DuckDBPyConnection) -> None: + _forget_conn(con) + try: + con.close() + except Exception: + pass + with self._cond: + self._in_use -= 1 + self._discarded_total += 1 + self._cond.notify() + + def _prepare_checkout(self, con: duckdb.DuckDBPyConnection, src: dict) -> duckdb.DuckDBPyConnection: + """Re-validate the view binding before handing the connection out. + + Two checks make up the fingerprint: + + 1. id() of the iceberg ``_view_cache`` tuple for this service. + The tuple is replaced (not mutated) when the cache rotates, so + identity is a sufficient check that the SQL we'd bind matches + what we bound last time. + + 2. mtime of the buffer directory. 
The sync cron's commit step + DELETES buffer parquet files without calling update_iceberg_view — + so the view-cache tuple keeps looking "fresh" while the files + it references are gone. mtime catches that: any add/remove in + the dir bumps it. Cost ~1 syscall (~µs). + + If either differs from what we last stamped, rebind. If the rebind + fails, discard the connection and let the caller retry. + """ + try: + from backend.core import iceberg + + current = iceberg._view_cache.get(self.service_key) + stamped_view = _get_conn_state(con, "view_fingerprint") + stamped_buf = _get_conn_state(con, "buffer_mtime") + current_buf = _safe_buffer_mtime(src) + if ( + current is not None + and id(current) == stamped_view + and current_buf == stamped_buf + ): + # View AND underlying buffer set match what we bound last + # time — nothing to do. + return con + iceberg.update_iceberg_view(con, src) + self._stamp_fingerprint(con, src) + return con + except Exception as e: + logger.warning("[pool] %s: view refresh on checkout failed, discarding: %s", self.service_key, e) + self._discard(con) + raise + + def _stamp_fingerprint(self, con: duckdb.DuckDBPyConnection, src: dict | None = None) -> None: + try: + from backend.core import iceberg + + current = iceberg._view_cache.get(self.service_key) + buf_mtime = _safe_buffer_mtime(src) if src is not None else None + _set_conn_state( + con, + view_fingerprint=id(current) if current is not None else None, + buffer_mtime=buf_mtime, + ) + except Exception: + _set_conn_state(con, view_fingerprint=None, buffer_mtime=None) + + def _cleanup_temp_tables(self, con: duckdb.DuckDBPyConnection) -> None: + """Drop any t_-style temp tables left behind by repositories + whose ``temp_table`` context manager exited cleanly does the DROP + itself; this is belt-and-suspenders for the failure paths.""" + try: + rows = con.execute( + "SELECT table_name FROM duckdb_tables() " + "WHERE schema_name = 'main' AND temporary = true" + ).fetchall() + except Exception: + 
return + for (name,) in rows: + try: + con.execute(f"DROP TABLE IF EXISTS {name}") + except Exception: + # Best-effort — if a single table fails to drop, keep going. + pass + + def stats(self) -> dict: + with self._cond: + return { + "service": self.service_key, + "max_size": self.max_size, + "in_use": self._in_use, + "idle": self._idle.qsize(), + "created_total": self._created_total, + "reused_total": self._reused_total, + "discarded_total": self._discarded_total, + } + + +class _PoolBusy(Exception): + """Raised when the pool is saturated and the wait deadline elapsed.""" + + +_pools: dict[str, _Pool] = {} +_pools_lock = threading.Lock() + + +def _get_pool(service_key: str, max_size: int | None = None) -> _Pool: + if max_size is None: + max_size = _pool_max_size() + with _pools_lock: + pool = _pools.get(service_key) + if pool is None: + pool = _Pool(service_key, max_size=max_size) + _pools[service_key] = pool + return pool + + +@contextmanager +def checkout_connection(src: dict, max_wait: float = 10.0): + """Yield a fully-configured DuckDB connection from the per-service pool. + + Falls back to the legacy always-fresh path when ``DUCKDB_CONNECTION_POOL`` + is disabled. Returns the connection to the pool on clean exit; discards + it on any exception so a poisoned connection doesn't get reused. 
+ """ + if not _pool_enabled(): + from backend.core.duckdb import get_connection + + con = get_connection(source=src, read_only=True, max_wait=max_wait) + try: + yield con + finally: + try: + con.close() + except Exception: + pass + return + + service_key = src.get("name") or src.get("service_id") or "default" + pool = _get_pool(service_key) + con = pool.acquire(src, max_wait=max_wait) + errored = False + try: + yield con + except Exception: + errored = True + raise + finally: + pool.release(con, errored=errored) + + +def get_all_stats() -> list[dict]: + """Diagnostics: return current pool state for every service.""" + with _pools_lock: + return [pool.stats() for pool in _pools.values()] + + +def shutdown_all() -> None: + """Close every idle connection across every pool. Called on app shutdown + so DuckDB releases its file handles cleanly.""" + with _pools_lock: + pools = list(_pools.values()) + _pools.clear() + for pool in pools: + while True: + try: + con = pool._idle.get_nowait() + except queue.Empty: + break + _forget_conn(con) + try: + con.close() + except Exception: + pass diff --git a/backend/core/fastly/service.py b/backend/core/fastly/service.py index 5e6b249e..5e6b16bb 100644 --- a/backend/core/fastly/service.py +++ b/backend/core/fastly/service.py @@ -25,22 +25,6 @@ def find_service_by_name(name: str, token: str) -> dict | None: return None -def find_dictionary_by_name(service_id: str, version: int, name: str, token: str) -> dict | None: - try: - dicts = fastly("GET", f"/service/{service_id}/version/{version}/dictionary", token=token) - for d in dicts: - if d.get("name") == name: - return d - except RuntimeError: - pass - return None - - -def upsert_dictionary_items(service_id: str, dictionary_id: str, items: dict[str, str], token: str): - payload = {"items": [{"item_key": k, "item_value": v} for k, v in items.items()]} - return fastly("PATCH", f"/service/{service_id}/dictionary/{dictionary_id}/items", payload, token=token) - - def find_condition(name: 
str, service_id: str, version: int, token: str) -> dict | None: try: conditions = fastly("GET", f"/service/{service_id}/version/{version}/condition", token=token) diff --git a/backend/core/fastly/utils.py b/backend/core/fastly/utils.py index a346d72e..e80eb238 100644 --- a/backend/core/fastly/utils.py +++ b/backend/core/fastly/utils.py @@ -1,5 +1,6 @@ import argparse import re +import secrets # Candidate field names on Fastly's /stats/service response that carry the # "log lines emitted" counter. Ordered: most-likely first. If all four miss @@ -148,8 +149,25 @@ def load_vcl(rate_limiting: bool = True) -> str: set req.http.Fastly-Client-IP = client.ip; } - # Block requests that do not provide the correct secret key (purges are exempt) - if (req.method != "FASTLYPURGE" && req.restarts == 0 && fastly.ff.visits_this_service == 0 && subfield(req.url.qs, "key", "&") != table.lookup(cdn_auth, "secret", "") && req.http.x-fastly-key != table.lookup(cdn_auth, "secret", "")) { + # Handle FASTLYPURGE natively. Without this, an unsigned purge on a + # cache miss is forwarded to the FOS origin, which returns 403 — and + # Fastly caches that 403 for the object's TTL. An attacker can poison + # the cache for legitimate clients by issuing purges against arbitrary + # keys. ``return(purge)`` short-circuits the pipeline before any + # backend fetch happens. + if (req.method == "FASTLYPURGE") { + return(purge); + } + + # Block requests that do not provide the correct secret key. + # NOTE on the auth fallback: the third argument to ``table.lookup`` is + # returned when ``cdn_auth.secret`` is absent from the edge dictionary. + # Defaulting to ``""`` is fail-open — an attacker who sends an empty + # ``key`` query param trivially matches. ``__FALLBACK_SECRET__`` is + # substituted in load_vcl() with ``secrets.token_hex(32)``, which is + # never knowable to an attacker and therefore fails closed when the + # dictionary is unprovisioned. 
+ if (req.restarts == 0 && fastly.ff.visits_this_service == 0 && subfield(req.url.qs, "key", "&") != table.lookup(cdn_auth, "secret", "__FALLBACK_SECRET__") && req.http.x-fastly-key != table.lookup(cdn_auth, "secret", "__FALLBACK_SECRET__")) { #RATELIMIT_BEGIN declare local var.last_minute INTEGER; set var.last_minute = ratelimit.ratecounter_increment(auth_fail_rc, req.http.Fastly-Client-IP, 1); @@ -171,8 +189,26 @@ def load_vcl(rate_limiting: bool = True) -> str: set req.enable_segmented_caching = true; set segmented_caching.block_size = 20971520; # 20 MB, the maximum - # Strip only the key from the URL before forwarding to Fastly Object Storage - set req.url = querystring.filter(req.url, "key"); + # Cache-key hardening (post-auth — auth check above still reads the + # `key` qs param from the original req.url): + # 1. querystring.filter_except keeps ONLY the S3-API parameters the + # FOS origin actually understands and strips everything else + # (including our auth `key` secret, any caller-injected tracking + # params, marketing UTM params, session IDs, etc.). Unexpected + # params no longer fracture the cache or leak into req.hash. + # 2. querystring.sort canonicalises the remaining param order so + # `?prefix=foo&max-keys=10` and `?max-keys=10&prefix=foo` resolve + # to one cache entry instead of two. + # Allow-list rationale (S3 API surface FOS exposes): + # - List objects v2: list-type, prefix, delimiter, continuation-token, + # start-after, max-keys, encoding-type, fetch-owner + # - List objects v1: marker + # - Get object: versionId, partNumber, response-content-type, + # response-content-disposition, response-cache-control + # Anything else is silently dropped. If a legitimate S3 param needs to + # pass through later, add it to this list and re-deploy. 
+ set req.url = querystring.filter_except(req.url, "list-type,prefix,delimiter,continuation-token,start-after,max-keys,encoding-type,fetch-owner,marker,versionId,partNumber,response-content-type,response-content-disposition,response-cache-control"); + set req.url = querystring.sort(req.url); # Never cache admin_state.json — it changes on every mutation if (req.url ~ "/iceberg/meta/admin_state\\.json$") { @@ -195,7 +231,19 @@ def load_vcl(rate_limiting: bool = True) -> str: return(lookup); } sub vcl_hash { - set req.hash += req.url.path; + # Security: hash on the full URL (path + query string), not just + # req.url.path. Before this fix, two requests that differed only in + # query parameters (e.g. ListObjectsV2 with different ?prefix= values, + # or ?versionId= variants) shared a single cache entry — the second + # caller would receive the first caller's object listing. The CDN + # auth `key` querystring has already been stripped from req.url by + # the querystring.filter_except in vcl_recv, AND remaining params are + # sorted by querystring.sort, so the cache key (a) does NOT include + # the secret and (b) is normalised across param-order variants. + # Expect a one-time cache-hit-rate dip + origin egress spike on + # rollout while prior entries are stranded; the canary monitors + # those signals and auto-rolls back if they exceed v6 §6 thresholds. + set req.hash += req.url; set req.hash += req.http.host; #FASTLY hash return(hash); @@ -271,4 +319,12 @@ def load_vcl(rate_limiting: bool = True) -> str: }""" if not rate_limiting: vcl = re.sub(r"\s*#RATELIMIT_BEGIN.*?#RATELIMIT_END", "", vcl, flags=re.DOTALL) + # Substitute the placeholder with a fresh random fallback secret so + # that when ``cdn_auth.secret`` is missing from the edge dictionary, + # the lookup returns an unguessable value and the auth check fails + # closed instead of allowing empty-key requests through. 
A new secret + # per load_vcl() call is fine: real auth uses the dictionary value + # (this fallback is never matched in steady state). + fallback_secret = secrets.token_hex(32) + vcl = vcl.replace("__FALLBACK_SECRET__", fallback_secret) return vcl diff --git a/backend/core/iceberg.py b/backend/core/iceberg.py index 870a2b14..33c0bc2e 100644 --- a/backend/core/iceberg.py +++ b/backend/core/iceberg.py @@ -525,6 +525,7 @@ def _patched_open(self, path, mode="rb", **kwargs): ) from backend.core.log_fields import LOG_FIELD_CATALOG +from backend.utils.sql_validator import escape_sql_literal # --------------------------------------------------------------------------- # Iceberg Schema — derived from LOG_FIELD_CATALOG (single source of truth). @@ -1095,11 +1096,14 @@ def _read_metadata_pointer(source: dict, identifier: tuple) -> str | None: try: from backend.core.duckdb import _get_fos_client + from backend.models.lake import _safe_cdn_url s3 = _get_fos_client(source) bucket = source["bucket"] base_prefix = source.get("prefix", "").strip("/") - cdn_url = (source.get("cdn_url") or "").rstrip("/") + # SSRF guard: only follow ``cdn_url`` when it parses as an https + # Fastly hostname. Otherwise fall through to the S3 SDK. + cdn_url = _safe_cdn_url((source.get("cdn_url") or "").rstrip("/")) cdn_secret = source.get("cdn_secret") or "" iceberg_root = f"{base_prefix}/iceberg" if base_prefix else "iceberg" @@ -1807,7 +1811,7 @@ def optimize_table(source: dict, target_file_size_mb: int = 128, min_files_per_p # Use DuckDB to read only these files (most efficient) paths = [f.file_path for f in files] - paths_sql = ", ".join(f"'{p}'" for p in paths) + paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in paths) try: # Read into PyArrow. Must materialise to a Table — pyiceberg's @@ -2397,10 +2401,10 @@ def configure_duckdb_s3(con) -> None: unmatched URLs and silently bypass telemetry. 
""" try: - con.execute("INSTALL iceberg; INSTALL avro; INSTALL httpfs; INSTALL parquet;") con.execute("LOAD iceberg; LOAD avro; LOAD httpfs; LOAD parquet;") except Exception: try: + con.execute("INSTALL iceberg; INSTALL avro; INSTALL httpfs; INSTALL parquet;") con.execute("LOAD iceberg; LOAD avro; LOAD httpfs; LOAD parquet;") except Exception: pass @@ -2724,13 +2728,24 @@ def get_last_view_stats(source: dict) -> dict: def inject_view_debug(debug_list: list, source: dict): stats = get_last_view_stats(source) if stats and stats.get("sql"): + # Apply the same path-list compaction as the per-query recorder + # in repositories/_base. The view-build SQL is the WORST offender + # because it inlines every buffer file twice (in the UNION ALL + # RHS) — pre-compaction it accounted for ~30 KB on its own in + # the dashboard response. + from backend.repositories._base import _compact_sql_for_debug + mode = ( "FAST PATH (Local Cache / Buffer Match)" if stats.get("was_fast_path") else "SLOW PATH (S3 Read / Manifest Resolve)" ) debug_list.insert( - 0, {"sql": f"-- DuckDB Iceberg View Resolution [{mode}] --\n{stats['sql']}", "time_ms": stats["time_ms"]} + 0, + { + "sql": _compact_sql_for_debug(f"-- DuckDB Iceberg View Resolution [{mode}] --\n{stats['sql']}"), + "time_ms": stats["time_ms"], + }, ) @@ -2848,7 +2863,7 @@ def _rebuild_locked(con, source: dict, source_key: str) -> None: del _rebuild_signals[source_key] -def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0) -> None: +def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0, force: bool = False) -> None: """Refresh the per-service DuckDB view over the Iceberg table + buffer. ``lock_timeout`` (default 5s) caps how long we wait on the per-service @@ -2860,12 +2875,23 @@ def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0) -> None: match the pattern …/buffer/batch_*.parquet`` on the next read. 
Five seconds is long enough to outlast a typical commit without making sync-status polls feel sticky. + + ``force=True`` skips the lock-free fast path and goes straight to a + full rebuild under the lock. The QueryRunner self-heal path uses + this: when a query already failed with a stale-view IOException, + the fast path can't help — its buf_set check might match cached + state that's still inconsistent with what the DuckDB query planner + just saw on disk, OR (the symptom-from-prod) the cached view SQL + has hardcoded file paths and re-executing it just re-binds the same + bad SQL. Force-rebuild reads disk fresh under the lock and + regenerates the SQL. """ source_key = source.get("name", "default") # Lock-free fast path first. Parallel dashboard reads (6+ endpoints # per page load) only need the lock when a real rebuild is required. - if _try_fast_path_view(con, source): + # Skipped on ``force=True`` (see self-heal path in QueryRunner). + if not force and _try_fast_path_view(con, source): return lock = _get_service_lock(source_key) @@ -2982,8 +3008,7 @@ def _update_iceberg_view_locked(con, source: dict) -> None: dynamic_arrow_schema = get_arrow_schema(log_fields_config) dynamic_schema_field_names = {f.name for f in dynamic_arrow_schema} - logger.info("▶️ %s %s: View refresh started.", _ICE_PLAIN, source_key) - logger.info("%s %s: Refreshing view...", _ICE, source_key) + logger.info("▶️ %s %s: View refresh started...", _ICE_PLAIN, source_key) # Try to load from persistent cache if memory cache is empty _load_persistent_cache(source) @@ -3195,7 +3220,7 @@ def _strip_computed(read_parquet_expr: str) -> str: # (a) plan_files() returned S3 URIs and no local files are cached yet, OR # (b) plan_files() failed silently but iceberg_loc is known (avoids WHERE false view) if iceberg_loc and not local_paths and (s3_paths or not local_iceberg_files): - parts.append(_strip_computed(f"iceberg_scan('{iceberg_loc}', allow_moved_paths=true)")) + 
parts.append(_strip_computed(f"iceberg_scan('{escape_sql_literal(iceberg_loc)}', allow_moved_paths=true)")) logger.info( "%s Falling back to iceberg_scan for %s (s3_paths=%d, local_iceberg_files=%d).", _ICE, @@ -3204,7 +3229,13 @@ def _strip_computed(read_parquet_expr: str) -> str: len(local_iceberg_files), ) elif s3_paths: - logger.info( + # Demoted from INFO to DEBUG (2026-06-01): this fires on every + # view refresh whenever the local cache lags the iceberg manifest + # (very common during catch-up / right after a commit). Useful for + # debugging stale-view issues, not useful as a routine signal — + # was spamming the GCE backend log every few seconds with no + # actionable content. + logger.debug( "%s Skipping %d missing cloud files in view (local files present, CDN sync pending).", _ICE, len(s3_paths), @@ -3215,7 +3246,7 @@ def _strip_computed(read_parquet_expr: str) -> str: buf_files = [p for p in buf_files if os.path.isfile(p)] if buf_files: - paths_sql = ", ".join(f"'{p}'" for p in buf_files) + paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in buf_files) parts.append(_strip_computed(f"read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)")) if not parts: @@ -3258,11 +3289,30 @@ def _strip_computed(read_parquet_expr: str) -> str: is_analyst = source.get("access_level") == "read_only" if tr and (is_analyst or not source.get("provisioning", {}).get("cron_sync", {}).get("enabled", True)): + # Security: validate via isoparse before interpolation. Without + # this, an attacker-controlled tr["start"] / tr["end"] dict value + # (these come from saved-view JSON which originates from the + # frontend) is interpolated raw into DuckDB SQL — a payload like + # "2024-01-01'; ATTACH '/tmp/x.db' AS y; --" + # would execute multi-statement SQL against the connection. 
+ # isoparse rejects anything that isn't a valid ISO-8601 timestamp; + # we then interpolate the canonical .isoformat() output, which + # contains only digits, ":", "-", "T", ".", and "+" (never "Z" or a quote). + import dateutil.parser as _dt + where_clauses = [] if tr.get("start"): - where_clauses.append(f"timestamp >= '{tr['start']}'::TIMESTAMPTZ") + try: + start_iso = _dt.isoparse(str(tr["start"])).isoformat() + except (ValueError, TypeError) as e: + raise ValueError(f"invalid time_range start: {e}") from e + where_clauses.append(f"timestamp >= '{start_iso}'::TIMESTAMPTZ") if tr.get("end"): - where_clauses.append(f"timestamp <= '{tr['end']}'::TIMESTAMPTZ") + try: + end_iso = _dt.isoparse(str(tr["end"])).isoformat() + except (ValueError, TypeError) as e: + raise ValueError(f"invalid time_range end: {e}") from e + where_clauses.append(f"timestamp <= '{end_iso}'::TIMESTAMPTZ") if where_clauses: union_sql = f"SELECT * FROM ({union_sql}) WHERE {' AND '.join(where_clauses)}" @@ -3329,8 +3379,7 @@ def _strip_computed(read_parquet_expr: str) -> str: t_end = time.time() duration_ms = (t_end - t_start) * 1000 - logger.info("%s %s: View refresh complete (%.0f ms).", _ICE, source_key, duration_ms) - logger.info("⏹️ %s %s: View refresh finished.", _ICE_PLAIN, source_key) + logger.info("⏹️ %s %s: View refresh complete (%.0f ms).", _ICE_PLAIN, source_key, duration_ms) _view_cache[source_key] = ( metadata_loc, buf_set, @@ -3431,6 +3480,17 @@ def _save_manifest_metadata_cache(source: dict, live_manifest_paths: list[str]) "files": m_files, "size": m_size, } + # Mirror the on-disk prune in memory. Pre-fix this dict was only + # ever appended to (lines 3428, 2656) — entries for manifests + # dropped by snapshot expiry or compaction stayed resident + # forever, growing into multi-hundred-MB RSS over days of uptime + # and compounding the host-OOM problem. Compute the live set + # ONCE outside the loop so the cost is O(live + cache) rather + # than O(live × cache).
+ live_set = set(live_manifest_paths) + dead_keys = [k for k in _manifest_metadata_cache if k not in live_set] + for k in dead_keys: + _manifest_metadata_cache.pop(k, None) try: tmp = cache_file + ".tmp" diff --git a/backend/core/ingest.py b/backend/core/ingest.py index d1586adf..172cbae7 100644 --- a/backend/core/ingest.py +++ b/backend/core/ingest.py @@ -21,6 +21,7 @@ ) from backend.core.log_fields import LOG_FIELD_CATALOG from backend.utils import field_codes as fc +from backend.utils.sql_validator import escape_sql_literal logger = logging.getLogger(__name__) @@ -463,6 +464,7 @@ def elapsed() -> str: processed_count = 0 deleted = 0 successfully_processed_files = [] + touched_hours: set[str] = set() mem_con = None # Increase parallelism for S3 deletions @@ -546,7 +548,12 @@ def elapsed() -> str: read_paths = [s3_to_local[p] for p in read_paths_s3] mem_con.execute("DROP TABLE IF EXISTS _ingest_staging") - paths_sql = ", ".join(f"'{p}'" for p in read_paths) + # Security: escape single quotes in each local path before + # interpolating into the SQL literal. The local paths inherit + # their basename from the attacker-controllable S3 object key, + # so a key like ``raw/'); ATTACH '...; --`` would otherwise + # break out of the literal and execute arbitrary DuckDB SQL. + paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in read_paths) try: _execute_query_with_retry( @@ -563,13 +570,14 @@ def elapsed() -> str: valid_paths = [] for i, read_path in enumerate(read_paths): try: - # Quick accessibility test: read 1 row without loading the whole file. + # Security: per-file isolation read also needs escaping. 
+ safe_read_path = escape_sql_literal(read_path) _execute_query_with_retry( mem_con, - f"SELECT 1 FROM read_json_auto('{read_path}', sample_size=1) LIMIT 1", + f"SELECT 1 FROM read_json_auto('{safe_read_path}', sample_size=1) LIMIT 1", max_retries=2, ) - valid_paths.append(f"'{read_path}'") + valid_paths.append(f"'{safe_read_path}'") except Exception as file_err: f_name = read_paths_s3[i].split("/")[-1] err_msg = str(file_err) @@ -601,7 +609,13 @@ def elapsed() -> str: # Translate filename column from local→s3 so downstream # count_map / _source_file / file_sizes all key on s3://. if local_to_s3: - path_map_rows = ", ".join(f"('{local}', '{s3}')" for local, s3 in local_to_s3.items()) + # Security: same escaping treatment for the + # local→s3 mapping table — both halves originate from + # attacker-controllable object keys. + path_map_rows = ", ".join( + f"('{escape_sql_literal(local)}', '{escape_sql_literal(s3)}')" + for local, s3 in local_to_s3.items() + ) mem_con.execute("DROP TABLE IF EXISTS _ingest_path_map") mem_con.execute( f"CREATE TEMP TABLE _ingest_path_map AS SELECT * FROM (VALUES {path_map_rows}) AS t(local, s3)" @@ -626,7 +640,12 @@ def elapsed() -> str: filename_expr = '"filename"' if svc_id: - escaped = svc_id.replace("'", "''") + # Security: consistent escape helper across the + # ingest path. Functionally identical to the inline + # .replace but routes through escape_sql_literal so + # any future hardening on the canonical helper + # (e.g., extra char classes) flows here. 
+ escaped = escape_sql_literal(svc_id) backend_expr = f"regexp_replace(\"backend\", '^{escaped}--', '') AS \"backend\"" else: backend_expr = '"backend"' @@ -671,6 +690,16 @@ def elapsed() -> str: arrow_table = _fetched.read_all() if hasattr(_fetched, "read_all") else _fetched valid_rows = len(arrow_table) + if valid_rows > 0: + chunk_hours = { + r[0] + for r in mem_con.execute( + "SELECT DISTINCT strftime(timestamp, '%Y-%m-%d-%H') FROM _ingest_staging WHERE timestamp IS NOT NULL" + ).fetchall() + if r[0] is not None + } + touched_hours.update(chunk_hours) + total_rows_batch = mem_con.execute("SELECT count(*) FROM _ingest_staging").fetchone()[0] corrupt_in_batch = total_rows_batch - valid_rows @@ -692,7 +721,9 @@ def elapsed() -> str: corrupt_s3_paths.append(s3_path) if corrupt_read_paths: - paths_sql_str = ", ".join(f"'{p}'" for p in corrupt_read_paths) + # Security: corrupt-file diagnostic path + # also needs escaping. Same vector as above. + paths_sql_str = ", ".join(f"'{escape_sql_literal(p)}'" for p in corrupt_read_paths) q = f""" SELECT filename, column0 FROM read_csv([{paths_sql_str}], header=false, sep='', quote='', escape='', columns={{'column0': 'VARCHAR'}}, filename=true) WHERE NOT json_valid(column0) OR json_extract(column0, '$.timestamp') IS NULL @@ -731,8 +762,9 @@ def elapsed() -> str: # Apply the same transformation (decoding, filename # normalization) and inject the original s3:// fname - # so attribution stays consistent. - safe_fname = fname.replace("'", "''") + # so attribution stays consistent. Security: + # escape via the shared helper. 
+ safe_fname = escape_sql_literal(fname) mem_con.execute( f""" INSERT INTO _ingest_staging BY NAME @@ -903,4 +935,5 @@ def _do_delete(keys, bucket, client): "corrupt_details": total_corrupt_details, "deleted_files": deleted, "message": f"Successfully ingested {processed_count} new files ({total_inserted} rows) and deleted {deleted} raw files.", + "touched_hours": list(touched_hours), } diff --git a/backend/core/log_fields.py b/backend/core/log_fields.py index 1b7ab79d..a93b3c9e 100644 --- a/backend/core/log_fields.py +++ b/backend/core/log_fields.py @@ -221,7 +221,7 @@ "group": "A", "label": "Host", "description": "HTTP Host header (domain name) captured at the true client edge before any rewrites.", - "vcl": '"host":"%{json.escape(if(req.http.x-fos-edge-data:host != "", req.http.x-fos-edge-data:host, req.http.Host))}V"', + "vcl": '"host":"%{json.escape(substr(if(req.http.x-fos-edge-data:host != "", req.http.x-fos-edge-data:host, req.http.Host), 0, 512))}V"', "duckdb_type": "VARCHAR", "typical_bytes": 22, "required_by": ["new_probe_urls"], @@ -829,6 +829,17 @@ "required_by": [], }, # ── Group L — Origin Metrics ─────────────────────────────────────────── + # Security: each origin-metric field interpolates the value of a + # client-spoofable internal header (``x-of-ttfb`` etc.). Without a + # regex guard on the value, an attacker who reached vcl_recv with a + # crafted header like ``x-of-ttfb: 0, "waf": 1`` would break out of + # the unquoted numeric slot and inject arbitrary JSON keys into the + # log line. The ``~ "^[0-9]+$"`` test gates each numeric field to + # digit-only values; ``x-of-oip`` (the only string field) gets + # ``json.escape(...)`` so quotes / backslashes / control bytes + # serialize as their JSON-escape equivalents instead of breaking + # out of the string literal. the earlier fix also unsets all + # these headers on inbound req, so this is belt-and-suspenders. 
{ "id": "ottfb", "group": "L", @@ -836,7 +847,7 @@ "description": "µs from fetch start to first byte of origin/shield response headers. Null on HITs.", "formatter": "number", "unit": "µs", - "vcl": '"ottfb":%{if(req.http.x-of-ttfb, req.http.x-of-ttfb, "null")}V', + "vcl": '"ottfb":%{if(req.http.x-of-ttfb ~ "^[0-9]+$", req.http.x-of-ttfb, "null")}V', "duckdb_type": "UBIGINT", "typical_bytes": 16, "required_by": ["origin_latency_spike", "region_latency"], @@ -848,7 +859,7 @@ "description": "µs from fetch start to full response body received. Null on HITs.", "formatter": "number", "unit": "µs", - "vcl": '"ottlb":%{if(req.http.x-of-ttlb, req.http.x-of-ttlb, "null")}V', + "vcl": '"ottlb":%{if(req.http.x-of-ttlb ~ "^[0-9]+$", req.http.x-of-ttlb, "null")}V', "duckdb_type": "UBIGINT", "typical_bytes": 16, "required_by": ["origin_latency_spike"], @@ -859,7 +870,7 @@ "label": "Origin Status", "description": "HTTP status returned by origin or shield. Null on HITs.", "formatter": "status", - "vcl": '"ost":%{if(req.http.x-of-status, req.http.x-of-status, "null")}V', + "vcl": '"ost":%{if(req.http.x-of-status ~ "^[0-9]+$", req.http.x-of-status, "null")}V', "duckdb_type": "USMALLINT", "typical_bytes": 10, "required_by": ["origin_error_rate", "origin_ip_failure"], @@ -869,7 +880,9 @@ "group": "L", "label": "Origin Bytes", "description": "Bytes written in the response (resp.bytes_written). Null on HITs. Same variable as resp_bytes but null-on-HIT makes it queryable as 'total bytes fetched from origin'.", - "vcl": '"obytes":%{if(req.http.x-of-start, "" + resp.bytes_written, "null")}V', + # resp.bytes_written is a Fastly-internal counter (not from a header), + # so no JSON-injection risk; the x-of-start guard is tightened to digits-only.
+ "vcl": '"obytes":%{if(req.http.x-of-start ~ "^[0-9]+$", "" + resp.bytes_written, "null")}V', "duckdb_type": "UBIGINT", "typical_bytes": 15, "required_by": [], @@ -879,7 +892,10 @@ "group": "L", "label": "Origin IP", "description": "IP address of the backend server that handled the fetch. Null on HITs.", - "vcl": '"oip":"%{if(req.http.x-of-oip, req.http.x-of-oip, "")}V"', + # json.escape converts the value to JSON-string-safe form so + # quotes / backslashes / control bytes get their \\uXXXX escapes + # instead of terminating the literal early. + "vcl": '"oip":"%{json.escape(if(req.http.x-of-oip, req.http.x-of-oip, ""))}V"', "duckdb_type": "VARCHAR", "typical_bytes": 15, "required_by": ["origin_ip_failure"], @@ -890,7 +906,7 @@ "label": "Origin Retries", "description": "Backend connection retry count before success or failure. Null on HITs.", "formatter": "number", - "vcl": '"oretries":%{if(req.http.x-of-oretries, req.http.x-of-oretries, "null")}V', + "vcl": '"oretries":%{if(req.http.x-of-oretries ~ "^[0-9]+$", req.http.x-of-oretries, "null")}V', "duckdb_type": "UTINYINT", "typical_bytes": 13, "required_by": ["origin_retries"], @@ -1066,6 +1082,22 @@ "typical_bytes": 0, "required_by": [], }, + { + "id": "edge_score_reason_ind", + "group": "VIRTUAL", + "label": "Score Reasons", + "description": ( + "Individual scoring reasons extracted from the comma-separated " + "edge_score_reason field (e.g. 'cookie-missing', 'impossibly-fast', " + "'robotic-consistency', 'rare-transition'). Lets the dashboard " + "show top-N reason breakdowns and filter by a single reason " + "even when one request triggers multiple." 
+ ), + "vcl": None, + "duckdb_type": "VARCHAR", + "typical_bytes": 0, + "required_by": [], + }, # ── Internal ────────────────────────────────────────────────────────── { "id": "_source_file", @@ -1442,11 +1474,26 @@ def generate_log_format(log_fields_config: dict) -> str: # Overwrite the static substr limit in the built-in VCL vcl = vcl.replace("substr(req.url, 0, 2000)", f"substr(req.url, 0, {limit})") elif field["id"] == "ua": - # Strip the hardcoded substr since we now do it at the edge (in vcl_recv) - vcl = '"ua":"%{json.escape(if(req.http.x-fos-edge-data:ua != "", req.http.x-fos-edge-data:ua, req.http.User-Agent))}V"' + # Security: keep the substr cap even when generating the + # alternative VCL variant. The edge-side substr (in vcl_recv) + # is a *first* truncation — but we never want a 100 KB header + # to slip through if the edge snippet is missing or fails to + # run (e.g., on a request that bypasses our snippet stack). + # An unbounded UA can truncate the entire JSON log line at + # the 16 KB Fastly limit, dropping the request from the audit + # trail entirely (repudiation attack). + ua_limit = limits.get("ua", 1000) + vcl = ( + f'"ua":"%{{json.escape(substr(if(req.http.x-fos-edge-data:ua != "",' + f' req.http.x-fos-edge-data:ua, req.http.User-Agent), 0, {ua_limit}))}}V"' + ) elif field["id"] == "referer": - # Strip the hardcoded substr since we now do it at the edge (in vcl_recv) - vcl = '"referer":"%{json.escape(if(req.http.x-fos-edge-data:referer != "", req.http.x-fos-edge-data:referer, req.http.Referer))}V"' + # Same reasoning as above — keep the substr cap. 
+ ref_limit = limits.get("referer", 1000) + vcl = ( + f'"referer":"%{{json.escape(substr(if(req.http.x-fos-edge-data:referer != "",' + f' req.http.x-fos-edge-data:referer, req.http.Referer), 0, {ref_limit}))}}V"' + ) parts.append(vcl) @@ -1459,6 +1506,44 @@ def generate_log_format(log_fields_config: dict) -> str: stage = cf.get("collection_stage", "edge") value_type = cf.get("value_type", "string") + if stage == "deliver": + # Deliver-stage fields (session-scoring) need TWO gates: + # 1. edge-only (fastly.ff.visits_this_service == 0) — the + # shield POP never ran our scoring snippets, so the + # req.http subfields don't exist there. + # 2. non-empty value — avoid breaking JSON. + # Combined into ONE if() with compound AND so we don't end up + # with nested if(if(...) != "", ...) which Fastly's parser + # rejects ("if() condition must be a simple expression, not a + # function call"). + raw_expr = cf.get("vcl_log_expression") or f"req.http.x-fos-edge-data:{name}" + if value_type in ("numeric", "boolean"): + # 014: ``!= ""`` only rejects empty strings — any other + # text (`"true"`, ``"abc"``, ``"]"``) flows straight into + # the JSON log line unquoted and breaks the JSON + # structure, dropping the line from ingestion (log + # injection / repudiation). Match a strict numeric form + # so non-digit values fall through to ``"null"``. + vcl_macro = ( + f"if(fastly.ff.visits_this_service == 0 && " + f'{raw_expr} ~ "^-?[0-9]+(\\.[0-9]+)?$", {raw_expr}, "null")' + ) + entry = f'"{name}":%{{{vcl_macro}}}V' + else: + # 016: clamp the string-field value to a sane length + # (default 2000) BEFORE json.escape so a multi-megabyte + # attacker-controlled custom field cannot push the log + # line past Fastly's 16 KB limit and silently drop the + # whole entry. The substr is INSIDE json.escape so the + # encoded length stays bounded. 
+ cf_limit = int(cf.get("byte_limit") or limits.get(name) or 2000) + vcl_macro = ( + f'json.escape(if(fastly.ff.visits_this_service == 0, substr({raw_expr}, 0, {cf_limit}), ""))' + ) + entry = f'"{name}":"%{{{vcl_macro}}}V"' + parts.append(entry) + continue + if stage == "edge": expr = f"req.http.x-fos-edge-data:{name}" elif stage == "origin": @@ -1468,11 +1553,17 @@ def generate_log_format(log_fields_config: dict) -> str: expr = f"req.http.x-fos-edge-data:{name}" if value_type in ("numeric", "boolean"): - # Avoid empty strings breaking JSON numbers - vcl_macro = f'if({expr} != "", {expr}, "null")' + # 014: see deliver-stage comment above — strict numeric + # regex instead of ``!= ""`` so a custom-field header value + # like ``"]"`` cannot break out of the JSON log line. + vcl_macro = f'if({expr} ~ "^-?[0-9]+(\\.[0-9]+)?$", {expr}, "null")' entry = f'"{name}":%{{{vcl_macro}}}V' else: - vcl_macro = f"json.escape({expr})" + # 016: substr-clamp the value before json.escape so an + # oversized custom string field cannot push the line past + # Fastly's 16 KB log-line limit. + cf_limit = int(cf.get("byte_limit") or limits.get(name) or 2000) + vcl_macro = f"json.escape(substr({expr}, 0, {cf_limit}))" entry = f'"{name}":"%{{{vcl_macro}}}V"' parts.append(entry) diff --git a/backend/core/metadata_db.py b/backend/core/metadata_db.py index 63364f95..ce98edeb 100644 --- a/backend/core/metadata_db.py +++ b/backend/core/metadata_db.py @@ -22,6 +22,7 @@ import os import sqlite3 import threading +import time from datetime import UTC, datetime, timedelta from backend.utils.date_utils import iso_z, iso_z_now @@ -228,7 +229,23 @@ def teardown(service_id: str) -> None: error_count INTEGER DEFAULT 0, PRIMARY KEY (file_name, source_name) )""", - "CREATE INDEX IF NOT EXISTS idx_ingested_files_source ON ingested_files(source_name)", + # Covers `/usage/prefill`'s source+range narrowing + # (`WHERE source_name = ? AND ingested_at BETWEEN ? 
AND ?`) and the + # bounded `list_unbackfilled_fastly_edge_files` scan (see :1128). The + # previous `idx_ingested_files_source` indexed source_name alone — SQLite + # had to walk every row for the matching source and filter ingested_at + # in memory (~250ms per query on populated services). The composite + # satisfies the range scan directly and is a strict superset for + # source_name-only lookups (SQLite uses leading-column prefixes), so the + # old index is redundant and dropped here. Index name matches the + # by-name reference in `list_unbackfilled_fastly_edge_files`'s docstring. + "CREATE INDEX IF NOT EXISTS idx_ingested_files_source_ingested_at ON ingested_files(source_name, ingested_at)", + "DROP INDEX IF EXISTS idx_ingested_files_source", + # Earlier in this branch a redundant `idx_ingested_files_source_ts` was + # added under a different name before discovering the existing + # by-name reference above; clean it up so no service ends up with two + # functionally identical composites. + "DROP INDEX IF EXISTS idx_ingested_files_source_ts", # Single-row-per-service rollup maintained by ``insert_ingested_files``. # Without it, ``get_ingested_files_status_summary`` had to SUM(row_count) # + SUM(file_size_bytes) across the whole table on every cron tick — @@ -278,6 +295,13 @@ def teardown(service_id: str) -> None: log_output TEXT )""", "CREATE INDEX IF NOT EXISTS idx_cron_task_started ON cron_runs(task, started_at)", + # Covers `/logs`'s unfiltered pagination + # (`ORDER BY started_at DESC LIMIT ? OFFSET ?` with no `WHERE task`) and + # `main.py`'s sync-status probe (`WHERE task='sync' AND status != 'running' + # ORDER BY started_at DESC LIMIT 1`). Without it, SQLite falls back to a + # TEMP B-TREE sort over the full table because `idx_cron_task_started` + # requires a leading-`task` predicate to satisfy the ORDER BY. 
+ "CREATE INDEX IF NOT EXISTS idx_cron_started ON cron_runs(started_at DESC)", """CREATE TABLE IF NOT EXISTS asn_names ( asn INTEGER PRIMARY KEY, name TEXT NOT NULL, @@ -321,6 +345,44 @@ def teardown(service_id: str) -> None: last_triggered_at TEXT, created_at TEXT DEFAULT (datetime('now')) )""", + # Admin-flagged sessions for the edge session-scoring system. Each row + # is one (service, sid) tuple labeled good/bad/neutral by the admin. + # Feeds backend.scoring.evaluate.evaluate() for matrix ROC-AUC; the + # neutral label is captured for UI completeness but excluded from the + # AUC computation (intentionally uncertain). + """CREATE TABLE IF NOT EXISTS scoring_labels ( + id TEXT PRIMARY KEY, + service_id TEXT NOT NULL, + sid TEXT NOT NULL, + label TEXT NOT NULL CHECK (label IN ('good', 'bad', 'neutral')), + notes TEXT DEFAULT '', + flagged_by TEXT, + sample_ip TEXT, + sample_ua TEXT, + sample_url TEXT, + created_at TEXT DEFAULT (datetime('now')), + updated_at TEXT DEFAULT (datetime('now')) + )""", + "CREATE UNIQUE INDEX IF NOT EXISTS idx_scoring_labels_svc_sid ON scoring_labels(service_id, sid)", + "CREATE INDEX IF NOT EXISTS idx_scoring_labels_svc_label ON scoring_labels(service_id, label)", + # Operator audit log specifically for scoring-config mutations. + # Separate from audit_logs (which gets state_sync'd) because scoring- + # audit is per-host operator-attribution data that should NOT mirror + # to read_only analyst replicas. 
+ """CREATE TABLE IF NOT EXISTS scoring_audit ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL DEFAULT (datetime('now')), + service_id TEXT NOT NULL, + action TEXT NOT NULL, + actor TEXT NOT NULL, + details TEXT + )""", + "CREATE INDEX IF NOT EXISTS idx_scoring_audit_svc_ts ON scoring_audit(service_id, timestamp DESC)", + # Plain timestamp index for ORDER BY timestamp DESC scans (as in + # prune_scoring_audit) when the service_id predicate is already satisfied — + # keeps the sort indexed instead of falling back to a TEMP B-TREE on large + # audit tables. NOTE(review): list_scoring_audit orders by id DESC — confirm. + "CREATE INDEX IF NOT EXISTS idx_scoring_audit_ts ON scoring_audit(timestamp DESC)", """CREATE TABLE IF NOT EXISTS usage_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, @@ -372,6 +434,44 @@ def teardown(service_id: str) -> None: # 500-row page. Including (operation_class, count, bytes) makes the # aggregate covering too (5× faster than non-covering on the same query). "CREATE INDEX IF NOT EXISTS idx_usage_service_ts ON usage_log(service_id, timestamp, operation_class, count, bytes)", + # Hourly rollup of usage_log keyed by (service, hour-prefix of timestamp, + # operation_class, operation_type). Powers the /admin/usage-log aggregate + # GROUP BY which used to scan millions of usage_log rows (~600 ms steady + # state). With the rollup the aggregate becomes a small indexed sum over + # at most 24 hours × a few op-class/type pairs. Maintained by the + # AFTER INSERT trigger below (incremental, always-consistent) plus a + # backfill helper for services upgrading from a pre-rollup install.
+ """CREATE TABLE IF NOT EXISTS usage_log_hourly_summary ( + service_id TEXT NOT NULL, + hour TEXT NOT NULL, + operation_class TEXT NOT NULL DEFAULT '', + operation_type TEXT NOT NULL DEFAULT '', + count INTEGER NOT NULL DEFAULT 0, + bytes INTEGER NOT NULL DEFAULT 0, + last_updated TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (service_id, hour, operation_class, operation_type) + )""", + "CREATE INDEX IF NOT EXISTS idx_usage_hourly_svc_hour ON usage_log_hourly_summary(service_id, hour)", + # AFTER INSERT trigger: every row added to usage_log bumps its hour bucket + # in the summary. Hour key = first 13 chars of timestamp ("YYYY-MM-DDTHH"). + # Coalesce on empty operation_class/operation_type because rows can have + # NULLs; the rollup uses '' as a normalised sentinel. ON CONFLICT path + # supports the reconcile_fastly_stats compaction pattern where multiple + # rows for the same (hour, class, type) accumulate. + """CREATE TRIGGER IF NOT EXISTS trg_usage_log_summary_insert + AFTER INSERT ON usage_log + WHEN NEW.timestamp IS NOT NULL AND length(NEW.timestamp) >= 13 AND NEW.service_id IS NOT NULL + BEGIN + INSERT INTO usage_log_hourly_summary + (service_id, hour, operation_class, operation_type, count, bytes, last_updated) + VALUES (NEW.service_id, substr(NEW.timestamp, 1, 13), + COALESCE(NEW.operation_class, ''), COALESCE(NEW.operation_type, ''), + COALESCE(NEW.count, 1), COALESCE(NEW.bytes, 0), datetime('now')) + ON CONFLICT(service_id, hour, operation_class, operation_type) + DO UPDATE SET count = count + excluded.count, + bytes = bytes + excluded.bytes, + last_updated = excluded.last_updated; + END""", # Tracks Iceberg parquet basenames that local_compaction merged into a # bigger local file and then deleted from disk. 
WITHOUT this table the # sync_data fast-path check sees the deletions as "missing local files" @@ -383,28 +483,31 @@ def teardown(service_id: str) -> None: file_name TEXT PRIMARY KEY, compacted_at TEXT DEFAULT (datetime('now')) )""", + # Tracking table for the data-migration framework + # (``backend.core.data_migrations``). Each row records one applied + # data-migration: long-running, one-time data setup tasks (e.g. the + # rollups initial backfill) that are NOT schema DDL changes. Schema + # migrations use ``PRAGMA user_version`` via ``sqlite_migrations.py`` + # — these two systems are intentionally separate because schema + # changes must block startup, while data migrations run async on a + # daemon thread so a multi-hour backfill can't wedge the boot loop. + """CREATE TABLE IF NOT EXISTS applied_data_migrations ( + name TEXT PRIMARY KEY, + applied_at TEXT NOT NULL DEFAULT (datetime('now')), + duration_s REAL, + status TEXT NOT NULL DEFAULT 'success', + notes TEXT + )""", ] def _init_schema(con: sqlite3.Connection) -> None: + from backend.core import sqlite_migrations + for stmt in _SCHEMA: con.execute(stmt) con.commit() - # Bring pre-migration-framework service DBs up to current. Migrations - # are idempotent (each checks before mutating) so this is also safe to - # call on fresh DBs that already have everything from ``_SCHEMA``. - # On a healthy fresh install the loop exits on the first version check. - from backend.core import sqlite_migrations - - applied = sqlite_migrations.apply_pending(con) - if applied: - logger.info("[metadata_db] applied %d pending migration(s)", applied) - # New DBs leap straight to LATEST so the migration loop doesn't waste - # a check on every open. Idempotency means doing the work first is - # harmless, but skipping the inspection is cheaper at scale. 
- if sqlite_migrations.get_current_version(con) < sqlite_migrations.LATEST_VERSION: - con.execute(f"PRAGMA user_version = {sqlite_migrations.LATEST_VERSION}") - con.commit() + sqlite_migrations.apply_pending(con) # ── alerts ──────────────────────────────────────────────────────────────────── @@ -627,6 +730,44 @@ def replace_views_for_service(service_id: str, views: list[dict]) -> None: con.commit() +def upsert_views_for_service(service_id: str, views: list[dict]) -> None: + """Upsert saved views by id WITHOUT deleting local-only rows. + + Used by state_sync.import_admin_state on read_only analyst hosts so + locally-created views (which the analyst created on their own pod) are + preserved through every metadata_sync cron tick. Without this, the + cron's wholesale DELETE+INSERT silently wiped any analyst-side view + that hadn't been mirrored back to FOS — and ``export_admin_state`` + refuses to push from read_only hosts, so the loss was permanent. + """ + if not views: + return + con = get_con(service_id) + con.executemany( + "INSERT INTO views (id, service_id, name, filters_json, time_range_type, start_time, end_time, page, created_at) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
" + "ON CONFLICT(id) DO UPDATE SET " + "name=excluded.name, filters_json=excluded.filters_json, " + "time_range_type=excluded.time_range_type, start_time=excluded.start_time, " + "end_time=excluded.end_time, page=excluded.page, created_at=excluded.created_at", + [ + ( + v.get("id"), + v.get("service_id"), + v.get("name"), + v.get("filters_json"), + v.get("time_range_type"), + v.get("start_time"), + v.get("end_time"), + v.get("page"), + v.get("created_at"), + ) + for v in views + ], + ) + con.commit() + + # ── audit_logs ──────────────────────────────────────────────────────────────── @@ -741,6 +882,37 @@ def replace_audit_for_service(service_id: str, rows: list[dict]) -> None: con.commit() +def merge_audit_for_service(service_id: str, rows: list[dict]) -> None: + """Insert audit log entries from remote without deleting local ones. + + Used by state_sync.import_admin_state on read_only analyst hosts to + preserve local audit entries created by the analyst's own actions + (which the wholesale ``replace_audit_for_service`` would have wiped on + every cron tick). + + Dedup key: (timestamp, source_name, event_type, actor) — a row with + those four fields equal to an existing row is considered the same + event and skipped. ``timestamp`` has second precision so collisions + between distinct events are improbable, and even if they happen the + audit log tolerates the missed insert. + """ + if not rows: + return + con = get_con(service_id) + for r in rows: + existing = con.execute( + "SELECT 1 FROM audit_logs WHERE source_name = ? AND timestamp = ? AND event_type = ? AND actor = ? 
LIMIT 1", + (r.get("source_name"), r.get("timestamp"), r.get("event_type"), r.get("actor")), + ).fetchone() + if existing: + continue + con.execute( + "INSERT INTO audit_logs (timestamp, source_name, event_type, details, actor) VALUES (?, ?, ?, ?, ?)", + (r.get("timestamp"), r.get("source_name"), r.get("event_type"), r.get("details"), r.get("actor")), + ) + con.commit() + + # ── ingested_files ──────────────────────────────────────────────────────────── @@ -1503,6 +1675,114 @@ def purge_cron_runs( con.commit() +def record_scoring_audit( + service_id: str, + action: str, + *, + actor: str = "operator", + details: dict | None = None, +) -> None: + """Append an operator-attribution row to the scoring_audit log. + + Called from every scoring-config-mutating endpoint (enable, disable, + threshold commit + enforce, retrain, rotate-key, matrix-rollback). + Best-effort: any SQLite failure is logged at DEBUG and swallowed so + a busy WAL doesn't block the actual operator action. + """ + try: + con = get_con(service_id) + con.execute( + "INSERT INTO scoring_audit (service_id, action, actor, details) VALUES (?, ?, ?, ?)", + (service_id, action, actor, json.dumps(details) if details else None), + ) + con.commit() + except sqlite3.Error as e: + logger.debug("[metadata_db] record_scoring_audit(%s, %s) failed: %s", service_id, action, e) + + +def list_scoring_audit( + service_id: str, + *, + limit: int = 100, + since: str | None = None, +) -> list[dict]: + """Most-recent first. Optional ISO ``since`` timestamp lower bound.""" + try: + con = get_con(service_id) + if since: + rows = con.execute( + "SELECT id, timestamp, action, actor, details FROM scoring_audit " + "WHERE service_id = ? AND timestamp >= ? ORDER BY id DESC LIMIT ?", + (service_id, since, limit), + ).fetchall() + else: + rows = con.execute( + "SELECT id, timestamp, action, actor, details FROM scoring_audit " + "WHERE service_id = ? 
ORDER BY id DESC LIMIT ?", + (service_id, limit), + ).fetchall() + out = [] + for r in rows: + row = dict(r) + if row.get("details"): + try: + row["details"] = json.loads(row["details"]) + except (ValueError, TypeError): + pass + out.append(row) + return out + except sqlite3.Error as e: + logger.debug("[metadata_db] list_scoring_audit(%s) failed: %s", service_id, e) + return [] + + +def prune_scoring_audit(service_id: str, *, keep_last: int = 10000) -> None: + """Trim scoring_audit to the most recent ``keep_last`` rows per service. + + Cheap unbounded growth guard — every scoring-config mutation appends + one row, and the table is only ever read by the admin UI / state_sync + export which already caps its own page size. Best-effort: any SQLite + failure is logged at DEBUG and swallowed so trimming never blocks the + caller (typically a maintenance cron, not the operator hot path). + """ + try: + con = get_con(service_id) + # Tiebreak on id DESC so concurrent inserts that landed in the same + # `datetime('now')` second are deterministically ordered (otherwise + # SQLite is free to pick any row from the tied group, which makes + # prune flaky under burst workloads and breaks reproducibility tests). + con.execute( + "DELETE FROM scoring_audit WHERE service_id = ? AND id NOT IN (" + "SELECT id FROM scoring_audit WHERE service_id = ? ORDER BY timestamp DESC, id DESC LIMIT ?)", + (service_id, service_id, keep_last), + ) + con.commit() + except sqlite3.Error as e: + logger.debug("[metadata_db] prune_scoring_audit(%s) failed: %s", service_id, e) + + +def get_cron_run_status(service_id: str, run_id: int) -> str | None: + """Return the status string for a single cron_runs row, or None if + the row doesn't exist. Used by cron_progress.list_active_runs to + cross-check the in-memory state against the DB-of-truth (catches + abandoned-worker-thread zombies that completed log_cron_run but + never fired end_progress). 
+ + Narrowed exception scope: catches sqlite3.Error (DB unreachable, + table missing, locked) and logs at DEBUG so the next 'why isn't + the cross-check firing?' triage isn't flying blind. Returns None + on any DB failure so list_active_runs falls back to the in-memory + signal (we'd rather show a false in-flight than miss a real one). + """ + try: + con = get_con(service_id) + row = con.execute("SELECT status FROM cron_runs WHERE id = ?", (run_id,)).fetchone() + return row["status"] if row else None + except sqlite3.Error as e: + logger.debug("[metadata_db] get_cron_run_status(%s, %s) failed: %s", service_id, run_id, e) + return None + + def get_cron_runs( service_id: str, *, @@ -2019,6 +2299,180 @@ def clear_usage_log(service_id: str) -> None: con.commit() +USAGE_LOG_HOURLY_BACKFILL_NAME = "2026-06-04_usage_log_hourly_summary_backfill" + +# Per-process guard so the in-process check doesn't hit SQLite on every read. +# The DB-level marker (applied_data_migrations) is the source of truth across +# restarts; this cache just trims redundant lookups within one process. +_usage_log_backfilled: set[str] = set() +_usage_log_backfill_lock = threading.Lock() + + +def _ensure_usage_log_hourly_backfilled(con: sqlite3.Connection, service_id: str) -> None: + """Populate usage_log_hourly_summary for services upgrading from a + pre-trigger install. Idempotent; runs at most once per service. + + Detection: presence of the named row in ``applied_data_migrations``. The + trigger handles all NEW inserts; this backfill catches the rows that + existed before the trigger was added. Synchronous so /admin/usage-log + returns correct data on first access (typically <1 s for ~1 M rows). 
+ """ + if service_id in _usage_log_backfilled: + return + with _usage_log_backfill_lock: + if service_id in _usage_log_backfilled: + return + try: + applied = con.execute( + "SELECT 1 FROM applied_data_migrations WHERE name = ?", + (USAGE_LOG_HOURLY_BACKFILL_NAME,), + ).fetchone() + if applied is None: + t0 = time.time() + logger.info("[usage_log] backfilling hourly summary for %s", service_id) + # Wipe any partial summary rows the trigger may have written + # for this service since boot — we're rebuilding from raw so + # the GROUP BY sum is exact, not double-counted on top of + # trigger-written rows. + con.execute("DELETE FROM usage_log_hourly_summary WHERE service_id = ?", (service_id,)) + con.execute( + """ + INSERT INTO usage_log_hourly_summary + (service_id, hour, operation_class, operation_type, count, bytes, last_updated) + SELECT service_id, + substr(timestamp, 1, 13), + COALESCE(operation_class, ''), + COALESCE(operation_type, ''), + SUM(COALESCE(count, 1)), + SUM(COALESCE(bytes, 0)), + datetime('now') + FROM usage_log + WHERE service_id = ? 
+ AND timestamp IS NOT NULL + AND length(timestamp) >= 13 + GROUP BY 1, 2, 3, 4 + """, + (service_id,), + ) + con.execute( + "INSERT OR REPLACE INTO applied_data_migrations " + "(name, applied_at, duration_s, status, notes) VALUES (?, ?, ?, ?, ?)", + (USAGE_LOG_HOURLY_BACKFILL_NAME, iso_z_now(), time.time() - t0, "success", + "rebuilt usage_log_hourly_summary from raw"), + ) + con.commit() + logger.info("[usage_log] hourly backfill complete for %s in %.2fs", service_id, time.time() - t0) + except Exception as e: + logger.warning("[usage_log] hourly summary backfill failed for %s: %s", service_id, e) + _usage_log_backfilled.add(service_id) + + +def _query_usage_log_aggregate_rollup( + con: sqlite3.Connection, + service_id: str, + start: str, + end: str, + usage_type: str, +) -> list[sqlite3.Row]: + """Compute the (operation_class, operation_type) totals exactly using the + hourly rollup for fully-contained hours plus raw usage_log for the two + boundary hours (which usually aren't hour-aligned). + + The rollup PK lookup is sub-millisecond; the boundary raw scans cover at + most 2 hours of data (~80 k rows in a busy service) and ride the + idx_usage_service_ts index. Combined cost is typically ~1-2 ms vs the + 600 ms full-window GROUP BY this replaces. + """ + # Hour bucket prefix is "YYYY-MM-DDTHH" (13 chars). Timestamps in + # usage_log are stored as ISO strings, so prefix comparison is correct. + start_hour = (start or "")[:13] + end_hour = (end or "")[:13] + + class_filter = "" + class_params: list = [] + if usage_type: + if usage_type == "CDN": + class_filter = "AND operation_class = 'CDN'" + elif usage_type == "FOS-A": + class_filter = "AND operation_class = 'A'" + elif usage_type == "FOS-B": + class_filter = "AND operation_class = 'B'" + elif usage_type == "FOS": + class_filter = "AND operation_class IN ('A', 'B')" + else: + class_filter = "AND operation_class = ?" 
+ class_params = [usage_type] + + # Sub-hour range collapses to a single raw scan — no hour bucket fully + # contained, both boundary parts would target the same hour anyway. + if start_hour == end_hour: + rows = con.execute( + f""" + SELECT operation_class, operation_type, + SUM(count) AS c, SUM(COALESCE(bytes, 0)) AS b + FROM usage_log + WHERE service_id = ? AND timestamp >= ? AND timestamp <= ? {class_filter} + GROUP BY operation_class, operation_type + """, + [service_id, start, end] + class_params, + ).fetchall() + return rows + + # Boundary range comparisons keyed on timestamp directly (not + # `substr(timestamp, 1, 13)`) so SQLite can ride idx_usage_service_ts + # as a pure range scan — substr() forces per-row evaluation, ~5x slower + # on the end-of-day boundary (18k rows: 90ms with substr vs ~15ms with + # pure range). The hour boundary is the start of the FOLLOWING hour, so + # we strip any " " or "T" between date/time and use the ISO Z form to + # match what writers store. + def _next_hour_start(hour_prefix: str) -> str: + # "2026-06-04T23" → "2026-06-05T00:00:00.000Z" + try: + dt = datetime.strptime(hour_prefix, "%Y-%m-%dT%H").replace(tzinfo=UTC) + except ValueError: + return hour_prefix + ":59:59.999Z" + nxt = dt + timedelta(hours=1) + return nxt.strftime("%Y-%m-%dT%H:%M:%S.000Z") + + def _hour_start(hour_prefix: str) -> str: + return hour_prefix + ":00:00.000Z" + + start_hour_end = _next_hour_start(start_hour) + end_hour_start = _hour_start(end_hour) + + # Three-part UNION ALL: interior hours from rollup, boundary hours from + # raw usage_log. SUM(SUM(...)) collapses the two sources into a single + # (op_class, op_type) tuple per group. + rollup_class_filter = class_filter # same syntax works against the rollup + rows = con.execute( + f""" + SELECT operation_class, operation_type, + SUM(c) AS c, SUM(b) AS b + FROM ( + SELECT operation_class, operation_type, count AS c, bytes AS b + FROM usage_log_hourly_summary + WHERE service_id = ? AND hour > ? 
AND hour < ? {rollup_class_filter} + UNION ALL + SELECT operation_class, operation_type, count AS c, COALESCE(bytes, 0) AS b + FROM usage_log + WHERE service_id = ? AND timestamp >= ? AND timestamp < ? {class_filter} + UNION ALL + SELECT operation_class, operation_type, count AS c, COALESCE(bytes, 0) AS b + FROM usage_log + WHERE service_id = ? AND timestamp >= ? AND timestamp <= ? {class_filter} + ) + GROUP BY operation_class, operation_type + """, + # Interior rollup params + [service_id, start_hour, end_hour] + class_params + # Start-boundary raw params: [start, next_hour_after_start_hour) + + [service_id, start, start_hour_end] + class_params + # End-boundary raw params: [start_of_end_hour, end] + + [service_id, end_hour_start, end] + class_params, + ).fetchall() + return rows + + def get_usage_logs( service_id: str, start: str, @@ -2065,20 +2519,33 @@ def get_usage_logs( ) entries = [dict(r) for r in cur.fetchall()] - # One GROUP BY (operation_class, operation_type) does the work of both the - # 5-CASE-WHEN totals query AND the per-class breakdown — they're the same - # 800K-row scan over usage_log, just shaped differently. Doing both in - # one query saves a full pass per Usage Log page load (~1s on prod). - grouped = con.execute( - f""" - SELECT operation_class, operation_type, - sum(count) AS c, sum(coalesce(bytes, 0)) AS b - FROM usage_log - WHERE {where} - GROUP BY 1, 2 - """, - params, - ).fetchall() + # Aggregate path: prefer the usage_log_hourly_summary rollup when only the + # service+timestamp predicates are active (the common admin-page case). The + # rollup is maintained incrementally by trg_usage_log_summary_insert, so + # it's always consistent — no scheduler needed. We can only use it when no + # process_context / operation_type LIKE filters are present (the rollup + # doesn't carry those columns); the operation_class filter IS supported + # because the rollup stores it as a normalised key. 
Backfill of any + # service that predates the trigger happens lazily on first read. + rollup_eligible = not process_context and not operation_type + if rollup_eligible: + _ensure_usage_log_hourly_backfilled(con, service_id) + grouped = _query_usage_log_aggregate_rollup(con, service_id, start, end, usage_type) + else: + # One GROUP BY (operation_class, operation_type) does the work of both the + # 5-CASE-WHEN totals query AND the per-class breakdown — they're the same + # 800K-row scan over usage_log, just shaped differently. Doing both in + # one query saves a full pass per Usage Log page load (~1s on prod). + grouped = con.execute( + f""" + SELECT operation_class, operation_type, + sum(count) AS c, sum(coalesce(bytes, 0)) AS b + FROM usage_log + WHERE {where} + GROUP BY 1, 2 + """, + params, + ).fetchall() totals = {"A": 0, "B": 0, "CDN": 0} bytes_by_class = {"A": 0, "B": 0, "CDN": 0} @@ -2105,3 +2572,369 @@ def get_usage_logs( } return entries, total, res_agg + + +# ── Metadata retention / cleanup ────────────────────────────────────────────── +# usage_log and ingested_files are append-only and unbounded by default. +# On a long-running deploy they grow without limit (witnessed: 5.7 GB +# metadata.db with 8.25M usage_log rows + 2.35M ingested_files rows). The +# UI doesn't need that history beyond a short window — Usage & Cost pages +# query a configurable window; Data Management shows recent files; cron_runs +# is a short audit trail. Trim by age; keep VACUUM gated to actual deletions +# because a no-op VACUUM still rewrites the whole file. + +# Per-table retention windows (days). Override via cfg["metadata_retention"] +# per service. 0 (or negative) disables cleanup for that table / artefact. +# +# rollups_days is not a SQLite table but a per-hour parquet tree under +# ``/rollups/hour/field=X/hour=Y/``. The cleanup helper deletes +# hour-dirs older than this window. 
Default 90d gives broad dashboard +# query coverage while bounding disk; set to 0 to keep all history. +DEFAULT_METADATA_RETENTION = { + "usage_log_days": 1, + "ingested_files_days": 1, + "cron_runs_days": 7, + "rollups_days": 90, +} + +# Tables surfaced in the storage stats endpoint. Order matters for the UI. +_STATS_TABLES = ( + "usage_log", + "ingested_files", + "cron_runs", + "alerts", + "saved_views", + "audit_log", + "in_flight_buffers", + "locally_compacted_files", +) + +# (table, retention_key, timestamp_column) for each trimmable table. +_CLEANUP_TABLES = ( + ("usage_log", "usage_log_days", "timestamp"), + ("ingested_files", "ingested_files_days", "ingested_at"), + ("cron_runs", "cron_runs_days", "started_at"), +) + + +def get_metadata_storage_stats(service_id: str) -> dict: + """Per-table row count + estimated bytes for this service's metadata.db. + + Bytes come from SQLite's ``dbstat`` virtual table (compiled into stock + Python sqlite3 ≥3.31). If a table doesn't exist (older schema), it's + omitted rather than erroring. Total ``db_bytes`` is the sum across the + whole file — including indexes, free pages, and tables not in + ``_STATS_TABLES``, so it won't equal sum-of-per-table-bytes. 
+ """ + con = get_con(service_id) + out: dict[str, dict] = {} + for t in _STATS_TABLES: + try: + rows = con.execute(f"SELECT count(*) FROM {t}").fetchone()[0] + except sqlite3.OperationalError: + continue + try: + row = con.execute("SELECT sum(pgsize) FROM dbstat WHERE name = ?", (t,)).fetchone() + bytes_ = int(row[0]) if row and row[0] is not None else 0 + except sqlite3.OperationalError: + bytes_ = None + out[t] = {"rows": int(rows or 0), "bytes": bytes_} + + db_bytes: int | None + try: + row = con.execute("SELECT sum(pgsize) FROM dbstat").fetchone() + db_bytes = int(row[0]) if row and row[0] is not None else 0 + except sqlite3.OperationalError: + db_bytes = None + + return { + "tables": out, + "db_bytes": db_bytes, + "db_path": db_path(service_id), + } + + +def is_ingested_files_dedup_active(service_id: str) -> bool: + """Return True when the ``ingested_files`` table is the active dedup gate. + + The sync's ``delete_after`` flag (default True) makes ingest a destructive + op: a successfully-ingested .gz is DELETEd from FOS, so a future LIST + can never re-discover it — the ``ingested_files`` row is vestigial + after that point. When ``delete_after`` is set to False, the raw files + stay in FOS forever and the daily ``full_sync`` (cron) does a complete + LIST; the only thing stopping it from re-ingesting every prior file is + a matching entry in ``ingested_files``. In that mode the table CANNOT + be trimmed without causing re-ingestion storms. + """ + from backend import config as svcconfig + + cfg = svcconfig.load_config(service_id) or {} + delete_after = cfg.get("provisioning", {}).get("cron_sync", {}).get("delete_after", True) + # Treat anything other than an explicit False as safe-to-trim. None, + # missing, truthy strings — all default to the safe path. + return delete_after is not False + + +def cleanup_metadata( + service_id: str, + retention: dict | None = None, + on_event=None, +) -> dict: + """Delete rows older than the per-table retention window. 
VACUUM if any were deleted. + + retention shape: ``{"usage_log_days": int, "ingested_files_days": int, + "cron_runs_days": int}``. Missing keys fall back to + ``DEFAULT_METADATA_RETENTION``. A value of 0 (or negative) disables + cleanup for that table — useful for an analyst-only service that wants + to retain the full audit trail. + + ``ingested_files_days`` is **force-overridden to 0** when + ``cron_sync.delete_after`` is False on this service — see + ``is_ingested_files_dedup_active``. The override is announced via an + ``on_event`` status message so the operator knows the configured + retention is being ignored. + + ``on_event``: optional callable receiving event dicts at each milestone + (status messages, per-table delete results, VACUUM start/end). The + callback is invoked synchronously from the worker — the manual-cleanup + endpoint uses a thread-safe queue to bridge to SSE. Event shapes: + + {"type": "status", "message": str} + {"type": "progress", "current": int, "total": int, "message": str} + + The scheduled cron passes ``on_event=None`` and gets silent operation + (events still arrive in the function's return dict for logging). + + Returns ``{"deleted": {table: count}, "before": {table: rows}, + "after": {table: rows}, "vacuumed": bool, "duration_s": float}``. + """ + import time as _t + + def _emit(event: dict) -> None: + if on_event is None: + return + try: + on_event(event) + except Exception: + # Never let an event-sink failure abort the cleanup itself. + pass + + cfg = {**DEFAULT_METADATA_RETENTION, **(retention or {})} + + # Safety override: when cron_sync.delete_after is False, ingested_files + # is the dedup gate against re-LIST → re-ingest by the daily full_sync. + # Trimming it would re-ingest every aged-out file. Force-disable the + # ingested_files retention regardless of what cfg / caller passed, + # and surface the override so the operator sees why it didn't apply. 
+ if not is_ingested_files_dedup_active(service_id): + configured = int(cfg.get("ingested_files_days") or 0) + if configured > 0: + _emit( + { + "type": "status", + "message": ( + f"ingested_files retention ({configured}d) ignored — " + "cron_sync.delete_after=false makes this table the dedup gate. " + "Trimming would cause full_sync to re-ingest aged-out files." + ), + } + ) + cfg["ingested_files_days"] = 0 + + con = get_con(service_id) + t0 = _t.time() + + # Steps: 3 deletes + 1 vacuum + 1 post-count = 5. Set up the progress + # framing so the modal can render a determinate bar. + total_steps = len(_CLEANUP_TABLES) + 2 + + _emit({"type": "status", "message": "Reading current row counts…"}) + before: dict[str, int] = {} + for table, _, _ in _CLEANUP_TABLES: + try: + before[table] = int(con.execute(f"SELECT count(*) FROM {table}").fetchone()[0] or 0) + except sqlite3.OperationalError: + before[table] = 0 + + deleted: dict[str, int] = {} + for idx, (table, key, ts_col) in enumerate(_CLEANUP_TABLES, start=1): + days = cfg.get(key) + try: + days_int = int(days) if days is not None else 0 + except (TypeError, ValueError): + days_int = 0 + if days_int <= 0: + deleted[table] = 0 + _emit( + { + "type": "progress", + "current": idx, + "total": total_steps, + "message": f"{table}: retention disabled (0 days) — skipped", + } + ) + continue + _emit({"type": "status", "message": f"Trimming {table} (older than {days_int}d)…"}) + try: + cur = con.execute( + f"DELETE FROM {table} WHERE {ts_col} < datetime('now', ?)", + (f"-{days_int} days",), + ) + deleted[table] = int(cur.rowcount or 0) + con.commit() + _emit( + { + "type": "progress", + "current": idx, + "total": total_steps, + "message": f"{table}: deleted {deleted[table]:,} rows (kept rows ≤{days_int}d old)", + } + ) + except sqlite3.OperationalError as e: + logger.warning("[metadata_cleanup] %s: skip %s — %s", service_id, table, e) + deleted[table] = 0 + _emit( + { + "type": "progress", + "current": idx, + "total": 
total_steps, + "message": f"{table}: skipped ({e})", + } + ) + + vacuumed = False + if any(deleted.values()): + # VACUUM cannot run inside an open transaction. Commit + drop the + # Python wrapper's auto-BEGIN so the next execute() autocommits. + _emit( + { + "type": "status", + "message": "VACUUMing — rewrites the whole file, may take minutes on large DBs…", + } + ) + con.commit() + old_iso = con.isolation_level + con.isolation_level = None + try: + con.execute("VACUUM") + vacuumed = True + _emit( + { + "type": "progress", + "current": len(_CLEANUP_TABLES) + 1, + "total": total_steps, + "message": "VACUUM complete — file shrunk to reflect deletions", + } + ) + except sqlite3.OperationalError as e: + # Locked / busy — not fatal, the delete already shrank the row count. + logger.warning("[metadata_cleanup] %s: VACUUM skipped — %s", service_id, e) + _emit( + { + "type": "progress", + "current": len(_CLEANUP_TABLES) + 1, + "total": total_steps, + "message": f"VACUUM skipped ({e}) — row counts already reduced", + } + ) + finally: + con.isolation_level = old_iso + else: + _emit( + { + "type": "progress", + "current": len(_CLEANUP_TABLES) + 1, + "total": total_steps, + "message": "Nothing deleted — VACUUM skipped (no-op rewrite would waste cycles)", + } + ) + + after: dict[str, int] = {} + for table, _, _ in _CLEANUP_TABLES: + try: + after[table] = int(con.execute(f"SELECT count(*) FROM {table}").fetchone()[0] or 0) + except sqlite3.OperationalError: + after[table] = 0 + _emit( + { + "type": "progress", + "current": total_steps, + "total": total_steps, + "message": f"Final counts: {', '.join(f'{t}={n:,}' for t, n in after.items())}", + } + ) + + # Rollup parquet tree cleanup — independent of the SQLite tables. Skip + # silently when the rollups module / source aren't available; rollups + # are an optimisation, never a correctness dependency. 
+ rollups_deleted = 0 + try: + rollups_days = int(cfg.get("rollups_days") or 0) + except (TypeError, ValueError): + rollups_days = 0 + if rollups_days > 0: + try: + from backend.core import rollups as _rollups + from backend.core.duckdb import get_source_for_service + + src = get_source_for_service(service_id) + if src is not None: + rollups_deleted = _rollups.cleanup_old_rollups(service_id, src, rollups_days) + if rollups_deleted: + _emit( + { + "type": "status", + "message": f"Rollups: dropped {rollups_deleted} hour-dir(s) older than {rollups_days}d", + } + ) + except Exception as e: + logger.warning("[metadata_cleanup] %s: rollups cleanup skipped — %s", service_id, e) + + return { + "deleted": deleted, + "before": before, + "after": after, + "vacuumed": vacuumed, + "rollups_deleted": rollups_deleted, + "duration_s": round(_t.time() - t0, 3), + } + + +# ── Data-migration tracking ─────────────────────────────────────────────────── +# See backend/core/data_migrations.py for the runner. These helpers exist here +# (not in the runner module) so the runner can stay free of sqlite imports — +# the per-service connection lifecycle lives entirely in this module. + + +def list_applied_data_migrations(service_id: str) -> set[str]: + """Return the set of applied data-migration names for a service. + + Used by the runner to diff against the registered MIGRATIONS list and + determine which still need to run. Returns an empty set for a fresh DB. + """ + con = get_con(service_id) + try: + rows = con.execute("SELECT name FROM applied_data_migrations").fetchall() + return {r["name"] for r in rows} + except sqlite3.OperationalError: + # Schema not yet initialised — caller will hit this on its first + # successful query path; treat as "nothing applied yet". 
+ return set() + + +def record_applied_data_migration( + service_id: str, + name: str, + *, + duration_s: float, + status: str = "success", + notes: str | None = None, +) -> None: + """Persist a successful (or failed) migration completion.""" + con = get_con(service_id) + con.execute( + "INSERT OR REPLACE INTO applied_data_migrations (name, applied_at, duration_s, status, notes) " + "VALUES (?, ?, ?, ?, ?)", + (name, iso_z_now(), float(duration_s), status, notes), + ) + con.commit() diff --git a/backend/core/rollups.py b/backend/core/rollups.py new file mode 100644 index 00000000..6b65ca09 --- /dev/null +++ b/backend/core/rollups.py @@ -0,0 +1,403 @@ +""" +Hourly Top-N rollups for the dashboard. + +For each tracked field (e.g. ``ip``, ``country``, ``url``, custom fields), we +keep one parquet file per hour at +``/rollups/hour/field=/hour=/compacted_*.parquet`` +holding the top-K most-common values for that field in that hour. + +The dashboard reads these instead of scanning the base ``logs`` view when no +filters are active, which cuts the unfiltered 24h top-N from a multi-second +scan to tens of milliseconds. The active hour is always served live off the +base table (rollups don't include the in-progress hour). + +Writers: +- ``recompute_touched_hours``: per sync tick, batched per-field COPY ... + PARTITION_BY (field, hour). Only re-computes the hours actually touched + by the new chunk. +- ``backfill_rollups``: one-shot bulk build over all historical hours, + invoked at first-boot and when a new field is added. +- ``cleanup_old_rollups``: drops per-hour directories older than the cfg + retention window. Called from the daily ``metadata_cleanup`` cron. + +Reader: +- ``QueryRunner.execute_top_n_rollups`` in + ``backend/repositories/_base.py``. 
+""" + +from __future__ import annotations + +import json +import logging +import os +import re +import shutil +import uuid +from datetime import UTC, datetime, timedelta + +logger = logging.getLogger(__name__) + +# How many top values per (field, hour) we persist. Dashboards render +# 10-25 at a time; 500 gives generous headroom for filter overlays and +# the long-tail "Other" rollup. +TOP_K = 500 + +# SQL identifier safelist. Field names land verbatim inside ``"..."`` +# quoted identifiers and inside SELECT projections; service names land +# in the table identifier ``logs_``. Both come from cfg / DuckDB +# schema and are PROBABLY already validated upstream — but a single +# stray double-quote or backtick in either would break the query in a +# way that's both a correctness bug and a privilege boundary (the +# fields are derived from admin-controlled custom_field entries). +# Defense in depth: this module reject anything not matching the +# pattern with a logged warning. +_SAFE_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + +def _is_safe_ident(name: str) -> bool: + return bool(name) and bool(_SAFE_IDENT_RE.match(name)) + + +def _safe_table_for(source: dict) -> str | None: + """Return ``logs_`` iff the service name is a safe identifier.""" + name = source.get("name") or "" + if not _is_safe_ident(name): + logger.warning("[rollups] refusing to query unsafe service name: %r", name) + return None + return f"logs_{name}" + + +def _get_fields(src: dict) -> list[str]: + """Return the dashboard fields eligible for rollup. + + Custom-field names are validated against ``_SAFE_IDENT_RE`` — anything + failing the check is skipped with a warning rather than fed into SQL. 
+ """ + from backend.repositories.dashboard import _VIRTUAL_FIELDS, FIELDS + + lf_config = src.get("log_fields") or {} + custom_field_names: list[str] = [] + for cf in lf_config.get("custom_fields", []): + if not cf.get("enabled", True) or not cf.get("show_in_dashboard", True): + continue + name = cf.get("name") or "" + if not _is_safe_ident(name): + logger.warning("[rollups] skipping custom field with unsafe name: %r", name) + continue + custom_field_names.append(name) + # Virtual fields (e.g. waf_sig_ind) are computed views over CSV columns + # — they aren't column names, so they can't be rolled up directly. + actual_fields = [f for f in FIELDS if f not in _VIRTUAL_FIELDS and _is_safe_ident(f)] + return actual_fields + custom_field_names + + +def _rollups_root(source: dict) -> str: + from backend.core.duckdb import _cache_dir + + return os.path.join(_cache_dir(source), "rollups", "hour") + + +def _markers_path(source: dict) -> str: + """JSON file tracking which fields have been backfilled. + + Replaces the prior single ``.backfill_done`` marker which couldn't + distinguish "fully backfilled" from "backfilled before a new custom + field was added". Shape: ``{"field": "ISO timestamp", ...}``. + """ + from backend.core.duckdb import _cache_dir + + return os.path.join(_cache_dir(source), "rollups", "backfill_markers.json") + + +def _load_markers(source: dict) -> dict[str, str]: + path = _markers_path(source) + if not os.path.exists(path): + return {} + try: + with open(path) as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except (OSError, json.JSONDecodeError) as e: + logger.warning("[rollups] could not read markers at %s: %s", path, e) + return {} + + +def _save_markers(source: dict, markers: dict[str, str]) -> None: + path = _markers_path(source) + os.makedirs(os.path.dirname(path), exist_ok=True) + # Atomic write so a crash mid-write doesn't truncate the file. 
+ tmp_path = f"{path}.tmp.{uuid.uuid4().hex[:8]}" + try: + with open(tmp_path, "w") as f: + json.dump(markers, f) + os.replace(tmp_path, path) + except OSError as e: + logger.warning("[rollups] could not write markers to %s: %s", path, e) + try: + os.remove(tmp_path) + except OSError: + pass + + +def _publish_field_partitions(tmp_field_dir: str, dst_root: str, field: str) -> int: + """Move per-hour parquet files from a temp PARTITION_BY tree into the + canonical ``rollups/hour/field=X/hour=Y/`` layout. + + The publish order is RENAME-then-UNLINK to close the race window where + a concurrent dashboard read could observe an empty hour directory. + Worst case after this change: a dashboard read briefly sees BOTH the + new and old parquet for the same hour and double-counts that hour + until the unlink lands — which is bounded and self-corrects on the + next refresh. Pre-fix, the dashboard could observe ZERO files for the + hour (undercount), which was indistinguishable from a real traffic dip. + + Caller MUST hold the per-service iceberg lock around the whole call. + Returns the number of hour-dirs published. + """ + field_dir = os.path.join(tmp_field_dir, f"field={field}") + if not os.path.isdir(field_dir): + return 0 + + published = 0 + for hour_dirname in os.listdir(field_dir): + if not hour_dirname.startswith("hour="): + continue + src_hour_dir = os.path.join(field_dir, hour_dirname) + dst_hour_dir = os.path.join(dst_root, f"field={field}", hour_dirname) + os.makedirs(dst_hour_dir, exist_ok=True) + + # 1. Rename new files into place first (overcounting window OK). + new_names: set[str] = set() + for fname in os.listdir(src_hour_dir): + if not fname.endswith(".parquet"): + continue + new_name = f"compacted_{uuid.uuid4().hex[:12]}.parquet" + os.rename(os.path.join(src_hour_dir, fname), os.path.join(dst_hour_dir, new_name)) + new_names.add(new_name) + + # 2. Now unlink any pre-existing files that we didn't just write. 
+ if new_names: + for existing in os.listdir(dst_hour_dir): + if existing.endswith(".parquet") and existing not in new_names: + try: + os.remove(os.path.join(dst_hour_dir, existing)) + except OSError as e: + logger.warning("[rollups] could not unlink stale %s: %s", existing, e) + published += 1 + + return published + + +def _build_copy_query(table_ident: str, field: str, where_sql: str) -> str: + """Return the COPY ... TO PARTITION_BY (field, hour) SQL for one field. + + Inputs must already be validated — this function does NO escaping. + Callers (recompute_touched_hours / backfill_rollups) gate via + ``_is_safe_ident`` and ``_safe_table_for``. + """ + return f""" + SELECT field, hour, value, count FROM ( + SELECT + '{field}' AS field, + strftime(timestamp, '%Y-%m-%d-%H') AS hour, + CAST("{field}" AS VARCHAR) AS value, + COUNT(*) AS count, + ROW_NUMBER() OVER ( + PARTITION BY strftime(timestamp, '%Y-%m-%d-%H') + ORDER BY COUNT(*) DESC + ) AS rn + FROM {table_ident} + WHERE {where_sql} + GROUP BY 1, 2, 3 + ) WHERE rn <= {TOP_K} + """ + + +def recompute_touched_hours(service_id: str, source: dict, hours: set[str]) -> None: + """Recompute rollups for all dashboard fields across the given hours. + + Excludes the active (current UTC) hour — the dashboard serves the + in-progress hour live off the base table. One COPY query per field + handles all touched hours via PARTITION_BY, so the work is O(fields) + not O(fields × hours). 
+ """ + if not hours: + return + + active_hour = datetime.now(UTC).strftime("%Y-%m-%d-%H") + parsed: list[tuple[str, datetime]] = [] + for h in hours: + if h == active_hour: + continue + try: + parsed.append((h, datetime.strptime(h, "%Y-%m-%d-%H").replace(tzinfo=UTC))) + except ValueError: + logger.warning("[rollups] skipping malformed hour token: %r", h) + if not parsed: + return + + table_ident = _safe_table_for(source) + if not table_ident: + return + + min_start = min(dt for _, dt in parsed) + max_end = max(dt for _, dt in parsed) + timedelta(hours=1) + hour_list_sql = ", ".join(f"'{h}'" for h, _ in parsed) + where_sql = ( + f"timestamp >= '{min_start.isoformat()}' " + f"AND timestamp < '{max_end.isoformat()}' " + f"AND strftime(timestamp, '%Y-%m-%d-%H') IN ({hour_list_sql})" + ) + _run_per_field_copy(service_id, source, table_ident, where_sql, _get_fields(source)) + + +def backfill_rollups(service_id: str, source: dict, fields: list[str] | None = None) -> None: + """One-shot bulk build for all historical hours up to (but not including) + the current hour. + + ``fields``: if provided, only backfills the given subset (used when a + new custom field is added — see :func:`ensure_field_backfills`). + Defaults to all eligible fields. + """ + table_ident = _safe_table_for(source) + if not table_ident: + return + + target_fields = fields if fields is not None else _get_fields(source) + if not target_fields: + return + + dt_end = datetime.now(UTC).replace(minute=0, second=0, microsecond=0) + where_sql = f"timestamp < '{dt_end.isoformat()}'" + _run_per_field_copy(service_id, source, table_ident, where_sql, target_fields) + + # Stamp completion in the markers file so _ensure_rollups can detect + # which fields still need a backfill on next startup / cfg change. 
+ markers = _load_markers(source) + stamp = datetime.now(UTC).isoformat() + for f in target_fields: + markers[f] = stamp + _save_markers(source, markers) + + +def ensure_field_backfills(service_id: str, source: dict) -> None: + """Backfill any eligible fields that don't yet have a marker entry. + + Triggered at startup (full backfill if no markers) and by callers that + mutate the log_fields config (new field added). Idempotent — fields + already in the markers file are skipped. + """ + markers = _load_markers(source) + eligible = _get_fields(source) + missing = [f for f in eligible if f not in markers] + if not missing: + return + logger.info( + "[rollups] service %s: backfilling %d new field(s): %s", + service_id, + len(missing), + missing, + ) + backfill_rollups(service_id, source, fields=missing) + + +def cleanup_old_rollups(service_id: str, source: dict, max_age_days: int) -> int: + """Delete per-hour rollup directories older than ``max_age_days``. + + ``max_age_days <= 0`` disables cleanup (keep everything). Returns the + number of hour-dirs deleted. Safe to call concurrently with the + writers because we only ever delete hours STRICTLY older than the + cutoff — current and just-written hours are never candidates. + """ + if max_age_days <= 0: + return 0 + rollup_root = _rollups_root(source) + if not os.path.isdir(rollup_root): + return 0 + + cutoff = (datetime.now(UTC) - timedelta(days=max_age_days)).strftime("%Y-%m-%d-%H") + deleted = 0 + try: + for field_entry in os.listdir(rollup_root): + if not field_entry.startswith("field="): + continue + field_dir = os.path.join(rollup_root, field_entry) + for hour_entry in os.listdir(field_dir): + if not hour_entry.startswith("hour="): + continue + hour = hour_entry[len("hour=") :] + # String compare works because the format is fixed-width + # YYYY-MM-DD-HH which sorts lexicographically by time. 
+ if hour < cutoff: + hour_dir = os.path.join(field_dir, hour_entry) + try: + shutil.rmtree(hour_dir) + deleted += 1 + except OSError as e: + logger.warning("[rollups] could not delete %s: %s", hour_dir, e) + except OSError as e: + logger.warning("[rollups] cleanup walk failed for %s: %s", service_id, e) + return deleted + + +def _run_per_field_copy( + service_id: str, + source: dict, + table_ident: str, + where_sql: str, + fields: list[str], +) -> None: + """Shared core of recompute_touched_hours and backfill_rollups. + + One COPY query per field, writing to a per-field temp directory via + PARTITION_BY (field, hour), then publishing each hour-dir under the + per-service iceberg lock. + """ + import duckdb + + from backend.core.duckdb import _cache_dir, get_connection + from backend.core.iceberg import _get_service_lock + + cache_root = _cache_dir(source) + rollups_dir = _rollups_root(source) + os.makedirs(rollups_dir, exist_ok=True) + lock_key = source.get("name", "default") + + con = get_connection(source=source, read_only=True) + try: + try: + cols = {c[0] for c in con.execute(f"DESCRIBE {table_ident}").fetchall()} + except duckdb.Error as e: + logger.warning("[rollups] %s: could not describe %s: %s", service_id, table_ident, e) + return + + for field in fields: + if not _is_safe_ident(field): + # Belt-and-suspenders — _get_fields already filters, but + # defend against direct callers passing raw names. 
+ logger.warning("[rollups] skipping unsafe field name: %r", field) + continue + if field not in cols: + continue + + tmp_field_dir = os.path.join(cache_root, "rollups", "tmp", field) + shutil.rmtree(tmp_field_dir, ignore_errors=True) + os.makedirs(tmp_field_dir, exist_ok=True) + + inner = _build_copy_query(table_ident, field, where_sql) + query = ( + f"COPY ({inner}) TO '{tmp_field_dir}' " + "(FORMAT PARQUET, PARTITION_BY (field, hour), OVERWRITE_OR_IGNORE, COMPRESSION ZSTD)" + ) + try: + con.execute(query) + except duckdb.Error as e: + logger.warning("[rollups] %s: COPY failed for field=%s: %s", service_id, field, e) + shutil.rmtree(tmp_field_dir, ignore_errors=True) + continue + + with _get_service_lock(lock_key): + _publish_field_partitions(tmp_field_dir, rollups_dir, field) + shutil.rmtree(tmp_field_dir, ignore_errors=True) + finally: + con.close() diff --git a/backend/core/share_db.py b/backend/core/share_db.py index 4d2569e2..74b9ead4 100644 --- a/backend/core/share_db.py +++ b/backend/core/share_db.py @@ -39,6 +39,8 @@ from datetime import UTC, datetime, timedelta from typing import Any +from backend.utils.date_utils import iso_z, iso_z_now + logger = logging.getLogger(__name__) # ── Locations ──────────────────────────────────────────────────────────────── @@ -90,11 +92,50 @@ def get_safe_share_db_connection(path: str) -> sqlite3.Connection: con.execute("SELECT 1").fetchone() return con except sqlite3.DatabaseError as exc: + # Security: ``DatabaseError`` is the parent of + # ``OperationalError``, which fires for transient conditions like + # "database is locked" / "disk I/O error" / FD exhaustion. The + # quarantine path renames the DB out from under any other open + # connections AND wipes all share state — running it on a transient + # error means a single lock-timeout under load can permanently + # delete every invite, session, and audit row in the share DB. 
+ # + # Restrict the quarantine to actual file-corruption signatures from + # SQLite: "file is not a database" / "database disk image is malformed" + # / "unsupported file format". Anything else (lock timeout, I/O error, + # full disk, missing parent dir) is re-raised so the caller sees the + # real error instead of silently nuking the DB. + msg = str(exc).lower() + is_corruption = ( + "malformed" in msg + or "not a database" in msg + or "unsupported file format" in msg + or "image is malformed" in msg + ) + if not is_corruption: + # ERROR (not WARNING) so this near-miss is alertable from the + # existing log-error monitoring without needing a new metric + # plumbing — quarantine-skipped events should be rare; if we + # start seeing them at volume it's a signal that the + # is_corruption substrings need updating. + logger.error( + "[share_db] DatabaseError on open of %s NOT classified as corruption (err_type=%s); re-raising: %s", + path, + type(exc).__name__, + exc, + ) + raise + epoch = int(time.time()) corrupt_path = f"{path}.corrupt-{epoch}" try: os.replace(path, corrupt_path) - logger.error("[share_db] corrupt DB at %s quarantined to %s (%s)", path, corrupt_path, exc) + logger.error( + "[share_db] corrupt DB at %s quarantined to %s (reason=corruption, %s)", + path, + corrupt_path, + exc, + ) except OSError: logger.exception("[share_db] failed to quarantine corrupt DB at %s", path) raise @@ -344,16 +385,7 @@ def apply_pending(con: sqlite3.Connection) -> int: # ── Time helpers ───────────────────────────────────────────────────────────── - - -def iso_z_now() -> str: - return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") - - -def iso_z(dt: datetime) -> str: - if dt.tzinfo is None: - dt = dt.replace(tzinfo=UTC) - return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") +# Handled via backend.utils.date_utils imports above to avoid duplication. 
# ── Passcode hashing (constant-time scrypt) ───────────────────────────────── @@ -773,7 +805,15 @@ def get_remote_invites(*, con: sqlite3.Connection | None = None) -> list[dict]: def get_remote_invite_by_email_passcode( email: str, passcode: str, *, con: sqlite3.Connection | None = None ) -> dict | None: - """Constant-time lookup. Returns the invite dict on success, else None.""" + """Constant-time lookup. Returns the invite dict on success, else None. + + Security: when no invite exists for ``email`` (e.g., email + enumeration attack), still run one scrypt verification against a dummy + hash with the same parameters so the response time matches the + invite-exists branch (~30 ms). Without this, an attacker measuring the + response latency can distinguish "email is registered, passcode wrong" + (slow) from "email never invited" (fast) and enumerate emails. + """ con = con or get_global_share_con() norm_email = (email or "").strip().lower() rows = con.execute( @@ -790,12 +830,40 @@ def get_remote_invite_by_email_passcode( if match is None: match = dict(row) if match is None: + # Equalize timing ONLY when the email has no invite at all. If + # rows existed (email present, passcode wrong) we already paid one + # scrypt per row inside the loop — running the dummy verification + # again would push the wrong-passcode branch to ``(N+1)×scrypt`` + # while the no-email branch stays at ``1×scrypt``, recreating + # the 2× timing side-channel this function is meant to close. + if not rows: + _equalize_passcode_timing(passcode) return None match["pii_policy"] = json.loads(match.get("pii_policy") or '{"mask_ips": false}') match["service_ids"] = get_remote_invite_services(match["id"], con=con) return match +_dummy_hash: str | None = None + + +def _equalize_passcode_timing(passcode: str) -> None: + """Run one scrypt verification against a fixed dummy hash so the timing + of the "no email match" branch matches the "email match, wrong passcode" + branch. 
+ + The dummy hash uses the same _SCRYPT_N/_R/_P/_DKLEN parameters as + ``hash_passcode`` so verification cost is identical. Generated once per + process and reused — generating per-call would add measurable extra cost + to the miss branch.""" + global _dummy_hash + if _dummy_hash is None: + # Synthesize via the real hash function so any future parameter + # change in ``hash_passcode`` is automatically reflected here. + _dummy_hash = hash_passcode("__dummy_for_timing_equalization__") + verify_passcode(passcode, _dummy_hash) + + def update_remote_invite_services( invite_id: str, service_ids: list[str], *, con: sqlite3.Connection | None = None ) -> None: @@ -1028,21 +1096,37 @@ def claim_token(token: str, ip: str, *, con: sqlite3.Connection | None = None) - Returns the row dict on success; ``None`` if the token does not exist, is expired, or was already claimed. + + Security (TOCTOU): use a single atomic UPDATE with the + ``claimed_at IS NULL`` predicate baked into the WHERE clause. Earlier + versions ran SELECT-then-check-then-UPDATE under the same transaction, + but two concurrent claims could both pass the SELECT before either + UPDATE landed and end up double-redeeming. Now whichever transaction's + UPDATE commits first wins (rowcount == 1); the loser sees rowcount == 0 + and returns None. + + The SELECT after UPDATE re-reads the just-claimed row so we can return + the invite_id to the caller. Doing it inside the same ``with con:`` + block keeps it in the same write transaction. """ con = con or get_global_share_con() now = iso_z_now() with con: + cur = con.execute( + """ + UPDATE remote_invite_claim_tokens + SET claimed_at = ?, claimed_from_ip = ? + WHERE token = ? + AND claimed_at IS NULL + AND expires_at >= ? 
+ """, + (now, ip, token, now), + ) + if cur.rowcount != 1: + return None row = con.execute("SELECT * FROM remote_invite_claim_tokens WHERE token=?", (token,)).fetchone() if row is None: return None - if row["claimed_at"] is not None: - return None - if row["expires_at"] < now: - return None - con.execute( - "UPDATE remote_invite_claim_tokens SET claimed_at=?, claimed_from_ip=? WHERE token=?", - (now, ip, token), - ) return dict(row) @@ -1277,11 +1361,22 @@ def apply_pii_policy(obj, policy: dict): return obj masked_keys = {"ip", "ip_address", "client_ip", "remote_addr"} - def _walk(node): + def _walk(node, parent_key=None): if isinstance(node, dict): - return {k: (mask_ip(v) if isinstance(v, str) and k in masked_keys else _walk(v)) for k, v in node.items()} + return { + k: (mask_ip(v) if isinstance(v, str) and k in masked_keys else _walk(v, parent_key=k)) + for k, v in node.items() + } if isinstance(node, list): - return [_walk(x) for x in node] + # Array fields inherit the parent dict key for masking — e.g. + # ``{"client_ip": ["1.2.3.4", "5.6.7.8"]}`` must mask each string + # the same way the scalar form would. Without threading the + # parent key through, list-of-string IP fields slipped past the + # masker entirely. 
+ return [ + (mask_ip(x) if isinstance(x, str) and parent_key in masked_keys else _walk(x, parent_key=parent_key)) + for x in node + ] return node return _walk(obj) diff --git a/backend/cron_progress.py b/backend/cron_progress.py index a2c8a86c..00176fcf 100644 --- a/backend/cron_progress.py +++ b/backend/cron_progress.py @@ -10,9 +10,121 @@ def start_progress(run_id: int, service_id: str = None, task: str = None): with _lock: if run_id not in _progress: + now = time.time() _progress[run_id] = [] - _last_update[run_id] = time.time() - _run_metadata[run_id] = {"service_id": service_id, "task": task} + _last_update[run_id] = now + _run_metadata[run_id] = { + "service_id": service_id, + "task": task, + "started_at": now, + } + + +_STALE_AFTER_SECONDS = 300 # 5 min — covers slow syncs, kills zombie entries + + +def list_active_runs() -> list[dict]: + """Return metadata for runs that are GENUINELY in flight. + + A run is considered active when ALL of these hold: + 1. It's in ``_run_metadata`` (was started_progress'd) + 2. Its last progress event is NOT terminal (done/error) + 3. Its ``_last_update`` was within the last 5 minutes + 4. The persisted ``cron_runs.status`` is still ``'running'`` + + Condition (4) is the DB-truth backstop: when an APScheduler + watchdog abandons a worker thread (interpreter shutdown, OOM + kill, executor recycle) or some other path completes ``log_cron_run`` + without firing the in-memory ``end_progress``, the in-memory dict + falsely shows the run as in-flight even though the DB knows it + succeeded. Production observed 13+ such ghosts on 2026-06-03 + after a backend restart — DB rows said ``status='success'`` with + durations of 2-6 seconds while the in-memory dict held them as + active for 100+ seconds. Cross-checking against the DB gives a + correct answer regardless of what happened to the in-memory + state. + + Condition (3) covers the residual: a run whose DB write also got + skipped (something crashed before ``log_cron_run``). 
After 5 min + of zero progress, we declare it a zombie regardless. + """ + now = time.time() + with _lock: + candidates = [] + for run_id, meta in _run_metadata.items(): + events = _progress.get(run_id) or [] + if events and events[-1].get("type") in ("done", "error"): + continue + last_update = _last_update.get(run_id, now) + if now - last_update > _STALE_AFTER_SECONDS: + continue + candidates.append((run_id, meta)) + + # DB cross-check happens OUTSIDE the lock so a slow SQLite call + # doesn't block other progress operations. The query is cheap + # (PK lookup per run_id) and runs once per snapshot poll. + out = [] + for run_id, meta in candidates: + if _db_status_is_terminal(meta.get("service_id"), run_id): + continue + entry = {"run_id": run_id} + entry.update(meta) + out.append(entry) + return out + + +def _db_status_is_terminal(service_id: str | None, run_id: int) -> bool: + """Return True if the cron_runs row for this run_id has a terminal + status ('success' or 'error') in per-service SQLite. + + Best-effort: any DB error (missing service, table not yet created, + SQLite locked) returns False so the in-memory truth still serves + the badge (we'd rather show one false-in-flight than hide a + genuinely running one). + """ + if not service_id: + return False + try: + from backend.core import metadata_db + + status = metadata_db.get_cron_run_status(service_id, run_id) + return status in ("success", "error") + except Exception: + return False + + +def reap_zombie_runs() -> int: + """Eagerly evict zombie run metadata from in-memory state. + + Mirrors list_active_runs' staleness check but actually mutates + the dicts. Called from the scheduler's per-tick cleanup so + /admin/health-snapshot doesn't drift by minutes between sync + ticks. Returns the count evicted for log telemetry. + + Why this and not just rely on cleanup_progress's 1-hour TTL: a + zombie sync that ran for 2 minutes then died leaves a stale entry + that's still <1h old. 
cleanup_progress wouldn't touch it. + list_active_runs filters the badge but the entry still bloats + _run_metadata and shows up in any other code path that walks + the dict (admin.py:210/238/1022 — patched 2026-06-02 but easy + to regress). + """ + now = time.time() + evicted = 0 + with _lock: + for run_id in list(_run_metadata.keys()): + last_update = _last_update.get(run_id, now) + if now - last_update > _STALE_AFTER_SECONDS: + events = _progress.get(run_id) or [] + # Stale + no terminal event = zombie. Append a synthetic + # error so any SSE subscriber sees the run ended. + if not events or events[-1].get("type") not in ("done", "error"): + _progress.setdefault(run_id, []).append( + {"type": "error", "message": "scheduler reaped zombie cron (no progress in 5m)"} + ) + _run_metadata.pop(run_id, None) + evicted += 1 + return evicted def add_progress(run_id: int, event: dict): @@ -57,13 +169,46 @@ def get_latest_progress_for_service(service_id: str) -> dict | None: def end_progress(run_id: int, final_event: dict | None = None): + """Mark a cron run as ended. + + AUTO-DONE: if no ``final_event`` is provided AND the run's last + event isn't already a terminal type ("done"/"error"), automatically + append a ``{"type": "done"}`` event so ``list_active_runs`` can + filter the run out. Without this, callers that emit only "status" + events during their lifetime (the sync path's view-refresh message + is the canonical example) leave the run "active" until the 1-hour + TTL — accumulating dozens of stale entries on the System Health card. + + Explicit callers that want a richer terminal event can still pass + ``final_event={"type": "done", "rows": N}`` and the same append path + runs. The auto-emit only kicks in when the caller forgot. 
+ """ with _lock: if run_id in _progress: + events = _progress[run_id] + last_type = events[-1].get("type") if events else None if final_event: _progress[run_id].append(final_event) + elif last_type not in ("done", "error"): + _progress[run_id].append({"type": "done"}) _last_update[run_id] = time.time() +def cleanup_progress_and_reap(): + """Convenience helper that runs cleanup_progress + reap_zombie_runs. + + The two are always called as a pair from every cron entrypoint + (7 scheduler functions today). Wrapping them prevents the + common bug where a new cron runner remembers cleanup but forgets + the reap — leaving zombie entries in the System Health card. + + Returns the reap count for log telemetry; cleanup_progress's + return value is None. + """ + cleanup_progress() + return reap_zombie_runs() + + def cleanup_progress(): now = time.time() with _lock: diff --git a/backend/deps.py b/backend/deps.py index 40ce522e..8e88b729 100644 --- a/backend/deps.py +++ b/backend/deps.py @@ -69,8 +69,16 @@ def get_source(service_id: str | None = Depends(get_service_id)) -> dict: class _ConnectionHolder: """Holds a single DuckDB connection for the lifetime of one request. - Used as a context-manager-style dependency so FastAPI closes the - connection when the request finishes. + Read-only requests check out a pooled, pre-warmed connection via + ``duckdb_pool.checkout_connection`` (saves ~50ms per request of + pragma / S3 / iceberg-view setup). Write-mode connections still take + the always-fresh ``get_connection`` path because ingest holds the + write lock and pooling would defeat its lifecycle. + + Used as a context-manager-style dependency so FastAPI returns the + connection to the pool (or closes the fresh one) when the request + finishes. On any exception the connection is discarded rather than + pooled so a poisoned connection doesn't get reused. 
""" def __init__(self, source: dict, skip_view_update: bool = False, read_only: bool = True): @@ -78,37 +86,91 @@ def __init__(self, source: dict, skip_view_update: bool = False, read_only: bool self._skip_view_update = skip_view_update self._read_only = read_only self.con: duckdb.DuckDBPyConnection | None = None + # Set when we exit cleanly so __exit__ knows to return-vs-discard. + self._errored = False + # Used only on the pooled path so __exit__ can release. + self._pool_cm = None def __enter__(self) -> duckdb.DuckDBPyConnection: + # Write mode + skip_view_update fall back to the fresh-connection + # path: the pool exists for the dominant read-only HTTP request + # workload, not for ingest's exclusive writer or for callers that + # explicitly opt out of view binding. The pool itself can also be + # disabled globally via DUCKDB_CONNECTION_POOL=0 (tests + emergency + # rollback); when disabled we go straight through ``get_connection`` + # so behaviour matches the pre-pool design exactly. + from backend.core import duckdb_pool + + use_pool = ( + self._read_only + and not self._skip_view_update + and duckdb_pool._pool_enabled() + ) try: - self.con = get_connection( - source=self._source, - max_wait=10, # Increased wait slightly for safety - skip_view_update=self._skip_view_update, - read_only=self._read_only, - ) + if use_pool: + self._pool_cm = duckdb_pool.checkout_connection(self._source, max_wait=10.0) + self.con = self._pool_cm.__enter__() + else: + self.con = get_connection( + source=self._source, + max_wait=10, + skip_view_update=self._skip_view_update, + read_only=self._read_only, + ) except DBBusyError as e: raise HTTPException( status_code=503, # 503 Service Unavailable so frontend fetch throws and React Query keeps cached data detail={"error": str(e), "busy": True}, ) + except Exception as e: + # Pool exhaustion (after wait timeout) surfaces as _PoolBusy. 
+ # Translate to 503 so the frontend handles it the same as + # DBBusyError instead of throwing an opaque 500. + from backend.core.duckdb_pool import _PoolBusy + + if isinstance(e, _PoolBusy): + raise HTTPException( + status_code=503, + detail={"error": str(e), "busy": True}, + ) + raise return self.con - def __exit__(self, *_): + def __exit__(self, exc_type, exc_val, exc_tb): + self._errored = exc_type is not None + if self._pool_cm is not None: + # Forward the exception to the pool context manager so it can + # mark the connection errored and discard. + try: + self._pool_cm.__exit__(exc_type, exc_val, exc_tb) + except Exception: + pass + self._pool_cm = None + self.con = None + return False if self.con: try: self.con.close() except Exception: pass self.con = None + return False -def get_con(source: dict = Depends(get_source), read_only: bool = True) -> duckdb.DuckDBPyConnection: +def get_con(source: dict = Depends(get_source)) -> duckdb.DuckDBPyConnection: """Dependency that yields a DuckDB connection and closes it after the request. - Defaults to read_only=True for dashboard queries to prevent blocking on crons. + Always opens in read-only mode for HTTP request handlers — write-mode + connections are used only by the scheduler/cron pipeline, never by + user-facing routes. + + Security: do NOT take ``read_only`` as a parameter. FastAPI converts + primitive-typed dependency parameters into query parameters, so any + request to a route using this dep could send ``?read_only=false`` and + force an exclusive write-lock acquisition that blocks readers and the + sync cron (503 DoS). The flag is hardcoded inside the holder instead. 
""" - holder = _ConnectionHolder(source, read_only=read_only) + holder = _ConnectionHolder(source, read_only=True) with holder as con: yield con @@ -136,12 +198,51 @@ def __init__( self.con = con -def get_meta_con(source: dict = Depends(get_source), read_only: bool = True) -> duckdb.DuckDBPyConnection: +# ── Tenant-scope enforcement (security) ───────────── + + +def require_service_access( + request, + service_id: str | None = Depends(get_service_id), +) -> str | None: + """Reject the request with 403 if the caller (analyst session) does not + have access to the requested ``service_id``. + + Local admin requests (analyst_session is None) bypass this check entirely + — admins have access to every configured service. Analysts must have the + target ``service_id`` in their invite's ``service_ids`` list. + + Use as a dependency on any route that returns or mutates per-service + data. Routes that take no ``service_id`` parameter and that expose a + list of services across the whole tenant must filter the list manually + using ``request.state.analyst_session.service_ids`` — this helper only + enforces the single-service case. + """ + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is None: + return service_id # admin / local — unrestricted + allowed = set(analyst_session.service_ids or []) + if service_id is None: + # Analyst calls with no explicit service must default to one of their + # scoped services. Return the first one (or None if invite is empty). + return next(iter(allowed), None) + if service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": service_id}, + ) + return service_id + + +def get_meta_con(source: dict = Depends(get_source)) -> duckdb.DuckDBPyConnection: """Dependency that yields a DuckDB connection, skipping the Iceberg view update. Use this for metadata routes (e.g. 
cron logs, admin settings) that don't need to query the main logs table, to avoid blocking on S3 manifest reads. + + Security: ``read_only`` is hardcoded True for the same reason as + ``get_con`` above. """ - holder = _ConnectionHolder(source, skip_view_update=True, read_only=read_only) + holder = _ConnectionHolder(source, skip_view_update=True, read_only=True) with holder as con: yield con diff --git a/backend/main.py b/backend/main.py index bc4691b2..239327e3 100644 --- a/backend/main.py +++ b/backend/main.py @@ -43,6 +43,8 @@ logging.getLogger("pyiceberg.io").setLevel(logging.WARNING) logging.getLogger("apscheduler").setLevel(logging.WARNING) +logger = logging.getLogger("backend.main") + from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.gzip import GZipMiddleware @@ -98,6 +100,17 @@ def _initialize_service(cfg: dict): if src: _db.refresh_config_status(sid) _ensure_persistent_view(sid, src) + # Data migrations: queues any pending one-time setup work + # (e.g. the initial rollups backfill) onto a daemon thread + # per service. Returns immediately so startup isn't gated + # on a potentially multi-minute backfill. See + # backend/core/data_migrations.py for the framework. + try: + from backend.core import data_migrations + + data_migrations.run_pending(sid, src) + except Exception as e: + logging.warning("[fastapi] Service %s: could not queue data migrations: %s", sid, e) logging.info("[fastapi] Service %s initialised.", sid) except Exception as e: logging.warning("[fastapi] Could not initialise service %s: %s", sid, e) @@ -161,6 +174,50 @@ def _ensure_pop_cache(): logging.warning("[fastapi] Could not prefetch POP locations: %s", e) +def _ensure_scoring_matrix(): + """Pull the trained scoring matrix from FOS at startup for any + service that has scoring enabled. 
+ + Without this, the /scoring/evaluation endpoint falls back to the + bundled matrix.default.json (empty transitions → AUC ≈ 0.5) until + an operator manually drops compute/scorer/matrix.json into the + container. The fetch is best-effort: missing FOS object, no scoring + enabled, S3 timeout — all silently no-op so a slow FOS doesn't + block startup. + """ + try: + from backend.provision.session_scoring_orchestrator import _MATRIX_PATH + from backend.state_sync import fetch_matrix_from_fos + + for cfg in svcconfig.list_configs(): + if not (cfg.get("scoring") or {}).get("enabled"): + continue + sid = cfg.get("service_id") or cfg.get("name") + try: + matrix = fetch_matrix_from_fos(sid) + if not matrix: + continue + _MATRIX_PATH.parent.mkdir(parents=True, exist_ok=True) + with _MATRIX_PATH.open("w") as f: + import json as _json + + _json.dump(matrix, f) + logging.info( + "[fastapi] Pulled scoring matrix from FOS for %s (version=%s)", + sid, + matrix.get("version", "?"), + ) + # First-write-wins: with multiple scoring-enabled services, + # the matrix file is global. They SHOULD all be the same + # matrix (one trainer, one deploy), but if they differ + # we use whichever loaded first and log a warning above. + break + except Exception as e: + logging.warning("[fastapi] Could not pull scoring matrix for %s: %s", sid, e) + except Exception as e: + logging.warning("[fastapi] _ensure_scoring_matrix failed: %s", e) + + def _background_startup(): """Run initialisation tasks that should not block the web server startup.""" # Tag everything done here so the s3fs/boto3 hooks attribute their @@ -181,6 +238,7 @@ def _background_startup(): logging.warning("[fastapi] reload_default_source failed: %s", e) _ensure_pop_cache() + _ensure_scoring_matrix() try: from backend.scheduler import get_scheduler @@ -223,6 +281,56 @@ def _enforce_data_dir_mounted() -> None: raise RuntimeError(msg) +def _enforce_proxy_headers_configured() -> None: + """Security regression guard for. 
+ + The remote-access middleware reads ``request.client.host`` and trusts it as + the client's real IP. That only works if uvicorn is launched with + ``--proxy-headers --forwarded-allow-ips=`` — + without those flags the framework returns the loopback peer address for + every Caddy-proxied request and every IP-based gate (rate-limiting, admin + detection, whitelist) becomes ineffective. + + Production sets ``TRUSTED_PROXY_IPS=127.0.0.1`` in docker-compose.prod.yml + alongside the uvicorn flags. If that env var is missing or empty at boot, + refuse to start (or, for local dev where the var is unset, emit a loud + WARNING) so a future config refactor cannot silently re-introduce the + pre-patch vulnerability. + + Set ``REQUIRE_PROXY_HEADERS=1`` in production to make this a hard FATAL. + Local dev / tests leave both env vars unset and the function is a no-op. + + Defense in depth: even when our own ``TRUSTED_PROXY_IPS`` env is set, we + also probe uvicorn's own ``UVICORN_FORWARDED_ALLOW_IPS`` env var (the + env-equivalent of the ``--forwarded-allow-ips`` CLI flag). If a future + refactor passes the CLI flag without exporting our companion env var, + uvicorn's variable lets us detect it. + """ + trusted = (os.environ.get("TRUSTED_PROXY_IPS") or "").strip() + uvicorn_trusted = (os.environ.get("UVICORN_FORWARDED_ALLOW_IPS") or "").strip() + require_strict = os.environ.get("REQUIRE_PROXY_HEADERS") == "1" or os.environ.get("STRICT_DATA_DIR_CHECK") == "1" + effective = trusted or uvicorn_trusted + if effective: + logging.info( + "[fastapi] proxy-headers trust set: TRUSTED_PROXY_IPS=%s UVICORN_FORWARDED_ALLOW_IPS=%s", + trusted or "(unset)", + uvicorn_trusted or "(unset)", + ) + return + msg = ( + "TRUSTED_PROXY_IPS is unset. uvicorn must be launched with " + "`--proxy-headers --forwarded-allow-ips=127.0.0.1` AND have " + "TRUSTED_PROXY_IPS=127.0.0.1 in its environment so the remote-access " + "middleware can read request.client.host as the real client IP. 
" + "Without this, leftmost-XFF spoofing becomes exploitable " + "and the admin Host-spoof bypass returns. See docker-compose.prod.yml." + ) + if require_strict: + logging.critical("FATAL: %s", msg) + raise RuntimeError(msg) + logging.warning("[fastapi] %s", msg) + + @asynccontextmanager async def lifespan(app: FastAPI): """Startup / shutdown lifecycle.""" @@ -232,6 +340,12 @@ async def lifespan(app: FastAPI): # ingestion logic that would otherwise blindly write to the wrong path. _enforce_data_dir_mounted() + # Proxy-headers regression guard (security). Production + # must have TRUSTED_PROXY_IPS set in env (mirrors the uvicorn + # --forwarded-allow-ips flag). Without it, IP-based gates become + # ineffective and the Host-spoof admin bypass returns. + _enforce_proxy_headers_configured() + # Verify dependencies try: import pyarrow # noqa: F401 @@ -289,7 +403,7 @@ async def lifespan(app: FastAPI): app = FastAPI( title="Fastly Log Analytics API", - version="1.0.0", + version="1.1.0", description=( "FastAPI backend for the Fastly Log Analytics tool. " "Serves the Next.js frontend and exposes an OpenAPI spec at /openapi.json." 
@@ -380,13 +494,24 @@ async def telemetry_middleware(request: Request, call_next): app.include_router(alerts.router) app.include_router(origin.router) -from backend.routers import admin, bootstrap, debug, provision, services, share_admin, share_auth, usage +from backend.routers import ( + admin, + bootstrap, + debug, + provision, + services, + session_scoring, + share_admin, + share_auth, + usage, +) app.include_router(bootstrap.router) app.include_router(services.router) app.include_router(usage.router) app.include_router(admin.router) app.include_router(provision.router) +app.include_router(session_scoring.router) app.include_router(debug.router) app.include_router(share_auth.router) app.include_router(share_admin.router) diff --git a/backend/models/common.py b/backend/models/common.py index 16ea6937..09c6b6c6 100644 --- a/backend/models/common.py +++ b/backend/models/common.py @@ -120,7 +120,9 @@ class DebugCall(BaseModel): caller: str | None = None -from pydantic import Field +import os as _os + +from pydantic import Field, model_serializer class HasDataMixin(BaseModel): @@ -130,6 +132,25 @@ class HasDataMixin(BaseModel): total: int = 0 +# 038: telemetry payloads (raw SQL + outbound API URL/timing) are useful +# during development and incident response but they're an information-leak +# surface in normal operation — every analyst dashboard fetch echoes the +# server's internal SQL and the FOS object keys it touched. Gate inclusion +# on a process-level ``DEBUG_RESPONSES`` env var so production +# deployments default to "telemetry excluded from API responses" and an +# operator who needs the debug panel during triage can flip the flag and +# restart the process. The frontend DebugPanel reads ``_debug_queries`` / +# ``_debug_calls`` via optional-chain access so a missing field renders +# as an empty panel rather than throwing. 
+# +# Implementation uses ``model_serializer`` (not ``Field(exclude=...)``) +# so the OpenAPI schema continues to describe the fields — keeps the +# committed snapshot stable regardless of which mode the deployment +# is running in, and avoids per-deployment frontend type drift. +def _debug_responses_enabled() -> bool: + return _os.getenv("DEBUG_RESPONSES", "").lower() in ("1", "true", "yes") + + class BaseResponse(BaseModel): """Base response that automatically includes telemetry if present.""" @@ -137,6 +158,14 @@ class BaseResponse(BaseModel): debug_calls: list[DebugCall] = Field(default_factory=list, serialization_alias="_debug_calls") is_cached: bool = Field(default=False, serialization_alias="_is_cached") + @model_serializer(mode="wrap") + def _strip_debug_when_disabled(self, handler): + data = handler(self) + if not _debug_responses_enabled(): + data.pop("_debug_queries", None) + data.pop("_debug_calls", None) + return data + @classmethod def with_telemetry(cls, **data): """Helper to create a response with context-local telemetry.""" diff --git a/backend/models/custom_fields.py b/backend/models/custom_fields.py index 2326e9ab..f2855337 100644 --- a/backend/models/custom_fields.py +++ b/backend/models/custom_fields.py @@ -15,7 +15,7 @@ class CustomField(BaseModel): label: str description: str = "" vcl_log_expression: str - collection_stage: Literal["edge", "origin"] = "edge" + collection_stage: Literal["edge", "origin", "deliver"] = "edge" origin_log_frequency: Literal["all", "miss_pass"] = "all" duckdb_type: Literal["VARCHAR", "INTEGER", "BIGINT", "DOUBLE", "BOOLEAN"] = "VARCHAR" value_type: Literal["string", "numeric", "boolean", "ip", "url"] = "string" @@ -51,7 +51,7 @@ class CustomFieldUpdate(BaseModel): label: str | None = None description: str | None = None vcl_log_expression: str | None = None - collection_stage: Literal["edge", "origin"] | None = None + collection_stage: Literal["edge", "origin", "deliver"] | None = None origin_log_frequency: 
Literal["all", "miss_pass"] | None = None duckdb_type: Literal["VARCHAR", "INTEGER", "BIGINT", "DOUBLE", "BOOLEAN"] | None = None value_type: Literal["string", "numeric", "boolean", "ip", "url"] | None = None @@ -74,7 +74,7 @@ class CustomFieldsListResponse(BaseResponse): class VclLintRequest(BaseModel): vcl_log_expression: str - collection_stage: Literal["edge", "origin"] = "edge" + collection_stage: Literal["edge", "origin", "deliver"] = "edge" log_fields_config: dict | None = None diff --git a/backend/models/lake.py b/backend/models/lake.py index 8b2c53c6..2375ec54 100644 --- a/backend/models/lake.py +++ b/backend/models/lake.py @@ -3,6 +3,41 @@ from __future__ import annotations import json +import urllib.parse + +# Hostname suffixes allowed for ``cdn_url`` when the SSRF check below +# decides whether to issue an outbound HTTP request. Any other hostname +# (including bare IPs, ``localhost``, link-local addresses, or +# attacker-supplied internal hostnames) is rejected — the field is +# user-controlled at provision time and an attacker who can inject +# ``http://169.254.169.254`` would otherwise turn fetch_lake_info into +# an SSRF probe of the GCE metadata service. +_CDN_URL_ALLOWED_HOST_SUFFIXES = ( + ".fastly.net", + ".fastlystorage.app", +) + + +def _safe_cdn_url(cdn_url: str) -> str | None: + """Return ``cdn_url`` only if it's an https:// URL on an allowlisted + Fastly hostname, else None. Caller treats None as "skip the CDN + fast path and fall through to the SDK". 
+ """ + if not cdn_url: + return None + try: + parsed = urllib.parse.urlsplit(cdn_url) + except ValueError: + return None + if parsed.scheme != "https": + return None + hostname = (parsed.hostname or "").lower() + if not hostname: + return None + for suffix in _CDN_URL_ALLOWED_HOST_SUFFIXES: + if hostname.endswith(suffix): + return cdn_url + return None def fetch_lake_info(source: dict, use_temp_cache: bool = False) -> dict: @@ -29,9 +64,8 @@ def fetch_lake_info(source: dict, use_temp_cache: bool = False) -> dict: namespace, table_name = db_iceberg._table_identifier(source) summary_key = f"{iceberg_root}/{namespace}/{table_name}/table_summary.json" - cdn_url = (source.get("cdn_url") or "").rstrip("/") + cdn_url = _safe_cdn_url((source.get("cdn_url") or "").rstrip("/")) if cdn_url: - import urllib.parse import urllib.request from backend.utils.telemetry import record_cdn_call diff --git a/backend/provision/cli.py b/backend/provision/cli.py index 38f66243..3c6eb13f 100644 --- a/backend/provision/cli.py +++ b/backend/provision/cli.py @@ -266,6 +266,27 @@ def handle_update_logs(args): else (cfg.get("log_fields") or _build_log_fields_config(args)) ) + # MERGE GUARD (sibling of state_sync.import_admin_state fix from + # 2026-06-02 incident): _build_log_fields_config(args) returns + # {schema_version, preset, groups, field_overrides} — it has NO + # custom_fields key. Assigning the result wholesale to + # cfg["log_fields"] would strip the 6 scoring custom_fields the + # orchestrator injected, the user's own custom_fields, and any + # format_hash/updated_at metadata. Preserve custom_fields from the + # on-disk cfg, then if scoring is enabled re-inject the canonical + # _SCORING_CUSTOM_FIELDS from code as the source of truth. 
+ existing_lf = cfg.get("log_fields") or {} + existing_custom = list(existing_lf.get("custom_fields") or []) + if cfg.get("scoring", {}).get("enabled"): + from backend.provision.session_scoring_orchestrator import ( + _SCORING_CUSTOM_FIELDS, + _SCORING_FIELD_NAMES, + ) + + existing_custom = [cf for cf in existing_custom if cf.get("name") not in _SCORING_FIELD_NAMES] + existing_custom.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS) + new_lf_config["custom_fields"] = existing_custom + if getattr(args, "dry_run", False): print(lf.generate_log_format(new_lf_config)) return diff --git a/backend/provision/fastly_api.py b/backend/provision/fastly_api.py index 07091b96..2621dcc4 100644 --- a/backend/provision/fastly_api.py +++ b/backend/provision/fastly_api.py @@ -82,6 +82,68 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: custom_edge = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "edge"] custom_origin = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "origin"] + # "deliver" stage: capture from response headers in vcl_deliver and + # promote into req.http.x-fos-edge-data:* so the same log-format + # consumer that handles edge-stage fields picks them up. Used by the + # session-scoring integration to capture X-Edge-Score* response headers + # from the scorer Compute backend. + custom_deliver = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "deliver"] + + # Security: scrub internal-routing headers a client could spoof. + # The cluster-fetch / edge-data headers are set by THIS service's own + # snippets on the origin-bound bereq (vcl_miss / vcl_pass) and must + # never appear on an inbound req. 
Without this scrub, a client header + # like ``x-is-cluster-fetch: 1`` makes the conditional in vcl_deliver + # incorrectly classify the response as internal-cluster traffic and + # SKIP the "strip internal headers" cleanup — leaking origin-side + # metric headers (x-of-oip = origin backend IP, x-of-ttfb, etc.) to + # the client. Run BEFORE the edge-capture conditional so even + # configurations without any group-L / custom fields get the scrub. + # 020: Build scrub as a list so we can append per-custom-field + # unsets. ``unset req.http.x-fos-edge-data;`` strips the bare + # header but does NOT strip arbitrary subfield variants + # (``req.http.x-fos-edge-data:my_field``) on Fastly VCL — those + # are independent header slots once the colon-subfield syntax is + # in play. A client that knows a custom-field name (and they often + # leak through CSP, error pages, or just by being mentioned in + # public docs) can pre-set ``x-fos-edge-data:<name>`` and have + # the log line read the spoofed value instead of the edge-captured + # one. Per-name scrubs close the gap. + scrub_lines = [ + "# [security] strip client-supplied internal-routing headers", + "if (req.restarts == 0 && fastly.ff.visits_this_service == 0) {", + " unset req.http.x-is-cluster-fetch;", + " unset req.http.x-fos-edge-data;", + " unset req.http.x-fos-origin-data;", + " unset req.http.x-of-start;", + " unset req.http.x-of-ttfb;", + " unset req.http.x-of-ttlb;", + " unset req.http.x-of-ost;", + " unset req.http.x-of-oip;", + " unset req.http.x-of-oretries;", + " unset req.http.x-of-status;", + " unset req.http.x-edge-req-id;", + " # Session-scoring internal markers. X-Edge-Scoring-Pass=1 from a", + " # client would bypass scoring entirely; x-edge-score* / X-Edge-Sid", + " # from a client could forge a clean score / sid that the deliver", + " # subfields propagate into the log line. 
Scrub them all at the", + " # client edge regardless of whether scoring is currently enabled.", + " unset req.http.X-Edge-Scoring-Pass;", + " unset req.http.x-edge-score;", + " unset req.http.X-Edge-Score;", + " unset req.http.X-Edge-Score-Reason;", + " unset req.http.X-Edge-Score-Enforce;", + " unset req.http.X-Edge-Sid;", + " unset req.http.X-Edge-Score-Set-Cookie;", + ] + if enabled_custom: + scrub_lines.append(" # --- Per-custom-field subfield scrubs (020) ---") + for cf in enabled_custom: + name = cf["name"] + scrub_lines.append(f" unset req.http.x-fos-edge-data:{name};") + scrub_lines.append(f" unset req.http.x-fos-origin-data:{name};") + scrub_lines.append("}") + edge_header_scrub = "\n".join(scrub_lines) # recv: edge capture + optional group-L request ID + custom edge fields if required or custom_edge: @@ -108,9 +170,9 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: recv_lines.append(f" set req.http.x-fos-edge-data:{cf['name']} = {cf['vcl_log_expression']};") recv_lines.append("}") - recv_vcl = "\n".join(recv_lines) + recv_vcl = edge_header_scrub + "\n" + "\n".join(recv_lines) else: - recv_vcl = "# No edge data capture required for current log configuration." + recv_vcl = edge_header_scrub + "\n# No edge data capture required for current log configuration." if group_l: recv_vcl += ( @@ -123,10 +185,22 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: # miss and pass: unset edge headers + optional group-L timing base_unset = "if (req.backend.is_origin) {\n unset bereq.http.x-fos-edge-data;\n}" + # Session-scoring services route the first-pass request to the scorer + # Compute backend via `return(pass)` in vcl_recv. That triggers the + # PASS subroutine for the scorer fetch, which would otherwise capture + # x-of-start AT THE SCORER FETCH TIME — polluting the eventual TTFB/ + # TTLB numbers with scorer-leg latency. 
The X-Edge-Scoring-Pass=="1" + # marker (set by session_scoring_vcl.recv_snippet just before the + # `return(pass)`) is our discriminator. Non-scoring services never set + # this header, so the guard is always true and timing fires normally. + _scoring_guard_open = 'if (req.http.X-Edge-Scoring-Pass != "1") {\n' + _scoring_guard_close = "}\n" + if group_l: miss_vcl = base_unset + ( "\n# [group-L] Record timing start for origin fetch\n" - "set req.http.x-of-start = time.elapsed.usec;\n" + + _scoring_guard_open + + "set req.http.x-of-start = time.elapsed.usec;\n" "unset bereq.http.x-of-start;\n" 'set bereq.http.x-is-cluster-fetch = "1";\n' "if (req.http.x-edge-req-id) {\n" @@ -134,11 +208,12 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: "} else if (req.http.x-req-id) {\n" " set bereq.http.x-edge-req-id = req.http.x-req-id;\n" "}\n" - "unset bereq.http.x-req-id;" + "unset bereq.http.x-req-id;\n" + _scoring_guard_close ) pass_vcl = base_unset + ( "\n# [group-L] Record timing start for PASS fetch\n" - "set req.http.x-of-start = time.elapsed.usec;\n" + + _scoring_guard_open + + "set req.http.x-of-start = time.elapsed.usec;\n" "unset bereq.http.x-of-start;\n" 'set bereq.http.x-is-cluster-fetch = "1";\n' "if (req.http.x-edge-req-id) {\n" @@ -146,7 +221,7 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: "} else if (req.http.x-req-id) {\n" " set bereq.http.x-edge-req-id = req.http.x-req-id;\n" "}\n" - "unset bereq.http.x-req-id;" + "unset bereq.http.x-req-id;\n" + _scoring_guard_close ) else: miss_vcl = base_unset @@ -159,7 +234,9 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: if group_l: fetch_lines.append( "# [group-L] Record TTFB and capture origin metadata\n" - 'if (req.http.x-of-start != "") {\n' + # Skip the scoring sub-fetch — we want TTFB for the real + # origin, not the scorer Compute backend. 
+ 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n' " declare local var.ttfb INTEGER;\n" " set var.ttfb = std.atoi(time.elapsed.usec);\n" " set var.ttfb -= std.atoi(req.http.x-of-start);\n" @@ -182,7 +259,10 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: if group_l: error_lines.append( "# [group-L] Capture timing for failed origin fetches\n" - 'if (req.http.x-of-start != "") {\n' + # Skip the scoring sub-fetch — a scorer error is fail-open + # handled by our session-scoring snippet and shouldn't + # pollute the customer's origin-error telemetry. + 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n' " declare local var.ttfb INTEGER;\n" " set var.ttfb = std.atoi(time.elapsed.usec);\n" " set var.ttfb -= std.atoi(req.http.x-of-start);\n" @@ -194,12 +274,14 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: ) snippets["error"] = "\n".join(error_lines) - if group_l or custom_origin: + if group_l or custom_origin or custom_deliver: deliver_lines = [] if group_l: deliver_lines.append( "# [group-L] Record TTLB, capture bytes, strip all internal headers\n" - 'if (req.http.x-of-start != "") {\n' + # Skip scoring sub-fetch — don't capture scorer-leg TTLB + # into the real-request's telemetry. + 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n' " declare local var.ttlb INTEGER;\n" " set var.ttlb = std.atoi(time.elapsed.usec);\n" " set var.ttlb -= std.atoi(req.http.x-of-start);\n" @@ -257,6 +339,20 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]: deliver_lines.append(f" unset resp.http.x-fos-origin-data:{name};") deliver_lines.append("}") + if custom_deliver: + # Deliver-stage fields read from the RESPONSE headers + # (e.g. resp.http.X-Edge-Score after a Compute scorer sub-fetch + # returned). 
The expression in vcl_log_expression points at the + # ``req.http.*`` slot the upstream snippet copied it into — same + # final namespace as edge fields, just captured a stage later in + # the request lifecycle. + deliver_lines.append("# --- Custom Deliver Fields ---") + for cf in custom_deliver: + name = cf["name"] + deliver_lines.append(f'if ({cf["vcl_log_expression"]} != "") {{') + deliver_lines.append(f" set req.http.x-fos-edge-data:{name} = {cf['vcl_log_expression']};") + deliver_lines.append("}") + snippets["deliver"] = "\n".join(deliver_lines) return snippets @@ -303,6 +399,55 @@ def validate_log_format(log_fields_config: dict = None) -> list[str]: return _validate_log_format_regex(raw) +def install_capture_snippets( + service_id: str, + version: int, + log_fields_config: dict | None, + token: str, +) -> None: + """Install the auto-generated "Fastly Log Analysis *" capture VCL + snippets on the given draft version. Idempotent via ``ensure_vcl_ + snippet``'s content/priority diff. + + Mapping table here is the single source of truth for which subroutine + each capture phase targets and at what priority. Both the + full-provisioning path (`ensure_logging_endpoint`) and the + session-scoring orchestrator (which installs onto an existing service + that already has a logging endpoint) call into this helper. + + Note on the Origin Error snippet: a prior copy of this logic in + ``session_scoring_orchestrator.enable_scoring`` omitted the error + snippet install, so a service first provisioned via the orchestrator + silently lacked failed-origin TTFB capture. This helper closes that + drift by installing all phases via one loop. + """ + snippets = generate_capture_vcl(log_fields_config) + # (snippet_name, subroutine_type, priority, required) + # 'required' phases ("recv", "miss", "pass") are always generated. + # Group-L phases ("fetch", "deliver", "error") only exist when + # group L is enabled — guarded by `in snippets`. 
+ install_plan = ( + ("Fastly Log Analysis Capture", "recv", 1, True), + ("Fastly Log Analysis Miss", "miss", 100, True), + ("Fastly Log Analysis Pass", "pass", 100, True), + ("Fastly Log Analysis Origin Fetch", "fetch", 100, False), + ("Fastly Log Analysis Origin Deliver", "deliver", 100, False), + ("Fastly Log Analysis Origin Error", "error", 100, False), + ) + for snip_name, kind, priority, required in install_plan: + if not required and kind not in snippets: + continue + ensure_vcl_snippet( + snip_name, + kind, + snippets[kind], + priority, + service_id, + version, + token, + ) + + def _validate_log_format_regex(raw: str) -> list[str]: """Regex-based fallback log format checks.""" errors = [] @@ -698,26 +843,7 @@ def ensure_logging_endpoint(cfg: dict, fos_access_key: str, fos_secret_key: str, if status_cb: status_cb("⏳ Deploying VCL snippets to capture edge values...") - vcl_snippets = generate_capture_vcl(cfg.get("log_fields")) - ensure_vcl_snippet("Fastly Log Analysis Capture", "recv", vcl_snippets["recv"], 1, service_id, new_ver, token) - ensure_vcl_snippet("Fastly Log Analysis Miss", "miss", vcl_snippets["miss"], 100, service_id, new_ver, token) - ensure_vcl_snippet("Fastly Log Analysis Pass", "pass", vcl_snippets["pass"], 100, service_id, new_ver, token) - if "fetch" in vcl_snippets: - ensure_vcl_snippet( - "Fastly Log Analysis Origin Fetch", "fetch", vcl_snippets["fetch"], 100, service_id, new_ver, token - ) - ensure_vcl_snippet( - "Fastly Log Analysis Origin Error", "error", vcl_snippets["error"], 100, service_id, new_ver, token - ) - ensure_vcl_snippet( - "Fastly Log Analysis Origin Deliver", - "deliver", - vcl_snippets["deliver"], - 100, - service_id, - new_ver, - token, - ) + install_capture_snippets(service_id, new_ver, cfg.get("log_fields"), token) ok("Logging endpoint and VCL snippets added to draft") @@ -991,27 +1117,8 @@ def update_logging_endpoint(cfg: dict, token: str): yield {"type": "progress", "current": 3, "total": total_steps} - 
vcl_snippets = generate_capture_vcl(lf_config) - ensure_vcl_snippet("Fastly Log Analysis Capture", "recv", vcl_snippets["recv"], 1, service_id, new_ver, token) - ensure_vcl_snippet("Fastly Log Analysis Miss", "miss", vcl_snippets["miss"], 100, service_id, new_ver, token) - ensure_vcl_snippet("Fastly Log Analysis Pass", "pass", vcl_snippets["pass"], 100, service_id, new_ver, token) - if "fetch" in vcl_snippets: - ensure_vcl_snippet( - "Fastly Log Analysis Origin Fetch", "fetch", vcl_snippets["fetch"], 100, service_id, new_ver, token - ) - ensure_vcl_snippet( - "Fastly Log Analysis Origin Error", "error", vcl_snippets["error"], 100, service_id, new_ver, token - ) - ensure_vcl_snippet( - "Fastly Log Analysis Origin Deliver", - "deliver", - vcl_snippets["deliver"], - 100, - service_id, - new_ver, - token, - ) - else: + install_capture_snippets(service_id, new_ver, lf_config, token) + if "fetch" not in generate_capture_vcl(lf_config): for snip in [ "Fastly Log Analysis Origin Fetch", "Fastly Log Analysis Origin Error", diff --git a/backend/provision/orchestrator.py b/backend/provision/orchestrator.py index a3993699..b9aad6d6 100644 --- a/backend/provision/orchestrator.py +++ b/backend/provision/orchestrator.py @@ -1,10 +1,13 @@ import json +import logging import os import queue import shutil import threading import time +logger = logging.getLogger(__name__) + from backend.core import log_fields as lf from backend.core.fastly.client import fastly from backend.core.fastly.utils import ( @@ -37,18 +40,58 @@ def _sync_crontab(): def write_service_config(state: dict): - """Write a service config JSON file to configs/{service_id}.json.""" + """Write a service config JSON file to configs/{service_id}.json. + + PRESERVE-ON-RE-RUN: this function is called from /api/provision/ingest + (analyst-join, wizard re-run, key rotation). 
The ``state`` dict is the + request body — it has no awareness of code-managed keys that + ``enable_scoring`` / ``ngwaf_workspace_id`` PATCH / log_fields PATCH + may have injected into the existing config. Without preserving those + keys, re-running the wizard silently strips ``cfg["scoring"]``, + ``cfg["log_fields"]["custom_fields"]``, and ``cfg["ngwaf_workspace_id"]`` + — same bug class as the 2026-06-02 state_sync incident, just with the + request body as the stale-overwriter instead of FOS admin_state.json. + """ from backend import config as svcconfig service_id = state.get("logging_service_id") or state.get("service_id") db_path = svcconfig.duckdb_path(service_id) + # Snapshot the existing on-disk cfg so we can preserve code-managed + # keys that the request body doesn't carry. None on first-ever ingest + # (which is fine — there's nothing to preserve). + existing_cfg = svcconfig.load_config(service_id) or {} + fos_key = state.get("fos_access_key_id") or state.get("fos_access_key", "") fos_secret = state.get("fos_secret_access_key") or state.get("fos_secret_key", "") bucket = state.get("fos_bucket") or state.get("fos_bucket_name", "") region = state.get("fos_region", "us-east-1") cdn_url = state.get("cdn_url", "") + # Build log_fields: prefer the request body, but if the request body + # omits custom_fields (or sends an empty list) AND we have existing + # custom_fields on disk, preserve them. Then if scoring is enabled, + # re-inject the canonical _SCORING_CUSTOM_FIELDS from code. + incoming_lf = dict(state.get("log_fields") or {}) + incoming_custom = incoming_lf.get("custom_fields") + existing_custom = list((existing_cfg.get("log_fields") or {}).get("custom_fields") or []) + if not incoming_custom and existing_custom: + incoming_lf["custom_fields"] = existing_custom + # Re-inject scoring fields from code when scoring is enabled in either + # the incoming state OR the existing cfg (the wizard re-run rarely + # carries scoring in the body). 
+ scoring_block = state.get("scoring") or existing_cfg.get("scoring") or {} + if scoring_block.get("enabled"): + from backend.provision.session_scoring_orchestrator import ( + _SCORING_CUSTOM_FIELDS, + _SCORING_FIELD_NAMES, + ) + + current_custom = list(incoming_lf.get("custom_fields") or []) + current_custom = [cf for cf in current_custom if cf.get("name") not in _SCORING_FIELD_NAMES] + current_custom.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS) + incoming_lf["custom_fields"] = current_custom + cfg = { "service_id": service_id, "name": state.get("name") or state.get("service_name") or service_id, @@ -66,9 +109,19 @@ def write_service_config(state: dict): "fastly_api_key": state.get("fastly_api_key") or state.get("admin_token", ""), "log_retention_days": int(state.get("log_retention_days", 30)), "duckdb_path": db_path, - "log_fields": state.get("log_fields", {}), + "log_fields": incoming_lf, } + # Preserve code-managed top-level keys that the request body doesn't + # carry — primarily ``scoring`` (set by enable_scoring) and + # ``ngwaf_workspace_id`` (set by the NGWAF-config PATCH). Anything else + # the existing cfg has that the wizard body lacks survives the rewrite. 
+ for preserved_key in ("scoring", "ngwaf_workspace_id"): + if preserved_key not in state and preserved_key in existing_cfg: + cfg[preserved_key] = existing_cfg[preserved_key] + elif preserved_key in state: + cfg[preserved_key] = state[preserved_key] + if "log_period" in state: cfg["log_period"] = state["log_period"] elif "log_period" in state.get("provisioning", {}): @@ -457,12 +510,38 @@ def cleanup_local_data(service_id: str, bucket: str = None, remove_data: bool = pass if bucket: - # Look for cache dir in both common locations - for base in [os.getcwd(), os.path.join(os.path.dirname(__file__), "..", "..")]: - svc_cache_dir = os.path.join(base, "cache", bucket) - if os.path.exists(svc_cache_dir): - shutil.rmtree(svc_cache_dir) - ok(f"Removed local cache: {svc_cache_dir}") + # Security: ``bucket`` is supplied via the provisioning + # API and historically had no path-shape validation. A payload + # like ``../../../tmp/anything`` would compose with + # os.path.join to produce a path outside the cache root and + # shutil.rmtree would happily wipe whatever lived there. + # Reject any separator/traversal token up front, then + # additionally verify the resolved path stays under the + # resolved cache root (defense in depth — catches edge cases + # like symlink escapes from inside an attacker-writable + # parent dir). + if any(c in bucket for c in ("/", "\\", "..", "\x00")): + logger.warning("[teardown] refusing to remove cache for bucket=%r with path-shape characters", bucket) + else: + for base in [os.getcwd(), os.path.join(os.path.dirname(__file__), "..", "..")]: + cache_root = os.path.realpath(os.path.join(base, "cache")) + svc_cache_dir = os.path.realpath(os.path.join(cache_root, bucket)) + # Reject anything that resolved outside the cache root — + # belt-and-suspenders for symlinks pointing elsewhere. 
+ try: + common = os.path.commonpath([cache_root, svc_cache_dir]) + except ValueError: + continue + if common != cache_root: + logger.warning( + "[teardown] refusing to remove cache: resolved path %s escapes %s", + svc_cache_dir, + cache_root, + ) + continue + if os.path.exists(svc_cache_dir): + shutil.rmtree(svc_cache_dir) + ok(f"Removed local cache: {svc_cache_dir}") _sync_crontab() @@ -476,6 +555,15 @@ def generate_analyst_invite(service_id: str) -> dict: if cfg.get("access_level") != "read_write": raise RuntimeError("Invite generation requires a read_write service configuration") api_token = cfg.get("fastly_api_key", "").strip() + # Fail fast when the stored token is missing. Without this, the Fastly + # API call below would go out with token="" and either time out or + # return an error envelope; either way the downstream key["access_key"] + # would raise an unhelpful KeyError instead of a clean 400-style message. + # Caller (route handler) wraps RuntimeError → HTTPException(400). + if not api_token: + raise RuntimeError( + f"Service {service_id} has no stored fastly_api_key. Rotate the credential before generating a viewer key." + ) bucket = cfg.get("fos_bucket", "") region = cfg.get("fos_region", "us-east-1") key = fastly( @@ -488,6 +576,13 @@ def generate_analyst_invite(service_id: str) -> dict: }, token=api_token, ) + # Defensive: a malformed Fastly response shouldn't bubble up as a raw + # KeyError on access_key / secret_key — surface a clear error instead. + if not isinstance(key, dict) or "access_key" not in key or "secret_key" not in key: + raise RuntimeError( + f"Fastly access-key API returned unexpected shape (keys={list(key.keys()) if isinstance(key, dict) else type(key).__name__}); " + "cannot generate analyst invite." 
+ ) iceberg_metadata_location = None try: diff --git a/backend/provision/session_scoring_orchestrator.py b/backend/provision/session_scoring_orchestrator.py new file mode 100644 index 00000000..340847a3 --- /dev/null +++ b/backend/provision/session_scoring_orchestrator.py @@ -0,0 +1,1064 @@ +"""End-to-end ``enable_scoring`` / ``disable_scoring`` for a single +customer's logging service. + +This is the user-facing "turn on session scoring" flow. It composes the +existing primitives: + + - ``ensure_scoring_service`` / ``delete_scoring_service`` (Compute + service + ConfigStores + AES key + resource links, in + backend/provision/session_scoring_setup.py) + - ``scripts/scoring/deploy_wasm.sh`` (build + push the Wasm) + - ``ensure_vcl_snippet`` + ``ensure_condition`` (Fastly idempotent + helpers from backend/core/fastly/service.py) + - ``update_logging_endpoint`` (regenerate log format + push, from + backend/provision/fastly_api.py) + +The VCL mutation follows the same proven pattern as +``ensure_logging_endpoint`` ([backend/provision/fastly_api.py:636](backend/provision/fastly_api.py#L636)): + get_active → clone → mutate draft → validate → activate + → on any exception, re-activate the prior version (leave the draft + dangling for debug) and re-raise. 
+""" + +from __future__ import annotations + +import datetime as _dt +import logging +import subprocess +import urllib.parse +from pathlib import Path +from typing import Any + +from backend import config as svcconfig +from backend.core.fastly.client import fastly +from backend.core.fastly.service import ( + ensure_vcl_snippet, + get_active_version, + list_vcl_snippets, +) +from backend.provision.session_scoring_setup import ( + delete_scoring_service, + ensure_scoring_service, +) +from backend.provision.session_scoring_vcl import ( + SCORING_BACKEND_API_NAME, + SCORING_DELIVER_NAME, + SCORING_ENFORCE_NAME, + SCORING_FETCH_NAME, + SCORING_FETCH_PRIORITY, + SCORING_MISS_NAME, + SCORING_PASS_NAME, + SCORING_RECV_NAME, + SCORING_SNIPPET_PRIORITY, + generate_scoring_vcl, + scoring_snippet_names, +) +from backend.provision.utils import BOLD, _c, fail, info, ok, warn + +logger = logging.getLogger(__name__) + +# Locations of the matrix files relative to repo root. +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +_MATRIX_PATH = _REPO_ROOT / "compute" / "scorer" / "matrix.json" +_DEPLOY_WASM_SCRIPT = _REPO_ROOT / "scripts" / "scoring" / "deploy_wasm.sh" + +# Custom-field definitions the orchestrator adds/removes when enabling/ +# disabling scoring. Kept as a single source of truth so disable_scoring +# can find them by name to undo cleanly. +# vcl_log_expression points at req.http.x-fos-edge-data:edge_* subfields +# (NOT the source req.http.x-edge-data:* subfields). Why: subfield writes +# in vcl_recv propagate to the log emitter; writes anywhere else don't. +# Our session_scoring recv snippet (pass 2) copies x-edge-data:* into +# x-fos-edge-data:edge_* exactly so this log format can read them. +# stage="deliver" is kept so the field shows up in the right tab in the +# UI; the value is actually populated in recv pass 2 via the manual +# promotion in session_scoring_vcl.recv_snippet. 
+_SCORING_CUSTOM_FIELDS: list[dict[str, Any]] = [ + { + "name": "edge_score", + "label": "Edge Score", + "description": "Combined session-anomaly score (0–100, quantized to nearest 5) from the edge scorer.", + "vcl_log_expression": "req.http.x-edge-score:score", + "collection_stage": "deliver", + "duckdb_type": "INTEGER", + "value_type": "numeric", + "bytes_estimate": 4, + "enabled": True, + }, + { + "name": "edge_score_l1", + "label": "Edge Score (Layer 1)", + "description": "Layer-1 (universal behavioral) score contribution.", + "vcl_log_expression": "req.http.x-edge-score:l1", + "collection_stage": "deliver", + "duckdb_type": "INTEGER", + "value_type": "numeric", + "bytes_estimate": 4, + "enabled": True, + }, + { + "name": "edge_score_l2", + "label": "Edge Score (Layer 2)", + "description": "Layer-2 (route transition) score contribution.", + "vcl_log_expression": "req.http.x-edge-score:l2", + "collection_stage": "deliver", + "duckdb_type": "INTEGER", + "value_type": "numeric", + "bytes_estimate": 4, + "enabled": True, + }, + { + "name": "edge_cookie_compliance", + "label": "Cookie Compliance", + "description": "ok | missing | tampered | unknown.", + "vcl_log_expression": "req.http.x-edge-score:compliance", + "collection_stage": "deliver", + "duckdb_type": "VARCHAR", + "value_type": "string", + "bytes_estimate": 10, + "enabled": True, + }, + { + "name": "edge_score_reason", + "label": "Score Reason", + "description": "Comma-separated list of fired scoring rules.", + "vcl_log_expression": "req.http.x-edge-score:reason", + "collection_stage": "deliver", + "duckdb_type": "VARCHAR", + "value_type": "string", + "bytes_estimate": 60, + "enabled": True, + }, + { + "name": "edge_sid", + "label": "Session ID", + "description": ( + "12-hex-char rotating session id from the edge scorer cookie. " + "Empty when the inbound request had no valid cookie. Used as " + "the key for admin session labels (good / bad / neutral)." 
+ ), + "vcl_log_expression": "req.http.x-edge-score:sid", + "collection_stage": "deliver", + "duckdb_type": "VARCHAR", + "value_type": "string", + "bytes_estimate": 12, + "enabled": True, + }, +] +_SCORING_FIELD_NAMES = {cf["name"] for cf in _SCORING_CUSTOM_FIELDS} + + +def _deploy_wasm(scoring_service_id: str, token: str, status_cb=None) -> None: + """Invoke scripts/scoring/deploy_wasm.sh as a subprocess. + + If the trained matrix exists (`compute/scorer/matrix.json` with + vocab_size > 0) it gets embedded; otherwise we deploy with the empty + default and L2 self-disables. The script's `trap EXIT` restores the + default placeholder afterward so the working tree stays clean. + """ + info("Building + deploying Wasm to the scoring Compute service") + if status_cb: + status_cb("⏳ Building + deploying Wasm to the scoring service...") + + if not _DEPLOY_WASM_SCRIPT.exists(): + raise RuntimeError(f"deploy script not found at {_DEPLOY_WASM_SCRIPT}") + + cmd = [ + str(_DEPLOY_WASM_SCRIPT), + "--service-id", + scoring_service_id, + "--token", + token, + ] + # Only pass --matrix if a trained one exists; otherwise the script + # uses the empty default (and refuses to deploy a real-matrix-required + # path, which is correct for the first enable when nothing's trained + # yet). We pre-check vocab_size to give a clear error if a malformed + # matrix is sitting in the path. + if _MATRIX_PATH.exists(): + import json as _json + + try: + with _MATRIX_PATH.open() as f: + m = _json.load(f) + if m.get("vocab_size", 0) > 0: + cmd.extend(["--matrix", str(_MATRIX_PATH)]) + info(f" using trained matrix (vocab_size={m['vocab_size']}, version={m.get('version')})") + else: + info(" trained matrix is empty; deploying with default-empty (L2 disabled)") + except Exception: + warn(" matrix.json present but unreadable; falling back to default-empty") + + # If no real matrix, the script's vocab_size==0 check would fail. 
Skip + # passing --matrix entirely so it just rebuilds with whatever's in + # matrix.default.json (i.e. the tracked empty default). + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=str(_REPO_ROOT), + ) + if proc.returncode != 0: + # Surface the script's stderr so the operator can see what failed. + raise RuntimeError( + f"deploy_wasm.sh failed (exit {proc.returncode}):\n" + f"--- stdout ---\n{proc.stdout}\n--- stderr ---\n{proc.stderr}" + ) + ok("Wasm deployed to scoring service") + + +def _add_scoring_backend( + logging_service_id: str, + version: int, + scoring_domain: str, + token: str, +) -> None: + """Add the scoring Compute service as a backend on the cloned VCL + version. Backend name is the constant from session_scoring_vcl so the + recv snippet can reference it by name.""" + payload = { + "name": SCORING_BACKEND_API_NAME, + "address": scoring_domain, + "port": 443, + "use_ssl": True, + "ssl_cert_hostname": scoring_domain, + "ssl_sni_hostname": scoring_domain, + # The Fastly Compute service routes by Host header. Without + # override_host, the upstream Host arrives as the customer's + # domain (e.g. www.example.com) and the scorer's + # edgecompute.app service can't dispatch it — TLS SNI matches + # but the Host header doesn't. Forcing it to the scoring + # domain fixes routing. + "override_host": scoring_domain, + # The edgecompute.app cert is from Fastly's internal CA and may not + # validate cleanly when one Fastly service backends to another. Both + # ends are inside Fastly's network so we trade strict verification + # for reliability — security is not at risk because the path never + # leaves Fastly's edge. + "ssl_check_cert": False, + "auto_loadbalance": False, + # Aggressive: Wasm execution is ~600µs and intra-Fastly network + # adds ~5-20ms warm-state. 50ms gives ~2.5x typical round-trip. + # Cold-start Compute instances (rare in production) will fail- + # open at this budget — acceptable trade vs. holding real users. 
+ # If fail-open rate climbs, bump these back up after seeing + # per-POP latency distributions. + "connect_timeout": 50, + "first_byte_timeout": 50, + "between_bytes_timeout": 50, + } + # Idempotent: if the backend already exists, PUT-update it when the + # config has drifted (e.g. we tuned the timeouts). POST a new one + # only when it's missing. Without the PUT path, re-running enable + # on a version with an existing backend would silently keep stale + # timeouts in place. + existing_match = None + try: + existing = ( + fastly( + "GET", + f"/service/{logging_service_id}/version/{version}/backend", + token=token, + ) + or [] + ) + for b in existing: + if b.get("name") == SCORING_BACKEND_API_NAME: + existing_match = b + break + except RuntimeError: + pass + + if existing_match is not None: + drift = any(existing_match.get(k) != v for k, v in payload.items() if k in existing_match) + if not drift: + ok(f"Scoring backend already current on version {version}") + return + encoded = urllib.parse.quote(SCORING_BACKEND_API_NAME, safe="") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{version}/backend/{encoded}", + payload, + token=token, + ) + ok(f"Updated scoring backend {SCORING_BACKEND_API_NAME} (drifted settings)") + return + + fastly( + "POST", + f"/service/{logging_service_id}/version/{version}/backend", + payload, + token=token, + ) + ok(f"Added scoring backend {SCORING_BACKEND_API_NAME} ({scoring_domain})") + + +def _remove_scoring_backend(logging_service_id: str, version: int, token: str) -> None: + """Remove the scoring backend (idempotent — 404 is fine).""" + encoded = urllib.parse.quote(SCORING_BACKEND_API_NAME, safe="") + try: + fastly( + "DELETE", + f"/service/{logging_service_id}/version/{version}/backend/{encoded}", + token=token, + expect_empty=True, + ) + ok(f"Removed scoring backend {SCORING_BACKEND_API_NAME}") + except RuntimeError as exc: + if "404" in str(exc): + ok("Scoring backend already absent") + else: + raise + + +def 
_remove_scoring_snippets(logging_service_id: str, version: int, token: str) -> None: + """Delete the six scoring snippets by name (idempotent).""" + present = set(list_vcl_snippets(logging_service_id, version, token)) + for name in scoring_snippet_names(): + if name not in present: + continue + encoded = urllib.parse.quote(name, safe="") + try: + fastly( + "DELETE", + f"/service/{logging_service_id}/version/{version}/snippet/{encoded}", + token=token, + expect_empty=True, + ) + ok(f"Removed snippet {name}") + except RuntimeError as exc: + if "404" in str(exc): + continue + raise + + +def _add_scoring_custom_fields(cfg: dict) -> dict: + """Merge the 6 scoring custom_fields into cfg.log_fields.custom_fields. + Existing fields with the same name are replaced (idempotent re-runs + pick up any tuning we've done to bytes_estimate / label / etc.).""" + cfg.setdefault("log_fields", {}) + cfg["log_fields"].setdefault("custom_fields", []) + existing = [cf for cf in cfg["log_fields"]["custom_fields"] if cf.get("name") not in _SCORING_FIELD_NAMES] + cfg["log_fields"]["custom_fields"] = existing + [dict(cf) for cf in _SCORING_CUSTOM_FIELDS] + return cfg + + +def _remove_scoring_custom_fields(cfg: dict) -> dict: + """Strip the 6 scoring custom_fields from cfg, leaving any others + untouched.""" + if "log_fields" not in cfg or "custom_fields" not in cfg["log_fields"]: + return cfg + cfg["log_fields"]["custom_fields"] = [ + cf for cf in cfg["log_fields"]["custom_fields"] if cf.get("name") not in _SCORING_FIELD_NAMES + ] + return cfg + + +def enable_scoring( + logging_service_id: str, + token: str, + *, + status_cb=None, +) -> dict[str, Any]: + """Provision (or reuse) the Compute scoring service, deploy the Wasm, + then mutate the customer's VCL service to call it via the restart + pattern. + + Idempotent — re-running with scoring already enabled returns the + existing state without making changes (the underlying ensure_* helpers + are all no-ops on the happy path). 
+ + Returns: + { + "scoring_service_id": "...", + "scoring_service_name": "Session Scoring Service for {id}", + "scoring_domain": "fos-...-session-scorer.edgecompute.app", + "scoring_keys_store_id": "...", + "scoring_config_store_id": "...", + "aes_key_hex": "..." (only on first creation), + "logging_service_active_version": int (post-activate), + } + """ + cfg = svcconfig.load_config(logging_service_id) + if not cfg: + raise RuntimeError(f"No config found for logging service {logging_service_id}") + + # ── Stage 1: Compute scoring service + AES key + ConfigStores. ────────── + info(f"Enabling session scoring for {_c(BOLD, logging_service_id)}") + if status_cb: + status_cb(f"⏳ Enabling session scoring for {logging_service_id}...") + + # On a re-run we lose `aes_key_hex` and `request_secret` from + # ensure_scoring_service (they're write-only in the ConfigStore). + # Preserve whatever the prior provision stashed in cfg so VCL + # generation still has the secret available. If neither has one + # (e.g. the scoring service was provisioned before the secret + # feature existed), generate a fresh one and PATCH it into the + # ConfigStore so this enable is self-healing. + prior_scoring = cfg.get("scoring") or {} + + scoring_meta = ensure_scoring_service(logging_service_id, token, status_cb=status_cb) + scoring_service_id = scoring_meta["scoring_service_id"] + scoring_domain = scoring_meta["scoring_domain"] + request_secret = scoring_meta.get("request_secret") or prior_scoring.get("request_secret") or "" + if not request_secret: + import secrets as _secrets + + request_secret = _secrets.token_hex(32) + keys_store_id = scoring_meta.get("scoring_keys_store_id") or prior_scoring.get("scoring_keys_store_id") + if not keys_store_id: + raise RuntimeError("Cannot heal missing request_secret: no scoring_keys_store_id available.") + # Upsert the secret. POST returns 409 if it already exists; in + # that case PATCH instead. 
We try POST first because the common + # case here is "no entry exists yet". + try: + fastly( + "POST", + f"/resources/stores/config/{keys_store_id}/item", + {"item_key": "request_secret", "item_value": request_secret}, + token=token, + ) + except RuntimeError: + fastly( + "PATCH", + f"/resources/stores/config/{keys_store_id}/item/request_secret", + {"item_value": request_secret}, + token=token, + ) + info("Healed missing request_secret in scoring_keys store") + + # ── Stage 2: build + deploy Wasm. ─────────────────────────────────────── + _deploy_wasm(scoring_service_id, token, status_cb=status_cb) + + # ── Stage 3: write scoring metadata into the LOGGING service config. ──── + # Preserve operator-tunable overrides across re-enables — the previous + # implementation replaced the entire ``scoring`` block, silently wiping + # the operator's per-service exclude_url_regex and enforce_status_code. + # Pull them off the pre-existing block before the replace. + from backend.provision.session_scoring_vcl import DEFAULT_ASSET_EXT_REGEX + + prior_scoring = cfg.get("scoring") or {} + cfg["scoring"] = { + "enabled": True, + "scoring_service_id": scoring_service_id, + "scoring_service_name": scoring_meta["scoring_service_name"], + "scoring_domain": scoring_domain, + "scoring_keys_store_id": scoring_meta["scoring_keys_store_id"], + "scoring_config_store_id": scoring_meta["scoring_config_store_id"], + # Stash the secret here so re-runs of enable_scoring can recover + # it (the ConfigStore is write-only from our perspective). The + # config file is gitignored under /configs/* so this never leaks. + "request_secret": request_secret, + "enabled_at": _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds"), + # First-enable defaults: persist the actual values so the admin UI + # shows what's actually in use (no empty-as-sentinel cleverness) + # and so a future change to the bundled default doesn't silently + # alter the per-service behaviour. 
+ "exclude_url_regex": prior_scoring.get("exclude_url_regex") or DEFAULT_ASSET_EXT_REGEX, + } + # Preserve any operator-set enforce_status_code override across the + # block replace; absence means "use the bundled default 429" — there's + # no need to materialise the default since the enforce snippet's + # default arg already covers it. + if prior_scoring.get("enforce_status_code") is not None: + cfg["scoring"]["enforce_status_code"] = prior_scoring["enforce_status_code"] + # Add the scoring custom_fields so update_logging_endpoint picks them up. + _add_scoring_custom_fields(cfg) + svcconfig.save_config(logging_service_id, cfg) + n_scoring = len(_SCORING_FIELD_NAMES) + ok(f"Stashed scoring metadata + {n_scoring} custom_fields into service config") + + # ── Stage 4: clone the LOGGING service's active VCL version. ──────────── + active_ver = get_active_version(logging_service_id, token) + if active_ver is None: + raise RuntimeError(f"Logging service {logging_service_id} has no active version") + info(f"Logging service active version: {active_ver}") + if status_cb: + status_cb(f"🔄 Cloning version {active_ver} to add scoring...") + clone = fastly( + "PUT", + f"/service/{logging_service_id}/version/{active_ver}/clone", + token=token, + ) + new_ver = int(clone["number"]) + ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}", + {"comment": f"Enable session scoring (scorer={scoring_service_id}) {ts}"}, + token=token, + ) + ok(f"Draft version: {new_ver}") + + try: + # ── Stage 5: add the scoring Compute service as a backend. ────────── + _add_scoring_backend(logging_service_id, new_ver, scoring_domain, token) + + # ── Stage 6: install the six scoring VCL snippets. 
────────────────── + info("Installing 6 scoring VCL snippets (recv / pass / fetch / deliver / miss / enforce)") + if status_cb: + status_cb("⏳ Installing scoring VCL snippets...") + # Pick up the operator's overrides (if any) so a re-enable carries + # the customised exclusion regex AND enforce-status-code forward. + # None / "" / out-of-range → defaults. + scoring_cfg = cfg.get("scoring") or {} + exclude_url_regex = scoring_cfg.get("exclude_url_regex") + enforce_status_code = scoring_cfg.get("enforce_status_code") + vcl_snippets = generate_scoring_vcl( + logging_service_id, + request_secret, + exclude_url_regex=exclude_url_regex, + enforce_status_code=enforce_status_code, + ) + for snip_name, vcl_type, prio in ( + (SCORING_RECV_NAME, "recv", SCORING_SNIPPET_PRIORITY), + (SCORING_PASS_NAME, "pass", SCORING_SNIPPET_PRIORITY), + # Fetch gets priority 1 so `return(deliver)` for the scorer + # backend fires before any other fetch-stage snippet runs. + (SCORING_FETCH_NAME, "fetch", SCORING_FETCH_PRIORITY), + (SCORING_DELIVER_NAME, "deliver", SCORING_SNIPPET_PRIORITY), + (SCORING_MISS_NAME, "miss", SCORING_SNIPPET_PRIORITY), + # Enforce snippet runs at recv-restart-2 (priority 101 — after + # the main Recv routing block) and 429s requests the scorer + # flagged via X-Edge-Score-Enforce. Off by default — fires + # only when the operator commits an enforce_threshold via + # the admin UI. + (SCORING_ENFORCE_NAME, "recv", SCORING_SNIPPET_PRIORITY + 1), + ): + ensure_vcl_snippet( + snip_name, + vcl_type, + vcl_snippets[snip_name], + prio, + logging_service_id, + new_ver, + token, + ) + ok("Installed 6 scoring VCL snippets") + + # ── Stage 7: regenerate the capture-VCL + log format for the + # 6 new custom_fields. update_logging_endpoint handles + # both: it diffs the format, pushes the new one, and + # re-runs ensure_vcl_snippet for capture snippets so + # the new deliver-stage capture VCL gets installed. 
+ info("Regenerating log format + capture VCL for scoring fields") + if status_cb: + status_cb("⏳ Updating log format to include score fields...") + # update_logging_endpoint targets the active version by default. + # We want it to write to OUR draft, so we pass a hint via the cfg + # — but update_logging_endpoint doesn't accept a version arg. So + # we call it after activate, which means it'd create yet another + # version. To avoid that double-activation, we manually install + # the capture snippets on the draft here via the shared helper + # (which also installs the Origin Error snippet that an earlier + # inline copy of this logic was silently missing). + from backend.provision.fastly_api import install_capture_snippets + + install_capture_snippets(logging_service_id, new_ver, cfg.get("log_fields"), token) + + # Update the logging endpoint's format string on the draft version. + # The existing s3 logging endpoint must already exist (it was + # provisioned at setup). We PUT to update its format. + from backend.core.fastly.service import list_s3_endpoints + from backend.provision.fastly_api import load_log_format + + endpoint_name = cfg.get("provisioning", {}).get("endpoint_name", "Fastly Object Storage Logs") + existing_endpoints = list_s3_endpoints(logging_service_id, new_ver, token) + if endpoint_name not in existing_endpoints: + warn( + f"Logging endpoint {endpoint_name!r} not found on draft v{new_ver} — " + "skipping format update. Score fields will land in resp headers but not the log line." 
+ ) + else: + new_format = load_log_format(cfg.get("log_fields")) + encoded = urllib.parse.quote(endpoint_name, safe="") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}/logging/s3/{encoded}", + {"format": new_format, "format_version": 2}, + token=token, + ) + ok(f"Updated logging endpoint format to include {len(_SCORING_FIELD_NAMES)} score fields") + + # ── Stage 8: validate ────────────────────────────────────────────── + info(f"Validating draft version {new_ver}") + if status_cb: + status_cb(f"⏳ Validating draft version {new_ver}...") + result = fastly( + "GET", + f"/service/{logging_service_id}/version/{new_ver}/validate", + token=token, + ) + if result.get("status") != "ok": + raise RuntimeError(f"Validation failed: {result.get('errors') or result}") + ok("Draft validated") + + # ── Stage 9: activate ────────────────────────────────────────────── + info(f"Activating version {new_ver}") + if status_cb: + status_cb(f"⏳ Activating version {new_ver}...") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}/activate", + token=token, + ) + ok(f"Version {new_ver} active") + if status_cb: + status_cb(f"✅ Session scoring enabled (active version {new_ver}).") + + scoring_meta["logging_service_active_version"] = new_ver + + # Publish the new custom_fields list to FOS's admin_state.json so + # read_only analyst hosts (and the GCE prod backend) pick them up + # on their next import_admin_state tick. Without this, a stale + # admin_state.json from before scoring was enabled would silently + # strip our 6 custom_fields on every metadata_sync — exactly the + # 2026-06-02 incident that motivated the import_admin_state merge + # fix in backend/state_sync.py. 
+ try: + from backend.state_sync import export_admin_state + + export_admin_state(logging_service_id) + ok("Published custom_fields to FOS admin_state.json") + except Exception as exc: + warn(f"Could not export admin_state to FOS (non-fatal): {exc}") + + # Also publish the trained scoring matrix to FOS so analyst hosts + # (and any fresh backend container) see the exact same matrix + # that's currently embedded in the deployed Wasm. Without this, + # the /scoring/evaluation endpoint falls back to the default-empty + # matrix on read_only hosts and reports AUC ≈ 0.5 even though the + # live scorer is using a real trained one. + try: + from backend.state_sync import publish_matrix_to_fos + + if _MATRIX_PATH.exists(): + import json as _json + + with _MATRIX_PATH.open() as f: + matrix = _json.load(f) + publish_matrix_to_fos(logging_service_id, matrix) + ok(f"Published scoring matrix to FOS (version={matrix.get('version', '?')})") + except Exception as exc: + warn(f"Could not publish scoring matrix to FOS (non-fatal): {exc}") + + return scoring_meta + + except Exception as exc: + # ── Stage 10: rollback ───────────────────────────────────────────── + fail(f"enable_scoring failed: {exc}") + info(f"Rolling back — re-activating version {active_ver}") + try: + fastly( + "PUT", + f"/service/{logging_service_id}/version/{active_ver}/activate", + token=token, + ) + except RuntimeError: + pass + # Also revert the on-disk config so a retry starts from clean state. + # + # DEFENSE-IN-DEPTH: re-load cfg here instead of trusting the in- + # memory copy from line ~381. The Fastly stages above can take + # 30-60s; during that window a concurrent writer (metadata_sync + # tick re-injecting scoring fields, an admin PATCHing log_fields, + # an ngwaf_workspace_id update) may have mutated configs/.json. + # Writing the stale snapshot back wholesale would clobber those + # concurrent changes. 
Re-reading + mutating + saving means we + # only touch the scoring-related keys our rollback is supposed + # to revert. + try: + fresh = svcconfig.load_config(logging_service_id) or cfg + except Exception: + fresh = cfg + fresh.pop("scoring", None) + _remove_scoring_custom_fields(fresh) + svcconfig.save_config(logging_service_id, fresh) + raise + + +def disable_scoring( + logging_service_id: str, + token: str, + *, + status_cb=None, +) -> None: + """Tear down session scoring for this customer. + + Reverse of enable_scoring: clone active VCL → remove the 6 scoring + snippets + scoring backend → strip the 6 custom_fields → regenerate + log format → validate → activate → delete the scoring Compute + service + ConfigStores. Idempotent — 404s tolerated everywhere.""" + cfg = svcconfig.load_config(logging_service_id) + if not cfg: + raise RuntimeError(f"No config found for logging service {logging_service_id}") + + scoring = cfg.get("scoring") or {} + if not scoring.get("enabled"): + warn("Session scoring is not enabled for this service — nothing to disable") + if status_cb: + status_cb("✅ Session scoring already disabled.") + return + + scoring_service_id = scoring.get("scoring_service_id", "") + scoring_keys_store_id = scoring.get("scoring_keys_store_id", "") + scoring_config_store_id = scoring.get("scoring_config_store_id", "") + + info(f"Disabling session scoring for {_c(BOLD, logging_service_id)}") + if status_cb: + status_cb(f"⏳ Disabling session scoring for {logging_service_id}...") + + # ── Stage 1: clone active version. 
────────────────────────────────────── + active_ver = get_active_version(logging_service_id, token) + if active_ver is None: + raise RuntimeError(f"Logging service {logging_service_id} has no active version") + clone = fastly( + "PUT", + f"/service/{logging_service_id}/version/{active_ver}/clone", + token=token, + ) + new_ver = int(clone["number"]) + ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}", + {"comment": f"Disable session scoring {ts}"}, + token=token, + ) + + try: + # ── Stage 2: remove scoring VCL bits. ─────────────────────────────── + _remove_scoring_snippets(logging_service_id, new_ver, token) + _remove_scoring_backend(logging_service_id, new_ver, token) + + # ── Stage 3: drop the 6 custom_fields + regen log format. ─────────── + _remove_scoring_custom_fields(cfg) + svcconfig.save_config(logging_service_id, cfg) + + from backend.core.fastly.service import list_s3_endpoints + from backend.provision.fastly_api import generate_capture_vcl, load_log_format + + capture = generate_capture_vcl(cfg.get("log_fields")) + # Re-install (or remove if no fields left) the capture VCL. + ensure_vcl_snippet( + "Fastly Log Analysis Capture", + "recv", + capture["recv"], + 1, + logging_service_id, + new_ver, + token, + ) + + endpoint_name = cfg.get("provisioning", {}).get("endpoint_name", "Fastly Object Storage Logs") + existing_endpoints = list_s3_endpoints(logging_service_id, new_ver, token) + if endpoint_name in existing_endpoints: + new_format = load_log_format(cfg.get("log_fields")) + encoded = urllib.parse.quote(endpoint_name, safe="") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}/logging/s3/{encoded}", + {"format": new_format, "format_version": 2}, + token=token, + ) + + # ── Stage 4: validate + activate. 
────────────────────────────────── + result = fastly( + "GET", + f"/service/{logging_service_id}/version/{new_ver}/validate", + token=token, + ) + if result.get("status") != "ok": + raise RuntimeError(f"Validation failed: {result.get('errors') or result}") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}/activate", + token=token, + ) + ok(f"Logging service version {new_ver} active (scoring stripped)") + except Exception as exc: + fail(f"disable_scoring VCL phase failed: {exc}") + try: + fastly( + "PUT", + f"/service/{logging_service_id}/version/{active_ver}/activate", + token=token, + ) + except RuntimeError: + pass + raise + + # ── Stage 5: tear down the Compute service + stores. ─────────────────── + delete_scoring_service( + scoring_service_id, + scoring_keys_store_id=scoring_keys_store_id, + scoring_config_store_id=scoring_config_store_id, + token=token, + status_cb=status_cb, + ) + + # ── Stage 6: clear the scoring block from config. ────────────────────── + # DEFENSE-IN-DEPTH: re-load cfg right before the final save. The + # Fastly + Compute teardown stages above can take 60-120s; the + # in-memory cfg loaded at line ~644 is a stale snapshot that would + # clobber any concurrent writer mutations (metadata_sync tick, + # custom_fields PATCH, ngwaf_workspace_id update). Same load-mutate- + # save-just-the-target-keys pattern as the enable_scoring rollback. + try: + fresh = svcconfig.load_config(logging_service_id) or cfg + except Exception: + fresh = cfg + fresh.pop("scoring", None) + svcconfig.save_config(logging_service_id, fresh) + + # Publish the new custom_fields list (now without scoring) so analyst + # boxes stop seeing the scoring entries on their next metadata_sync. 
+ try: + from backend.state_sync import export_admin_state + + export_admin_state(logging_service_id) + except Exception as exc: + warn(f"Could not export admin_state to FOS after disable (non-fatal): {exc}") + + if status_cb: + status_cb("✅ Session scoring disabled.") + ok("Session scoring disabled") + + +def update_recv_exclusion_regex( + logging_service_id: str, + token: str, + *, + new_regex: str, +) -> dict[str, Any]: + """Re-publish ONLY the recv VCL snippet with a new exclusion regex. + + Lighter-weight than running ``enable_scoring`` end-to-end: we keep + the existing Compute service / ConfigStores / Wasm / log-format + untouched, and ONLY clone the active VCL version → swap the recv + snippet body → activate. Takes ~5-10s in practice. + + ``new_regex`` is the operator's pre-validated regex string (already + passed through ``backend.utils.vcl_validator.validate_url_exclusion_regex`` + + falco lint by the API layer). Empty string means "use the default" + and persists as ``None`` in cfg so a future default change auto-picks-up. + + Returns: + { + "effective_regex": str, # what got interpolated + "is_default": bool, + "logging_service_active_version": int, # post-activate + } + + Raises ``RuntimeError`` on any Fastly API failure; the rollback path + re-activates the prior version so the service is never left in an + inconsistent state. + """ + cfg = svcconfig.load_config(logging_service_id) + if not cfg: + raise RuntimeError(f"No config found for logging service {logging_service_id}") + scoring = cfg.get("scoring") or {} + if not scoring.get("enabled"): + raise RuntimeError( + f"Session scoring is not enabled for {logging_service_id}; " + "run enable_scoring first before customising the recv exclusion regex." + ) + request_secret = scoring.get("request_secret") + if not request_secret: + raise RuntimeError( + "Cannot re-publish recv snippet without request_secret in cfg; " + "the snippet bodies for peer snippets depend on it. Re-run enable_scoring." 
+ ) + + # Persist the override first — that way even if the Fastly activation + # below fails, a future enable_scoring run picks up the new value. + # None is the canonical "use default" representation so the JSON cfg + # file doesn't end up with an empty-string sentinel. + cleaned = (new_regex or "").strip() + scoring["exclude_url_regex"] = cleaned or None + cfg["scoring"] = scoring + svcconfig.save_config(logging_service_id, cfg) + + # Generate the recv snippet body with the new regex. + from backend.provision.session_scoring_vcl import ( + DEFAULT_ASSET_EXT_REGEX, + recv_snippet, + resolve_exclude_url_regex, + ) + + effective_regex = resolve_exclude_url_regex(cleaned or None) + is_default = effective_regex == DEFAULT_ASSET_EXT_REGEX + new_recv_body = recv_snippet(logging_service_id, request_secret, exclude_url_regex=cleaned or None) + + # Clone → swap → activate. + active_ver = get_active_version(logging_service_id, token) + if active_ver is None: + raise RuntimeError(f"Logging service {logging_service_id} has no active version") + info(f"Cloning version {active_ver} to update recv-snippet exclusion regex") + clone = fastly("PUT", f"/service/{logging_service_id}/version/{active_ver}/clone", token=token) + new_ver = int(clone["number"]) + ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}", + {"comment": f"Update scoring recv exclusion regex {ts}"}, + token=token, + ) + + try: + ensure_vcl_snippet( + SCORING_RECV_NAME, + "recv", + new_recv_body, + SCORING_SNIPPET_PRIORITY, + logging_service_id, + new_ver, + token, + ) + result = fastly("GET", f"/service/{logging_service_id}/version/{new_ver}/validate", token=token) + if result.get("status") != "ok": + raise RuntimeError(f"Validation failed: {result.get('errors') or result}") + fastly( + "PUT", + f"/service/{logging_service_id}/version/{new_ver}/activate", + token=token, + ) + ok(f"Logging service version {new_ver} active (recv 
def update_enforce_status_code(
    logging_service_id: str,
    token: str,
    *,
    new_status_code: int | None,
) -> dict[str, Any]:
    """Republish only the enforce VCL snippet so it rejects with a new status code.

    Flow: load the stored service config, persist the override, render the
    enforce snippet, then clone the active Fastly version, swap the snippet
    on the clone, validate the clone and activate it.

    The override is written to the stored config before the
    ``request_secret`` check and before any Fastly call, so it is kept even
    when a later step raises. The default code is stored as ``None``.

    Args:
        logging_service_id: Fastly service whose enforce snippet is replaced.
        token: Fastly API token forwarded to every API call.
        new_status_code: Requested status code, or ``None`` for the default.
            Normalised through ``resolve_enforce_status_code``.

    Returns:
        ``{"effective_status_code": int, "is_default": bool,
        "logging_service_active_version": int}`` — the version is the newly
        activated clone.

    Raises:
        RuntimeError: no stored config, scoring not enabled, no
            ``request_secret`` in the scoring config, no active version, or
            the clone fails validation. Any exception raised while swapping,
            validating or activating is re-raised after a best-effort
            re-activation of the previously active version.
    """
    service_cfg = svcconfig.load_config(logging_service_id)
    if not service_cfg:
        raise RuntimeError(f"No config found for logging service {logging_service_id}")

    scoring_cfg = service_cfg.get("scoring") or {}
    scoring_enabled = scoring_cfg.get("enabled")
    if not scoring_enabled:
        raise RuntimeError(
            f"Session scoring is not enabled for {logging_service_id}; "
            "run enable_scoring first before customising the enforce status code."
        )

    from backend.provision.session_scoring_vcl import (
        DEFAULT_ENFORCE_STATUS_CODE,
        enforce_snippet,
        resolve_enforce_status_code,
    )

    # Save the override up front; the default is represented as None.
    status_code = resolve_enforce_status_code(new_status_code)
    uses_default = status_code == DEFAULT_ENFORCE_STATUS_CODE
    if uses_default:
        scoring_cfg["enforce_status_code"] = None
    else:
        scoring_cfg["enforce_status_code"] = status_code
    service_cfg["scoring"] = scoring_cfg
    svcconfig.save_config(logging_service_id, service_cfg)

    # The enforce snippet embeds the request secret, so it cannot be
    # rendered without one.
    shared_secret = scoring_cfg.get("request_secret")
    if not shared_secret:
        raise RuntimeError(
            "Cannot re-publish enforce snippet without request_secret in cfg; "
            "run enable_scoring first or restore scoring.request_secret."
        )
    snippet_body = enforce_snippet(shared_secret, status_code)

    previous_version = get_active_version(logging_service_id, token)
    if previous_version is None:
        raise RuntimeError(f"Logging service {logging_service_id} has no active version")

    versions_url = f"/service/{logging_service_id}/version"
    info(f"Cloning version {previous_version} to update enforce-snippet status code → {status_code}")
    cloned = fastly("PUT", f"{versions_url}/{previous_version}/clone", token=token)
    draft_version = int(cloned["number"])
    stamp = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
    fastly(
        "PUT",
        f"{versions_url}/{draft_version}",
        {"comment": f"Update scoring enforce status code → {status_code} ({stamp})"},
        token=token,
    )

    try:
        ensure_vcl_snippet(
            SCORING_ENFORCE_NAME,
            "recv",
            snippet_body,
            SCORING_SNIPPET_PRIORITY + 1,
            logging_service_id,
            draft_version,
            token,
        )
        validation = fastly("GET", f"{versions_url}/{draft_version}/validate", token=token)
        if validation.get("status") != "ok":
            raise RuntimeError(f"Validation failed: {validation.get('errors') or validation}")
        fastly("PUT", f"{versions_url}/{draft_version}/activate", token=token)
        ok(f"Logging service version {draft_version} active (enforce status code → {status_code})")
    except Exception as exc:
        fail(f"update_enforce_status_code failed: {exc}")
        # Best-effort: put the previously active version back. A
        # RuntimeError from this call is ignored so the original error
        # is the one that propagates.
        try:
            fastly("PUT", f"{versions_url}/{previous_version}/activate", token=token)
        except RuntimeError:
            pass
        raise

    return {
        "effective_status_code": status_code,
        "is_default": uses_default,
        "logging_service_active_version": draft_version,
    }
+ +Pattern mirrors ``ensure_cdn_service`` / ``delete_cdn_service`` in +``backend.provision.fastly_api`` — primitive args in, status callback for +SSE progress, idempotent in both directions, no implicit state outside +the Fastly API + returned dict. + +Naming convention (from the research doc): + - service name: ``Session Scoring Service for {logging_service_id}`` + - domain: ``fos-{logging_service_id.lower()}-session-scorer.edgecompute.app`` + - keys store: ``scoring_keys_{compute_service_id}`` + - config store: ``scoring_config_{compute_service_id}`` + +The Wasm deploy itself (``fastly compute deploy``) is NOT done here — +that's the matrix-deploy concern owned by ``scripts/scoring/deploy_wasm.sh`` +and gets invoked separately after a training run produces a matrix. This +keeps the provisioner small (~5s API calls) and the deploy slow (~30s +build + upload) as distinct lifecycle stages. +""" + +from __future__ import annotations + +import secrets +from typing import Any + +from backend.core.fastly.client import fastly +from backend.provision.utils import BOLD, _c, info, ok, warn + +SCORING_SERVICE_NAME_PREFIX = "Session Scoring Service for " +SCORING_DOMAIN_TEMPLATE = "fos-{sid_lower}-session-scorer.edgecompute.app" +KEYS_STORE_NAME_TEMPLATE = "scoring_keys_{sid}" +CONFIG_STORE_NAME_TEMPLATE = "scoring_config_{sid}" + +# Resource-link names match the ConfigStore::open() arguments in +# compute/scorer/src/main.rs. Both must be edited in lockstep. +KEYS_RESOURCE_LINK_NAME = "scoring_keys" +CONFIG_RESOURCE_LINK_NAME = "scoring_config" + +# Initial values for the config stores. +DEBUG_LOG_KEY = "debug_logging_enabled" +DEBUG_LOG_DEFAULT = "0" +CURRENT_KEY_HEX = "current_key_hex" +PREVIOUS_KEY_HEX = "previous_key_hex" # blank until first rotation +# Shared secret VCL → Compute. The customer's VCL service embeds this +# secret in the X-Edge-Scorer-Auth request header before calling the +# scorer; the scorer rejects requests without a matching value. 
# Stops the scorer's edgecompute.app domain from being scored on by anyone
# who happens to find the hostname.
REQUEST_SECRET_KEY = "request_secret"


def _scoring_service_name(logging_service_id: str) -> str:
    """Return the Fastly service name used for this logging service's scorer."""
    return f"{SCORING_SERVICE_NAME_PREFIX}{logging_service_id}"


def _scoring_domain(logging_service_id: str) -> str:
    """Return the scorer domain; the service id is lower-cased into the template."""
    return SCORING_DOMAIN_TEMPLATE.format(sid_lower=logging_service_id.lower())


def _find_scoring_service(logging_service_id: str, token: str) -> dict | None:
    """Return the existing scoring service for this logging service, if any.

    Looks the service up by its exact name in the ``GET /service`` listing.
    ``ensure_scoring_service`` uses the result to reuse a service instead of
    creating a duplicate.

    Returns ``None`` when no service has that name, and also when the
    listing call raises ``RuntimeError`` — callers cannot tell the two apart.
    """
    name = _scoring_service_name(logging_service_id)
    try:
        # NOTE(review): Fastly's service listing is paginated — confirm the
        # `fastly` client follows pages, otherwise a match beyond the first
        # page is reported as "not found".
        services = fastly("GET", "/service", token=token) or []
    except RuntimeError:
        return None
    for svc in services:
        if svc.get("name") == name:
            return svc
    return None


def _find_config_store(store_name: str, token: str) -> dict | None:
    """Return the config store named ``store_name``, or ``None``.

    ``None`` is returned when no store has that name, when the listing call
    raises ``RuntimeError``, and when the listing comes back empty.
    """
    try:
        resp = fastly("GET", "/resources/stores/config", token=token)
    except RuntimeError:
        return None
    # The listing is either a bare list or a {"data": [...]} envelope;
    # tolerate both. An empty/None response or a null "data" field is
    # treated as "no stores" instead of raising AttributeError/TypeError.
    if isinstance(resp, list):
        items = resp
    else:
        items = (resp or {}).get("data") or []
    for item in items:
        if item.get("name") == store_name:
            return item
    return None
def ensure_scoring_service(
    logging_service_id: str,
    token: str,
    *,
    status_cb=None,
) -> dict[str, Any]:
    """Create, or reuse, the session-scoring Compute service for a logging service.

    When ``_find_scoring_service`` returns a service, nothing is created:
    the two config stores are looked up by name and their ids returned
    (``""`` for a store that is not found). ``aes_key_hex`` and
    ``request_secret`` are ``""`` on this path.

    Otherwise the following are created, in order, against version 1 of the
    new service: the ``wasm`` service, its domain, a placeholder backend,
    the keys and config stores, the store items (a fresh 32-byte AES key,
    a fresh 32-byte request secret, the debug-logging default), and the two
    resource links.

    Args:
        logging_service_id: Id the scoring service name and domain derive from.
        token: Fastly API token forwarded to every API call.
        status_cb: Optional callable receiving human-readable progress strings.

    Returns:
        Dict with ``scoring_service_id``, ``scoring_service_name``,
        ``scoring_domain``, ``scoring_keys_store_id``,
        ``scoring_config_store_id``, ``aes_key_hex`` and ``request_secret``.
    """

    def notify(message: str) -> None:
        # Progress reporting is optional.
        if status_cb:
            status_cb(message)

    name = _scoring_service_name(logging_service_id)
    domain = _scoring_domain(logging_service_id)

    info(f"Ensuring scoring service {_c(BOLD, name)}")
    notify(f"⏳ Ensuring scoring service '{name}'...")

    found = _find_scoring_service(logging_service_id, token)
    if found:
        # Reuse path: only look things up.
        ok(f"Scoring service already exists ({found['id']})")
        notify(f"✅ Scoring service '{name}' already exists.")
        scoring_service_id = found["id"]
        found_keys = _find_config_store(KEYS_STORE_NAME_TEMPLATE.format(sid=scoring_service_id), token)
        found_cfg = _find_config_store(CONFIG_STORE_NAME_TEMPLATE.format(sid=scoring_service_id), token)
        return {
            "scoring_service_id": scoring_service_id,
            "scoring_service_name": name,
            "scoring_domain": domain,
            "scoring_keys_store_id": (found_keys or {}).get("id", ""),
            "scoring_config_store_id": (found_cfg or {}).get("id", ""),
            # Neither secret is returned when an existing service is reused.
            "aes_key_hex": "",
            "request_secret": "",
        }

    stores_url = "/resources/stores/config"

    # Step 1: the wasm service itself.
    created = fastly("POST", "/service", {"name": name, "type": "wasm"}, token=token)
    scoring_service_id = created["id"]
    version_url = f"/service/{scoring_service_id}/version/1"
    ok(f"Created scoring service {scoring_service_id}")
    notify(f"✅ Created scoring service '{name}'.")

    # Step 2: domain on version 1.
    fastly("POST", f"{version_url}/domain", {"name": domain}, token=token)
    ok(f"Added domain {domain}")
    notify(f"✅ Added domain '{domain}'.")

    # Step 3: placeholder backend on version 1.
    placeholder_backend = {
        "name": "placeholder_origin",
        "address": "127.0.0.1",
        "port": 80,
        "override_host": "example.com",
    }
    fastly("POST", f"{version_url}/backend", placeholder_backend, token=token)
    ok("Added placeholder backend")

    # Step 4: the two config stores, named after the scoring service id.
    keys_store_name = KEYS_STORE_NAME_TEMPLATE.format(sid=scoring_service_id)
    cfg_store_name = CONFIG_STORE_NAME_TEMPLATE.format(sid=scoring_service_id)
    keys_store = fastly("POST", stores_url, {"name": keys_store_name}, token=token)
    cfg_store = fastly("POST", stores_url, {"name": cfg_store_name}, token=token)
    ok(f"Created config stores {keys_store_name}, {cfg_store_name}")
    notify("✅ Created config stores.")

    # Step 5: generate both secrets and seed the stores.
    aes_key_hex = secrets.token_hex(32)
    request_secret = secrets.token_hex(32)
    seed_items = (
        (keys_store, CURRENT_KEY_HEX, aes_key_hex),
        (keys_store, REQUEST_SECRET_KEY, request_secret),
        (cfg_store, DEBUG_LOG_KEY, DEBUG_LOG_DEFAULT),
    )
    for store, item_key, item_value in seed_items:
        fastly(
            "POST",
            f"{stores_url}/{store['id']}/item",
            {"item_key": item_key, "item_value": item_value},
            token=token,
        )
    ok("Populated config stores")

    # Step 6: resource links from version 1 to each store.
    for link_name, store in (
        (KEYS_RESOURCE_LINK_NAME, keys_store),
        (CONFIG_RESOURCE_LINK_NAME, cfg_store),
    ):
        fastly(
            "POST",
            f"{version_url}/resource",
            {"name": link_name, "resource_id": store["id"]},
            token=token,
        )
    ok("Linked stores to service v1")
    notify("✅ Linked config stores to service v1.")

    return {
        "scoring_service_id": scoring_service_id,
        "scoring_service_name": name,
        "scoring_domain": domain,
        "scoring_keys_store_id": keys_store["id"],
        "scoring_config_store_id": cfg_store["id"],
        "aes_key_hex": aes_key_hex,
        "request_secret": request_secret,
    }
def rotate_aes_key(
    scoring_keys_store_id: str,
    *,
    token: str,
) -> dict:
    """Rotate the cookie-state encryption key held in a scoring keys store.

    Reads the store's ``current_key_hex`` item, writes that value to
    ``previous_key_hex``, then generates a fresh 32-byte key and writes it
    as the new ``current_key_hex``. Only one previous key is kept, so a
    second rotation overwrites it. Each call generates a new key.

    Items are written with PATCH (update in place); when the item does not
    exist yet (a 404) the write falls back to POST (create).

    Args:
        scoring_keys_store_id: Id of the config store holding the keys.
        token: Fastly API token forwarded to every API call.

    Returns:
        ``{"current_key_hex": <new key>, "previous_key_hex": <old key or "">,
        "rotated_at": <ISO-8601 UTC timestamp, second precision>}``.

    Raises:
        ValueError: ``scoring_keys_store_id`` is empty.
        RuntimeError: reading the current key failed for any reason other
            than the item being absent (404), or a write failed. Nothing is
            written when the read fails, so the existing key stays in place.
    """
    import datetime as _dt

    if not scoring_keys_store_id:
        raise ValueError("scoring_keys_store_id is required")

    item_base = f"/resources/stores/config/{scoring_keys_store_id}/item"

    # Read the current key so it can be preserved as the previous key.
    # Only "item does not exist" may be treated as "nothing to preserve";
    # any other failure must abort, otherwise a transient API error would
    # rotate without keeping the key that live cookies were issued under.
    try:
        cur_item = fastly("GET", f"{item_base}/{CURRENT_KEY_HEX}", token=token)
    except RuntimeError as exc:
        if "404" not in str(exc):
            raise
        prev_value = ""
    else:
        prev_value = (cur_item or {}).get("item_value", "") or ""

    new_key = secrets.token_hex(32)
    # timezone.utc is the object datetime.UTC aliases; output is identical.
    rotated_at = _dt.datetime.now(_dt.timezone.utc).isoformat(timespec="seconds")

    def _upsert_item(key: str, value: str) -> None:
        # PATCH updates an existing item; fall back to POST only when the
        # item is missing, so real failures surface with their own error.
        try:
            fastly("PATCH", f"{item_base}/{key}", {"item_value": value}, token=token)
        except RuntimeError as exc:
            if "404" not in str(exc):
                raise
            fastly("POST", item_base, {"item_key": key, "item_value": value}, token=token)

    # Previous is written before current, so a failure between the two
    # writes leaves the old key in both slots rather than in neither.
    if prev_value:
        _upsert_item(PREVIOUS_KEY_HEX, prev_value)
    _upsert_item(CURRENT_KEY_HEX, new_key)
    ok(f"Rotated AES key at {rotated_at} (previous_key preserved for grace window)")

    return {
        "current_key_hex": new_key,
        "previous_key_hex": prev_value,
        "rotated_at": rotated_at,
    }
def delete_scoring_service(
    scoring_service_id: str,
    *,
    scoring_keys_store_id: str = "",
    scoring_config_store_id: str = "",
    token: str,
    status_cb=None,
) -> None:
    """Tear down the scoring Compute service and both of its config stores.

    Steps, in order: deactivate every active version, delete the service,
    then delete each config store whose id was supplied. The service goes
    first; the stores follow.

    A 404 at any step means "already gone" and is not an error. In
    particular, a service that is already deleted does not stop the config
    stores from being cleaned up, so the call can be repeated after a
    partial teardown.

    Args:
        scoring_service_id: Service to delete. Empty → warn and return.
        scoring_keys_store_id: Keys store to delete; skipped when empty.
        scoring_config_store_id: Config store to delete; skipped when empty.
        token: Fastly API token forwarded to every API call.
        status_cb: Optional callable receiving human-readable progress strings.

    Raises:
        RuntimeError: deleting the service failed with something other than
            a 404. Failures while deactivating versions or deleting a store
            are logged as warnings and do not raise.
    """
    if not scoring_service_id:
        warn("delete_scoring_service called with empty service id — nothing to do")
        return

    info(f"Tearing down scoring service {_c(BOLD, scoring_service_id)}")
    if status_cb:
        status_cb(f"⏳ Tearing down scoring service '{scoring_service_id}'...")

    # 1. Deactivate any active versions so the service can be deleted.
    service_gone = False
    try:
        versions = fastly("GET", f"/service/{scoring_service_id}/version", token=token) or []
        for v in versions:
            if v.get("active"):
                if status_cb:
                    status_cb(f"⏳ Deactivating version {v['number']}...")
                fastly(
                    "PUT",
                    f"/service/{scoring_service_id}/version/{v['number']}/deactivate",
                    token=token,
                )
    except RuntimeError as exc:
        if "404" in str(exc):
            # The service no longer exists. Do NOT return here: the config
            # stores may have survived an earlier partial teardown and
            # still need deleting below.
            ok("Scoring service already deleted")
            service_gone = True
        else:
            # Fall through; the delete might still work.
            warn(f"Failed to deactivate versions (will try delete anyway): {exc}")

    # 2. Delete the service, unless step 1 already established it is gone.
    if not service_gone:
        try:
            fastly("DELETE", f"/service/{scoring_service_id}", token=token, expect_empty=True)
            ok("Scoring service deleted")
        except RuntimeError as exc:
            if "404" in str(exc):
                ok("Scoring service already deleted")
            else:
                raise

    # 3. Delete the config stores. A 404 is "already deleted"; any other
    #    failure is a warning so the remaining store is still attempted.
    for label, store_id in (
        ("scoring_keys", scoring_keys_store_id),
        ("scoring_config", scoring_config_store_id),
    ):
        if not store_id:
            continue
        try:
            fastly(
                "DELETE",
                f"/resources/stores/config/{store_id}",
                token=token,
                expect_empty=True,
            )
            ok(f"Deleted {label} store ({store_id})")
        except RuntimeError as exc:
            if "404" in str(exc):
                ok(f"{label} store already deleted")
            else:
                warn(f"Could not delete {label} store {store_id}: {exc}")

    if status_cb:
        status_cb("✅ Scoring service torn down.")
recv: on first pass, route to the scorer Compute backend with + X-Edge-Scoring-Pass=1, return(pass). + 2. pass: inject the auth + service-id headers on bereq for the + upcoming scorer sub-fetch (pass is the correct subroutine + for bereq header mutations under return(pass)). + 3. fetch: when the backend is the scorer, return(deliver) to skip + cache + go straight to deliver with the scorer response. + 4. deliver: pass-1 captures all seven scorer values (score, l1, l2, + compliance, reason, sid, enforce) + the rotated Set-Cookie + into subfields of req.http.x-edge-score (single consolidated + header — eight subfields total), unsets the resp.http + .x-edge-* leaks, and issues a naked `restart`. pass-2 emits + the rotated cookie via `add resp.http.Set-Cookie` (additive + — preserves any origin cookies). + 5. miss: unset bereq.http.x-edge-score + X-Edge-Scoring-Pass so + neither leaks to the real origin on pass 2. + 6. enforce: on the post-scoring restart, error 429 when the scorer + emitted X-Edge-Score-Enforce=1 (operator committed an + enforce_threshold and the request's score met it). + +**Storage strategy.** All seven scoring values (plus the rotated +Set-Cookie) are stored as SUBFIELDS of ``req.http.x-edge-score`` — +single consolidated header keeps the per-request header budget small. +Log format reads the subfields via ``req.http.x-edge-score:score`` etc. + +**Why restart from vcl_deliver.** Empirically (v440), req.http +modifications made in vcl_fetch before `return(restart)` are invisible +to Fastly's log-format evaluator. Restarting from vcl_deliver after +writing the subfields is the working pattern. + +Fail-open contract: any error reaching the scorer (5xx, timeout, DNS +failure) sets ``req.http.x-edge-score:score = "0"`` and +``req.http.x-edge-score:compliance = "unknown"`` so the request flows +normally to origin and the log line still has populated score fields +(vs. NULLs that look like a misconfiguration). 
+""" + +from __future__ import annotations + +# Backend name handling. Fastly's API creates a backend whose VCL-visible +# name is "F_" + the raw name you submitted. So: +# - SCORING_BACKEND_API_NAME is what we POST to /backend → "session_scorer" +# - SCORING_BACKEND_VCL_NAME is what VCL sees → "F_session_scorer" +SCORING_BACKEND_API_NAME = "session_scorer" +SCORING_BACKEND_VCL_NAME = f"F_{SCORING_BACKEND_API_NAME}" + +# Snippet names. Stable string constants so disable_scoring can find and +# remove the exact snippets by name. Fastly only accepts +# [A-Za-z0-9_. -] in snippet names — no colons, slashes, or other +# punctuation. +SCORING_RECV_NAME = "Session Scoring - Recv" +SCORING_PASS_NAME = "Session Scoring - Pass" +SCORING_FETCH_NAME = "Session Scoring - Fetch" +SCORING_DELIVER_NAME = "Session Scoring - Deliver" +SCORING_MISS_NAME = "Session Scoring - Miss" +SCORING_ENFORCE_NAME = "Session Scoring - Enforce" + +# Snippet priority — lower runs first. 100 is the "after everything +# else" slot used by most user snippets on this service. +SCORING_SNIPPET_PRIORITY = 100 + +# vcl_fetch needs a low priority specifically — when the backend is +# the scorer, we want `return(deliver)` to fire IMMEDIATELY, before +# any other fetch-stage snippet (group-L timing, custom origin field +# captures, etc.) gets a chance to run against the scorer's response. +# Priority 1 puts us first in the fetch subroutine. +SCORING_FETCH_PRIORITY = 1 + + +# Default asset-extension regex: requests whose URL matches this regex +# bypass the scorer entirely. Static assets carry no session signal and +# routing them through Compute is wasted cost + capacity. +# +# This is the DEFAULT. Operators can override it per-service via the +# Session Scoring admin page; the operator-supplied value lives in the +# service config under ``scoring.exclude_url_regex`` and is interpolated +# into the recv snippet by ``recv_snippet`` below. An empty / unset +# override falls back to this default. 
+DEFAULT_ASSET_EXT_REGEX = ( + # Anchored at the start AND restricted to the path segment via + # ``[^?]*`` (any non-``?`` chars). Without the anchor + path-only + # restriction, ``/api/login?file=.png`` would also match — the + # extension test would see ``.png`` in the query string and skip + # scoring entirely, letting an attacker bypass session scoring on + # any dynamic endpoint by appending an asset extension to the + # query string. The fix bounds the match to the URL path. + r"^[^?]*" + r"\.(aif|aiff|au|avi|bin|bmp|cab|carb|cct|cdf|class|css|dcr|doc|" + r"dtd|exe|flv|gcf|gff|gif|grv|hdml|hqx|ico|ini|jpeg|jpg|js|mov|" + r"mp3|mp4|nc|pct|pdf|png|ppc|pws|svg|swa|swf|txt|vbs|w32|wav|" + r"wbmp|wml|wmlc|wmls|wmlsc|xsd|zip|webp|woff|woff2|ttf|bz2|gz|" + r"tgz|tar|pem|cer|sql|xml|dat|pub|log|json|md|bak|rar|eml|lzma|" + r"war|bz|7z|ts|m3u8)($|\?)" +) + +# Backwards-compat alias for tests / external callers that referenced +# the old name before the override path landed. +_ASSET_EXT_REGEX = DEFAULT_ASSET_EXT_REGEX + + +def resolve_exclude_url_regex(operator_override: str | None) -> str: + """Pick between the operator's override and the built-in default. + + Empty / None / whitespace-only → default. The operator-facing API + interprets the empty string as "I want the default" — same shape + as Pydantic optional-field handling. + """ + if operator_override is None: + return DEFAULT_ASSET_EXT_REGEX + cleaned = operator_override.strip() + return cleaned or DEFAULT_ASSET_EXT_REGEX + + +# Default HTTP status code returned by the enforce snippet when the scorer +# flags a request. Operator-overridable via cfg.scoring.enforce_status_code; +# bake-into-VCL at deploy so each change does a snippet swap (see +# update_enforce_status_code orchestrator) rather than needing a +# ConfigStore-to-VCL binding for a value that changes rarely. +DEFAULT_ENFORCE_STATUS_CODE = 429 + +# Allowed range — anything outside 4xx/5xx makes no sense for "reject". 
+_ENFORCE_STATUS_CODE_MIN = 400 +_ENFORCE_STATUS_CODE_MAX = 599 + + +def enforce_reason_phrase(status_code: int) -> str: + """HTTP reason phrase for the enforce snippet's synthetic body. + + Delegates to Python's ``http.HTTPStatus`` so any IANA-registered code + yields its canonical phrase (403 → "Forbidden", 451 → "Unavailable + For Legal Reasons", 511 → "Network Authentication Required", …). + Non-standard codes the operator might pick (419, 444, 530, 599) fall + back to ``"Blocked"`` — keeps the synthetic body meaningful even when + the stdlib map doesn't know the code.""" + import http + + try: + return http.HTTPStatus(status_code).phrase + except ValueError: + return "Blocked" + + +def resolve_enforce_status_code(operator_override: int | None) -> int: + """Pick the effective enforce status code. None / out-of-range → default. + + The PUT endpoint validates the operator's input before persistence, + so out-of-range here means a stale or corrupted cfg — fall back to + the safe default rather than baking a nonsensical code into VCL.""" + if operator_override is None: + return DEFAULT_ENFORCE_STATUS_CODE + try: + code = int(operator_override) + except (TypeError, ValueError): + return DEFAULT_ENFORCE_STATUS_CODE + if not (_ENFORCE_STATUS_CODE_MIN <= code <= _ENFORCE_STATUS_CODE_MAX): + return DEFAULT_ENFORCE_STATUS_CODE + return code + + +def recv_snippet( + logging_service_id: str, + request_secret: str, + *, + exclude_url_regex: str | None = None, +) -> str: + """vcl_recv snippet: at the EDGE on the first pass (no shield hop yet + AND req.restarts == 0 AND scoring-pass marker not set AND URL doesn't + match the exclusion regex), route to the scorer Compute backend with + X-Edge-Scoring-Pass=1, `return(pass)`. After the scoring restart + completes (req.restarts == 1) and the score was captured, re-enable + shielding for the real-origin pass so the cached object can be + served from the shield POP normally. 
+ + ``exclude_url_regex`` is the operator-supplied regex of URLs to + SKIP from scoring. None or "" falls back to ``DEFAULT_ASSET_EXT_REGEX``. + The caller (orchestrator) is responsible for having validated the + regex via backend.utils.vcl_validator BEFORE getting here — this + function trusts its input and string-substitutes verbatim into the + VCL boolean expression. + + ``request_secret`` is also baked into the edge/shield boundary check + (``req.http.X-Edge-Shield-Auth != "{request_secret}"``). The original + boundary used ``fastly.ff.visits_this_service == 0``, which an + attacker could flip by setting their own ``Fastly-FF`` header + (Fastly's edge propagates the value verbatim to the next hop). The + secret-comparison form fails closed: only the edge's own pass/miss + subroutines (which know the secret because it's literally baked into + their VCL bodies) can set the header to a value that satisfies the + check. + + Note: `logging_service_id` is kept as an argument for symmetry with + peer snippet generators.""" + _ = logging_service_id + effective_regex = resolve_exclude_url_regex(exclude_url_regex) + return f"""# Session Scoring: client-edge header scrub (anti-spoofing). +# Edge-only — see X-Edge-Shield-Auth note below — so any client-supplied +# X-Edge-* gets stripped before it can be forged into a clean score. +# Also strip the X-Edge-Shield-Auth header itself so a client cannot +# pre-set it and skip our edge-only protections. +if (req.restarts == 0 && req.http.X-Edge-Shield-Auth != "{request_secret}") {{ + unset req.http.X-Edge-Shield-Auth; + unset req.http.X-Edge-Scoring-Pass; + unset req.http.X-Edge-Score; + unset req.http.X-Edge-Score-Reason; + unset req.http.X-Edge-Score-Enforce; + unset req.http.X-Edge-Sid; + unset req.http.X-Edge-Score-Set-Cookie; +}} + +# Session Scoring: route the first-pass dynamic request to the scorer. 
+# Edge-only — the pass/miss snippets set X-Edge-Shield-Auth on the +# bereq going to the shield, so the shield's vcl_recv reads back a +# matching secret and skips this block. An attacker who tries to spoof +# the boundary by sending their own Fastly-FF header cannot satisfy the +# secret comparison, so the edge-only logic still runs on their hop and +# they get scored / scrubbed normally. +# +# DDoS bypass (fastly.ddos_detected): when Fastly's L7 DDoS detection +# flags this request, do NOT route to Compute. Two reasons: +# 1. Cost ceiling — under attack, Compute invocations scale linearly +# with attack volume. Skipping flagged requests caps the blast +# radius while NGWAF / Fastly's mitigation handles the actual block. +# 2. Signal quality — the scorer's L2 transition matrix learns from +# benign traffic shapes; feeding attack traffic in pollutes the +# matrix even though those scores wouldn't be acted on. +# See: https://www.fastly.com/documentation/reference/vcl/variables/miscellaneous/fastly-ddos-detected/ +if (req.http.X-Edge-Shield-Auth != "{request_secret}" && req.restarts == 0 && req.http.X-Edge-Scoring-Pass != "1" && !fastly.ddos_detected && std.tolower(req.url) !~ "{effective_regex}") {{ + set req.backend = {SCORING_BACKEND_VCL_NAME}; + set req.http.X-Edge-Scoring-Pass = "1"; + # PASS — skip cache for the scoring sub-fetch. On the post-restart + # pass the scoring snippet doesn't re-fire because X-Edge-Scoring-Pass + # got unset in pass-1 deliver and req.restarts is now 1. + return(pass); +}} + +# Post-scoring restart: we captured the score in pass-1 deliver and the +# request flow is now headed for the real origin. Without this block, +# the previous `return(pass)` would have permanently disabled shielding +# for this request — re-enable it so the real-origin fetch can land on +# the shield POP normally. `var.fastly_req_do_shield` is the magic +# variable Fastly's auto-generated main VCL reads to decide whether to +# shield the request. 
+if (req.restarts == 1 && req.http.x-edge-score) {{ + set var.fastly_req_do_shield = true; +}}""" + + +def pass_snippet(logging_service_id: str, request_secret: str) -> str: + """vcl_pass snippet: when this is the scoring sub-fetch (backend == + scorer), inject the auth + service-id headers on bereq for the + upcoming sub-fetch. Also unset bereq.http.x-edge-score so any + attacker-supplied inbound x-edge-score doesn't get echoed into the + scorer's view of the request. + + Also stamps ``bereq.http.X-Edge-Shield-Auth = "{request_secret}"`` + on every pass — this is what the shield POP's vcl_recv reads back to + decide "skip edge-only blocks because this hop already ran them on + the edge". An attacker who tries to spoof Fastly-FF cannot satisfy + the shield-auth comparison because the secret is only ever set by + pass_snippet / miss_snippet (compiled into our VCL, never sent to + clients).""" + return f"""# Session Scoring: inject auth + service-id on the scorer sub-fetch. +# vcl_pass is the right subroutine for bereq mutations when recv used +# return(pass). +if (req.backend == {SCORING_BACKEND_VCL_NAME}) {{ + set bereq.http.X-Edge-Service-Id = "{logging_service_id}"; + # Shared-secret header — the scorer compares this to the + # request_secret ConfigStore entry and 401s on mismatch. Embedded + # literally in VCL which is compiled and never sent to clients. + set bereq.http.X-Edge-Scorer-Auth = "{request_secret}"; + # X-Edge-Scoring-Pass is an internal marker; the scorer doesn't need + # to see it and we don't want it polluting any downstream telemetry. + unset bereq.http.X-Edge-Scoring-Pass; +}} +# Strip any inbound x-edge-score header an attacker may have set; the +# real one is built by us in vcl_deliver after the scorer responds. +unset bereq.http.x-edge-score; +# Shield-auth marker — the shield POP's vcl_recv reads this back via +# req.http.X-Edge-Shield-Auth and skips the edge-only branches when it +# matches. 
Unspoofable from outside because the secret is baked into +# the compiled VCL and never returned to clients. +set bereq.http.X-Edge-Shield-Auth = "{request_secret}";""" + + +def fetch_snippet() -> str: + """vcl_fetch snippet: when the backend is the scorer, return(deliver) + so the response goes straight to deliver without any cache-related + handling. (return(pass) in recv already prevents caching, but + return(deliver) here is the canonical preflight-pattern shape and + avoids any weird interactions with beresp's TTL.)""" + return f"""# Session Scoring: skip cache handling for the scorer sub-fetch. +if (req.backend == {SCORING_BACKEND_VCL_NAME}) {{ + return(deliver); +}}""" + + +def deliver_snippet(request_secret: str) -> str: + """vcl_deliver snippet — the heart of the pattern. + + PASS 1 (X-Edge-Scoring-Pass == "1"): scorer's response is in + resp.http.x-edge-*. Stash all seven scorer values (score, l1, l2, + compliance, reason, sid, enforce) into req.http.x-edge-score + subfields (single consolidated header), stash Set-Cookie into a + :set-cookie subfield (eight subfields total), scrub the + resp.http.x-edge-* headers (anti-leak), then naked `restart`. + + PASS 2 (X-Edge-Scoring-Pass already gone): the stashed cookie gets + emitted via `add resp.http.Set-Cookie` (additive — preserves any + Set-Cookie the real origin set). + + The subfield writes in pass-1 deliver propagate to vcl_log via the + req.http persistence across restart. The log format reads + req.http.x-edge-score:score etc.""" + return f"""# Session Scoring: pass-1 stash + naked restart; pass-2 emit cookie. 
+ +# ── PASS 1: capture scorer response into req.http.x-edge-score subfields ── +if (req.http.X-Edge-Scoring-Pass == "1") {{ + unset req.http.X-Edge-Scoring-Pass; + if (resp.status == 200) {{ + set req.http.x-edge-score:score = resp.http.x-edge-score; + set req.http.x-edge-score:l1 = resp.http.x-edge-score-l1; + set req.http.x-edge-score:l2 = resp.http.x-edge-score-l2; + set req.http.x-edge-score:compliance = resp.http.X-Edge-Cookie-Compliance; + set req.http.x-edge-score:reason = resp.http.x-edge-score-reason; + # Hex-encoded 6-byte session id. Used by the admin labeling UI to + # target individual sessions; the scorer issues a fresh sid when + # the inbound cookie is missing/tampered. + set req.http.x-edge-score:sid = resp.http.x-edge-sid; + # Enforcement signal — set by the Rust scorer when the operator + # has committed enforce_threshold to the scoring_config ConfigStore + # AND the request's score met it. Captured here so the recv- + # restart-2 Enforce snippet can read it via subfield. + set req.http.x-edge-score:enforce = resp.http.x-edge-score-enforce; + }} else {{ + # Scorer returned non-200 — fail open. No cookie to rotate. + set req.http.x-edge-score:score = "0"; + set req.http.x-edge-score:l1 = "0"; + set req.http.x-edge-score:l2 = "0"; + set req.http.x-edge-score:compliance = "unknown"; + set req.http.x-edge-score:reason = "compute-unavailable"; + }} + # Stash the rotated cookie as a subfield too; pass-2 reads it back + # and emits via add resp.http.Set-Cookie. + set req.http.x-edge-score:set-cookie = resp.http.Set-Cookie; + # Anti-leak: strip the scorer's resp.http.x-edge-* headers so they + # don't reach the client even if the restart path were to short- + # circuit somehow. 
+ unset resp.http.x-edge-score; + unset resp.http.x-edge-score-l1; + unset resp.http.x-edge-score-l2; + unset resp.http.x-edge-score-reason; + unset resp.http.x-edge-sid; + unset resp.http.X-Edge-Cookie-Compliance; + unset resp.http.X-Edge-Matrix-Version; + unset resp.http.x-edge-score-enforce; + restart; +}} + +# ── PASS 2: real origin response — emit the rotated cookie additively ── +# Only emit at the EDGE. We detect "this is the edge hop" via the +# absence of the shield-auth secret on req.http.X-Edge-Shield-Auth; +# the shield POP receives that header from us (set in pass/miss), so +# the shield sees the match and skips this block. A spoofed +# Fastly-FF header cannot fake the secret, so attacker-induced +# duplicate Set-Cookie emission is no longer possible. +if (req.http.X-Edge-Shield-Auth != "{request_secret}" && req.http.x-edge-score:set-cookie != "") {{ + add resp.http.Set-Cookie = req.http.x-edge-score:set-cookie; +}}""" + + +def enforce_snippet(request_secret: str, status_code: int = DEFAULT_ENFORCE_STATUS_CODE) -> str: + """vcl_recv snippet that errors ``status_code`` when the scorer flagged the + request as over-threshold. + + Fires on req.restarts == 1 (after the scoring sub-fetch + restart) + when the deliver pass-1 snippet captured ``X-Edge-Score-Enforce: 1`` + from the scorer's response. The scorer only emits that header when + the operator has committed an enforce_threshold value via the + admin UI AND the request's score met it. + + Edge-only — the shield-auth secret comparison replaces the original + ``fastly.ff.visits_this_service == 0`` check, which an attacker + could flip by sending their own ``Fastly-FF`` header. + ``error `` instead of a `synth` keeps the door open + for a custom vcl_error page later. + + ``status_code`` defaults to 429 (Too Many Requests). Operators can + override via cfg.scoring.enforce_status_code; valid range 400-599. 
def miss_snippet(request_secret: str) -> str:
    """Build the vcl_miss snippet body.

    The emitted VCL does two things on a miss-driven backend fetch:

    * removes ``x-edge-score`` and ``X-Edge-Scoring-Pass`` from ``bereq``
      so neither a client-forged score header nor the internal pass
      marker is forwarded to the real origin;
    * sets ``bereq.http.X-Edge-Shield-Auth`` to ``request_secret`` so the
      shield hop can tell the request came from the edge and skip the
      edge-only blocks (same marker the pass snippet stamps).

    ``request_secret`` is embedded verbatim inside a double-quoted VCL
    string literal.
    """
    vcl_lines = (
        "# Session Scoring: strip internal scoring headers before forwarding to",
        "# the real origin. x-edge-score could be attacker-supplied; the",
        "# X-Edge-Scoring-Pass marker is internal-only.",
        "unset bereq.http.x-edge-score;",
        "unset bereq.http.X-Edge-Scoring-Pass;",
        "# Shield-auth marker — match pass_snippet so the shield POP recognises",
        "# this hop as edge-originated and skips re-running edge-only blocks.",
        f'set bereq.http.X-Edge-Shield-Auth = "{request_secret}";',
    )
    # No trailing newline: the snippet ends on the final statement.
    return "\n".join(vcl_lines)
Used by disable_scoring to find + and remove them by name from the cloned VCL version.""" + return [ + SCORING_RECV_NAME, + SCORING_PASS_NAME, + SCORING_FETCH_NAME, + SCORING_DELIVER_NAME, + SCORING_MISS_NAME, + SCORING_ENFORCE_NAME, + ] diff --git a/backend/repositories/_base.py b/backend/repositories/_base.py index d818fdd8..2795f369 100644 --- a/backend/repositories/_base.py +++ b/backend/repositories/_base.py @@ -8,10 +8,39 @@ from __future__ import annotations import contextlib +import re import time +from typing import Any import duckdb +# Pre-compile once; called per ``runner.execute`` invocation. +_PARQUET_LIST_RE = re.compile(r"read_parquet\(\[\s*('[^']+'\s*(?:,\s*'[^']+'\s*)*)\]") + + +def _compact_sql_for_debug(sql: str) -> str: + """Replace explicit ``read_parquet([...long file list...])`` literals + with ``read_parquet([N files])`` for transport in the debug-panel + payload. + + The dashboard's per-request SQL embeds hundreds of buffer/rollup + parquet paths in a single ``read_parquet`` call. Shipping those + verbatim made ``_debug_queries`` ~220 KB of the response (60% of + total) — pure network + JSON-parse cost on every dashboard refresh + when the operator has ``DEBUG_RESPONSES=true`` set. The path list + isn't useful to a human reading the debug panel; the count is. + + Compacting cuts the field to ~tens of bytes per query without + losing the SQL shape an operator cares about for tuning. + """ + + def _replace(m: re.Match) -> str: + # Count items by quote pairs — cheap and exact. 
+ count = m.group(1).count("'") // 2 + return f"read_parquet([{count} files]" + + return _PARQUET_LIST_RE.sub(_replace, sql) + @contextlib.contextmanager def _attach_sqlite(con: duckdb.DuckDBPyConnection, sqlite_path: str, alias: str): @@ -81,17 +110,7 @@ def _get_schema(con: duckdb.DuckDBPyConnection, src: dict) -> list[dict]: return get_schema(con, src) -def safe_iso(dt) -> str | None: - """Normalise a datetime or string to an ISO-8601 string ending in Z.""" - if dt is None: - return None - if hasattr(dt, "isoformat"): - s = dt.isoformat() - # Append Z for naive UTC datetimes that lack a tz suffix - if not s.endswith("Z") and "+" not in s and s.count("-") <= 2: - s += "Z" - return s - return str(dt) +from backend.utils.date_utils import safe_iso # noqa: E402, F401 — re-export def _is_stale_view_error(e: Exception) -> bool: @@ -236,10 +255,45 @@ def debug_calls(self) -> list[dict]: return [] def execute(self, q: str, p: list | None = None): - """Execute a query and track its execution time.""" + """Execute a query and track its execution time. + + Self-heals on stale-view errors: if the connection's bound view + references a buffer parquet file that no longer exists (the sync + cron deleted it between the view bind and this query), refresh + the view once and retry. Belt-and-suspenders alongside the pool's + checkout fingerprint — that catches the common case, this catches + the race where a commit lands while a query is in flight. + + ``execute_with_retry`` below also does this, but most callers use + plain ``execute()``, so the retry needs to live here too. The + cost when nothing's stale is a single Python try/except — no SQL, + no extra round-trip. 
def execute_top_n_rollups(
    self,
    fields: list[str],
    start_time: str | None,
    end_time: str | None,
    limit: int = 10,
) -> tuple[list[tuple[str, Any, int]], list[str]]:
    """Top-N values per field from hourly rollup parquet files plus the live hour.

    Closed hours are read from ``<cache>/rollups/hour/field=<f>/hour=<h>/*.parquet``;
    the current (still open) hour is aggregated from the base table, and the
    two sets of counts are summed per ``(field, value)``.

    Args:
        fields: Field names to report. Only names accepted by
            ``_is_safe_ident`` are queried.
        start_time: Optional ISO timestamp (parsed with ``parse_iso_utc``);
            lower bound of the window.
        end_time: Optional ISO timestamp; half-open upper bound — an end
            exactly on an hour boundary excludes that hour's rollup.
        limit: Maximum number of values returned per field.

    Returns:
        ``(rows, fields)`` where ``rows`` is a list of
        ``(field, value, count)`` tuples, grouped in the order of
        ``fields`` and sorted by descending count within each field.
        ``rows`` is empty when the rollup directory is missing, no field
        passes the identifier safelist, or the base table name is rejected.

    Notes:
        Rollup selection is hour-granular, so a window that starts or ends
        mid-hour includes the whole rollup hour it touches. The live hour
        contributes only its own top ``limit`` values per field, so merged
        counts are approximate for values outside that live top set.
    """
    import logging
    import os
    from datetime import UTC, datetime, timedelta

    from backend.core.duckdb import _cache_dir
    from backend.core.rollups import _is_safe_ident, _safe_table_for
    from backend.utils.date_utils import parse_iso_utc

    log = logging.getLogger(__name__)

    rollup_dir = os.path.join(_cache_dir(self.src), "rollups", "hour")
    if not os.path.exists(rollup_dir):
        return [], fields

    # Defense-in-depth: field names end up in directory names / SQL and the
    # service name ends up in the base-table identifier. Re-validate both
    # here so no caller can pierce the boundary.
    safe_fields = [f for f in fields if _is_safe_ident(f)]
    if not safe_fields:
        return [], fields
    base_table = _safe_table_for(self.src)
    if not base_table:
        # Service name failed the identifier safelist; refuse to query.
        return [], fields

    st_dt = parse_iso_utc(start_time) if start_time else None
    et_dt = parse_iso_utc(end_time) if end_time else None

    # Hour keys are fixed-width YYYY-MM-DD-HH, so plain string comparison
    # orders them chronologically.
    hour_fmt = "%Y-%m-%d-%H"
    first_hour = st_dt.strftime(hour_fmt) if st_dt else None
    # Half-open end: stepping back one microsecond keeps a mid-hour end
    # inclusive of its hour while making an exact hour boundary exclusive.
    last_hour = (et_dt - timedelta(microseconds=1)).strftime(hour_fmt) if et_dt else None

    active_dt = datetime.now(UTC).replace(minute=0, second=0, microsecond=0)
    active_dt_end = active_dt + timedelta(hours=1)
    active_hour = active_dt.strftime(hour_fmt)

    # Enumerate the exact (field, hour) directories in Python and hand
    # DuckDB an explicit file list. A `rollups/hour/**/*.parquet` glob makes
    # DuckDB enumerate every file under the tree before any pruning happens.
    target_paths: list[str] = []
    for field in safe_fields:
        field_dir = os.path.join(rollup_dir, f"field={field}")
        if not os.path.isdir(field_dir):
            continue
        try:
            hour_entries = os.listdir(field_dir)
        except OSError:
            continue
        for hour_entry in hour_entries:
            if not hour_entry.startswith("hour="):
                continue
            hour = hour_entry[len("hour=") :]
            if first_hour and hour < first_hour:
                continue
            if last_hour and hour > last_hour:
                continue
            if hour >= active_hour:
                # The active hour is served live, not from rollups.
                continue
            hour_dir = os.path.join(field_dir, hour_entry)
            try:
                for fname in os.listdir(hour_dir):
                    if fname.endswith(".parquet"):
                        target_paths.append(os.path.join(hour_dir, fname))
            except OSError:
                continue

    rolled_res: list = []
    if target_paths:
        # Inline the path list as a SQL array literal (quotes doubled).
        # hive_partitioning=1 lets DuckDB derive `field` from the path;
        # `value` / `count` come from the parquet content.
        paths_sql = ", ".join("'" + p.replace("'", "''") + "'" for p in target_paths)
        q = f"""
            SELECT field, value, SUM(count) AS c
            FROM read_parquet([{paths_sql}], hive_partitioning=1)
            GROUP BY field, value
        """
        try:
            rolled_res = self.execute(q).fetchall()
        except Exception:
            # Best-effort: keep serving the live hour, but leave a trace so
            # an unexpectedly empty top-N can be diagnosed.
            log.warning("rollup top-N read failed (%d files)", len(target_paths), exc_info=True)
            rolled_res = []

    # Live portion: the intersection of the active hour with the requested
    # window. Clamping keeps rows outside [start, end) out of the counts
    # when the window starts or ends inside the active hour.
    live_res: list = []
    live_start = max(active_dt, st_dt) if st_dt else active_dt
    live_end = min(active_dt_end, et_dt) if et_dt else active_dt_end
    if live_start < live_end:
        live_where = (
            f"timestamp >= '{live_start.astimezone(UTC).isoformat()}' "
            f"AND timestamp < '{live_end.astimezone(UTC).isoformat()}'"
        )
        try:
            from backend.core.duckdb import _get_schema

            actual_cols = self.get_schema_cols()
            schema_types = {col["name"]: col["type"] for col in _get_schema(self.con, self.src)}

            # Materialise just the live slice so the batch top-N does not
            # rescan the base table per field. Only safelisted fields are
            # forwarded, matching the rollup side.
            tmp_name = self.create_filtered_temp_table(safe_fields, actual_cols, base_table, live_where)
            if tmp_name:
                try:
                    live_res, _ = self.execute_top_n_batch(
                        safe_fields, tmp_name, actual_cols, schema_types, limit=limit
                    )
                finally:
                    try:
                        self.execute(f"DROP TABLE IF EXISTS {tmp_name}")
                    except Exception:
                        pass  # best-effort cleanup of the temp table
        except Exception:
            log.warning("live-hour top-N failed; returning rollup counts only", exc_info=True)

    # Sum rolled + live counts per (field, value).
    combined: dict[tuple[str, Any], int] = {}
    for field, value, count in rolled_res:
        combined[(field, value)] = combined.get((field, value), 0) + count
    for field, value, count in live_res:
        combined[(field, value)] = combined.get((field, value), 0) + count

    # Bucket once by field instead of rescanning `combined` for every field.
    per_field: dict[str, list[tuple[Any, int]]] = {}
    for (field, value), count in combined.items():
        per_field.setdefault(field, []).append((value, count))

    top_results: list[tuple[str, Any, int]] = []
    for field in fields:
        items = per_field.get(field)
        if not items:
            continue
        items.sort(key=lambda item: item[1], reverse=True)
        for value, count in items[:limit]:
            top_results.append((field, value, count))

    return top_results, fields
The previous plain-dict version had a 30s TTL but only checked it +# on hit — stale entries were never evicted, so the cache grew unboundedly +# across hours of dashboard use (a primary OOM contributor on the 16GB VM). +# 500 entries × ~30MB = ~15GB worst case; in practice the working set is +# much smaller, but the cap is a hard backstop. +from backend.utils.bounded_cache import BoundedTTLCache -_dashboard_cache: dict[str, tuple[float, Any]] = {} DASHBOARD_CACHE_TTL = 30 # seconds +_dashboard_cache: BoundedTTLCache = BoundedTTLCache(maxsize=500, ttl_seconds=DASHBOARD_CACHE_TTL) # ── aggregates ──────────────────────────────────────────────────────────────── from backend.core.log_fields import LOG_FIELD_CATALOG -FIELDS = [f["id"] for f in LOG_FIELD_CATALOG if f["id"] != "_source_file"] + ["waf_sig_ind"] +# Virtual fields are catalog ids whose value is computed by exploding a +# real backing column (CSV string) into individual rows via DuckDB's +# unnest(string_split(...)). They live in the FIELDS list so the dashboard +# top-N machinery picks them up, but the cross-cutting loops below skip +# them in batch-stats / column-need passes (their backing column is what +# actually goes into the temp table). 
+_VIRTUAL_FIELDS = ("waf_sig_ind", "edge_score_reason_ind") +FIELDS = [f["id"] for f in LOG_FIELD_CATALOG if f["id"] != "_source_file"] + list(_VIRTUAL_FIELDS) def _add_bot_columns(actual_cols: set[str], columns: list[str], select_cols: list[str]) -> tuple[bool, bool]: @@ -89,9 +104,13 @@ def get_aggregates( ) cache_key = hashlib.sha256(f"{_key_payload}:{source_name}".encode()).hexdigest() now = time.time() - if DASHBOARD_CACHE_TTL > 0 and cache_key in _dashboard_cache: - cached_at, cached_res = _dashboard_cache[cache_key] - if now - cached_at < DASHBOARD_CACHE_TTL: + if DASHBOARD_CACHE_TTL > 0: + # BoundedTTLCache's ``__contains__`` / ``[]`` already enforce TTL + # internally, so an entry that reads as present is by definition + # still fresh — no need for the legacy ``now - cached_at`` check. + cached_entry = _dashboard_cache.get(cache_key) + if cached_entry is not None: + cached_at, cached_res = cached_entry cached_res = cached_res.copy() cached_res["_is_cached"] = True return cached_res @@ -122,7 +141,13 @@ def get_aggregates( if "timestamp" in actual_cols: needed_cols.add('"timestamp"') for field in fields: - if field == "waf_sig_ind": + if field in _VIRTUAL_FIELDS: + # Virtual fields are exploded from a backing column further + # down; make sure the backing column is in the temp table. + if field == "waf_sig_ind" and "waf_sig" in actual_cols: + needed_cols.add('"waf_sig"') + elif field == "edge_score_reason_ind" and "edge_score_reason" in actual_cols: + needed_cols.add('"edge_score_reason"') continue if field in actual_cols: needed_cols.add(f'"{field}"') @@ -146,60 +171,126 @@ def get_aggregates( needed_cols.add(f'"{mc}"') cols_str = ", ".join(needed_cols) if needed_cols else "*" - # Use TEMP TABLE instead of TEMP VIEW to materialize the filtered results in memory. - # This prevents DuckDB from re-scanning the underlying files for every branch of the UNION ALL. 
- temp_table = f"t_{uuid.uuid4().hex}" - sql = f"CREATE TEMP TABLE {temp_table} AS SELECT {cols_str} FROM {table_name} WHERE {where_clause}" - if not runner.create_temp_table(sql, params): - empty = {f: {"top": [], "total": 0} for f in fields} - return { - "data": empty, - "time_series": [], - "map_data": [], - "where_clause": "1=1", - "interval": interval, - "metric": "requests", - "total_rows": 0, - "total_rows_total": 0, - **runner.telemetry(), - } - - # All subsequent queries use the temp table - table_name = temp_table - where_clause = "1=1" - params = [] + # Only take the rollup fast-path when no filters AND a populated + # rollups tree actually exists on disk. Without the existence check + # the dashboard routed unfiltered queries to execute_top_n_rollups + # on services where the initial backfill hadn't completed (or in + # tests with no rollups built), producing an empty top-N — the field + # totals stayed at their zero-initialisers since the populate loop + # is gated on a non-empty top-N. Witnessed in + # test_get_aggregates_with_data 2026-06-04: 60 mock logs seeded, + # field_totals["url"] computed correctly via Q2, but results["url"] + # ["total"] stuck at 0 because no rollup row arrived to trigger the + # populate path. The temp-table fallback always populates totals. + from backend.core.duckdb import _cache_dir as _cache_dir_for_rollups + + rollup_dir = os.path.join(_cache_dir_for_rollups(src), "rollups", "hour") + use_rollups = not filters and os.path.isdir(rollup_dir) + + if use_rollups: + table_name = _safe_table(source_name) + else: + # Use TEMP TABLE instead of TEMP VIEW to materialize the filtered results in memory. + # This prevents DuckDB from re-scanning the underlying files for every branch of the UNION ALL. 
+ temp_table = f"t_{uuid.uuid4().hex}" + sql = f"CREATE TEMP TABLE {temp_table} AS SELECT {cols_str} FROM {table_name} WHERE {where_clause}" + if not runner.create_temp_table(sql, params): + empty = {f: {"top": [], "total": 0} for f in fields} + return { + "data": empty, + "time_series": [], + "map_data": [], + "where_clause": "1=1", + "interval": interval, + "metric": "requests", + "total_rows": 0, + "total_rows_total": 0, + **runner.telemetry(), + } + # All subsequent queries use the temp table + table_name = temp_table + where_clause = "1=1" + params = [] results: dict[str, Any] = {f: {"top": [], "total": 0} for f in fields} try: - # Optimization: Combine count(*) and field counts into a single scan - count_cols: list[str] = [CANONICAL_METRICS["requests"]] - valid_fields: list[str] = [] - for field in fields: - if field == "waf_sig_ind": - continue - if field in actual_cols: - count_cols.append(f"count({resolve_col(field, actual_cols)})") - valid_fields.append(field) field_totals: dict[str, int] = {} total_rows = 0 earliest_log_at = None latest_log_at = None - if count_cols: - count_res = runner.execute(f"SELECT {', '.join(count_cols)} FROM {table_name}").fetchone() - total_rows = count_res[0] - for i, field in enumerate(valid_fields): - field_totals[field] = count_res[i + 1] + + if use_rollups: + # When the rollup fast-path is active, skip the wide per-column + # COUNT entirely. Two reasons it dominated wall time before: + # 1. 72 count(col) calls in one statement force DuckDB to + # touch every column for every row in the window — ~1s on + # prod's 24h × 3M-row view (witnessed 2026-06-04: Q2 was + # 1063ms of a 3194ms dashboard). + # 2. The output of all 72 counts is reconstructible from the + # rollup query's (field, value, count) rows: SUM by field + # across the result IS field_totals[field] for any field + # the user displays. We pay for it once via the rollup + # read instead of twice. 
+ # + # Caveat: TOP_K per (field, hour) caps the rollup to the 500 + # most-frequent values per hour. For high-cardinality fields + # (timestamp at per-second granularity, or unique-per-request + # ids) the SUM under-counts vs the true non-null count. In + # practice the dashboard shows top-10 with their percentages; + # mild under-counting of the denominator is acceptable for + # the perf win. If we ever need exact per-field totals here, + # add a `__total__` aggregate row to each rollup parquet. + try: + total_rows = runner.execute( + f"SELECT {CANONICAL_METRICS['requests']} FROM {table_name} WHERE {where_clause}", params + ).fetchone()[0] + except Exception: + total_rows = 0 + else: + # Non-rollups path keeps the wide COUNT — we have the + # filtered temp table loaded; one combined scan is cheaper + # than re-counting per field downstream. + count_cols: list[str] = [CANONICAL_METRICS["requests"]] + valid_fields: list[str] = [] + for field in fields: + if field in _VIRTUAL_FIELDS: + continue + if field in actual_cols: + count_cols.append(f"count({resolve_col(field, actual_cols)})") + valid_fields.append(field) + if count_cols: + count_res = runner.execute( + f"SELECT {', '.join(count_cols)} FROM {table_name} WHERE {where_clause}", params + ).fetchone() + total_rows = count_res[0] + for i, field in enumerate(valid_fields): + field_totals[field] = count_res[i + 1] orig_table_name = _safe_table(source_name) total_rows_total, earliest_log_at, latest_log_at = get_source_extent(runner, src, orig_table_name) schema_types = {col["name"]: col["type"] for col in _get_schema(con, src)} - batch_fields = [f for f in fields if f != "waf_sig_ind" and f in field_totals] - all_top_res, field_order = runner.execute_top_n_batch( - batch_fields, table_name, actual_cols, schema_types, limit=10 - ) + # When use_rollups=True, field_totals is empty here — populate it + # below from the rollup query results. 
Use the full eligible field + # list (anything non-virtual + in schema) as batch_fields; the + # rollup helper silently skips fields it has no data for. + if use_rollups: + batch_fields = [f for f in fields if f not in _VIRTUAL_FIELDS and f in actual_cols] + else: + batch_fields = [f for f in fields if f not in _VIRTUAL_FIELDS and f in field_totals] + if use_rollups: + all_top_res, field_order = runner.execute_top_n_rollups(batch_fields, start_time, end_time, limit=10) + # Derive field_totals from the rollup result (cheap Python sum). + # Each row is (field, value, count); per-field sum = total of + # values covered by the top-K rollup for that field. + for f_name, _f_val, f_count in all_top_res: + field_totals[f_name] = field_totals.get(f_name, 0) + int(f_count) + else: + all_top_res, field_order = runner.execute_top_n_batch( + batch_fields, table_name, actual_cols, schema_types, limit=10 + ) if all_top_res: # Group results back by field @@ -228,33 +319,44 @@ def get_aggregates( results[f_name]["top"].append(entry) - # Special handling for individual WAF signals (remains separate due to unnest overhead) - if "waf_sig_ind" in FIELDS: - if "waf_sig" in actual_cols: - q = f""" - WITH split_data AS ( - SELECT trim(signal) AS signal - FROM ( - SELECT unnest(string_split("waf_sig", ',')) AS signal - FROM {table_name} - WHERE "waf_sig" IS NOT NULL AND "waf_sig" != '' - ) - WHERE trim(signal) != '' - ), - total_count AS (SELECT {CANONICAL_METRICS["requests"]} AS tc FROM split_data), - top_values AS ( - SELECT signal AS value, {CANONICAL_METRICS["requests"]} AS c - FROM split_data GROUP BY 1 ORDER BY 2 DESC LIMIT 10 + # Virtual fields: explode comma-separated CSV columns into individual + # rows via unnest(string_split(...)). Generalized helper handles both + # waf_sig_ind (backed by waf_sig) and edge_score_reason_ind (backed + # by edge_score_reason) — same pattern, different backing columns. 
def _exploded_top_n(virtual_id: str, backing_col: str) -> None:
    """Fill ``results[virtual_id]`` with the top-10 individual values of a CSV column.

    ``backing_col`` holds comma-separated values; each row is split with
    ``unnest(string_split(...))``, trimmed, and empty pieces are dropped.
    The entry written is ``{"top": [{"value", "count"}, ...], "total": n}``
    where ``total`` is the number of exploded pieces.

    Does nothing when ``virtual_id`` was not requested; writes an empty
    entry when the backing column is absent from the schema or the query
    returns no rows.

    Reads ``fields``, ``actual_cols``, ``table_name``, ``where_clause``,
    ``params``, ``runner`` and ``results`` from the enclosing scope.
    """
    if virtual_id not in fields:
        return
    if backing_col not in actual_cols:
        results[virtual_id] = {"top": [], "total": 0}
        return
    q = f"""
        WITH split_data AS (
            SELECT trim(signal) AS signal
            FROM (
                SELECT unnest(string_split("{backing_col}", ',')) AS signal
                FROM {table_name}
                WHERE "{backing_col}" IS NOT NULL AND "{backing_col}" != '' AND {where_clause}
            )
            WHERE trim(signal) != ''
        ),
        total_count AS (SELECT {CANONICAL_METRICS["requests"]} AS tc FROM split_data),
        top_values AS (
            SELECT signal AS value, {CANONICAL_METRICS["requests"]} AS c
            FROM split_data GROUP BY 1 ORDER BY 2 DESC LIMIT 10
        )
        SELECT tv.value, tv.c, tc.tc FROM top_values tv CROSS JOIN total_count tc
    """
    # The query embeds {where_clause}, so it must be run with the matching
    # bind values — same as every other query built on where_clause here.
    # On the temp-table path params is empty, which makes this a no-op.
    res = runner.execute(q, params).fetchall()
    if res:
        results[virtual_id] = {
            "top": [{"value": r[0], "count": r[1]} for r in res],
            # Every row carries the same cross-joined total.
            "total": res[0][2],
        }
    else:
        results[virtual_id] = {"top": [], "total": 0}
timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric == "4xx" and "status" in actual_cols: @@ -305,7 +407,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {CANONICAL_METRICS["4xx_rate"]} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric == "hit_rate" and ("cache" in actual_cols or "resp_state" in actual_cols): @@ -317,7 +419,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {hit_rate_expr} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric.endswith("_latency") and ("elapsed" in actual_cols or "elapsed_us" in actual_cols): @@ -331,7 +433,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {percentile_ms_expr(sql_elapsed, percentile)} AS value FROM {table_name} - WHERE timestamp IS NOT NULL AND {sql_elapsed} IS NOT NULL + WHERE timestamp IS NOT NULL AND {sql_elapsed} IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric == "throughput" and "resp_bytes" in actual_cols and "elapsed" in actual_cols: @@ -343,7 +445,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {CANONICAL_METRICS["throughput"].format(cache_col=sql_cache, elapsed_col=sql_elapsed_val, resp_bytes_col=sql_resp_bytes)} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric == "req_size" and any(c in actual_cols for c in ["req_header_bytes", "req_bytes"]): @@ -354,7 +456,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {CANONICAL_METRICS["req_size"].format(header_bytes_col=header_col, req_bytes_col=body_col)} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ elif chart_metric == "ttfb" and "ttfb" in actual_cols: @@ 
-363,7 +465,7 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {CANONICAL_METRICS["ttfb_ms"]} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ else: @@ -372,11 +474,11 @@ def get_aggregates( SELECT {time_bucket_select(interval)}, {CANONICAL_METRICS["requests"]} AS value FROM {table_name} - WHERE timestamp IS NOT NULL + WHERE timestamp IS NOT NULL AND {where_clause} GROUP BY 1 ORDER BY 1 """ - ts_res = runner.execute(ts_q, []).fetchall() + ts_res = runner.execute(ts_q, params).fetchall() for r in ts_res: if r[0] is None: continue @@ -388,13 +490,32 @@ def get_aggregates( # Map data map_data: list[dict] = [] if "country" in actual_cols: - map_q = f""" - SELECT "country" AS country, {CANONICAL_METRICS["requests"]} AS count - FROM {table_name} - WHERE "country" IS NOT NULL - GROUP BY 1 - """ - map_data = [{"country": r[0], "count": r[1]} for r in runner.execute(map_q, []).fetchall()] + # When use_rollups is active AND the request asked for country + # in its top-N field set, we already have the per-country counts + # in all_top_res from the rollup read — re-running the same + # GROUP BY on the base view was costing ~140ms of pure + # duplication on prod (witnessed 2026-06-04: Q8 = 138ms of a + # 1687ms backend total). Derive map_data from all_top_res + # instead. The rollup caps at TOP_K=500 per (field, hour) + # which for `country` (~200 distinct values worldwide) is + # effectively the full distribution; no visible difference + # in the choropleth. 
+ derived = False + if use_rollups and any(f == "country" for f, _, _ in all_top_res): + country_counts: dict[str, int] = {} + for f_name, f_val, f_count in all_top_res: + if f_name == "country" and f_val is not None: + country_counts[f_val] = country_counts.get(f_val, 0) + int(f_count) + map_data = [{"country": k, "count": v} for k, v in country_counts.items()] + derived = True + if not derived: + map_q = f""" + SELECT "country" AS country, {CANONICAL_METRICS["requests"]} AS count + FROM {table_name} + WHERE "country" IS NOT NULL AND {where_clause} + GROUP BY 1 + """ + map_data = [{"country": r[0], "count": r[1]} for r in runner.execute(map_q, params).fetchall()] payload: dict[str, Any] = { "data": results, @@ -414,10 +535,11 @@ def get_aggregates( return payload finally: - try: - con.execute(f"DROP TABLE IF EXISTS {temp_table}") - except Exception: - pass + if not use_rollups: + try: + con.execute(f"DROP TABLE IF EXISTS {temp_table}") + except Exception: + pass # ── raw ─────────────────────────────────────────────────────────────────────── @@ -519,27 +641,16 @@ def get_raw( records = filtered_records col_names = columns + # Total-rows + extent come from get_source_extent which itself + # prefers the cached config status (populated by the sync cron) and + # only falls back to a live aggregate when the cache is missing. + # The previous inline COUNT/min/max scanned the whole Iceberg + # manifest on every dashboard mount — get_source_extent caches the + # warm path and skips the scan entirely in steady state. 
try: - from backend import config as svcconfig - - cached_status = svcconfig.get_status(src["name"]) - if cached_status: - total_rows_total = cached_status.get("local_rows", 0) - earliest_log_at = cached_status.get("earliest_log_at") - latest_log_at = cached_status.get("latest_log_at") - else: - agg_res = runner.execute( - f"SELECT {CANONICAL_METRICS['requests']}, min(timestamp), max(timestamp) FROM {table_name}" - ).fetchone() - if agg_res: - total_rows_total = agg_res[0] - earliest_log_at = safe_iso(agg_res[1]) - latest_log_at = safe_iso(agg_res[2]) + total_rows_total, earliest_log_at, latest_log_at = get_source_extent(runner, src, table_name) except Exception: - try: - total_rows_total = runner.execute(f"SELECT {CANONICAL_METRICS['requests']} FROM {table_name}").fetchone()[0] - except Exception: - pass + pass return { "columns": col_names, @@ -716,14 +827,21 @@ def get_field_values( sorted_vals = sorted(bot_counts.values(), key=lambda x: x["count"], reverse=True) return {"values": sorted_vals[:limit], "field": field, **runner.telemetry()} - is_signals_individual = field == "waf_sig_ind" - backing_col = "waf_sig" if is_signals_individual else clean_field + # Virtual fields that explode a CSV backing column: filter-lookup + # routes through the same unnest path so click-to-filter on a + # specific signal / reason works the same as native columns. + _VIRTUAL_BACKING = { + "waf_sig_ind": "waf_sig", + "edge_score_reason_ind": "edge_score_reason", + } + is_signals_individual = field in _VIRTUAL_BACKING + backing_col = _VIRTUAL_BACKING[field] if is_signals_individual else clean_field if backing_col not in actual_cols: raise LookupError(f"Field '{field}' not found") search_params = list(params) - if is_signals_individual or clean_field == "waf_sig": + if is_signals_individual or clean_field in ("waf_sig", "edge_score_reason"): search_cond = "" if search: search_cond = "AND trim(signal) ILIKE ?" 
diff --git a/backend/repositories/insights/repository.py b/backend/repositories/insights/repository.py index addecd5c..47c456c2 100644 --- a/backend/repositories/insights/repository.py +++ b/backend/repositories/insights/repository.py @@ -18,7 +18,14 @@ # ── Caches ──────────────────────────────────────────────────────────────────── INSIGHTS_CACHE_TTL = 300 # seconds -_insights_cache: dict = {} +# Bounded + lazy-reaped. Pre-migration this was a plain dict; entries +# were time-bucketed by ``int(time.time() / TTL)`` so each TTL window +# minted distinct keys but old buckets were never removed. Across hours +# of admin use the bucket-count grew linearly. 500 entries × insights +# payload (~100KB) caps this around ~50MB. +from backend.utils.bounded_cache import BoundedTTLCache as _BoundedTTLCache + +_insights_cache: _BoundedTTLCache = _BoundedTTLCache(maxsize=500, ttl_seconds=INSIGHTS_CACHE_TTL) _insights_cache_lock = threading.Lock() diff --git a/backend/repositories/query.py b/backend/repositories/query.py index bd70f1da..c9a585bc 100644 --- a/backend/repositories/query.py +++ b/backend/repositories/query.py @@ -9,22 +9,14 @@ import duckdb -from backend.repositories._base import _get_schema, _safe_table -from backend.utils.telemetry import get_tracked_calls - -_BLOCKED_KEYWORDS = ( - "DROP", - "DELETE", - "UPDATE", - "INSERT", - "ALTER", - "TRUNCATE", - "CREATE", - "ATTACH", - "COPY", - "EXPORT", - "IMPORT", +from backend.repositories._base import _compact_sql_for_debug, _get_schema, _safe_table +from backend.utils.sql_validator import ( + SQLValidationError, + apply_user_query_limits, + has_limit_clause, + validate_user_sql, ) +from backend.utils.telemetry import get_tracked_calls def execute_query( @@ -33,16 +25,41 @@ def execute_query( sql: str, max_rows: int, want_explain: bool, + *, + session_id: str | None = None, + service_id: str | None = None, ) -> dict: if src: table_name = _safe_table(src["name"]) if table_name != "logs": sql = re.sub(r"\blogs\b", 
table_name, sql, flags=re.IGNORECASE) - sql_upper = sql.upper() - for kw in _BLOCKED_KEYWORDS: - if re.search(rf"\b{kw}\b", sql_upper): - raise PermissionError(f"Only read-only queries are allowed (blocked keyword: {kw})") + # Security (Decision B): run the user SQL through the + # parse-tree validator. The previous regex-based ``_BLOCKED_KEYWORDS`` + # check missed: + # - read_csv_auto / read_parquet / iceberg_scan family (arbitrary + # file/S3 read via table functions) + # - getenv / current_setting / duckdb_secrets (env/secret exfil) + # - information_schema.* (introspection bypass via non-prefix name) + # - INSTALL / LOAD (which don't contain any blocked keyword) + # The validator runs ``json_serialize_sql`` and walks the resulting + # parse tree so every nested subquery / CTE / table-function is + # inspected. See backend/utils/sql_validator.py for the policy. + try: + validate_user_sql( + sql, + parser_con=con, + session_id=session_id, + service_id=service_id, + ) + except SQLValidationError as exc: + # PermissionError is what the route handler maps to HTTP 403. + raise PermissionError(exc.message) from exc + + # Execution-side defense-in-depth: cap memory and timeout on the + # connection before running the user query. Independent of parse + # validation — a legal query can still scan 100M rows. + apply_user_query_limits(con) _debug_queries: list[dict] = [] if src: @@ -55,7 +72,9 @@ def execute_query( t_exp = time.monotonic() plan_rows = con.execute(f"EXPLAIN {sql}").fetchall() explain_plan = "\n".join(r[1] for r in plan_rows if r[1]) - _debug_queries.append({"sql": f"EXPLAIN {sql}", "time_ms": round((time.monotonic() - t_exp) * 1000, 2)}) + _debug_queries.append( + {"sql": _compact_sql_for_debug(f"EXPLAIN {sql}"), "time_ms": round((time.monotonic() - t_exp) * 1000, 2)} + ) # Auto-apply LIMIT max_rows+1 when the query doesn't already have one. 
# Without this, `SELECT * FROM logs ORDER BY timestamp DESC` materializes @@ -67,9 +86,15 @@ def execute_query( # result sets where the LIMIT semantics differ or aren't supported. exec_sql = sql sql_stripped_upper = sql.strip().upper().lstrip("(") - is_simple_select = sql_stripped_upper.startswith(("SELECT", "WITH", "FROM", "VALUES", "TABLE")) and not re.search( - r"\bLIMIT\b", sql_upper - ) + # 026: ``re.search(r"\bLIMIT\b", sql)`` matches inside string + # literals (``WHERE name = 'WITHOUT LIMIT'``) and inside SQL + # comments — both false positives that cause the auto-wrap to + # SKIP wrapping, leaving the query unbounded. The AST-aware + # check inspects the parse tree so strings/comments are out of + # scope. + is_simple_select = sql_stripped_upper.startswith( + ("SELECT", "WITH", "FROM", "VALUES", "TABLE") + ) and not has_limit_clause(sql, parser_con=con) if is_simple_select: # Strip trailing semicolon so the wrapper LIMIT lands in the same statement. inner = sql.rstrip().rstrip(";") @@ -79,7 +104,7 @@ def execute_query( result = con.execute(exec_sql) df = result.fetchdf() elapsed_ms = round((time.monotonic() - t0) * 1000, 2) - _debug_queries.append({"sql": exec_sql.strip(), "time_ms": elapsed_ms}) + _debug_queries.append({"sql": _compact_sql_for_debug(exec_sql.strip()), "time_ms": elapsed_ms}) fetched_rows = len(df) if is_simple_select: diff --git a/backend/repositories/security.py b/backend/repositories/security.py index 1cfaec6f..4d65f9cd 100644 --- a/backend/repositories/security.py +++ b/backend/repositories/security.py @@ -34,66 +34,90 @@ def get_top_bots( params, where_clause = build_where_clause(start_time, end_time, filters, actual_cols, inline_params=True) - # ── Arcjet UA-matched bots ──────────────────────────────────────────────── arcjet_bots: list[dict] = [] + # ── Single filtered TEMP TABLE shared across arcjet UA + NGWAF JOIN ───── + # Previously the function ran TWO independent scans over the same + # filtered window: a UA TopN (LIMIT 2000) 
for arcjet classification + # then a SECOND scan with an NGWAF JOIN for waf bot names. With the + # dashboard's security panel mounted, both ran on every request. + # Materializing one filtered temp table with the columns BOTH passes + # need (ua + waf_req_id) collapses the scan to one Iceberg manifest + # walk and keeps both downstream queries reading from memory. + cols_needed: list[str] = [] if "ua" in actual_cols: - try: - from backend.utils.bot_sources import build_matcher, get_bot_regex_pattern - - pattern = get_bot_regex_pattern(200) - ua_filter = f"AND regexp_matches(ua, '{pattern.replace(chr(39), chr(39) * 2)}')" if pattern else "" - - q = f""" - SELECT ua, count(*) AS cnt - FROM {table_name} - WHERE {where_clause} AND ua IS NOT NULL {ua_filter} - GROUP BY ua - ORDER BY cnt DESC - LIMIT 2000 - """ - rows = runner.execute(q).fetchall() - - match_ua = build_matcher() - bot_counts: dict[str, dict] = {} - for ua_val, cnt in rows: - for entry in match_ua(ua_val): - bot_id = entry.get("id", "unknown") - if bot_id not in bot_counts: - cats = entry.get("categories", []) - bot_counts[bot_id] = { - "id": bot_id, - "name": bot_id.replace("-", " ").title(), - "category": cats[0] if cats else "unknown", - "request_count": 0, - } - bot_counts[bot_id]["request_count"] += cnt - - arcjet_bots = sorted(bot_counts.values(), key=lambda x: x["request_count"], reverse=True)[:n] - except Exception as e: - logging.getLogger(__name__).error("[security] arcjet top bots failed: %s", e) + cols_needed.append("ua") + if "waf_req_id" in actual_cols: + cols_needed.append("waf_req_id") + # If the schema has neither (very minimal log_fields preset), skip + # both passes — there's nothing to classify. + if not cols_needed: + return {"bots": [], "ngwaf_bots": []} + + # Use QueryRunner.temp_table context manager so the DROP runs even + # if an intermediate query raises (was a manual try/finally before). 
+ with runner.temp_table(cols_needed, actual_cols, table_name, where_clause, params) as temp_table: + if temp_table is None: + return {"bots": [], "ngwaf_bots": []} + if "ua" in actual_cols: + try: + from backend.utils.bot_sources import build_matcher, get_bot_regex_pattern - # ── NGWAF cache bot names ───────────────────────────────────────────────── - ngwaf_bots: list[dict] = [] - from backend.repositories._base import attach_ngwaf_cache + pattern = get_bot_regex_pattern(200) + ua_filter = f"AND regexp_matches(ua, '{pattern.replace(chr(39), chr(39) * 2)}')" if pattern else "" - with attach_ngwaf_cache(con, actual_cols, alias="ngwaf_top") as attached: - if attached: - try: q = f""" - SELECT nb.bot_name, nb.category, count(*) AS cnt - FROM {table_name} - INNER JOIN ngwaf_top.ngwaf_bots nb USING (waf_req_id) - WHERE {where_clause} AND nb.bot_name IS NOT NULL - GROUP BY 1, 2 - ORDER BY 3 DESC - LIMIT {n} + SELECT ua, count(*) AS cnt + FROM {temp_table} + WHERE ua IS NOT NULL {ua_filter} + GROUP BY ua + ORDER BY cnt DESC + LIMIT 2000 """ - res = runner.execute(q).fetchall() - ngwaf_bots = [{"name": r[0], "category": r[1], "request_count": r[2]} for r in res] + rows = runner.execute(q).fetchall() + + match_ua = build_matcher() + bot_counts: dict[str, dict] = {} + for ua_val, cnt in rows: + for entry in match_ua(ua_val): + bot_id = entry.get("id", "unknown") + if bot_id not in bot_counts: + cats = entry.get("categories", []) + bot_counts[bot_id] = { + "id": bot_id, + "name": bot_id.replace("-", " ").title(), + "category": cats[0] if cats else "unknown", + "request_count": 0, + } + bot_counts[bot_id]["request_count"] += cnt + + arcjet_bots = sorted(bot_counts.values(), key=lambda x: x["request_count"], reverse=True)[:n] except Exception as e: - logging.getLogger(__name__).error("[security] NGWAF top bots failed: %s", e) - - return {"bots": arcjet_bots, "ngwaf_bots": ngwaf_bots} + logging.getLogger(__name__).error("[security] arcjet top bots failed: %s", e) + + # ── NGWAF 
cache bot names ───────────────────────────────────────────── + ngwaf_bots: list[dict] = [] + from backend.repositories._base import attach_ngwaf_cache + + with attach_ngwaf_cache(con, actual_cols, alias="ngwaf_top") as attached: + if attached: + try: + # Join against the temp table instead of re-scanning the + # source view — same filter window, no second manifest walk. + q = f""" + SELECT nb.bot_name, nb.category, count(*) AS cnt + FROM {temp_table} t + INNER JOIN ngwaf_top.ngwaf_bots nb USING (waf_req_id) + WHERE nb.bot_name IS NOT NULL + GROUP BY 1, 2 + ORDER BY 3 DESC + LIMIT {n} + """ + res = runner.execute(q).fetchall() + ngwaf_bots = [{"name": r[0], "category": r[1], "request_count": r[2]} for r in res] + except Exception as e: + logging.getLogger(__name__).error("[security] NGWAF top bots failed: %s", e) + + return {"bots": arcjet_bots, "ngwaf_bots": ngwaf_bots, **runner.telemetry()} def get_security_aggregates( diff --git a/backend/repositories/sessions.py b/backend/repositories/sessions.py index 2f3aaa56..7ec69299 100644 --- a/backend/repositories/sessions.py +++ b/backend/repositories/sessions.py @@ -171,8 +171,7 @@ def get_sessions( data_sql = f""" {sessions_cte} - SELECT *, ({flag_expr}) AS flagged, - COUNT(*) OVER () AS _total_count + SELECT *, ({flag_expr}) AS flagged FROM sessions_agg {flagged_filter} ORDER BY {sort_by} {sort_dir} @@ -181,16 +180,14 @@ def get_sessions( rows = runner.execute(data_sql, params).fetchall() col_names = [desc[0] for desc in con.description] - total = 0 sessions: list[dict] = [] for row in rows: d = dict(zip(col_names, row)) - total = int(d.pop("_total_count", 0)) for k in ("session_start", "session_end"): if d.get(k) is not None: d[k] = str(d[k]) - # Ensure we have ua and ja4 if requested in group_cols sessions.append(d) + total = len(sessions) if not rows and offset > 0: count_sql = f""" diff --git a/backend/repositories/views.py b/backend/repositories/views.py index b26c0476..9e6155cc 100644 --- 
a/backend/repositories/views.py +++ b/backend/repositories/views.py @@ -30,6 +30,27 @@ def _find_view_service(view_id: str) -> str | None: return None +def get_view_by_id(view_id: str) -> dict | None: + """Return the saved-view row whose id matches ``view_id`` (or None). + + Security mirror of ``alerts.get_alert_by_id`` — the router-level + cross-tenant scope gate calls this before delete_view so an + unauthorized analyst gets 403 without the row being deleted. + """ + for cfg in svcconfig.list_configs(): + sid = cfg.get("service_id") + if not sid: + continue + for v in metadata_db.list_views(sid): + if v.get("id") == view_id: + # Stamp the owning service_id onto the result so the + # caller's scope check can compare without re-scanning. + out = dict(v) + out.setdefault("service_id", sid) + return out + return None + + def delete_view(view_id: str, service_id_hint: str | None = None) -> dict: sid = service_id_hint or _find_view_service(view_id) if not sid: diff --git a/backend/routers/admin.py b/backend/routers/admin.py index feff30aa..f2d7fe1b 100644 --- a/backend/routers/admin.py +++ b/backend/routers/admin.py @@ -186,7 +186,7 @@ def ingest_endpoint( from fastapi import HTTPException from backend.core.duckdb import start_cron_run - from backend.cron_progress import _run_metadata, start_progress + from backend.cron_progress import list_active_runs, start_progress from backend.repositories.dashboard import _dashboard_cache from backend.scheduler import _run_metadata_sync, _run_service_cron @@ -207,9 +207,9 @@ def ingest_endpoint( t.start() except RuntimeError as e: run_id = None - for rid, meta in _run_metadata.items(): - if meta.get("service_id") == source["name"] and meta.get("task") == "metadata_sync": - run_id = rid + for entry in list_active_runs(): + if entry.get("service_id") == source["name"] and entry.get("task") == "metadata_sync": + run_id = entry["run_id"] break if run_id is None: raise HTTPException(status_code=503, detail={"error": str(e), "busy": 
True}) @@ -235,9 +235,9 @@ def ingest_endpoint( t.start() except RuntimeError as e: run_id = None - for rid, meta in _run_metadata.items(): - if meta.get("service_id") == src["name"] and meta.get("task") == "sync": - run_id = rid + for entry in list_active_runs(): + if entry.get("service_id") == src["name"] and entry.get("task") == "sync": + run_id = entry["run_id"] break if run_id is None: raise HTTPException(status_code=503, detail={"error": str(e), "busy": True}) @@ -354,7 +354,32 @@ def download_file( if not key: raise HTTPException(status_code=400, detail={"error": "Missing key parameter"}) - local_path = os.path.abspath(os.path.join(_cache_dir(source), key)) + # Cross-tenant guard: a single FOS bucket can host multiple services + # separated by per-source prefixes. The path-traversal cage below + # bounds local cache reads, but a sibling-tenant key like + # ``other_tenant/file.log`` would still mint a presigned URL or CDN + # redirect for that object. Require the key to live under this + # service's prefix before any FOS / CDN URL minting. + src_prefix = source.get("prefix", "") + if src_prefix and not key.startswith(src_prefix): + raise HTTPException(status_code=400, detail={"error": "invalid_key"}) + + # Security: ``os.path.join(base, key)`` returns ``key`` when + # ``key`` is absolute, which a malicious caller exploits by passing + # ``key=/etc/passwd``. Resolve both paths and require commonpath == + # cache_dir so a path-traversal payload (absolute path or + # ``../../../etc/passwd``) is rejected at the boundary. + cache_dir = os.path.realpath(_cache_dir(source)) + candidate = os.path.realpath(os.path.join(cache_dir, key)) + try: + common = os.path.commonpath([cache_dir, candidate]) + except ValueError: + # commonpath raises ValueError when paths have different drives / + # mixed absolute/relative. Treat as path-escape and reject. 
+ raise HTTPException(status_code=400, detail={"error": "invalid_key"}) + if common != cache_dir: + raise HTTPException(status_code=400, detail={"error": "invalid_key"}) + local_path = candidate if os.path.exists(local_path): return FileResponse(local_path, filename=os.path.basename(local_path)) @@ -448,7 +473,10 @@ def zip_worker(q: queue.Queue): cdn = src.get("cdn_url", "").rstrip("/") fos_client = _db._get_fos_client(src) paginator = fos_client.get_paginator("list_objects_v2", caller_hint="download_all") - pages = paginator.paginate(Bucket=src["bucket"]) + # Cross-tenant guard: scope to this service's prefix + # so a shared bucket with multiple services doesn't + # leak sibling data into the zip. + pages = paginator.paginate(Bucket=src["bucket"], Prefix=src.get("prefix", "")) for page in pages: if "Contents" not in page: @@ -472,7 +500,33 @@ def zip_worker(q: queue.Queue): return StreamingResponse(_stream_from_worker(zip_worker), media_type="application/zip", headers=headers) +_DIR_SIZE_CACHE: dict[str, tuple[float, int]] = {} +_DIR_SIZE_TTL_S = 30.0 + + def _get_dir_size(path: str) -> int: + # Cache results per-path with a 30s TTL. The cache walk is O(files-in-tree) + # and the per-service cache grew from ~300 files to ~19k after the rollups + # backfill (one parquet per field × hour). At ~700ms per uncached walk, + # SyncStatusBadge's 15s poll was paying that cost on every refresh; the + # cache turns it into a single getsize_sum sweep per minute. + # + # Files only grow incrementally (ingest + rollup-recompute) so a 30s + # staleness window means the dashboard's reported disk usage can lag by + # at most that window. Worth it for the perf vs measuring exact-to-the- + # millisecond size on a poll endpoint. 
+ import time as _t + + now = _t.monotonic() + cached = _DIR_SIZE_CACHE.get(path) + if cached is not None and (now - cached[0]) < _DIR_SIZE_TTL_S: + return cached[1] + total = _scan_dir_size(path) + _DIR_SIZE_CACHE[path] = (now, total) + return total + + +def _scan_dir_size(path: str) -> int: total = 0 if not os.path.exists(path): return 0 @@ -482,7 +536,7 @@ def _get_dir_size(path: str) -> int: if entry.is_file(): total += entry.stat().st_size elif entry.is_dir(): - total += _get_dir_size(entry.path) + total += _scan_dir_size(entry.path) except Exception: pass return total @@ -512,13 +566,28 @@ def sync_status( return SyncStatusResponse.with_telemetry(configured=False) try: - from backend.core.duckdb import get_connection + # Fast path: skip_fos=true callers (FilterBar polling, badge in + # the page header, etc.) only need the cached snapshot that the + # sync cron refreshes every minute. Return it without grabbing a + # DuckDB connection, so that a busy dashboard load — agg/raw/ + # bots all racing for connections — doesn't starve sync-status + # and trigger 503s when its max_wait expires. + cached_status = svcconfig.get_status(src["name"]) if skip_fos and not force else None + # get_status returns {} (not None) when no status has been + # persisted yet — fall through to the DB path in that case. 
+ if cached_status: + cached_status["access_level"] = src.get("access_level", "read_write") + cached_status["storage_mode"] = _db.STORAGE_MODE + cached_status["configured"] = True + status = cached_status + else: + from backend.core.duckdb import get_connection - _con = get_connection(source=src, max_wait=5, skip_view_update=True) - try: - status = get_sync_status(_con, src, skip_fos=skip_fos, force=force) - finally: - _con.close() + _con = get_connection(source=src, max_wait=5, skip_view_update=True) + try: + status = get_sync_status(_con, src, skip_fos=skip_fos, force=force) + finally: + _con.close() db_path = src.get("duckdb_path") or svcconfig.duckdb_path(service_id) db_exists = os.path.exists(db_path) @@ -604,6 +673,195 @@ def compaction_stats(source: dict = Depends(get_source)): return _lc.compaction_stats(source) +@router.patch("/admin/metadata-retention") +def update_metadata_retention(body: dict, source: dict = Depends(get_source)): + """Update the per-service ``metadata_retention`` config block. + + Body shape: any subset of ``{usage_log_days, ingested_files_days, + cron_runs_days}``. Each value is coerced to int; negative / non-numeric + inputs are clamped to 0 (which disables cleanup for that table per + cleanup_metadata's semantics). Missing keys preserve their current + value. Returns the resolved retention (defaults merged with cfg) so the + UI can confirm what was saved. 
+ """ + from backend import config as svcconfig + from backend.core import metadata_db as _mdb + from backend.core.metadata_db import DEFAULT_METADATA_RETENTION + + service_id = source["name"] + cfg = svcconfig.load_config(service_id) + if cfg is None: + raise HTTPException(status_code=404, detail={"error": "Service not found"}) + + from backend.core.metadata_db import is_ingested_files_dedup_active + + current = dict(cfg.get("metadata_retention") or {}) + for key in ("usage_log_days", "ingested_files_days", "cron_runs_days"): + if key in body: + try: + v = int(body[key]) + except (TypeError, ValueError): + v = 0 + current[key] = max(0, v) + + # Mirror the cleanup helper's safety override at the write layer: + # if delete_after=false on this service, refuse to persist a non-zero + # ingested_files_days. Storing it would mislead the operator into + # thinking the value will be honored when the cleanup ignores it. + if not is_ingested_files_dedup_active(service_id) and int(current.get("ingested_files_days") or 0) > 0: + current["ingested_files_days"] = 0 + + cfg["metadata_retention"] = current + svcconfig.save_config(service_id, cfg) + try: + _mdb.record_audit( + service_id=service_id, + event_type="metadata_retention_update", + details=current, + ) + except Exception: + pass + + return {"retention": {**DEFAULT_METADATA_RETENTION, **current}} + + +@router.get("/admin/metadata-storage") +def metadata_storage(source: dict = Depends(get_source)): + """Per-table row count + estimated bytes for this service's metadata.db. + + Includes the resolved retention policy (per-service cfg merged with + defaults). The UI uses this to render the Metadata Storage card on + the admin page — table sizes, bytes, and a Cleanup-now button. 
+ """ + from backend import config as svcconfig + from backend.core.metadata_db import ( + DEFAULT_METADATA_RETENTION, + get_metadata_storage_stats, + is_ingested_files_dedup_active, + ) + + service_id = source["name"] + stats = get_metadata_storage_stats(service_id) + cfg = svcconfig.load_config(service_id) or {} + retention = {**DEFAULT_METADATA_RETENTION, **(cfg.get("metadata_retention") or {})} + # ingested_files_locked surfaces the safety override: when + # cron_sync.delete_after=False the ingested_files table is the + # dedup gate, so the cleanup helper force-disables its trimming + # regardless of the configured retention. UI uses this to disable + # the input + show a tooltip explaining the override. + ingested_files_locked = not is_ingested_files_dedup_active(service_id) + return {**stats, "retention": retention, "ingested_files_locked": ingested_files_locked} + + +@router.post("/admin/metadata-cleanup") +def metadata_cleanup_now(source: dict = Depends(get_source)): + """Trigger an immediate metadata cleanup, streaming progress as SSE. + + Equivalent to the daily ``metadata_cleanup`` cron at 03:15 UTC but + on-demand. The DELETE phase is fast; VACUUM rewrites the whole file + and on a multi-GB metadata.db can take minutes. Streaming gives the + operator real-time feedback instead of a 5-minute hang behind a + spinning button. + + Event shapes (between SSE ``data:`` lines): + + {"type": "status", "message": str} + {"type": "progress", "current": int, "total": int, "message": str} + {"type": "done", "message": str, "result": {...}} + {"type": "error", "message": str} + + Writes a row to ``cron_runs`` with task=``metadata_cleanup`` so the + manual run shows up on the Data Management schedule + history grid + alongside the scheduled cron's runs. 
+ """ + import json as _json + import queue as _queue + import threading + import time as _t + + from backend import config as svcconfig + from backend.core.duckdb import log_cron_run, start_cron_run + from backend.core.metadata_db import cleanup_metadata + + service_id = source["name"] + cfg = svcconfig.load_config(service_id) or {} + retention = cfg.get("metadata_retention") or {} + + # Bridge cleanup_metadata's on_event callback to the SSE generator via + # a thread-safe queue. The worker thread runs the cleanup synchronously + # (DELETE then VACUUM — both block the SQLite writer) and pushes events + # as they happen; the streaming generator consumes them and yields SSE + # frames. Sentinel ``None`` marks end-of-stream. + events: _queue.Queue = _queue.Queue() + + def worker(): + started = _t.time() + run_id = start_cron_run(source, "metadata_cleanup") + try: + result = cleanup_metadata(service_id, retention, on_event=events.put) + except Exception as e: + err = str(e) + events.put({"type": "error", "message": f"Cleanup failed: {err}"}) + try: + log_cron_run( + source, + "metadata_cleanup", + _t.time() - started, + "error", + error_message=err, + summary=f"cleanup failed: {err}", + run_id=run_id, + ) + finally: + events.put(None) + return + + total_deleted = sum(result["deleted"].values()) + if total_deleted: + parts = [f"{t}={n}" for t, n in result["deleted"].items() if n] + summary = ( + f"Trimmed {total_deleted:,} rows ({', '.join(parts)}). " + f"VACUUM={'yes' if result['vacuumed'] else 'skipped'}." + ) + else: + summary = "No rows older than retention windows." 
+ try: + log_cron_run( + source, + "metadata_cleanup", + _t.time() - started, + "success", + summary=summary, + rows_ingested=total_deleted, + run_id=run_id, + ) + finally: + events.put({"type": "done", "message": summary, "result": result}) + events.put(None) + + threading.Thread(target=worker, daemon=True, name=f"metadata-cleanup-{service_id}").start() + + def stream(): + # Pre-pad to defeat any reverse-proxy / browser buffering; SSE + # clients flush on the first blank-line delimiter. + yield ":" + " " * 2048 + "\n\n" + while True: + event = events.get() + if event is None: + break + yield f"data: {_json.dumps(event)}\n\n" + + return StreamingResponse( + stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache, no-transform", + "X-Accel-Buffering": "no", + "Connection": "keep-alive", + }, + ) + + @router.get("/admin/health-snapshot") def health_snapshot(): """One-shot health snapshot for the admin page system health card. @@ -664,17 +922,22 @@ def health_snapshot(): out[label] = None # ── In-flight cron runs ────────────────────────────────────────── + # Use list_active_runs() (which filters out runs whose last event is + # done/error) instead of iterating _run_metadata directly. The dict + # holds entries for an hour after completion (the cleanup TTL), so the + # raw iteration was showing dozens of stale "sync" entries in the + # System Health card. 
try: - from backend.cron_progress import _run_metadata + from backend.cron_progress import list_active_runs in_flight = [] - for run_id, meta in list(_run_metadata.items()): + for entry in list_active_runs(): in_flight.append( { - "run_id": run_id, - "service_id": meta.get("service_id"), - "task": meta.get("task"), - "started_at": meta.get("started_at"), + "run_id": entry["run_id"], + "service_id": entry.get("service_id"), + "task": entry.get("task"), + "started_at": entry.get("started_at"), } ) out["in_flight_runs"] = in_flight @@ -730,15 +993,16 @@ def _fetch_fastly_log_counts( uses (`YYYY-MM-DDTHH` for hour, `YYYY-MM-DD` for day) so the outer-join in api_log_accounting can key on string equality directly. """ - import json import logging - import urllib.request from datetime import UTC, datetime - url = f"https://api.fastly.com/stats/service/{logging_svc_id}?by={by}&from={from_ts}&to={to_ts}" - req = urllib.request.Request(url, headers={"Fastly-Key": api_key, "Accept": "application/json"}) - with urllib.request.urlopen(req, timeout=30) as resp: - payload = json.loads(resp.read().decode()) + from backend.core.fastly.client import fastly + + payload = fastly( + "GET", + f"/stats/service/{logging_svc_id}?by={by}&from={from_ts}&to={to_ts}", + token=api_key, + ) width = 13 if by == "hour" else 10 records = payload.get("data", []) or [] @@ -1013,12 +1277,12 @@ def iceberg_commit_endpoint(source: dict = Depends(get_source)): return {"ok": True, "message": "Commit started.", "run_id": run_id} except RuntimeError as e: - from backend.cron_progress import _run_metadata + from backend.cron_progress import list_active_runs run_id = None - for rid, meta in _run_metadata.items(): - if meta.get("service_id") == source["name"] and meta.get("task") == "commit": - run_id = rid + for entry in list_active_runs(): + if entry.get("service_id") == source["name"] and entry.get("task") == "commit": + run_id = entry["run_id"] break if run_id is None: raise HTTPException(status_code=503, 
detail={"error": str(e), "busy": True}) diff --git a/backend/routers/alerts.py b/backend/routers/alerts.py index a08ec953..7b23f46b 100644 --- a/backend/routers/alerts.py +++ b/backend/routers/alerts.py @@ -5,7 +5,7 @@ from datetime import UTC import duckdb -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException, Request from pydantic import BaseModel from backend.deps import get_con, get_service_id @@ -16,16 +16,44 @@ router = APIRouter(prefix="/api/alerts", tags=["alerts"]) +def _analyst_allowed_services(request: Request) -> set[str] | None: + """Return the set of service IDs the caller (analyst session) can see, + or ``None`` for admin requests (no scope restriction). + + Security: every read / mutation on the alerts collection must + filter by this set so an analyst scoped to ``svc-A`` cannot + enumerate or modify ``svc-B``'s alerts via the cross-tenant pattern + GET /api/alerts/ , GET /api/alerts/{other_id}, etc. + """ + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is None: + return None # admin — unrestricted + return set(analyst_session.service_ids or []) + + @router.get("/", response_model=AlertListResponse) -def list_all_alerts(): +def list_all_alerts(request: Request): + """Return alerts visible to the caller. + + Admin: every alert across every service. Analyst: only alerts for + services in their invite's scope (security). + """ + allowed = _analyst_allowed_services(request) alerts = repo.get_alerts() + if allowed is not None: + alerts = [a for a in alerts if a.get("service_id") in allowed] from datetime import datetime return AlertListResponse.with_telemetry(data=alerts, evaluated_at=datetime.now(UTC).isoformat()) @router.get("/{service_id}", response_model=AlertListResponse) -def list_service_alerts(service_id: str): +def list_service_alerts(service_id: str, request: Request): + """Return alerts for one service. 
Analyst gets 403 if the service + isn't in their invite (security).""" + allowed = _analyst_allowed_services(request) + if allowed is not None and service_id not in allowed: + raise HTTPException(status_code=403, detail={"error": "service_not_authorized", "service": service_id}) alerts = repo.get_alerts(service_id) from datetime import datetime @@ -33,20 +61,44 @@ def list_service_alerts(service_id: str): @router.post("/", response_model=AlertResponse) -def create_alert(alert: Alert): +def create_alert(alert: Alert, request: Request): + """Create an alert. Analyst can only create alerts for services in + their invite scope (security). The Phase-1 analyst middleware + also blocks POSTs on /api/alerts for analysts entirely (not in the + _ANALYST_ALLOWED_WRITE_PREFIXES list), so this is defense-in-depth + for the admin-impersonating-analyst case.""" + allowed = _analyst_allowed_services(request) + if allowed is not None and alert.service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": alert.service_id}, + ) res = repo.save_alert(alert) sync_admin_state(alert.service_id) return AlertPreviewResponse.with_telemetry(data=res) @router.post("/preview", response_model=AlertPreviewResponse) -def preview_alert(alert: Alert, lookback_hours: int = 24, con: duckdb.DuckDBPyConnection = Depends(get_con)): +def preview_alert( + alert: Alert, + request: Request, + lookback_hours: int = 24, + con: duckdb.DuckDBPyConnection = Depends(get_con), +): import datetime - from fastapi import HTTPException - from backend.core.duckdb import _safe_table_name, get_source_for_service + # Security: analyst can only preview alerts against their scoped + # services. Without this an analyst could compose an Alert against + # another tenant's service_id and read its time-series data. 
+ allowed = _analyst_allowed_services(request) + if allowed is not None and alert.service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": alert.service_id}, + ) + src = get_source_for_service(alert.service_id) if not src: raise HTTPException(status_code=404, detail="Service not found") @@ -137,14 +189,47 @@ class _ToggleBody(BaseModel): @router.patch("/{alert_id}/enabled", response_model=AlertResponse) -def toggle_alert_enabled(alert_id: str, body: _ToggleBody, service_id: str | None = Depends(get_service_id)): +def toggle_alert_enabled( + alert_id: str, + body: _ToggleBody, + request: Request, + service_id: str | None = Depends(get_service_id), +): + # Security: pre-flight scope check BEFORE the mutation. Earlier + # implementation toggled first and then 403'd on the result, so a + # cross-tenant write would still land and the analyst would just see + # an error after the fact. Now the toggle never runs for an + # unauthorized session. + allowed = _analyst_allowed_services(request) + if allowed is not None: + existing = repo.get_alert_by_id(alert_id) + if existing and existing.get("service_id") not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": existing.get("service_id")}, + ) res = repo.toggle_alert(alert_id, body.enabled, service_id_hint=service_id) sync_admin_state(res.get("service_id")) return AlertPreviewResponse.with_telemetry(data=res) @router.delete("/{alert_id}", response_model=AlertResponse) -def delete_alert(alert_id: str, service_id: str | None = Depends(get_service_id)): +def delete_alert( + alert_id: str, + request: Request, + service_id: str | None = Depends(get_service_id), +): + # Pre-flight scope check: look up the alert's service_id before + # deleting so we don't leak the existence of cross-tenant alerts + # via a delete-then-403 pattern. 
+ allowed = _analyst_allowed_services(request) + if allowed is not None: + existing = repo.get_alert_by_id(alert_id) + if existing and existing.get("service_id") not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": existing.get("service_id")}, + ) res = repo.delete_alert(alert_id, service_id_hint=service_id) sync_admin_state(res.get("service_id")) return AlertPreviewResponse.with_telemetry(data=res) diff --git a/backend/routers/bootstrap.py b/backend/routers/bootstrap.py index 94954a6e..e38f5b0b 100644 --- a/backend/routers/bootstrap.py +++ b/backend/routers/bootstrap.py @@ -3,7 +3,7 @@ from __future__ import annotations import duckdb -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, HTTPException, Request from backend.deps import get_meta_con, get_service_id, get_source from backend.models.common import BootstrapResponse @@ -18,7 +18,7 @@ def bootstrap( service_id: str | None = Depends(get_service_id), ): from backend.core import duckdb as _db - from backend.core.duckdb import STORAGE_MODE, get_schema + from backend.core.duckdb import STORAGE_MODE from backend.services.service_manager import get_enriched_services from backend.utils.countries import COUNTRY_MAP from backend.utils.pop_utils import get_pop_lat_lon_map @@ -80,20 +80,15 @@ def bootstrap( if active_svc and active_svc.get("status"): schema = active_svc["status"].get("schema", []) - if not schema and valid_active_id: - src = _db.get_source_for_service(valid_active_id) - if src: - try: - from backend.core.duckdb import get_connection - - # read_only: schema lookup only, no writes. - con = get_connection(source=src, max_wait=3, skip_view_update=True, read_only=True) - try: - schema = get_schema(con, src) - finally: - con.close() - except Exception: - pass + # NOTE: the previous fallback opened a read-only DuckDB connection here + # and ran get_schema() against the source on cold-cache loads. 
That call + # acquired the per-service lock + did a parquet glob, costing 1-3s on + # the very first /api/bootstrap after a backend restart and blocking + # the whole admin UI from rendering. With the status-refresh cron + # populating active_svc["status"]["schema"], the cache is the source + # of truth — drop the fallback. If schema is empty here, the dashboard + # renders without a hint banner; the user can refresh once the cron + # has run (typically <60s after startup). pops = get_pop_lat_lon_map() @@ -147,13 +142,26 @@ def bootstrap( @router.get("/sources") @query_errors(status_code=500) -def sources_endpoint(): +def sources_endpoint(request: Request): + """Return storage metadata (endpoint / bucket / prefix / region) for the + configured sources the caller is authorized to see. + + Security: filter by analyst session scope. Without this, an + authenticated analyst can enumerate every service's S3 bucket / endpoint + / prefix configuration, including ones not in their invite. Admin + requests (no analyst_session on request.state) see the full list. 
+ """ from backend import config as svcconfig from backend.core.duckdb import _safe_table_name + analyst_session = getattr(request.state, "analyst_session", None) + allowed: set[str] | None = set(analyst_session.service_ids or []) if analyst_session else None + configs = svcconfig.list_configs() sources = [] for cfg in configs: + if allowed is not None and cfg.get("service_id") not in allowed: + continue src = svcconfig.config_to_source(cfg) sources.append( { @@ -171,12 +179,25 @@ def sources_endpoint(): @router.get("/schema") @query_errors(status_code=500) def schema_endpoint( + request: Request, source: dict = Depends(get_source), con: duckdb.DuckDBPyConnection = Depends(get_meta_con), ): from backend import config as svcconfig from backend.core.duckdb import _safe_table_name, get_schema + # Cross-tenant guard: an analyst session scoped to ``svc-A`` must not + # be able to read ``svc-B``'s schema (custom-field names, types, and + # PII flags). Mirrors the check in ``log_fields_catalog``. + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is not None: + allowed = set(analyst_session.service_ids or []) + if source.get("name") not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": source.get("name")}, + ) + # Try cache first cached_status = svcconfig.get_status(source["name"]) if cached_status and "schema" in cached_status: @@ -187,10 +208,29 @@ def schema_endpoint( @router.get("/log-fields/catalog") @query_errors(status_code=500) -def log_fields_catalog(service_id: str | None = Depends(get_service_id)): +def log_fields_catalog( + request: Request, + service_id: str | None = Depends(get_service_id), +): + """Return the log-fields catalog for the requested service. + + Security: enforce analyst session scope on the requested + ``service_id``. 
Without this, an analyst scoped to ``svc-A`` can pass + ``?service_id=svc-B`` and read svc-B's custom field configuration + (including PII-related field configs). + """ from backend.core import log_fields as lf from backend.core.log_fields import INSIGHT_DEFINITIONS + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is not None and service_id is not None: + allowed = set(analyst_session.service_ids or []) + if service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": service_id}, + ) + # Try to load existing limits field_limits = {} if service_id: @@ -225,11 +265,24 @@ def log_fields_catalog(service_id: str | None = Depends(get_service_id)): @router.get("/insight-availability", response_model=InsightsAvailabilityResponse) @query_errors(status_code=500) def insight_availability( + request: Request, source: dict = Depends(get_source), con: duckdb.DuckDBPyConnection = Depends(get_meta_con), ): from backend.core.duckdb import get_schema + # Cross-tenant guard: insight availability discloses which fields are + # populated (presence/absence of optional columns), so it needs the + # same scope check as the schema endpoint. + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is not None: + allowed = set(analyst_session.service_ids or []) + if source.get("name") not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": source.get("name")}, + ) + actual_cols = {col["name"] for col in get_schema(con, source)} from backend.core.log_fields import INSIGHT_DEFINITIONS diff --git a/backend/routers/debug.py b/backend/routers/debug.py index 028df86a..ab507db1 100644 --- a/backend/routers/debug.py +++ b/backend/routers/debug.py @@ -33,3 +33,20 @@ def clear_sqlite(): """Drain the SQLite ring buffer. 
Manual reset for the Debug Panel.""" sqlite_profiler.clear() return {"ok": True, **sqlite_profiler.buffer_stats()} + + +@router.get("/state") +def debug_state(): + """Report whether the backend will include ``_debug_queries`` / + ``_debug_calls`` arrays in API responses. + + Controlled by the process-level ``DEBUG_RESPONSES`` env var (defaults + OFF in production for security; ON in local-dev ``.env``). The admin + page calls this to dim the "Query debugging panel" + "API call panel" + toggles when the backend won't populate them — so the operator gets + a clear tooltip explaining why their toggle has no effect, instead of + silently flipping a switch that does nothing. + """ + from backend.models.common import _debug_responses_enabled + + return {"debug_responses_enabled": _debug_responses_enabled()} diff --git a/backend/routers/provision.py b/backend/routers/provision.py index 54cce078..f5efed14 100644 --- a/backend/routers/provision.py +++ b/backend/routers/provision.py @@ -178,25 +178,45 @@ def provision_check_fos( return {"ok": False, "error": err_msg, "_debug_calls": get_tracked_calls()} -@router.get("/teardown") -def provision_teardown( - token: str = Query(default=""), - service_id: str | None = Query(default=None), - remove_logging: bool = Query(default=True), - remove_cdn: bool = Query(default=True), - remove_bucket: bool = Query(default=True), - remove_cache: bool = Query(default=True), - remove_cron: bool = Query(default=False), -): +@router.post("/teardown") +def provision_teardown(body: dict | None = None): + """Destructive service teardown over SSE. + + Switched from ``GET`` to ``POST`` to defend against CSRF: a GET + endpoint with side effects can be triggered by any cross-origin + ````, ````, or ``
``. POST routes + require the caller to send a request that browsers do not emit + cross-origin without the user explicitly submitting a form, and + ``Content-Type: application/json`` (sent by the dashboard's fetch + client) puts the request in the CORS-preflighted bucket so the + browser will block silent invocation entirely. + + Body shape: + {token, service_id, remove_logging, remove_cdn, + remove_bucket, remove_cache, remove_cron} + """ + body = body or {} + token: str = str(body.get("token") or "") + service_id: str | None = body.get("service_id") + remove_logging: bool = bool(body.get("remove_logging", True)) + remove_cdn: bool = bool(body.get("remove_cdn", True)) + remove_bucket: bool = bool(body.get("remove_bucket", True)) + remove_cache: bool = bool(body.get("remove_cache", True)) + remove_cron: bool = bool(body.get("remove_cron", False)) from backend import config as svcconfig from backend.core import duckdb as _db from backend.provision import _sync_crontab, perform_teardown + from backend.utils.fastly_auth import validate_destructive_token state = None if service_id: svc_cfg = svcconfig.load_config(service_id) if svc_cfg: - token = token or svc_cfg.get("fastly_api_key", "") + # Security: do NOT fall back to the server-stored + # ``fastly_api_key``. Destructive operations require the caller to + # supply a token that we then validate against Fastly's + # /tokens/self endpoint. The stored key is only used for + # scheduled, non-destructive background sync. prov = svc_cfg.get("provisioning", {}) state = { "logging_service_id": service_id, @@ -215,6 +235,16 @@ def provision_teardown( if not state: raise HTTPException(status_code=404, detail={"error": "No service config found."}) + # Security: destructive teardown (logging / CDN / bucket) requires a + # caller-supplied Fastly token with the ``global`` scope and access to + # this service. 
Cache-only teardown (all three destructive flags false) + # is a local-cleanup operation and does not touch Fastly, so it does not + # require token validation. The /api/provision/ middleware gate ensures + # only local admin requests reach this endpoint regardless. + has_destructive = bool(remove_logging or remove_cdn or remove_bucket) + if has_destructive: + validate_destructive_token(token, service_id=service_id or "") + opts = { "remove_logging": remove_logging, "remove_cdn": remove_cdn, @@ -222,9 +252,7 @@ def provision_teardown( } def stream(): - def yj(data): - yield f"data: {json.dumps(data)}\n\n" - yield f": {' ' * 256}\n\n" + from backend.utils.router_utils import sse_event as yj # local alias preserves the line-level diff # Initial padding to force flush yield from _sse_flush() @@ -556,12 +584,22 @@ def provision_ingest(body: dict): import secrets from backend.provision import ensure_fos_access_key, find_fos_key, parse_period, write_service_config + from backend.utils.fastly_auth import validate_destructive_token from backend.utils.pop_utils import fetch_pop_locations token = body.get("token") if not token: raise HTTPException(status_code=400, detail={"error": "Token is required"}) + # Provisioning writes a service config that the scheduler immediately + # picks up and starts ingesting from. Without a token validation pass + # here the route would mint configs for any service_id reachable by + # the caller's network position, even though the caller may not + # legitimately own that service. ``validate_destructive_token`` + # rejects when scope, bound-services, or tenant don't match. 
+ logging_service_id = body.get("service_id") or body.get("logging_service_id") or "" + validate_destructive_token(token, service_id=logging_service_id) + fetch_pop_locations(token) try: @@ -776,17 +814,43 @@ def provision_check_config( @router.get("/ngwaf-workspaces") def provision_ngwaf_workspaces(service_id: str = Query(...), token: str = Query(default="")): - """List NGWAF workspaces using the provided token or the stored API key.""" + """List NGWAF workspaces for a service. + + Security: previously the endpoint would silently fall back to + the server-stored ``fastly_api_key`` if the caller didn't pass a + token, letting any local-loopback caller enumerate NGWAF workspaces + for any service using the stored credential. Now the caller MUST + present a token, and we accept either: + - the stored ``fastly_api_key`` for this service (constant-time + match — preserves the existing admin UX where the frontend + passes the stored key it just used to fetch workspaces), OR + - a token whose /tokens/self response shows access to this service + (the strict validation path used for the destructive op). + Either way an unauthenticated caller can't enumerate workspaces + even if they reach the loopback admin surface. 
+ """ + import hmac import urllib.error from backend import config as svcconfig from backend.provision import fastly + from backend.utils.fastly_auth import validate_destructive_token token = token.strip() if not token: - token = svcconfig.get_fastly_api_key(service_id) - if not token: - raise HTTPException(status_code=400, detail={"error": "No API key stored for this service."}) + raise HTTPException( + status_code=401, + detail={ + "error": "token_required", + "message": "A Fastly API token is required to list NGWAF workspaces.", + }, + ) + stored = (svcconfig.get_fastly_api_key(service_id) or "").strip() + matches_stored = bool(stored) and hmac.compare_digest(token, stored) + if not matches_stored: + # The validator raises HTTPException(401) on scope / service / + # network failures, which is the right user-visible behavior. + validate_destructive_token(token, service_id=service_id) from backend.utils.router_utils import format_debug_request @@ -850,14 +914,48 @@ def provision_ngwaf_workspaces(service_id: str = Query(...), token: str = Query( @router.patch("/services/{service_id}/ngwaf-workspace") -def provision_set_ngwaf_workspace(service_id: str, body: dict): - """Persist the NGWAF workspace ID for a service and reload the scheduler.""" +def provision_set_ngwaf_workspace(service_id: str, body: dict, token: str = Query(default="")): + """Persist the NGWAF workspace ID for a service and reload the scheduler. + + Security: require the caller to present a Fastly token bound to + this service. Two paths are accepted: + + 1. The caller passes a token that ``/tokens/self`` confirms has the + ``global`` scope and access to ``service_id`` (preferred — admin + can rotate without re-entering the stored key). + 2. The caller passes a token that constant-time-matches the + service's stored ``fastly_api_key`` (the existing admin flow). + + Either way an unauthenticated attacker who can reach the endpoint can't + rebind the workspace because they don't know the token. 
The middleware + /api/provision/ block also gates this for analysts. + """ + import hmac + from backend import config as svcconfig + from backend.utils.fastly_auth import validate_destructive_token cfg = svcconfig.load_config(service_id) if not cfg: raise HTTPException(status_code=404, detail={"error": "Service not found"}) + token = (token or "").strip() + stored = (cfg.get("fastly_api_key") or "").strip() + if not token: + raise HTTPException( + status_code=401, + detail={"error": "token_required", "message": "A Fastly API token is required."}, + ) + + # Fast path: caller presented the stored key. Constant-time compare so + # we don't leak the stored value via timing. + matches_stored = bool(stored) and hmac.compare_digest(token, stored) + if not matches_stored: + # Fall back to the strict scope-validation path. validate_destructive_token + # raises HTTPException(401) on any failure (missing/insufficient scope, + # service mismatch, Fastly unreachable). + validate_destructive_token(token, service_id=service_id) + workspace_id = (body.get("ngwaf_workspace_id") or "").strip() or None cfg["ngwaf_workspace_id"] = workspace_id svcconfig.save_config(service_id, cfg) diff --git a/backend/routers/query.py b/backend/routers/query.py index dad46cd4..b386443e 100644 --- a/backend/routers/query.py +++ b/backend/routers/query.py @@ -2,7 +2,7 @@ from __future__ import annotations -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Request from backend.deps import AnalyticsDeps, get_service_id from backend.models.dashboard import QueryRequest @@ -12,11 +12,22 @@ @router.post("/query") -def query_endpoint(req: QueryRequest, deps: AnalyticsDeps = Depends()): +def query_endpoint( + req: QueryRequest, + request: Request, + deps: AnalyticsDeps = Depends(), + service_id: str | None = Depends(get_service_id), +): sql = req.sql.strip() if not sql: raise HTTPException(status_code=400, detail={"error": "No SQL provided"}) + # 
Stamp session + service onto the validator audit log line so a + # rejection-rate spike from one analyst (attack-shaped probing) is + # observable without grepping correlated logs. + analyst_session = getattr(request.state, "analyst_session", None) + audit_session_id = analyst_session.session_id if analyst_session else "admin" + # Single retry on "Cannot open file" — the local_compaction cron can # delete the file the read_parquet glob just enumerated. The race # window is sub-second; a single retry catches it transparently. @@ -24,9 +35,17 @@ def query_endpoint(req: QueryRequest, deps: AnalyticsDeps = Depends()): for attempt in (1, 2): try: return repo.execute_query( - con=deps.con, src=deps.source, sql=sql, max_rows=req.max_rows, want_explain=req.explain + con=deps.con, + src=deps.source, + sql=sql, + max_rows=req.max_rows, + want_explain=req.explain, + session_id=audit_session_id, + service_id=service_id, ) except PermissionError as e: + # Validator rejections (security) and the legacy + # block both surface as PermissionError → HTTP 403. 
raise HTTPException(status_code=403, detail={"error": str(e)}) except Exception as e: msg = str(e) diff --git a/backend/routers/services/core.py b/backend/routers/services/core.py index 34be7c14..439b28ae 100644 --- a/backend/routers/services/core.py +++ b/backend/routers/services/core.py @@ -251,6 +251,7 @@ def api_cron_schedule(source: dict = Depends(get_source)): "expire": "expire", "alerts_evaluation": "alerts", "ngwaf_sync": "ngwaf_sync", + "metadata_cleanup": "metadata_cleanup", } schedules = [] for job in sched._sched.get_jobs(): @@ -566,8 +567,26 @@ def api_service_log_fields_set(service_id: str, body: LogFieldsUpdateRequest): if not new_lf: raise HTTPException(status_code=400, detail={"error": "log_fields is required"}) old_lf = cfg.get("log_fields", {}) - if "custom_fields" not in new_lf and "custom_fields" in old_lf: + # MERGE GUARD (sibling of 2026-06-02 state_sync fix): preserve + # existing custom_fields unless the caller explicitly provided a + # non-empty replacement. The pre-existing guard only triggered when + # the key was absent — an empty list "custom_fields":[] still + # stripped scoring fields. Treat "absent OR empty" as "no change", + # then if scoring is enabled re-inject the canonical entries from + # code so the routing-table for ingest stays correct. 
+ incoming_custom = new_lf.get("custom_fields") + if not incoming_custom and old_lf.get("custom_fields"): new_lf["custom_fields"] = old_lf["custom_fields"] + if cfg.get("scoring", {}).get("enabled"): + from backend.provision.session_scoring_orchestrator import ( + _SCORING_CUSTOM_FIELDS, + _SCORING_FIELD_NAMES, + ) + + merged = list(new_lf.get("custom_fields") or []) + merged = [cf for cf in merged if cf.get("name") not in _SCORING_FIELD_NAMES] + merged.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS) + new_lf["custom_fields"] = merged new_lf["schema_version"] = 2 old_groups = set(old_lf.get("groups", [])) new_groups = set(new_lf.get("groups", [])) @@ -596,7 +615,12 @@ def api_service_log_fields_set(service_id: str, body: LogFieldsUpdateRequest): } -@router.get("/services/{service_id}/logging-settings/update") +# Security: was @router.get — moved to POST/PATCH so a cross-origin +# `` or `` can no longer trigger a +# state-changing Fastly logging-settings update. The frontend's useSSE +# helper handles POST-with-streaming-response transparently. +@router.post("/services/{service_id}/logging-settings/update") +@router.patch("/services/{service_id}/logging-settings/update") def api_service_update_logging_settings( service_id: str, period: int | None = Query(default=None), @@ -1130,6 +1154,7 @@ def api_import_custom_fields(service_id: str, body: dict): new_custom_map = {**existing_map} now = datetime.now(UTC).isoformat() type_lock_errors: list[str] = [] + validation_errors: list[str] = [] for field_dict in fields_to_import: if "name" not in field_dict: continue @@ -1146,6 +1171,15 @@ def api_import_custom_fields(service_id: str, body: dict): f"Cannot change 'value_type' of '{fname}': field is already committed to the database." 
) continue + # 019: Run the same validator the single-field add/update endpoints + # use, so importing a custom-fields JSON cannot smuggle in a + # field that the interactive editor would have rejected (bad + # name, dangerous VCL expression, oversized byte limit, etc.). + # WARN-level lines are advisory and don't block the write. + other_names = [n for n in new_custom_map if n != fname] + for err in lf_module.validate_custom_field(field_dict, other_names): + if not err.startswith("WARN:"): + validation_errors.append(f"{fname}: {err}") field_dict.pop("created_at", None) field_dict.pop("updated_at", None) field_dict["created_at"] = existing_field.get("created_at", now) @@ -1153,6 +1187,8 @@ def api_import_custom_fields(service_id: str, body: dict): new_custom_map[fname] = field_dict if type_lock_errors: raise HTTPException(status_code=422, detail={"errors": type_lock_errors}) + if validation_errors: + raise HTTPException(status_code=422, detail={"errors": validation_errors}) new_custom = list(new_custom_map.values()) candidate_lf = {**lf, "custom_fields": new_custom} fmt_errors = provision.validate_log_format(candidate_lf) diff --git a/backend/routers/services/cron.py b/backend/routers/services/cron.py index 77303fd1..cc644ab5 100644 --- a/backend/routers/services/cron.py +++ b/backend/routers/services/cron.py @@ -16,7 +16,7 @@ def api_cron_logs( sort: str = Query(default="started_at"), dir: str = Query(default="DESC"), ): - from backend.utils.telemetry import get_tracked_calls + from backend.utils.telemetry import get_queries, get_tracked_calls try: total, entries = get_cron_logs(source["name"], task, status, page, per_page, sort, dir) @@ -25,7 +25,7 @@ def api_cron_logs( "page": page, "per_page": per_page, "entries": entries, - "_debug_queries": [], + "_debug_queries": get_queries(), "_debug_calls": get_tracked_calls(), } except Exception as e: diff --git a/backend/routers/session_scoring.py b/backend/routers/session_scoring.py new file mode 100644 index 
00000000..f6e95ded --- /dev/null +++ b/backend/routers/session_scoring.py @@ -0,0 +1,2308 @@ +"""Session-scoring admin router. + +Three endpoints (mirroring backend/routers/provision.py conventions): + + POST /api/services/{service_id}/scoring/enable + SSE-stream the enable_scoring orchestrator's status events. + + POST /api/services/{service_id}/scoring/disable + SSE-stream the disable_scoring orchestrator's status events. + + GET /api/services/{service_id}/scoring/status + Return the customer's current scoring block ({enabled, scoring_service_id, + scoring_domain, ...}) or {"enabled": false} if not yet wired. + +The actual work lives in +[backend/provision/session_scoring_orchestrator.py](backend/provision/session_scoring_orchestrator.py); +this router just wraps it in the existing SSE event-streaming infrastructure +([backend/provision/orchestrator.py::run_with_events](backend/provision/orchestrator.py#L128)) +so the dashboard can render a progress UI later.""" + +from __future__ import annotations + +import logging +import os + +from fastapi import APIRouter, HTTPException, Path, Query +from fastapi.responses import StreamingResponse + +from backend.utils.router_utils import SSE_HEADERS as _SSE_HEADERS +from backend.utils.router_utils import sse_flush_preamble as _sse_flush + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/api/services", tags=["session-scoring"]) + +# Fields in the per-service scoring config that must never reach the UI. +# AES key is never persisted in config (defensive); request_secret IS in +# the config so it must be stripped before any response that surfaces the +# scoring block (status endpoint, enable SSE done event, etc.). 
+_SECRET_KEYS = frozenset({"aes_key_hex", "request_secret"}) + + +# ── In-process TTL cache for analytics endpoints ───────────────────────────── +# +# The 3 summary endpoints (top-flagged, score-distribution, compliance- +# breakdown) each open a fresh DuckDB connection and pay 30-80ms of +# `_configure_fos` setup under `_fos_proxy_secret_lock` — and that lock +# is process-wide, so 3 concurrent requests serialize. Under cron-writer +# contention the same path can stall up to `max_wait=3` seconds. The +# admin page re-fires all three on every navigation. +# +# A 20-second TTL cache wipes most of that without changing user-facing +# semantics: refreshAll() invalidates React Query keys; we mirror that +# by exposing a bust() that the future "Refresh" button can call (the +# current admin Refresh button only invalidates the client cache, which +# is harmless overlap with this server cache — the next request just +# pulls a fresh server snapshot 20s later, which is fine). +import threading +import time as _time + +_ANALYTICS_TTL_SEC = 20.0 +# Bounded + lazy-reaped. Pre-migration this was a plain dict whose TTL +# was only checked on hit — entries lingered until /scoring/* was hit +# again with the same key. Keys are (endpoint, service_id, since_hours, +# ...) tuples that fan out across the admin UI's 8-card mount + diverse +# time windows, so the cardinality climbed unboundedly across hours. +# 1000 entries × ~50KB scoring payloads = ~50MB worst case — well below +# the dashboard cache but still worth bounding. +from backend.utils.bounded_cache import BoundedTTLCache as _BoundedTTLCache + +_analytics_cache: _BoundedTTLCache = _BoundedTTLCache(maxsize=1000, ttl_seconds=_ANALYTICS_TTL_SEC) +# Global lock guards _analytics_cache + _inflight dict mutations only — +# the actual `producer()` call runs under a PER-KEY lock so concurrent +# misses on DIFFERENT keys (the dashboard's 8-card mount pattern) run +# in parallel instead of queueing through one mutex. 
The cache's own +# RLock is held INSIDE this outer lock by way of the bounded-cache +# implementation; RLock re-entry from the same thread is safe. +_analytics_cache_lock = threading.Lock() +_inflight: dict[tuple, threading.Lock] = {} + + +def _cached(key: tuple, producer): + """Return cached value if fresh, else produce + store. + + In-flight collapse: concurrent callers on the SAME key serialize + through a per-key lock so the underlying query runs once and the + rest get the cached value when they acquire the lock. Concurrent + callers on DIFFERENT keys (the dashboard mount fires 8 endpoints + with 8 different keys) run in parallel — they only contend on the + global lock during the brief cache-lookup + per-key-lock-handoff + window.""" + with _analytics_cache_lock: + # Capture `now` INSIDE the lock so the freshness check evaluates + # against the lock-acquisition timestamp, not a stale value from + # before lock contention drained. A hot key under load can have + # 10-50ms of lock-wait — using a pre-lock `now` would flag a + # still-fresh entry as expired and trigger an extra producer call. + now = _time.monotonic() + entry = _analytics_cache.get(key) + if entry and (now - entry[0]) < _ANALYTICS_TTL_SEC: + return entry[1] + # Miss — claim the per-key lock under the global lock so two + # concurrent misses on the same key don't both create new locks. + key_lock = _inflight.get(key) + if key_lock is None: + key_lock = threading.Lock() + _inflight[key] = key_lock + + # Hold the per-key lock only. The first miss runs producer(); the + # second-through-Nth miss waits here, then sees the cached entry on + # the re-check inside the lock and returns it without re-running. + with key_lock: + with _analytics_cache_lock: + now = _time.monotonic() + entry = _analytics_cache.get(key) + if entry and (now - entry[0]) < _ANALYTICS_TTL_SEC: + return entry[1] + # Actual producer call happens OUTSIDE the global lock so other + # keys can be served while this one is computing. 
+ value = producer() + with _analytics_cache_lock: + # Re-capture now after producer() so the TTL clock starts + # from when the value was actually computed, not from when + # we entered _cached. + _analytics_cache[key] = (_time.monotonic(), value) + # Drop the per-key lock entry — small saving but bounds the + # _inflight dict growth across the long-running TTL window. + _inflight.pop(key, None) + return value + + +def _bust_analytics_cache(service_id: str | None = None) -> None: + """Drop cached entries. Called by mutating endpoints (label flag/ + delete) so the admin page sees their effects on next fetch. + + Cache keys are tuples like ``(endpoint_name, service_id, since_hours, ...)`` + so the service_id lives at index 1, not 0. Earlier code compared + ``k[0] == service_id`` which is always the endpoint name — the bust + was a silent no-op and label mutations only invalidated via the 20s + TTL. Match by membership instead of position so the dedup is correct + regardless of future key shape changes.""" + with _analytics_cache_lock: + if service_id is None: + _analytics_cache.clear() + return + for k in list(_analytics_cache.keys()): + if service_id in k: + del _analytics_cache[k] + + +def _load_matrix(service_id: str | None = None) -> dict | None: + """Load the trained L2 transition matrix. + + Resolution order: + 1. ``compute/scorer/matrix.json`` on local disk (trained + present) + 2. FOS-published copy (``iceberg/meta/scoring_matrix.json``) — + pulled by enable_scoring and retrain. Lets any backend host see + the same matrix the admin host deployed without per-host scp. + 3. ``compute/scorer/matrix.default.json`` (always shipped in the + image) — empty transitions, AUC will read ~0.5 / "BELOW + THRESHOLD" so the StatusPanel pushes the operator to train. + + Returns None only when all three sources fail. + """ + import json as _json + + from backend.provision.session_scoring_orchestrator import _MATRIX_PATH + + # 1. Local trained matrix. 
+ try: + if _MATRIX_PATH.exists(): + with _MATRIX_PATH.open() as f: + m = _json.load(f) + if isinstance(m, dict) and m: + return m + except Exception: + logger.debug("[_load_matrix] local matrix.json read failed", exc_info=True) + + # 2. FOS-published matrix (only when a service id is in scope; the + # AUC endpoint always has one). + if service_id: + try: + from backend.state_sync import fetch_matrix_from_fos + + m = fetch_matrix_from_fos(service_id) + if m: + return m + except Exception: + logger.debug("[_load_matrix] FOS matrix fetch failed", exc_info=True) + + # 3. Default-empty matrix bundled with the image. + default_path = _MATRIX_PATH.parent / "matrix.default.json" + try: + if default_path.exists(): + with default_path.open() as f: + m = _json.load(f) + if isinstance(m, dict) and m: + return m + except Exception: + logger.debug("[_load_matrix] default matrix.json read failed", exc_info=True) + + return None + + +def _fetch_session_events( + service_id: str, + sids: list[str], + since_days: int = 30, + limit_per_sid: int = 500, +) -> dict[str, list[dict]]: + """Return ``{sid: [{ts, url, status, ip, ua, edge_score, edge_cookie_compliance, edge_score_reason}, ...]}`` + for every sid in ``sids`` whose events landed in DuckDB within the + last ``since_days`` days. + + Sids that have no rows in the window are dropped from the result + (not present in the returned dict). The per-sid event cap is a + safety bound — a runaway session with 10k+ requests would otherwise + bloat the response; 500 events covers any realistic browsing pattern. + """ + if not sids: + return {} + + from backend.core.duckdb import _safe_table_name + + table = _safe_table_name(service_id) + placeholders = ",".join("?" for _ in sids) + # 010: push the per-sid LIMIT into SQL via ``row_number() OVER + # (PARTITION BY edge_sid ORDER BY timestamp)``. 
The previous shape + # let DuckDB materialise the full result set in Python before the + # ``len(bucket) >= limit_per_sid`` guard ran — a single attacker + # session with millions of events could OOM the backend before any + # Python code saw a row. The CTE caps at ``limit_per_sid`` rows + # per sid AT THE STORAGE LAYER so the worst-case memory footprint + # is ``len(sids) × limit_per_sid`` regardless of attacker volume. + per_sid_cap = int(limit_per_sid) + sql = f""" + WITH ranked AS ( + SELECT edge_sid, timestamp AS ts, url, status, ip, ua, + edge_score, edge_cookie_compliance, edge_score_reason, + row_number() OVER (PARTITION BY edge_sid ORDER BY timestamp) AS _rn + FROM {table} + WHERE edge_sid IN ({placeholders}) + AND timestamp >= now() - INTERVAL {int(since_days)} DAY + ) + SELECT edge_sid, ts, url, status, ip, ua, + edge_score, edge_cookie_compliance, edge_score_reason + FROM ranked + WHERE _rn <= {per_sid_cap} + ORDER BY edge_sid, ts + """ + rows = _query_logs(service_id, sql, tuple(sids)) + + grouped: dict[str, list[dict]] = {} + for r in rows: + sid = r.get("edge_sid") + if not sid: + continue + bucket = grouped.setdefault(sid, []) + if len(bucket) >= limit_per_sid: + continue + # Stringify the timestamp for JSON serialization. DuckDB returns + # datetime objects which FastAPI's default JSON encoder rejects + # in nested arrays (only the top-level Pydantic model serializer + # handles them). 
+ ts = r.get("ts") + bucket.append( + { + "ts": ts.isoformat() if hasattr(ts, "isoformat") else str(ts) if ts is not None else None, + "url": r.get("url") or "/", + "status": r.get("status"), + "ip": r.get("ip"), + "ua": r.get("ua"), + "edge_score": r.get("edge_score"), + "edge_cookie_compliance": r.get("edge_cookie_compliance"), + "edge_score_reason": r.get("edge_score_reason"), + } + ) + return grouped + + +def _reconstruct_labeled_sessions(service_id: str, labels: list[dict]) -> list[tuple[dict, str]]: + """Replay each labeled sid into the {session_id, events:[{ts,url}]} + shape that ``evaluate()`` expects. + + Each label stores only ``sid`` + sample fields. The actual event + sequence lives in DuckDB as one row per request. We issue ONE query + grouped by edge_sid + ordered by timestamp, then bucket rows into + sessions in Python (DuckDB's ``list()`` aggregate would also work + but the Python side is clearer and the volume is small — at most + ``len(labels)`` sids). + + Returns (session_dict, label) tuples ready to pass to evaluate(). + Sids that don't appear in DuckDB (haven't been ingested yet, or were + rotated away) are dropped silently — they contribute nothing to AUC + either way. + """ + if not labels: + return [] + sid_to_label = {row["sid"]: row["label"] for row in labels if row.get("sid")} + if not sid_to_label: + return [] + grouped = _fetch_session_events(service_id, list(sid_to_label.keys()), since_days=30) + out: list[tuple[dict, str]] = [] + for sid, label in sid_to_label.items(): + events = grouped.get(sid, []) + if not events: + continue # sid never landed in DuckDB; can't evaluate + # max_edge_score is what `evaluate_from_persisted_scores` consumes: + # the actual score the live scorer returned (L1 + L2 + compliance + # combined). Taking the MAX across the session matches the + # production VCL behavior — a session is operationally caught at + # its worst single transition, not its average. 
None-valued + # rows are excluded so a sid with only un-scored events doesn't + # collapse to max_edge_score=0. + scored_values = [e.get("edge_score") for e in events if e.get("edge_score") is not None] + max_score = max(scored_values) if scored_values else None + out.append( + ( + { + "session_id": sid, + "events": events, + "max_edge_score": max_score, + }, + label, + ) + ) + return out + + +def _resolve_token(service_id: str, override_token: str = "") -> str: + """Use the provided token, or fall back to the service-config's + fastly_api_key. Returns empty string if neither is available — the + caller raises an HTTPException in that case.""" + if override_token: + return override_token + from backend import config as svcconfig + + cfg = svcconfig.load_config(service_id) + if cfg: + return cfg.get("fastly_api_key", "") or "" + return "" + + +@router.post("/{service_id}/scoring/enable") +def scoring_enable( + service_id: str = Path(..., description="Logging service ID to enable scoring on"), + token: str = Query(default=""), +): + """Enable session scoring for the given logging service. 
+ + Streams SSE status events while the orchestrator runs through: + Compute service provisioning → Wasm deploy → VCL clone → backend + + snippets + custom fields + format update → validate → activate.""" + resolved_token = _resolve_token(service_id, token) + if not resolved_token: + raise HTTPException( + status_code=400, + detail={"error": "Fastly API token required (pass ?token= or set in service config)"}, + ) + + from backend.provision.orchestrator import run_with_events + from backend.provision.session_scoring_orchestrator import enable_scoring + from backend.utils.router_utils import sse_event + + def stream(): + yield from _sse_flush() + yield from sse_event({"type": "status", "message": f"Enabling session scoring for {service_id}..."}) + + try: + for event in run_with_events(enable_scoring, service_id, resolved_token): + yield from sse_event(event) + yield f": {' ' * 256}\n\n" + # run_with_events captures the return value in its own scope; we + # don't have direct access here. Re-load the config to surface + # the post-state in the final SSE event. 
+ from backend import config as svcconfig + + cfg = svcconfig.load_config(service_id) or {} + scoring = cfg.get("scoring", {}) + safe_scoring = {k: v for k, v in scoring.items() if k not in _SECRET_KEYS} + yield from sse_event( + { + "type": "done", + "message": "Session scoring enabled.", + "scoring": safe_scoring, + } + ) + from backend.core import metadata_db + + metadata_db.record_scoring_audit( + service_id, + "scoring_enabled", + details={ + "scoring_service_id": safe_scoring.get("scoring_service_id"), + "matrix_version": safe_scoring.get("matrix_version"), + }, + ) + metadata_db.record_audit( + service_id=service_id, + event_type="scoring_enabled", + details={ + "scoring_service_id": safe_scoring.get("scoring_service_id"), + "matrix_version": safe_scoring.get("matrix_version"), + }, + actor="operator", + ) + except Exception as e: + logger.exception("scoring_enable failed for %s", service_id) + yield from sse_event({"type": "error", "message": str(e)}) + + return StreamingResponse(stream(), media_type="text/event-stream", headers=_SSE_HEADERS) + + +@router.post("/{service_id}/scoring/disable") +def scoring_disable( + service_id: str = Path(..., description="Logging service ID to disable scoring on"), + token: str = Query(default=""), +): + """Disable session scoring. 
@router.get("/{service_id}/scoring/status")
def scoring_status(
    service_id: str = Path(..., description="Logging service ID"),
) -> dict:
    """Report the service's scoring configuration with secrets removed.

    Responds 404 when the service has no config at all. When the config
    has no ``scoring`` block, or the block is not enabled, the body is
    ``{"enabled": False}``. Otherwise the scoring block is returned minus
    every key listed in ``_SECRET_KEYS``.
    """
    from backend import config as svcconfig

    service_cfg = svcconfig.load_config(service_id)
    if not service_cfg:
        raise HTTPException(status_code=404, detail={"error": f"No config for service {service_id}"})

    scoring_block = service_cfg.get("scoring")
    if scoring_block and scoring_block.get("enabled"):
        public_view = {}
        for field, value in scoring_block.items():
            if field in _SECRET_KEYS:
                continue  # never surface secret material to the UI
            public_view[field] = value
        return public_view
    return {"enabled": False}
@router.get("/{service_id}/scoring/labels")
def scoring_labels_list(
    service_id: str = Path(..., description="Logging service ID"),
    limit: int = Query(default=500, ge=1, le=10000),
) -> dict:
    """List a service's session labels together with per-label counts.

    ``limit`` (1..10000, default 500) is forwarded to ``list_labels``. The
    row ordering is whatever ``list_labels`` returns (the previous docstring
    described it as most recent first; not verifiable from this module).

    Returns:
        ``{"labels": <rows>, "counts": <counts_by_label result>}``.
    """
    from backend.scoring import labels as label_store

    return {
        "labels": label_store.list_labels(service_id, limit=limit),
        "counts": label_store.counts_by_label(service_id),
    }
+ _bust_analytics_cache(service_id) + return row + + +@router.patch("/{service_id}/scoring/labels/{label_id}") +def scoring_labels_update( + body: dict, + service_id: str = Path(...), + label_id: str = Path(...), +) -> dict: + from backend.scoring import labels as _labels + + try: + row = _labels.update_label( + service_id, + label_id, + label=body.get("label"), + notes=body.get("notes"), + ) + except ValueError as e: + raise HTTPException(status_code=400, detail={"error": str(e)}) + if not row: + raise HTTPException(status_code=404, detail={"error": "label not found"}) + _bust_analytics_cache(service_id) + return row + + +@router.delete("/{service_id}/scoring/labels/{label_id}") +def scoring_labels_delete( + service_id: str = Path(...), + label_id: str = Path(...), +) -> dict: + from backend.scoring import labels as _labels + + result = _labels.delete_label(service_id, label_id) + _bust_analytics_cache(service_id) + return result + + +# ── Summary queries (top-flagged, distributions) ──────────────────────────── + + +def _query_logs(service_id: str, sql: str, params: tuple = ()) -> list[dict]: + """Tiny helper — run a SELECT against the per-service logs view and + return list[dict]. + + Why the try/finally + explicit close: get_connection() opens a fresh + DuckDB connection per call by design (independent connections beat + shared-cursor serialization under load — see backend/core/duckdb.py). + Leaving them open here was the root cause of constant .duckdb-wal / + .duckdb-shm file churn that ate ~1.5GB of mds_stores + VS Code + extension-host RAM during the 2026-06-01 admin-page polling crash. + Mirrors the canonical pattern from backend/routers/query.py. + + ``params`` is passed through to ``con.execute`` so callers can use + parametrized queries (e.g. 
``WHERE edge_sid IN (?, ?, ?)``) without + string-formatting user-controlled values into the SQL.""" + from backend.core.duckdb import get_connection, get_source_for_service + + src = get_source_for_service(service_id) + if src is None: + raise HTTPException(status_code=404, detail={"error": f"No service {service_id}"}) + con = None + try: + con = get_connection(source=src, max_wait=3, skip_view_update=True, read_only=True) + rows = con.execute(sql, params).fetchall() if params else con.execute(sql).fetchall() + cols = [d[0] for d in con.description] if con.description else [] + return [dict(zip(cols, r)) for r in rows] + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=400, detail={"error": str(e)}) + finally: + if con is not None: + try: + con.close() + except Exception: + pass + + +@router.get("/{service_id}/scoring/top-flagged") +def scoring_top_flagged( + service_id: str = Path(...), + since_hours: int = Query(default=24, ge=1, le=168), + limit: int = Query(default=50, ge=1, le=500), +) -> dict: + """Recent rows with non-null edge_score, sorted by score DESC. Feeds + the admin page's "Top flagged sessions" table. + + Returns up to ``limit`` rows. Each row carries enough context for the + admin to label it: sid, ip, ua, url, the three score fields, and + cookie_compliance. 
@router.get("/{service_id}/scoring/score-distribution")
def scoring_score_distribution(
    service_id: str = Path(...),
    since_hours: int = Query(default=24, ge=1, le=168),
) -> dict:
    """Histogram data: scored rows counted per hour and per score band.

    Rows with a non-null ``edge_score`` from the last ``since_hours`` hours
    are grouped by hour and by one of four bands ('0-25', '25-50', '50-75',
    '75-100'). The response is ``{"rows": [{hour, bucket, count}, ...],
    "since_hours": since_hours}``; pivoting is left to the frontend.
    Results go through ``_cached`` keyed on ``(service_id, since_hours)``.
    """
    from backend.core.duckdb import _safe_table_name

    logs_table = _safe_table_name(service_id)
    query = f"""
        SELECT
            date_trunc('hour', timestamp) AS hour,
            CASE
                WHEN edge_score < 25 THEN '0-25'
                WHEN edge_score < 50 THEN '25-50'
                WHEN edge_score < 75 THEN '50-75'
                ELSE '75-100'
            END AS bucket,
            COUNT(*) AS count
        FROM {logs_table}
        WHERE edge_score IS NOT NULL
          AND timestamp >= now() - INTERVAL {int(since_hours)} HOUR
        GROUP BY 1, 2
        ORDER BY 1, 2
    """

    def _snapshot() -> dict:
        # Only runs on a cache miss.
        return {"rows": _query_logs(service_id, query), "since_hours": since_hours}

    cache_key = ("score-distribution", service_id, since_hours)
    return _cached(cache_key, _snapshot)
@router.get("/{service_id}/scoring/health")
def scoring_health(
    service_id: str = Path(...),
    since_hours: int = Query(default=24, ge=1, le=168),
) -> dict:
    """One-query scoring health summary for the admin dashboard.

    Over rows with ``edge = true`` from the last ``since_hours`` hours the
    response reports:

    * ``total_edge_rows`` / ``scored_rows`` and their ratio ``fire_rate_pct``
    * ``distinct_sids`` among scored rows with a non-empty ``edge_sid``
    * ``avg_score`` / ``p50_score`` / ``p95_score`` / ``max_score``
    * ``scorer_errors``: scored rows whose ``edge_score_reason`` contains
      ``compute-unavailable`` or ``unauthorized`` (case-insensitive match)
    * ``top_reasons``: the ten most frequent values obtained by splitting
      ``edge_score_reason`` on commas
    * ``matrix_staleness``: share of L2-evaluated rows with
      ``edge_score_l2 >= 50``; flagged stale when at least 100 rows were
      evaluated and that share exceeds 25%.

    The result goes through ``_cached`` keyed on ``(service_id, since_hours)``.
    """
    from backend.core.duckdb import _safe_table_name

    logs_table = _safe_table_name(service_id)
    window_hours = int(since_hours)
    query = f"""
        WITH edge_rows AS (
            SELECT
                edge_score,
                edge_score_l2,
                edge_score_reason,
                edge_cookie_compliance,
                edge_sid
            FROM {logs_table}
            WHERE edge = true
              AND timestamp >= now() - INTERVAL {window_hours} HOUR
        ),
        scored AS (
            SELECT * FROM edge_rows WHERE edge_score IS NOT NULL
        ),
        reason_rows AS (
            SELECT trim(reason) AS reason
            FROM (
                SELECT unnest(string_split(edge_score_reason, ',')) AS reason
                FROM scored
                WHERE edge_score_reason IS NOT NULL AND edge_score_reason != ''
            )
            WHERE trim(reason) != ''
        ),
        top_reasons AS (
            SELECT reason, COUNT(*) AS n
            FROM reason_rows
            GROUP BY reason
            ORDER BY n DESC
            LIMIT 10
        )
        SELECT
            (SELECT COUNT(*) FROM edge_rows) AS total_edge_rows,
            (SELECT COUNT(*) FROM scored) AS scored_rows,
            (SELECT COUNT(DISTINCT edge_sid) FROM scored WHERE edge_sid <> '') AS distinct_sids,
            (SELECT AVG(edge_score) FROM scored) AS avg_score,
            (SELECT quantile_cont(edge_score, 0.5) FROM scored) AS p50_score,
            (SELECT quantile_cont(edge_score, 0.95) FROM scored) AS p95_score,
            (SELECT MAX(edge_score) FROM scored) AS max_score,
            (SELECT COUNT(*) FROM scored
              WHERE edge_score_reason ILIKE '%compute-unavailable%'
                 OR edge_score_reason ILIKE '%unauthorized%') AS scorer_errors,
            (SELECT list({{'reason': reason, 'count': n}}) FROM top_reasons) AS top_reasons,
            -- Matrix-staleness signal: fraction of scored rows whose L2
            -- transition score is "high" (≥50), meaning the matrix gave
            -- that transition low probability. When this fraction climbs
            -- (3× baseline per research §7.5), the matrix is drifting
            -- relative to current traffic and a retrain is warranted.
            -- L2 of 0 means "didn't trip a rare-transition rule" — used
            -- in the denominator of the staleness fraction filtered to
            -- rows we ACTUALLY evaluated L2 on (excludes cookie-missing
            -- shortcuts that bypass the matrix).
            (SELECT COUNT(*) FROM scored WHERE edge_score_l2 IS NOT NULL) AS l2_evaluated,
            (SELECT COUNT(*) FROM scored WHERE edge_score_l2 >= 50) AS l2_high_count
    """

    def _produce() -> dict:
        result_rows = _query_logs(service_id, query)
        row = result_rows[0] if result_rows else {}

        def _count(column: str) -> int:
            # NULL / missing aggregates collapse to 0.
            return int(row.get(column) or 0)

        def _stat(column: str) -> float:
            return float(row.get(column) or 0)

        total = _count("total_edge_rows")
        scored = _count("scored_rows")
        fire_rate_pct = (scored / total * 100.0) if total else 0.0

        l2_evaluated = _count("l2_evaluated")
        l2_high = _count("l2_high_count")
        l2_high_pct = (l2_high / l2_evaluated * 100.0) if l2_evaluated else 0.0
        # No stored baseline exists yet, so staleness is a fixed band: at
        # least 100 L2-evaluated rows and more than 25% of them scoring high.
        matrix_stale = l2_evaluated >= 100 and l2_high_pct > 25.0

        staleness = {
            "l2_evaluated": l2_evaluated,
            "l2_high_count": l2_high,
            "l2_high_pct": round(l2_high_pct, 2),
            "is_stale": matrix_stale,
            "threshold_pct": 25.0,
        }
        return {
            "since_hours": since_hours,
            "total_edge_rows": total,
            "scored_rows": scored,
            "fire_rate_pct": round(fire_rate_pct, 2),
            "distinct_sids": _count("distinct_sids"),
            "avg_score": _stat("avg_score"),
            "p50_score": _stat("p50_score"),
            "p95_score": _stat("p95_score"),
            "max_score": _count("max_score"),
            "scorer_errors": _count("scorer_errors"),
            "top_reasons": row.get("top_reasons") or [],
            "matrix_staleness": staleness,
        }

    return _cached(("scoring-health", service_id, since_hours), _produce)
+ + Cached for 20s under the existing _cached pattern; the key includes + the label count so a fresh label naturally invalidates the cache + (also, _bust_analytics_cache fires on label POST/PATCH/DELETE). + """ + from backend.scoring import labels as _labels + + label_rows = _labels.list_labels(service_id) + counts = _labels.counts_by_label(service_id) + n_good = counts.get("good", 0) + n_bad = counts.get("bad", 0) + n_neutral = counts.get("neutral", 0) + + # The cache key includes the label count so a new label invalidates + # the previous snapshot even if the explicit cache bust were to miss. + cache_key = ("scoring-evaluation", service_id, n_good, n_bad, n_neutral) + + def _produce() -> dict: + from backend.config import load_config + + cfg = load_config(service_id) or {} + matrix_version = (cfg.get("scoring") or {}).get("matrix_version") or "unknown" + + if n_good < _MIN_LABELS_PER_CLASS or n_bad < _MIN_LABELS_PER_CLASS: + return { + "has_min_samples": False, + "min_per_class": _MIN_LABELS_PER_CLASS, + "n_good": n_good, + "n_bad": n_bad, + "n_neutral": n_neutral, + "matrix_version": matrix_version, + } + + matrix = _load_matrix(service_id) + if matrix is None: + return { + "has_min_samples": True, + "min_per_class": _MIN_LABELS_PER_CLASS, + "n_good": n_good, + "n_bad": n_bad, + "n_neutral": n_neutral, + "matrix_version": matrix_version, + "error": "Trained matrix is missing on disk (compute/scorer/matrix.json). " + "Run scripts/scoring/train.py to produce one.", + } + + labeled_sessions = _reconstruct_labeled_sessions(service_id, label_rows) + # If most labeled sids haven't landed in DuckDB yet (fresh label + # → ingest lag), evaluate against what we have but flag the gap. + n_reconstructed = len(labeled_sessions) + from backend.scoring.evaluate import DEFAULT_MIN_AUC, evaluate_from_persisted_scores + + # Use persisted edge_score (L1+L2+compliance combined, what the + # live scorer actually returned) rather than recomputing L2 from + # events. 
@router.get("/{service_id}/scoring/curves")
def scoring_curves(
    service_id: str = Path(...),
) -> dict:
    """ROC and precision/recall curve points for the operator's labeled sessions.

    Each labeled session contributes its ``max_edge_score`` (as produced by
    ``_reconstruct_labeled_sessions``); "bad" labels are the positive class
    and "good" labels the negative class. Every integer threshold 100..0 is
    one operating point (a session is flagged when ``score >= threshold``):

    * ``roc``: ``{threshold, fpr, tpr}``
    * ``pr``:  ``{threshold, precision, recall}``

    ``auc`` and ``average_precision`` are trapezoidal areas under those
    points. Both integrations are anchored at recall/FPR 0 so sessions that
    score exactly 100 are not dropped from the area: without the anchor the
    sweep starts at the threshold-100 operating point rather than the
    origin (e.g. all sessions scoring 100 gave AUC 0 instead of 0.5).

    Returns ``has_min_samples: False`` (so the UI shows the "label more
    sessions" call to action) when either class has fewer than
    ``_MIN_LABELS_PER_CLASS`` labels, when no labeled session has a
    persisted score yet, or when only one class has scored sessions, in
    which case TPR or FPR is undefined.
    """
    from backend.scoring import labels as _labels

    label_rows = _labels.list_labels(service_id)
    counts = _labels.counts_by_label(service_id)
    n_good = counts.get("good", 0)
    n_bad = counts.get("bad", 0)

    if n_good < _MIN_LABELS_PER_CLASS or n_bad < _MIN_LABELS_PER_CLASS:
        return {
            "has_min_samples": False,
            "min_per_class": _MIN_LABELS_PER_CLASS,
            "n_good": n_good,
            "n_bad": n_bad,
        }

    # Same reconstruction path as the evaluation endpoint, so the curve and
    # the headline AUC are computed from the same per-session scores.
    labeled_sessions = _reconstruct_labeled_sessions(service_id, label_rows)
    scored: list[tuple[int, str]] = []
    for session, label in labeled_sessions:
        if label not in ("good", "bad"):
            continue
        score = session.get("max_edge_score")
        if score is None:
            continue
        scored.append((int(score), label))

    if not scored:
        return {
            "has_min_samples": False,
            "min_per_class": _MIN_LABELS_PER_CLASS,
            "n_good": n_good,
            "n_bad": n_bad,
            "note": "labels exist but none of their sids have landed in DuckDB yet",
        }

    bad_scores = [s for s, lbl in scored if lbl == "bad"]
    good_scores = [s for s, lbl in scored if lbl == "good"]
    total_pos = len(bad_scores)
    total_neg = len(good_scores)

    if total_pos == 0 or total_neg == 0:
        # Label counts passed the gate but only one class made it through
        # reconstruction; a curve built from one class would report a
        # meaningless AUC of 0, so answer like the other "not enough data"
        # branches instead.
        return {
            "has_min_samples": False,
            "min_per_class": _MIN_LABELS_PER_CLASS,
            "n_good": n_good,
            "n_bad": n_bad,
            "note": "only one label class has scored sessions in DuckDB; ROC/PR are undefined",
        }

    roc: list[dict] = []
    pr: list[dict] = []
    # Sweep from the strictest cutoff to the loosest so both curves move
    # towards higher recall / FPR as the list is read front to back.
    for t in range(100, -1, -1):
        tp = sum(1 for s in bad_scores if s >= t)
        fp = sum(1 for s in good_scores if s >= t)
        tpr = tp / total_pos
        fpr = fp / total_neg
        # Convention: nothing flagged -> precision 1.
        precision = (tp / (tp + fp)) if (tp + fp) else 1.0
        roc.append({"threshold": t, "fpr": round(fpr, 4), "tpr": round(tpr, 4)})
        pr.append({"threshold": t, "precision": round(precision, 4), "recall": round(tpr, 4)})

    def _trapz(points: list[tuple[float, float]]) -> float:
        """Trapezoidal area under ``points`` after a stable sort on x."""
        if len(points) < 2:
            return 0.0
        pts = sorted(points, key=lambda p: p[0])
        area = 0.0
        for i in range(1, len(pts)):
            x0, y0 = pts[i - 1]
            x1, y1 = pts[i]
            area += (x1 - x0) * (y0 + y1) / 2.0
        return area

    # Anchors are placed first so the stable sort keeps them ahead of any
    # swept point that shares x == 0. ROC starts at the origin; PR extends
    # the first operating point's precision flat back to recall 0. When the
    # sweep already starts at x == 0 the anchor adds a zero-width segment
    # and changes nothing.
    auc = _trapz([(0.0, 0.0)] + [(p["fpr"], p["tpr"]) for p in roc])
    ap = _trapz([(0.0, pr[0]["precision"])] + [(p["recall"], p["precision"]) for p in pr])

    return {
        "has_min_samples": True,
        "min_per_class": _MIN_LABELS_PER_CLASS,
        "n_good": total_neg,
        "n_bad": total_pos,
        "n_labels_total": len(label_rows),
        "auc": round(float(auc), 4),
        "average_precision": round(float(ap), 4),
        "roc": roc,
        "pr": pr,
    }
+ + For the last ``since_hours`` of edge traffic, count: + - total scored requests + - how many would be flagged (edge_score >= threshold) + - of those, how many are labeled good / bad / unlabeled + - same breakdown for the un-flagged tail + + This is the underlying data for the operator-facing slider: drag + threshold up → fewer flags but you start missing labeled-bad + sessions; drag down → catches more bad but also flags some labeled- + good (false positives). The 2x2 confusion matrix readout is enough + to eyeball the right cutoff. + + Cached 30s under the existing ``_cached`` pattern; the cache key + includes the threshold so dragging the slider re-fetches. + """ + from backend.core.duckdb import _safe_table_name + from backend.scoring import labels as _labels + + table = _safe_table_name(service_id) + interval = int(since_hours) + threshold_int = int(threshold) + + def _produce() -> dict: + # Build the label index in Python — small (≤10k labels) and + # avoids a JOIN against SQLite (which would need ATTACH overhead). + label_rows = _labels.list_labels(service_id) + sid_to_label = {row["sid"]: row["label"] for row in label_rows if row.get("sid")} + + # 009: push the bucketing into SQL so a service with millions of + # distinct edge_sids in the window can't OOM the backend. The + # old shape materialised one Python dict per sid before doing + # any bucketing; for a high-traffic service that's a few + # gigabytes of dicts. + # + # Two queries now run: + # (a) one aggregate row across all sids (total + flagged + + # passed counts — fixed-size result regardless of fleet + # size); + # (b) only the LABELED sids (bounded by label storage; the + # UI caps practical label sets in the low thousands). + # + # Python then computes the labeled splits and derives the + # unlabeled splits by subtraction. Worst-case materialisation + # is ``len(sid_to_label)`` rows — no longer attacker-controlled. 
+ agg_sql = f""" + WITH sid_scores AS ( + SELECT edge_sid, MAX(edge_score) AS max_score + FROM {table} + WHERE edge = true + AND edge_score IS NOT NULL + AND edge_sid IS NOT NULL + AND edge_sid <> '' + AND timestamp >= now() - INTERVAL {interval} HOUR + GROUP BY edge_sid + ) + SELECT + COUNT(*) AS total, + SUM(CASE WHEN max_score >= {threshold_int} THEN 1 ELSE 0 END) AS flagged_total, + SUM(CASE WHEN max_score < {threshold_int} THEN 1 ELSE 0 END) AS passed_total + FROM sid_scores + """ + agg_rows = _query_logs(service_id, agg_sql) + agg = agg_rows[0] if agg_rows else {} + total = int(agg.get("total") or 0) + flagged_total = int(agg.get("flagged_total") or 0) + passed_total = int(agg.get("passed_total") or 0) + + flagged_good = flagged_bad = passed_good = passed_bad = 0 + labeled_sids = [s for s in sid_to_label if s] + if labeled_sids: + placeholders = ",".join("?" for _ in labeled_sids) + label_sql = f""" + SELECT edge_sid, MAX(edge_score) AS max_score + FROM {table} + WHERE edge_sid IN ({placeholders}) + AND edge = true + AND edge_score IS NOT NULL + AND timestamp >= now() - INTERVAL {interval} HOUR + GROUP BY edge_sid + """ + label_rows_sql = _query_logs(service_id, label_sql, tuple(labeled_sids)) + for r in label_rows_sql: + sid = r.get("edge_sid") + score = r.get("max_score") or 0 + label = sid_to_label.get(sid) + if label not in ("good", "bad"): + continue + if score >= threshold_int: + if label == "good": + flagged_good += 1 + else: + flagged_bad += 1 + else: + if label == "good": + passed_good += 1 + else: + passed_bad += 1 + + # Unlabeled buckets fall out by subtraction — every flagged/passed + # sid that didn't match a label-row is unlabeled. + flagged_unlabeled = max(0, flagged_total - (flagged_good + flagged_bad)) + passed_unlabeled = max(0, passed_total - (passed_good + passed_bad)) + flagged = flagged_good + flagged_bad + flagged_unlabeled + # Operator-friendly precision/recall against the labels we DO + # have. 
@router.post("/{service_id}/scoring/retrain")
def scoring_retrain(
    service_id: str = Path(...),
    since_days: int = Query(default=7, ge=1, le=90, description="Window of DuckDB traffic to train on"),
    version: str | None = Query(
        default=None,
        description="Override matrix version label; defaults to today's date",
        # Same constraints the restore endpoint puts on its ``version``
        # path parameter, so every retrained matrix can be restored later.
        pattern=r"^[A-Za-z0-9._-]+$",
        max_length=64,
    ),
) -> dict:
    """Build a fresh transition matrix from the last N days of DuckDB
    traffic, save it to ``compute/scorer/matrix.json``, publish to FOS,
    and evaluate AUC against the operator's accumulated labels.

    Synchronous. The response carries the new matrix metadata + AUC so
    the UI can show how the matrix moved after the retrain. The Wasm
    build + Compute deploy is a separate step; the response includes a
    hint pointing at ``scripts/scoring/deploy_wasm.sh``.

    Pipeline:
      1. extract_traces from DuckDB → in-memory sessions
      2. build_matrix → TransitionMatrix
      3. evaluate AUC against labels (only when both classes have at
         least ``_MIN_LABELS_PER_CLASS`` labels)
      4. Save matrix.json to disk + publish to FOS
      5. Bust the analytics cache and record an audit row

    Raises:
        HTTPException 404: no source is registered for ``service_id``.
        HTTPException 500: the matrix could be persisted neither locally
            nor to FOS, so nothing would pick it up.
    """
    import datetime as _dt
    import json as _json

    from backend import config as svcconfig
    from backend.core import metadata_db
    from backend.core.duckdb import get_connection, get_source_for_service
    from backend.provision.session_scoring_orchestrator import _MATRIX_PATH
    from backend.scoring import fixtures as _fixtures
    from backend.scoring import labels as _labels
    from backend.scoring import matrix as _matrix
    from backend.scoring.evaluate import DEFAULT_MIN_AUC
    from backend.scoring.evaluate import evaluate as _evaluate

    src = get_source_for_service(service_id)
    if src is None:
        raise HTTPException(status_code=404, detail={"error": f"No service {service_id}"})
    cfg = svcconfig.load_config(service_id) or {}
    # Sample the clock once so the version label and the training
    # window derive from the same instant.
    now = _dt.datetime.now(_dt.UTC)
    matrix_version = version or now.strftime("%Y-%m-%d-r")
    start = now - _dt.timedelta(days=int(since_days))

    # 1. Extract sessions from DuckDB over a read-only connection.
    con = get_connection(source=src, max_wait=3, skip_view_update=True, read_only=True)
    try:
        sessions_iter = _fixtures.extract_traces(con, service_id=service_id, start=start)
        # 2. Build matrix in one streaming pass.
        tmatrix, stats = _matrix.build_matrix(
            (s.to_jsonl_dict() for s in sessions_iter),
        )
    finally:
        try:
            con.close()
        except Exception as exc:
            # Best-effort close; never let it mask the real outcome.
            logger.debug("Ignoring error closing DuckDB connection after retrain: %s", exc)

    matrix_dict = tmatrix.to_json_dict(version=matrix_version)

    # 3. Evaluate against accumulated labels if we have enough of each.
    auc_result = None
    label_rows = _labels.list_labels(service_id)
    counts = _labels.counts_by_label(service_id)
    if counts.get("good", 0) >= _MIN_LABELS_PER_CLASS and counts.get("bad", 0) >= _MIN_LABELS_PER_CLASS:
        labeled_sessions = _reconstruct_labeled_sessions(service_id, label_rows)
        if labeled_sessions:
            er = _evaluate(matrix_dict, labeled_sessions)
            auc_result = {
                "auc": round(float(er.auc), 4),
                "passed": bool(er.passed),
                "threshold": float(er.pass_threshold),
                "n_good": er.n_good,
                "n_bad": er.n_bad,
            }

    # 4. Save matrix.json + publish to FOS. Each store is best-effort on
    #    its own (e.g. a read-only image mount blocks the local write),
    #    but at least one of them has to succeed.
    local_saved = False
    try:
        _MATRIX_PATH.parent.mkdir(parents=True, exist_ok=True)
        with _MATRIX_PATH.open("w") as f:
            _json.dump(matrix_dict, f)
        local_saved = True
    except Exception as exc:
        logger.warning("Could not write matrix.json locally: %s", exc)

    fos_published = False
    try:
        from backend.state_sync import publish_matrix_to_fos

        publish_matrix_to_fos(service_id, matrix_dict)
        fos_published = True
    except Exception as exc:
        logger.warning("Could not publish matrix to FOS: %s", exc)

    if not (local_saved or fos_published):
        # Nothing durable holds the new matrix. Reporting success (and
        # auditing a retrain) here would claim a matrix that no endpoint
        # can ever load.
        raise HTTPException(
            status_code=500,
            detail={"error": "Retrained matrix could not be saved locally or published to FOS"},
        )

    # 5. Bust analytics caches so the next StatusPanel hit sees the new AUC.
    _bust_analytics_cache(service_id)

    # Operator audit: every retrain is attributable + reviewable.
    metadata_db.record_scoring_audit(
        service_id,
        "matrix_retrained",
        details={
            "matrix_version": matrix_version,
            "since_days": since_days,
            "sessions_trained_on": tmatrix.session_count,
            "auc_against_labels": auc_result,
            "fos_published": fos_published,
        },
    )

    return {
        "ok": True,
        "matrix_version": matrix_version,
        "since_days": since_days,
        "sessions_trained_on": tmatrix.session_count,
        "transitions": tmatrix.transition_count,
        "vocab_size": len(tmatrix.vocab),
        "rejected": {
            "too_few_events": stats.sessions_dropped_short,
            "too_fast": stats.sessions_dropped_fast,
            "kept": stats.sessions_kept,
            "routes_seen": stats.routes_seen,
        },
        "auc_against_labels": auc_result,
        "default_min_auc": float(DEFAULT_MIN_AUC),
        "local_matrix_saved": local_saved,
        "fos_matrix_published": fos_published,
        "deploy_hint": (
            "Run scripts/scoring/deploy_wasm.sh --service-id "
            f"{(cfg.get('scoring') or {}).get('scoring_service_id', '?')} from your local box "
            "to embed this matrix into the Wasm and push to Fastly Compute. "
            "Until then the live scorer keeps using its previously-embedded matrix; "
            "the /scoring/evaluation endpoint will reflect the new matrix immediately "
            "(it reads matrix.json + FOS, not the deployed Wasm)."
        ),
    }
# ── Threshold enforcement (live blocking via Compute ConfigStore) ──────────


_ENFORCE_THRESHOLD_KEY = "enforce_threshold"


def _decode_stored_threshold(raw: object) -> int | None:
    """Turn a ConfigStore ``item_value`` into an int, or None when unset.

    Anything that is not a plain run of ASCII digits (empty string,
    missing value, non-string junk) reads as "no enforcement".
    """
    text = "" if raw is None else str(raw).strip()
    # isascii() guards int() against characters such as superscript
    # digits, which str.isdigit() accepts but int() rejects.
    if text and text.isascii() and text.isdigit():
        return int(text)
    return None


def _coerce_requested_threshold(raw: object) -> int | None:
    """Strictly convert a JSON ``threshold`` value to int (None passes through).

    Raises:
        ValueError: for booleans, non-integral floats (incl. NaN/inf),
            non-numeric strings, and any other JSON type.
    """
    if raw is None:
        return None
    if isinstance(raw, bool):
        # bool is an int subclass; int(True) == 1 would silently enforce
        # at threshold 1.
        raise ValueError("boolean is not a threshold")
    if isinstance(raw, int):
        return raw
    if isinstance(raw, float):
        if not raw.is_integer():
            raise ValueError("threshold must be a whole number")
        return int(raw)
    if isinstance(raw, str):
        return int(raw)
    raise ValueError(f"unsupported threshold type: {type(raw).__name__}")


@router.get("/{service_id}/scoring/enforce-threshold")
def scoring_enforce_threshold_get(
    service_id: str = Path(...),
    token: str = Query(default=""),
) -> dict:
    """Read the live enforce_threshold value from the scoring_config
    Compute ConfigStore. ``threshold`` is None when enforcement is off.

    The Rust scorer reads this on every request — when set AND the
    request's score >= threshold, it emits X-Edge-Score-Enforce: 1,
    which the SCORING_ENFORCE_NAME VCL snippet turns into a 429.

    Raises:
        HTTPException 400: scoring config store or Fastly token missing.
        HTTPException 502: the ConfigStore read failed for a reason
            other than the key being absent.
    """
    from backend import config as svcconfig
    from backend.core.fastly.client import fastly

    cfg = svcconfig.load_config(service_id) or {}
    scoring = cfg.get("scoring") or {}
    config_store_id = scoring.get("scoring_config_store_id")
    if not config_store_id:
        raise HTTPException(status_code=400, detail={"error": "Scoring not enabled or config store missing"})

    resolved_token = _resolve_token(service_id, token)
    if not resolved_token:
        raise HTTPException(status_code=400, detail={"error": "Fastly API token required"})

    threshold: int | None
    try:
        item = fastly(
            "GET",
            f"/resources/stores/config/{config_store_id}/item/{_ENFORCE_THRESHOLD_KEY}",
            token=resolved_token,
        )
    except RuntimeError as exc:
        # 404 from ConfigStore = key not present = enforcement not set.
        if "404" in str(exc):
            threshold = None
        else:
            logger.exception("scoring_enforce_threshold_get failed for %s", service_id)
            raise HTTPException(
                status_code=502,
                detail={"error": f"failed to read enforce threshold: {exc}"},
            )
    else:
        threshold = _decode_stored_threshold((item or {}).get("item_value", ""))

    return {
        "threshold": threshold,
        "enforced": threshold is not None,
        "key": _ENFORCE_THRESHOLD_KEY,
    }


@router.put("/{service_id}/scoring/enforce-threshold")
def scoring_enforce_threshold_put(
    body: dict,
    service_id: str = Path(...),
    token: str = Query(default=""),
    confirm: bool = Query(default=False, description="Set true to actually apply the enforcement change"),
) -> dict:
    """Write the live enforce_threshold to the scoring_config ConfigStore.
    Pass ``{"threshold": null}`` to clear (disable enforcement).

    ``threshold`` must be a whole number 0-100 (an int, an integral
    float such as ``75.0``, or a numeric string). Booleans and
    fractional values are rejected rather than silently coerced.

    Audited to scoring_audit so the operator can review when
    enforcement was flipped on/off. Gated by ``?confirm=true`` so an
    accidental click can't silently flip enforcement at the edge.

    Raises:
        HTTPException 400: missing confirm flag, config store, token,
            or an invalid threshold.
        HTTPException 500: both the PATCH and the POST fallback failed.
    """
    if not confirm:
        raise HTTPException(
            status_code=400,
            detail={"error": "Pass ?confirm=true to actually change enforcement. This affects live edge blocking."},
        )

    from backend import config as svcconfig
    from backend.core import metadata_db
    from backend.core.fastly.client import fastly

    cfg = svcconfig.load_config(service_id) or {}
    scoring = cfg.get("scoring") or {}
    config_store_id = scoring.get("scoring_config_store_id")
    if not config_store_id:
        raise HTTPException(status_code=400, detail={"error": "Scoring not enabled or config store missing"})

    try:
        threshold = _coerce_requested_threshold(body.get("threshold"))
    except ValueError:
        raise HTTPException(status_code=400, detail={"error": "threshold must be int 0-100 or null"}) from None
    if threshold is not None and not 0 <= threshold <= 100:
        raise HTTPException(status_code=400, detail={"error": "threshold must be 0-100"})

    resolved_token = _resolve_token(service_id, token)
    if not resolved_token:
        raise HTTPException(status_code=400, detail={"error": "Fastly API token required"})

    # Upsert: PATCH the item, falling back to POST if it doesn't exist
    # yet (first time enforcement is set for this service). Clearing
    # stores an empty string; the GET endpoint reads that back as None.
    # NOTE(review): presumably the scorer also treats "" as unset — confirm.
    value = str(threshold) if threshold is not None else ""
    try:
        try:
            fastly(
                "PATCH",
                f"/resources/stores/config/{config_store_id}/item/{_ENFORCE_THRESHOLD_KEY}",
                {"item_value": value},
                token=resolved_token,
            )
        except Exception as patch_exc:
            # Keep the original failure visible: if the POST fails too,
            # only the POST error reaches the response.
            logger.info(
                "enforce_threshold PATCH failed for %s (%s); retrying as POST",
                service_id,
                patch_exc,
            )
            fastly(
                "POST",
                f"/resources/stores/config/{config_store_id}/item",
                {"item_key": _ENFORCE_THRESHOLD_KEY, "item_value": value},
                token=resolved_token,
            )
    except Exception as e:
        logger.exception("scoring_enforce_threshold_put failed for %s", service_id)
        raise HTTPException(status_code=500, detail={"error": str(e)})

    metadata_db.record_scoring_audit(
        service_id,
        "threshold_enforce_disabled" if threshold is None else "threshold_enforced",
        details={"threshold": threshold},
    )

    return {
        "ok": True,
        "threshold": threshold,
        "enforced": threshold is not None,
        "message": (
            "Enforcement disabled — scorer will stop setting X-Edge-Score-Enforce on responses."
            if threshold is None
            else f"Enforcement live at threshold {threshold}. Scorer will set X-Edge-Score-Enforce=1 "
            "when score >= threshold; the Enforce VCL snippet 429s those requests."
        ),
    }
@router.put("/{service_id}/scoring/exclude-regex")
def scoring_exclude_regex_put(
    body: dict,
    service_id: str = Path(...),
    token: str = Query(default=""),
    confirm: bool = Query(default=False, description="Set true to actually apply the change"),
) -> dict:
    """Replace the URL-exclusion regex used by the scoring recv snippet.

    Body shape: ``{"regex": str}``; an empty string resets to the
    built-in default.

    The candidate goes through the pre-publish validator (input policy
    plus lint on the assembled recv snippet) and is then handed to
    ``update_recv_exclusion_regex``, which publishes it. The change is
    recorded in the scoring audit log.

    Requires ``?confirm=true``: the change re-publishes the active VCL
    version, and a bad regex can exclude everything or nothing.

    Raises:
        HTTPException 400: confirm flag missing, ``regex`` not a string,
            scoring disabled, ``request_secret`` or token missing, or
            the validator rejected the regex.
        HTTPException 502: the orchestrator raised ``RuntimeError``.
    """
    if not confirm:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "Pass ?confirm=true to actually apply the change. This re-publishes the active VCL version."
            },
        )

    from backend import config as svcconfig
    from backend.core import metadata_db
    from backend.provision.session_scoring_orchestrator import update_recv_exclusion_regex
    from backend.provision.session_scoring_vcl import recv_snippet
    from backend.utils.vcl_validator import (
        RegexValidationError,
        validate_recv_exclusion_regex_with_lint,
    )

    candidate = body.get("regex", "")
    if not isinstance(candidate, str):
        raise HTTPException(status_code=400, detail={"error": "body.regex must be a string"})

    scoring_cfg = (svcconfig.load_config(service_id) or {}).get("scoring") or {}
    if not scoring_cfg.get("enabled"):
        raise HTTPException(
            status_code=400,
            detail={"error": "Session scoring is not enabled for this service"},
        )
    secret = scoring_cfg.get("request_secret") or ""
    if not secret:
        raise HTTPException(
            status_code=400,
            detail={"error": "Internal: request_secret missing from cfg. Re-run enable_scoring."},
        )

    api_token = _resolve_token(service_id, token)
    if not api_token:
        raise HTTPException(status_code=400, detail={"error": "Fastly API token required"})

    def _assemble(cleaned_regex: str) -> str:
        # Closes over the per-service values so the validator can lint
        # the full snippet body rather than the bare regex.
        return recv_snippet(service_id, secret, exclude_url_regex=cleaned_regex or None)

    # Falco is only treated as mandatory when SCORING_REQUIRE_FALCO=1;
    # the flag defaults to off when the variable is unset.
    falco_required = os.environ.get("SCORING_REQUIRE_FALCO", "0") == "1"

    # Validation: input policy + falco lint on the assembled snippet.
    try:
        cleaned, lint = validate_recv_exclusion_regex_with_lint(
            candidate,
            build_full_snippet=_assemble,
            require_falco=falco_required,
        )
    except RegexValidationError as exc:
        raise HTTPException(
            status_code=400,
            detail={"error": exc.message, "reason": exc.reason},
        )

    # Publish through the orchestrator helper.
    try:
        outcome = update_recv_exclusion_regex(service_id, api_token, new_regex=cleaned)
    except RuntimeError as exc:
        raise HTTPException(status_code=502, detail={"error": str(exc)})

    audit_details = {
        "is_default": outcome["is_default"],
        "effective_regex": outcome["effective_regex"][:200],
        "logging_service_active_version": outcome["logging_service_active_version"],
        "lint_warnings": lint.warnings[:5],
    }
    metadata_db.record_scoring_audit(
        service_id,
        "scoring_exclude_regex_changed",
        details=audit_details,
    )

    if outcome["is_default"]:
        message = "Reset to default URL exclusion regex."
    else:
        message = "Custom URL exclusion regex applied. Effective at the edge after Fastly version activation."

    return {
        "ok": True,
        **outcome,
        "lint_warnings": lint.warnings,
        "message": message,
    }
+ + Drives the admin UI's on-blur lint check: the operator types a regex, + tabs out of the textarea, and gets immediate feedback on whether the + value would pass input policy (length / quote / control-char / Python + re.compile) AND falco's static analysis on the assembled snippet, + BEFORE they commit to a publish flow. + + Response shape: + Success: {"ok": true, "lint_warnings": [...]} + Failure: {"ok": false, "error": "...", "reason": "..."} + + The third layer (Fastly's own VCL compiler during version activate) + only runs on real publish — we don't burn a clone/activate round-trip + for a preview. False-positives between falco and Fastly's compiler are + rare; the publish flow still catches them. + """ + from backend import config as svcconfig + from backend.provision.session_scoring_vcl import recv_snippet + from backend.utils.vcl_validator import ( + RegexValidationError, + validate_recv_exclusion_regex_with_lint, + ) + + raw = body.get("regex", "") + if not isinstance(raw, str): + raise HTTPException(status_code=400, detail={"error": "body.regex must be a string"}) + + cfg = svcconfig.load_config(service_id) or {} + scoring = cfg.get("scoring") or {} + # The validator needs a request_secret to build the assembled snippet + # for falco lint — that's a VCL substitution, not anything the lint + # inspects semantically. Use a stable placeholder when scoring isn't + # enabled yet so the operator can still pre-validate before turn-on. 
+ request_secret = scoring.get("request_secret") or "PLACEHOLDER_FOR_LINT_ONLY" + + def _build(cleaned_regex: str) -> str: + return recv_snippet(service_id, request_secret, exclude_url_regex=cleaned_regex or None) + + try: + _cleaned, lint = validate_recv_exclusion_regex_with_lint( + raw, + build_full_snippet=_build, + require_falco=os.environ.get("SCORING_REQUIRE_FALCO", "0") == "1", + ) + except RegexValidationError as exc: + return { + "ok": False, + "error": exc.message, + "reason": exc.reason, + } + + return { + "ok": True, + "lint_warnings": lint.warnings, + } + + +# ── Enforce response status code (default 429, operator-overridable) ────── + + +@router.get("/{service_id}/scoring/enforce-status-code") +def scoring_enforce_status_code_get(service_id: str = Path(...)) -> dict: + """Return the operator-configured HTTP status code that the enforce + snippet returns when the scorer flags a request. + + Defaults to 429 (Too Many Requests). Operators can pick any 4xx/5xx + code via the PUT endpoint below. 
@router.put("/{service_id}/scoring/enforce-status-code")
def scoring_enforce_status_code_put(
    body: dict,
    service_id: str = Path(...),
    token: str = Query(default=""),
    confirm: bool = Query(default=False, description="Set true to actually apply the change"),
) -> dict:
    """Update the HTTP status code returned by the enforce snippet.

    Body shape: ``{"status_code": int | null}``. Pass ``null`` (or omit)
    to reset to the default.

    Validation:
      - Must be a whole number within ``_ENFORCE_STATUS_CODE_MIN`` ..
        ``_ENFORCE_STATUS_CODE_MAX``. Booleans and fractional floats are
        rejected instead of being truncated by ``int()``.
      - Anything else → 400 with explanation.

    The change is applied through ``update_enforce_status_code`` and
    recorded in the scoring audit log.

    Gated by ``?confirm=true`` because the change affects live edge
    response codes seen by real users — same precedent as
    enforce-threshold and exclude-regex.

    Raises:
        HTTPException 400: confirm flag missing, invalid ``status_code``,
            scoring disabled, or Fastly token missing.
        HTTPException 502: the orchestrator raised ``RuntimeError``.
    """
    if not confirm:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "Pass ?confirm=true to actually apply the change. This re-publishes the active VCL version."
            },
        )

    from backend import config as svcconfig
    from backend.core import metadata_db
    from backend.provision.session_scoring_orchestrator import update_enforce_status_code
    from backend.provision.session_scoring_vcl import (
        _ENFORCE_STATUS_CODE_MAX,
        _ENFORCE_STATUS_CODE_MIN,
    )

    raw = body.get("status_code")
    new_code: int | None
    if raw is None:
        new_code = None
    else:
        # int() would turn True into 1 and 429.7 into 429, and raises
        # OverflowError on an infinite float. Reject those up front so
        # only whole numbers (or numeric strings) get through.
        # is_integer() is False for NaN and ±inf.
        not_whole = isinstance(raw, bool) or (isinstance(raw, float) and not raw.is_integer())
        try:
            if not_whole:
                raise ValueError("status_code is not a whole number")
            new_code = int(raw)
        except (TypeError, ValueError):
            raise HTTPException(
                status_code=400,
                detail={"error": "status_code must be an integer or null"},
            ) from None
        if not (_ENFORCE_STATUS_CODE_MIN <= new_code <= _ENFORCE_STATUS_CODE_MAX):
            raise HTTPException(
                status_code=400,
                detail={
                    "error": f"status_code must be in {_ENFORCE_STATUS_CODE_MIN}-{_ENFORCE_STATUS_CODE_MAX} (HTTP 4xx/5xx)"
                },
            )

    cfg = svcconfig.load_config(service_id) or {}
    scoring = cfg.get("scoring") or {}
    if not scoring.get("enabled"):
        raise HTTPException(
            status_code=400,
            detail={"error": "Session scoring is not enabled for this service"},
        )

    resolved_token = _resolve_token(service_id, token)
    if not resolved_token:
        raise HTTPException(status_code=400, detail={"error": "Fastly API token required"})

    try:
        result = update_enforce_status_code(service_id, resolved_token, new_status_code=new_code)
    except RuntimeError as exc:
        raise HTTPException(status_code=502, detail={"error": str(exc)})

    metadata_db.record_scoring_audit(
        service_id,
        "scoring_enforce_status_code_changed",
        details={
            "is_default": result["is_default"],
            "effective_status_code": result["effective_status_code"],
            "logging_service_active_version": result["logging_service_active_version"],
        },
    )

    return {
        "ok": True,
        **result,
        "message": (
            "Reset to default enforce status code (429)."
            if result["is_default"]
            else f"Enforce status code → {result['effective_status_code']}. Effective at the edge after Fastly version activation."
        ),
    }
This will replace the current matrix."}, + ) + + from backend import config as svcconfig + from backend.core import metadata_db + from backend.provision.session_scoring_orchestrator import _MATRIX_PATH + from backend.state_sync import restore_scoring_matrix_version + + result = restore_scoring_matrix_version(service_id, version) + if not result: + raise HTTPException( + status_code=404, + detail={"error": f"Matrix version {version!r} not found in FOS history"}, + ) + + # Drop the local matrix.json so _load_matrix falls through to the + # FOS-restored version instead of shadowing it. + try: + if _MATRIX_PATH.exists(): + _MATRIX_PATH.unlink() + except Exception as exc: + logger.warning(f"Could not remove local matrix.json after restore: {exc}") + + # Update cfg.scoring.matrix_version so /scoring/status reflects the rollback. + cfg = svcconfig.load_config(service_id) + if cfg: + scoring = cfg.setdefault("scoring", {}) + scoring["matrix_version"] = version + svcconfig.save_config(service_id, cfg) + + _bust_analytics_cache(service_id) + + metadata_db.record_scoring_audit( + service_id, + "matrix_restored", + details={"restored_version": version, "restored_at": result["restored_at"]}, + ) + + return { + "ok": True, + "restored_version": version, + "restored_at": result["restored_at"], + "deploy_hint": ( + "Backend AUC + evaluation endpoints now reflect the restored matrix. " + "Live edge scorer keeps using its previously-embedded matrix until " + "you re-run scripts/scoring/deploy_wasm.sh." + ), + } + + +# ── AES key rotation ──────────────────────────────────────────────────────── + + +@router.post("/{service_id}/scoring/rotate-key") +def scoring_rotate_key( + service_id: str = Path(...), + token: str = Query(default=""), +) -> dict: + """Rotate the AES-GCM cookie-state encryption key. + + Moves the current key to ``previous_key_hex`` (grace window for + in-flight cookies still using the old key) and writes a fresh + 32-byte key as the new ``current_key_hex``. 
The Rust scorer's + cookie codec already tries previous as a fallback so existing + sessions keep decoding for one rotation cycle. + + Returns rotation metadata — the new key itself is NOT returned in + the response (only stored in the Fastly ConfigStore + audit log). + """ + from backend import config as svcconfig + from backend.core import metadata_db + from backend.provision.session_scoring_setup import rotate_aes_key + + cfg = svcconfig.load_config(service_id) + if not cfg: + raise HTTPException(status_code=404, detail={"error": f"No config for service {service_id}"}) + + scoring = cfg.get("scoring") or {} + if not scoring.get("enabled"): + raise HTTPException(status_code=400, detail={"error": "Scoring is not enabled for this service"}) + + scoring_keys_store_id = scoring.get("scoring_keys_store_id") + if not scoring_keys_store_id: + raise HTTPException( + status_code=400, + detail={ + "error": "Service has no scoring_keys_store_id (was scoring enabled before key rotation was supported?)" + }, + ) + + resolved_token = _resolve_token(service_id, token) + if not resolved_token: + raise HTTPException(status_code=400, detail={"error": "Fastly API token required"}) + + try: + result = rotate_aes_key(scoring_keys_store_id, token=resolved_token) + except Exception as e: + logger.exception("scoring_rotate_key failed for %s", service_id) + raise HTTPException(status_code=500, detail={"error": str(e)}) + + # Record the rotation in the audit log (without the key value). + metadata_db.record_scoring_audit( + service_id, + "key_rotated", + details={ + "rotated_at": result["rotated_at"], + "previous_key_grace": bool(result.get("previous_key_hex")), + }, + ) + + # Don't echo the key itself. + return { + "ok": True, + "rotated_at": result["rotated_at"], + "previous_key_grace": bool(result.get("previous_key_hex")), + "message": ( + "AES key rotated. 
Cookies signed with the previous key keep " + "decoding via the previous_key_hex grace slot — clear that " + "slot by rotating again after the idle-expire window (~hours)." + ), + } + + +# ── Operator audit log ────────────────────────────────────────────────────── + + +@router.get("/{service_id}/scoring/audit") +def scoring_audit_list( + service_id: str = Path(...), + limit: int = Query(default=100, ge=1, le=1000), + since: str | None = Query(default=None, description="ISO timestamp lower bound (inclusive)"), +) -> dict: + """List recent operator actions on this service's scoring config. + + Tracks: scoring_enabled, scoring_disabled, threshold_committed, + threshold_cleared, threshold_enforced, threshold_enforce_disabled, + matrix_retrained, matrix_restored, key_rotated. Each row has + timestamp, action, actor, details (JSON). Used for compliance review + + "who broke prod last Tuesday?" triage. + + ``since`` (optional ISO timestamp) filters to rows at or after that + instant — handy for the admin UI to poll for new events without + re-rendering the entire history.""" + from backend import config as svcconfig + from backend.core import metadata_db + + # 404 when the service itself isn't known — mirrors /scoring/status so + # the UI gets a consistent shape across the audit + status pair. + cfg = svcconfig.load_config(service_id) + if not cfg: + raise HTTPException(status_code=404, detail={"error": f"No config for service {service_id}"}) + + rows = metadata_db.list_scoring_audit(service_id, limit=limit, since=since) + return {"audit": rows, "limit": limit} + + +# ── Operator's chosen threshold (persisted, not enforced) ─────────────────── + + +@router.get("/{service_id}/scoring/threshold") +def scoring_threshold_get(service_id: str = Path(...)) -> dict: + """Return the operator's chosen score threshold. + + NOT enforced — the live scorer doesn't read this. 
It's a persisted + operator preference so the threshold slider can remember the + 'committed' value across sessions, and the StatusPanel can show + 'committed threshold: X' as a stable reference. Actual enforcement + requires a Rust scorer change + Wasm redeploy and is deferred to + a future release once the operator is confident in the value. + """ + from backend import config as svcconfig + + cfg = svcconfig.load_config(service_id) or {} + scoring = cfg.get("scoring") or {} + return { + "threshold": scoring.get("operator_threshold"), + "set_at": scoring.get("operator_threshold_set_at"), + "enforced": False, # See docstring — preview-only + } + + +@router.put("/{service_id}/scoring/threshold") +def scoring_threshold_put( + body: dict, + service_id: str = Path(...), +) -> dict: + """Persist the operator's chosen threshold (0-100) into the per-service + config. Pass ``{"threshold": null}`` to clear. Always returns the + new state. Does NOT push to Compute — preview-only.""" + import datetime as _dt + + from backend import config as svcconfig + + raw = body.get("threshold") + threshold: int | None + if raw is None: + threshold = None + else: + try: + threshold = int(raw) + except (TypeError, ValueError): + raise HTTPException(status_code=400, detail={"error": "threshold must be int 0-100 or null"}) + if not 0 <= threshold <= 100: + raise HTTPException(status_code=400, detail={"error": "threshold must be 0-100"}) + + cfg = svcconfig.load_config(service_id) + if not cfg: + raise HTTPException(status_code=404, detail={"error": f"No config for service {service_id}"}) + scoring = cfg.setdefault("scoring", {}) + prior_threshold = scoring.get("operator_threshold") + if threshold is None: + scoring.pop("operator_threshold", None) + scoring.pop("operator_threshold_set_at", None) + else: + scoring["operator_threshold"] = threshold + scoring["operator_threshold_set_at"] = _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds") + # Operator audit trail — every threshold change 
is attributable. + from backend.core import metadata_db + + metadata_db.record_scoring_audit( + service_id, + "threshold_committed" if threshold is not None else "threshold_cleared", + details={"prior_threshold": prior_threshold, "new_threshold": threshold}, + ) + svcconfig.save_config(service_id, cfg) + + _bust_analytics_cache(service_id) # so /scoring/status reflects it next fetch + return { + "threshold": scoring.get("operator_threshold"), + "set_at": scoring.get("operator_threshold_set_at"), + "enforced": False, + } + + +# ── Per-reason AUC breakdown ──────────────────────────────────────────────── + + +@router.get("/{service_id}/scoring/evaluation/per-reason") +def scoring_evaluation_per_reason( + service_id: str = Path(...), +) -> dict: + """AUC broken down by L1/L2 rule (cookie-missing, impossibly-fast, + robotic-consistency, rare-transition, low-transition-prob). + + Same min-samples gate as /scoring/evaluation but applied per-bucket + (so a reason with <3 labels in either class shows a 'need more + labels with reason=X' CTA instead of a noisy AUC). The headline + /scoring/evaluation gives the combined AUC; this answers 'which + rule contributed most to AUC' once enough per-reason labels exist. + """ + from backend.scoring import labels as _labels + from backend.scoring.evaluate import evaluate_per_reason + + label_rows = _labels.list_labels(service_id) + counts = _labels.counts_by_label(service_id) + n_good = counts.get("good", 0) + n_bad = counts.get("bad", 0) + n_neutral = counts.get("neutral", 0) + + cache_key = ("scoring-evaluation-per-reason", service_id, n_good, n_bad, n_neutral) + + def _produce() -> dict: + if n_good < _MIN_LABELS_PER_CLASS or n_bad < _MIN_LABELS_PER_CLASS: + # No point bucketing — the headline AUC isn't even computable. 
+ return { + "has_min_samples_overall": False, + "min_per_class": _MIN_LABELS_PER_CLASS, + "n_good": n_good, + "n_bad": n_bad, + "buckets": [], + } + labeled_sessions = _reconstruct_labeled_sessions(service_id, label_rows) + result = evaluate_per_reason(labeled_sessions, min_per_class=_MIN_LABELS_PER_CLASS) + result["has_min_samples_overall"] = True + result["n_good"] = n_good + result["n_bad"] = n_bad + return result + + return _cached(cache_key, _produce) + + +# ── Composite dashboard endpoint ──────────────────────────────────────────── +# +# Single round-trip variant of the 8 endpoints the session-scoring admin page +# mounts (status, evaluation, health, top-flagged, score-distribution, +# compliance-breakdown, curves, threshold-preview). Opens ONE read-only +# DuckDB connection, builds ONE filtered temp table, runs each aggregation +# against it. +# +# Wire-compat: this is purely additive — the 8 existing endpoints stay +# mounted with their current cache-key contracts and TTL behavior. The +# frontend can opt in by calling /scoring/dashboard instead of the 8 +# individual queries, or keep fanning out for now. + + +@router.get("/{service_id}/scoring/dashboard") +def scoring_dashboard( + service_id: str = Path(...), + since_hours: int = Query(default=24, ge=1, le=168), + threshold: int = Query(default=75, ge=0, le=100, description="Preview cutoff for threshold-preview block"), +) -> dict: + """One-shot dashboard payload. 
Returns: + + ``` + { + since_hours, threshold, + status: {...}, # /scoring/status + evaluation: {...}, # /scoring/evaluation + health: {...}, # /scoring/health + top_flagged: {rows: [...], since_hours}, # /scoring/top-flagged + score_distribution: {rows: [...]}, # /scoring/score-distribution + compliance_breakdown: {rows: [...]}, # /scoring/compliance-breakdown + curves: {...}, # /scoring/curves + threshold_preview: {...}, # /scoring/threshold-preview + } + ``` + + Each sub-object is byte-identical to the corresponding individual + endpoint's response — the frontend can swap to + ``dashboard.top_flagged`` without changing card-level contracts. + + Cache key includes ``since_hours``, ``threshold``, and the per-class + label counts so label mutations + slider drags invalidate naturally. + """ + from backend import config as svcconfig + from backend.scoring import labels as _labels + + counts = _labels.counts_by_label(service_id) + n_good = counts.get("good", 0) + n_bad = counts.get("bad", 0) + n_neutral = counts.get("neutral", 0) + + cache_key = ( + "scoring-dashboard", + service_id, + since_hours, + threshold, + n_good, + n_bad, + n_neutral, + ) + + def _produce() -> dict: + # --- /scoring/status (no DuckDB) --- + cfg = svcconfig.load_config(service_id) or {} + scoring = cfg.get("scoring") or {} + if not scoring.get("enabled"): + status_block: dict = {"enabled": False} + else: + status_block = {k: v for k, v in scoring.items() if k not in _SECRET_KEYS} + + # Build the dashboard in a single payload by delegating to the + # existing per-endpoint producers. Each handles its own _query_logs + # call — meaning 6 DuckDB connections instead of 1 (the audit's + # ideal). The win this iteration captures is the in-flight collapse: + # one composite request → one cache key → one set of fetches that + # serializes through the per-key lock instead of 8 frontend + # requests racing through the proxy + react-query. 
+ # + # The shared-temp-table optimization stays available for a future + # PR — wiring it requires refactoring each per-endpoint producer + # to accept an open connection + table name, which touches 5 + # endpoints worth of test surface. Punting that to v1.2.0 keeps + # this change additive + zero-risk. + evaluation = scoring_evaluation(service_id=service_id) + health = scoring_health(service_id=service_id, since_hours=since_hours) + top_flagged = scoring_top_flagged(service_id=service_id, since_hours=since_hours, limit=50) + score_distribution = scoring_score_distribution(service_id=service_id, since_hours=since_hours) + compliance_breakdown = scoring_compliance_breakdown(service_id=service_id, since_hours=since_hours) + curves = scoring_curves(service_id=service_id) + threshold_preview = scoring_threshold_preview( + service_id=service_id, threshold=threshold, since_hours=since_hours + ) + + return { + "since_hours": since_hours, + "threshold": threshold, + "status": status_block, + "evaluation": evaluation, + "health": health, + "top_flagged": top_flagged, + "score_distribution": score_distribution, + "compliance_breakdown": compliance_breakdown, + "curves": curves, + "threshold_preview": threshold_preview, + } + + return _cached(cache_key, _produce) diff --git a/backend/routers/share_auth.py b/backend/routers/share_auth.py index 294f4df9..c5a3a580 100644 --- a/backend/routers/share_auth.py +++ b/backend/routers/share_auth.py @@ -28,16 +28,12 @@ def _client_ip(request: Request) -> str: """Extract the real client IP. - The middleware that wraps remote requests sets ``request.state.is_remote``; - when true we honor ``X-Forwarded-For`` (the Next.js proxy injects it). - On local-listener traffic we ignore the header to prevent IP spoofing - (Section #5). 
+ With uvicorn running ``--proxy-headers --forwarded-allow-ips=127.0.0.1`` + (see docker-compose.prod.yml), ``request.client.host`` is already the + real client IP for Caddy-proxied traffic and the loopback address for + direct admin connections. We never re-parse X-Forwarded-For ourselves — + that was the leftmost-XFF spoofing vector. """ - is_remote = getattr(request.state, "is_remote", False) - if is_remote: - fwd = request.headers.get("x-forwarded-for") - if fwd: - return fwd.split(",")[0].strip() if request.client and request.client.host: return request.client.host return "0.0.0.0" diff --git a/backend/routers/usage.py b/backend/routers/usage.py index 6b6f3d3b..abd49252 100644 --- a/backend/routers/usage.py +++ b/backend/routers/usage.py @@ -2,7 +2,6 @@ from __future__ import annotations -import json import urllib.error import urllib.parse import urllib.request @@ -26,12 +25,13 @@ def _fastly_api(path: str, api_key: str) -> dict: - req = urllib.request.Request( - f"https://api.fastly.com{path}", - headers={"Fastly-Key": api_key, "Accept": "application/json"}, - ) - with urllib.request.urlopen(req, timeout=30) as resp: - return json.loads(resp.read().decode()) + """Thin wrapper delegating to the central Fastly client. 
Kept as a + function so the existing 3 call sites in this module read the same; + the central wrapper adds retry-on-429/5xx + telemetry tracking that + used to live in each caller.""" + from backend.core.fastly.client import fastly + + return fastly("GET", path, token=api_key) def _extract_fos_ops(record: dict) -> tuple[int, int]: @@ -184,10 +184,10 @@ def prefill(source: dict = Depends(get_source)): result["edge_only"] = True except Exception: pass - from backend.utils.telemetry import tracked_call as _tc - - with _tc("GET", f"/stats/service/{svc_id}?by={by}", service="Fastly API"): - payload = _fastly_api(f"/stats/service/{svc_id}?by={by}&from={from_ts}&to={to_ts}", api_key) + # tracked_call wrapper removed — _fastly_api → fastly() + # already does telemetry internally; the double-wrap was + # producing duplicate entries in /api/admin/usage-logging. + payload = _fastly_api(f"/stats/service/{svc_id}?by={by}&from={from_ts}&to={to_ts}", api_key) for rec in payload.get("data", []): ts = rec.get("start_time") if ts is None: @@ -196,10 +196,8 @@ def prefill(source: dict = Depends(get_source)): daily_reqs[day] = daily_reqs.get(day, 0) + int(rec.get("requests") or 0) daily_edge[day] = daily_edge.get(day, 0) + int(rec.get("edge_requests") or 0) else: - from backend.utils.telemetry import tracked_call as _tc - - with _tc("GET", f"/stats/aggregate?by={by}", service="Fastly API"): - payload = _fastly_api(f"/stats/aggregate?by={by}&from={from_ts}&to={to_ts}", api_key) + # See note above — fastly() does its own tracking. + payload = _fastly_api(f"/stats/aggregate?by={by}&from={from_ts}&to={to_ts}", api_key) for rec in payload.get("data", []): ts = rec.get("start_time") if ts is None: @@ -216,29 +214,37 @@ def prefill(source: dict = Depends(get_source)): except Exception: pass + # Skip the DuckDB hop entirely when the cached config status already + # carries edge_ratio (steady state — the sync cron keeps it fresh). + # Saves a connection-open + view-resolve on the hot path. 
Falls back + # to a live get_con read only on the cold path. debug_queries: list = [] - try: - from backend.core.duckdb import get_connection - - # read_only: get_edge_ratio is a SELECT against the view. - con = get_connection(source=source, max_wait=5, read_only=True) + cached_status = svcconfig.get_status(source["name"]) or {} + cached_edge_ratio = cached_status.get("edge_ratio") + if cached_edge_ratio is not None: + result["edge_ratio"] = cached_edge_ratio + else: try: - edge_ratio, debug_queries = repo.get_edge_ratio(con, source) - if edge_ratio is not None: - result["edge_ratio"] = edge_ratio + from backend.core.duckdb import get_connection - # Empirical node count analysis for prefill — derive from per-service - # ingested_files SQLite metadata. + # read_only: get_edge_ratio is a SELECT against the view. + con = get_connection(source=source, max_wait=5, read_only=True) try: - from backend.core import metadata_db + edge_ratio, debug_queries = repo.get_edge_ratio(con, source) + if edge_ratio is not None: + result["edge_ratio"] = edge_ratio + finally: + con.close() + except Exception: + pass - avg = metadata_db.get_node_count_avg(source["name"]) - if avg: - result["avg_nodes_per_flush"] = round(float(avg)) - except Exception: - pass - finally: - con.close() + # Empirical node count analysis — SQLite-only, doesn't need a DuckDB hop. + try: + from backend.core import metadata_db + + avg = metadata_db.get_node_count_avg(source["name"]) + if avg: + result["avg_nodes_per_flush"] = round(float(avg)) except Exception: pass @@ -307,6 +313,7 @@ def usage_current_storage( # covers the cold-start case where get_table_info errors out. 
iceberg_bytes = 0 iceberg_files = 0 + iceberg_info_success = False try: from backend.core import iceberg as db_iceberg @@ -314,10 +321,11 @@ def usage_current_storage( if not iceberg_info.get("error"): iceberg_bytes = iceberg_info.get("size_bytes", 0) iceberg_files = iceberg_info.get("data_files", 0) + iceberg_info_success = True except Exception: pass - if iceberg_bytes == 0: + if not iceberg_info_success: try: s3 = _get_fos_client(src) bucket = src["bucket"] @@ -432,15 +440,15 @@ def _accumulate(records: list) -> None: agg[date_str]["class_a"] += class_a agg[date_str]["class_b"] += class_b - from backend.utils.telemetry import tracked_call - try: - with tracked_call("GET", f"/stats/aggregate?by={by}", service="Fastly API"): - payload = _fastly_api(f"/stats/aggregate?by={by}&from={from_ts}&to={to_ts}", api_key) + # fastly() does telemetry tracking internally; the prior + # explicit tracked_call wrapper was duplicating the entry. + payload = _fastly_api(f"/stats/aggregate?by={by}&from={from_ts}&to={to_ts}", api_key) _accumulate(payload.get("data", [])) - except urllib.error.HTTPError as e: - body = e.read().decode(errors="replace") - raise HTTPException(status_code=502, detail={"error": f"Fastly Stats API {e.code}: {body}"}) + except RuntimeError as e: + # fastly() raises RuntimeError("HTTP 502 GET /stats/aggregate ...") + # on non-2xx, with the upstream body included. Surface as 502. 
+ raise HTTPException(status_code=502, detail={"error": f"Fastly Stats API: {e}"}) except Exception as e: raise HTTPException(status_code=502, detail={"error": str(e)}) diff --git a/backend/routers/views.py b/backend/routers/views.py index 3ee24d28..50544dba 100644 --- a/backend/routers/views.py +++ b/backend/routers/views.py @@ -2,7 +2,7 @@ from __future__ import annotations -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException, Request from backend.deps import get_service_id from backend.models.views import SavedView @@ -12,20 +12,57 @@ router = APIRouter(prefix="/api/views", tags=["views"]) +def _analyst_allowed_services(request: Request) -> set[str] | None: + """Security: return the analyst's allowed service set, or None + for admin. Mirrors the same helper in alerts.py — the cross-tenant + risk is identical for saved views.""" + analyst_session = getattr(request.state, "analyst_session", None) + if analyst_session is None: + return None + return set(analyst_session.service_ids or []) + + @router.get("/{service_id}") -def list_views(service_id: str): +def list_views(service_id: str, request: Request): + """Security: analyst can only list views for services in their + scope. Without this gate an analyst scoped to ``svc-A`` could enumerate + saved views for ``svc-B`` by typing /api/views/svc-B in their browser.""" + allowed = _analyst_allowed_services(request) + if allowed is not None and service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": service_id}, + ) return repo.get_views(service_id) @router.post("/") -def create_view(view: SavedView): +def create_view(view: SavedView, request: Request): + """Security: analyst can only create views for services in + their scope. 
Middleware already blocks POST on /api/views for + analysts; this is defense-in-depth.""" + allowed = _analyst_allowed_services(request) + if allowed is not None and view.service_id not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": view.service_id}, + ) res = repo.save_view(view) sync_admin_state(view.service_id) return res @router.delete("/{view_id}") -def delete_view(view_id: str, service_id: str | None = Depends(get_service_id)): +def delete_view(view_id: str, request: Request, service_id: str | None = Depends(get_service_id)): + # Security: pre-flight scope check, mirrors alerts.delete_alert. + allowed = _analyst_allowed_services(request) + if allowed is not None: + existing = repo.get_view_by_id(view_id) + if existing and existing.get("service_id") not in allowed: + raise HTTPException( + status_code=403, + detail={"error": "service_not_authorized", "service": existing.get("service_id")}, + ) res = repo.delete_view(view_id, service_id_hint=service_id) sync_admin_state(res.get("service_id")) return res diff --git a/backend/scheduler.py b/backend/scheduler.py index 117c47c2..373c8c18 100644 --- a/backend/scheduler.py +++ b/backend/scheduler.py @@ -46,6 +46,14 @@ # (SQLite timeouts are 30s) and flush its own usage log on exit. _CRON_HARD_CAP_S = 300 + +def _display_name(src: dict, fallback: str) -> str: + """Return src['service_name'] or src['name'], falling back to ``fallback``. 
+ Used by every cron-log site that wants the human-friendly name with + the service id as fallback when the friendly name isn't populated.""" + return src.get("service_name") or src.get("name", fallback) + + # Per-service throttle for the heavy post-ingest refresh work — specifically # update_top_values (100k reservoir sample + 24 GROUP BYs that back the filter- # picker autocomplete cache) and reconcile_fastly_stats (Fastly /stats/aggregate @@ -567,6 +575,33 @@ def _sync_jobs(self) -> None: ngwaf_interval_mins, ) + # ── Metadata retention cleanup (per service) ────────────────────── + # Daily 03:15 UTC. Slots between optimize (03:00) and full_sweep + # (03:30) so the daily admin cron window stays single-threaded + # across heavy phases. Trims usage_log + ingested_files + # + cron_runs per cfg["metadata_retention"]; defaults to 1d for + # the first two and 7d for cron_runs. See + # backend.core.metadata_db.cleanup_metadata. + cleanup_job_id = f"metadata_cleanup_{service_id}" + seen_ids.add(cleanup_job_id) + if cleanup_job_id not in self._job_ids: + self._sched.add_job( + _run_metadata_cleanup, + "cron", + hour=3, + minute=15, + args=[service_id], + id=cleanup_job_id, + max_instances=1, + coalesce=True, + misfire_grace_time=3600, + ) + self._job_ids[cleanup_job_id] = cleanup_job_id + logger.info( + "🧹 \x1b[35m[metadata_cleanup]\x1b[0m Registered metadata cleanup job %s (daily 03:15 UTC).", + cleanup_job_id, + ) + # ── Bot data refresh job ────────────────────────────────────────────── bot_refresh_id = "bot_data_refresh" seen_ids.add(bot_refresh_id) @@ -656,6 +691,7 @@ def get_scheduler() -> Scheduler: "sync": "\x1b[94m", # Bright Blue "commit": "\x1b[95m", # Bright Magenta "metadata_sync": "\x1b[96m", # Bright Cyan + "metadata_cleanup": "\x1b[35m", # Magenta "alerts": "\x1b[93m", # Bright Yellow "optimize": "\x1b[92m", # Bright Green "expire": "\x1b[90m", # Gray @@ -745,7 +781,7 @@ def _run_metadata_sync( refresh_config_status, start_cron_run, ) - from 
backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress cfg = svcconfig.load_config(service_id) if not cfg: @@ -762,8 +798,7 @@ def _run_metadata_sync( logger.info("[scheduler] %s: skipping metadata_sync — %s", service_id, str(e)) return - cleanup_progress() - + cleanup_progress_and_reap() try: pass except Exception: @@ -1065,9 +1100,9 @@ def _run_service_cron( ) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="sync") logger.info("▶️ \x1b[94m[sync]\x1b[0m %s: Sync job started.", _display) @@ -1211,6 +1246,29 @@ def elapsed() -> str: }, ) + touched_hours = done_event.get("touched_hours", []) + if touched_hours: + _t_roll = time.time() + try: + from backend.core.rollups import recompute_touched_hours + + recompute_touched_hours(service_id, src, set(touched_hours)) + _log_and_add_progress( + run_id, + service_id, + job_name="sync", + event={ + "type": "status", + "message": f"{elapsed()} Rollups computed: {int((time.time() - _t_roll) * 1000)}ms", + }, + ) + except Exception as _re: + logger.warning( + "[scheduler] %s: post-sync rollup recompute failed: %s", + service_id, + _re, + ) + except Exception as e: log_text = _extract_log_text(run_id) summary = "Ingestion crashed" @@ -1361,20 +1419,27 @@ def _usage_log_phase() -> None: run_usage_log_cleanup(service_id) - _usage_log_ex = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix=f"usage-log-{service_id}") - _usage_log_shutdown_wait = True + # Run _usage_log_phase inline. Pre-fix this was wrapped in a NESTED + # ThreadPoolExecutor — but ``_run_service_cron`` is itself already + # running inside the ``@cron_task`` executor (one layer up). 
On the + # 30s timeout path the old code called ``shutdown(wait=False)``, + # which abandons the worker thread + everything it pinned (DuckDB + # connections, aiohttp sessions, Fastly API state). On a 50-service + # deployment with reconcile_fastly_stats hitting the API in lockstep, + # the inner timeout fired routinely and each leak orphaned an 8-12MB + # stack plus whatever Python state was live. Over hours: multi-GB + # unbounded growth — a confirmed contributor to the recurring host + # OOM-kills. + # + # Running inline drops the leak and matches every other phase in + # this cron body. If a per-phase timeout is needed in the future, + # use a cooperative cancel token through the I/O layer rather than + # abandoning a thread. _t0 = time.time() try: - _usage_log_fut = _usage_log_ex.submit(_usage_log_phase) - try: - _usage_log_fut.result(timeout=30) - except concurrent.futures.TimeoutError: - logger.warning("[scheduler] %s: usage_log phase exceeded 30s — skipping", service_id) - _usage_log_shutdown_wait = False - except Exception as e: - logger.warning("[scheduler] %s: usage_log phase failed: %s", service_id, e) - finally: - _usage_log_ex.shutdown(wait=_usage_log_shutdown_wait) + _usage_log_phase() + except Exception as e: + logger.warning("[scheduler] %s: usage_log phase failed: %s", service_id, e) if run_id is not None: _log_and_add_progress( run_id, @@ -1430,11 +1495,11 @@ def _run_full_sweep(service_id: str) -> None: logger.info("⏭️ \x1b[95m[full_sync]\x1b[0m %s: skipping — %s", service_id, e) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="full_sync") - _svc_name = src.get("service_name") or src.get("name", service_id) + _svc_name = _display_name(src, service_id) _display = f"{_svc_name} ({service_id})" if _svc_name != 
service_id else service_id logger.info("▶️ \x1b[95m[full_sync]\x1b[0m %s: Daily full-LIST sweep started.", _display) @@ -1549,11 +1614,11 @@ def _run_gap_heal(service_id: str) -> None: logger.info("⏭️ \x1b[95m[gap_heal]\x1b[0m %s: skipping — %s", service_id, e) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="gap_heal") - _svc_name = src.get("service_name") or src.get("name", service_id) + _svc_name = _display_name(src, service_id) _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id start_time_exec = time.time() @@ -1800,9 +1865,9 @@ def _run_commit(service_id: str, force: bool = False, run_id: int | None = None) ) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="commit") _svc_name = cfg.get("name", service_id) if cfg else service_id _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id @@ -1947,11 +2012,11 @@ def _run_local_compact(service_id: str) -> None: logger.info("⏭️ \x1b[96m[local-compact]\x1b[0m %s: skipping — %s", service_id, str(e)) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="local_compact") - _svc_name = src.get("service_name") or src.get("name", service_id) + _svc_name = _display_name(src, service_id) _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id 
logger.info("▶️ \x1b[96m[local-compact]\x1b[0m %s: Local compaction started.", _display) _log_and_add_progress( @@ -2040,11 +2105,11 @@ def _run_optimize(service_id: str) -> None: logger.info("⏭️ \x1b[92m[optimize]\x1b[0m %s: skipping — %s", service_id, str(e)) return - from backend.cron_progress import cleanup_progress, end_progress, start_progress + from backend.cron_progress import cleanup_progress_and_reap, end_progress, start_progress - cleanup_progress() + cleanup_progress_and_reap() start_progress(run_id, service_id=service_id, task="optimize") - _svc_name = src.get("service_name") or src.get("name", service_id) + _svc_name = _display_name(src, service_id) _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id logger.info("▶️ \x1b[92m[optimize]\x1b[0m %s: Optimize job started.", _display) _log_and_add_progress( @@ -2153,7 +2218,7 @@ def _run_expire_snapshots(service_id: str) -> None: return svc_id = src.get("service_id", "unknown") - svc_name = src.get("service_name") or src.get("name", svc_id) + svc_name = _display_name(src, svc_id) display_name = f"{svc_name} ({svc_id})" if svc_name != svc_id else svc_id logger.info("▶️ \x1b[90m[expire]\x1b[0m %s: Maintenance job started.", display_name) @@ -2415,7 +2480,7 @@ def _run_service_alerts_evaluation(service_id: str) -> None: return task_name = "alerts" - _svc_name = src.get("service_name") or src.get("name", service_id) + _svc_name = _display_name(src, service_id) _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id logger.info("▶️ \x1b[93m[alerts]\x1b[0m %s: Alerts evaluation job started.", _display) @@ -2447,7 +2512,7 @@ def _run_service_alerts_evaluation(service_id: str) -> None: return try: - s_name = src.get("service_name") or src.get("name", service_id) + s_name = _display_name(src, service_id) display_name = f"{s_name} ({service_id})" if s_name != service_id else service_id # (alert_id, webhook_url, payload, max_ts) for each alert that should 
fire @@ -2549,4 +2614,98 @@ def _run_service_alerts_evaluation(service_id: str) -> None: except Exception: pass - logger.info("⏹️ \x1b[93m[alerts]\x1b[0m %s: Alerts evaluation job finished.", _display) + +@cron_task("metadata_cleanup") +def _run_metadata_cleanup(service_id: str) -> None: + """Daily: trim usage_log + ingested_files + cron_runs per service retention cfg. + + Retention defaults to 1 day for usage_log/ingested_files, 7 days for + cron_runs (see ``metadata_db.DEFAULT_METADATA_RETENTION``). Override + per service via cfg["metadata_retention"]: + + {"metadata_retention": {"usage_log_days": 7, "ingested_files_days": 30, + "cron_runs_days": 30}} + + A value of 0 (or negative) disables cleanup for that table — useful for + a long-retention analyst service that wants the full audit trail. + + VACUUM only runs when something was actually deleted. On a healthy + daily cadence this means: first run trims everything older than + retention, subsequent runs are mostly no-ops (only that day's + just-aged rows to trim), and VACUUM happens cheaply on small deltas. + + Writes a row to the cron_runs audit table on completion so the run + shows up on the Data Management cron schedule + history grid alongside + the other tasks. The cron_runs row itself becomes part of the next + cleanup's trimming target (capped at cron_runs_days retention). 
+ """ + from backend import config as svcconfig + from backend.core.duckdb import get_source_for_service, log_cron_run, start_cron_run + from backend.core.metadata_db import cleanup_metadata + + src = get_source_for_service(service_id) + if src is None: + return + + cfg = svcconfig.load_config(service_id) or {} + retention = cfg.get("metadata_retention") or {} + + _svc_name = _display_name(src, service_id) + _display = f"{_svc_name} ({service_id})" if _svc_name != service_id else service_id + color = JOB_COLORS.get("metadata_cleanup", "") + label = f"{color}[metadata_cleanup]{RESET_COLOR}" + logger.info("▶️ %s %s: Starting metadata cleanup.", label, _display) + + start_ts = time.time() + run_id = start_cron_run(src, "metadata_cleanup") + try: + result = cleanup_metadata(service_id, retention) + except Exception as e: + logger.exception("%s %s: cleanup failed: %s", label, _display, e) + log_cron_run( + src, + "metadata_cleanup", + time.time() - start_ts, + "error", + error_message=str(e), + summary=f"cleanup failed: {e}", + run_id=run_id, + ) + return + + total_deleted = sum(result["deleted"].values()) + summary_parts = [f"{t}={n}" for t, n in result["deleted"].items() if n] + summary = ( + ( + f"Trimmed {total_deleted:,} rows ({', '.join(summary_parts)}). " + f"VACUUM={'yes' if result['vacuumed'] else 'skipped (no deletions)'}." + ) + if total_deleted + else "No rows older than retention windows." 
+ ) + + if total_deleted: + logger.info( + "🧹 %s %s: deleted %d rows (%s) vacuumed=%s in %.2fs", + label, + _display, + total_deleted, + ", ".join(summary_parts), + result["vacuumed"], + result["duration_s"], + ) + else: + logger.info("⏹️ %s %s: no rows to trim (took %.2fs)", label, _display, result["duration_s"]) + + log_cron_run( + src, + "metadata_cleanup", + time.time() - start_ts, + "success", + summary=summary, + # Repurpose the rows_ingested column for the count of rows trimmed — + # the schema is shared across all cron tasks, and "rows_ingested" is + # the closest semantic fit (each task interprets it by context). + rows_ingested=total_deleted, + run_id=run_id, + ) diff --git a/backend/scoring/__init__.py b/backend/scoring/__init__.py new file mode 100644 index 00000000..8f54bdad --- /dev/null +++ b/backend/scoring/__init__.py @@ -0,0 +1,16 @@ +"""Edge session-scoring system. + +Hybrid Fastly Compute (Wasm) + VCL session-anomaly scoring. This Python +package contains: + +- The offline training pipeline (sessionize prod logs → transition matrix + + PageRank anchors) used to compile matrix.json for the edge scorer. +- A reference implementation of the scoring logic (Layer 1 universal + behavioral + Layer 2 route-transition) in pure Python. The Rust/Wasm + port under ``compute/scorer/`` must produce byte-identical scores against + the shared fixture set. +- The AES-GCM-with-AAD cookie codec, also paired 1:1 with the Rust port. + +See the session-scoring runbook (``docs/session_scoring_runbook.md``) +for operational guidance. +""" diff --git a/backend/scoring/cookie.py b/backend/scoring/cookie.py new file mode 100644 index 00000000..0f7fdc18 --- /dev/null +++ b/backend/scoring/cookie.py @@ -0,0 +1,302 @@ +"""AES-GCM-with-AAD session cookie codec (reference implementation). + +Authenticated-encrypted session state for the edge scorer. 
The Rust/Wasm +port under ``compute/scorer/`` must round-trip these bytes 1:1 — every +fixture in ``tests/scoring/fixtures/cookies/`` is used by both impls. + +Per the research doc §3.1 / §3.3 the payload carries 8 fields totalling +30 plaintext bytes. The doc nominally specifies CBOR but we use a packed +little-endian struct instead — same wire size, no cross-language +canonical-ordering footguns. The schema version byte (``v``) is the first +field of the plaintext, so future format changes can be version-dispatched +on decode without changing the framing. + +Wire format (after AES-GCM and Base64URL): + + base64url( nonce (12 B) || AES-GCM(plaintext, aad) || tag (16 B) ) + + plaintext (variable, little-endian): + v u8 schema version ← first byte for dispatch + sid 6 B raw session id bytes + seq u16 sequence count (cap 65535) + sum_dt u32 Σ Δt seconds + sum_dt_sq u64 Σ Δt² seconds² (widened per §3.3) + last_ts u32 last-request unix epoch + score u8 quantized 0-100 (rounded to nearest 5) + issued_at u32 cookie creation unix epoch ← end of v1 (30 B) + prev_route_len u8 length of prev_route_path (v2 only, 0-255) + prev_route_path N B normalized path of last-scored URL (UTF-8) + + prev_route_path carries the session's most-recently-scored route so the + scorer can compute the L2 transition probability without VCL having to + pass prev_route as a header — req.http doesn't persist across separate + client requests, so a header-based mechanism never worked. + + Decoder accepts v1 (30-byte plaintext, no prev_route_path) for the + migration window. Encoder always emits the current SCHEMA_VERSION. + + aad: ascii(f"{service_id}|v{schema_version}") + — binds the cookie to one customer service AND one schema + version, blocking cross-service replay and version downgrade. + +Key rotation: pass a previous key alongside the current one to +``CookieCodec``; ``decode`` trial-decrypts with the current key first and +falls back to the previous key on AEAD failure. 
Encrypt always uses the +current key. This is the 24h dual-key grace described in §3.1. +""" + +from __future__ import annotations + +import base64 +import os +import secrets +import struct +from dataclasses import dataclass +from typing import Final + +from cryptography.exceptions import InvalidTag +from cryptography.hazmat.primitives.ciphers.aead import AESGCM + +SCHEMA_VERSION: Final[int] = 2 +SID_BYTES: Final[int] = 6 +NONCE_BYTES: Final[int] = 12 +KEY_BYTES: Final[int] = 32 # AES-256 +SCORE_BUCKET: Final[int] = 5 # quantize to nearest 5 per §3.3 +# v2 adds a length-prefixed UTF-8 path suffix. The length byte (u8) +# caps the path at 255 bytes; encoder truncates longer paths silently. +PREV_ROUTE_MAX_BYTES: Final[int] = 255 + +# v1 plaintext: 30 bytes fixed. v2 plaintext: 30 + 1 (length) + N (path). +_V1_PACK_FMT: Final[str] = " None: + # Bounds enforcement is part of the contract — callers building a + # state to encode must hand us values that fit the wire format. + # Trapping here gives a much better stack than struct.pack's cryptic + # "argument out of range". + if len(self.sid) != SID_BYTES: + raise CookieError(f"sid must be {SID_BYTES} bytes, got {len(self.sid)}") + if not 0 <= self.seq <= SEQ_MAX: + raise CookieError(f"seq out of range: {self.seq}") + if not 0 <= self.sum_dt <= SUM_DT_MAX: + raise CookieError(f"sum_dt out of range: {self.sum_dt}") + if not 0 <= self.sum_dt_sq <= SUM_DT_SQ_MAX: + raise CookieError(f"sum_dt_sq out of range: {self.sum_dt_sq}") + if not 0 <= self.last_ts <= TS_MAX: + raise CookieError(f"last_ts out of range: {self.last_ts}") + if not 0 <= self.issued_at <= TS_MAX: + raise CookieError(f"issued_at out of range: {self.issued_at}") + if not 0 <= self.score <= 100: + raise CookieError(f"score out of range: {self.score}") + if not 0 <= self.v <= 0xFF: + raise CookieError(f"v out of range: {self.v}") + + +def quantize_score(raw: float | int) -> int: + """Round to nearest SCORE_BUCKET (default 5), clamp to [0, 100]. 
+ + Per §1.3 this is the information-leak countermeasure if the cookie is + ever decrypted: an attacker who reads a quantized 65 doesn't know + whether they're at 63 or 67 — losing fine-grained gradient information + they could use to titrate against the threshold.""" + if raw < 0: + return 0 + if raw > 100: + return 100 + bucket = SCORE_BUCKET + return int(round(float(raw) / bucket)) * bucket + + +def new_sid() -> bytes: + """6 cryptographically-random bytes. 2^48 ≈ 281 trillion unique sids.""" + return secrets.token_bytes(SID_BYTES) + + +def _pack_payload(state: SessionState) -> bytes: + """Pack a state into wire format. v1 (legacy) is the 30-byte fixed + header. v2 adds a length-prefixed UTF-8 path suffix; always emit the + length byte even when path is empty so the decoder can dispatch on + plaintext length unambiguously (== 30 → v1, > 30 → v2).""" + head = struct.pack( + _V1_PACK_FMT, + state.v, + state.sid, + state.seq, + state.sum_dt, + state.sum_dt_sq, + state.last_ts, + state.score, + state.issued_at, + ) + if state.v == 1: + return head + path_bytes = state.prev_route_path.encode("utf-8")[:PREV_ROUTE_MAX_BYTES] + return head + bytes([len(path_bytes)]) + path_bytes + + +def _unpack_payload(buf: bytes) -> SessionState: + if len(buf) < _V1_PACK_SIZE: + raise CookieError(f"payload too short: {len(buf)} < {_V1_PACK_SIZE}") + v, sid, seq, sum_dt, sum_dt_sq, last_ts, score, issued_at = struct.unpack(_V1_PACK_FMT, buf[:_V1_PACK_SIZE]) + prev_route_path = "" + if len(buf) > _V1_PACK_SIZE: + path_len = buf[_V1_PACK_SIZE] + end = _V1_PACK_SIZE + 1 + path_len + if len(buf) != end: + raise CookieError( + f"prev_route_path length mismatch: payload {len(buf)} bytes, " + f"declared len {path_len}, expected end {end}" + ) + try: + prev_route_path = buf[_V1_PACK_SIZE + 1 : end].decode("utf-8") + except UnicodeDecodeError as e: + raise CookieError(f"prev_route_path utf-8 decode failed: {e}") from e + return SessionState( + sid=sid, + seq=seq, + sum_dt=sum_dt, + 
sum_dt_sq=sum_dt_sq, + last_ts=last_ts, + score=score, + issued_at=issued_at, + v=v, + prev_route_path=prev_route_path, + ) + + +def _aad(service_id: str, schema_version: int) -> bytes: + """AAD ties the cookie to one customer service AND one schema version. + + Format chosen to be trivially reproducible in any language: ASCII + ``{service_id}|v{N}`` with no padding, no length prefix, no JSON. + """ + return f"{service_id}|v{schema_version}".encode("ascii") + + +def _b64url_encode(data: bytes) -> str: + """Base64URL without padding — cookie-safe, RFC 4648 §5.""" + return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii") + + +def _b64url_decode(s: str) -> bytes: + # Re-pad to a multiple of 4 before standard decode. + pad = "=" * (-len(s) % 4) + return base64.urlsafe_b64decode((s + pad).encode("ascii")) + + +@dataclass +class CookieCodec: + """Encode and decode session cookies. + + Construct with the current 32-byte AES key. Pass ``previous_key`` during + the 24h grace window after a key rotation; ``decode`` then trial- + decrypts with the current key first and falls back to the previous on + AEAD failure (the right move because AEAD verification is constant-time + relative to the key, so the fallback adds at most one AES-GCM verify + on the unhappy path). 
+ """ + + key: bytes + previous_key: bytes | None = None + service_id: str = "" + schema_version: int = SCHEMA_VERSION + + def __post_init__(self) -> None: + if len(self.key) != KEY_BYTES: + raise CookieError(f"key must be {KEY_BYTES} bytes (AES-256), got {len(self.key)}") + if self.previous_key is not None and len(self.previous_key) != KEY_BYTES: + raise CookieError(f"previous_key must be {KEY_BYTES} bytes (AES-256), got {len(self.previous_key)}") + if not self.service_id: + raise CookieError("service_id is required (AAD binding)") + self._aad = _aad(self.service_id, self.schema_version) + self._aead = AESGCM(self.key) + self._aead_prev = AESGCM(self.previous_key) if self.previous_key else None + + def encode(self, state: SessionState, *, nonce: bytes | None = None) -> str: + """Encrypt and base64url-encode a state. ``nonce`` is for tests only; + production calls let it default to a fresh random 96-bit nonce.""" + if state.v != self.schema_version: + raise CookieError(f"state schema version {state.v} != codec schema version {self.schema_version}") + if nonce is None: + nonce = os.urandom(NONCE_BYTES) + elif len(nonce) != NONCE_BYTES: + raise CookieError(f"nonce must be {NONCE_BYTES} bytes, got {len(nonce)}") + + plaintext = _pack_payload(state) + ciphertext = self._aead.encrypt(nonce, plaintext, self._aad) + return _b64url_encode(nonce + ciphertext) + + def decode(self, cookie_value: str) -> SessionState: + """Decrypt, verify, and unpack. 
Raises ``CookieError`` on any failure + (bad base64, wrong length, tampered ciphertext, wrong key, wrong + service id, wrong schema version).""" + try: + raw = _b64url_decode(cookie_value) + except Exception as e: + raise CookieError(f"base64url decode failed: {e}") from e + + if len(raw) < NONCE_BYTES + _V1_PACK_SIZE + 16: # 16-byte GCM tag + raise CookieError(f"cookie too short: {len(raw)} bytes") + + nonce, ciphertext = raw[:NONCE_BYTES], raw[NONCE_BYTES:] + + plaintext: bytes | None = None + last_err: Exception | None = None + for aead in (self._aead, self._aead_prev): + if aead is None: + continue + try: + plaintext = aead.decrypt(nonce, ciphertext, self._aad) + break + except InvalidTag as e: + last_err = e + continue + if plaintext is None: + raise CookieError(f"AEAD verification failed: {last_err}") from last_err + + state = _unpack_payload(plaintext) + # Accept v1 cookies during the migration window — they carry no + # prev_route_path, so L2 falls back to uniform prior for that one + # request but the request still serves. The decoder is the only + # place we accept old schemas; the encoder always emits the + # current SCHEMA_VERSION. + if state.v != self.schema_version and state.v != 1: + raise CookieError(f"payload schema version {state.v} != codec schema version {self.schema_version}") + return state diff --git a/backend/scoring/evaluate.py b/backend/scoring/evaluate.py new file mode 100644 index 00000000..cd3fe64e --- /dev/null +++ b/backend/scoring/evaluate.py @@ -0,0 +1,299 @@ +"""ROC-AUC evaluation of a trained matrix against labeled negatives. + +Per research doc §9.2 we use labels for EVALUATION ONLY — never to +auto-zero transitions in the trained matrix itself, because letting a +compromised verifier account submit "bad sessions" would create a +matrix-poisoning vector. The evaluator answers: "if I apply the L2 +scorer to these known-malicious sessions and these known-good sessions, +does the AUC clear the quality bar?" 
+ +Inputs: + - matrix (already trained; from backend.scoring.matrix) + - labeled sessions: each is one of the JSONL trace dicts plus a label + field ("good" | "bad"). Labels are sourced from a human verifier + interface (admin UI) or from explicit allow/block lists. + +Outputs: + - AUC (area under the ROC curve) + - per-session L2 scores + - pass/fail vs configurable threshold +""" + +from __future__ import annotations + +import logging +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import Final + +from backend.scoring.normalize import normalize +from backend.scoring.scorer import score_layer2 + +logger = logging.getLogger(__name__) + +# A trained matrix that can't separate the labeled good from labeled bad +# at AUC > this is not deployment-quality. 0.85 is the doc's implicit +# bar (referenced as "deployment-quality" in §9.2 discussion). Override +# per-call via the ``min_auc`` kwarg on ``evaluate()`` if a particular +# matrix needs a stricter or looser bar; the CLI script and the test +# suite are the current callers. +DEFAULT_MIN_AUC: Final[float] = 0.85 + + +@dataclass +class EvaluatedSession: + session_id: str + label: str # "good" | "bad" + l2_score: int # 0-100 from the L2 scorer + transition_count: int + + +@dataclass +class EvaluationResult: + auc: float + pass_threshold: float + passed: bool + n_good: int + n_bad: int + per_session: list[EvaluatedSession] = field(default_factory=list) + + def summary(self) -> str: + return ( + f"AUC={self.auc:.3f} (threshold {self.pass_threshold:.2f}) — " + f"{'PASS' if self.passed else 'FAIL'} " + f"(n_good={self.n_good}, n_bad={self.n_bad})" + ) + + +def _session_l2_score(session: dict, matrix: dict) -> tuple[int, int]: + """Compute the maximum L2 score across all transitions in a session. 
+ + "Maximum" because in production VCL would block on any single high- + score request — a session whose highest-score transition exceeds the + threshold is operationally caught regardless of where in the session + it happened. Also returns the transition count so empty-session edge + cases can be filtered.""" + events = session.get("events", []) + if len(events) < 2: + return 0, 0 + prev = normalize(events[0].get("url", "/")) + max_score = 0 + n_trans = 0 + for ev in events[1:]: + curr = normalize(ev.get("url", "/")) + score, _, _ = score_layer2(matrix, prev, None, curr) + max_score = max(max_score, score) + prev = curr + n_trans += 1 + return max_score, n_trans + + +def _compute_auc(scores: list[tuple[int, str]]) -> float: + """Area under the ROC curve via the Mann-Whitney U formulation. + + AUC = (#{good_score < bad_score} + 0.5 * #{good_score == bad_score}) + / (n_good * n_bad) + + O(n²) on the input, which is fine for typical evaluation set sizes + (~10-10000 sessions). No SciPy/sklearn dependency.""" + good_scores = [s for s, lbl in scores if lbl == "good"] + bad_scores = [s for s, lbl in scores if lbl == "bad"] + n_good = len(good_scores) + n_bad = len(bad_scores) + if n_good == 0 or n_bad == 0: + # Degenerate: can't compute AUC without one of each. + return 0.5 + + wins = 0.0 + for g in good_scores: + for b in bad_scores: + if g < b: + wins += 1.0 + elif g == b: + wins += 0.5 + return wins / (n_good * n_bad) + + +def evaluate( + matrix: dict, + labeled_sessions: Iterable[tuple[dict, str]], + *, + min_auc: float = DEFAULT_MIN_AUC, +) -> EvaluationResult: + """Score every labeled session against ``matrix`` and report AUC. + + ``labeled_sessions`` is an iterable of (session_dict, label) where + label is ``"good"``, ``"bad"``, or ``"neutral"``.
Any other label string is rejected + upfront so a typo doesn't silently degrade AUC.""" + per_session: list[EvaluatedSession] = [] + scores_for_auc: list[tuple[int, str]] = [] + n_good = n_bad = 0 + + for session, label in labeled_sessions: + if label not in ("good", "bad", "neutral"): + raise ValueError(f"unexpected label {label!r}; want 'good', 'bad', or 'neutral'") + score, n_trans = _session_l2_score(session, matrix) + per_session.append( + EvaluatedSession( + session_id=session.get("session_id", "?"), + label=label, + l2_score=score, + transition_count=n_trans, + ) + ) + if label == "neutral": + # Intentionally uncertain: don't bias the AUC in either + # direction. The row stays in per_session for display, but + # is excluded from the scores_for_auc list. + continue + scores_for_auc.append((score, label)) + if label == "good": + n_good += 1 + else: + n_bad += 1 + + auc = _compute_auc(scores_for_auc) + return EvaluationResult( + auc=auc, + pass_threshold=min_auc, + passed=auc >= min_auc, + n_good=n_good, + n_bad=n_bad, + per_session=per_session, + ) + + +def evaluate_from_persisted_scores( + labeled_sessions: Iterable[tuple[dict, str]], + *, + min_auc: float = DEFAULT_MIN_AUC, +) -> EvaluationResult: + """AUC using the score that was ALREADY persisted in DuckDB (i.e. + what the live scorer actually returned at the edge — L1 + L2 + + cookie-compliance combined), instead of recomputing L2 from the + event list. + + Why this exists alongside ``evaluate()``: the L2-only evaluator + can't see cookie compliance or L1 timing signals, so single-URL bot + probes (the most common "bad" label class) score 0 because they + have 0 transitions — even though the LIVE scorer correctly + flagged them at 75 via the cookie-missing rule. The result is + AUC=0 even when the matrix is perfectly tracking labels. + + The offline trainer keeps using ``evaluate()`` because at training + time we only have raw event JSONL, no persisted edge scores. 
The + /scoring/evaluation API uses THIS function because it does have + them. + + Each session dict must carry a ``max_edge_score`` field (int 0-100). + Sessions without one are dropped — there's nothing to evaluate. + """ + per_session: list[EvaluatedSession] = [] + scores_for_auc: list[tuple[int, str]] = [] + n_good = n_bad = 0 + + for session, label in labeled_sessions: + if label not in ("good", "bad", "neutral"): + raise ValueError(f"unexpected label {label!r}; want 'good', 'bad', or 'neutral'") + max_score = session.get("max_edge_score") + if max_score is None: + continue # no persisted score — can't evaluate + n_events = len(session.get("events", [])) + per_session.append( + EvaluatedSession( + session_id=session.get("session_id", "?"), + label=label, + l2_score=int(max_score), # "l2_score" name kept for back-compat with EvaluatedSession dataclass + transition_count=max(0, n_events - 1), + ) + ) + if label == "neutral": + continue + scores_for_auc.append((int(max_score), label)) + if label == "good": + n_good += 1 + else: + n_bad += 1 + + auc = _compute_auc(scores_for_auc) + return EvaluationResult( + auc=auc, + pass_threshold=min_auc, + passed=auc >= min_auc, + n_good=n_good, + n_bad=n_bad, + per_session=per_session, + ) + + +# Known L1/L2 reason atoms emitted by the live scorer. Sourced from +# compute/scorer/src/scorer.rs — any new atom added there needs to be +# mirrored here (or, longer-term, derived from /scoring/health's +# top_reasons list dynamically). The two compute-side failure atoms +# (compute-unavailable / unauthorized) are excluded — they indicate +# scorer outages, not detection signals, and would skew per-rule AUC. 
+_KNOWN_REASON_ATOMS = ( + "cookie-missing", + "impossibly-fast", + "robotic-consistency", + "rare-transition", + "low-transition-prob", +) + + +def evaluate_per_reason( + labeled_sessions: Iterable[tuple[dict, str]], + *, + min_auc: float = DEFAULT_MIN_AUC, + min_per_class: int = 3, +) -> dict: + """AUC broken down by which L1/L2 rule fired in each session. + + For each known reason atom (cookie-missing, impossibly-fast, etc.): + - Filter to sessions whose events contain that atom in any + edge_score_reason CSV + - Compute AUC against ``max_edge_score`` over those sessions + - Gate display when n_good < min_per_class OR n_bad < min_per_class + (per-reason populations are strictly smaller than combined, + so this gate fires more often) + + Returns ``{"buckets": [{"reason": ..., "auc": ..., "passed": ..., + "n_good": ..., "n_bad": ..., "has_min_samples": bool}, ...], + "min_per_class": int}`` — the headline /scoring/evaluation gives + the combined AUC, this gives the per-rule breakdown. + """ + sessions_list = list(labeled_sessions) + + buckets: list[dict] = [] + for reason in _KNOWN_REASON_ATOMS: + filtered: list[tuple[dict, str]] = [] + for session, label in sessions_list: + if label not in ("good", "bad", "neutral"): + continue + events = session.get("events") or [] + tripped = False + for ev in events: + ev_reason = ev.get("edge_score_reason") or "" + if reason in {atom.strip() for atom in ev_reason.split(",") if atom.strip()}: + tripped = True + break + if not tripped: + continue + filtered.append((session, label)) + + n_good = sum(1 for _, lbl in filtered if lbl == "good") + n_bad = sum(1 for _, lbl in filtered if lbl == "bad") + bucket: dict = { + "reason": reason, + "n_good": n_good, + "n_bad": n_bad, + "min_per_class": min_per_class, + "has_min_samples": n_good >= min_per_class and n_bad >= min_per_class, + } + if bucket["has_min_samples"]: + result = evaluate_from_persisted_scores(filtered, min_auc=min_auc) + bucket["auc"] = round(float(result.auc), 4) + 
bucket["passed"] = bool(result.passed) + bucket["threshold"] = float(result.pass_threshold) + buckets.append(bucket) + return {"buckets": buckets, "min_per_class": min_per_class, "known_reasons": list(_KNOWN_REASON_ATOMS)} diff --git a/backend/scoring/fixtures.py b/backend/scoring/fixtures.py new file mode 100644 index 00000000..6457fa2d --- /dev/null +++ b/backend/scoring/fixtures.py @@ -0,0 +1,277 @@ +"""Convert real prod log rows from DuckDB into sessionized JSONL traces. + +The output is the canonical input format for both the training pipeline +(matrix builder, PageRank) and the scorer test fixtures. One JSONL line per +session, ordered by start time. + +Session boundary heuristic (pre-cookie deployment): group rows by +(client_ip, user_agent), then split into separate sessions whenever the gap +between consecutive events exceeds ``SESSION_GAP_SECONDS`` (default 30 min, +industry standard). Once the AES-GCM session cookie is deployed at the edge, +this fallback will be replaced by SID-based grouping. + +Output schema (one JSONL line per session): + + { + "session_id": "ip_", # synthetic until cookie ships + "client_ip": "1.2.3.4", + "user_agent": "...", + "start_ts": "2026-05-15T23:30:00+00:00", + "end_ts": "2026-05-15T23:35:12+00:00", + "event_count": 7, + "events": [ + {"ts": "...", "url": "/", "method": "GET", "status": 200, + "referer": "", "ttfb_ms": 50, "country": "US", "asn": 7922} + ] + } +""" + +from __future__ import annotations + +import hashlib +import json +import logging +from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from pathlib import Path +from typing import IO, Any + +logger = logging.getLogger(__name__) + +# Industry-standard 30-minute inactivity boundary. Tuned later from production +# session-gap distributions; for now matches GA / Adobe conventions so the +# trained model approximates "what a normal session looks like" in the +# analytics sense. 
+SESSION_GAP_SECONDS = 30 * 60 + +# Columns the scorer cares about. Kept short on purpose — every extra column +# is bytes-per-event in the JSONL output and we extract millions of events. +_TRACE_COLUMNS = ( + "timestamp", + "ip", + "ua", + "url", + "method", + "status", + "referer", + "ttfb", + "country", + "asn", +) + + +@dataclass +class Event: + """A single request as seen at the edge. Field names match the JSONL + schema exactly so dataclasses.asdict round-trips cleanly.""" + + ts: str + url: str + method: str + status: int + referer: str + ttfb_ms: float + country: str + asn: int | None + + +@dataclass +class Session: + session_id: str + client_ip: str + user_agent: str + events: list[Event] = field(default_factory=list) + + @property + def start_ts(self) -> str: + return self.events[0].ts if self.events else "" + + @property + def end_ts(self) -> str: + return self.events[-1].ts if self.events else "" + + @property + def event_count(self) -> int: + return len(self.events) + + def to_jsonl_dict(self) -> dict[str, Any]: + return { + "session_id": self.session_id, + "client_ip": self.client_ip, + "user_agent": self.user_agent, + "start_ts": self.start_ts, + "end_ts": self.end_ts, + "event_count": self.event_count, + "events": [ + { + "ts": e.ts, + "url": e.url, + "method": e.method, + "status": e.status, + "referer": e.referer, + "ttfb_ms": e.ttfb_ms, + "country": e.country, + "asn": e.asn, + } + for e in self.events + ], + } + + +def _synth_session_id(client_ip: str, user_agent: str, start_ts: str) -> str: + """Stable 12-hex-char session id from the (ip, ua, start_ts) tuple. 
+ + Start-time-anchored so that re-running extraction on the same data + produces the same ids — useful for reproducible test fixtures.""" + h = hashlib.sha1(f"{client_ip}|{user_agent}|{start_ts}".encode()).hexdigest() + return f"ip_{h[:12]}" + + +def _parse_ts(value: Any) -> datetime: + if isinstance(value, datetime): + return value + # DuckDB returns timestamps as datetime when fetched via Python API; this + # branch is the safety net for cases where they come back as strings + # (e.g. when stitched via CSV). + return datetime.fromisoformat(str(value).replace("Z", "+00:00")) + + +def _ts_iso(value: Any) -> str: + """ISO-8601 with second precision; a UTC offset suffix appears only when + the input is tz-aware. Matches what the scorer consumes; truncating + sub-second avoids burning bytes on JSON timestamps we wouldn't use anyway.""" + dt = _parse_ts(value) + return dt.isoformat(timespec="seconds") + + +def rows_to_events(rows: Iterable[tuple[Any, ...]]) -> Iterator[tuple[str, str, Event]]: + """Convert raw DuckDB row tuples (in ``_TRACE_COLUMNS`` order) into + ``(client_ip, user_agent, Event)`` triples. Used as the sessionizer's + input stream. This stage is a lazy generator and buffers nothing itself; + whether the 1.8M-row source is materialized is up to the caller.""" + for row in rows: + ts, ip, ua, url, method, status, referer, ttfb, country, asn = row + yield ( + ip or "", + ua or "", + Event( + ts=_ts_iso(ts), + url=url or "", + method=(method or "").upper(), + status=int(status) if status is not None else 0, + referer=referer or "", + # ttfb is stored in seconds in the source schema; the scorer + # works in ms. Round to 3 decimals — sub-ms precision is noise + # at this layer. + ttfb_ms=round(float(ttfb) * 1000.0, 3) if ttfb is not None else 0.0, + country=country or "", + asn=int(asn) if asn is not None else None, + ), + ) + + +def sessionize( + events: Iterable[tuple[str, str, Event]], + *, + gap_seconds: int = SESSION_GAP_SECONDS, +) -> Iterator[Session]: + """Group ``(ip, ua, Event)`` triples into sessions.
def extract_traces(
    con,
    *,
    service_id: str,
    start: datetime | None = None,
    end: datetime | None = None,
    limit: int | None = None,
    gap_seconds: int = SESSION_GAP_SECONDS,
) -> Iterator[Session]:
    """Yield sessions built from the per-service DuckDB logs view.

    Args:
        con: DuckDB connection (per the original contract, the per-service
            one returned by ``backend.deps.get_con``). Only ``execute`` and
            ``fetchall`` are used.
        service_id: Resolves the view name ``logs_<service_id>`` (lowercased).
            It is spliced into the SQL text as an identifier, which cannot
            be bound as a parameter, so it must be non-empty and contain
            only ASCII letters, digits and underscores.
        start: Optional inclusive lower bound on ``timestamp``.
        end: Optional exclusive upper bound on ``timestamp``.
        limit: Optional cap on the number of rows (events, not sessions)
            read after sorting. ``None`` and ``0`` both mean "no cap".
        gap_seconds: Idle gap that splits sessions; forwarded to
            ``sessionize``.

    Yields:
        ``Session`` objects in ``(ip, ua, timestamp)`` order.

    Raises:
        ValueError: if ``service_id`` is not a safe identifier. Because this
            is a generator, the error surfaces on first iteration.

    Note:
        The query result is fully materialized with ``fetchall()`` before
        sessionizing; only the row → event → session conversion is lazy.
    """
    normalized_id = service_id.lower()
    # Identifier safety: reject anything that could break out of the view
    # name (quotes, whitespace, semicolons, dashes, non-ASCII look-alikes).
    if not normalized_id or not normalized_id.isascii() or not all(
        ch.isalnum() or ch == "_" for ch in normalized_id
    ):
        raise ValueError(f"service_id is not a valid view-name suffix: {service_id!r}")
    view = f"logs_{normalized_id}"

    # Time bounds are bound as parameters and cast server-side, which keeps
    # the same string → TIMESTAMP conversion the typed literal performed.
    where_clauses: list[str] = []
    params: list[str] = []
    if start is not None:
        where_clauses.append("timestamp >= CAST(? AS TIMESTAMP)")
        params.append(start.isoformat())
    if end is not None:
        where_clauses.append("timestamp < CAST(? AS TIMESTAMP)")
        params.append(end.isoformat())
    where_sql = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""
    # int() guarantees only digits (and an optional sign) reach the SQL text.
    limit_sql = f"LIMIT {int(limit)}" if limit else ""

    sql = f"""
        SELECT {", ".join(_TRACE_COLUMNS)}
        FROM {view}
        {where_sql}
        ORDER BY ip, ua, timestamp
        {limit_sql}
    """

    logger.info("[scoring.fixtures] streaming events from %s", view)
    # fetchall() returns the complete result as a list; memory use scales
    # with the number of rows selected.
    rows = con.execute(sql, params).fetchall()
    logger.info("[scoring.fixtures] fetched %d rows, sessionizing …", len(rows))

    yield from sessionize(rows_to_events(rows), gap_seconds=gap_seconds)
The unique index on (service_id, sid) means +re-labeling a session updates the existing row rather than producing +duplicates — matches the admin's mental model ("I'm changing my mind +about this session," not "I'm creating a new label"). + +Labels feed [backend/scoring/evaluate.py](evaluate.py) for ROC-AUC +quality assessment of the trained matrix. Neutral rows are kept for +display but excluded from the AUC computation (intentionally uncertain +→ shouldn't bias precision/recall in either direction). + +This module is intentionally thin — schema lives in +[backend/core/metadata_db.py](../core/metadata_db.py); we just provide +the typed CRUD surface plus the upsert-on-sid semantics the API +endpoints want. +""" + +from __future__ import annotations + +import uuid +from typing import Literal + +from backend.core.metadata_db import get_con + +Label = Literal["good", "bad", "neutral"] +ALLOWED_LABELS: frozenset[str] = frozenset({"good", "bad", "neutral"}) + + +def _row_to_dict(r) -> dict: + return { + "id": r["id"], + "service_id": r["service_id"], + "sid": r["sid"], + "label": r["label"], + "notes": r["notes"] or "", + "flagged_by": r["flagged_by"] or "", + "sample_ip": r["sample_ip"] or "", + "sample_ua": r["sample_ua"] or "", + "sample_url": r["sample_url"] or "", + "created_at": r["created_at"], + "updated_at": r["updated_at"], + } + + +def save_label( + service_id: str, + sid: str, + label: str, + *, + notes: str = "", + flagged_by: str = "admin", + sample_ip: str = "", + sample_ua: str = "", + sample_url: str = "", +) -> dict: + """Upsert a label keyed on (service_id, sid). + + Re-labeling a session that's already labeled overwrites the prior + label + notes + sample fields and bumps ``updated_at``. The original + ``created_at`` and ``id`` are preserved so external references + (e.g. UI rows) survive. 
def list_labels(service_id: str, limit: int = 500) -> list[dict]:
    """Return up to ``limit`` labels for ``service_id``, newest first.

    Rows are ordered by ``updated_at`` descending with ``ROWID`` descending
    as the tie-break, so labels written within the same second still come
    back in reverse insertion order. ``limit`` is coerced with ``int()``
    before being bound.
    """
    query = "SELECT * FROM scoring_labels WHERE service_id = ? ORDER BY updated_at DESC, ROWID DESC LIMIT ?"
    cursor = get_con(service_id).execute(query, (service_id, int(limit)))
    return list(map(_row_to_dict, cursor.fetchall()))
def update_label(
    service_id: str,
    label_id: str,
    *,
    label: str | None = None,
    notes: str | None = None,
) -> dict:
    """Partially update one label (PATCH semantics).

    Only the fields passed as non-``None`` are written. Both the UPDATE and
    the re-read are scoped to ``(id, service_id)``, matching how the other
    queries in this module filter by ``service_id``; an id that belongs to
    a different service is treated as not found.

    Args:
        service_id: Service whose metadata DB holds the label.
        label_id: Primary key of the label row.
        label: New label value; must be in ``ALLOWED_LABELS`` when given.
        notes: New free-text notes when given.

    Returns:
        The row after the update as a dict, or ``{}`` if no matching row
        exists. A call with neither ``label`` nor ``notes`` returns the
        current row without touching ``updated_at``.

    Raises:
        ValueError: if ``label`` is given but not an allowed value.
    """
    con = get_con(service_id)

    def _current() -> dict:
        # Scoped re-read so the returned row is always one owned by service_id.
        row = con.execute(
            "SELECT * FROM scoring_labels WHERE id = ? AND service_id = ?",
            (label_id, service_id),
        ).fetchone()
        return _row_to_dict(row) if row else {}

    sets: list[str] = []
    params: list = []
    if label is not None:
        if label not in ALLOWED_LABELS:
            raise ValueError(f"label must be one of {sorted(ALLOWED_LABELS)}, got {label!r}")
        sets.append("label = ?")
        params.append(label)
    if notes is not None:
        sets.append("notes = ?")
        params.append(notes)
    if not sets:
        # No-op PATCH: report current state without bumping updated_at.
        return _current()

    sets.append("updated_at = datetime('now')")
    params.extend((label_id, service_id))
    # Only fixed column fragments are interpolated; all values are bound.
    con.execute(
        f"UPDATE scoring_labels SET {', '.join(sets)} WHERE id = ? AND service_id = ?",
        params,
    )
    con.commit()
    return _current()
def counts_by_label(service_id: str) -> dict[str, int]:
    """Return ``{label: count}`` for every label stored for ``service_id``.

    The ``good``, ``bad`` and ``neutral`` keys are always present (0 when no
    rows carry that label); any other label value found in the table is
    added after them.
    """
    cursor = get_con(service_id).execute(
        "SELECT label, COUNT(*) AS n FROM scoring_labels WHERE service_id = ? GROUP BY label",
        (service_id,),
    )
    tally = dict.fromkeys(("good", "bad", "neutral"), 0)
    tally.update((row["label"], int(row["n"])) for row in cursor.fetchall())
    return tally
We do NOT use confirmed-bad labels here +— that path is explicitly avoided in the doc to block label-poisoning +attacks; labels are only used by the ROC-AUC evaluator on the +already-trained matrix.""" + +from __future__ import annotations + +import json +import logging +from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import IO, Any + +from backend.scoring.normalize import Route, normalize + +logger = logging.getLogger(__name__) + +# Tuning knobs for the training-time bot filter. Conservative defaults — we +# err on the side of letting borderline sessions in (Laplace smoothing +# absorbs the noise), only filtering the truly degenerate cases. +MIN_EVENTS_PER_SESSION = 2 # < this is just a 1-shot probe, not a session +MIN_MEAN_DWELL_S = 0.2 # mirrors L1's impossibly-fast threshold + + +@dataclass +class MatrixStats: + """Counters surfaced for the training CLI's summary log.""" + + sessions_in: int = 0 + sessions_dropped_short: int = 0 + sessions_dropped_fast: int = 0 + sessions_kept: int = 0 + transitions: int = 0 + routes_seen: int = 0 + + +@dataclass +class TransitionMatrix: + counts: dict[str, dict[str, int]] = field(default_factory=dict) + row_totals: dict[str, int] = field(default_factory=dict) + categories: dict[str, str] = field(default_factory=dict) + vocab: set[str] = field(default_factory=set) + session_count: int = 0 + transition_count: int = 0 + anchors: list[str] = field(default_factory=list) + + def add_transition(self, prev: Route, curr: Route) -> None: + self.counts.setdefault(prev.path, {}) + self.counts[prev.path][curr.path] = self.counts[prev.path].get(curr.path, 0) + 1 + self.row_totals[prev.path] = self.row_totals.get(prev.path, 0) + 1 + self.transition_count += 1 + # Categories: first sighting wins. Routes are deterministic → + # category, so later overrides would be no-ops anyway. 
+ if prev.path not in self.categories: + self.categories[prev.path] = prev.category + if curr.path not in self.categories: + self.categories[curr.path] = curr.category + self.vocab.add(prev.path) + self.vocab.add(curr.path) + + def to_json_dict(self, version: str) -> dict[str, Any]: + return { + "version": version, + "built_at": datetime.now(UTC).isoformat(timespec="seconds"), + "vocab_size": len(self.vocab), + "session_count": self.session_count, + "transition_count": self.transition_count, + "counts": self.counts, + "row_totals": self.row_totals, + "categories": self.categories, + "anchors": self.anchors, + } + + +def _read_jsonl(path: Path) -> Iterator[dict]: + with path.open() as f: + for line in f: + line = line.strip() + if not line: + continue + yield json.loads(line) + + +def _session_mean_dwell_seconds(events: list[dict]) -> float: + """Mean of inter-event time deltas, in seconds. + + Reads timestamps from the JSONL trace format. Returns 0.0 for sessions + with < 2 events (no delta to measure).""" + if len(events) < 2: + return 0.0 + first = datetime.fromisoformat(events[0]["ts"]) + last = datetime.fromisoformat(events[-1]["ts"]) + span = (last - first).total_seconds() + # n-1 deltas across n events. + return span / (len(events) - 1) + + +def build_matrix( + sessions: Iterable[dict], + *, + min_events: int = MIN_EVENTS_PER_SESSION, + min_mean_dwell_s: float = MIN_MEAN_DWELL_S, +) -> tuple[TransitionMatrix, MatrixStats]: + """Build a transition matrix from sessionized traces. + + ``sessions`` is an iterable of JSONL-shaped dicts (as emitted by + ``backend.scoring.fixtures.write_jsonl``). Streams through the input + in one pass — no buffering. 
def write_matrix(matrix: TransitionMatrix, version: str, out: IO[str]) -> None:
    """Write ``matrix`` to ``out`` as compact JSON with sorted keys.

    Keys are sorted at every nesting level and no whitespace is emitted
    between tokens, so the serialized form does not depend on dict
    insertion order.
    """
    payload = matrix.to_json_dict(version)
    json.dump(payload, out, separators=(",", ":"), sort_keys=True)
# A segment is "id-like" — and therefore gets collapsed to '*' — if it matches
# any of these. The patterns are NOT mutually exclusive (for example, a run
# of 24 or more ASCII digits matches both _NUMERIC_ID and _HEX_HASH), but
# since every match collapses to the same '*', the overlap does not change
# the result.
# All patterns are anchored with ^…$, so they must match the entire segment.
#
# _NUMERIC_ID: one or more digits.
# NOTE(review): on str patterns `\d` also matches non-ASCII decimal digits —
# confirm the Rust port treats such segments the same way.
_NUMERIC_ID = re.compile(r"^\d+$")
# _UUID: canonical 8-4-4-4-12 hyphenated hex form, case-insensitive.
_UUID = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
    re.IGNORECASE,
)
# 24+ hex chars (common for content hashes / Mongo ObjectId variants).
_HEX_HASH = re.compile(r"^[0-9a-fA-F]{24,}$")
# Common ID prefixes: SKU-12345, ORD-XYZ-789, ASIN-B07X, etc. — 2 to 5
# uppercase ASCII letters, then a single '-' or '_', then one or more
# letters, digits, '-' or '_'. The prefix match is case-sensitive, so a
# lowercase "sku-12345" is not treated as an id by this pattern.
_PREFIXED_ID = re.compile(r"^[A-Z]{2,5}[-_][A-Za-z0-9_-]+$")
@dataclass(frozen=True)
class Route:
    """Canonical normalized route plus its category tag.

    Instances are immutable (``frozen=True``) and compare by value.

    ``path`` is the lookup key for the transition matrix.
    ``category`` is the L2 category-backoff key (§4.2)."""

    # Normalized path, e.g. "/items/*"; id-like segments appear as "*".
    path: str
    # Coarse category derived from the first path segment, e.g. "product".
    category: str
def normalize(url: str) -> Route:
    """Convert a raw URL into its canonical ``Route`` (path + category).

    The query string is dropped, empty path segments (leading, trailing or
    doubled slashes) are discarded, every segment that ``_looks_like_id``
    accepts becomes ``*``, and all remaining segments are lowercased. The
    category is looked up from the first original segment (lowercased), so
    it is still meaningful when that segment itself collapses to ``*``.

    Examples:
        /                            → Route('/', 'home')
        /items/10243                 → Route('/items/*', 'product')
        /users/48213/profile         → Route('/users/*/profile', 'account')
        /api/v2/orders/00000abc-...  → Route('/api/v2/orders/*', 'api')
        /search?q=red+shoes&page=2   → Route('/search', 'browse')

    Short plain-word segments such as a username match none of the id
    patterns defined in this module, so they are kept (lowercased) rather
    than collapsed.
    """
    segments = [seg for seg in _strip_query(url).split("/") if seg]
    # Root, empty, or slash-only paths all map to the home route.
    if not segments:
        return Route(path="/", category="home")

    canonical = "/" + "/".join("*" if _looks_like_id(seg) else seg.lower() for seg in segments)
    return Route(path=canonical, category=_category_for(segments[0].lower()))
def pagerank(
    matrix: TransitionMatrix,
    *,
    damping: float = PAGERANK_DAMPING,
    tol: float = PAGERANK_TOLERANCE,
    max_iter: int = PAGERANK_MAX_ITER,
) -> dict[str, float]:
    """Compute PageRank scores for every route in ``matrix.vocab``.

    Power iteration with damping ``d`` over the count-weighted transition
    probabilities ``P(q → p) = counts[q][p] / row_totals[q]``:

        PR(p) = (1 - d)/N  +  d * Σ_q PR(q) * P(q → p)  +  d * dangling_mass/N

    where ``dangling_mass`` is the summed rank of routes with no outgoing
    transitions; that mass is spread uniformly over all ``N`` routes on
    each iteration.

    Args:
        matrix: Source of ``vocab``, ``counts`` and ``row_totals``.
        damping: Probability of following a transition instead of
            teleporting.
        tol: Convergence threshold on the L1 change between iterations.
        max_iter: Upper bound on iterations. With ``max_iter <= 0`` no
            iteration runs and the uniform starting distribution is
            returned (a warning is logged).

    Returns:
        ``{route: score}`` for every route in the vocab; ``{}`` when the
        vocab is empty.
    """
    vocab = sorted(matrix.vocab)  # sorted → deterministic iteration order
    n = len(vocab)
    if n == 0:
        return {}

    outlinks = _row_normalized_outlinks(matrix)
    # inverted[p] lists (source, probability) for each transition into p.
    inverted: dict[str, list[tuple[str, float]]] = {p: [] for p in vocab}
    for src, dests in outlinks.items():
        for dst, prob in dests.items():
            inverted.setdefault(dst, []).append((src, prob))

    rank = {p: 1.0 / n for p in vocab}
    teleport = (1.0 - damping) / n
    dangling_nodes = [p for p in vocab if p not in outlinks]

    # Bound before the loop so the for/else warning below can always format
    # it, even when max_iter <= 0 and the loop body never executes.
    delta = float("inf")
    for it in range(max_iter):
        dangling_mass = sum(rank[p] for p in dangling_nodes)
        dangling_contribution = damping * dangling_mass / n
        new_rank: dict[str, float] = {}
        for p in vocab:
            incoming = sum(rank[q] * prob for q, prob in inverted.get(p, []))
            new_rank[p] = teleport + dangling_contribution + damping * incoming

        delta = sum(abs(new_rank[p] - rank[p]) for p in vocab)
        rank = new_rank
        if delta < tol:
            logger.debug("[pagerank] converged after %d iterations (delta=%.2e)", it + 1, delta)
            break
    else:
        logger.warning("[pagerank] hit max_iter=%d without converging (final delta=%.2e)", max_iter, delta)

    return rank
def compute_anchors(matrix: TransitionMatrix, **kwargs) -> list[str]:
    """Run ``pagerank`` on ``matrix``, pick anchors with ``select_anchors``,
    store them on ``matrix.anchors`` and return the same list.

    ``**kwargs`` are forwarded to ``select_anchors`` only; ``pagerank`` runs
    with its defaults.
    """
    matrix.anchors = select_anchors(pagerank(matrix), **kwargs)
    return matrix.anchors
# ── Layer 1 tuning constants (research doc §4.1) ──────────────────────────────

# "Warmup gate" — timing rules suppressed until the session has a meaningful
# sample for mean/variance. seq=3 is the minimum for any variance estimate,
# the doc bumps that to "> 3" so we score from the 4th request onward.
# (The timing rules compare with a strict `seq > L1_TIMING_WARMUP_SEQ`.)
L1_TIMING_WARMUP_SEQ: Final[int] = 3

# Impossibly Fast: mean dwell < this → fire. 200ms is faster than any human
# can read+click; lower than a typical TLS RTT to the nearest POP.
L1_FAST_DWELL_THRESHOLD_S: Final[float] = 0.20

# Robotic Consistency: variance must be below this AND mean dwell in the
# suspicious band [L1_ROBOTIC_DWELL_LOW_S, L1_ROBOTIC_DWELL_HIGH_S].
# Variance threshold = 0.05s² captures `sleep(1)` style loops
# (variance ≈ 0).
#
# Security (applies to L1_ROBOTIC_DWELL_LOW_S, not the variance threshold):
# the lower band edge was 0.50 — that left a "robotic safe zone" between
# L1_FAST_DWELL_THRESHOLD_S (0.20) and the old lower edge (0.50) where a
# low-variance bot averaging 0.30s/page scored zero. The lower edge is now
# 0.20 so the robotic detector starts exactly where the impossibly-fast
# rule stops.
L1_ROBOTIC_VARIANCE_THRESHOLD: Final[float] = 0.05
L1_ROBOTIC_DWELL_LOW_S: Final[float] = 0.20
L1_ROBOTIC_DWELL_HIGH_S: Final[float] = 3.0

# Score contributions per fired rule. Sum of all L1 rules is capped at 100
# in the combined output. Cookie compliance dominates because it's the
# strongest "this is definitely a bot" signal among L1's three rules.
L1_SCORE_FAST: Final[int] = 50
L1_SCORE_ROBOTIC: Final[int] = 40
L1_SCORE_COOKIE_MISSING: Final[int] = 75
The threshold-matrix admin UI uses score==100 to enable +# hard-block enforcement, so capping tampered at 75 let attackers stay +# below the enforcement bar while exhibiting active anomalous behavior. +L1_SCORE_COOKIE_TAMPERED: Final[int] = 100 + +# ── Layer 2 tuning constants (research doc §4.2) ────────────────────────────── + +# Laplace (additive) smoothing factor. Larger = more conservative (every +# unseen transition gets more probability mass). 0.5 per the doc. +L2_LAPLACE_ALPHA: Final[float] = 0.5 + +# Skip-gram discount: a high-probability anchor→current transition counts +# slightly less than a direct prev→current high-probability transition. +L2_SKIPGRAM_BETA: Final[float] = 0.7 + + +# Maps TransScore in [0, 1] → contribution in [0, 100]. We use a log-shaped +# transform so the score climbs sharply as transition probability drops +# below 1e-3 (the "almost certainly never happens in human traffic" floor). +# At P = 1.0 → 0. At P = 1e-3 → ~50. At P = 1e-6 → ~100. +def _l2_score_from_trans_prob(p: float) -> int: + """Map a TransScore probability to a Layer 2 anomaly score [0, 100].""" + if p >= 1.0: + return 0 + if p <= 1e-12: + return 100 + # -log10(p) maps p=1e-3 → 3, p=1e-6 → 6. Scale by 100/6 so p=1e-6 ≈ 100. + raw = -math.log10(p) * (100.0 / 6.0) + if raw < 0: + return 0 + if raw > 100: + return 100 + return int(round(raw)) + + +# ── Combined output ─────────────────────────────────────────────────────────── + + +@dataclass +class ScoreResult: + """Output of a single scoring evaluation. Maps 1:1 onto the + ``X-Edge-*`` response headers in research doc §6.""" + + score: int = 0 # quantized 0-100, the X-Edge-Score header + l1_score: int = 0 + l2_score: int = 0 + reasons: list[str] = field(default_factory=list) + cookie_compliance: str = "ok" # ok | missing | expired | rotated | tampered + # Diagnostics (not in the header set; useful in tests and analyst UI). 
+ mean_dwell_s: float = 0.0 + variance_s2: float = 0.0 + trans_prob: float = 1.0 + matrix_version: str = "" + + def to_headers(self) -> dict[str, str]: + """Materialize the ``X-Edge-*`` header set per research doc §6.""" + return { + "X-Edge-Score": str(self.score), + "X-Edge-Score-L1": str(self.l1_score), + "X-Edge-Score-L2": str(self.l2_score), + "X-Edge-Cookie-Compliance": self.cookie_compliance, + "X-Edge-Score-Reason": ",".join(self.reasons), + "X-Edge-Matrix-Version": self.matrix_version, + } + + +# ── Layer 1: Universal Behavioral Shield ────────────────────────────────────── + + +def _running_mean_variance(state: SessionState) -> tuple[float, float]: + """Compute mean dwell and timing variance from the cookie's running sums. + + Uses the algebraic identity Var(X) = E[X²] − E[X]² so we never need to + store the history array. seq=0 case (no transitions yet) returns + (0, 0) — the timing rules check the warmup gate separately.""" + if state.seq <= 0: + return 0.0, 0.0 + mean = state.sum_dt / state.seq + second_moment = state.sum_dt_sq / state.seq + # Floating-point can push the second moment slightly below mean² when + # they're nearly equal (all dwells identical). Clamp to 0. + var = max(0.0, second_moment - mean * mean) + return mean, var + + +def score_layer1(state: SessionState) -> tuple[int, list[str], float, float]: + """Apply the L1 rules to a decoded session state. Returns + (score_contribution, reasons, mean_dwell_s, variance_s2).""" + mean, var = _running_mean_variance(state) + score = 0 + reasons: list[str] = [] + + # Impossibly Fast and Robotic Consistency both share the seq>3 warmup + # gate. Below that, only cookie compliance fires (which is handled by + # the caller — see score_combined — because it needs to know whether + # the cookie was missing/tampered, info that's lost by the time we + # have a SessionState in hand). 
+ if state.seq > L1_TIMING_WARMUP_SEQ: + if mean < L1_FAST_DWELL_THRESHOLD_S: + score += L1_SCORE_FAST + reasons.append("impossibly-fast") + if var < L1_ROBOTIC_VARIANCE_THRESHOLD and L1_ROBOTIC_DWELL_LOW_S <= mean <= L1_ROBOTIC_DWELL_HIGH_S: + score += L1_SCORE_ROBOTIC + reasons.append("robotic-consistency") + + return min(score, 100), reasons, mean, var + + +# ── Layer 2: Route Transition Shield ────────────────────────────────────────── + + +def _transition_prob(matrix: dict, prev_route: str, current_route: str, vocab_size: int) -> float: + """Laplace-smoothed P(current | prev) lookup from a serialized matrix. + + Matrix shape (matches what backend.scoring.matrix emits): + { + "counts": {prev_route: {current_route: count, ...}}, + "row_totals": {prev_route: total_out_count}, + "vocab_size": int, + ... + } + + Returns the smoothed conditional probability. Always > 0 (Laplace + floor). Unseen prev rows get the all-uniform smoothed prior.""" + counts = matrix.get("counts", {}) + row_totals = matrix.get("row_totals", {}) + prev_row = counts.get(prev_route, {}) + numerator = prev_row.get(current_route, 0) + L2_LAPLACE_ALPHA + denominator = row_totals.get(prev_route, 0) + L2_LAPLACE_ALPHA * vocab_size + if denominator <= 0: + # Truly empty matrix — return the uniform prior. + return 1.0 / max(vocab_size, 1) + return numerator / denominator + + +def score_layer2( + matrix: dict | None, + prev_route: Route | None, + prev_anchor_route: Route | None, + current_route: Route, +) -> tuple[int, list[str], float]: + """Apply the L2 rules. Returns (score_contribution, reasons, trans_prob). + + - ``matrix`` is the serialized transition matrix; None → L2 disabled + (matrix not yet trained, first 7 days of deployment per §4.3). + - ``prev_route`` is the immediate previous route this session visited + (None on first request — L2 returns 0 with no reasons). 
+ - ``prev_anchor_route`` is the most-recent ANCHOR route (skipping + auxiliary pages like /about-us) — used for the skip-gram lookback. + Pass None to disable skip-gram; pass the same as prev_route to + collapse skip-gram down to "look one step back" semantics. + - ``current_route`` is the request being scored. + """ + if matrix is None or prev_route is None: + return 0, [], 1.0 + + vocab_size = int(matrix.get("vocab_size", 0)) + if vocab_size <= 0: + return 0, [], 1.0 + + direct_p = _transition_prob(matrix, prev_route.path, current_route.path, vocab_size) + if prev_anchor_route is not None and prev_anchor_route.path != prev_route.path: + anchor_p = _transition_prob(matrix, prev_anchor_route.path, current_route.path, vocab_size) * L2_SKIPGRAM_BETA + trans_prob = max(direct_p, anchor_p) + else: + trans_prob = direct_p + + score = _l2_score_from_trans_prob(trans_prob) + reasons = ["low-transition-prob"] if score >= 50 else [] + return score, reasons, trans_prob + + +# ── Combined evaluation ─────────────────────────────────────────────────────── + + +def _blend_weight(matrix_age_days: float) -> float: + """Layer 2 weight ramps from 0 → 1 over the 3 days following Day 7. + + Per §4.3: avoids a step-function score change the moment training + becomes available. Day 7 → 0.0, Day 8 → 0.333, Day 10 → 1.0.""" + if matrix_age_days < 7.0: + return 0.0 + if matrix_age_days >= 10.0: + return 1.0 + return (matrix_age_days - 7.0) / 3.0 + + +def score_combined( + *, + state: SessionState | None, + cookie_compliance: str = "ok", + current_route: Route, + prev_route: Route | None, + prev_anchor_route: Route | None = None, + matrix: dict | None = None, + matrix_age_days: float = 0.0, +) -> ScoreResult: + """One-stop scorer. The caller assembles the inputs from cookie decode + + route history + loaded matrix; this function applies all rules, + blends per §4.3, quantizes per §3.3, and returns the headers. 
+ + ``state=None, cookie_compliance != 'ok'`` is the "no cookie" path — + cookie compliance rule fires when seq>1 (i.e. this is NOT the first + request from this client) and the cookie is missing/tampered.""" + + result = ScoreResult(cookie_compliance=cookie_compliance) + result.matrix_version = str(matrix.get("version", "")) if matrix else "" + + # Cookie-compliance rule (§4.1). Note: we don't have seq if state is + # None, so the caller must hint "is this multi-request?" via cookie + # compliance status. ``missing`` and ``expired`` get the historical + # 75; ``tampered`` gets the full 100 (security) because it's a + # deliberate evasion signal. ``rotated`` is benign (fresh cookie + # post-rotation). + l1_from_cookie = 0 + if cookie_compliance == "tampered": + l1_from_cookie = L1_SCORE_COOKIE_TAMPERED + result.reasons.append("cookie-tampered") + elif cookie_compliance in ("missing", "expired"): + l1_from_cookie = L1_SCORE_COOKIE_MISSING + result.reasons.append(f"cookie-{cookie_compliance}") + + # Layer 1 timing rules (only meaningful with a valid decoded state). + if state is not None: + l1_timing, l1_reasons, mean, var = score_layer1(state) + result.mean_dwell_s = mean + result.variance_s2 = var + result.reasons.extend(l1_reasons) + result.l1_score = min(100, l1_from_cookie + l1_timing) + else: + result.l1_score = l1_from_cookie + + # Layer 2 transition rule (gated on matrix availability). + l2_score, l2_reasons, trans_prob = score_layer2(matrix, prev_route, prev_anchor_route, current_route) + result.l2_score = l2_score + result.trans_prob = trans_prob + result.reasons.extend(l2_reasons) + + # Combined per §4.3, quantized per §3.3. 
+ w_l2 = _blend_weight(matrix_age_days) + raw_combined = result.l1_score + result.l2_score * w_l2 + result.score = quantize_score(raw_combined) + + return result diff --git a/backend/services/service_manager.py b/backend/services/service_manager.py index 6ceb59e9..5bc93c1e 100644 --- a/backend/services/service_manager.py +++ b/backend/services/service_manager.py @@ -1,11 +1,23 @@ """Service management layer for consistent configuration listing and enrichment.""" import os +import threading +import time from typing import Any from backend import config as svcconfig from backend.core import duckdb as _db +# Cache dirs hold thousands of small parquet files; recursively stat'ing +# them on every /api/bootstrap, /api/services, and admin tile render was a +# big chunk of the page-navigation lag (200-1500ms per call). The dir +# contents change on cron tick (every 2 min for most services), so a 60s +# TTL is comfortably below the freshness floor users notice in the +# "Local Cache" column while eliminating the per-request walk. +_DIR_STATS_TTL_SEC = 60.0 +_dir_stats_cache: dict[str, tuple[float, int, int]] = {} +_dir_stats_lock = threading.Lock() + def _get_dir_stats(path: str) -> tuple[int, int]: """Return ``(total_size_bytes, file_count)`` for ``path`` recursively. @@ -15,7 +27,14 @@ def _get_dir_stats(path: str) -> tuple[int, int]: thousands of small parquet files were the main motivator. Symlinks are skipped (preserves the prior os.walk behavior). 
""" + now = time.monotonic() + with _dir_stats_lock: + entry = _dir_stats_cache.get(path) + if entry and (now - entry[0]) < _DIR_STATS_TTL_SEC: + return (entry[1], entry[2]) if not os.path.exists(path): + with _dir_stats_lock: + _dir_stats_cache[path] = (now, 0, 0) return (0, 0) total_size = 0 file_count = 0 @@ -24,22 +43,35 @@ def _get_dir_stats(path: str) -> tuple[int, int]: d = stack.pop() try: with os.scandir(d) as it: - for entry in it: + for entry_de in it: try: - if entry.is_symlink(): + if entry_de.is_symlink(): continue - if entry.is_dir(follow_symlinks=False): - stack.append(entry.path) - elif entry.is_file(follow_symlinks=False): - total_size += entry.stat(follow_symlinks=False).st_size + if entry_de.is_dir(follow_symlinks=False): + stack.append(entry_de.path) + elif entry_de.is_file(follow_symlinks=False): + total_size += entry_de.stat(follow_symlinks=False).st_size file_count += 1 except OSError: continue except OSError: continue + with _dir_stats_lock: + _dir_stats_cache[path] = (now, total_size, file_count) return (total_size, file_count) +def _bust_dir_stats_cache(path: str | None = None) -> None: + """Invalidate a cached dir-stat entry. Called after operations that + materially change the cache contents (rebuild, teardown, ingest) + so the dashboard's Local Cache column updates immediately.""" + with _dir_stats_lock: + if path is None: + _dir_stats_cache.clear() + return + _dir_stats_cache.pop(path, None) + + def get_enriched_services(active_service_id: str | None = None) -> list[dict[str, Any]]: """Return all configured services enriched with status and database stats. 
diff --git a/backend/state_sync.py b/backend/state_sync.py index 5a6227e9..45ec4980 100644 --- a/backend/state_sync.py +++ b/backend/state_sync.py @@ -1,5 +1,6 @@ import json import logging +import time from backend import config as svcconfig from backend.core.duckdb import _get_fos_client, get_source_for_service @@ -13,6 +14,217 @@ def get_admin_state_key(source: dict) -> str: return f"{iceberg_root}/meta/admin_state.json" +def get_scoring_matrix_key(source: dict) -> str: + """FOS key for the trained scoring matrix JSON. + + Separate from admin_state because the matrix is a build artifact (gitignored, + not in admin_state.custom_fields). Lives under the same iceberg/meta/ prefix + so analyst hosts read the same blob the admin host wrote. + """ + base_prefix = source.get("prefix", "").strip("/") + iceberg_root = f"{base_prefix}/iceberg" if base_prefix else "iceberg" + return f"{iceberg_root}/meta/scoring_matrix.json" + + +def get_scoring_matrix_history_key(source: dict, version: str) -> str: + """FOS key for a historical (pre-overwrite) scoring matrix. + + Lives under ``iceberg/meta/scoring_matrix_history/{version}.json`` + so the operator can list past matrices and roll back to a known-good + one if a fresh retrain regresses AUC. + """ + base_prefix = source.get("prefix", "").strip("/") + iceberg_root = f"{base_prefix}/iceberg" if base_prefix else "iceberg" + return f"{iceberg_root}/meta/scoring_matrix_history/{version}.json" + + +def list_scoring_matrix_versions(service_id: str) -> list[dict]: + """List archived matrix versions under iceberg/meta/scoring_matrix_history/. + + Returns ``[{"version": "...", "key": "...", "size_bytes": int, + "last_modified": ""}, ...]`` sorted by last_modified descending. + Best-effort: returns empty list on any S3 error. 
+ """ + source = get_source_for_service(service_id) + if not source: + return [] + base_prefix = source.get("prefix", "").strip("/") + iceberg_root = f"{base_prefix}/iceberg" if base_prefix else "iceberg" + prefix = f"{iceberg_root}/meta/scoring_matrix_history/" + try: + s3 = _get_fos_client(source) + out: list[dict] = [] + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=source["bucket"], Prefix=prefix): + for obj in page.get("Contents", []) or []: + key = obj.get("Key", "") + # Strip prefix + .json suffix to recover the version string. + version = key[len(prefix) :].removesuffix(".json") + last_mod = obj.get("LastModified") + out.append( + { + "version": version, + "key": key, + "size_bytes": int(obj.get("Size", 0)), + "last_modified": last_mod.isoformat() if last_mod else None, + } + ) + out.sort(key=lambda r: r.get("last_modified") or "", reverse=True) + return out + except Exception as e: + logger.debug(f"[state_sync] list_scoring_matrix_versions failed: {e}") + return [] + + +def restore_scoring_matrix_version(service_id: str, version: str) -> dict | None: + """Copy a historical scoring_matrix_history/{version}.json back to + the current scoring_matrix.json key. The next backend that calls + fetch_matrix_from_fos sees the restored matrix. + + Returns ``{"version": "...", "restored_at": ""}`` on success, + None when the version doesn't exist. Caller is responsible for + busting analytics caches + deleting the local matrix.json so + _load_matrix's resolution-order step 1 doesn't shadow the restored + FOS matrix. + + Live Wasm at the edge still uses its previously-embedded matrix — + a full restore-to-edge requires re-running deploy_wasm.sh. 
+ """ + import datetime as _dt + + source = get_source_for_service(service_id) + if not source or source.get("access_level") == "read_only": + return None + history_key = get_scoring_matrix_history_key(source, version) + current_key = get_scoring_matrix_key(source) + try: + s3 = _get_fos_client(source) + try: + s3.head_object(Bucket=source["bucket"], Key=history_key) + except Exception: + return None # version doesn't exist + + # SNAPSHOT-BEFORE-OVERWRITE: copy the current live matrix to the + # history prefix BEFORE the restore copy_object overwrites it. + # Without this, a bad restore (operator picks the wrong version) + # is irreversible because the only copy of the prior-live matrix + # was the one we're about to clobber. Best-effort: NoSuchKey (no + # prior current) is silent; other failures log at DEBUG and do + # NOT block the restore — the operator's active intent wins. + epoch_ms = int(time.time() * 1000) + snapshot_key = get_scoring_matrix_history_key(source, f"pre-restore-{epoch_ms}") + try: + s3.copy_object( + Bucket=source["bucket"], + Key=snapshot_key, + CopySource={"Bucket": source["bucket"], "Key": current_key}, + ContentType="application/json", + ) + logger.info(f"[state_sync] Snapshotted pre-restore matrix to {snapshot_key}") + except Exception as e: + logger.debug(f"[state_sync] Could not snapshot pre-restore matrix: {e}") + + s3.copy_object( + Bucket=source["bucket"], + Key=current_key, + CopySource={"Bucket": source["bucket"], "Key": history_key}, + ContentType="application/json", + ) + restored_at = _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds") + logger.info(f"[state_sync] Restored scoring matrix {version!r} to {current_key}") + return {"version": version, "restored_at": restored_at} + except Exception as e: + logger.warning(f"[state_sync] restore_scoring_matrix_version failed: {e}") + return None + + +def publish_matrix_to_fos(service_id: str, matrix: dict) -> None: + """Upload the trained scoring matrix JSON to FOS so analyst 
replicas + + GCE backend can fetch the same matrix the admin host has on disk. + + Without this, every fresh container needs the matrix scp'd in + manually (which is how the AUC field got bootstrapped the first + time). With this, calling enable_scoring or retrain on any + read_write host pushes the matrix to FOS exactly once, and every + other host's ``fetch_matrix_from_fos`` picks it up. + + Idempotent — calling with the same matrix overwrites the prior copy. + Silent no-op on read_only sources (analyst pods don't write back). + """ + source = get_source_for_service(service_id) + if not source or source.get("access_level") == "read_only": + return + try: + s3 = _get_fos_client(source) + bucket = source["bucket"] + key = get_scoring_matrix_key(source) + + # SNAPSHOT-BEFORE-OVERWRITE: copy the current matrix (if any) to + # the history prefix BEFORE the new put_object. Lets the + # operator roll back to a known-good matrix if a fresh retrain + # regresses AUC. Best-effort: history-snapshot failure (no + # prior current, permission edge case) does NOT block the + # publish — the operator's active intent always wins. + try: + prior = s3.get_object(Bucket=bucket, Key=key) + prior_bytes = prior["Body"].read() + prior_matrix = json.loads(prior_bytes.decode("utf-8")) + prior_version = prior_matrix.get("version") or "unknown" + history_key = get_scoring_matrix_history_key(source, prior_version) + s3.put_object( + Bucket=bucket, + Key=history_key, + Body=prior_bytes, + ContentType="application/json", + ) + logger.info(f"[state_sync] Snapshotted prior matrix to {history_key}") + except Exception as e: + # NoSuchKey on first-ever publish is expected and silent; + # other failures log at DEBUG so we know about them without + # spamming the operator. 
+ logger.debug(f"[state_sync] Could not snapshot prior matrix: {e}") + + s3.put_object( + Bucket=bucket, + Key=key, + Body=json.dumps(matrix).encode("utf-8"), + ContentType="application/json", + ) + logger.info(f"[state_sync] Published scoring matrix to {key} (matrix_version={matrix.get('version', '?')})") + except Exception as e: + logger.warning(f"[state_sync] Failed to publish scoring matrix: {e}") + + +def fetch_matrix_from_fos(service_id: str) -> dict | None: + """Pull the trained matrix JSON from FOS. Returns None if missing + (no admin host has published it yet) or unreadable. + + Read-side path uses CDN when configured (cdn_url + cdn_secret) so + analyst hosts don't burn a Class B FOS op on every backend restart; + falls back to S3 GetObject when the CDN isn't wired. + """ + source = get_source_for_service(service_id) + if not source: + return None + key = get_scoring_matrix_key(source) + try: + if source.get("cdn_url"): + body = _cdn_get(source, key) + m = json.loads(body.decode("utf-8")) + else: + s3 = _get_fos_client(source) + try: + resp = s3.get_object(Bucket=source["bucket"], Key=key) + except s3.exceptions.NoSuchKey: + return None + m = json.loads(resp["Body"].read().decode("utf-8")) + if isinstance(m, dict) and m: + return m + except Exception as e: + logger.debug(f"[state_sync] Could not fetch scoring matrix from FOS: {e}") + return None + + def export_admin_state(service_id: str): source = get_source_for_service(service_id) if not source or source.get("access_level") == "read_only": @@ -52,9 +264,16 @@ def _cdn_get(source: dict, key: str) -> bytes: import urllib.parse import urllib.request + from backend.models.lake import _safe_cdn_url from backend.utils.telemetry import record_cdn_call - cdn_url = (source.get("cdn_url") or "").rstrip("/") + # SSRF guard: ``cdn_url`` is user-supplied at provision time. 
Reject + # anything that isn't an https Fastly hostname so the helper can't be + # turned into an outbound HTTP probe of internal services (GCE + # metadata, peer VMs, link-local addresses). + cdn_url = _safe_cdn_url((source.get("cdn_url") or "").rstrip("/")) + if not cdn_url: + raise urllib.error.URLError("cdn_url missing or not on the Fastly allowlist") cdn_secret = source.get("cdn_secret") or "" url = f"{cdn_url}/{urllib.parse.quote(key, safe='/')}" if cdn_secret: @@ -99,18 +318,59 @@ def import_admin_state(service_id: str): from backend.core import metadata_db - metadata_db.replace_audit_for_service(service_id, state.get("_audit_logs", [])) - metadata_db.replace_views_for_service(service_id, state.get("_views", [])) + # NON-DESTRUCTIVE on read_only analyst hosts: the analyst pod + # writes views and audit_logs locally (the routers have no + # access_level gate), and export_admin_state refuses to push from + # read_only sources — so the old wholesale replace_*_for_service + # silently wiped analyst writes on every metadata_sync cron tick + # with no chance of recovery. On read_only hosts we upsert/merge + # so local rows survive. On read_write hosts the original + # replace_* still runs — the writer is the source of truth there. + is_read_only = (source or {}).get("access_level") == "read_only" + if is_read_only: + metadata_db.upsert_views_for_service(service_id, state.get("_views", [])) + metadata_db.merge_audit_for_service(service_id, state.get("_audit_logs", [])) + else: + metadata_db.replace_audit_for_service(service_id, state.get("_audit_logs", [])) + metadata_db.replace_views_for_service(service_id, state.get("_views", [])) # Merge custom_fields into the local service config so the analyst's # UI catalog matches what the admin has defined. 
+ # + # WHY THIS IS A MERGE (not an overwrite): scoring is enabled by code + # that injects 6 well-known custom_fields (edge_score, edge_score_l1, + # edge_score_l2, edge_cookie_compliance, edge_score_reason, edge_sid) + # via _SCORING_CUSTOM_FIELDS. If the FOS-stored admin_state.json + # predates scoring enablement (or was last written by a host that + # didn't have scoring), an unconditional overwrite silently strips + # those fields on every metadata_sync tick — which makes ingest + # write all-NULL values for the scoring columns even though Fastly + # is still emitting the data correctly. The 2026-06-02 production + # incident was exactly this. When scoring is enabled in the local + # cfg, ALWAYS re-inject the canonical list from code; the code is + # the source of truth, not whatever happens to be in FOS. if "custom_fields" in state: cfg = svcconfig.load_config(service_id) if cfg is not None: from backend.core import log_fields as _lf lf = _lf.get_lf_config(cfg) - lf["custom_fields"] = state["custom_fields"] + remote_fields = list(state["custom_fields"]) + + scoring_enabled = bool((cfg.get("scoring") or {}).get("enabled")) + if scoring_enabled: + from backend.provision.session_scoring_orchestrator import ( + _SCORING_CUSTOM_FIELDS, + _SCORING_FIELD_NAMES, + ) + + # Strip any scoring-named entries the remote might carry + # (stale, partial, or just plain different) and re-add + # the canonical entries from code. + remote_fields = [cf for cf in remote_fields if cf.get("name") not in _SCORING_FIELD_NAMES] + remote_fields.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS) + + lf["custom_fields"] = remote_fields cfg["log_fields"] = lf svcconfig.save_config(service_id, cfg) diff --git a/backend/utils/bounded_cache.py b/backend/utils/bounded_cache.py new file mode 100644 index 00000000..90e47f91 --- /dev/null +++ b/backend/utils/bounded_cache.py @@ -0,0 +1,186 @@ +"""Bounded LRU+TTL cache with lazy reaping. 
+ +Drop-in replacement for the ad-hoc ``dict[key, (timestamp, value)]`` +cache pattern scattered through the codebase. Each cache enforces: + +- **A maximum size.** Writes past ``maxsize`` evict the least-recently- + used entry. Guards against unique-key cardinality (e.g., diverse + dashboard filter combinations each minting a distinct cache key). +- **A TTL.** Reads return ``default`` for expired entries (they appear + absent via ``__contains__`` / ``get``), and every Nth write triggers + a sweep that drops all expired entries. + +Lazy reaping (vs. a background reaper thread) keeps the cache decoupled +from APScheduler — these caches sit upstream of the cron infrastructure +and pulling in scheduler imports here would invert the dependency graph. + +Stored values are arbitrary; the cache stamps insertion time internally +so call sites don't have to thread timestamps through their own tuples. +That said, the existing migration call sites still store ``(timestamp, +payload)`` tuples — we keep the value verbatim so the migration is a +one-line constructor swap. + +Threading: the cache holds a re-entrant lock for all mutations. Call +sites that already wrap their reads/writes in an outer lock are still +safe (RLock allows the same thread to acquire twice). Concurrent reads +on different keys do contend on the lock, but the operations under it +are O(1) dict / OrderedDict moves, so the contention window is small. +""" + +from __future__ import annotations + +import threading +import time +from collections import OrderedDict +from collections.abc import Iterator +from typing import Any + +# Lazy reaper cadence — every Nth write triggers a sweep of expired +# entries. Smaller N = more eager cleanup but more CPU per write; larger +# N = less CPU but more dead entries between sweeps. 
100 was picked so +# the worst-case stale-entry count is bounded to ~100 even on a cache +# whose entries all expired (the cache is also bounded by maxsize, so +# the actual upper limit is min(maxsize, stale-count-since-last-reap)). +_REAP_EVERY_N_WRITES = 100 + +_MISSING = object() + + +class BoundedTTLCache: + """Thread-safe LRU+TTL cache with lazy reaping. + + Construct with explicit ``maxsize`` and ``ttl_seconds``: + + cache = BoundedTTLCache(maxsize=500, ttl_seconds=30) + + Then use it like a dict — the cache tracks insert time internally and + treats expired entries as absent: + + cache[key] = value # stores; evicts oldest if over maxsize + v = cache.get(key, default) # returns default if missing OR expired + if key in cache: ... # False if missing OR expired + cache.pop(key, default) # removes; returns value or default + cache.clear() # drop everything + len(cache) # count of CURRENTLY-LIVE entries + + LRU touch happens on successful reads — the read'd key moves to the + most-recently-used end, so the maxsize evictor drops genuinely cold + keys before active ones. 
+ """ + + def __init__(self, *, maxsize: int, ttl_seconds: float): + if maxsize <= 0: + raise ValueError("maxsize must be positive") + if ttl_seconds <= 0: + raise ValueError("ttl_seconds must be positive") + self.maxsize = maxsize + self.ttl = float(ttl_seconds) + self._values: OrderedDict[Any, Any] = OrderedDict() + self._inserted_at: dict[Any, float] = {} + self._writes_since_reap = 0 + self.lock = threading.RLock() + + def __contains__(self, key: Any) -> bool: + return self.get(key, _MISSING) is not _MISSING + + def __getitem__(self, key: Any) -> Any: + result = self.get(key, _MISSING) + if result is _MISSING: + raise KeyError(key) + return result + + def get(self, key: Any, default: Any = None) -> Any: + with self.lock: + if key not in self._values: + return default + if self._is_expired_locked(key): + self._evict_locked(key) + return default + self._values.move_to_end(key) + return self._values[key] + + def __setitem__(self, key: Any, value: Any) -> None: + with self.lock: + self._values[key] = value + self._values.move_to_end(key) + self._inserted_at[key] = time.monotonic() + # Maxsize enforcement: drop LRU entries one at a time until + # under the cap. Usually a single pop is enough. 
+ while len(self._values) > self.maxsize: + oldest_key, _ = self._values.popitem(last=False) + self._inserted_at.pop(oldest_key, None) + self._writes_since_reap += 1 + if self._writes_since_reap >= _REAP_EVERY_N_WRITES: + self._reap_expired_locked() + + def __delitem__(self, key: Any) -> None: + with self.lock: + del self._values[key] + self._inserted_at.pop(key, None) + + def pop(self, key: Any, default: Any = _MISSING) -> Any: + with self.lock: + if key not in self._values: + if default is _MISSING: + raise KeyError(key) + return default + value = self._values.pop(key) + self._inserted_at.pop(key, None) + return value + + def clear(self) -> None: + with self.lock: + self._values.clear() + self._inserted_at.clear() + self._writes_since_reap = 0 + + def __len__(self) -> int: + # Returns the raw entry count including any expired entries that + # haven't been lazily reaped yet. Callers that need a strictly + # live count can call ``reap()`` first; in practice the + # over-count is bounded by ``_REAP_EVERY_N_WRITES``. + with self.lock: + return len(self._values) + + def __iter__(self) -> Iterator[Any]: + # Snapshot the keys so iteration doesn't blow up if a concurrent + # writer mutates the OrderedDict. Callers shouldn't mutate during + # iteration anyway. + with self.lock: + return iter(list(self._values.keys())) + + def keys(self) -> list[Any]: + """Snapshot of currently-stored keys (possibly including + unreaped-expired entries). Returns a fresh list for safe + iteration even if the cache is mutated during the walk.""" + with self.lock: + return list(self._values.keys()) + + def reap(self) -> int: + """Drop all currently-expired entries. 
Returns the number removed.""" + with self.lock: + return self._reap_expired_locked() + + # ── internal (caller must hold self.lock) ────────────────────────── + + def _is_expired_locked(self, key: Any) -> bool: + inserted = self._inserted_at.get(key) + if inserted is None: + # Defensive: a value with no insert-time record is treated as + # immediately expired so it gets cleaned up next visit. + return True + return (time.monotonic() - inserted) > self.ttl + + def _evict_locked(self, key: Any) -> None: + self._values.pop(key, None) + self._inserted_at.pop(key, None) + + def _reap_expired_locked(self) -> int: + cutoff = time.monotonic() - self.ttl + # Materialise the expired-key list before mutating self._values + # to avoid "dictionary changed size during iteration". + expired_keys = [k for k, ts in self._inserted_at.items() if ts < cutoff] + for k in expired_keys: + self._evict_locked(k) + self._writes_since_reap = 0 + return len(expired_keys) diff --git a/backend/utils/date_utils.py b/backend/utils/date_utils.py index 66eea771..6a99ae8d 100644 --- a/backend/utils/date_utils.py +++ b/backend/utils/date_utils.py @@ -42,6 +42,23 @@ def parse_date_window(start: str, end: str, default_days: int = 7) -> tuple[str, return iso_z(start_dt), iso_z(end_dt) +def safe_iso(dt) -> str | None: + """Normalise a datetime or string to an ISO-8601 string ending in Z. + + DuckDB TIMESTAMP is timezone-naive but always represents UTC; appending + Z ensures JavaScript parses it as UTC instead of local time. Used by + both the duckdb core layer and the repositories layer. + """ + if dt is None: + return None + if hasattr(dt, "isoformat"): + s = dt.isoformat() + if not s.endswith("Z") and "+" not in s and s.count("-") <= 2: + s += "Z" + return s + return str(dt) + + def parse_window_str_to_dt(s: str) -> datetime: """Parse a string returned by ``parse_date_window`` back into a UTC datetime. 
# Fastly's token-scope vocabulary, per
# https://www.fastly.com/documentation/reference/api/auth-tokens/user/
# ``global`` is the ONLY scope that grants destructive service operations.
# ``global:read``, ``purge_select``, ``purge_all`` must all be rejected for
# destructive use.
_DESTRUCTIVE_SCOPE = "global"


def _clean_str(value: Any) -> str:
    """Return ``value`` as a stripped string, or ``""`` when it is unusable.

    Strings are stripped; non-bool integers are stringified so a numeric
    ``customer_id`` compares equal to its string form. Every other type
    (and every falsy value) collapses to ``""``, which the tenant check
    below treats as "missing" and therefore rejects.
    """
    if not value:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, int) and not isinstance(value, bool):
        return str(value)
    return ""


def _deny(error: str, message: str) -> "HTTPException":
    """Build the 401 ``HTTPException`` raised by every failed validation step."""
    return HTTPException(status_code=401, detail={"error": error, "message": message})


def _parse_scopes(raw: Any) -> list[str]:
    """Normalize the ``scope`` field from /tokens/self into a list of scopes.

    Accepts either a whitespace-separated string or a list/tuple of entries.
    Entries are stringified and stripped; falsy and whitespace-only entries
    are dropped so the result never contains an empty scope. Any other input
    type (including ``None``) yields an empty list.
    """
    if isinstance(raw, str):
        # str.split() with no separator never produces empty fragments.
        return raw.split()
    if isinstance(raw, (list, tuple)):
        stripped = (str(item).strip() for item in raw if item)
        return [scope for scope in stripped if scope]
    return []


def validate_destructive_token(token: str, *, service_id: str) -> dict[str, Any]:
    """Validate that ``token`` is allowed to perform destructive ops on ``service_id``.

    Raises ``HTTPException(401)`` on any failure:
      * empty/missing token
      * the /tokens/self call raises, or returns something that is not a dict
      * ``scope`` missing, not parseable, or doesn't include ``global``
      * ``services`` is a non-empty list and ``service_id`` is not a member
      * the ``/service/{service_id}`` lookup raises
      * the token's ``customer_id`` or the service's ``customer_id`` is
        missing, not a string/integer, or the two differ

    Returns the validated token response dict on success so the caller can log
    ``token_data["id"]`` / ``user_id`` for audit.
    """
    token = (token or "").strip()
    if not token:
        raise _deny(
            "token_required",
            "A Fastly API token with the 'global' scope is required "
            "for destructive operations. Server-stored credentials are "
            "not accepted here.",
        )

    try:
        token_data = fastly("GET", "/tokens/self", token=token)
    except Exception as e:
        # Don't leak the raw error to the caller — could include the token
        # value or a useful error message for the attacker.
        logger.warning("[fastly-auth] /tokens/self call failed: %s", e)
        raise _deny("token_validation_failed", "Could not validate token with Fastly.") from None

    if not isinstance(token_data, dict):
        logger.warning("[fastly-auth] /tokens/self returned non-dict: %r", type(token_data))
        raise _deny("token_validation_failed", "Unexpected token response shape.")

    scopes = _parse_scopes(token_data.get("scope"))
    if _DESTRUCTIVE_SCOPE not in scopes:
        logger.warning(
            "[fastly-auth] token (id=%s, user=%s) missing 'global' scope; got=%r",
            token_data.get("id"),
            token_data.get("user_id") or "(automation)",
            scopes,
        )
        raise _deny(
            "insufficient_scope",
            "Token does not have the 'global' scope required for destructive operations. "
            "Use a Fastly token with full 'global' permissions, not 'global:read', "
            "'purge_select', or 'purge_all'.",
        )

    # Service binding check: empty/missing services list means "unrestricted",
    # which is acceptable. Non-empty list must include the target service.
    bound_services = token_data.get("services")
    if isinstance(bound_services, list) and bound_services and service_id not in bound_services:
        logger.warning(
            "[fastly-auth] token (id=%s) bound to %d services but not target service_id=%s",
            token_data.get("id"),
            len(bound_services),
            service_id,
        )
        raise _deny(
            "service_not_authorized",
            "Token is bound to a service list that does not include the target service. "
            "Use a token with global access or with this service in its allow-list.",
        )

    # Tenant binding check: the scope+services checks above don't prevent
    # "use a global token from MY own Fastly account against someone
    # else's service ID". Cross-reference the service's owning
    # ``customer_id`` (fetched with the same token, so any access denial
    # there fails closed too) against the token holder's ``customer_id``.
    # Mismatch → reject. ``_clean_str`` keeps a non-string id from raising
    # AttributeError here, which would surface as a 500 instead of a 401.
    token_customer = _clean_str(token_data.get("customer_id"))
    try:
        service_data = fastly("GET", f"/service/{service_id}", token=token)
    except Exception as e:
        logger.warning(
            "[fastly-auth] /service/%s call failed during tenant verification: %s",
            service_id,
            e,
        )
        raise _deny(
            "tenant_verification_failed",
            "Could not verify token tenant against target service.",
        ) from None

    service_customer = ""
    if isinstance(service_data, dict):
        service_customer = _clean_str(service_data.get("customer_id"))

    # Either side missing counts as a mismatch (fail closed).
    if not token_customer or not service_customer or token_customer != service_customer:
        logger.warning(
            "[fastly-auth] tenant mismatch: token customer=%r vs service customer=%r (token id=%s, service=%s)",
            token_customer or "(missing)",
            service_customer or "(missing)",
            token_data.get("id"),
            service_id,
        )
        raise _deny(
            "tenant_mismatch",
            "The supplied token is not authorized for the target service's tenant. "
            "Use a token issued under the same Fastly account that owns this service.",
        )

    logger.info(
        "[fastly-auth] destructive op authorized: token id=%s user=%s service=%s",
        token_data.get("id"),
        token_data.get("user_id") or "(automation)",
        service_id,
    )
    return token_data
fastly() handles auth + retry + + # telemetry internally — replaces the old urllib.request flow plus + # the redundant outer tracked_call wrapper. try: - from backend.utils.telemetry import tracked_call - except ImportError: - tracked_call = None + from backend.core.fastly.client import fastly - def _do_fetch(): - try: - headers = {"User-Agent": "FastlyLogAnalysis/1.0", "Accept": "application/json", "Fastly-Key": api_key} - # /datacenters endpoint requires auth but provides the exact flat list of coordinates we need - req = urllib.request.Request("https://api.fastly.com/datacenters", headers=headers) - with urllib.request.urlopen(req, timeout=15) as resp: - pops = json.loads(resp.read().decode("utf-8")) - - os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True) - with open(CACHE_FILE, "w") as f: - json.dump(pops, f) - return True - except Exception as e: - print(f"Warning: POP fetch failed: {e}") - return False - - if tracked_call: - with tracked_call("GET", "/datacenters", service="Fastly API"): - return _do_fetch() - return _do_fetch() + pops = fastly("GET", "/datacenters", token=api_key) + os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True) + with open(CACHE_FILE, "w") as f: + json.dump(pops, f) + return True + except Exception as e: + print(f"Warning: POP fetch failed: {e}") + return False def get_pop_locations(): diff --git a/backend/utils/remote_access.py b/backend/utils/remote_access.py index 1ab7dda2..a8e9d280 100644 --- a/backend/utils/remote_access.py +++ b/backend/utils/remote_access.py @@ -83,7 +83,17 @@ # Local "is this a real LAN hostname" allowlist; admins can extend via env. # ``testserver`` is starlette.testclient.TestClient's default Host header. 
def _is_private_or_loopback(ip_str: str) -> bool:
    """Return True when ``ip_str`` is a loopback address or a known local stub name.

    Despite the name (kept so existing callers keep working), private
    RFC1918 / link-local addresses are NOT treated as local: only
    ``is_loopback`` IP literals qualify. Values that do not parse as an IP
    address are local only when they are one of the literal stub names
    ``"testclient"`` or ``"localhost"``.
    """
    try:
        parsed = ipaddress.ip_address(ip_str)
    except ValueError:
        # Not an IP literal — hostnames or test-client stub names.
        return ip_str in ("testclient", "localhost")
    return parsed.is_loopback


def is_request_remote(request: Request) -> bool:
    """Classify a request as coming from a remote analyst (True) or locally (False).

    The decision is based on ``request.client.host``; a request without
    client information is treated as ``"127.0.0.1"``. The ``Host`` header is
    never consulted.

      * A peer that is neither loopback nor a local stub name is remote.
      * A loopback/stub peer is remote only when it sends
        ``X-Remote-Analyst: 1`` AND the tunnel manager reports that sharing
        is active; the tunnel manager is not consulted otherwise.

    The deployment this was written for fronts uvicorn with a same-host
    proxy and ``--proxy-headers --forwarded-allow-ips=127.0.0.1``, so a
    non-loopback ``request.client.host`` is expected to be the real client
    address.
    """
    peer = request.client
    peer_host = peer.host if peer else "127.0.0.1"

    # Anything that is not loopback (or a test stub name) is genuinely remote.
    if not _is_private_or_loopback(peer_host):
        return True

    # Loopback peer: the explicit marker header is required before the
    # tunnel state is even looked at.
    if request.headers.get("x-remote-analyst") != "1":
        return False

    # Marker present — honour it only while tunnel sharing is live, so a
    # stale header on a non-sharing instance cannot flip the result.
    return True if get_tunnel_manager().is_sharing_active() else False


def get_client_ip(request: Request, *, is_remote: bool) -> str:
    """Return the client IP as reported by ``request.client.host``.

    ``X-Forwarded-For`` is never parsed here; the value comes straight from
    the framework's view of the peer. When the request carries no client
    information the placeholder ``"0.0.0.0"`` is returned.

    ``is_remote`` is accepted only so existing call sites keep working; it
    has no effect on the result.
    """
    del is_remote  # signal: parameter intentionally ignored, kept for ABI stability
    peer = request.client
    if peer:
        return peer.host
    return "0.0.0.0"
+ + Security: bound the in-memory ``_reqs`` / ``_bytes`` dicts so a + high-cardinality IP attack (one request per source) cannot OOM the + server by inflating the dicts indefinitely. The original implementation + never evicted; an attacker with a botnet (or one that spoofed XFF before + Phase 0 closed it) could pump ~50 bytes of memory per unique IP per + minute with no upper bound. + """ REQ_LIMIT = 600 BYTE_LIMIT = 50 * 1024 * 1024 WINDOW_S = 60 + # Total distinct IPs we'll track concurrently. Sized to comfortably + # accommodate a busy real workload (thousands of analyst sessions on + # NAT'd corporate networks share a small set of egress IPs) while + # blocking a runaway-cardinality DoS in single-digit-MB territory. + MAX_TRACKED_IPS = 10_000 def __init__(self) -> None: import threading @@ -270,10 +293,32 @@ def __init__(self) -> None: self._reqs: dict[str, list[float]] = {} self._bytes: dict[str, list[tuple[float, int]]] = {} + def _evict_locked(self, cutoff: float) -> None: + """Sweep stale per-IP entries whose all timestamps fall before cutoff.""" + # Iterate over a snapshot so we can mutate during the loop. + for ip in list(self._reqs.keys()): + recent = [t for t in self._reqs[ip] if t >= cutoff] + if not recent: + self._reqs.pop(ip, None) + self._bytes.pop(ip, None) + else: + # Take this opportunity to also trim the surviving list. + self._reqs[ip] = recent + if len(self._reqs) > self.MAX_TRACKED_IPS: + # Cardinality bomb: drop everything rather than spending CPU + # on quadratic LRU tracking. Limits get a one-minute reset for + # all IPs but the next legitimate burst will re-grow the dict. + self._reqs.clear() + self._bytes.clear() + def check(self, ip: str, content_length: int) -> bool: with self._lock: now = time.time() cutoff = now - self.WINDOW_S + # Cheap pre-check: only sweep when we're past the cap. The sweep + # is O(n) so we don't want to run it on every request. 
def sse_event(payload: dict, pad: int = 256):
    """Generate the frames for a single server-sent event.

    The first frame is ``data: <json>`` terminated by a blank line, where
    ``<json>`` is ``payload`` serialized with ``json.dumps``. When ``pad`` is
    truthy a second frame follows: an SSE comment (leading ``:``) made of
    ``pad`` spaces, intended to push the event through buffering proxies.
    Pass ``pad=0`` to emit the data frame only.
    """
    from json import dumps as _dumps

    serialized = _dumps(payload)
    yield "data: " + serialized + "\n\n"
    if not pad:
        return
    # Comment-only frame; clients ignore it, proxies see extra bytes.
    yield ": " + (" " * pad) + "\n\n"
The fix is to log + the traceback server-side (where operators can read it during triage) + and return only the exception message to the client. Optionally catches ``ValueError`` / ``LookupError`` as 400/404 before the generic fallback. @@ -117,9 +141,14 @@ def wrapper(*args, **kwargs): except LookupError as e: raise HTTPException(status_code=404, detail={"error": str(e)}) except Exception as e: + # logger.exception records the traceback to server logs + # WITHOUT putting it on the wire. Triage requires opening + # the backend log; that's an acceptable cost for the + # security gain. + logger.exception("[query_errors] unhandled exception in %s", fn.__qualname__) raise HTTPException( status_code=status_code, - detail={"error": str(e), "trace": traceback.format_exc()}, + detail={"error": str(e)}, ) return wrapper diff --git a/backend/utils/sql_validator.py b/backend/utils/sql_validator.py new file mode 100644 index 00000000..0c951b1b --- /dev/null +++ b/backend/utils/sql_validator.py @@ -0,0 +1,558 @@ +"""User-supplied SQL validator for DuckDB (security). + +The audit-drafted fix for the three DuckDB file-read findings (set +``enable_external_access=false`` on the connection) was validated against +production DuckDB 1.5.3 and found to break the iceberg_scan view that every +dashboard query relies on (see security_remediation_final_v6.md Appendix A). +The alternatives — ``allowed_directories``, ``disabled_filesystems`` — either +don't enforce or also block S3 reads required for iceberg_scan. + +This module implements Decision B: a statement-type whitelist + a recursive +parse-tree walker that runs ``json_serialize_sql`` on every user-supplied +SQL string before execution. The walker rejects: + + * Statement types other than SELECT / WITH / SHOW. + * Catalog table references that match the dangerous-schema deny list + (``duckdb_*`` / ``pg_*`` table-name prefixes, ``information_schema`` / + ``pg_catalog`` / ``system`` schema names, any non-``main`` catalog). 
+ * Function calls in a fixed deny set (env-var / setting / secret + exfiltration helpers, all ``read_*`` / ``glob`` / ``lsdir`` file-system + helpers, external-DB scanners). + * Multi-statement payloads. + * Inputs larger than 64 KB (DoS guard on the parser itself). + +Every rejection emits a structured audit log line so attack-shaped probing +(``getenv``, ``read_csv_auto``, ``duckdb_secrets``, etc.) shows up as a +rejection-rate spike per session / service. Operators can use the log +output to either tighten the policy or whitelist a legitimate query +pattern. + +The execution-side defense-in-depth — per-connection memory cap, statement +timeout, auto-injected LIMIT — lives in ``apply_user_query_limits`` so it +can be applied to the connection separately (this module never opens a +connection itself). +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from dataclasses import dataclass +from typing import Any + +import duckdb + +logger = logging.getLogger(__name__) +_audit_logger = logging.getLogger("backend.sql_validator.audit") + + +# ── Tunables ──────────────────────────────────────────────────────────────── + +# Reject inputs larger than this before invoking the parser. The DuckDB +# parser itself is a DoS surface on pathological inputs (deeply nested +# subqueries, very long IN lists, etc.), and no legitimate user query +# should approach 64 KB. +MAX_INPUT_BYTES = 64 * 1024 + +# Statement types accepted by the user-query path. ``SELECT_NODE`` covers +# the underlying expression of SELECT and CTE-wrapping WITH statements +# (``json_serialize_sql`` surfaces them via the statement-level +# ``"node":{"type":"SELECT_NODE"}``). ``SHOW`` is allowed because the +# dashboard's debug panel uses it to introspect the live schema. +ALLOWED_STATEMENT_TYPES = frozenset({"SELECT_NODE", "SET_OPERATION_NODE"}) + +# Table-name prefixes that should never appear in a user query. 
The +# ``duckdb_`` family enumerates internal state (``duckdb_secrets``, +# ``duckdb_settings``, ``duckdb_extensions``). The ``pg_`` family is the +# PostgreSQL catalog compatibility surface (``pg_settings``, +# ``pg_authid``). +_BLOCKED_TABLE_PREFIXES = ("duckdb_", "pg_") + +# Schema names that bypass the table-name-prefix check. ``information_schema`` +# is the SQL-standard introspection namespace and would otherwise slip +# through because ``information_schema.tables`` has ``table_name="tables"`` +# (no blocked prefix). +_BLOCKED_SCHEMA_NAMES = frozenset({"information_schema", "pg_catalog", "system"}) + +# The app only uses the default ``main`` catalog. Any cross-catalog +# reference (e.g. ``system.information_schema.tables``) is rejected +# because the only way a user query reaches a non-``main`` catalog is +# via ``ATTACH`` (which is itself blocked at the statement-type level). +ALLOWED_CATALOG = "main" + +# Function denylist organised by intent. Each name is matched +# case-insensitively against the ``function_name`` field in the parse +# tree's ``"class":"FUNCTION"`` nodes. +_BLOCKED_FUNCTIONS = frozenset( + { + # Environment / config exfiltration + "getenv", + "current_setting", + "duckdb_secrets", + "duckdb_settings", + "duckdb_variables", + "duckdb_extensions", + # File system reads — all variants of read_* (CSV / Parquet / JSON / + # text / blob / Avro / Iceberg-from-disk). No s3:// exception: + # user SQL targets the materialized FOS view, never raw read_parquet. 
+ "read_csv", + "read_csv_auto", + "read_parquet", + "read_parquet_metadata", + "read_parquet_schema", + "read_json", + "read_json_auto", + "read_json_objects", + "read_json_objects_auto", + "read_ndjson", + "read_ndjson_auto", + "read_ndjson_objects", + "read_text", + "read_text_auto", + "read_blob", + "read_blob_auto", + "read_avro", + "iceberg_scan", + "iceberg_metadata", + "iceberg_snapshots", + "parquet_metadata", + "parquet_schema", + "parquet_kv_metadata", + # File system discovery + "glob", + "lsdir", + # External DB scanners + "sqlite_scan", + "sqlite_attach", + "postgres_scan", + "postgres_attach", + "postgres_query", + "mysql_scan", + "mysql_attach", + "mysql_query", + } +) + + +# ── Public types ───────────────────────────────────────────────────────────── + + +class SQLValidationError(ValueError): + """Raised by ``validate_user_sql`` when the input fails any check. + + The ``reason`` field is the structured rejection code; ``message`` is + the human-readable explanation. Callers should surface ``message`` to + the API caller but log ``reason`` for attack-detection alerting. + """ + + def __init__(self, reason: str, message: str) -> None: + super().__init__(message) + self.reason = reason + self.message = message + + +@dataclass +class ValidationResult: + """Successful validation result. + + ``parse_tree`` is the raw json_serialize_sql output (kept for callers + that want to inspect / transform). ``elapsed_ms`` is the parse + walk + cost — useful for the perf-budget alert. + """ + + parse_tree: dict + elapsed_ms: float + + +# ── Module entry point ────────────────────────────────────────────────────── + + +def validate_user_sql( + sql: str, + *, + parser_con: duckdb.DuckDBPyConnection, + session_id: str | None = None, + service_id: str | None = None, +) -> ValidationResult: + """Validate a user-supplied SQL string against the Decision B policy. + + On rejection: emits an audit log line AND raises ``SQLValidationError``. 
+ On success: returns a ``ValidationResult`` whose ``parse_tree`` is the + parsed JSON representation (caller can ignore it; the parse itself is + the side effect). + + ``parser_con`` is a DuckDB connection used to call ``json_serialize_sql``. + Pass the same read-only connection the query will execute against — + parsing is cheap (~ms) and uses no state. + + The ``session_id`` and ``service_id`` are stamped into the audit log + for attack-pattern detection. Pass ``None`` for system-internal calls + that bypass the user-query path (those should never invoke this + function in the first place). + """ + if not isinstance(sql, str): + _reject(sql, "input_type", "SQL must be a string", session_id, service_id) + + # Size pre-check (cheap; bounds parser cost). + encoded = sql.encode("utf-8", errors="replace") + if len(encoded) > MAX_INPUT_BYTES: + _reject( + sql, + "input_too_large", + f"query exceeds {MAX_INPUT_BYTES} byte limit ({len(encoded)} bytes)", + session_id, + service_id, + ) + + t0 = time.monotonic() + + # Parse via json_serialize_sql. Any parser exception OR a returned + # ``{"error": true, ...}`` envelope counts as a rejection (fail + # closed). This forces the parser to see ALL whitespace and bracket + # balance issues at validation time, not at execution time when a + # malformed payload could land halfway through a statement. 
+ try: + row = parser_con.execute("SELECT json_serialize_sql(?)", [sql]).fetchone() + except Exception as exc: + _reject( + sql, + "parse_error", + f"SQL parse failed: {exc}", + session_id, + service_id, + ) + + if not row or row[0] is None: + _reject(sql, "parse_empty", "SQL parse returned no output", session_id, service_id) + + try: + parsed = json.loads(row[0]) + except json.JSONDecodeError as exc: + _reject(sql, "parse_invalid_json", f"json_serialize_sql output invalid: {exc}", session_id, service_id) + + if not isinstance(parsed, dict): + _reject(sql, "parse_unexpected_shape", "expected JSON object", session_id, service_id) + + # The parser surfaces malformed SQL as ``{"error": true, ...}`` + # rather than raising — fail closed on that branch. + if parsed.get("error") is True: + err_type = parsed.get("error_type", "") + err_msg = parsed.get("error_message", "") + _reject( + sql, + f"parse_error:{err_type}", + f"SQL parse error: {err_msg}", + session_id, + service_id, + ) + + statements = parsed.get("statements") + if not isinstance(statements, list) or len(statements) == 0: + _reject(sql, "no_statements", "no parseable statements", session_id, service_id) + if len(statements) > 1: + _reject( + sql, + "multi_statement", + f"only one statement allowed, got {len(statements)}", + session_id, + service_id, + ) + + # Statement-type whitelist. SELECT and CTE-wrapping WITH both + # surface as a ``SELECT_NODE`` inside ``node``. SET_OPERATION_NODE + # is UNION/INTERSECT/EXCEPT — also legitimate. + stmt = statements[0] + node = stmt.get("node") if isinstance(stmt, dict) else None + node_type = node.get("type") if isinstance(node, dict) else None + if node_type not in ALLOWED_STATEMENT_TYPES: + _reject( + sql, + f"statement_type:{node_type or '?'}", + f"only SELECT / WITH / UNION statements allowed (got {node_type})", + session_id, + service_id, + ) + + # Recursive walk: catalog blocklist (table_name / schema_name / + # catalog_name) + function denylist. 
# Function-name prefixes rejected wholesale. ``read_`` implements the module
# policy of blocking every ``read_*`` file reader (including ones added by
# extensions that the exact-name set cannot enumerate); ``duckdb_`` mirrors the
# ``duckdb_`` table-name prefix rule for the table-function spelling
# (``FROM duckdb_tables()``).
_BLOCKED_FUNCTION_PREFIXES = ("read_", "duckdb_")

# Functions that reach the same capabilities under a different name.
# NOTE(review): names taken from DuckDB's published function docs — confirm
# against the deployed DuckDB version and extension set.
_BLOCKED_FUNCTION_ALIASES = frozenset(
    {
        # Alias of read_parquet.
        "parquet_scan",
        # Opens and samples a CSV file.
        "sniff_csv",
        # Execute SQL / table names supplied as strings, which this walker
        # never gets to inspect.
        "query",
        "query_table",
        "json_execute_serialized_sql",
    }
)


def _is_blocked_function(name: str) -> bool:
    """Return True when ``name`` (case-insensitive) is a denied function."""
    lowered = name.lower()
    return (
        lowered in _BLOCKED_FUNCTIONS
        or lowered in _BLOCKED_FUNCTION_ALIASES
        or lowered.startswith(_BLOCKED_FUNCTION_PREFIXES)
    )


def _walk_and_validate(
    node: Any,
    original_sql: str,
    session_id: str | None,
    service_id: str | None,
) -> None:
    """Recursively visit every dict/list in the parse tree.

    For each dict: table references (``type == "BASE_TABLE"`` or any dict
    carrying a ``table_name`` key) go through ``_check_table_reference``;
    nodes with ``class == "FUNCTION"`` have their ``function_name`` checked
    against the exact-name denylist, the alias set and the blocked prefixes.
    The first violation is reported through ``_reject``, which raises.
    Scalars are ignored.
    """
    if isinstance(node, dict):
        # BASE_TABLE: table reference. Check name + schema + catalog.
        if node.get("type") == "BASE_TABLE" or "table_name" in node:
            _check_table_reference(node, original_sql, session_id, service_id)

        # FUNCTION: function call. Check function_name.
        # DuckDB also tags table functions inside TABLE_FUNCTION wrappers
        # but the inner node is still {"class":"FUNCTION", "function_name":...}
        if node.get("class") == "FUNCTION":
            fname = node.get("function_name")
            if isinstance(fname, str) and _is_blocked_function(fname):
                _reject(
                    original_sql,
                    f"function_denylist:{fname.lower()}",
                    f"function '{fname}' is not allowed in user queries",
                    session_id,
                    service_id,
                )

        for value in node.values():
            _walk_and_validate(value, original_sql, session_id, service_id)
    elif isinstance(node, list):
        for item in node:
            _walk_and_validate(item, original_sql, session_id, service_id)


def _check_table_reference(
    node: dict,
    original_sql: str,
    session_id: str | None,
    service_id: str | None,
) -> None:
    """Validate a BASE_TABLE node's name / schema / catalog fields.

    Each field is coerced to a string before inspection so an unexpected
    non-string value is checked rather than crashing on ``.strip()``.
    Violations are reported through ``_reject``, which raises.
    """
    table_name = str(node.get("table_name") or "").strip()
    schema_name = str(node.get("schema_name") or "").strip().lower()
    catalog_name = str(node.get("catalog_name") or "").strip().lower()

    # DuckDB replacement scans: a path-shaped string in a FROM clause
    # (``SELECT * FROM '/etc/passwd'`` or ``SELECT * FROM 's3://bucket/key'``)
    # is parsed as a BASE_TABLE with the path as table_name, then resolved to
    # an implicit read_* function call at execution time — bypassing the
    # function denylist entirely. Reject any table name containing path
    # separators or dotted segments. Legitimate identifiers never need
    # them (schema-qualified names land in schema_name / catalog_name).
    if "/" in table_name or "\\" in table_name or "." in table_name:
        _reject(
            original_sql,
            "catalog_blocklist:table_name_path",
            f"table name '{table_name}' contains path-like characters",
            session_id,
            service_id,
        )

    # Reject blocked table-name prefixes (catches duckdb_secrets etc.
    # referenced without a schema qualifier).
    name_lower = table_name.lower()
    for prefix in _BLOCKED_TABLE_PREFIXES:
        if name_lower.startswith(prefix):
            _reject(
                original_sql,
                f"catalog_blocklist:table_name_prefix:{prefix}",
                f"table '{table_name}' uses blocked prefix '{prefix}'",
                session_id,
                service_id,
            )

    # Reject the introspection-schema bypass. information_schema.tables
    # has table_name="tables" which would otherwise pass the prefix check.
    if schema_name in _BLOCKED_SCHEMA_NAMES:
        _reject(
            original_sql,
            f"catalog_blocklist:schema_name:{schema_name}",
            f"schema '{schema_name}' is not allowed in user queries",
            session_id,
            service_id,
        )

    # Reject any non-default catalog. The app uses only 'main'.
    if catalog_name and catalog_name != ALLOWED_CATALOG:
        _reject(
            original_sql,
            f"catalog_blocklist:catalog_name:{catalog_name}",
            f"catalog '{catalog_name}' is not allowed (only '{ALLOWED_CATALOG}')",
            session_id,
            service_id,
        )
def apply_user_query_limits(
    con: duckdb.DuckDBPyConnection,
    *,
    memory_limit: str = "2GB",
    timeout_seconds: int = 30,
) -> None:
    """Apply per-connection limits before executing a user-supplied query.

    Issues ``SET memory_limit`` and ``SET statement_timeout`` on ``con``.
    Both are best-effort: a ``duckdb.Error`` from either statement is logged
    and swallowed, so a failure to apply one limit never blocks the query.

    These are independent of the parse-tree validation — they bound the
    blast radius of a query that passed the validator. Set on the
    user-query connection only.

    ``memory_limit`` is embedded as a single-quoted literal and is escaped
    with ``escape_sql_literal`` so a quote in the value cannot terminate the
    literal.
    """
    try:
        con.execute(f"SET memory_limit = '{escape_sql_literal(str(memory_limit))}'")
    except duckdb.Error as exc:
        logger.warning("[sql_validator] failed to apply memory_limit=%s: %s", memory_limit, exc)
    # Value is sent in milliseconds.
    # NOTE(review): confirm the deployed DuckDB build exposes a
    # ``statement_timeout`` setting — when it does not, the failure is only
    # logged at DEBUG and no timeout is enforced.
    try:
        con.execute(f"SET statement_timeout = '{timeout_seconds * 1000}ms'")
    except duckdb.Error as exc:
        logger.debug("[sql_validator] statement_timeout not supported on this DuckDB build: %s", exc)


def escape_sql_literal(value: str) -> str:
    """Escape a string for safe inclusion inside a DuckDB single-quoted SQL literal.

    Every single quote (U+0027) is doubled, which is the SQL-standard way to
    embed a quote in a single-quoted literal (``'O''Brien'``). No other
    character is altered: backslashes, NUL bytes and non-ASCII text pass
    through unchanged. The substitution runs on the Python ``str``, so it
    acts on code points, never on raw UTF-8 bytes.

    Returns the escaped value WITHOUT the surrounding quotes — the caller is
    expected to wrap it, e.g. ``f"'{escape_sql_literal(x)}'"``.

    Raises ``TypeError`` when ``value`` is not a ``str``.
    """
    if not isinstance(value, str):
        raise TypeError(f"escape_sql_literal expected str, got {type(value).__name__}")
    return value.replace("'", "''")


def _root_has_limit(statement: Any) -> bool:
    """Return True when a serialized statement's root query node carries a LIMIT modifier."""
    root = statement.get("node") if isinstance(statement, dict) else None
    modifiers = root.get("modifiers") if isinstance(root, dict) else None
    if not isinstance(modifiers, list):
        return False
    for modifier in modifiers:
        kind = modifier.get("type") if isinstance(modifier, dict) else None
        # Covers LIMIT_MODIFIER and LIMIT_PERCENT_MODIFIER.
        if isinstance(kind, str) and kind.startswith("LIMIT"):
            return True
    return False


def has_limit_clause(sql: str, *, parser_con: duckdb.DuckDBPyConnection) -> bool:
    """Return True iff ``sql`` parses as a statement whose OUTERMOST query
    carries an explicit LIMIT modifier.

    The check inspects the ``modifiers`` array on each statement's root
    node in DuckDB's ``json_serialize_sql`` output, so the word ``LIMIT``
    inside string literals or comments is never counted. A LIMIT that only
    appears in a subquery, CTE or set-operation branch is deliberately NOT
    counted either: it does not bound the outer result, and counting it
    would let a query opt out of the default-limit wrapper by adding a
    trivially limited subquery.

    Any parse failure returns True (fail-safe: treat as "limit present" so
    the caller skips wrapping a malformed statement that would otherwise
    re-raise inside the wrapper). A parse result with no statements returns
    False.
    """
    try:
        row = parser_con.execute("SELECT json_serialize_sql(?)", [sql]).fetchone()
    except Exception:
        return True
    if not row or row[0] is None:
        return True
    try:
        parsed = json.loads(row[0])
    except Exception:
        return True
    if not isinstance(parsed, dict) or parsed.get("error"):
        return True

    statements = parsed.get("statements")
    if not isinstance(statements, list) or not statements:
        return False
    # Every statement must be bounded at its root for the input to count.
    return all(_root_has_limit(statement) for statement in statements)
+ """ + import re + + if re.search(r"\bLIMIT\b", sql, flags=re.IGNORECASE): + return sql + inner = sql.rstrip().rstrip(";") + return f"SELECT * FROM ({inner}) AS _user_q LIMIT {default_limit}" diff --git a/backend/utils/telemetry_proxy.py b/backend/utils/telemetry_proxy.py index b975a2a2..16e4dcd5 100644 --- a/backend/utils/telemetry_proxy.py +++ b/backend/utils/telemetry_proxy.py @@ -611,11 +611,17 @@ def _run_server() -> None: _SESSION = _LOOP.run_until_complete(_create_session()) - # client_max_size=0 disables aiohttp's 1MB default request-body cap. - # Multipart upload parts can be up to 5GB per S3 spec, and ingest - # commits routinely push multi-MB parquet writes through this proxy. - # Rejecting them with 413 would be a regression vs. direct boto3. - app = web.Application(client_max_size=0) + # Cap request bodies at 4GB. aiohttp's default 1MB cap is too small + # for our use case (Iceberg commit multiparts), but the previous + # ``client_max_size=0`` (unlimited) made the proxy a credible OOM + # vector: ``await request.read()`` buffers the whole body, so two + # concurrent multi-GB PUTs through the proxy could blow past the + # 12GB container limit by themselves. 4GB covers any realistic + # single multipart upload part (S3 individual part max is 5GB but + # we never write parts that big) while bounding worst-case buffer + # bloat. A 413 above 4GB is the right failure mode — callers can + # split into smaller parts. + app = web.Application(client_max_size=4 * 1024 * 1024 * 1024) app.router.add_get("/healthz", handle_healthz) app.router.add_route("*", "/{path_info:.*}", handle_request) diff --git a/backend/utils/terraform_gen.py b/backend/utils/terraform_gen.py index d6d5cc7d..85f4abbe 100644 --- a/backend/utils/terraform_gen.py +++ b/backend/utils/terraform_gen.py @@ -38,12 +38,23 @@ def generate_terraform(cfg: dict[str, Any], fos_access_key: str, fos_secret_key: # Escape every user-supplied string used inside HCL string literals. 
# The raw values are kept around for non-HCL contexts (e.g. comments, # path construction, derived domain names) where they're safe. - service_id = cfg.get("logging_service_id", "YOUR_SERVICE_ID") + # 023: service_id ends up inside HCL comments verbatim. A newline or + # carriage return would terminate the comment early and let attacker- + # supplied text inject arbitrary HCL. Strip both before any use. + service_id = str(cfg.get("logging_service_id", "YOUR_SERVICE_ID")).replace("\r", "").replace("\n", "") endpoint_name = cfg.get("endpoint_name", "fastly_log_analysis") region = cfg.get("fos_region", "us-east-1") bucket = cfg.get("fos_bucket_name", "your-bucket-name") prefix = cfg.get("fos_prefix", "").strip("/") - period = cfg.get("log_period", 3600) + # 022: log_period flows into the HCL ``period = {period}`` numeric + # literal. An attacker who sets ``log_period = "1; resource ..."`` + # would otherwise break out of the literal and inject HCL. Cast to + # int (with a safe fallback) so the rendered value is always a + # numeric token regardless of what was on the wire. + try: + period = int(cfg.get("log_period", 3600)) + except (TypeError, ValueError): + period = 3600 cdn_service_name = cfg.get("cdn_service_name", "Fastly Log Analysis CDN Proxy") cdn_prefix = cfg.get("cdn_prefix", bucket) cdn_domain = f"{cdn_prefix}.global.ssl.fastly.net" diff --git a/backend/utils/tunnel.py b/backend/utils/tunnel.py index 2eb4dd99..1ee51cc3 100644 --- a/backend/utils/tunnel.py +++ b/backend/utils/tunnel.py @@ -465,6 +465,22 @@ def validate_session(self, session_id: str | None) -> AnalystSession | None: if invite.get("expires_at") and invite["expires_at"] < share_db.iso_z_now(): self._evict(session, reason="invite expired", event="SESSION_TIMEOUT") return None + + # Security: re-sync the mutable permission fields from the + # current invite state. 
Without this, an admin who tightens an + # analyst's pii_policy / query_window_hours / query_start_time / + # query_end_time / service_ids cannot enforce those tightened + # bounds until the analyst's session naturally times out. Copy + # the latest invite-side values onto the cached AnalystSession + # before returning, so every downstream request sees fresh + # permissions on the next call. + session.pii_policy = invite.get("pii_policy") or session.pii_policy + session.query_window_hours = invite.get("query_window_hours") + session.query_start_time = invite.get("query_start_time") + session.query_end_time = invite.get("query_end_time") + fresh_service_ids = invite.get("service_ids") + if fresh_service_ids is not None: + session.service_ids = list(fresh_service_ids) return session def boot_session(self, session_id: str, *, reason: str = "admin boot") -> bool: @@ -605,6 +621,17 @@ def start_sharing( # Spawn SSH. We do NOT use any user keys — explicitly pass our own. key_path = _ensure_share_key() + # Security: pin the localhost.run host key. Without + # this, the previous StrictHostKeyChecking=no + + # UserKnownHostsFile=/dev/null combo trusts whatever key + # the server presents on first connection — a MitM on the + # outbound path can hijack the tunnel and decrypt analyst + # traffic. _ensure_known_hosts() materializes the pinned + # known_hosts from configs/ssh_known_hosts; if that file + # is missing or empty we REFUSE to start the tunnel + # (fail-safe — better to deny sharing than to fall back + # to TOFU). + known_hosts_path = _ensure_known_hosts() cmd = [ ssh_bin, "-i", @@ -612,9 +639,13 @@ def start_sharing( "-o", "IdentitiesOnly=yes", "-o", - "StrictHostKeyChecking=no", + "StrictHostKeyChecking=yes", + "-o", + f"UserKnownHostsFile={known_hosts_path}", "-o", - "UserKnownHostsFile=/dev/null", + # The pinned known_hosts is the only source of trust; + # never write new entries from the system files. 
+ "GlobalKnownHostsFile=/dev/null", "-o", "ServerAliveInterval=10", "-o", @@ -860,6 +891,67 @@ def _port_in_use(host: str, port: int) -> bool: return False +def _ensure_known_hosts() -> str: + """Locate the pinned ``configs/ssh_known_hosts`` file used to verify + the localhost.run SSH host key. + + Security: this is the *only* trust anchor for the outbound SSH + tunnel. If the file is missing or empty we refuse to start the + tunnel (fail-safe), so a deployment that lost the volume mount can't + silently fall back to TOFU host-key acceptance. + + Resolution order: + 1. ``$SSH_KNOWN_HOSTS_FILE`` — explicit override for tests / unusual + deployments. + 2. ``${CONFIGS_DIR}/ssh_known_hosts`` where CONFIGS_DIR is the + backend's resolved config dir (defaults to ``/app/configs`` in + the container, which is bind-mounted from + ``/mnt/app-data/configs`` per docker-compose.prod.yml). + 3. ``/configs/ssh_known_hosts`` for local development. + + Returns the absolute path to the file. + Raises RuntimeError if the file is missing, unreadable, or empty. + """ + override = os.environ.get("SSH_KNOWN_HOSTS_FILE", "").strip() + candidates: list[str] = [] + if override: + candidates.append(override) + # Prefer the in-container path (production); fall back to the repo + # path (dev). Both should contain the same pinned content. 
+ from backend import config as svcconfig + + try: + candidates.append(str(svcconfig.CONFIGS_DIR / "ssh_known_hosts")) + except Exception: + pass + candidates.append(os.path.join(os.path.dirname(__file__), "..", "..", "configs", "ssh_known_hosts")) + + for path in candidates: + if not path: + continue + try: + with open(path, "rb") as f: + data = f.read() + except OSError: + continue + # File must contain at least one non-comment, non-blank line + # (i.e., a real key entry) — an empty file would otherwise + # silently disable host-key checking with StrictHostKeyChecking=yes + # being functionally TOFU because OpenSSH treats an empty + # known_hosts as "no known keys". + for line in data.decode("utf-8", errors="replace").splitlines(): + stripped = line.strip() + if stripped and not stripped.startswith("#"): + return os.path.abspath(path) + + raise RuntimeError( + "Pinned SSH known_hosts file is missing or empty (security). " + "Looked at: " + ", ".join(candidates) + ". Refusing to start the tunnel — this would otherwise fall back to " + "TOFU host-key trust. Restore configs/ssh_known_hosts or set " + "SSH_KNOWN_HOSTS_FILE to a valid path." + ) + + def _ensure_share_key() -> str: """Generate ed25519 share key at ``data/system/share_key`` if missing. diff --git a/backend/utils/vcl_validator.py b/backend/utils/vcl_validator.py new file mode 100644 index 00000000..59fc0aac --- /dev/null +++ b/backend/utils/vcl_validator.py @@ -0,0 +1,315 @@ +"""VCL static analysis + user-input validation for scoring snippets. + +Anything that ends up interpolated into a VCL snippet that ships to +Fastly is funneled through here so a malformed input can't break the +service version's compile step (which would silently leave the prior +version active and the admin staring at "nothing changed" with no +error). + +The validator runs in three layers from cheap → expensive: + + 1. 
``validate_url_exclusion_regex``: cheap input policing on a regex + the operator typed — length cap, no quotes (would break the VCL + string literal), no control chars, must compile under Python's + ``re`` engine. Catches the majority of bad input in microseconds. + + 2. ``lint_vcl``: runs the local ``falco`` binary (Fastly VCL static + analyzer, github.com/ysugimoto/falco) over the full assembled + snippet. Catches structural issues — unmatched braces, wrong + argument types to built-ins, etc. Falco is optional: when the + binary isn't on PATH (some dev environments don't have it), we + log a WARNING and skip the static analysis. Production images + install it via the backend Dockerfile so the path is exercised. + + 3. (implicit, runs at deploy time) Fastly's own VCL compiler when + ``activate_version`` runs against the cloned version. Anything + that slipped past layers 1+2 fails here with a 422 from the + Fastly API and the activate step rolls back. +""" + +from __future__ import annotations + +import logging +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +logger = logging.getLogger(__name__) + + +# ── Input validation ──────────────────────────────────────────────────────── + +MAX_REGEX_BYTES = 2048 +_DISALLOWED_CHARS_RE = re.compile(r"[\x00-\x08\x0a-\x1f\x7f\"]") + + +class RegexValidationError(ValueError): + """Raised when an operator-supplied URL exclusion regex fails policy. + + ``reason`` is a short machine-readable code so the API caller can + map it to a UI hint; ``message`` is the human-readable explanation. + """ + + def __init__(self, reason: str, message: str) -> None: + super().__init__(message) + self.reason = reason + self.message = message + + +def validate_url_exclusion_regex(value: str) -> str: + """Cheap input policing on a user-typed regex before VCL interpolation. + + Returns the cleaned-up value (stripped of trailing whitespace) on + success. 
Raises ``RegexValidationError`` on any policy violation. + + Empty / whitespace-only input is valid and signals "fall back to the + default" — the caller is expected to substitute the default regex + when this returns "". + """ + if not isinstance(value, str): + raise RegexValidationError("type", f"regex must be a string, got {type(value).__name__}") + cleaned = value.strip() + if not cleaned: + return "" # "" → caller falls back to default + if len(cleaned.encode("utf-8")) > MAX_REGEX_BYTES: + raise RegexValidationError( + "too_long", + f"regex exceeds {MAX_REGEX_BYTES}-byte limit (got {len(cleaned.encode('utf-8'))})", + ) + bad = _DISALLOWED_CHARS_RE.search(cleaned) + if bad: + ch = bad.group(0) + # Don't echo the byte verbatim — could be a control char. + raise RegexValidationError( + "disallowed_char", + f"regex contains disallowed character (codepoint U+{ord(ch):04X}): " + "double-quotes and control characters are not permitted because they " + "would break the surrounding VCL string literal", + ) + try: + re.compile(cleaned) + except re.error as exc: + raise RegexValidationError( + "invalid_regex", f"regex failed to compile: {exc.msg} at position {exc.pos}" + ) from exc + return cleaned + + +# ── Falco static analysis ─────────────────────────────────────────────────── + + +@dataclass +class LintResult: + """Outcome of running falco on a VCL snippet.""" + + ok: bool + errors: list[str] + warnings: list[str] + # True if falco couldn't run (binary missing). Caller decides whether + # that's a hard fail; production should treat skipped == fail. + skipped: bool = False + skipped_reason: str = "" + + +def _falco_binary() -> str | None: + """Return the falco binary path, or None if unavailable.""" + return shutil.which("falco") + + +def lint_vcl( + snippet: str, + *, + snippet_name: str = "scoring_snippet", + wrap_subroutine: str | None = "vcl_recv", +) -> LintResult: + """Run ``falco lint`` over a VCL snippet body. 
+ + Falco's parser expects a complete VCL file — snippet bodies on their + own (a sequence of statements, not wrapped in a subroutine) are + syntactically invalid as a standalone file even though Fastly accepts + them as snippet content. To match Fastly's behaviour we wrap the + snippet in ``sub { ... }`` plus a minimal backend + declaration before linting; that's what Fastly does internally when + it inlines snippets into the main VCL. + + Pass ``wrap_subroutine=None`` for content that's already a full file + (rarely the case for our snippets). The default ``vcl_recv`` matches + every snippet generator in ``session_scoring_vcl.py``. + + Falco needs a file path, not stdin — write to a tempfile and invoke + ``falco lint`` against it. ``snippet_name`` shows up in error + messages so multi-snippet pipelines can identify which body the + error came from. + """ + falco_bin = _falco_binary() + if falco_bin is None: + logger.warning( + "[vcl_validator] falco binary not on PATH; static analysis skipped for %s " + "(production should install falco via the backend Dockerfile)", + snippet_name, + ) + return LintResult(ok=True, errors=[], warnings=[], skipped=True, skipped_reason="falco binary not found") + + # Compose a syntactically-complete VCL file by wrapping the snippet + # in the same subroutine Fastly inlines it into. We pre-declare the + # backends and the ``var.fastly_req_do_shield`` magic variable that + # the scoring snippets reference so falco's symbol resolver doesn't + # flag them as undefined (Fastly's main VCL declares both — falco's + # standalone lint mode doesn't know that without seeing them). + # + # Two extra bits of boilerplate to keep falco -v output focused on + # the OPERATOR'S snippet rather than wrapper noise: + # 1. ``#FASTLY `` macro inside the sub — without it, falco + # emits a "missing Fastly boilerplate comment" warning on every + # lint run regardless of snippet content. + # 2. 
Sentinel-guarded "uses" of the declared backends + variable + # AFTER the snippet body — this dead branch (`X-Lint-Sentinel == + # "0"` never matches in practice) satisfies falco's "unused/ + # declaration" + "unused/variable" rules without affecting any + # real request flow. Without these, every lint surfaces 3 + # pre-baked warnings that drown out anything the operator + # actually changed. + if wrap_subroutine: + stage = wrap_subroutine.removeprefix("vcl_") if wrap_subroutine.startswith("vcl_") else wrap_subroutine + full_vcl = ( + "backend F_origin_0 {\n" + ' .host = "example.com";\n' + ' .port = "80";\n' + "}\n" + "backend F_session_scorer {\n" + ' .host = "scorer.edgecompute.app";\n' + ' .port = "443";\n' + "}\n\n" + f"sub {wrap_subroutine} {{\n" + f" #FASTLY {stage}\n" + " declare local var.fastly_req_do_shield BOOL;\n" + f"{snippet}\n" + ' if (req.http.X-Lint-Sentinel == "lint-only-never-fires") {\n' + " set req.backend = F_origin_0;\n" + " set req.backend = F_session_scorer;\n" + " set var.fastly_req_do_shield = false;\n" + " }\n" + "}\n" + ) + else: + full_vcl = snippet + + with tempfile.NamedTemporaryFile(mode="w", suffix=".vcl", delete=False, encoding="utf-8") as tmp: + tmp.write(full_vcl) + tmp_path = Path(tmp.name) + + try: + # ``-v`` emits per-warning [WARNING] / [INFO] lines (not just the + # rolled-up "N warnings" summary). Without it, the parser below + # sees zero diagnostic lines AND zero errors and reports the + # snippet as clean — masking real warnings the operator should + # see (catalog/regex/etc.). The wrapper above is engineered to + # lint cleanly on its own, so any warning that surfaces here is + # from the operator's snippet body. 
+ proc = subprocess.run( + [falco_bin, "-v", "lint", str(tmp_path)], + capture_output=True, + text=True, + timeout=10, + check=False, + ) + except subprocess.TimeoutExpired: + tmp_path.unlink(missing_ok=True) + return LintResult( + ok=False, + errors=[f"falco lint timed out after 10s for snippet {snippet_name!r}"], + warnings=[], + ) + finally: + tmp_path.unlink(missing_ok=True) + + # Falco exits non-zero when there are ANY diagnostics (errors or + # warnings), so the exit code alone isn't a reliable + # "did the snippet pass?" signal. The authoritative source is the + # summary line falco emits at the end: + # "🔥 N errors, ❗ M warnings, 🔈 K recommendations." + # We parse the N to decide pass/fail; lines tagged [ERROR] are + # surfaced as errors, [WARNING] / [INFO] go in warnings. + out = (proc.stdout or "").strip() + err = (proc.stderr or "").strip() + combined = "\n".join(filter(None, [out, err])) + + errors: list[str] = [] + warnings: list[str] = [] + + # Pre-parse: find the summary line if present. + summary_re = re.compile(r"(\d+)\s+errors?\s*,\s*(\d+)\s+warnings?") + summary_match = None + for line in combined.splitlines(): + if "errors" in line and "warnings" in line and summary_re.search(line): + summary_match = summary_re.search(line) + # Don't classify the summary line itself as an error/warning. + continue + stripped = line.strip() + if not stripped: + continue + if "[ERROR]" in stripped or "🔥 [ERROR]" in stripped: + errors.append(stripped) + elif "[WARNING]" in stripped or "❗ [WARNING]" in stripped: + warnings.append(stripped) + elif "[INFO]" in stripped or "🔈 [INFO]" in stripped: + warnings.append(stripped) + + summary_errors = int(summary_match.group(1)) if summary_match else None + ok = summary_errors == 0 if summary_errors is not None else (proc.returncode == 0 and not errors) + + # If falco reported a non-zero error count but no parseable + # [ERROR] line, surface a generic error so the operator isn't + # left guessing. 
+ if summary_errors and summary_errors > 0 and not errors: + errors.append(f"falco reported {summary_errors} error(s) but no parseable diagnostics; stdout={out[:200]!r}") + + return LintResult(ok=ok, errors=errors, warnings=warnings) + + +# ── Convenience: validate a recv-snippet exclusion regex end-to-end ───────── + + +def validate_recv_exclusion_regex_with_lint( + user_regex: str, + *, + build_full_snippet: callable, + require_falco: bool = True, +) -> tuple[str, LintResult]: + """One-call validation: input policy → assemble snippet → falco lint. + + ``user_regex`` is what the operator typed. ``build_full_snippet`` is + a callable that takes the cleaned regex string and returns the + fully-assembled recv-snippet VCL — we don't know the surrounding + context (logging service ID, request secret, etc.) here, so the + caller closes over those. + + ``require_falco``: when True (default), a missing falco binary + raises ``RegexValidationError``. Production must keep this True so + a broken Dockerfile doesn't silently downgrade the security + posture. Tests can pass False to exercise the regex-only path. + + Returns ``(cleaned_regex, lint_result)`` on success. The cleaned + regex is what gets persisted to svc_cfg; the lint_result.warnings + is surfaced to the operator alongside the success message. + + Raises ``RegexValidationError`` on any layer's failure. + """ + cleaned = validate_url_exclusion_regex(user_regex) + # build_full_snippet must accept the cleaned regex (which may be + # "" — meaning "use default"). The caller's closure decides how to + # interpret an empty value. + full_snippet = build_full_snippet(cleaned) + lint = lint_vcl(full_snippet, snippet_name="scoring_recv") + if lint.skipped and require_falco: + raise RegexValidationError( + "falco_unavailable", + f"VCL static-analysis tool unavailable: {lint.skipped_reason}. 
Refusing to ship unchecked VCL.", + ) + if not lint.ok: + joined = "\n".join(lint.errors[:5]) + raise RegexValidationError("vcl_lint", f"falco lint failed:\n{joined}") + return cleaned, lint diff --git a/compute/scorer/Cargo.lock b/compute/scorer/Cargo.lock new file mode 100644 index 00000000..ee0c084a --- /dev/null +++ b/compute/scorer/Cargo.lock @@ -0,0 +1,919 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "aes-gcm" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1" +dependencies = [ + "aead", + "aes", + "cipher", + "ctr", + "ghash", + "subtle", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "rand_core", + "typenum", +] + +[[package]] +name = "ctr" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" +dependencies = [ + "cipher", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "displaydoc" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "elsa" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abf33c656a7256451ebb7d0082c5a471820c31269e49d807c538c252352186e" +dependencies = [ + "indexmap", + "stable_deref_trait", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "fastly" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f767502306f09f6dcb76302d09cd2ea8542e228d5f155166f0c2da925e16c61" +dependencies = [ + "anyhow", + "bytes", + "downcast-rs", + "elsa", + "fastly-macros", + "fastly-shared", + "fastly-sys", + "http", + "itertools", + "lazy_static", + "mime", + "serde", + "serde_json", + "serde_repr", + "serde_urlencoded", + "sha2", + "smallvec", + "thiserror", + "time", + "url", +] + +[[package]] +name = "fastly-macros" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51ae08eeeb5ed0c1a8b454fc89dca0e316e13b7889e81fc9a435503c1e84a2d7" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "fastly-shared" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d64ed1bba12ca45d1a2a80c2c55d903297adb3eeb4edc9d327c1d51ee709d404" +dependencies = [ + "bitflags 1.3.2", + "http", +] + +[[package]] +name = "fastly-sys" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1b82ebd99583740a074d8962ca75d7d17065b185a94e4919c3a3f2193268b6" +dependencies = [ + "bitflags 1.3.2", + "fastly-shared", + "wasip2", + "wit-bindgen 0.46.0", +] + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "ghash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1" +dependencies = [ + "opaque-debug", + "polyval", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "memchr" +version = "2.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "num-conv" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" + +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "polyval" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" +dependencies = [ + "cfg-if", + "cpufeatures", + "opaque-debug", + "universal-hash", +] + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name 
= "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "session-scorer" +version = "0.1.0" +dependencies = [ + "aes-gcm", + "base64", + "fastly", + "getrandom", + "hex", + "serde", + "serde_json", +] + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer", + "cfg-if", + "cpufeatures", + "digest", + "opaque-debug", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + 
"unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +dependencies = [ + "bitflags 2.12.1", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" +dependencies = [ + "bitflags 2.12.1", +] + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" 
+version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/compute/scorer/Cargo.toml b/compute/scorer/Cargo.toml new file mode 100644 index 00000000..fbf0e25a --- /dev/null +++ b/compute/scorer/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "session-scorer" +version = "0.1.0" +edition = "2021" +authors = ["drew.michael"] +description = "Edge session-anomaly scoring service for Fastly Compute" +license = "Apache-2.0" +publish = false + +[[bin]] +name = "session-scorer" +path = "src/main.rs" + +[dependencies] +# Compute SDK — Wasm runtime + req/resp helpers. +fastly = "0.11" +# Authenticated encryption. aes-gcm = pure-Rust, audited; ships in Wasm cleanly. +aes-gcm = "0.10" +# Matrix.json deserialization. +serde = { version = "1", features = ["derive"] } +serde_json = "1" +# Cookie wire format. +base64 = "0.22" +# For HashMap with deterministic iteration (used for scoring). +# (Pulled transitively by serde_json; declared explicitly for clarity.) +# Hex-encode the raw session id bytes for the X-Edge-Sid response header. +# Promoted from dev-deps once main.rs needed it for live emission. +hex = "0.4" +# Real entropy for SID generation. 
Without this the placeholder loop in +# getrandom_fallback produces the same 6-byte sid for every cookieless +# visitor (07 26 45 64 83 a2), which collapses every uncookied request +# to one "session" in the labels table. getrandom 0.2 auto-selects the +# WASI random_get import on wasm32-wasip1 — no extra features needed. +getrandom = "0.2" + +[dev-dependencies] +# Cross-language fixture byte-exact tests use the same dependency that +# the runtime uses, so listing it again here is redundant; kept for +# explicit readability if dev-only test helpers ever need it. + +[profile.release] +# Optimize for size + execution speed. Wasm runtimes pay for both. +opt-level = "s" +lto = true +codegen-units = 1 +strip = "symbols" +panic = "abort" + +[profile.dev] +# Faster local builds — we test logic at native speed and only build Wasm +# for the actual deploy. +opt-level = 0 diff --git a/compute/scorer/fastly.toml b/compute/scorer/fastly.toml new file mode 100644 index 00000000..197f6834 --- /dev/null +++ b/compute/scorer/fastly.toml @@ -0,0 +1,26 @@ +# Fastly Compute package manifest. +# https://www.fastly.com/documentation/reference/compute/fastly-toml/ + +manifest_version = 3 +name = "session-scorer" +description = "Edge session-anomaly scoring service. Decrypts an inbound X-Session-State cookie, scores the session via Layer 1 (universal behavioral) + Layer 2 (route transition), and returns X-Edge-* headers for VCL policy consumption." +authors = ["drew.michael"] +language = "rust" + +# DO NOT set service_id here. The session-scorer Wasm is shared across all +# per-customer scoring services (each named "Session Scoring Service for +# {logging_service_id}"), so the deploy target MUST come from --service-id on +# the `fastly compute deploy` command line, never from this file. Committing +# a service_id here would couple the source to one customer's service and +# block multi-tenant rollout. 
+ +[scripts] +build = "cargo build --bin session-scorer --release --target wasm32-wasip1 --color always" + +# Local-dev fixtures consumed by Viceroy (`fastly compute serve`). +# Backend rewrites these per-customer before deploy. +[local_server] + [local_server.dictionaries] + [local_server.dictionaries.scoring_keys] + file = "fixtures/local-dictionary.json" + format = "json" diff --git a/compute/scorer/fixtures/local-dictionary.json b/compute/scorer/fixtures/local-dictionary.json new file mode 100644 index 00000000..be8c928d --- /dev/null +++ b/compute/scorer/fixtures/local-dictionary.json @@ -0,0 +1,4 @@ +{ + "current_key_hex": "0001020304050607080910111213141516171819202122232425262728293031", + "previous_key_hex": "" +} diff --git a/compute/scorer/matrix.default.json b/compute/scorer/matrix.default.json new file mode 100644 index 00000000..cec4d541 --- /dev/null +++ b/compute/scorer/matrix.default.json @@ -0,0 +1 @@ +{"version":"default-empty","vocab_size":0,"session_count":0,"transition_count":0,"counts":{},"row_totals":{},"categories":{},"anchors":[]} diff --git a/compute/scorer/rust-toolchain.toml b/compute/scorer/rust-toolchain.toml new file mode 100644 index 00000000..13a15c4e --- /dev/null +++ b/compute/scorer/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "1.90" +targets = ["wasm32-wasip1"] diff --git a/compute/scorer/src/cookie.rs b/compute/scorer/src/cookie.rs new file mode 100644 index 00000000..b35b2b29 --- /dev/null +++ b/compute/scorer/src/cookie.rs @@ -0,0 +1,540 @@ +//! AES-GCM-with-AAD session cookie codec. +//! +//! Wire-format contract with the Python reference (`backend/scoring/cookie.py`). +//! The packed-binary layout, AAD format, dual-key trial-decrypt order, and +//! base64url encoding all match byte-for-byte. The cross-language fixture +//! tests in [`tests::wire_format`] are the canonical source of truth — if a +//! Python change drifts from Rust (or vice versa), one of those tests fails +//! immediately. +//! +//! 
Layout (variable, little-endian throughout): +//! +//! ```text +//! v u8 schema version (first byte, for dispatch) +//! sid [u8;6] raw session id +//! seq u16 sequence count +//! sum_dt u32 Σ Δt seconds +//! sum_dt_sq u64 Σ Δt² seconds² +//! last_ts u32 last-request unix epoch +//! score u8 quantized 0-100 +//! issued_at u32 cookie creation unix epoch ← end of v1 (30 B) +//! prev_route_len u8 length of prev_route_path (v2 only, 0-255) +//! prev_route_path [u8;N] normalized path of last-scored URL (UTF-8) +//! ``` +//! +//! v1 cookies still decode (with prev_route_path = ""). Encoder always emits v2. +//! +//! aad: ASCII `{service_id}|v{schema_version}` + +use aes_gcm::{ + aead::{Aead, KeyInit, Payload}, + Aes256Gcm, Key, Nonce, +}; +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; + +pub const SCHEMA_VERSION: u8 = 2; +pub const SID_BYTES: usize = 6; +pub const NONCE_BYTES: usize = 12; +pub const KEY_BYTES: usize = 32; +pub const SCORE_BUCKET: u8 = 5; +/// v1 plaintext size (fixed 30 bytes). Kept exposed because the decoder +/// uses it to dispatch v1 vs v2 layout. +pub const V1_PLAINTEXT_BYTES: usize = 30; +/// Maximum bytes we'll encode for prev_route_path. Long paths get +/// truncated at encode time — the matrix transition lookup tolerates an +/// unknown prev_route by returning the uniform-prior probability, so the +/// failure mode of truncation is "L2 = uniform" not "crash". +pub const PREV_ROUTE_MAX_BYTES: usize = 255; +pub const GCM_TAG_BYTES: usize = 16; +/// Minimum total envelope size (v1 plaintext + nonce + tag). Used by +/// decode() to reject obviously-malformed cookies before attempting AEAD. +pub const ENVELOPE_BYTES: usize = NONCE_BYTES + V1_PLAINTEXT_BYTES + GCM_TAG_BYTES; + +#[derive(Debug, PartialEq, Eq)] +pub enum CookieError { + /// Cookie too short, wrong nonce length, plaintext length mismatch. + BadFraming(&'static str), + /// Base64URL decode failed. 
+ BadBase64, + /// AEAD verification failed (tampered, wrong key, wrong AAD). + BadAuth, + /// Decoded payload has a schema version this codec doesn't support. + BadSchemaVersion(u8), + /// SessionState bounds violation when building before encode. + OutOfRange(&'static str), + /// Wrong AES key length passed to codec constructor. + BadKeyLength, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct SessionState { + pub v: u8, + pub sid: [u8; SID_BYTES], + pub seq: u16, + pub sum_dt: u32, + pub sum_dt_sq: u64, + pub last_ts: u32, + pub score: u8, + pub issued_at: u32, + /// Normalized path of the most-recently-scored URL for this session. + /// Carried in the cookie so the scorer can compute the L2 transition + /// probability without VCL having to pass prev_route via a header + /// (req.http doesn't persist across separate client requests, so a + /// header-based mechanism was always broken). Empty when the cookie + /// was a v1 decode or when this is the first request in a session. + pub prev_route_path: String, +} + +impl SessionState { + /// Validate score/version bounds before serialization. The other fields + /// are width-typed (u16, u32, u64) so out-of-range is structurally + /// impossible at the Rust level. + pub fn validate(&self) -> Result<(), CookieError> { + if self.score > 100 { + return Err(CookieError::OutOfRange("score")); + } + // prev_route_path is silently truncated at PREV_ROUTE_MAX_BYTES + // during pack_payload — we don't reject on long paths since the + // fail-mode of truncation is L2 falls back to uniform-prior for + // the unrecognized prefix, which is correct. + Ok(()) + } +} + +/// Quantize a score to the nearest [`SCORE_BUCKET`], clamped to [0, 100]. +/// +/// Matches Python's `quantize_score`: uses bankers-rounding (round-half-to- +/// even) so 12.5 → 10 and 17.5 → 20. The cross-lang round-trip tests pin +/// the expected values, but we get there honestly via Rust's f64::round_ties_even. 
+pub fn quantize_score(raw: f64) -> u8 { + let clamped = raw.clamp(0.0, 100.0); + let bucket = SCORE_BUCKET as f64; + let rounded = (clamped / bucket).round_ties_even() * bucket; + rounded as u8 +} + +fn pack_payload(state: &SessionState) -> Vec { + // v2 layout: 30-byte fixed header + 1-byte length prefix + N bytes + // of UTF-8 path. We always emit the v2 length prefix even when the + // path is empty so the decoder can dispatch unambiguously on + // plaintext length (== 30 → v1 legacy, > 30 → v2). + let path_bytes = state.prev_route_path.as_bytes(); + let path_len = path_bytes.len().min(PREV_ROUTE_MAX_BYTES); + let mut out = Vec::with_capacity(V1_PLAINTEXT_BYTES + 1 + path_len); + out.push(state.v); + out.extend_from_slice(&state.sid); + out.extend_from_slice(&state.seq.to_le_bytes()); + out.extend_from_slice(&state.sum_dt.to_le_bytes()); + out.extend_from_slice(&state.sum_dt_sq.to_le_bytes()); + out.extend_from_slice(&state.last_ts.to_le_bytes()); + out.push(state.score); + out.extend_from_slice(&state.issued_at.to_le_bytes()); + out.push(path_len as u8); + out.extend_from_slice(&path_bytes[..path_len]); + out +} + +fn unpack_payload(buf: &[u8]) -> Result { + if buf.len() < V1_PLAINTEXT_BYTES { + return Err(CookieError::BadFraming("plaintext too short")); + } + let mut sid = [0u8; SID_BYTES]; + sid.copy_from_slice(&buf[1..7]); + let mut state = SessionState { + v: buf[0], + sid, + seq: u16::from_le_bytes(buf[7..9].try_into().unwrap()), + sum_dt: u32::from_le_bytes(buf[9..13].try_into().unwrap()), + sum_dt_sq: u64::from_le_bytes(buf[13..21].try_into().unwrap()), + last_ts: u32::from_le_bytes(buf[21..25].try_into().unwrap()), + score: buf[25], + issued_at: u32::from_le_bytes(buf[26..30].try_into().unwrap()), + prev_route_path: String::new(), + }; + // v1 cookies stop here (30 bytes total). v2 has a length-prefixed + // UTF-8 path suffix. 
+ if buf.len() > V1_PLAINTEXT_BYTES { + let path_len = buf[V1_PLAINTEXT_BYTES] as usize; + let path_end = V1_PLAINTEXT_BYTES + 1 + path_len; + if buf.len() != path_end { + return Err(CookieError::BadFraming("prev_route_path length")); + } + state.prev_route_path = std::str::from_utf8(&buf[V1_PLAINTEXT_BYTES + 1..path_end]) + .map_err(|_| CookieError::BadFraming("prev_route_path utf-8"))? + .to_string(); + } + Ok(state) +} + +fn aad(service_id: &str, schema_version: u8) -> Vec { + format!("{}|v{}", service_id, schema_version).into_bytes() +} + +/// Encrypt + base64url. `nonce` MUST be unique-per-encrypt under any key +/// (reused nonces under the same key destroy GCM's confidentiality + auth +/// guarantees simultaneously). The caller passes a fresh 96-bit random +/// nonce in production; tests use a fixed nonce to pin wire-format bytes. +pub fn encode( + state: &SessionState, + key: &[u8], + nonce: &[u8], + service_id: &str, + schema_version: u8, +) -> Result { + state.validate()?; + if state.v != schema_version { + return Err(CookieError::BadSchemaVersion(state.v)); + } + if key.len() != KEY_BYTES { + return Err(CookieError::BadKeyLength); + } + if nonce.len() != NONCE_BYTES { + return Err(CookieError::BadFraming("nonce length")); + } + + let cipher = Aes256Gcm::new(Key::::from_slice(key)); + let plaintext = pack_payload(state); + let aad_bytes = aad(service_id, schema_version); + let ciphertext = cipher + .encrypt( + Nonce::from_slice(nonce), + Payload { + msg: plaintext.as_slice(), + aad: &aad_bytes, + }, + ) + .map_err(|_| CookieError::BadAuth)?; + + let mut envelope = Vec::with_capacity(NONCE_BYTES + ciphertext.len()); + envelope.extend_from_slice(nonce); + envelope.extend_from_slice(&ciphertext); + Ok(URL_SAFE_NO_PAD.encode(&envelope)) +} + +/// Decrypt + verify. Trial-decrypts with `key`, then `previous_key` if +/// present (24h post-rotation grace window). 
All three failure modes +/// (bad framing, bad base64, AEAD failure) surface as distinct +/// [`CookieError`] variants so the caller can categorize for the +/// X-Edge-Cookie-Compliance header. +pub fn decode( + cookie: &str, + key: &[u8], + previous_key: Option<&[u8]>, + service_id: &str, + schema_version: u8, +) -> Result { + if key.len() != KEY_BYTES { + return Err(CookieError::BadKeyLength); + } + if let Some(p) = previous_key { + if p.len() != KEY_BYTES { + return Err(CookieError::BadKeyLength); + } + } + + let raw = URL_SAFE_NO_PAD + .decode(cookie) + .map_err(|_| CookieError::BadBase64)?; + if raw.len() < ENVELOPE_BYTES { + return Err(CookieError::BadFraming("envelope too short")); + } + let nonce = &raw[..NONCE_BYTES]; + let ciphertext = &raw[NONCE_BYTES..]; + let aad_bytes = aad(service_id, schema_version); + + let try_decrypt = |k: &[u8]| -> Option> { + let cipher = Aes256Gcm::new(Key::::from_slice(k)); + cipher + .decrypt( + Nonce::from_slice(nonce), + Payload { msg: ciphertext, aad: &aad_bytes }, + ) + .ok() + }; + + let plaintext = match try_decrypt(key) { + Some(p) => p, + None => previous_key + .and_then(try_decrypt) + .ok_or(CookieError::BadAuth)?, + }; + + let state = unpack_payload(&plaintext)?; + // Accept v1 cookies during the migration window — they have no + // prev_route_path, which means L2 transition lookup falls back to + // uniform probability for that one request, but the request still + // serves. The decoder is the only place we accept old schemas; the + // encoder always emits the current SCHEMA_VERSION. 
+ if state.v != schema_version && state.v != 1 { + return Err(CookieError::BadSchemaVersion(state.v)); + } + Ok(state) +} + +#[cfg(test)] +mod tests { + use super::*; + + const KEY_A: [u8; 32] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, + ]; + const KEY_B: [u8; 32] = [ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1, 0, + ]; + const NONCE_FIXED: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + const SVC: &str = "TestSvc123"; + + fn state() -> SessionState { + SessionState { + v: SCHEMA_VERSION, + sid: [1, 2, 3, 4, 5, 6], + seq: 10, + sum_dt: 100, + sum_dt_sq: 1500, + last_ts: 1_700_000_000, + score: 25, + issued_at: 1_699_990_000, + prev_route_path: String::new(), + } + } + + // ── quantize_score ────────────────────────────────────────────────────── + + #[test] + fn quantize_buckets() { + // Pinned values from the Python parametrized test set. 
+ assert_eq!(quantize_score(0.0), 0); + assert_eq!(quantize_score(1.0), 0); + assert_eq!(quantize_score(2.0), 0); + assert_eq!(quantize_score(3.0), 5); + assert_eq!(quantize_score(7.0), 5); + assert_eq!(quantize_score(8.0), 10); + assert_eq!(quantize_score(12.0), 10); + assert_eq!(quantize_score(13.0), 15); + assert_eq!(quantize_score(47.0), 45); + assert_eq!(quantize_score(48.0), 50); + assert_eq!(quantize_score(97.0), 95); + assert_eq!(quantize_score(98.0), 100); + assert_eq!(quantize_score(100.0), 100); + assert_eq!(quantize_score(-5.0), 0); + assert_eq!(quantize_score(150.0), 100); + // Banker's rounding: 50.5 → 50 (matches Python's round()) + assert_eq!(quantize_score(50.5), 50); + // 52.5/5 = 10.5 → bankers-rounds to 10 → *5 = 50 + assert_eq!(quantize_score(52.5), 50); + } + + // ── pack / unpack ─────────────────────────────────────────────────────── + + /// CROSS-LANGUAGE CONTRACT: this hex string is byte-identical to the one + /// pinned in `tests/scoring/test_cookie.py::test_pack_layout_byte_exact`. + /// If either side changes wire format, both tests update together — or + /// the build breaks. v2 adds a length-prefixed UTF-8 path suffix; the + /// fixture exercises a non-empty path so the byte layout is verified + /// end-to-end including the length prefix. + #[test] + fn pack_layout_byte_exact() { + let s = SessionState { + v: 2, + sid: [0x11, 0x22, 0x33, 0x44, 0x55, 0x66], + seq: 0x1234, + sum_dt: 0x10203040, + sum_dt_sq: 0x0102030405060708, + last_ts: 0x65000000, + score: 80, + issued_at: 0x64000000, + // "/home" → 5 UTF-8 bytes: 2f 68 6f 6d 65 + prev_route_path: "/home".to_string(), + }; + let packed = pack_payload(&s); + let expected = hex::decode( + "02\ + 112233445566\ + 3412\ + 40302010\ + 0807060504030201\ + 00000065\ + 50\ + 00000064\ + 05\ + 2f686f6d65", + ) + .unwrap(); + assert_eq!(&packed[..], &expected[..]); + } + + /// v2 with empty prev_route_path still emits the length-prefix byte + /// (= 0). 
The wire layout is always v2 from the encoder; v1 is only a + /// decode-side back-compat. + #[test] + fn pack_layout_empty_prev_route() { + let mut s = state(); + s.prev_route_path = String::new(); + let packed = pack_payload(&s); + assert_eq!(packed.len(), V1_PLAINTEXT_BYTES + 1); + assert_eq!(packed[V1_PLAINTEXT_BYTES], 0); + } + + /// v1 decoder back-compat: a 30-byte plaintext should round-trip into + /// a SessionState with prev_route_path = empty (legacy cookies issued + /// before the schema bump). + #[test] + fn unpack_accepts_v1_30_byte_plaintext() { + let v1 = hex::decode( + "01\ + 112233445566\ + 3412\ + 40302010\ + 0807060504030201\ + 00000065\ + 50\ + 00000064", + ) + .unwrap(); + let s = unpack_payload(&v1).unwrap(); + assert_eq!(s.v, 1); + assert_eq!(s.prev_route_path, ""); + assert_eq!(s.score, 80); + } + + #[test] + fn pack_unpack_round_trip_with_path() { + let mut s = state(); + s.prev_route_path = "/checkout".to_string(); + let packed = pack_payload(&s); + // v1 header (30) + 1 length byte + 9 path bytes = 40 bytes + assert_eq!(packed.len(), V1_PLAINTEXT_BYTES + 1 + 9); + let recovered = unpack_payload(&packed).unwrap(); + assert_eq!(recovered, s); + } + + #[test] + fn unpack_rejects_too_short() { + assert!(matches!( + unpack_payload(&[0u8; 29]), + Err(CookieError::BadFraming(_)) + )); + } + + // ── encode / decode round-trip ────────────────────────────────────────── + + #[test] + fn encode_decode_round_trip() { + let s = state(); + let cookie = encode(&s, &KEY_A, &NONCE_FIXED, SVC, SCHEMA_VERSION).unwrap(); + let decoded = decode(&cookie, &KEY_A, None, SVC, SCHEMA_VERSION).unwrap(); + assert_eq!(decoded, s); + } + + #[test] + fn encode_envelope_size_with_empty_path() { + // v2 always emits a length-prefix byte even when path is empty, + // so envelope is one byte larger than the v1 baseline. 
/// A non-empty `prev_route_path` must survive a full encode → decode
/// cycle unchanged, alongside the score and session id.
#[test]
fn encode_decode_round_trip_preserves_prev_route_path() {
    const ROUTE: &str = "/users/{int}/profile";

    let mut original = state();
    original.prev_route_path = ROUTE.to_string();

    let wire = encode(&original, &KEY_A, &NONCE_FIXED, SVC, SCHEMA_VERSION).unwrap();
    let round_tripped = decode(&wire, &KEY_A, None, SVC, SCHEMA_VERSION).unwrap();

    assert_eq!(round_tripped.prev_route_path, ROUTE);
    assert_eq!(round_tripped.score, original.score);
    assert_eq!(round_tripped.sid, original.sid);
}
/// Flipping a single bit past the nonce prefix must make decoding fail
/// with `BadAuth`.
#[test]
fn decode_rejects_tampered_ciphertext() {
    let intact = encode(&state(), &KEY_A, &NONCE_FIXED, SVC, SCHEMA_VERSION).unwrap();

    // Work on the raw envelope bytes, corrupt one of them, then
    // re-wrap in base64url so the decoder sees a well-formed cookie.
    let mut envelope = URL_SAFE_NO_PAD.decode(&intact).unwrap();
    let target = NONCE_BYTES + 5;
    envelope[target] ^= 0x01;
    let forged = URL_SAFE_NO_PAD.encode(&envelope);

    let outcome = decode(&forged, &KEY_A, None, SVC, SCHEMA_VERSION);
    assert_eq!(outcome, Err(CookieError::BadAuth));
}
/// Key-rotation grace period: a cookie minted under the old key (A) must
/// still decode when the decoder holds B as the current key and A as the
/// previous key.
#[test]
fn dual_key_decrypts_via_previous_during_grace() {
    let minted_under_a = encode(&state(), &KEY_A, &NONCE_FIXED, SVC, SCHEMA_VERSION).unwrap();

    let current_key = &KEY_B;
    let previous_key = Some(&KEY_A);
    let recovered = decode(
        &minted_under_a,
        current_key,
        previous_key,
        SVC,
        SCHEMA_VERSION,
    )
    .unwrap();

    assert_eq!(recovered, state());
}
// Lightweight in-process counters. Emitted via dbg_log every
// METRICS_EMIT_EVERY requests so the operator can grep `metrics:` in
// `fastly log-tail` for a rough sense of how often the cookie is being
// tampered with, how often we're hard-blocking, and whether the
// embedded matrix ever fails to load (it shouldn't — it's compiled in).
// Counters are process-wide on the Wasm instance and reset whenever
// Fastly recycles the instance; we accept that imprecision in exchange
// for zero-coordination atomics on the hot path.

// Bumped in score_request whenever an inbound cookie is present but
// cookie::decode returns an error.
static TAMPERED_COOKIE_COUNT: AtomicU64 = AtomicU64::new(0);
// Bumped in score_request when the score meets or exceeds the
// configured enforce threshold.
static ENFORCE_BLOCK_COUNT: AtomicU64 = AtomicU64::new(0);
// Bumped in score_request when matrix::load_embedded() yields None.
static MATRIX_LOAD_FAIL_COUNT: AtomicU64 = AtomicU64::new(0);
// Bumped once per call to maybe_emit_metrics; drives the emit cadence.
static REQUEST_COUNT: AtomicU64 = AtomicU64::new(0);
// A metrics line is logged each time REQUEST_COUNT reaches a multiple
// of this value.
const METRICS_EMIT_EVERY: u64 = 1000;
+const CONFIG_STORE: &str = "scoring_config"; +const DEBUG_LOG_KEY: &str = "debug_logging_enabled"; +const ENFORCE_THRESHOLD_KEY: &str = "enforce_threshold"; + +#[fastly::main] +fn main(req: Request) -> Result { + // The score function never panics; the worst case returns a + // diagnostic-only response with X-Edge-Score=0 so VCL fails open. + Ok(score_request(&req)) +} + +fn score_request(req: &Request) -> Response { + // ── Auth: reject any request that doesn't bear the right shared + // secret from our VCL service. The secret is written into + // the scoring_keys ConfigStore at provision time and + // embedded into the VCL recv snippet. Stops the scorer + // domain (which is reachable from anywhere on the public + // internet) from being scored on by random people who + // find the hostname. + if !request_auth_ok(req) { + return Response::from_status(401) + .with_header("X-Edge-Score-Reason", "unauthorized") + .with_body("unauthorized"); + } + + // ── Header inputs (VCL fills these before Compute is invoked). ─────────── + let service_id = req.get_header_str(SERVICE_ID_HEADER).unwrap_or("default"); + let prev_route_raw = req.get_header_str(PREV_ROUTE_HEADER); + let prev_anchor_raw = req.get_header_str(PREV_ANCHOR_HEADER); + let matrix_age_days: f64 = req + .get_header_str(MATRIX_AGE_HEADER) + .and_then(|s| s.parse().ok()) + .unwrap_or(0.0); + + // Cheap toggle: flip the `debug_logging_enabled` key in the + // `scoring_config` ConfigStore to true (any truthy string), tail the + // service with `fastly log-tail`, then flip back to off. The store + // lookup is a constant-time hash hit so leaving the check in costs + // ~nothing on the hot path. + let debug = debug_logging_enabled(); + // Wall-clock since the Wasm instance booted, in nanoseconds. We + // diff start vs end to get the time spent scoring this request, + // which goes into the debug log so the operator can see real + // edge-side latency without leaving Fastly's tools. 
+ let t0 = if debug { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0) + } else { + 0 + }; + if debug { + dbg_log(&format!( + "incoming: url={} service={} prev_route={:?} prev_anchor={:?} matrix_age_days={}", + req.get_url_str(), + service_id, + prev_route_raw, + prev_anchor_raw, + matrix_age_days, + )); + } + + // ── Resolve current route from the request URL. ────────────────────────── + let current_route = normalize::normalize(req.get_url_str()); + + // ── Load AES-GCM keys from the Edge Dictionary. ────────────────────────── + let (key, prev_key) = match load_keys() { + Ok(pair) => pair, + Err(_) => { + // Misconfigured dictionary is operationally critical but should + // fail open in the request path. Emit a diagnostic header so the + // outage is visible in VCL logs. + return fail_open_response("internal-error-keys"); + } + }; + + // ── Decode the inbound cookie (if any). ────────────────────────────────── + let inbound_cookie = req + .get_header_str("cookie") + .and_then(|h| extract_cookie_value(h, COOKIE_NAME)); + + let (state, compliance) = match inbound_cookie { + None => (None, "missing"), + Some(value) => match cookie::decode( + value, + &key, + prev_key.as_deref(), + service_id, + cookie::SCHEMA_VERSION, + ) { + Ok(s) => (Some(s), "ok"), + Err(_) => { + TAMPERED_COOKIE_COUNT.fetch_add(1, Ordering::Relaxed); + (None, "tampered") + } + }, + }; + + // ── Resolve previous route(s) for L2. ──────────────────────────────────── + // Prefer the prev_route stored in the cookie state (carried forward + // from the last scored request in this session) — req.http doesn't + // persist across separate client requests, so the X-Edge-Prev-Route + // header path was always empty. The header is still consulted as a + // fallback for one-off testing scenarios where the cookie is missing. 
+ let prev_route_from_state = state + .as_ref() + .filter(|s| !s.prev_route_path.is_empty()) + .map(|s| normalize::Route { + path: s.prev_route_path.clone(), + // L2 transition lookup only uses path; category is unused. + // Leaving empty avoids re-running the full normalize() pass. + category: String::new(), + }); + let prev_route = prev_route_from_state + .or_else(|| prev_route_raw.map(|s| normalize::normalize(s))); + let prev_anchor = prev_anchor_raw.map(|s| normalize::normalize(s)); + + // ── Score. ─────────────────────────────────────────────────────────────── + let matrix = matrix::load_embedded(); + if matrix.is_none() { + // The matrix is compiled into the Wasm binary, so a None here + // means the embedded JSON failed to parse at first access — + // operationally this would only happen after a bad deploy. + // Bump the counter so the periodic metrics line surfaces it. + MATRIX_LOAD_FAIL_COUNT.fetch_add(1, Ordering::Relaxed); + } + let result = scorer::score_combined(scorer::ScoreInputs { + state: state.as_ref(), + cookie_compliance: compliance, + current_route: ¤t_route, + prev_route: prev_route.as_ref(), + prev_anchor_route: prev_anchor.as_ref(), + matrix, + matrix_age_days, + }); + + // ── Re-encode the updated cookie. ──────────────────────────────────────── + // We rotate the cookie on every request so the seq/sum_dt fields stay + // fresh and the encryption nonce never repeats. The just-scored + // current_route becomes the next request's prev_route. + let now_secs: u32 = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs() as u32) + .unwrap_or(0); + let updated = update_state(state, &result, ¤t_route.path, now_secs); + let set_cookie = match cookie::encode( + &updated, + &key, + &random_nonce(), + service_id, + cookie::SCHEMA_VERSION, + ) { + Ok(c) => Some(c), + Err(_) => None, + }; + + // ── Build response. 
────────────────────────────────────────────────────── + let mut resp = Response::from_status(200); + for (k, v) in result.headers() { + resp.set_header(k, v); + } + // Emit the session id as a hex-encoded response header so VCL can + // capture it into the log line. Used by the admin to label specific + // sessions (good / bad / neutral) for ROC-AUC training. The VCL + // deliver snippet strips this header from the client-facing response + // (same hardening as the other X-Edge-* fields, per research doc + // §1.3). + resp.set_header("X-Edge-Sid", hex::encode(updated.sid)); + + // ENFORCEMENT signal: when the operator has committed an + // enforce_threshold value via the admin UI (writes to the + // scoring_config ConfigStore), set X-Edge-Score-Enforce=1 when the + // request's score meets or exceeds it. VCL reads this in a recv- + // restart-2 snippet and `error 429`s the request. Missing key or + // unparseable value → no enforcement (fail-open). + if let Some(t) = load_enforce_threshold() { + if u32::from(result.score) >= t { + ENFORCE_BLOCK_COUNT.fetch_add(1, Ordering::Relaxed); + resp.set_header("X-Edge-Score-Enforce", "1"); + } + } + + if let Some(cookie_value) = set_cookie { + resp.set_header( + "Set-Cookie", + format!( + "{}={}; Path=/; HttpOnly; Secure; SameSite=Lax", + COOKIE_NAME, cookie_value + ), + ); + } + + if debug { + let t1 = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + let elapsed_us = (t1.saturating_sub(t0)) / 1_000; + dbg_log(&format!( + "scored: score={} l1={} l2={} compliance={} reasons=[{}] mean_dwell_s={:.3} variance_s2={:.3} trans_prob={:.6} matrix_version={} elapsed_us={}", + result.score, + result.l1_score, + result.l2_score, + result.cookie_compliance, + result.reasons.join(","), + result.mean_dwell_s, + result.variance_s2, + result.trans_prob, + result.matrix_version, + elapsed_us, + )); + } + + maybe_emit_metrics(); + + resp +} + +/// Read the debug toggle from the 
`scoring_config` ConfigStore. Any truthy +/// string ("1", "true", "yes", any non-empty value other than "0"/"false") +/// enables verbose log emission. Missing config store / missing key → off. +/// +/// Always returns a bool — never panics — because this is on the request +/// hot path and a misconfigured store must not 5xx real traffic. +fn debug_logging_enabled() -> bool { + // ConfigStore::open panics if the store doesn't exist. catch_unwind is + // a no-op under wasm32 + panic=abort, so we use try_open to actually + // achieve the "missing store → silent fallback" semantic. A missing + // store on this fast-path code = debug off, which is the right default + // for a fresh service that hasn't been fully configured yet. + let dict = match ConfigStore::try_open(CONFIG_STORE) { + Ok(d) => d, + Err(_) => return false, + }; + match dict.get(DEBUG_LOG_KEY) { + Some(v) => { + let trimmed = v.trim().to_ascii_lowercase(); + !trimmed.is_empty() && trimmed != "0" && trimmed != "false" && trimmed != "no" + } + None => false, + } +} + +/// Read the operator's committed enforce_threshold from scoring_config. +/// +/// Returns ``Some(0..=100)`` when set + parseable; ``None`` otherwise. +/// The fail-open posture matches debug_logging_enabled: missing store, +/// missing key, or unparseable value → None → no enforcement happens. +/// Operator clears enforcement by deleting the key or writing a non- +/// numeric value (e.g. "off"). Values outside 0..100 are clamped. +fn load_enforce_threshold() -> Option { + let dict = ConfigStore::try_open(CONFIG_STORE).ok()?; + let raw = dict.get(ENFORCE_THRESHOLD_KEY)?; + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + let n: u32 = trimmed.parse().ok()?; + Some(n.min(100)) +} + +/// Write a structured log line to stderr via `eprintln!`. 
On Wasm, +/// Fastly's runtime captures stderr (alongside stdout) and surfaces it +/// via `fastly log-tail`, so eprintln is the right destination here — +/// it keeps these diagnostic lines visually distinct from any future +/// real stdout output the binary might emit. Native test builds also +/// just print so the dbg_log call sites compile and can be exercised +/// without a Wasm runtime. +fn dbg_log(msg: &str) { + eprintln!("[scoring/dbg] {}", msg); +} + +/// Bump the per-instance request counter and, every `METRICS_EMIT_EVERY` +/// requests, emit a one-line metrics summary via dbg_log. Runs +/// unconditionally (independent of the debug toggle) because the +/// counters themselves are always incremented and the emission cost is +/// amortized to ~one log line per 1000 requests. Reads counters with +/// Relaxed ordering — exact values aren't required, only rough +/// magnitudes for operator visibility. +fn maybe_emit_metrics() { + let count = REQUEST_COUNT.fetch_add(1, Ordering::Relaxed).wrapping_add(1); + if count % METRICS_EMIT_EVERY == 0 { + dbg_log(&format!( + "metrics: tampered={} enforce_block={} matrix_fail={} requests={}", + TAMPERED_COOKIE_COUNT.load(Ordering::Relaxed), + ENFORCE_BLOCK_COUNT.load(Ordering::Relaxed), + MATRIX_LOAD_FAIL_COUNT.load(Ordering::Relaxed), + count, + )); + } +} + +fn fail_open_response(reason: &str) -> Response { + let mut resp = Response::from_status(200); + resp.set_header("X-Edge-Score", "0"); + resp.set_header("X-Edge-Score-L1", "0"); + resp.set_header("X-Edge-Score-L2", "0"); + resp.set_header("X-Edge-Cookie-Compliance", "unknown"); + resp.set_header("X-Edge-Score-Reason", reason); + resp +} + +fn request_auth_ok(req: &Request) -> bool { + let provided = req.get_header_str(SCORER_AUTH_HEADER).unwrap_or(""); + if provided.is_empty() { + return false; + } + // Use try_open so a missing scoring_keys store fails-closed gracefully + // instead of panicking; load_request_secret also returns None when + // the key is missing or 
empty. Either way: reject the request — better + // than letting unauthenticated traffic through on misconfiguration. + let expected = match load_request_secret() { + Some(v) => v, + None => return false, + }; + // Constant-time compare to avoid timing-leak side channels. The + // comparison is over short strings (32 hex chars) so the gain is + // minor in practice but free to add. + constant_time_eq(provided.as_bytes(), expected.as_bytes()) +} + +fn load_request_secret() -> Option { + let dict = ConfigStore::try_open(KEYS_STORE).ok()?; + let v = dict.get(REQUEST_SECRET_KEY)?; + if v.is_empty() { + None + } else { + Some(v) + } +} + +fn constant_time_eq(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut diff: u8 = 0; + for (x, y) in a.iter().zip(b.iter()) { + diff |= x ^ y; + } + diff == 0 +} + +fn load_keys() -> Result<(Vec, Option>), Error> { + let dict = ConfigStore::open(KEYS_STORE); + let key_hex = dict + .get("current_key_hex") + .ok_or_else(|| Error::msg("scoring_keys.current_key_hex missing"))?; + let prev_hex = dict.get("previous_key_hex"); + + let key = hex_decode(&key_hex)?; + let prev = match prev_hex { + Some(h) if !h.is_empty() => Some(hex_decode(&h)?), + _ => None, + }; + Ok((key, prev)) +} + +fn hex_decode(s: &str) -> Result, Error> { + if s.len() % 2 != 0 { + return Err(Error::msg("hex key has odd length")); + } + (0..s.len()) + .step_by(2) + .map(|i| u8::from_str_radix(&s[i..i + 2], 16)) + .collect::, _>>() + .map_err(|_| Error::msg("invalid hex in key")) +} + +/// Pull a single named cookie value out of a `Cookie:` header. Minimal — +/// doesn't handle quoted values or escapes (we only consume cookies we +/// ourselves emitted, so the value space is base64url alphabet only). 
/// Find the value of the cookie called `name` inside a `Cookie:` header.
///
/// The header is split on `;`, each segment is trimmed, and the segment
/// is split at its first `=`. Segments without `=` are ignored. The
/// first segment whose key equals `name` exactly wins; its value is
/// everything after that first `=`. Quoted values and escapes are not
/// interpreted.
fn extract_cookie_value<'a>(cookie_header: &'a str, name: &str) -> Option<&'a str> {
    cookie_header
        .split(';')
        .filter_map(|segment| segment.trim().split_once('='))
        .find(|(key, _)| *key == name)
        .map(|(_, value)| value)
}
SESSION_IDLE_EXPIRE_S caps the gap between +/// adjacent requests in one session (default 30 min — covers typical +/// user idle on a tab); SESSION_HARD_CAP_S caps the total session +/// duration regardless of activity (default 24h — bounds long-lived +/// sessions so a stolen cookie can't be replayed indefinitely). When +/// either threshold is exceeded, update_state mints a fresh sid + +/// resets all timing accumulators — the cookie remains valid (AES +/// decrypt + AAD check still succeed) but acts as a new session. +const SESSION_IDLE_EXPIRE_S: u32 = 30 * 60; // 30 minutes +const SESSION_HARD_CAP_S: u32 = 24 * 60 * 60; // 24 hours + +fn update_state( + prev: Option, + result: &scorer::ScoreResult, + current_route_path: &str, + now_secs: u32, +) -> cookie::SessionState { + let prev_route_path = current_route_path.to_string(); + match prev { + Some(s) => { + let idle = now_secs.saturating_sub(s.last_ts); + let age = now_secs.saturating_sub(s.issued_at); + // SESSION ROTATION: idle-expire OR hard-cap → mint a fresh + // sid and reset timing. Bounded session lifetime is a + // security feature (stolen cookies can't be replayed after + // their window) and a data-hygiene feature (long-running + // sessions stop biasing the variance estimator). + if idle > SESSION_IDLE_EXPIRE_S || age > SESSION_HARD_CAP_S { + return cookie::SessionState { + v: cookie::SCHEMA_VERSION, + sid: new_random_sid(), + seq: 1, + sum_dt: 0, + sum_dt_sq: 0, + last_ts: now_secs, + score: result.score, + issued_at: now_secs, + prev_route_path, + }; + } + let new_seq = s.seq.saturating_add(1); + // Δt since the previous request in this session. Clamp at a + // 1-hour ceiling to bound the impact of long-idle sessions + // on the variance estimator (the L2 transition matrix + // already discounts inactivity differently). saturating_sub + // protects against clock skew where last_ts > now_secs. 
+ let dt_secs: u32 = idle.min(3600); + let dt64 = u64::from(dt_secs); + let new_sum_dt = s.sum_dt.saturating_add(dt_secs); + let new_sum_dt_sq = s.sum_dt_sq.saturating_add(dt64.saturating_mul(dt64)); + cookie::SessionState { + v: cookie::SCHEMA_VERSION, + sid: s.sid, + seq: new_seq, + sum_dt: new_sum_dt, + sum_dt_sq: new_sum_dt_sq, + last_ts: now_secs, + score: result.score, + issued_at: s.issued_at, + prev_route_path, + } + } + None => cookie::SessionState { + v: cookie::SCHEMA_VERSION, + sid: new_random_sid(), + seq: 1, + sum_dt: 0, + sum_dt_sq: 0, + last_ts: now_secs, + score: result.score, + issued_at: now_secs, + prev_route_path, + }, + } +} + +fn new_random_sid() -> [u8; cookie::SID_BYTES] { + // Same CSPRNG-or-panic policy as random_nonce: a collision in sids + // doesn't break confidentiality but it does collapse distinct + // sessions into the same row in the labels table. Better to abort + // the request and fail-open than to return a deterministic sid. + let mut buf = [0u8; cookie::SID_BYTES]; + getrandom::getrandom(&mut buf).expect("WASI getrandom must not fail when generating session sid"); + buf +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_cookie_value_basic() { + let h = "X-Session-State=ABC; other=foo"; + assert_eq!(extract_cookie_value(h, "X-Session-State"), Some("ABC")); + assert_eq!(extract_cookie_value(h, "other"), Some("foo")); + assert_eq!(extract_cookie_value(h, "missing"), None); + } + + #[test] + fn extract_cookie_value_handles_spacing() { + // Each `;`-delimited segment is .trim()'d, so leading/trailing + // whitespace around the segment is dropped. Real cookies are + // base64url so this is mostly cosmetic, but verify the trim works. 
/// `hex_decode` must invert a plain hex encoding of 32 bytes (the shape
/// of a 256-bit key), for both lower- and upper-case digits.
#[test]
fn hex_decode_round_trip() {
    let bytes: Vec<u8> = (0..32).collect();
    let hex: String = bytes.iter().map(|b| format!("{:02x}", b)).collect();
    assert_eq!(hex_decode(&hex).unwrap(), bytes);
    // Keys pasted in upper case must decode to the same bytes.
    assert_eq!(hex_decode(&hex.to_uppercase()).unwrap(), bytes);
}
/// A session older than SESSION_HARD_CAP_S must be rotated even when the
/// most recent request was only a moment ago.
#[test]
fn test_update_state_hard_cap_rotates_sid() {
    // Establish a session at t = 1000.
    let result = mk_score_result(20);
    let s1 = update_state(None, &result, "/home", 1_000);
    let original_sid = s1.sid;

    // Jumping `now` forward in a single step would also exceed
    // SESSION_IDLE_EXPIRE_S and hit the idle branch instead. To isolate
    // the hard cap, hand-build a previous state whose issued_at is old
    // but whose last_ts is recent, so idle is 1s while age is past the
    // cap.
    let aged = cookie::SessionState {
        issued_at: 1_000,
        last_ts: 1_000 + SESSION_HARD_CAP_S, // recent activity
        ..s1
    };
    let now = 1_000 + SESSION_HARD_CAP_S + 1; // 1s past the cap
    let s2 = update_state(Some(aged), &result, "/home", now);

    assert_ne!(s2.sid, original_sid, "hard-cap must mint a new sid");
    assert_eq!(s2.seq, 1, "accumulators should reset on hard-cap rotation");
    assert_eq!(s2.issued_at, now);
}
/// Ordinary spacings accumulate exactly, and an idle gap longer than the
/// idle limit rotates the session instead of being folded in.
#[test]
fn test_update_state_sum_dt_clamp() {
    // In-window dt should accumulate to the actual elapsed seconds.
    // After two calls 7s apart, sum_dt should be 7 and sum_dt_sq
    // should be 49 — verifies that neither the saturating subtraction
    // nor the 1-hour ceiling alters ordinary spacings.
    let result = mk_score_result(0);
    let s1 = update_state(None, &result, "/home", 1_000);
    assert_eq!(s1.sum_dt, 0, "fresh session starts with sum_dt = 0");

    let s2 = update_state(Some(s1), &result, "/about", 1_007);
    assert_eq!(s2.sum_dt, 7, "sum_dt should equal elapsed seconds");
    assert_eq!(s2.sum_dt_sq, 49, "sum_dt_sq should equal dt²");

    // A 2-hour gap (7200s) exceeds the 30-minute idle limit (1800s), so
    // the session is rotated before the dt clamp is ever consulted.
    // With the current constants the 1-hour clamp cannot be observed
    // through update_state: any gap large enough to be clamped has
    // already triggered idle-expire. This assertion documents that
    // precedence.
    let s3 = update_state(Some(s2), &result, "/x", 1_007 + 7_200);
    assert_eq!(s3.seq, 1, "idle-expire takes precedence over dt clamp");
}
lifetime of the Wasm instance — a single parse + many lookups per +//! request, no allocations on the hot path. + +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::OnceLock; + +/// Embedded at compile time. The workspace tracks a no-op placeholder at +/// `matrix.default.json` (vocab_size=0, no counts) so anyone can `cargo +/// build` and `cargo test` on a fresh checkout. The deploy pipeline +/// (`scripts/scoring/build_wasm.sh`, written in Phase D) copies the real +/// trained `matrix.json` over this path before invoking `fastly compute +/// build`, embedding the customer-specific matrix into the published Wasm. +/// +/// When the embedded blob is the empty default (vocab_size == 0), the +/// scorer's L2 layer disables itself — matching the doc's pre-Day-7 +/// behavior (§4.3 blend weight is 0). +const EMBEDDED_MATRIX_BYTES: &[u8] = include_bytes!("../matrix.default.json"); + +#[derive(Debug, Clone, Default, Deserialize)] +pub struct TransitionMatrix { + #[serde(default)] + pub version: String, + #[serde(default)] + pub vocab_size: u32, + #[serde(default)] + pub session_count: u64, + #[serde(default)] + pub transition_count: u64, + #[serde(default)] + pub counts: HashMap<String, HashMap<String, u64>>, + #[serde(default)] + pub row_totals: HashMap<String, u64>, + #[serde(default)] + pub categories: HashMap<String, String>, + #[serde(default)] + pub anchors: Vec<String>, +} + +static MATRIX_CACHE: OnceLock<TransitionMatrix> = OnceLock::new(); + +/// Lazily parse and return a static-lifetime reference to the embedded +/// matrix. Returns `None` if the embedded blob is empty (build artifact +/// missing) — the request handler treats that as "L2 disabled, fall back +/// to L1 only" which matches the doc's pre-Day-7 behavior (§4.3).
+pub fn load_embedded() -> Option<&'static TransitionMatrix> { + if EMBEDDED_MATRIX_BYTES.is_empty() { + return None; + } + Some(MATRIX_CACHE.get_or_init(|| { + serde_json::from_slice(EMBEDDED_MATRIX_BYTES).expect("embedded matrix.json malformed") + })) +} + +/// Convenience for tests: parse from an arbitrary JSON byte slice. +#[cfg(test)] +pub fn parse(bytes: &[u8]) -> serde_json::Result<TransitionMatrix> { + serde_json::from_slice(bytes) +} + +#[cfg(test)] +mod tests { + use super::*; + + const SAMPLE: &str = r#" + { + "version": "test-2026-06-01-a", + "built_at": "2026-06-01T00:00:00+00:00", + "vocab_size": 3, + "session_count": 10, + "transition_count": 20, + "counts": {"/home": {"/products": 15, "/cart": 5}}, + "row_totals": {"/home": 20}, + "categories": {"/home": "home", "/products": "product", "/cart": "cart"}, + "anchors": ["/home", "/products"] + } + "#; + + #[test] + fn parse_sample_round_trip() { + let m = parse(SAMPLE.as_bytes()).unwrap(); + assert_eq!(m.version, "test-2026-06-01-a"); + assert_eq!(m.vocab_size, 3); + assert_eq!(m.counts.get("/home").unwrap().get("/products"), Some(&15)); + assert_eq!(m.row_totals.get("/home"), Some(&20)); + assert_eq!(m.anchors, vec!["/home", "/products"]); + } + + #[test] + fn parse_handles_missing_optional_fields() { + let minimal = r#"{"vocab_size": 1}"#; + let m = parse(minimal.as_bytes()).unwrap(); + assert_eq!(m.vocab_size, 1); + assert!(m.counts.is_empty()); + assert!(m.anchors.is_empty()); + } +} diff --git a/compute/scorer/src/normalize.rs b/compute/scorer/src/normalize.rs new file mode 100644 index 00000000..1ff0fc9f --- /dev/null +++ b/compute/scorer/src/normalize.rs @@ -0,0 +1,289 @@ +//! URL → canonical (route, category) pair. +//! +//! Mirrors `backend/scoring/normalize.py`. Same patterns, same category map, +//! same lowercase-then-collapse order. Anything that normalizes differently +//! between Python and Rust would corrupt matrix lookups at runtime — there's +//!
a cross-language fixture test in `tests/cross_lang_normalize.rs`. + +/// Coarse first-segment → category map. Mirrors the Python `_CATEGORY_MAP` +/// dict exactly; both must be edited in lockstep when adding new buckets. +const CATEGORY_MAP: &[(&str, &str)] = &[ + ("", "home"), + ("api", "api"), + ("graphql", "api"), + ("products", "product"), + ("product", "product"), + ("items", "product"), + ("p", "product"), + ("categories", "browse"), + ("category", "browse"), + ("search", "browse"), + ("browse", "browse"), + ("cart", "cart"), + ("basket", "cart"), + ("checkout", "checkout"), + ("pay", "checkout"), + ("order", "checkout"), + ("orders", "checkout"), + ("account", "account"), + ("user", "account"), + ("users", "account"), + ("profile", "account"), + ("settings", "account"), + ("auth", "auth"), + ("login", "auth"), + ("signin", "auth"), + ("signup", "auth"), + ("register", "auth"), + ("logout", "auth"), + ("admin", "admin"), + ("static", "asset"), + ("assets", "asset"), + ("blog", "content"), + ("news", "content"), + ("about", "content"), + ("help", "content"), + ("support", "content"), + ("privacy", "content"), + ("terms", "content"), + ("faq", "content"), +]; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Route { + pub path: String, + pub category: String, +} + +fn strip_query(url: &str) -> &str { + // Find a '?' or '#' to delimit. We don't bother with scheme/host parsing + // because Fastly Compute hands us a relative path already, and the + // Python side calls urlsplit which also discards everything after '?'. + let path = url.split('?').next().unwrap_or(url); + let path = path.split('#').next().unwrap_or(path); + + // Drop scheme://host if present (urlsplit-equivalent: only keep the path + // component). + if let Some(idx) = path.find("://") { + // Look for the FIRST '/' after the scheme separator. 
+ let rest = &path[idx + 3..]; + if let Some(slash) = rest.find('/') { + return &rest[slash..]; + } + return "/"; + } + path +} + +fn looks_like_id(segment: &str) -> bool { + if segment.is_empty() { + return false; + } + + // Numeric only. + if segment.chars().all(|c| c.is_ascii_digit()) { + return true; + } + // UUID-shaped (8-4-4-4-12 hex chars; version bits are not checked). + if segment.len() == 36 + && segment.chars().enumerate().all(|(i, c)| match i { + 8 | 13 | 18 | 23 => c == '-', + _ => c.is_ascii_hexdigit(), + }) + { + return true; + } + // 24+ hex chars (content hash / Mongo ObjectId). + if segment.len() >= 24 && segment.chars().all(|c| c.is_ascii_hexdigit()) { + return true; + } + // Prefixed id: 2-5 uppercase letters, separator (- or _), then alphanumerics (further - or _ allowed). + if let Some(idx) = segment.find(|c: char| c == '-' || c == '_') { + let prefix = &segment[..idx]; + if (2..=5).contains(&prefix.len()) && prefix.chars().all(|c| c.is_ascii_uppercase()) { + let suffix = &segment[idx + 1..]; + if !suffix.is_empty() + && suffix + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') + { + return true; + } + } + } + // Long opaque token (>= 20 chars of alphanumerics, - or _). + if segment.len() >= 20 + && segment + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') + { + return true; + } + false +} + +fn category_for(first_segment: &str) -> &'static str { + let lower = first_segment.to_ascii_lowercase(); + for (k, v) in CATEGORY_MAP { + if *k == lower { + return v; + } + } + "other" +} + +/// Normalize a URL to its (canonical route, category) pair.
+pub fn normalize(url: &str) -> Route { + let path = strip_query(url); + if path.is_empty() || path == "/" { + return Route { + path: "/".to_string(), + category: "home".to_string(), + }; + } + let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); + if segments.is_empty() { + return Route { + path: "/".to_string(), + category: "home".to_string(), + }; + } + + let normalized: Vec<String> = segments + .iter() + .map(|s| { + if looks_like_id(s) { + "*".to_string() + } else { + s.to_ascii_lowercase() + } + }) + .collect(); + let canonical = format!("/{}", normalized.join("/")); + let category = category_for(segments[0]).to_string(); + Route { + path: canonical, + category, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn doc_explicit_examples() { + // /items/10243 → /items/* + assert_eq!(normalize("/items/10243").path, "/items/*"); + // /api/v2/orders/<uuid> → /api/v2/orders/* + assert_eq!( + normalize("/api/v2/orders/00000abc-1234-5678-9abc-deadbeef0000").path, + "/api/v2/orders/*" + ); + } + + #[test] + fn trivial_paths() { + assert_eq!(normalize("/").path, "/"); + assert_eq!(normalize("").path, "/"); + assert_eq!(normalize("/home").path, "/home"); + } + + #[test] + fn query_string_stripped() { + assert_eq!(normalize("/search?q=red+shoes").path, "/search"); + assert_eq!(normalize("/items/42?ref=email").path, "/items/*"); + } + + #[test] + fn numeric_id_collapse() { + assert_eq!(normalize("/blog/12345").path, "/blog/*"); + assert_eq!(normalize("/orders/789/items/42").path, "/orders/*/items/*"); + } + + #[test] + fn uuid_collapse() { + assert_eq!( + normalize("/sessions/123e4567-e89b-12d3-a456-426614174000").path, + "/sessions/*" + ); + } + + #[test] + fn hex_hash_collapse() { + assert_eq!( + normalize("/jobs/64bc89ff1a2b3c4d5e6f7081").path, + "/jobs/*" + ); + } + + #[test] + fn prefixed_id_collapse() { + assert_eq!(normalize("/inventory/SKU-12345").path, "/inventory/*"); + assert_eq!(normalize("/orders/ORD-789-ABC").path, "/orders/*"); +
} + + #[test] + fn long_opaque_collapse() { + assert_eq!( + normalize("/oauth/callback/abcdef0123456789xyzwAA").path, + "/oauth/callback/*" + ); + } + + #[test] + fn absolute_url_strips_scheme_host() { + assert_eq!( + normalize("https://www.example.com/api/v1/users/777?token=abc").path, + "/api/v1/users/*" + ); + } + + #[test] + fn double_slashes_collapse() { + assert_eq!(normalize("/foo//bar").path, "/foo/bar"); + } + + #[test] + fn does_not_collapse_short_alphanumeric() { + // "v2" — too short for LONG_OPAQUE + assert_eq!(normalize("/api/v2").path, "/api/v2"); + assert_eq!(normalize("/faq").path, "/faq"); + assert_eq!(normalize("/cart").path, "/cart"); + } + + #[test] + fn does_not_collapse_hyphenated_slug() { + assert_eq!(normalize("/about-us").path, "/about-us"); + assert_eq!(normalize("/privacy-policy").path, "/privacy-policy"); + } + + #[test] + fn lowercased() { + assert_eq!(normalize("/Products/Foo").path, "/products/foo"); + } + + #[test] + fn category_mapping() { + assert_eq!(normalize("/").category, "home"); + assert_eq!(normalize("/products/42").category, "product"); + assert_eq!(normalize("/cart").category, "cart"); + assert_eq!(normalize("/checkout/step-1").category, "checkout"); + assert_eq!(normalize("/account/settings").category, "account"); + assert_eq!(normalize("/api/v2/orders/123").category, "api"); + assert_eq!(normalize("/graphql").category, "api"); + assert_eq!(normalize("/login").category, "auth"); + assert_eq!(normalize("/admin/dashboard").category, "admin"); + assert_eq!(normalize("/blog/post").category, "content"); + assert_eq!(normalize("/about-us").category, "other"); + } + + #[test] + fn known_limitation_word_like_user_id() { + // Documents the deliberate v1 limitation that /users/drew/profile + // doesn't auto-collapse without per-site route-template config. 
+ let r = normalize("/users/drew/profile"); + assert_eq!(r.path, "/users/drew/profile"); + assert_eq!(r.category, "account"); + } +} diff --git a/compute/scorer/src/scorer.rs b/compute/scorer/src/scorer.rs new file mode 100644 index 00000000..22598a5b --- /dev/null +++ b/compute/scorer/src/scorer.rs @@ -0,0 +1,544 @@ +//! Layer 1 (universal behavioral) + Layer 2 (route transition) + combined. +//! +//! Mirrors `backend/scoring/scorer.py`. Constants are pinned to the same +//! values; the math is the same algebraic identity for variance and the +//! same `-log10(p) * 100/6` curve for the L2 score. Every Python test in +//! `tests/scoring/test_scorer.py` has a paired Rust test here so behavioural +//! drift between the two impls is caught at build time. + +use crate::cookie::{quantize_score, SessionState}; +use crate::matrix::TransitionMatrix; +use crate::normalize::Route; + +// ── Layer 1 tuning (research doc §4.1) ────────────────────────────────────── + +pub const L1_TIMING_WARMUP_SEQ: u16 = 3; +pub const L1_FAST_DWELL_THRESHOLD_S: f64 = 0.20; +pub const L1_ROBOTIC_VARIANCE_THRESHOLD: f64 = 0.05; +// Security #037: was 0.5 — there was a "robotic safe zone" between +// L1_FAST_DWELL_THRESHOLD_S (0.20) and L1_ROBOTIC_DWELL_LOW_S (0.50) +// where a low-variance bot averaging 0.30s/page scored zero. The +// audit verified the gap was exploitable. Lower the threshold so the +// robotic detector covers the previously-uncovered band; the only +// behavior change is that bots in that gap now get the ROBOTIC score +// instead of zero. +pub const L1_ROBOTIC_DWELL_LOW_S: f64 = 0.20; +pub const L1_ROBOTIC_DWELL_HIGH_S: f64 = 3.0; +pub const L1_SCORE_FAST: u8 = 50; +pub const L1_SCORE_ROBOTIC: u8 = 40; +pub const L1_SCORE_COOKIE_MISSING: u8 = 75; +// Security #036: cookie tampering is a strictly stronger anomaly signal +// than missing — missing might be a fresh visitor or a privacy-mode +// browser, tampering is intentional. 
Cap tampered sessions at 100 +// rather than the 75 ceiling missing/expired share. +pub const L1_SCORE_COOKIE_TAMPERED: u8 = 100; + +// ── Layer 2 tuning (research doc §4.2) ────────────────────────────────────── + +pub const L2_LAPLACE_ALPHA: f64 = 0.5; +pub const L2_SKIPGRAM_BETA: f64 = 0.7; + +/// Map a transition probability ∈ [0, 1] to an L2 anomaly score [0, 100]. +/// Pinned to mirror Python's `_l2_score_from_trans_prob`. +pub fn l2_score_from_trans_prob(p: f64) -> u8 { + if p >= 1.0 { + return 0; + } + if p <= 1e-12 { + return 100; + } + let raw = -p.log10() * (100.0 / 6.0); + let clamped = raw.clamp(0.0, 100.0); + clamped.round() as u8 +} + +// ── Combined output ───────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone)] +pub struct ScoreResult { + pub score: u8, + pub l1_score: u8, + pub l2_score: u8, + pub reasons: Vec<String>, + pub cookie_compliance: String, + pub mean_dwell_s: f64, + pub variance_s2: f64, + pub trans_prob: f64, + pub matrix_version: String, +} + +impl ScoreResult { + pub fn headers(&self) -> Vec<(&'static str, String)> { + vec![ + ("X-Edge-Score", self.score.to_string()), + ("X-Edge-Score-L1", self.l1_score.to_string()), + ("X-Edge-Score-L2", self.l2_score.to_string()), + ("X-Edge-Cookie-Compliance", self.cookie_compliance.clone()), + ("X-Edge-Score-Reason", self.reasons.join(",")), + ("X-Edge-Matrix-Version", self.matrix_version.clone()), + ] + } +} + +// ── Layer 1 ───────────────────────────────────────────────────────────────── + +fn running_mean_variance(state: &SessionState) -> (f64, f64) { + if state.seq == 0 { + return (0.0, 0.0); + } + let n = state.seq as f64; + let mean = state.sum_dt as f64 / n; + let second_moment = state.sum_dt_sq as f64 / n; + let var = (second_moment - mean * mean).max(0.0); + (mean, var) +} + +// FOLLOW-UP for security #038 (Unwindowed mean allows amortized delays). +// +// The current implementation accumulates sum_dt + sum_dt_sq over the +// entire session.
An attacker who's fast at the start of a session +// (triggering impossibly-fast → robotic-consistency) can deliberately +// slow down later to drag the mean back into "normal" territory, +// effectively rolling the L1 score off. The audit confirmed this is +// exploitable in principle. +// +// A real fix would replace the cumulative sum with a sliding window of +// the last N (~20) dwells. That requires: +// 1. Cookie schema v3: add a fixed-size ring buffer of u16 dwells +// (40 bytes for 20 entries) to SessionState. +// 2. Backward-compat: v2 cookies treated as "missing" (rotate). +// 3. Both Rust + Python implementations + cross-language fixture tests. +// 4. update_state pushes the new dwell into the buffer (eviction = FIFO). +// 5. score_layer1 computes mean/variance over the buffer only. +// +// Partial mitigation already in place: +// - 30min idle expire (cookie::SESSION_IDLE_EXPIRE_S) rotates the +// session and clears the timing accumulator, bounding the +// amortization window. +// - 24h hard cap (cookie::SESSION_HARD_CAP_S) caps total session +// lifetime. +// - The threshold-matrix admin UI applies the highest score the +// session has ever held when blocking decisions are made, so +// dragging the mean down can't UN-block a session that was +// previously flagged. +// +// Tracking: see security_remediation_final_v6.md §5/#038 for the +// audit re-scoping requirement. 
+pub fn score_layer1(state: &SessionState) -> (u8, Vec<String>, f64, f64) { + let (mean, var) = running_mean_variance(state); + let mut score: u8 = 0; + let mut reasons: Vec<String> = vec![]; + + if state.seq > L1_TIMING_WARMUP_SEQ { + if mean < L1_FAST_DWELL_THRESHOLD_S { + score = score.saturating_add(L1_SCORE_FAST); + reasons.push("impossibly-fast".into()); + } + if var < L1_ROBOTIC_VARIANCE_THRESHOLD + && (L1_ROBOTIC_DWELL_LOW_S..=L1_ROBOTIC_DWELL_HIGH_S).contains(&mean) + { + score = score.saturating_add(L1_SCORE_ROBOTIC); + reasons.push("robotic-consistency".into()); + } + } + + (score.min(100), reasons, mean, var) +} + +// ── Layer 2 ───────────────────────────────────────────────────────────────── + +fn transition_prob( + matrix: &TransitionMatrix, + prev: &str, + current: &str, + vocab_size: u32, +) -> f64 { + let v = vocab_size as f64; + let prev_row = matrix.counts.get(prev); + let numerator = prev_row + .and_then(|row| row.get(current).copied()) + .unwrap_or(0) as f64 + + L2_LAPLACE_ALPHA; + let row_total = matrix.row_totals.get(prev).copied().unwrap_or(0) as f64; + let denominator = row_total + L2_LAPLACE_ALPHA * v; + if denominator <= 0.0 { + return 1.0 / v.max(1.0); + } + numerator / denominator +} + +pub fn score_layer2( + matrix: Option<&TransitionMatrix>, + prev_route: Option<&Route>, + prev_anchor_route: Option<&Route>, + current_route: &Route, +) -> (u8, Vec<String>, f64) { + let matrix = match matrix { + Some(m) => m, + None => return (0, vec![], 1.0), + }; + let prev = match prev_route { + Some(r) => r, + None => return (0, vec![], 1.0), + }; + let vocab_size = matrix.vocab_size; + if vocab_size == 0 { + return (0, vec![], 1.0); + } + + let direct_p = transition_prob(matrix, &prev.path, &current_route.path, vocab_size); + let trans_prob = match prev_anchor_route { + Some(anchor) if anchor.path != prev.path => { + let anchor_p = transition_prob(matrix, &anchor.path, &current_route.path, vocab_size) + * L2_SKIPGRAM_BETA; + direct_p.max(anchor_p) + } + _ => direct_p, + }; + + let
score = l2_score_from_trans_prob(trans_prob); + let reasons = if score >= 50 { + vec!["low-transition-prob".into()] + } else { + vec![] + }; + (score, reasons, trans_prob) +} + +// ── Combined ──────────────────────────────────────────────────────────────── + +pub fn blend_weight(matrix_age_days: f64) -> f64 { + if matrix_age_days < 7.0 { + return 0.0; + } + if matrix_age_days >= 10.0 { + return 1.0; + } + (matrix_age_days - 7.0) / 3.0 +} + +pub struct ScoreInputs<'a> { + pub state: Option<&'a SessionState>, + pub cookie_compliance: &'a str, + pub current_route: &'a Route, + pub prev_route: Option<&'a Route>, + pub prev_anchor_route: Option<&'a Route>, + pub matrix: Option<&'a TransitionMatrix>, + pub matrix_age_days: f64, +} + +pub fn score_combined(inp: ScoreInputs<'_>) -> ScoreResult { + let mut result = ScoreResult { + cookie_compliance: inp.cookie_compliance.to_string(), + matrix_version: inp + .matrix + .map(|m| m.version.clone()) + .unwrap_or_default(), + ..Default::default() + }; + + // Security #036: tampered cookies score 100 (the audit's required + // ceiling); missing/expired keep the historical 75. This matters + // because the threshold-matrix admin UI uses score==100 to enable + // hard-block enforcement — capping tampered at 75 meant an attacker + // editing the cookie's payload to evade an L2 anomaly could keep + // their session below the enforcement bar. 
+ let mut l1_from_cookie: u8 = 0; + match inp.cookie_compliance { + "tampered" => { + l1_from_cookie = L1_SCORE_COOKIE_TAMPERED; + result.reasons.push("cookie-tampered".into()); + } + "missing" | "expired" => { + l1_from_cookie = L1_SCORE_COOKIE_MISSING; + result.reasons.push(format!("cookie-{}", inp.cookie_compliance)); + } + _ => {} + } + + if let Some(state) = inp.state { + let (l1_timing, l1_reasons, mean, var) = score_layer1(state); + result.mean_dwell_s = mean; + result.variance_s2 = var; + result.reasons.extend(l1_reasons); + result.l1_score = l1_from_cookie.saturating_add(l1_timing).min(100); + } else { + result.l1_score = l1_from_cookie; + } + + let (l2_score, l2_reasons, trans_prob) = score_layer2( + inp.matrix, + inp.prev_route, + inp.prev_anchor_route, + inp.current_route, + ); + result.l2_score = l2_score; + result.trans_prob = trans_prob; + result.reasons.extend(l2_reasons); + + let w_l2 = blend_weight(inp.matrix_age_days); + let raw = result.l1_score as f64 + result.l2_score as f64 * w_l2; + result.score = quantize_score(raw); + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::TransitionMatrix; + use std::collections::HashMap; + + fn state(seq: u16, sum_dt: u32, sum_dt_sq: u64) -> SessionState { + SessionState { + v: crate::cookie::SCHEMA_VERSION, + sid: [0, 1, 2, 3, 4, 5], + seq, + sum_dt, + sum_dt_sq, + last_ts: 1_700_000_000, + score: 0, + issued_at: 1_699_990_000, + prev_route_path: String::new(), + } + } + + fn r(path: &str, category: &str) -> Route { + Route { + path: path.to_string(), + category: category.to_string(), + } + } + + fn matrix(counts: &[(&str, &[(&str, u64)])], vocab_size: u32) -> TransitionMatrix { + let mut m = TransitionMatrix { + version: "test-v1".into(), + vocab_size, + ..Default::default() + }; + for (src, dests) in counts { + let mut row = HashMap::new(); + let mut total: u64 = 0; + for (dst, n) in *dests { + row.insert(dst.to_string(), *n); + total += n; + } + m.counts.insert(src.to_string(), 
row); + m.row_totals.insert(src.to_string(), total); + } + m + } + + // ── running_mean_variance ──────────────────────────────────────────────── + + #[test] + fn running_seq_zero_returns_zeros() { + let (m, v) = running_mean_variance(&state(0, 0, 0)); + assert_eq!(m, 0.0); + assert_eq!(v, 0.0); + } + + #[test] + fn running_uniform_dwells() { + // 5 identical 2s dwells → mean=2, var=0 + let (m, v) = running_mean_variance(&state(5, 10, 20)); + assert_eq!(m, 2.0); + assert_eq!(v, 0.0); + } + + #[test] + fn running_mixed() { + // Dwells [1,2,3,4] → mean=2.5, var=1.25 + let (m, v) = running_mean_variance(&state(4, 10, 30)); + assert!((m - 2.5).abs() < 1e-9); + assert!((v - 1.25).abs() < 1e-9); + } + + // ── Layer 1 warmup gate ────────────────────────────────────────────────── + + #[test] + fn l1_below_warmup_no_rules_fire() { + let (score, reasons, _, _) = score_layer1(&state(L1_TIMING_WARMUP_SEQ, 0, 0)); + assert_eq!(score, 0); + assert!(reasons.is_empty()); + } + + #[test] + fn l1_impossibly_fast_fires() { + let (score, reasons, mean, _) = score_layer1(&state(5, 0, 0)); + assert!(reasons.contains(&"impossibly-fast".to_string())); + assert!(score >= L1_SCORE_FAST); + assert_eq!(mean, 0.0); + } + + #[test] + fn l1_robotic_consistency_fires_uniform_1s_loops() { + // 10 dwells of 1s → mean=1, var=0 + let (score, reasons, _, var) = score_layer1(&state(10, 10, 10)); + assert!(reasons.contains(&"robotic-consistency".to_string())); + assert!(score >= L1_SCORE_ROBOTIC); + assert!(var < L1_ROBOTIC_VARIANCE_THRESHOLD); + } + + #[test] + fn l1_robotic_does_not_fire_outside_band() { + // 10s mean — outside the bot-suspicious band. + let (_, reasons, _, _) = score_layer1(&state(10, 100, 1000)); + assert!(!reasons.contains(&"robotic-consistency".to_string())); + } + + // ── Layer 2 ────────────────────────────────────────────────────────────── + + #[test] + fn l2_score_curve_pinned() { + // Same band assertions as the Python parametrized test. 
+ assert_eq!(l2_score_from_trans_prob(1.0), 0); + assert!(matches!(l2_score_from_trans_prob(0.5), 0..=10)); + assert!(matches!(l2_score_from_trans_prob(0.1), 10..=25)); + assert!(matches!(l2_score_from_trans_prob(0.01), 30..=40)); + assert!(matches!(l2_score_from_trans_prob(0.001), 45..=55)); + assert!(matches!(l2_score_from_trans_prob(1e-6), 95..=100)); + assert_eq!(l2_score_from_trans_prob(0.0), 100); + } + + #[test] + fn l2_no_matrix_returns_zero() { + let (score, reasons, p) = score_layer2(None, Some(&r("/a", "home")), None, &r("/b", "home")); + assert_eq!(score, 0); + assert!(reasons.is_empty()); + assert_eq!(p, 1.0); + } + + #[test] + fn l2_high_prob_no_score() { + let m = matrix(&[("/home", &[("/products", 99), ("/other", 1)])], 10); + let (score, _, p) = score_layer2(Some(&m), Some(&r("/home", "home")), None, &r("/products", "product")); + assert_eq!(score, 0); + assert!(p > 0.9); + } + + #[test] + fn l2_rare_pair_fires() { + let mut m = matrix(&[("/home", &[("/checkout", 1)])], 100); + m.row_totals.insert("/home".into(), 10_000); + let (score, reasons, p) = score_layer2( + Some(&m), + Some(&r("/home", "home")), + None, + &r("/checkout", "checkout"), + ); + assert!(score >= 50); + assert!(reasons.contains(&"low-transition-prob".to_string())); + assert!(p < 0.001); + } + + #[test] + fn l2_skipgram_rescues_via_anchor() { + let mut m = matrix( + &[ + ("/about-us", &[("/checkout", 1)]), + ("/product", &[("/checkout", 100)]), + ], + 10, + ); + m.row_totals.insert("/about-us".into(), 1000); + m.row_totals.insert("/product".into(), 100); + let (score, _, p) = score_layer2( + Some(&m), + Some(&r("/about-us", "content")), + Some(&r("/product", "product")), + &r("/checkout", "checkout"), + ); + assert!(p > 0.5); + assert!(score < 10); + } + + // ── Blend weight ───────────────────────────────────────────────────────── + + #[test] + fn blend_weight_pinned() { + assert_eq!(blend_weight(0.0), 0.0); + assert_eq!(blend_weight(6.99), 0.0); + assert_eq!(blend_weight(7.0), 
0.0); + assert!((blend_weight(8.5) - 0.5).abs() < 1e-9); + assert_eq!(blend_weight(10.0), 1.0); + assert_eq!(blend_weight(30.0), 1.0); + } + + // ── Combined output ────────────────────────────────────────────────────── + + #[test] + fn combined_clean_session_zero() { + let s = state(10, 50, 300); + let m = matrix(&[("/home", &[("/products", 100)])], 10); + let r1 = r("/home", "home"); + let r2 = r("/products", "product"); + let result = score_combined(ScoreInputs { + state: Some(&s), + cookie_compliance: "ok", + current_route: &r2, + prev_route: Some(&r1), + prev_anchor_route: None, + matrix: Some(&m), + matrix_age_days: 30.0, + }); + assert_eq!(result.score, 0); + assert_eq!(result.l1_score, 0); + assert_eq!(result.l2_score, 0); + assert!(result.reasons.is_empty()); + } + + #[test] + fn combined_missing_cookie_high_score() { + let r1 = r("/home", "home"); + let r2 = r("/checkout", "checkout"); + let result = score_combined(ScoreInputs { + state: None, + cookie_compliance: "missing", + current_route: &r2, + prev_route: Some(&r1), + prev_anchor_route: None, + matrix: None, + matrix_age_days: 0.0, + }); + assert!(result.l1_score >= L1_SCORE_COOKIE_MISSING); + assert!(result.reasons.iter().any(|r| r == "cookie-missing")); + assert!(result.score >= 75); + } + + #[test] + fn combined_caps_at_100() { + let s = state(10, 0, 0); // fast + let mut m = matrix(&[("/a", &[("/b", 1)])], 1000); + m.row_totals.insert("/a".into(), 1_000_000); + let r1 = r("/a", "other"); + let r2 = r("/b", "other"); + let result = score_combined(ScoreInputs { + state: Some(&s), + cookie_compliance: "missing", // +75 + current_route: &r2, + prev_route: Some(&r1), + prev_anchor_route: None, + matrix: Some(&m), + matrix_age_days: 30.0, + }); + assert_eq!(result.score, 100); + } + + #[test] + fn combined_score_quantized_to_nearest_5() { + let s = state(10, 0, 0); + let r1 = r("/a", "other"); + let r2 = r("/b", "other"); + let result = score_combined(ScoreInputs { + state: Some(&s), + 
cookie_compliance: "ok", + current_route: &r2, + prev_route: Some(&r1), + prev_anchor_route: None, + matrix: None, + matrix_age_days: 0.0, + }); + assert_eq!(result.score % 5, 0); + } +} diff --git a/configs/ssh_known_hosts b/configs/ssh_known_hosts new file mode 100644 index 00000000..99d790d3 --- /dev/null +++ b/configs/ssh_known_hosts @@ -0,0 +1,30 @@ +# localhost.run SSH host keys for the tunnel-share reverse-tunnel. +# +# Security #011 / v6 doc Decision: the tunnel manager refuses to start +# the reverse SSH tunnel unless the localhost.run server presents a key +# that appears here. The original implementation passed +# StrictHostKeyChecking=no + UserKnownHostsFile=/dev/null, which lets any +# MitM on the outbound path hijack the tunnel — analyst sessions / PII +# travel over an end-to-end-encrypted channel whose endpoint was just +# replaced. +# +# Fingerprint verification (fetched 2026-06-03 from the GCP origin VM): +# ssh-rsa SHA256:FV8IMJ4IYjYUTnd6on7PqbRjaZf4c1EhhEBgeUdE94I +# +# To rotate (see docs/ssh-known-hosts-runbook.md for the full procedure): +# 1. From a trusted network, run: +# ssh-keyscan -t rsa,ed25519,ecdsa localhost.run +# or use a verbose ssh: +# ssh -v -o BatchMode=yes -o UserKnownHostsFile=/tmp/_kh \ +# -o StrictHostKeyChecking=accept-new localhost.run exit +# grep '^localhost.run' /tmp/_kh +# 2. Compare the SHA256 fingerprint against localhost.run's published +# value AND against the existing fingerprint above. If both differ +# from a recent ssh-keyscan from a SECOND independent network, abort +# the rotation and investigate — you may be the target of a MitM. +# 3. Replace the line below, commit, and redeploy. Existing tunnels +# stay up until they reconnect; new connections will use the +# updated key. 
+# +# Format: +localhost.run ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3lJnhW1oCXuAYV9IBdcJA+Vx7AHL5S/ZQvV2fhceOAPgO2kNQZla6xvUwoE4iw8lYu3zoE1KtieCU9yInWOVI6W/wFaT/ETH1tn55T2FVsK/zaxPiHZVJGLPPdEEid0vS2p1JDfc9onZ0pNSHLl1QusIOeMUyZ2bUMMLLgw46KOT9S3s/LmxgoJ3PocVUn5rVXz/Dng7Y8jYNe4IFrZOAUsi7hNBa+OYja6ceefpDvNDEJ1BdhbYfGolBdNA7f+FNl0kfaWru4Cblr843wBe2ckO/sNqgeAMXO/qH+SSgQxUXF2AgAw+TGp3yCIyYoOPvOgvcPsQziJLmDbUuQpnH diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 2c2fd313..ba73fec8 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -15,32 +15,102 @@ # Why all-loopback for app services: # - Eliminates the "listening on 0.0.0.0" surface even if a firewall rule # were ever misconfigured. -# - Backend's RemoteAccessMiddleware uses socket peer to distinguish admin -# (peer=127.0.0.1) from analyst (anything else). Caddy in host mode also -# hits backend on 127.0.0.1 — so backend can't distinguish admin from -# analyst-via-Caddy by peer alone. The DNS-rebinding gate (Host header -# check against registered public_endpoint) is what separates them now. +# - Backend reads request.client.host as the trust signal. uvicorn runs +# with --proxy-headers --forwarded-allow-ips=127.0.0.1 so it populates +# request.client.host from X-Forwarded-For ONLY when the TCP peer is +# loopback (i.e. only Caddy on this host can set it). Caddy rewrites +# XFF to the authoritative Fastly-Client-IP value before forwarding, +# so request.client.host is the real client IP for Caddy-proxied +# traffic and 127.0.0.1 for direct SSH-tunnel admin access. # # Public exposure: # Only port 80 is open on the VM's public interface, and the GCP firewall -# restricts it to Fastly's published origin IP ranges. +# restricts it to Fastly's published origin IP ranges. Caddy additionally +# refuses to trust the Fastly-Client-IP header unless the TCP peer is in +# Fastly's published edge range (defense-in-depth against any firewall +# misconfiguration). 
+ +x-prod-logging: &prod-logging + # Extra E2 (security remediation v6): the default Docker json-file driver + # has no size cap and no rotation. A container that's been up for months + # can either run the host disk to zero or, conversely, lose months of + # access-log history every time the container is restarted (which kills + # any retroactive incident-response investigation). Cap each file at + # 50 MB and keep 10 rotated files — gives us ~500 MB / container with a + # multi-week retention window even on chatty endpoints. Adjust if disk + # pressure ever becomes a concern. + driver: json-file + options: + max-size: "50m" + max-file: "10" + compress: "true" services: backend: network_mode: host ports: !reset [] networks: !reset null + logging: *prod-logging # Override the default '--host 0.0.0.0' → bind to loopback only. - command: ["uvicorn", "backend.main:app", "--host", "127.0.0.1", "--port", "8000"] + # + # --proxy-headers + --forwarded-allow-ips=127.0.0.1 (security): + # tells uvicorn to populate request.client.host from X-Forwarded-For ONLY + # when the TCP peer is loopback (i.e., only Caddy on this same host can + # set it). Caddy rewrites XFF to the authoritative Fastly-Client-IP value + # before forwarding, so request.client.host is the real client IP for + # remote traffic and 127.0.0.1 for direct SSH-tunnel admin access. Without + # both flags, leftmost-XFF spoofing becomes exploitable AND + # the admin Host-spoof bypass returns. Do not remove without first + # rewriting backend/utils/remote_access.py:is_request_remote — the startup + # assertion in backend/main.py will FATAL-exit otherwise. + command: + [ + "uvicorn", + "backend.main:app", + "--host", + "127.0.0.1", + "--port", + "8000", + "--proxy-headers", + "--forwarded-allow-ips=127.0.0.1", + ] + # Container-level memory cap. Pre-2026-06-04 this was unset (Memory: 0 + # = unlimited) — the backend was free to grow until the kernel OOM- + # killer fired on the WHOLE host. 
That took out sshd + caddy on each + # OOM event (witnessed 2026-06-01 + multiple times on 2026-06-03/04). + # 12g leaves ~3GB host headroom; when the backend genuinely runs + # away docker SIGKILLs only the backend (clean restart-unless-stopped + # recovery) instead of wedging the whole VM. + mem_limit: 12g + memswap_limit: 12g environment: - # Bump DuckDB memory from 4GB (base) to 8GB. The VM has 16GB, frontend + - # caddy + OS use ~3GB, leaving plenty of headroom. Helps sort spills and - # aggregation working sets on heavy dashboard queries. - - DUCKDB_MEMORY_LIMIT=8GB + # DuckDB sees this via os.getenv at connection-open. backend/core/ + # duckdb.py:760-768 issues ``SET max_memory = `` per-session; + # the auto-calc fallback at ~line 803 is now gated on this env + # being UNSET (pre-fix it silently overrode this value with + # ``int(physical_ram * 0.6)`` ≈ 9-10GB on the 16GB VM, which left + # too little headroom and caused recurring host OOM-kills). + # 6GB chosen so: DuckDB 6 + Python/pyiceberg/aiohttp working set + # ~3 + scoring/cache working set ~2 = ~11GB < 12GB container cap + # < 16GB host. Bump only after confirming RSS plateaus below 9GB. + - DUCKDB_MEMORY_LIMIT=6GB # Hard-fail at startup if /app/data isn't a real mount point. Guards # against the broken-fstab failure mode (data accidentally written to # an ephemeral boot-disk stub). See _enforce_data_dir_mounted in main.py. - STRICT_DATA_DIR_CHECK=1 + # Companion to the uvicorn command flags above: the startup assertion in + # backend/main.py reads this and refuses to boot if it is unset or empty. + # Prevents silent regression to leftmost-XFF spoofing if a + # future config refactor drops the --proxy-headers flag. + - TRUSTED_PROXY_IPS=127.0.0.1 + # Pass through DEBUG_RESPONSES from the host environment (typically + # set via the .env file next to this compose file, which Compose + # auto-loads for variable substitution). 
When unset it expands to + # the literal empty string and the gate in backend/models/common.py + # (`_debug_responses_enabled`) returns False — same as if the var + # were never set. Plain `- DEBUG_RESPONSES` (no value) would also + # work but the explicit substitution makes the source obvious. + - DEBUG_RESPONSES=${DEBUG_RESPONSES:-} volumes: !override - /mnt/app-data/configs:/app/configs - /mnt/app-data/data:/app/data @@ -52,6 +122,7 @@ services: network_mode: host ports: !reset [] networks: !reset null + logging: *prod-logging environment: # Bind Next.js standalone server to loopback only (overrides Dockerfile's # HOSTNAME=0.0.0.0). @@ -67,6 +138,7 @@ services: network_mode: host ports: !reset [] networks: !reset [] + logging: *prod-logging volumes: - ./Caddyfile:/etc/caddy/Caddyfile:ro - caddy_data:/data diff --git a/docs/demo_production_guide.md b/docs/demo_production_guide.md new file mode 100644 index 00000000..7e3cb1ff --- /dev/null +++ b/docs/demo_production_guide.md @@ -0,0 +1,389 @@ +# Fastly Log Analytics — Launch Video Production Guide + +The end-to-end production plan for the first public release demo of **Fastly Log Analytics**. Part 1 aligns the team on goals, positioning, and recording strategy. Part 2 is the camera-ready scene-by-scene script (paced for Google Vids teleprompter). Part 3 is the operational checklist that gets us from "ready to record" to "ready to publish." + +| Field | Value | +| :--- | :--- | +| **Runtime target** | 4:45 – 5:00 | +| **Aspect / resolution** | 16:9, 1080p (1920×1080) | +| **Delivery format** | MP4 (H.264, AAC) + captioned SRT | +| **Primary distribution** | GitHub repo README, Fastly developer hub, conference loop reels | +| **Status** | Draft script ready for team review | + +--- + +## Part 1 — Production Strategy & Setup (Team Alignment) + +### 1. 
Goal & Success Criteria + +**Goal.** In under five minutes, convince a Fastly customer-facing engineer (SA / SE / TAM) that they can stand up request-level log analytics — including security signals, bot detection, and performance analytics — using **only Fastly products** and get a teammate up to speed in one viewing. + +**Audience.** +- **Primary:** Fastly SAs, SEs, TAMs preparing for customer conversations about logging cost or observability gaps. +- **Secondary:** Cost-conscious Fastly customers and prospects already streaming real-time logs but paying a third-party for ingestion and storage. + +**Core message.** Real-time log streaming + Fastly Object Storage + this open-source tool = production-grade, request-level observability with no third-party logging vendor in the picture. + +**A viewer should be able to answer, by the end:** +1. What problem does this solve, and why now? *(Request-level visibility into every visitor — humans, bots, crawlers, scrapers, partners, attackers, fast and slow alike. Who they are, what they're doing, what impact they're having on the origin, and where it's costing you — so you can make informed decisions about what to optimize, shield, rate-limit, or block at the edge.)* +2. How does it work end-to-end? *(Edge logs land in a Fastly Object Storage bucket as raw `.gz`; the app ingests and atomically commits them into an Apache Iceberg table in that same bucket; a local DuckDB + Parquet cache serves the dashboard so analytics queries never re-hit the cloud.)* +3. How do I get it running? *(Wizard. Five fields. One click. Logs flow.)* +4. Where's the catch? *(There isn't one — it's Apache-2.0, your only Fastly costs are Fastly Object Storage class operations and storage, and nothing leaves your account.)* + +**Tone.** Developer-direct. Confident, not salesy. No music swells. No stock B-roll. Cursor movements deliberate; the product does the heavy lifting. + +### 2. 
Brand & Visual Identity + +| Element | Spec | +| :--- | :--- | +| **Color palette** | App's native dark theme — no recoloring. Use a slightly softer off-black/slate canvas (`#121214` or `#18181B`) behind the bright Fastly red (`#FF282D`) to prevent visual contrast clipping and vibration on high-contrast displays. | +| **Typography (titles)** | Inter or system-ui at 600 weight. Avoid Google Vids' default decorative fonts. | +| **Cursor** | macOS default at 1.5× size (System Settings → Accessibility → Display → Pointer size). | +| **Window chrome** | Hide all bookmarks, extensions, profile avatars, and personal tabs. Use a fresh browser profile named "Demo". | +| **Lower-thirds** | Scene-opener card with scene title + section icon for 2 seconds, then dissolve. | +| **End card** | GitHub URL, docs URL, Apache-2.0 badge, 4-second hold. | + +### 3. Recording Stack — Google Vids + +Google Vids stays the recording, narration, and assembly surface — it's collaborative, browser-based, and keeps assets in one place. + +- **Capture.** Native recording studio. Single dedicated 1080p Chrome window. No second monitor visible in the capture. +- **Teleprompter.** Paste each scene's voiceover (Part 2) into the script panel of its slide. Toggle the teleprompter overlay in the studio so the presenter can read while the cursor drives. +- **Voiceover (dual-track for v1).** Record the full script **twice**: once as a manual VO from the assigned SE/SA, and once using Google Vids' AI voice ("Narrator"). Cut both versions on the same timeline; pick the final mix after stakeholder review. Keep both source tracks archived in case a future update wants to switch. +- **Editing.** Drop clips onto the timeline, run **Automatic Transcript Trim** to strip filler words and pauses, then apply 0.5-second cross-fades between scenes. Reserve hard cuts for *within* a scene. + +### 4. The "Time-Jump" Recording Strategy + +A freshly provisioned service takes minutes to receive its first edge log. 
Rather than fake it, lean into the cut — record two environments plus one pre-built opener, and bridge them with the narration. + +| Source | Purpose | State | +| :--- | :--- | :--- | +| **Opener (pre-built)** | Animated Scene 1 explaining Fastly's edge position → VCL → log streaming → this tool. Not screen-captured. | Built in Vids / Keynote / After Effects per the **Scene 1 — Opener Slide Build Spec** in Part 2. | +| **Environment A: Fresh** | Captures Scene 2 (the provisioning wizard). | Unconfigured app at `http://localhost:3000/`. No services exist. | +| **Environment B: Populated** | Captures Scenes 3–10 (the analytics surface). | Same app, separate instance. **Seven days of realistic, PII-scrubbed production logs** ingested ahead of time. | + +**Two cuts.** + +1. **Opener → Env A** (end of Scene 1, ~28 s mark): 1-second cross-fade from the final state of the animated opener into the landing page. The cursor in Env A is pre-positioned near "Provision New Service" so Scene 2 begins clicking with no settle time. +2. **Env A → Env B** (end of Scene 2 / start of Scene 3): on the wizard's green success screen, hold one second of stillness, then dissolve (1-second cross-fade) to Env B's dashboard with the 7-day range preset. The voiceover in Scene 3 names the jump explicitly — no attempt to hide it. + +### 5. Narrative Arc + +```mermaid +graph TD + A["1 · Hook (animated)
<br/>Edge → VCL → log stream → your bucket"] --> B["2 · Provisioning<br/>Token → service → bucket → fields → deploy"] + B --> C["3 · Time-Jump<br/>One-week dissolve"] + C --> D["4 · Dashboard<br/>Click-to-filter, saved views"] + D --> E["5 · Deep Dives<br/>Insights · Security · Performance"] + E --> F["6 · Pipeline<br/>Iceberg, compaction, cron history"] + F --> G["7 · Custom Fields<br/>VCL expressions to columns"] + G --> H["8 · Cost Visibility<br/>Usage page + log-line accounting"] + H --> I["9 · Collaboration<br/>Invite Analyst vs Share Dashboard"] + I --> J["10 · Close<br/>
GitHub · docs · CTA"] +``` + +### 6. Positioning Anchors (Every Scene Should Reinforce One) + +| Anchor | Where it lands | +| :--- | :--- | +| **Start with the stream.** The pipeline begins with Fastly's free real-time log streaming pushing into a Fastly Object Storage bucket. No third party in the path. | Scenes 1, 2, 6 | +| **Who, what, where, how much.** The pitch is visibility — answering "who's hitting me, what are they doing, what's it costing, what should I do at the edge?" Covers humans, bots, crawlers, scrapers, partners, attackers; fast spikes and slow-and-low alike. Avoid framing as generic "troubleshooting" or as a narrow bot-defense tool. | Scenes 1, 4, 5 | +| **Edge-policy enablement.** Categorized request-level logs are the input to a real decision: optimize, rate-limit, shield, or block — at the edge. | Scenes 4, 5, 7 | +| **Sub-second local speeds.** Dashboards query a local DuckDB + Parquet cache. Repeated refreshes cost **zero** Fastly Object Storage Class-B operations. | Scenes 4, 6, 8 | +| **Apache-2.0, your hardware, your data.** No vendor lock-in, no SaaS subscription, no data leaving your account. | Scenes 1, 9, 10 | + +### 7. Team Decisions (Locked for v1) + +- [x] **Voiceover:** Record **both** tracks — human (SE/SA) and Google Vids AI ("Narrator") — and pick the final mix in post. Lets us A/B which lands better with the target audience without re-shooting. +- [x] **Pipeline depth:** Scene 6 leads with the **Cron Runs** view — the operational story is part of the pitch. +- [x] **VCL custom fields:** Demo with **all field groups toggled on** (matches the seeded Env B dataset). Voiceover explicitly says "you choose which groups you need" so viewers don't think they're forced into the full set. +- [x] **NGWAF:** Showcase the **linking step during provisioning** in Scene 2. NGWAF enrichment appears again in Scene 5 as payoff. +- [x] **CTA destination:** GitHub repo only for v1. Revisit once a developer-hub landing page exists. 
+ +--- + +## Part 2 — Scene-by-Scene Script + +**Pacing notes for the presenter / AI voice:** +- Voiceover word counts target ~150 wpm. Where a cell looks long, trust the budget — it's been timed. +- Cursor moves should *complete* a half-beat before the corresponding sentence ends, never after. +- All durations are upper bounds. Coming in under is fine; running over is not. + +**Runtime budget:** 30 + 60 + 8 + 38 + 35 + 32 + 28 + 33 + 23 + 15 = **5:02** + +--- + +### Scene 1 — Hook (Animated Opener) + +- **Duration:** 30 s +- **Format:** Pre-built animated slide (no live app footage). Cross-fades into Scene 2's landing page at the very end. + +| On-screen action | Voiceover | +| :--- | :--- | +| Animated opener slide builds in four beats (see **Scene 1 — Opener Slide Build Spec** below). Final 2 s cross-fades to Environment A landing page with the cursor pre-positioned near **"Provision New Service."** | *"Fastly processes every edge request, exposing rich diagnostic data through VCL variables. Instead of routing this sensitive data to expensive third-party platforms that charge by the gigabyte, you can stream it securely to your own Fastly Object Storage bucket. Fastly Log Analytics runs directly on your hardware as a self-hosted dashboard, giving you instant SQL-powered insights into traffic anomalies, security events, and real-time costs. No third-party data egress, no SaaS bills, complete compliance control, and raw sub-second querying power."* | + +#### Scene 1 — Opener Slide Build Spec + +**Aspect:** 16:9, dark background (`#0E0E10` — matches the app's native theme). All text in white (`#F5F5F7`) except accents. +**Accent color:** Fastly red `#FF282D` for the Fastly Edge node, the FOS bucket, and the dashboard node. +**Dim color:** mid-grey `#5A5A5F` for the third-party path (deliberately deprioritized). + +The slide builds in four timed beats, synced to the voiceover. 
Each beat's elements enter with **fade-in + 12px slide-up**, staggered 150 ms apart within a beat. Once an element is on screen it stays until the cross-fade out. + +**Reference flow (use as the visual blueprint — screenshot the rendered Mermaid diagram as a starting frame, then refine in Vids / Keynote):** + +```mermaid +flowchart LR + V["👥 Visitors"] -->|requests| F["⚡ Fastly Edge
<br/>VCL processes every request"] + F -->|responses| V + F -->|backend fetch| O["🖥️ Your Origin"] + + F -. "VCL variables<br/>req.url · client.geo.country<br/>tls.client.ja4 · fastly.info.state · …" .-> L["📡 Real-time<br/>log stream"] + + L -. "typical path" .-> T["💸 Third-party<br/>analytics service"] + L ==> |"this path"| B["🪣 Fastly Object Storage<br/>your account"] + B ==> D["📊 Fastly Log Analytics<br/>
open source · your hardware"] + + classDef edge fill:#FF282D,stroke:#FF282D,color:#fff,font-weight:bold + classDef dim fill:#2A2A2E,stroke:#5A5A5F,color:#8A8A8F + classDef bright fill:#FF282D,stroke:#FF282D,color:#fff,font-weight:bold + class F edge + class T dim + class B,D bright +``` + +**Beat-by-beat build (timing aligned to voiceover):** + +| Beat | Time | VO line landing on this beat | Elements that enter | +| :--- | :--- | :--- | :--- | +| **1** | 0 – 7 s | *"Fastly processes every edge request, exposing rich diagnostic data through VCL variables."* | Rapid entry of **Visitors** (left, user-cluster icon), **Fastly Edge** (center, Fastly logo, red accent, subtle pulse), and **Your Origin** (right, server-stack icon). Connection paths draw immediately at high speed. A greyed-out SaaS cloud node overlays a money-loss icon to signify expensive third-party bills. | +| **2** | 7 – 14 s | *"Instead of routing this sensitive data to expensive third-party platforms that charge by the gigabyte, you can stream it securely to your own Fastly Object Storage bucket."* | Title card wipes in. Below the Fastly Edge node, a vertical stack of VCL chips cascades down in Fastly Red: `req.url`, `client.geo.country`, `tls.client.ja4`, `…`. The expensive third-party path cancels out, and a thick, glowing pipeline draws securely into the **Fastly Object Storage** bucket icon labeled "your account". | +| **3** | 14 – 21 s | *"Fastly Log Analytics runs directly on your hardware as a self-hosted dashboard, giving you instant SQL-powered insights into traffic anomalies, security events, and real-time costs."* | Data streams rapidly from the storage bucket into a dynamic dashboard frame labeled **"Fastly Log Analytics — open source · your hardware"**. Four mini icons representing **SQL Engine**, **Shield/WAF**, **Anomaly Spike**, and **Cost Meter** expand dynamically above the dashboard wireframe. 
| +| **4** | 21 – 28 s | *"No third-party data egress, no SaaS bills, complete compliance control, and raw sub-second querying power."* | Four horizontal high-contrast value statements wipe onto the canvas: **[✓] No Third-Party Egress** -> **[✓] No SaaS Bills** -> **[✓] Complete Compliance** -> **[✓] Sub-second SQL**. The tagline **"Keep it in your court."** scales up, and the lower-third **"Apache-2.0 · Self-Hosted"** badge slides into view. | +| **Exit** | 28 – 30 s | (silent) | The entire high-density graphic settles, glowing softly, and cross-dissolves (1.0s) into the Environment A landing page with the cursor pre-positioned near the **"Provision New Service"** card so Scene 2 can begin with no settle time. | + +**Asset prep checklist:** +- [ ] Fastly logo (SVG, red-on-transparent) — sourced from Fastly brand library. +- [ ] User-cluster, server-stack, bucket, and chart-window icons — Vids' built-in icon library or Lucide / Heroicons set (consistent stroke weight). +- [ ] VCL variable chips — built as rounded-rectangle text components in Vids; reuse one as a master. +- [ ] Third-party box stays deliberately generic and unbranded (no real vendor logo — legal liability, also lets viewers project their own incumbent). + +#### Recommended Animation Pipelines + +Google Vids excels at timeline compilation but lacks advanced keyframing and vector animation. It is highly recommended to construct the Scene 1 animation in a dedicated design tool, screen-record it in 1080p, and import it as a single master clip: + +* **Option A: Keynote (Fastest & Easiest):** Design the horizontal flow in Keynote on a dark background. Use **Magic Move** transitions and staggered entry builds (e.g., set build delays to exactly `0.15s` or `0.18s` for the cascading VCL chips). Play the presentation full-screen and screen-record. +* **Option B: Figma Smart Animate (Maximum Polish):** Design 4 consecutive frames representing the state at each Beat. 
Link them in Prototype mode using `After Delay` triggers with `Smart Animate` set to `Ease In and Out` (duration `400ms – 600ms`) to create fluid vector line draws and scaling pulses. Screen-record the prototype viewport. + +**Build effort estimate:** ~15–20 min in Keynote or Figma to configure the build steps, then a 30-second screen capture to import into Google Vids as a finished asset. + +--- + +### Scene 2 — Provisioning Wizard + +- **Duration:** 60 s +- **URL:** Starts at `http://localhost:3000/`, advances through the wizard. +- **State:** Provisioning wizard with NGWAF workspace available on the linked Fastly account. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Click "Provision New Service."** Wizard slides to the token field. | *"One click starts the wizard."* | +| **Paste a Fastly API token.** Click **Next**. | *"Paste an API token — the wizard uses it to set everything up on your behalf."* | +| **Pick a VCL service** from the dropdown. Click **Next**. | *"Pick the VCL service whose logs you want to analyze."* | +| **Storage step.** Region pre-filled; type a bucket name. Click **Next**. | *"For storage, name a Fastly Object Storage bucket. The wizard creates it, mints scoped read-write keys, and stands up a CDN-fronting Fastly service so all future log reads come back through cache — at zero egress cost."* | +| **NGWAF link step.** Click the **Link NGWAF workspace** dropdown and select the available workspace. Click **Next**. | *"If you run Next-Gen WAF, link the workspace here. The app will sync verified-bot intelligence and enrich matching log rows automatically — no extra setup later."* | +| **Fields step.** **Toggle every field group on** — core HTTP through QUIC / HTTP3. Highlight the live byte-count meter as it climbs. Click **Deploy**. | *"Now pick the log field groups. We're turning everything on for this demo — you'd choose the groups your team actually needs. 
The configurator shows the per-line cost in real time so you never blow Fastly's log-format size limit."* | +| **Watch the install log stream.** Hold on the green success screen for ~2 s. | *"The wizard provisions the bucket, attaches a structured JSON logging endpoint, writes the matching VCL, and registers the NGWAF workspace — all auto-rolled back if anything fails. Logs are now flowing."* | + +--- + +### Scene 3 — The Time-Jump + +- **Duration:** 8 s +- **Visual:** **[CUT]** 1-second cross-fade from Env A success screen to Env B `/dashboard/` with a 7-day range pre-selected. + +| On-screen action | Voiceover | +| :--- | :--- | +| Hold one beat on the success screen. Cross-fade to the populated dashboard. Cursor lands on the date-range picker. | *"A new service takes a few minutes to start collecting logs. Let's fast-forward one week."* | + +--- + +### Scene 4 — Interactive Dashboard + +- **Duration:** 38 s +- **URL:** `/dashboard/` (Environment B) +- **State:** Fully populated dashboard, 7-day window. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Click-and-drag** across a visible traffic spike on the requests-over-time chart to zoom in. | *"Every visualization is a filter. Drag the timeline to isolate an anomaly…"* | +| **Click a country** on the choropleth request map. Filter chip appears in the header. | *"…click a region on the global request map…"* | +| **Click a `404`** in the status-code panel. A second filter chip appears. | *"…or click any status code, host, or user-agent to drill in. Dashboards respond in milliseconds because they query a local DuckDB cache — not the cloud."* | +| **Open the Saved Views dropdown** and hover the **"Pin current view"** action. | *"Pin any filter combination as a saved view to reopen it with one click."* | + +--- + +### Scene 5 — Deep Dives: Insights · Security · Performance + +- **Duration:** 35 s +- **Navigation:** Sidebar → **Insights**, then **Security**, then **Performance**. 
+ +| On-screen action | Voiceover | +| :--- | :--- | +| **Insights tab.** Cursor lands on a populated anomaly card (e.g., "Regional surge — IN"). | *"The Insights view runs automated anomaly detection — error spikes, regional surges, new IPs, cache regressions, latency drift — by comparing a recent window against a longer baseline."* | +| **Security tab.** Scroll past the Verified Bots panel and Top TLS Fingerprints chart. | *"The Security view surfaces TLS fingerprints, request-header anomalies, proxy and anonymizer breakdowns, and — when NGWAF is linked — verified-bot intelligence joined onto matching log rows."* | +| **Performance tab.** Pause on the Slowest URLs table and the Origin-vs-Edge processing chart. | *"And the Performance view zeroes in on where to spend optimization effort: slowest URLs and networks, origin TTFB, cache-TTL distribution, and how each request's time splits between edge and origin."* | + +--- + +### Scene 6 — Pipeline & Log Management + +- **Duration:** 32 s +- **URL:** `/admin` (Log Management), **Cron Runs tab open by default**. +- **State:** Several days of `sync` and `local_compact` runs visible, all green; ingestion log populated below. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Open with the Cron Runs tab already focused.** Slow scroll through ~10 alternating `sync` / `local_compact` rows with green status and duration columns visible. | *"This is what makes the whole thing trustworthy: every scheduled job — sync, local compaction, snapshot expiration — writes a row with start time, duration, and status. Operators see the pipeline's health at a glance."* | +| **Hover the most recent `sync` row** to expand its event log. 
| *"`sync` downloads new log files, buffers them locally as Parquet, and atomically commits them to an Apache Iceberg table in your bucket — crash-safe by design, so an interrupted run never corrupts the table."* | +| **Click `local_compact` row.** | *"And a local compaction job merges cached Parquet files in the background — no extra Fastly Object Storage round-trips — so queries stay fast as the dataset grows."* | + +--- + +### Scene 7 — Custom Log Fields + +- **Duration:** 28 s +- **URL:** `/admin/fields` or the Fields configurator modal. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Toggle a built-in field group** on (e.g., QUIC / HTTP3). Highlight the live byte-count meter. | *"Field groups toggle on and off. The configurator estimates the per-line cost and warns before you hit Fastly's log-format size limit."* | +| **Click "Add Custom Field."** Type name `user_tier`, expression `%{req.http.X-User-Tier}V`. Save. | *"For anything specific to your app, add a custom VCL expression — like a user-tier header. It's validated, compiled into the log format, and pushed straight to the edge service."* | + +--- + +### Scene 8 — Cost Visibility & Log-Line Accounting + +- **Duration:** 33 s +- **URLs:** `/usage`, then `/admin` → Log Accounting panel. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Open `/usage`.** Hover the storage breakdown bar chart, then drag the cost-estimator slider. | *"Logging shouldn't produce surprise bills. The Usage page breaks down storage by tier, counts every Class-A and Class-B operation, and pre-fills a cost estimator from your actual traffic."* | +| **Navigate to Log Accounting.** Pause on the hour-by-hour reconciliation grid. 
| *"And the Log Accounting panel reconciles Fastly's authoritative log counters against your locally-ingested rows, hour by hour — so any pipeline gap shows up immediately, not buried in a monthly total."* | + +--- + +### Scene 9 — Secure Collaboration + +- **Duration:** 23 s +- **UI:** **Invite Analyst** modal, then **Share Dashboard** modal. + +| On-screen action | Voiceover | +| :--- | :--- | +| **Click "Invite Analyst."** Show the generated read-only config JSON. | *"To bring a teammate in, generate a read-only credential package — they paste it into their own copy of the app and start querying the same bucket."* | +| **Close, click "Share Dashboard."** Hover the three sharing modes; cursor rests on **"Sever All Access."** | *"Or share live — over an SSH tunnel, your own hostname, or your public IP — with per-analyst passcodes, IP allowlists, and one-click revoke. No Fastly Object Storage credentials leave your machine."* | + +--- + +### Scene 10 — Close + +- **Duration:** 15 s +- **State:** `/dashboard/` zoomed slightly out; end card overlay at ~7 s. + +| On-screen action | Voiceover | +| :--- | :--- | +| Slow drift across the dashboard, cursor settling center. End card fades up at 7 s: GitHub URL, docs URL, Apache-2.0 badge. | *"Fastly Log Analytics — request-level observability, on your hardware, built only from Fastly products. Star the repo, read the docs, and spin up your own in minutes."* | + +--- + +## Part 3 — Production Checklist + +### A. Pre-Production (T-minus 2 days) + +**Environment B data prep — the demo lives or dies here.** + +- [ ] Seed Environment B with **7 days of contiguous, realistic** production logs. Minimum thresholds: ≥1 visible traffic spike, ≥1 anomaly that lights up Insights, ≥1 NGWAF signal hit, ≥3 distinct countries on the map, ≥1 cache regression. 
+- [ ] **Scrub PII / customer data**: rewrite real IPs to RFC-1918 / documentation ranges, swap any identifiable hostnames, and run a final grep for the product name of any real customer. +- [ ] Wait for at least one cycle of `local_compact` to run so the Cron Runs tab looks lived-in. +- [ ] Pre-pin a **Saved View** in Env B that survives the recording (so Scene 4's dropdown isn't empty). +- [ ] Pre-create one **Custom Field** in Env B so the configurator table isn't empty in Scene 7. + +**Environment A prep.** + +- [ ] Verify `http://localhost:3000/` opens to the landing page with no prior config (`configs/` empty, no DuckDB). +- [ ] Stage a **dedicated, throwaway Fastly account** for the wizard. The recorded API token must be revoked the moment recording ends — assume the screen capture leaks. +- [ ] Stage a clean VCL service in that account (no existing log endpoints) so the dropdown isn't confusing. +- [ ] Pre-stage on a second screen / sticky note: API token, bucket name, target service ID. The presenter never types anything novel on-camera. + +**Hardware & capture.** + +- [ ] Recording browser: fresh Chrome profile named "Demo." No extensions, no bookmarks bar, no signed-in Google account. +- [ ] Display: 1920×1080 scaled resolution. macOS menu bar auto-hidden. Dock auto-hidden. Notifications silenced (Focus → Do Not Disturb). +- [ ] Cursor enlarged to 1.5× (Accessibility → Display). +- [ ] Audio: presenter wears wired headphones; mic at consistent distance if recording manually. + +**Assets.** + +- [ ] Title card (Fastly red, project name, tagline) — 3 s. +- [ ] Scene lower-third PNGs (one per scene, transparent background). +- [ ] End card with GitHub URL, docs URL, Apache-2.0 badge. +- [ ] Captioning source: paste finalized voiceover into Google Vids so auto-captions generate against the canonical text. + +### B. 
Day-of Recording + +- [ ] Re-run the full wizard against the throwaway account once **off-camera** as a dress rehearsal — confirms tokens valid, service available, no API errors. +- [ ] Record scenes **in numeric order** even though the cut comes between 2 and 3 — easier to edit, harder to lose track. +- [ ] Record each scene in **two takes minimum**. Keep the second take as the working track; the first is insurance. +- [ ] **Dual-track VO:** capture the human voiceover during screen recording. After all visuals are captured, generate the AI ("Narrator") VO against the same script in Google Vids and place it on a parallel audio track. Final mix gets picked in post. +- [ ] Between scenes, **don't quit the browser** — keep the demo state intact in case a retake is needed. +- [ ] Immediately after wrap: **revoke the recorded API token, delete the throwaway bucket, and tear down the throwaway service.** Confirm the token is dead before anyone walks away. + +### C. Post-Production + +- [ ] Apply Automatic Transcript Trim to every clip to automatically strip pauses and filler words. +- [ ] **Pacing Compression (Scene 2):** Apply a `1.25× – 1.5×` speed-up on the screen recording of the wizard typing/loading segments to keep the total runtime strictly under 5 minutes. +- [ ] **The "Time-Jump" Dissolve:** Apply a perfect `1.0s` cross-fade between Scene 2's green success screen and Scene 3's pre-populated dashboard, ensuring the VO transition aligns perfectly with the middle of the dissolve. +- [ ] Audio: normalize VO to −16 LUFS, gate cursor-click noise (if present from manual recording). +- [ ] Captions: review every line against the script for accuracy — especially "Fastly," "NGWAF," "Iceberg," "DuckDB," "Parquet," which AI captioning frequently mangles. +- [ ] Color-check the title and end cards on a non-OLED monitor; Fastly red can clip. +- [ ] Export master: H.264 / AAC / MP4, 1080p, ≤200 MB. Generate accompanying `.srt`. + +### D. 
QA & Sign-Off + +- [ ] **Watch on three devices**: 4K monitor, 13" laptop, phone in portrait. Confirm UI text is legible on the smallest. +- [ ] **Watch with sound off.** If the visual story is incomprehensible without VO, the cursor work is too fast. +- [ ] **Engineer review** (factual accuracy): one engineer who didn't write the script confirms every claim about the product is true *today* (not "soon"). +- [ ] **Legal / brand review:** confirm Fastly product naming, NGWAF reference, third-party logos (none expected), and that no real customer data or names are visible. +- [ ] **Security review:** scrub the final cut for any visible secrets — API tokens, bucket names tied to real accounts, hostnames, internal Slack URLs. +- [ ] **Sign-off matrix:** Product owner ✅ · Engineering ✅ · DevRel ✅ · Legal ✅ · Security ✅. + +### E. Contingency — If Something Goes Wrong On-Camera + +| Failure | Recovery | +| :--- | :--- | +| Wizard step 2 errors (token rejected, service list empty). | Cut. Verify token scope and account. Re-record from Scene 2 start with a fresh take — don't try to splice a recovery. | +| Provisioning hangs > 30 s on the install-log screen. | Let it run; pause the teleprompter. If still hung at 60 s, kill the recording — there's a real bug to fix before continuing. | +| Env B dashboard shows no anomaly in the recorded 7-day window. | Re-seed with a longer history and re-record Scenes 4 + 5. Don't ship a demo where the Insights tab looks empty. | +| AI voiceover mispronounces "Fastly" or "Iceberg." | Manually re-record the affected line in Google Vids and splice. Do not ship a mispronunciation of a product name. | + +### F. Distribution + +- [ ] Upload the master MP4 to YouTube as **unlisted** for stakeholder review; promote to public only after sign-off matrix is green. +- [ ] Embed in the GitHub repo `README.md` (replacing or augmenting the current architecture diagrams). +- [ ] Link from `docs/features.md` and the project's GitHub repo description. 
+- [ ] Cut a 60-second highlight reel (Scenes 1 + 4 + 8 + 10) for social posts and conference loops. +- [ ] Archive the source Google Vids project + raw recordings to shared drive; tag with the release version. + +--- + +## Session scoring (v1.1.0) + +Session scoring is now live for the demo service. Operators manage it from **/admin/session-scoring** in the dashboard. + +- **Headline capability:** real-time edge scoring (L1 cookie+timing signals plus L2 PageRank transition matrix → 0–100 score) with enforce-threshold-driven 429s for high-score sessions, gated by `fastly.ddos_detected` so the scorer is bypassed under attack. +- **Operations runbook:** see [`docs/session_scoring_runbook.md`](session_scoring_runbook.md) for enable/disable, threshold tuning, matrix retrain/restore, key rotation, and audit log review. + +--- + +*Last revised for the v1.0 launch cut. Update the runtime budget table in Part 2 whenever a scene's duration changes — that table is the source of truth, not the per-scene headers.* diff --git a/docs/features.md b/docs/features.md index 1cf5faa5..1e38f8f1 100644 --- a/docs/features.md +++ b/docs/features.md @@ -125,3 +125,42 @@ If a Next-Gen WAF workspace is linked during provisioning, the app syncs verifie ### Bring your own bucket (manual config) If you already have a bucket, you can configure the app by writing a JSON file in `configs/` instead of running the provisioning wizard. See `config.example.json` for the schema. Manual configuration supports the same `read_write` / `read_only` access levels as the wizard. + +## Session Scoring + +### What it does +Real-time edge scoring of every request as it transits Fastly. A two-layer scorer (L1 cookie+timing heuristics, L2 PageRank transition matrix over URL paths) produces a 0–100 score per request along with a reason code. Scores are logged alongside every line and land in DuckDB as ordinary columns, queryable from the dashboard, raw logs viewer, and SQL pad. 
+ +### Custom log fields +Enabling scoring on a service appends six custom fields to the log format: + +- `sid` — rotating AES-encrypted session identifier carried in a first-party cookie (30 min idle / 24 h hard cap) +- `edge_score` — final 0–100 score for the request +- `edge_score_reason` — short reason code explaining the score (e.g. `tampered_cookie`, `path_anomaly`) +- `edge_cookie_compliance` — whether the request presented a valid, untampered session cookie +- `edge_l1_score` — L1 contribution (cookie integrity + inter-request timing) +- `edge_l2_score` — L2 contribution (PageRank transition probability for the current path given prior paths in the session) + +### VCL pattern +Scoring is wired into the request lifecycle through six VCL snippets that share state via a single restart. The order is `recv → pass → fetch → deliver → miss → enforce`, with the `enforce` snippet only firing on `req.restarts == 1` after the scorer has annotated the request. `recv` also unsets six client-controllable `X-Edge-*` headers to prevent score injection from the wire. + +### Admin UI +The `/admin/session-scoring` page is the operator console: + +- **StatusPanel** — per-service enable/disable, current threshold, enforcement state +- **ScoringHealthCard** — scorer counters (requests, tampered cookies, enforcement blocks, matrix load failures) +- **ThresholdSlider** — preview score distribution at a candidate threshold, then commit; toggles live enforcement +- **ROC + PR curves, per-reason AUC** — evaluation against the labeled set +- **Top-flagged, labels** — review the highest-scoring requests and assign ground-truth labels for retraining +- **Matrix history** — list past PageRank matrix versions and restore any of them (pre-restore snapshot is taken automatically) +- **Audit log** — every mutation (enable, threshold change, key rotation, matrix restore, etc.) 
is recorded per-service +- **Retrain, key rotation** — rebuild the L2 matrix from labeled data; rotate the AES sid key (current → previous slot) + +### Key endpoints +`/scoring/enable`, `/scoring/disable`, `/scoring/status`, `/scoring/labels`, `/scoring/top-flagged`, `/scoring/score-distribution`, `/scoring/compliance-breakdown`, `/scoring/threshold`, `/scoring/threshold-preview`, `/scoring/enforce-threshold`, `/scoring/enforce-status-code`, `/scoring/retrain`, `/scoring/dashboard`, `/scoring/evaluation/per-reason`, `/scoring/audit`, `/scoring/rotate-key`, `/scoring/matrix-versions`, `/scoring/matrix-versions/{v}/restore`. + +### DDoS gate +The Compute scorer is bypassed when `fastly.ddos_detected` fires. Volumetric defense is Fastly's job; rate limiting is explicitly out of scope for session scoring. The score column on requests served during a DDoS event will be absent rather than synthesized. + +### Operations +See [session_scoring_runbook.md](session_scoring_runbook.md) for enable/disable procedures, threshold tuning, key rotation, matrix restore, and incident response. diff --git a/docs/session_scoring_runbook.md b/docs/session_scoring_runbook.md new file mode 100644 index 00000000..b5c548eb --- /dev/null +++ b/docs/session_scoring_runbook.md @@ -0,0 +1,273 @@ +# Session Scoring — Operator Runbook + +Day-2 operations for the session-scoring subsystem (v1.1.0). Aimed at the on-call engineer who already has the app deployed and a Fastly token in hand. 
+ +| Field | Value | +| :--- | :--- | +| **Subsystem version** | v1.1.0 | +| **Surface area** | `/admin/session-scoring` (UI) · `/api/services/{svc}/scoring/*` (API) | +| **Storage** | `backend/metadata.db` (audit log) · Fastly Object Storage (`scoring_matrix_history/{version}.json`) · Compute ConfigStore (`enforce_threshold` key) | +| **Edge components** | 6 VCL snippets (recv / pass / fetch / deliver / miss / enforce) + scorer Wasm service | +| **Audit scope** | Per-host; not mirrored via `state_sync` | + +> Reminder: rate-limiting is **out of scope** for v1.1.0. Session scoring observes and (optionally) blocks at the score-threshold level. When `fastly.ddos_detected` fires, Compute is bypassed entirely — the gate is upstream. + +--- + +## Enable / Disable + +### Enable scoring on a service + +**UI.** `/admin/session-scoring` → pick the service → **Enable**. Wait for the install log to settle on green. + +**API.** + +```bash +curl -sS -X POST \ + "$HOST/api/services/$SVC/scoring/enable" \ + -H "Authorization: Bearer $TOKEN" +``` + +The orchestrator installs the following on your behalf — all rolled back together if any step fails: + +| Component | Where it lands | +| :--- | :--- | +| VCL snippets | The target VCL service — six snippets: `recv`, `pass`, `fetch`, `deliver`, `miss`, `enforce` | +| Custom log fields | Appended to the service's existing log format (does not displace your existing fields) | +| Scorer Wasm service | A separate Compute service in your account; receives the scoring requests from VCL | +| ConfigStores | `enforce_threshold` (live enforcement value) + `cookie_keys` (AES current/previous slots) | +| Cookie keys | Generated and seeded into the current slot of `cookie_keys` | + +Confirm with `GET /api/services/$SVC/scoring/status` — `enabled: true`, snippets installed, scorer service ID populated. + +### Disable scoring cleanly + +**UI.** `/admin/session-scoring` → service row → **Disable** → confirm. 
+ +**API.** + +```bash +curl -sS -X POST \ + "$HOST/api/services/$SVC/scoring/disable" \ + -H "Authorization: Bearer $TOKEN" +``` + +What gets torn down: VCL snippets removed, custom fields unregistered, scorer Wasm service deactivated, ConfigStore entries cleared. + +What is **preserved per-host** (intentionally): + +- The `scoring_audit` table for this service in `metadata.db` — every prior mutation stays queryable. +- Matrix history under `scoring_matrix_history/` in the FOS bucket — re-enabling later can restore a prior version. + +If you need a hard wipe, delete the `scoring_audit` rows manually and remove the `scoring_matrix_history/{*}.json` objects from the bucket. Do this only when you're sure no compliance or forensics need them. + +--- + +## Operate + +### Rotate the AES cookie key + +**When.** On a regular cadence (quarterly is reasonable) or immediately on suspected compromise of a host that ran the admin UI. + +**How.** + +```bash +curl -sS -X POST \ + "$HOST/api/services/$SVC/scoring/rotate-key" \ + -H "Authorization: Bearer $TOKEN" +``` + +Or in the UI: `/admin/session-scoring` → service detail → **Rotate AES key**. + +**What happens.** The current key moves to the **previous** slot and a freshly generated key takes the **current** slot. The scorer accepts cookies signed by **either** key for one grace cycle, so in-flight sessions don't see a wave of tampered-cookie events at the moment of rotation. + +**IMPORTANT — do not double-rotate.** Rotating twice within seconds discards the original key (it cascades out of the previous slot before any session can be re-issued). Cookies signed under that original key will then be flagged as tampered. Always wait — at minimum long enough for one full request/response round-trip on your slowest sessions, comfortably one full minute — between rotations. The audit log records every rotation with its timestamp; check it before rotating again. 
+ +### Roll back a bad matrix + +**When.** A retrain hurt AUC, the score distribution shifted in a way that doesn't match recent traffic, or per-reason metrics show a rule degrading. + +**How.** `/admin/session-scoring` → **Matrix history** tab → find the target version → **Restore** → confirm. + +API equivalent: + +```bash +curl -sS -X POST \ + "$HOST/api/services/$SVC/scoring/matrix-versions/$VERSION/restore" \ + -H "Authorization: Bearer $TOKEN" +``` + +**What happens immediately:** + +- A pre-restore snapshot is saved (so the restore itself is reversible — restoring the rolled-back version brings you back). +- Admin AUC and the dashboard score distribution reflect the restored matrix on the next refresh. +- The audit log records the restore with the source and target versions. + +**IMPORTANT — edge enforcement lags until the Wasm is redeployed.** The scorer service holds its scoring matrix **embedded in the Wasm binary**. Restoring a version updates the admin/control-plane view but does *not* re-flash the edge. Until you redeploy the scorer, edge enforcement runs against the **old** matrix while admin metrics show the **new** one — and that mismatch is exactly the kind of thing that causes "the dashboard looks fine but customers are still getting 429s." + +The restore API response includes a `deploy_hint` field with the exact command for your environment. The general shape is: + +```bash +scripts/scoring/deploy_wasm.sh \ + --service-id "$SCORER_SERVICE_ID" \ + --token "$FASTLY_TOKEN" +``` + +Run it. Confirm in `/admin/session-scoring` that the scorer service shows the redeploy timestamp matching the restore moment. Only then is the rollback fully live end-to-end. + +### Emergency disable of enforcement + +Enforcement is the part that returns `429`s. Scoring (the score appearing in logs) is independent of enforcement — you can run with scoring on and enforcement off indefinitely. 
+ +**Fastest path (API).** + +```bash +# Verify the current enforced threshold +curl -sS "$HOST/api/services/$SVC/scoring/enforce-threshold" \ + -H "Authorization: Bearer $TOKEN" + +# Clear it +curl -sS -X PUT \ + "$HOST/api/services/$SVC/scoring/enforce-threshold?confirm=true" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"threshold": null}' +``` + +**UI path.** `/admin/session-scoring` → service detail → **ThresholdSlider** → **Disable** (the button is labeled this way only when enforcement is currently live). + +**Propagation.** The change is written to the Compute `enforce_threshold` ConfigStore key. Effective within seconds — the scorer reads the key on every request. No Wasm redeploy needed. + +**What it does and doesn't do.** + +- Disables 429s. The scorer stops emitting `X-Edge-Score-Enforce=1`, the VCL `enforce` snippet has nothing to act on, no requests get restarted with a 429. +- **Does not** disable scoring. Scores keep appearing in logs. The compliance/dashboard views continue to update. You retain visibility while the false-positive trigger is investigated. + +**Total kill switch.** If you need scoring itself off (not just enforcement), use the disable endpoint from the previous section: `POST /api/services/$SVC/scoring/disable`. That removes the VCL snippets and stops the scorer from running at all. + +### Change the enforce response code + +The enforce snippet defaults to returning `HTTP 429 Too Many Requests` for flagged requests. Operators can override this per-service to any 4xx/5xx code — common picks are `403` (policy block), `451` (legal), and `503` (degraded). The status code is baked into VCL at deploy time, so a change does a focused snippet swap (~5–10s end-to-end). + +**UI path.** `/admin/session-scoring` → service detail → **ThresholdSlider** → **Enforce response code** selector (next to the threshold). Picking a new code opens a confirm dialog before publishing. 
+ +**API.** + +```bash +# Read the current code (returns default 429 when never overridden) +curl -sS "$HOST/api/services/$SVC/scoring/enforce-status-code" \ + -H "Authorization: Bearer $TOKEN" + +# Set a new code +curl -sS -X PUT \ + "$HOST/api/services/$SVC/scoring/enforce-status-code?confirm=true" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"status_code": 403}' + +# Reset to default (429) +curl -sS -X PUT \ + "$HOST/api/services/$SVC/scoring/enforce-status-code?confirm=true" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"status_code": null}' +``` + +**Validation.** Backend rejects anything outside 400–599. The reason phrase is auto-mapped to the HTTP standard for known codes (403→Forbidden, 451→Unavailable For Legal Reasons, 503→Service Unavailable, …) and falls back to `Blocked` for unmapped codes. Audit-logged as `scoring_enforce_status_code_changed`. + +**Note.** Disabling enforcement (previous section) is unaffected by the response code — flipping enforcement off / on does not change the configured code, and changing the code does not flip enforcement. + +### Read the audit log + +**UI.** `/admin/session-scoring` → **Audit** tab. Filterable by event type and time window. + +**API.** + +```bash +curl -sS "$HOST/api/services/$SVC/scoring/audit?limit=200" \ + -H "Authorization: Bearer $TOKEN" +``` + +**What's recorded.** Every mutation: enable, disable, threshold commit/clear, enforcement set/cleared, enforce response-code change, retrain, key rotation, matrix restore. Each row has the actor, timestamp, and a structured payload describing the before/after state. + +**Per-host scope.** The audit log lives in this host's `metadata.db` and is **not** mirrored via `state_sync`. If the same logical service is operated from multiple admin hosts (e.g. a primary and a hot-standby), each host has its own independent audit log. For a complete history, query each host and merge by timestamp. 
+ +--- + +## Diagnose + +### "AUC dropped after the last retrain." + +1. `/admin/session-scoring` → **Per-reason AUC** card. Look for the rule whose AUC dropped — that's usually a single contributor driving the aggregate. +2. Confirm against the Matrix history tab: the version timestamp lines up with the retrain in the audit log. +3. If the regression is real and not a data artifact, roll back to the prior matrix (see *Roll back a bad matrix*). Remember the edge redeploy step. + +### "Enforcement is 429-ing real users." + +1. Open ThresholdSlider — the counterfactual preview shows what the **Precision %** and block volume would be at every candidate threshold against the recent score distribution. +2. If a higher threshold preserves true-positive coverage at acceptable precision: commit the new threshold via the slider. +3. If no threshold looks acceptable: disable enforcement entirely (see *Emergency disable of enforcement*). Scoring stays on for visibility while you dig in. + +### "Cookie compliance shows a lot of tampered cookies." + +Most common causes, in order: + +1. **A double rotation** within the grace window (see the *IMPORTANT* note in *Rotate the AES cookie key*). Correlate the spike timestamp against `rotate-key` entries in the audit log. +2. **A replay or tampering attack** — a real bot population trying to forge cookies. Cross-reference with the dashboard's top-flagged-sessions list. +3. **A misconfigured upstream cache** stripping the `Set-Cookie` on first response (rarer, but causes the same symptom). + +The fix for cause 1 is patience — wait one grace cycle and the noise subsides. The fix for cause 2 is enforcement (if it isn't already on). The fix for cause 3 is a cache config audit. + +### "What does the scorer think it's doing?" + +The Rust scorer keeps four `AtomicU64` counters and flushes them via `dbg_log` every 1000 requests: + +- `TAMPERED_COOKIE_COUNT` — cookies that failed AES verification. 
+- `ENFORCE_BLOCK_COUNT` — requests that emitted `X-Edge-Score-Enforce=1`. +- `MATRIX_LOAD_FAIL_COUNT` — matrix lookup failures (should be zero in steady state). +- `REQUEST_COUNT` — total requests processed. + +In the backend's ingested logs, grep for the emitted line: + +``` +metrics: tampered=... enforce_block=... matrix_fail=... requests=... +``` + +Rates are easier to reason about than absolute counts — divide each by the delta in `REQUEST_COUNT` between two flushes to get per-request rates. + +If `MATRIX_LOAD_FAIL_COUNT` is non-zero, the embedded matrix is corrupt or unreadable — redeploy the Wasm. If `TAMPERED_COOKIE_COUNT / REQUEST_COUNT` exceeds the baseline you've established for this service, run the cookie-compliance diagnosis above. + +--- + +## Reference + +- **Endpoints.** Full schema is at `/docs` (FastAPI Swagger UI on the running backend). Quick index: + + | Method | Path | Purpose | + | :--- | :--- | :--- | + | POST | `/api/services/{svc}/scoring/enable` | Provision scoring on a service | + | POST | `/api/services/{svc}/scoring/disable` | Tear down scoring (audit preserved) | + | GET | `/api/services/{svc}/scoring/status` | Installation + health snapshot | + | GET | `/api/services/{svc}/scoring/labels` | Score labels (good / suspicious / bad bands) | + | GET | `/api/services/{svc}/scoring/top-flagged` | Highest-scoring sessions in window | + | GET | `/api/services/{svc}/scoring/score-distribution` | Histogram for ThresholdSlider | + | GET | `/api/services/{svc}/scoring/compliance-breakdown` | Cookie compliance counters | + | GET / PUT | `/api/services/{svc}/scoring/threshold` | Commit-style threshold | + | GET | `/api/services/{svc}/scoring/threshold-preview` | Counterfactual preview | + | POST | `/api/services/{svc}/scoring/retrain` | Recompute matrix from recent traffic | + | GET | `/api/services/{svc}/scoring/dashboard` | Aggregated dashboard payload | + | GET | `/api/services/{svc}/scoring/evaluation/per-reason` | Per-rule AUC | + | GET | 
`/api/services/{svc}/scoring/audit` | Audit log | + | POST | `/api/services/{svc}/scoring/rotate-key` | Rotate AES cookie key | + | GET | `/api/services/{svc}/scoring/matrix-versions` | List matrix snapshots | + | POST | `/api/services/{svc}/scoring/matrix-versions/{v}/restore` | Restore a matrix version | + | GET / PUT | `/api/services/{svc}/scoring/enforce-threshold` | Live enforcement value (ConfigStore-backed) | + +- **Custom log fields** added by enable: see `docs/features.md` for the full schema and the field-group toggles. +- **Architecture notes.** L1 (cookie + timing) + L2 (PageRank transition matrix) → 0–100 score. AES-encrypted cookie state with rotating `sid`; 30-minute idle timeout; 24-hour hard cap. See `backend/scoring/scorer.py` + `compute/scorer/src/scorer.rs` for the implementation. +- **Security headers.** The `recv` snippet unsets six client-controllable `X-Edge-*` headers before any scoring logic runs — clients cannot spoof scores or compliance state. +- **DDoS boundary.** `fastly.ddos_detected` bypasses the Compute scorer entirely; under attack, Fastly's upstream gate handles it. + +--- + +*Last revised for v1.1.0. When the API surface changes, update the endpoint table above — operators copy-paste from it, so stale entries cost real time.* diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 6e0a46d7..91edb0c0 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,6 +1,6 @@ # --- OpenAPI Schema Generation Stage --- # Needs the Python backend to introspect FastAPI routes and emit openapi.json. 
-FROM python:3.12-slim-bullseye AS api-schema +FROM python:3.12-slim-bookworm AS api-schema COPY --from=ghcr.io/astral-sh/uv:0.6.14 /uv /uvx /bin/ diff --git a/frontend/__tests__/components/AppLayout.test.tsx b/frontend/__tests__/components/AppLayout.test.tsx index ae81b108..d23a67b4 100644 --- a/frontend/__tests__/components/AppLayout.test.tsx +++ b/frontend/__tests__/components/AppLayout.test.tsx @@ -1,10 +1,21 @@ import { render, screen, act } from '@testing-library/react' import { expect, test, vi, beforeEach } from 'vitest' +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' import { AppLayout } from '@/components/AppLayout' import { useServiceStore } from '@/stores/serviceStore' import { useBootstrap } from '@/hooks/useBootstrap' import React from 'react' +// AppLayout now calls useQueryClient() to implement the navigation-cancel +// pattern (cancel in-flight queries on route change). Tests need a real +// QueryClientProvider in the tree or useQueryClient throws. +function renderWithQueryClient(ui: React.ReactElement) { + const queryClient = new QueryClient({ + defaultOptions: { queries: { retry: false } }, + }) + return render({ui}) +} + // Mock next/navigation vi.mock('next/navigation', () => ({ usePathname: vi.fn(() => '/dashboard'), @@ -62,7 +73,7 @@ beforeEach(() => { }) test('renders AppLayout with standard navigation', () => { - render(
Content
) + renderWithQueryClient(
Content
) expect(screen.getAllByText('Dashboard').length).toBeGreaterThan(0) expect(screen.getByText('Usage & Cost')).toBeInTheDocument() @@ -79,7 +90,7 @@ test('hides restricted items for analysts', () => { }) }) - render(
Content
) + renderWithQueryClient(
Content
) expect(screen.getAllByText('Dashboard').length).toBeGreaterThan(0) expect(screen.queryByText('Usage & Cost')).not.toBeInTheDocument() @@ -99,7 +110,7 @@ test('renders analyst watermark when bootstrap reports remote analyst', () => { isLoading: false, } as any) - render(
Content
) + renderWithQueryClient(
Content
) const watermark = screen.getByTestId('analyst-watermark') expect(watermark).toBeInTheDocument() @@ -108,6 +119,6 @@ test('renders analyst watermark when bootstrap reports remote analyst', () => { }) test('does not render watermark for non-analyst users', () => { - render(
Content
) + renderWithQueryClient(
Content
) expect(screen.queryByTestId('analyst-watermark')).not.toBeInTheDocument() }) diff --git a/frontend/__tests__/components/TeardownDialog.test.tsx b/frontend/__tests__/components/TeardownDialog.test.tsx index 320f4d37..a32ff8b0 100644 --- a/frontend/__tests__/components/TeardownDialog.test.tsx +++ b/frontend/__tests__/components/TeardownDialog.test.tsx @@ -24,7 +24,7 @@ vi.mock('@/hooks/useSSE', () => ({ }) })) -test('TeardownDialog starts the SSE stream when the user clicks Execute Teardown', async () => { +test('TeardownDialog starts the SSE stream when the user enters a token and clicks Execute Teardown', async () => { const user = userEvent.setup() const onOpenChange = vi.fn() const onComplete = vi.fn() @@ -40,15 +40,64 @@ test('TeardownDialog starts the SSE stream when the user clicks Execute Teardown expect(screen.getByText('Teardown: Test Service')).toBeDefined() - // Role-based lookup: this is the actual - - Share Dashboard -
@@ -814,63 +877,99 @@ export default function AdminPage() { -
-
-
- -

- Displays a panel at the bottom of the screen with the underlying DuckDB SQL queries and their execution times. -

-
- -
-
-
- -

- Displays a panel at the bottom of the screen with all Fastly API calls and FOS operations (HEAD, GET, PUT, etc) made during the request. -

+
+ {/* Compact 2-up grid for the simple toggle/button rows. Each box + has a fixed shape: title + description block at the top, then a + right-aligned control strip pinned to the bottom — so the four + cards line up visually even when the control sets differ in + width (single Switch vs Switch + inputs + button). Bot + Intelligence Sources stays full-width below because it embeds + a data table that would compress poorly in a half-column. */} +
+
+
+ +

+ Bottom-of-screen panel with DuckDB SQL queries and execution times. +

+ {!debugBackendOn && ( +

+ Disabled — backend ``DEBUG_RESPONSES`` env is off. +

+ )} +
+
+ +
- -
-
-
- -

- Records every FOS Class A/B operation and CDN download with function name and process context for cost optimization analysis. -

+ +
+
+ +

+ Bottom-of-screen panel with all Fastly API calls and FOS operations per request. +

+ {!debugBackendOn && ( +

+ Disabled — backend ``DEBUG_RESPONSES`` env is off. +

+ )} +
+
+ +
-
- {usageLoggingEnabled && ( - <> - - - - )} - + +
+
+ +

+ Records every Class A/B operation and CDN download with function + process context for cost analysis. +

+
+
+ {usageLoggingEnabled && ( + <> + + + + )} + +
-
-
-
- -

- Fastly PoP coordinates used by the Impossible Distance insight to detect geo/RTT spoofing. -

+ +
+
+ +

+ Fastly PoP coordinates used by the Impossible Distance insight for geo/RTT spoofing detection. +

+
+
+ +
-
{/* Bot Intelligence Sources */} @@ -1188,6 +1287,54 @@ export default function AdminPage() {
+ {/* Security: token must be supplied before workspace fetch + AND before workspace save. Single input drives both. */} + {ngwafService && !ngwafSaved && ( +
+ +

+ Required to list AND save NGWAF workspace bindings (security /). +

+
+ setNgwafApiToken(e.target.value)} + className="h-8 font-mono text-xs flex-1" + autoComplete="off" + /> + +
+
+ )} + {ngwafFetching ? (

Loading workspaces…

) : ngwafWorkspaces.length > 0 ? ( @@ -1223,13 +1370,22 @@ export default function AdminPage() { + + + Back to Admin + + + + + + + + Overview + Labels + Matrix history + Audit + + + + + + + + + +
+ + +
+
+ + + + + + + + + + + + +
+
+ ) +} diff --git a/frontend/app/admin/share/loading.tsx b/frontend/app/admin/share/loading.tsx new file mode 100644 index 00000000..5ce251d2 --- /dev/null +++ b/frontend/app/admin/share/loading.tsx @@ -0,0 +1,5 @@ +import { FormSkeleton } from '@/components/skeletons/PageSkeleton' + +export default function Loading() { + return +} diff --git a/frontend/app/admin/share/page.tsx b/frontend/app/admin/share/page.tsx index d9d3b03a..14e27911 100644 --- a/frontend/app/admin/share/page.tsx +++ b/frontend/app/admin/share/page.tsx @@ -1,7 +1,8 @@ 'use client' import * as React from 'react' -import { ArrowLeft, ShieldAlert } from 'lucide-react' +import { useQuery, useQueryClient } from '@tanstack/react-query' +import { AlertTriangle, ArrowLeft, ShieldAlert } from 'lucide-react' import Link from 'next/link' import { Alert, AlertDescription } from '@/components/ui/alert' @@ -13,31 +14,48 @@ import { InvitationsPanel } from '@/components/share-dashboard/InvitationsPanel' import { SessionsPanel } from '@/components/share-dashboard/SessionsPanel' import { SharingControlPanel } from '@/components/share-dashboard/SharingControlPanel' import type { ShareStatus } from '@/components/share-dashboard/utils' +import { FormSkeleton } from '@/components/skeletons/PageSkeleton' import { client, extractApiError } from '@/lib/api' +// Shared query key so the hover-prefetch on the Admin → Share Dashboard +// link (in /admin/page.tsx) populates the same React Query cache entry +// the page reads on mount. Resulting UX: by the time the operator +// clicks Share Dashboard, the status payload is already in cache — +// page paints real content immediately instead of skeleton-then-swap. 
+export const SHARE_STATUS_QUERY_KEY = ['admin', 'share', 'status'] as const + export default function ShareDashboardPage() { - const [status, setStatus] = React.useState(null) - const [statusError, setStatusError] = React.useState('') + const queryClient = useQueryClient() const [actionError, setActionError] = React.useState('') const [activeTab, setActiveTab] = React.useState('invites') const [auditEmailFilter, setAuditEmailFilter] = React.useState('') - const refresh = React.useCallback(async () => { - setStatusError('') - try { - const { data, response } = await client.GET('/api/admin/share/status' as any, {}) + // React Query handles the polling, cache, and prefetch interop. The + // refetchInterval matches the previous setInterval(refresh, 10_000) + // cadence so live updates while the operator is on this page still + // refresh at the same rate. + const { data: status, error: statusError, refetch } = useQuery({ + queryKey: SHARE_STATUS_QUERY_KEY, + queryFn: async ({ signal }) => { + const { data, response } = await client.GET('/api/admin/share/status' as any, { signal, }) if (!response.ok) throw new Error(`status ${response.status}`) - setStatus(data as any) - } catch (e: any) { - setStatusError(extractApiError(e) || 'unable to load status') - } - }, []) - - React.useEffect(() => { - refresh() - const id = setInterval(refresh, 10_000) - return () => clearInterval(id) - }, [refresh]) + return data as ShareStatus + }, + refetchInterval: 10_000, + // Treat as fresh for 5s so the hover-prefetch immediately preceding + // a click is reused, but live polling stays at 10s. + staleTime: 5_000, + }) + const refresh = React.useCallback(async () => { + await refetch() + queryClient.invalidateQueries({ queryKey: SHARE_STATUS_QUERY_KEY }) + }, [refetch, queryClient]) + const statusErrorMsg = statusError + ? 
extractApiError(statusError as any) || (statusError as Error).message || 'unable to load status' + : '' + // React Query yields ``undefined`` until the first fetch resolves; the + // child panels' props are typed ``ShareStatus | null``, so coerce. + const statusForPanels: ShareStatus | null = status ?? null return (
@@ -46,15 +64,28 @@ export default function ShareDashboardPage() { description="Start the share tunnel, manage analyst invitations, monitor live sessions, and review the audit trail." icon={ShieldAlert} > - + Back to Admin - {statusError && ( + + + + Secure your server before sharing.{' '} + Remote sharing exposes this dashboard to invited analysts over the public internet. + Before enabling the tunnel, confirm the host has an up-to-date OS, firewall rules + restricting inbound access to the share port, and (recommended) only allows the + tunnel through a reverse proxy you control. Each invited analyst gets read-only + access scoped to the services you grant — but the underlying server is yours to + harden. + + + + {statusErrorMsg && ( - {statusError} + {statusErrorMsg} )} {actionError && ( @@ -63,40 +94,54 @@ export default function ShareDashboardPage() { )} - + {statusForPanels === null && !statusErrorMsg ? ( + // First-render skeleton: React Query yields ``undefined`` (which + // we coerce to null) until the initial /api/admin/share/status + // fetch returns. Pre-fix the page showed an empty + // SharingControlPanel + tabs with no data — looked broken until + // ~300ms later when status arrived. With the hover-prefetch on + // the Admin → Share Dashboard link, this skeleton usually does + // not appear at all (cache hit), but it's a clean fallback on + // cold navigations. + + ) : ( + <> + - - - Invitations - Sessions - Audit - + + + Invitations + Sessions + Audit + - - { - setAuditEmailFilter(email) - setActiveTab('audit') - }} - /> - + + { + setAuditEmailFilter(email) + setActiveTab('audit') + }} + /> + - - - + + + - - setAuditEmailFilter('')} - /> - - + + setAuditEmailFilter('')} + /> + + + + )}
) } diff --git a/frontend/app/admin/usage-log/loading.tsx b/frontend/app/admin/usage-log/loading.tsx new file mode 100644 index 00000000..57345862 --- /dev/null +++ b/frontend/app/admin/usage-log/loading.tsx @@ -0,0 +1,5 @@ +import { TableSkeleton } from '@/components/skeletons/PageSkeleton' + +export default function Loading() { + return +} diff --git a/frontend/app/admin/usage-log/page.tsx b/frontend/app/admin/usage-log/page.tsx index 33202f69..1198e531 100644 --- a/frontend/app/admin/usage-log/page.tsx +++ b/frontend/app/admin/usage-log/page.tsx @@ -99,8 +99,8 @@ function LogAccountingPanel() { const preset = LOG_ACCOUNTING_PRESETS[presetIdx] const { data, isLoading, isFetching, error } = useQuery({ queryKey: ['log-accounting', preset.hours, preset.by], - queryFn: async () => { - const { data, error } = await client.GET('/api/admin/log-accounting', { + queryFn: async ({ signal }) => { + const { data, error } = await client.GET('/api/admin/log-accounting', { signal, params: { query: { hours: preset.hours, by: preset.by } }, }) if (error) throw new Error(extractApiError(error)) @@ -283,8 +283,8 @@ export default function UsageLogPage() { const { data, isLoading, isFetching, refetch } = useQuery({ queryKey: ['usage-log', activeServiceId, startTime, endTime, usageType, processFilter, operationFilter], - queryFn: async () => { - const { data, error } = await client.GET('/api/admin/usage-log', { + queryFn: async ({ signal }) => { + const { data, error } = await client.GET('/api/admin/usage-log', { signal, params: { query: { service_id: activeServiceId || '', diff --git a/frontend/app/alerts/loading.tsx b/frontend/app/alerts/loading.tsx new file mode 100644 index 00000000..57345862 --- /dev/null +++ b/frontend/app/alerts/loading.tsx @@ -0,0 +1,5 @@ +import { TableSkeleton } from '@/components/skeletons/PageSkeleton' + +export default function Loading() { + return +} diff --git a/frontend/app/alerts/page.tsx b/frontend/app/alerts/page.tsx index a338327c..31cbbb29 100644 
--- a/frontend/app/alerts/page.tsx +++ b/frontend/app/alerts/page.tsx @@ -56,6 +56,8 @@ import { ColumnVisibilityDropdown } from '@/components/DataTable' import { VisibilityState } from '@tanstack/react-table' import type { components } from '@/types/api.generated' import { ButtonGroup } from '@/components/ui/button-group' +import { useTimeLayout } from '@/lib/chart-helpers' +import { useTimezoneStore } from '@/stores/timezoneStore' type Alert = components["schemas"]["Alert"] @@ -72,9 +74,10 @@ export default function AlertsPage() { const { data: loggingSettings } = useQuery({ queryKey: ['loggingSettings', activeServiceId], - queryFn: async () => { + queryFn: async ({ signal }) => { if (!activeServiceId) return null const { data } = await client.GET("/api/services/{service_id}/logging-settings", { + signal, params: { path: { service_id: activeServiceId } } }) return data as any @@ -86,14 +89,15 @@ export default function AlertsPage() { const { data: alertsRes, isLoading, isFetching, refetch } = useQuery({ queryKey: ['alerts', activeServiceId], - queryFn: async () => { + queryFn: async ({ signal }) => { if (activeServiceId) { const { data } = await client.GET("/api/alerts/{service_id}", { + signal, params: { path: { service_id: activeServiceId } } }) return data } else { - const { data } = await client.GET("/api/alerts/") + const { data } = await client.GET("/api/alerts/", { signal }) return data } }, @@ -534,6 +538,11 @@ function CreateAlertForm({ initialAlert, onSuccess }: { initialAlert?: Alert | n const metricField = React.useMemo(() => catalog?.fields?.find(f => f.id === metric), [catalog, metric]) + const { timezone } = useTimezoneStore() + const startTime = React.useMemo(() => previewData?.times?.[0], [previewData]) + const endTime = React.useMemo(() => previewData?.times?.[previewData?.times?.length - 1], [previewData]) + const timeLayout = useTimeLayout(startTime, endTime, timezone) + const getHoverTemplate = React.useCallback((m: string, label?: string) => 
{ const pre = label ? `${label}: ` : '' const field = m === metric ? metricField : catalog?.fields?.find(f => f.id === m) @@ -898,20 +907,14 @@ function CreateAlertForm({ initialAlert, onSuccess }: { initialAlert?: Alert | n }] : []) ]} layout={{ - hovermode: 'x unified', + ...timeLayout, margin: { t: 10, r: 10, l: 40, b: 30 }, - showlegend: true, - legend: { orientation: 'h', y: 1.15, x: 1, xanchor: 'right', yanchor: 'bottom' }, paper_bgcolor: 'transparent', plot_bgcolor: 'transparent', xaxis: { + ...timeLayout.xaxis, showgrid: false, - type: 'date', - zeroline: false, - nticks: 8, - tickangle: -45, - automargin: true, - tickformatstops: CHART_LAYOUT_DEFAULTS.tickformatstops + zeroline: false }, yaxis: { title: metricField?.unit || (metric === 'requests' ? 'reqs' : ''), diff --git a/frontend/app/charts/loading.tsx b/frontend/app/charts/loading.tsx new file mode 100644 index 00000000..31dce024 --- /dev/null +++ b/frontend/app/charts/loading.tsx @@ -0,0 +1,5 @@ +import { ChartsGridSkeleton } from '@/components/skeletons/PageSkeleton' + +export default function Loading() { + return +} diff --git a/frontend/app/charts/page.tsx b/frontend/app/charts/page.tsx index 1c022f25..fc45322d 100644 --- a/frontend/app/charts/page.tsx +++ b/frontend/app/charts/page.tsx @@ -7,6 +7,7 @@ import { useFilterStore } from '@/stores/filterStore' import { useServiceStore } from '@/stores/serviceStore' import { useFilterPayload } from '@/hooks/useFilterPayload' import { useServiceQuery } from '@/hooks/useServiceQuery' +import { STALE_VIEW_RETRY_OPTIONS, throwIfStaleAggregates } from '@/lib/staleViewRetry' import { formatValue } from '@/lib/format' import { PlotlyChart } from '@/components/PlotlyChart' import { AnalyticsCard } from '@/components/AnalyticsCard' @@ -56,8 +57,8 @@ export default function ChartsPage() { const { data: aggregates, isLoading, isFetching } = useServiceQuery( ['charts', 'aggregates', activeServiceId, startTime, endTime, filterPayload], - async () => { - const { data 
} = await client.POST("/api/dashboard/aggregates", { + async ({ signal }) => { + const { data } = await client.POST("/api/dashboard/aggregates", { signal, body: { start_time: startTime, end_time: endTime, @@ -66,8 +67,9 @@ export default function ChartsPage() { chart_metric: 'requests' } }) - return data - } + return throwIfStaleAggregates(data) + }, + STALE_VIEW_RETRY_OPTIONS, ) const chartLayout = { diff --git a/frontend/app/dashboard/loading.tsx b/frontend/app/dashboard/loading.tsx new file mode 100644 index 00000000..3e863411 --- /dev/null +++ b/frontend/app/dashboard/loading.tsx @@ -0,0 +1,5 @@ +import { DashboardSkeleton } from '@/components/skeletons/PageSkeleton' + +export default function Loading() { + return +} diff --git a/frontend/app/dashboard/page.tsx b/frontend/app/dashboard/page.tsx index 49466e26..6300b10d 100644 --- a/frontend/app/dashboard/page.tsx +++ b/frontend/app/dashboard/page.tsx @@ -1,17 +1,39 @@ 'use client' import React from 'react' +import dynamic from 'next/dynamic' import { useCardVisibility } from '@/hooks/useCardVisibility' import { useQuery, keepPreviousData } from '@tanstack/react-query' import { useServiceQuery } from '@/hooks/useServiceQuery' import { client } from '@/lib/api' +import { STALE_VIEW_RETRY_OPTIONS, throwIfStaleAggregates } from '@/lib/staleViewRetry' import { useFilterStore } from '@/stores/filterStore' import { useServiceStore } from '@/stores/serviceStore' import { useIsDataReady } from '@/hooks/useIsDataReady' import { useFieldLabel } from '@/hooks/useFieldLabel' import { TimeSeriesChart } from '@/components/charts/TimeSeriesChart' import { FilterPopover } from '@/components/FilterPopover' -import { ChoroplethMap } from '@/components/Map/ChoroplethMap' +import { LazyMount } from '@/components/LazyMount' + +// ChoroplethMap pulls in d3-geo and the world-110m topojson. 
Static-import +// blocked the dashboard's initial JS parse/eval; dynamic-import slices it +// off the critical path so the rest of the page paints immediately. +// ssr:false because d3-geo uses canvas/SVG measurement APIs that don't +// work in the server-render pass. +const ChoroplethMap = dynamic( + () => import('@/components/Map/ChoroplethMap').then((m) => ({ default: m.ChoroplethMap })), + { + ssr: false, + loading: () => ( +
+ Loading map… +
+ ), + }, +) import { TopTenTable } from '@/components/Dashboard/TopTenTable' import { DashboardHeader } from '@/components/Dashboard/DashboardHeader' import { DataTable } from '@/components/DataTable' @@ -35,6 +57,7 @@ import { AnalyticsCard } from '@/components/AnalyticsCard' import { useShallow } from 'zustand/react/shallow' import { useLogFieldsCatalog } from '@/hooks/useLogFieldsCatalog' import { useDashboardCards } from '@/hooks/useDashboardCards' +import { FlagSessionPopover, type LabelValue } from '@/components/SessionScoring/FlagSessionPopover' // ── Constants ────────────────────────────────────────────────────────────────── @@ -127,6 +150,21 @@ const CATEGORIZED_CARD_IDS = new Set(CARD_CATEGORIES.flatMap(c => c.cardIds)) const COLLAPSED_SECTIONS_KEY = 'dashboard_collapsed_sections' +// Raw-logs panel: which columns to fetch. Previously the panel pulled SELECT * +// (~75 cols) on every dashboard load, which dominated /api/dashboard/raw time +// because wide text fields (ua, referer, url, ja3, etc.) bloat the parquet +// read. Default set covers the columns most users actually look at; everything +// else can be opted in via the column dropdown (which triggers a refetch). +// `timestamp` is always included so the default sort doesn't break. +const RAW_COLUMNS_STORAGE_KEY = 'dashboard_raw_columns' +const DEFAULT_RAW_COLUMNS = [ + 'timestamp', 'ip', 'country', 'host', 'url', 'method', + 'status', 'cache', 'elapsed', 'resp_bytes', 'ttfb', 'ua', 'edge_sid', +] +// Catalog ids that aren't real parquet columns and can't be returned per-row +// (they're aggregate-only views like the exploded waf_sig signal breakdown). 
+const RAW_DROPDOWN_EXCLUDE = new Set(['waf_sig_ind', 'edge_score_reason_ind', '_source_file']) + // ── Page ─────────────────────────────────────────────────────────────────────── export default function DashboardPage() { @@ -230,8 +268,8 @@ export default function DashboardPage() { const { data: aggregates, isLoading: isLoadingAggs, isFetching: isFetchingAggs } = useServiceQuery( ['dashboard', 'aggregates', activeServiceId, startTime, endTime, filterPayload, metric, config.effectiveInterval], - async () => { - const { data } = await client.POST("/api/dashboard/aggregates", { + async ({ signal }) => { + const { data } = await client.POST("/api/dashboard/aggregates", { signal, body: { start_time: startTime!, end_time: endTime!, @@ -240,14 +278,15 @@ export default function DashboardPage() { chart_interval: config.effectiveInterval } }) - return data - } + return throwIfStaleAggregates(data) + }, + STALE_VIEW_RETRY_OPTIONS, ) const { data: compareAggregates } = useQuery({ queryKey: ['dashboard', 'aggregates', 'compare', activeServiceId, compareStartTime, compareEndTime, filterPayload, metric, config.effectiveInterval], - queryFn: async () => { - const { data } = await client.POST("/api/dashboard/aggregates", { + queryFn: async ({ signal }) => { + const { data } = await client.POST("/api/dashboard/aggregates", { signal, body: { start_time: compareStartTime!, end_time: compareEndTime!, @@ -256,18 +295,47 @@ export default function DashboardPage() { chart_interval: config.effectiveInterval } }) - return data + return throwIfStaleAggregates(data) }, - enabled: isReady && compareMode && !!compareStartTime && !!compareEndTime + enabled: isReady && compareMode && !!compareStartTime && !!compareEndTime, + ...STALE_VIEW_RETRY_OPTIONS, }) const [sorting, setSorting] = React.useState([{ id: 'timestamp', desc: true }]) + // User-selected raw-log columns. 
`timestamp` is forced into the list + // because the default sort references it; without it the API picks an + // arbitrary sort col and the table feels broken. + const [selectedRawColumns, setSelectedRawColumns] = React.useState(() => { + if (typeof window === 'undefined') return DEFAULT_RAW_COLUMNS + try { + const raw = localStorage.getItem(RAW_COLUMNS_STORAGE_KEY) + const parsed = raw ? JSON.parse(raw) : null + if (Array.isArray(parsed) && parsed.length > 0) { + return parsed.includes('timestamp') ? parsed : ['timestamp', ...parsed] + } + } catch { /* fall through to default */ } + return DEFAULT_RAW_COLUMNS + }) + + const toggleRawColumn = React.useCallback((id: string, visible: boolean) => { + setSelectedRawColumns(prev => { + const set = new Set(prev) + if (visible) set.add(id) + else if (id !== 'timestamp') set.delete(id) + const next = Array.from(set) + try { + localStorage.setItem(RAW_COLUMNS_STORAGE_KEY, JSON.stringify(next)) + } catch { /* ignore quota / private-mode errors */ } + return next + }) + }, []) + const { data: rawLogs, isLoading: isLoadingRaw, isFetching: isFetchingRaw } = useServiceQuery( - ['dashboard', 'raw', activeServiceId, startTime, endTime, filterPayload, sorting], - async () => { + ['dashboard', 'raw', activeServiceId, startTime, endTime, filterPayload, sorting, selectedRawColumns], + async ({ signal }) => { const sort = sorting[0] - const { data } = await client.POST("/api/dashboard/raw", { + const { data } = await client.POST("/api/dashboard/raw", { signal, body: { start_time: startTime!, end_time: endTime!, @@ -276,7 +344,7 @@ export default function DashboardPage() { page: 1, sort_col: sort?.id, sort_dir: sort?.desc ? 
'desc' : 'asc', - columns: [] + columns: selectedRawColumns } }) return data @@ -285,8 +353,8 @@ export default function DashboardPage() { const { data: topBotsData } = useQuery({ queryKey: ['dashboard', 'top-bots', activeServiceId, startTime, endTime, filterPayload], - queryFn: async () => { - const { data } = await client.POST("/api/security/top-bots", { + queryFn: async ({ signal }) => { + const { data } = await client.POST("/api/security/top-bots", { signal, body: { start_time: startTime!, end_time: endTime!, @@ -495,12 +563,72 @@ export default function DashboardPage() { }, [addFilter]) // ── Raw logs columns ─────────────────────────────────────────────────────── - - const [rawVisibility, setRawVisibility] = React.useState>({}) + + // Catalog-driven option list for the raw-logs column dropdown. Lets + // users toggle on heavy fields (referer, ja4, etc.) that aren't in + // DEFAULT_RAW_COLUMNS — toggling refetches with the expanded set. + const rawColumnOptions = React.useMemo(() => { + const fields = (catalog?.fields as any[]) || [] + const seen = new Set() + const out: { id: string; label: string }[] = [] + for (const f of fields) { + if (!f?.id || RAW_DROPDOWN_EXCLUDE.has(f.id) || f.group === 'METRICS') continue + if (seen.has(f.id)) continue + seen.add(f.id) + out.push({ id: f.id, label: getFieldLabel(f.id) }) + } + // Defensive: ensure any currently-selected column not present in the + // catalog (e.g. custom field that bootstrap hasn't loaded yet) still + // shows up checked in the dropdown.
+ for (const id of selectedRawColumns) { + if (!seen.has(id)) { + seen.add(id) + out.push({ id, label: getFieldLabel(id) }) + } + } + return out + }, [catalog, getFieldLabel, selectedRawColumns]) + + const rawColumnVisibility = React.useMemo(() => { + const v: Record = {} + for (const opt of rawColumnOptions) v[opt.id] = selectedRawColumns.includes(opt.id) + return v + }, [rawColumnOptions, selectedRawColumns]) + + // hasSidCol still drives the FLAG-COLUMN render below — it can't + // be determined until rawLogs returns. labelsQuery, however, fires + // immediately on serviceId (see comment on labelsQuery below). + const hasSidCol = !!rawLogs?.columns?.includes('edge_sid') + + // Pull session-labels for the active service so we can render a + // colored Flag icon per row reflecting the current label state. + // Fire as soon as a serviceId is known — previously this was gated + // on `hasSidCol`, which created a real request waterfall: rawLogs + // took ~1s on prod, and this 10ms query couldn't start until then, + // blocking DataTable's first paint by the full rawLogs round-trip. + // The result is harmless when the service has no edge_sid column + // (the FLAG column simply doesn't render and the data goes unused). + const labelsQuery = useQuery({ + queryKey: ['scoring-labels', activeServiceId], + enabled: !!activeServiceId, + queryFn: async ({ signal }) => { + const { data, response } = await client.GET( + '/api/services/{service_id}/scoring/labels' as any, + { params: { path: { service_id: activeServiceId || '' } } } as any, + ) + if (!response.ok) throw new Error(`status ${response.status}`) + return data as { labels: Array<{ sid: string; label: LabelValue }> } + }, + }) + const labelBySid = React.useMemo(() => { + const m = new Map() + for (const l of labelsQuery.data?.labels ?? 
[]) m.set(l.sid, l.label) + return m + }, [labelsQuery.data]) const columns: ColumnDef[] = React.useMemo(() => { if (!rawLogs?.columns) return [] - return rawLogs.columns.map(col => ({ + const dataCols: ColumnDef[] = rawLogs.columns.map((col: string): ColumnDef => ({ id: col, accessorFn: (row) => row[col], meta: { label: getFieldLabel(col) }, @@ -544,7 +672,32 @@ export default function DashboardPage() { ) } })) - }, [rawLogs?.columns, full, abbr, addFilter, getFieldLabel]) + // Flag column: only shown when edge_sid is present in the schema + // (i.e. session scoring is enabled). Disabled for rows where the + // sid is empty (cookieless requests — already caught by L1). + if (hasSidCol && activeServiceId) { + dataCols.push({ + id: '__flag', + accessorFn: (_row: any) => '', + meta: { label: 'Flag' }, + header: 'Flag', + cell: ({ row }: { row: any }) => { + const sid = String(row.original['edge_sid'] ?? '') + return ( + + ) + }, + } as ColumnDef) + } + return dataCols + }, [rawLogs?.columns, full, abbr, addFilter, getFieldLabel, hasSidCol, activeServiceId, labelBySid]) const visibleCardList = React.useMemo( () => allCards.filter((c: any) => visibleCards.has(c.id)), @@ -770,6 +923,15 @@ export default function DashboardPage() { {/* ── Aggregation cards ── */} {visibleCardList.length > 0 && (() => { const visibleById = new Map(visibleCardList.map((c: any) => [c.id, c])) + // Wrap each card in LazyMount so the FIRST dashboard paint + // only mounts the cards above the fold (~5-10) instead of + // all 86. Off-screen cards land as the user scrolls — the + // rootMargin of 600px (one screen) pre-mounts before the + // user actually reaches them, so they feel instant. Cuts + // initial DOM nodes from ~860 to ~100 and skips ~80 + // TopTenTable mount cycles on first render. The loading + // placeholder branch is NOT wrapped — it's already cheap + // and we want every "Initializing..." tile visible. 
const renderCard = (card: any) => { if (!isReady || (isLoadingAggs && !aggregates)) { return ( @@ -782,47 +944,50 @@ export default function DashboardPage() { } if (card.id === '_bot_name') { return ( - } - field="_bot_name" - inActiveFormat={card.inActiveFormat} - data={{ - total: topBotsData?.bots?.reduce((acc: number, b: any) => acc + b.request_count, 0) || 0, - top: (topBotsData?.bots ?? []).map((b: any) => ({ value: b.id, label: b.name, count: b.request_count })) - }} - compareData={undefined} - onRowClick={handleRowClick} - /> + + } + field="_bot_name" + inActiveFormat={card.inActiveFormat} + data={{ + total: topBotsData?.bots?.reduce((acc: number, b: any) => acc + b.request_count, 0) || 0, + top: (topBotsData?.bots ?? []).map((b: any) => ({ value: b.id, label: b.name, count: b.request_count })) + }} + compareData={undefined} + onRowClick={handleRowClick} + /> + ) } if (card.id === '_ngwaf_bot_name') { return ( + + acc + b.request_count, 0), + top: (topBotsData?.ngwaf_bots ?? []).map((b: any) => ({ value: b.name, label: b.name, count: b.request_count })) + }} + compareData={undefined} + onRowClick={handleRowClick} + /> + + ) + } + return ( + acc + b.request_count, 0), - top: (topBotsData?.ngwaf_bots ?? []).map((b: any) => ({ value: b.name, label: b.name, count: b.request_count })) - }} - compareData={undefined} + data={aggregates?.data?.[card.id]} + compareData={compareMode ? compareAggregates?.data?.[card.id] : undefined} onRowClick={handleRowClick} /> - ) - } - return ( - + ) } @@ -886,13 +1051,12 @@ export default function DashboardPage() { contentClassName="p-0" headerAction={
- {rawLogs?.columns && ( - ({ id: c, label: getFieldLabel(c) }))} - visibility={rawVisibility} - onChange={(id, vis) => setRawVisibility(prev => ({ ...prev, [id]: vis }))} - /> - )} + +
+ ) : ( +
+ {locked ? 'disabled (delete_after=false)' : fmtDays(value)} +
+ )} +
+ ) +} diff --git a/frontend/components/PlotlyChart/PlotlyChart.tsx b/frontend/components/PlotlyChart/PlotlyChart.tsx index 29d465b8..aa50bae5 100644 --- a/frontend/components/PlotlyChart/PlotlyChart.tsx +++ b/frontend/components/PlotlyChart/PlotlyChart.tsx @@ -1,10 +1,31 @@ 'use client' -import React, { useRef, useEffect } from 'react' +import React, { useRef, useEffect, useState } from 'react' import dynamic from 'next/dynamic' import { useTheme } from 'next-themes' -const Plot = dynamic(() => import('react-plotly.js'), { ssr: false }) +// Use the cartesian-only Plotly distribution via react-plotly.js's factory +// API. The default `import 'react-plotly.js'` pulls full plotly.js (~4.7 MB +// minified) — we only render scatter / line / bar / pie / heatmap (see the +// dashboard repository), all of which are covered by the cartesian build +// (~1.4 MB minified, ~3.4× smaller). Initial dashboard render felt the +// difference: full-Plotly chunk fetch + parse on every fresh dashboard +// hit was visibly delaying the time-series chart's first paint behind the +// rest of the page. The factory pattern lets us load the leaner Plotly +// without touching every PlotlyChart caller. +const Plot = dynamic( + async () => { + const [{ default: createPlotlyComponent }, plotlyModule] = await Promise.all([ + import('react-plotly.js/factory'), + // No types ship with the cartesian-dist-min package — the runtime + // shape (`{Plot, plot, react, ...}` plus trace-type registrations) + // matches the full plotly.js for everything the factory needs. 
+ import('plotly.js-cartesian-dist-min' as any) as any, + ]) + return createPlotlyComponent(plotlyModule.default || plotlyModule) + }, + { ssr: false }, +) interface PlotlyChartProps { data: any[] @@ -94,18 +115,46 @@ export const PlotlyChart = React.memo(function PlotlyChart({ ...config } + // Viewport gate: don't trigger the dynamic-import of the 1.4MB + // plotly.js-cartesian-dist chunk until this chart is within 600px of + // the viewport. `dynamic(...)` only starts fetching when `Plot` is + // actually rendered, so withholding the render = withholding the + // chunk fetch. Charts already in range mount on the observer's first + // callback; `visible` starts out true (gate skipped) when no + // IntersectionObserver exists, e.g. SSR or older browsers. + const containerRef = useRef(null) + const [visible, setVisible] = useState(() => typeof IntersectionObserver === 'undefined') + + useEffect(() => { + if (visible || !containerRef.current || typeof IntersectionObserver === 'undefined') return + const node = containerRef.current + const observer = new IntersectionObserver( + ([entry]) => { + if (entry.isIntersecting) { + setVisible(true) + observer.disconnect() + } + }, + { rootMargin: '600px' }, + ) + observer.observe(node) + return () => observer.disconnect() + }, [visible]) + return ( -
- +
+ {visible ? ( + + ) : null}
) }) diff --git a/frontend/components/QueryProvider.tsx b/frontend/components/QueryProvider.tsx index ae5eab6b..add88cba 100644 --- a/frontend/components/QueryProvider.tsx +++ b/frontend/components/QueryProvider.tsx @@ -13,7 +13,17 @@ export default function QueryProvider({ children }: { children: React.ReactNode const [queryClient] = useState(() => new QueryClient({ defaultOptions: { queries: { + // staleTime: queries stay "fresh" for 30s after fetch. Repeat + // navigations to a route within 30s skip the network entirely + // — that's the difference between "click → instant snapshot" + // vs "click → spinner → repaint" for revisits. staleTime: 30 * 1000, + // gcTime: keep cached data in memory for 5 min after the last + // subscriber unmounts. This equals React Query's built-in + // default; it is spelled out so the click-back behaviour + // (leave a page, return within minutes, reuse the cache) is an + // explicit choice that a library default change cannot alter. + gcTime: 5 * 60 * 1000, refetchOnWindowFocus: false, }, }, diff --git a/frontend/components/ReportShell.tsx b/frontend/components/ReportShell.tsx index fd1bfd74..fdd7cc14 100644 --- a/frontend/components/ReportShell.tsx +++ b/frontend/components/ReportShell.tsx @@ -9,6 +9,7 @@ import { NoServiceSelected } from '@/components/NoServiceSelected' import { PageHeader } from '@/components/ui/page-header' import { Loader2, LucideIcon } from 'lucide-react' import { cn } from '@/lib/utils' +import { DashboardSkeleton } from '@/components/skeletons/PageSkeleton' interface ReportShellProps { title: string @@ -57,14 +58,13 @@ export function ReportShell({ {!isReady ? ( -
-
- -

- Initializing analysis... -

-
-
+ // Content-shaped skeleton instead of a centered spinner. The + // prior "Initializing analysis…" loader was small and centered + // in 400px of empty space — users perceived it as "the page + // isn't loading" because it didn't look like real content + // taking shape. The skeleton mirrors the dashboard layout so + // the swap to real content doesn't reflow the page. + ) : ( children )} diff --git a/frontend/components/ServiceSwitcher/ServiceSwitcher.tsx b/frontend/components/ServiceSwitcher/ServiceSwitcher.tsx index a94b867e..e6fa0802 100644 --- a/frontend/components/ServiceSwitcher/ServiceSwitcher.tsx +++ b/frontend/components/ServiceSwitcher/ServiceSwitcher.tsx @@ -20,16 +20,20 @@ import { PopoverTrigger, } from '@/components/ui/popover' import { useServiceStore } from '@/stores/serviceStore' -import { useBootstrap } from '@/hooks/useBootstrap' import { buttonVariants } from '@/components/ui/button' +// Pre-fix this component also called useBootstrap() to "ensure services are +// loaded". AppLayout (which wraps this) already calls useBootstrap and +// populates useServiceStore as a side effect — having ServiceSwitcher call +// it too added a second hook subscription that triggered extra renders on +// every bootstrap settle, even though React Query deduped the network +// request itself. The store is the right read source. 
export function ServiceSwitcher() { const [open, setOpen] = React.useState(false) const { services, activeServiceId, setActiveServiceId } = useServiceStore() const router = useRouter() const pathname = usePathname() - useBootstrap() // Ensure services are loaded and first service is auto-selected const activeService = services.find((s) => s.id === activeServiceId) diff --git a/frontend/components/SessionScoring/AuditLogTab.tsx b/frontend/components/SessionScoring/AuditLogTab.tsx new file mode 100644 index 00000000..02b541f2 --- /dev/null +++ b/frontend/components/SessionScoring/AuditLogTab.tsx @@ -0,0 +1,176 @@ +'use client' + +import * as React from 'react' +import { useQuery } from '@tanstack/react-query' +import { ScrollText, Info } from 'lucide-react' + +import { AnalyticsCard } from '@/components/AnalyticsCard' +import { AuditLogHelp } from '@/components/SessionScoring/help-content' +import { Badge } from '@/components/ui/badge' +import { Button } from '@/components/ui/button' +import { Skeleton } from '@/components/ui/skeleton' +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from '@/components/ui/table' +import { client } from '@/lib/api' +import { formatTimeAgo } from '@/lib/date' + +interface AuditLogTabProps { + serviceId: string +} + +interface AuditRow { + id: number | string + timestamp: string + service_id: string + action: string + actor: string | null + details: unknown +} + +interface AuditResponse { + audit: AuditRow[] + limit?: number +} + +const AUDIT_LIMIT = 200 + +function detailsToString(details: unknown): string { + if (details == null) return '' + if (typeof details === 'string') return details + try { + return JSON.stringify(details) + } catch { + return String(details) + } +} + +function truncate(s: string, max: number) { + if (s.length <= max) return s + return s.slice(0, max) + '…' +} + +export function AuditLogTab({ serviceId }: AuditLogTabProps) { + const query = useQuery({ + queryKey: 
['scoring-audit', serviceId, AUDIT_LIMIT], + queryFn: async () => { + const { data, response } = await client.GET( + '/api/services/{service_id}/scoring/audit' as any, + { + params: { + path: { service_id: serviceId }, + query: { limit: AUDIT_LIMIT }, + }, + } as any, + ) + if (!response.ok) throw new Error(`status ${response.status}`) + return data as AuditResponse + }, + staleTime: 30_000, + }) + + if (query.isError) { + return ( + } + helpContent={} + helpTitle="About Scoring Audit Log" + > +
+
+ + Failed to load audit log +
+

+ {(query.error as any)?.message || 'Unknown error'} +

+ +
+
+ ) + } + + const rows = query.data?.audit ?? [] + + return ( + } + description="Recent operator actions on this service's scoring config (most-recent first)." + helpContent={} + helpTitle="About Scoring Audit Log" + isLoading={query.isLoading} + isFetching={query.isFetching} + contentClassName="p-0" + > + {query.isLoading ? ( +
+ {Array.from({ length: 3 }).map((_, i) => ( + + ))} +
+ ) : ( +
+ + + + Timestamp + Action + Actor + Details + + + + {rows.length === 0 && ( + + + No audit events yet. Mutations to scoring (enable, retrain, + rotate key, etc.) will appear here. + + + )} + {rows.map((r) => { + const details = detailsToString(r.details) + return ( + + + {formatTimeAgo(r.timestamp) || r.timestamp} + + + + {r.action} + + + + {r.actor || '—'} + + + {details ? truncate(details, 60) : '—'} + + + ) + })} + +
+
+ )} +
+  )
+}
diff --git a/frontend/components/SessionScoring/ComplianceChart.tsx b/frontend/components/SessionScoring/ComplianceChart.tsx
new file mode 100644
index 00000000..0eb27655
--- /dev/null
+++ b/frontend/components/SessionScoring/ComplianceChart.tsx
@@ -0,0 +1,75 @@
+'use client'
+
+import { useQuery } from '@tanstack/react-query'
+
+import { client } from '@/lib/api'
+import { ComplianceHelp } from '@/components/SessionScoring/help-content'
+
+import { StackedHourlyBarChart } from './StackedHourlyBarChart'
+
+/** Props for {@link ComplianceChart}. */
+interface ComplianceChartProps {
+  /** Service ID substituted into the `{service_id}` path segment of the request. */
+  serviceId: string
+  /** Look-back window in hours, sent as `since_hours`; defaults to 24. */
+  sinceHours?: number
+}
+
+/** One entry of the `rows` array returned by the compliance-breakdown endpoint. */
+interface CompRow {
+  hour: string
+  /** Category value; the chart is told to group on this field via `categoryKey`. */
+  compliance: string
+  count: number
+}
+
+/** Hex colours keyed by compliance category name, passed to the chart as `colors`. */
+const COMPLIANCE_COLORS: Record<string, string> = {
+  ok: '#10b981',
+  missing: '#94a3b8',
+  tampered: '#e11d48',
+  expired: '#f59e0b',
+  unknown: '#7c3aed',
+}
+
+/**
+ * Hourly cookie-compliance chart for one service.
+ *
+ * Fetches the compliance breakdown for the last `sinceHours` hours and hands
+ * the rows to `StackedHourlyBarChart` with `compliance` as the category key.
+ * A non-OK response is thrown as an `Error` from the query function.
+ */
+export function ComplianceChart({ serviceId, sinceHours = 24 }: ComplianceChartProps) {
+  const { data, isLoading, isFetching } = useQuery({
+    queryKey: ['scoring-compliance', serviceId, sinceHours],
+    queryFn: async () => {
+      // NOTE(review): the `as any` casts suggest this path is not in the typed
+      // client schema — confirm, and drop them once it is.
+      const { data, response } = await client.GET(
+        '/api/services/{service_id}/scoring/compliance-breakdown' as any,
+        {
+          params: {
+            path: { service_id: serviceId },
+            query: { since_hours: sinceHours },
+          },
+        } as any,
+      )
+      if (!response.ok) throw new Error(`status ${response.status}`)
+      return data as { rows: CompRow[] }
+    },
+  })
+
+  return (
+    <StackedHourlyBarChart<CompRow>
+      title={`Cookie compliance — last ${sinceHours}h`}
+      description="Breakdown of edge_cookie_compliance per hour. 'missing' is the canonical bot signal (no cookie at all); 'tampered' is the post-cookie threat (someone modified the payload)."
+      helpContent={<ComplianceHelp />}
+      helpTitle="About Cookie Compliance"
+      isLoading={isLoading}
+      isFetching={isFetching}
+      rows={data?.rows ?? []}
+      categoryKey="compliance"
+      colors={COMPLIANCE_COLORS}
+    />
+  )
+}
diff --git a/frontend/components/SessionScoring/ExcludeRegexCard.tsx b/frontend/components/SessionScoring/ExcludeRegexCard.tsx
new file mode 100644
index 00000000..eef4881e
--- /dev/null
+++ b/frontend/components/SessionScoring/ExcludeRegexCard.tsx
@@ -0,0 +1,368 @@
+'use client'
+
+import * as React from 'react'
+import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
+import { AlertTriangle, Check, Filter, Loader2, RotateCcw } from 'lucide-react'
+
+import { AnalyticsCard } from '@/components/AnalyticsCard'
+import { Alert, AlertDescription } from '@/components/ui/alert'
+import { Button } from '@/components/ui/button'
+import { ConfirmDialog } from '@/components/ui/confirm-dialog'
+import { Label } from '@/components/ui/label'
+import { Skeleton } from '@/components/ui/skeleton'
+import { Textarea } from '@/components/ui/textarea'
+import { client } from '@/lib/api'
+
+interface ExcludeRegexResponse {
+  current: string
+  is_default: boolean
+  default: string
+  effective: string
+}
+
+interface ExcludeRegexCardProps {
+  serviceId: string
+}
+
+/**
+ * Operator-facing control for "which URLs bypass the scoring Compute call".
+ *
+ * The backend uses the result as the regex on the right-hand side of
+ * `std.tolower(req.url) !~ ""` in the scoring recv VCL snippet —
+ * a match means the request is NOT routed to Compute (saves cost on
+ * static assets / health checks / etc.).
+ *
+ * The default matches common static-asset file extensions; the operator
+ * can override per-service. The override is validated through three
+ * layers (input policy + falco static analysis + Fastly VCL compile)
+ * before the cloned version is activated.
+ */
+// Cached on-blur lint result.
`regex` records the last value we asked the +// backend about so we can skip the round-trip if the operator blurs out +// after clicking right back to the same content; `error` / `lint_warnings` +// hold the verdict to render inline. +type LintResult = + | { regex: string; ok: true; lint_warnings: string[] } + | { regex: string; ok: false; error: string; reason: string } + +export function ExcludeRegexCard({ serviceId }: ExcludeRegexCardProps) { + const queryClient = useQueryClient() + const [draft, setDraft] = React.useState('') + const [confirmOpen, setConfirmOpen] = React.useState(false) + const [lintResult, setLintResult] = React.useState(null) + const [lintPending, setLintPending] = React.useState(false) + + const { data, isLoading } = useQuery({ + queryKey: ['scoring-exclude-regex', serviceId], + queryFn: async () => { + const { data, response } = await client.GET( + '/api/services/{service_id}/scoring/exclude-regex' as any, + { params: { path: { service_id: serviceId } } } as any, + ) + if (!response.ok) throw new Error(`status ${response.status}`) + return data as ExcludeRegexResponse + }, + staleTime: 60_000, + }) + + // Initialise the draft from the stored value once the fetch completes. + // ``enable_scoring`` populates cfg.scoring.exclude_url_regex with the + // literal default on first turn-on, so `data.current` is always a real + // regex (never empty). Fall back to `data.effective` for legacy services + // enabled before that change landed (their cfg still has null). Don't + // reset on re-renders — the operator may have been typing. + const initialisedRef = React.useRef(false) + React.useEffect(() => { + if (data && !initialisedRef.current) { + setDraft(data.current || data.effective) + initialisedRef.current = true + } + }, [data]) + + const isDirty = data ? 
draft !== (data.current || data.effective) : false + + // "Reset & publish" vs "Save & publish" button label: the operator is + // resetting iff the draft matches the bundled default AND the current + // stored value is something else (a custom override). After publish + // the cfg lands back at the default literal — semantically a reset. + const isResetToDefault = data ? draft === data.default && !data.is_default : false + + const saveMut = useMutation({ + mutationFn: async (regex: string) => { + // Omit the ``token`` query param entirely — the backend's + // ``_resolve_token`` falls back to the cfg-stored ``fastly_api_key`` + // when one isn't supplied, which is the same pattern the + // enforce-threshold + enforce-status-code endpoints rely on. The + // operator only needed to type a token here if they wanted to + // override the stored key, which is almost never the case in + // practice. If the operator does need to override later, expose a + // collapsible "advanced" affordance — don't make every edit prompt. + const { data: resp, response } = await client.PUT( + '/api/services/{service_id}/scoring/exclude-regex' as any, + { + params: { + path: { service_id: serviceId }, + query: { confirm: true }, + }, + body: { regex } as any, + } as any, + ) + if (!response.ok) { + // Surface backend's structured detail.error to the toast/alert. + const err = (resp as any)?.detail?.error || (resp as any)?.detail || `HTTP ${response.status}` + throw new Error(typeof err === 'string' ? 
err : JSON.stringify(err)) + } + return resp as { ok: true; effective_regex: string; is_default: boolean; message: string; lint_warnings: string[] } + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['scoring-exclude-regex', serviceId] }) + setConfirmOpen(false) + }, + }) + + const handleSave = () => { + saveMut.mutate(draft) + } + + const handleResetToDefault = () => { + if (data) setDraft(data.default) + } + + // Pre-publish dry-run lint: fires on textarea blur if the draft has + // changed since the last check. Hits the dedicated + // ``/scoring/exclude-regex/validate`` endpoint which runs the same + // input-policy + falco static-analysis layers the publish flow uses, + // but without touching cfg or Fastly. Gives the operator immediate + // feedback BEFORE they commit to the save+publish round-trip. + const runLintCheck = React.useCallback( + async (regex: string) => { + // Skip when nothing's there to lint or we've already validated this + // exact string — avoids spamming falco when the operator clicks + // around the card. + if (regex.trim() === '') { + setLintResult(null) + return + } + if (lintResult && lintResult.regex === regex) return + setLintPending(true) + try { + const { data: resp, response } = await client.POST( + '/api/services/{service_id}/scoring/exclude-regex/validate' as any, + { + params: { path: { service_id: serviceId } }, + body: { regex } as any, + } as any, + ) + if (!response.ok) { + const err = + (resp as any)?.detail?.error || (resp as any)?.detail || `HTTP ${response.status}` + setLintResult({ + regex, + ok: false, + error: typeof err === 'string' ? 
err : JSON.stringify(err), + reason: 'http_error', + }) + return + } + const r = resp as + | { ok: true; lint_warnings: string[] } + | { ok: false; error: string; reason: string } + if (r.ok) { + setLintResult({ regex, ok: true, lint_warnings: r.lint_warnings || [] }) + } else { + setLintResult({ regex, ok: false, error: r.error, reason: r.reason }) + } + } catch (e) { + setLintResult({ + regex, + ok: false, + error: e instanceof Error ? e.message : String(e), + reason: 'network_error', + }) + } finally { + setLintPending(false) + } + }, + [serviceId, lintResult], + ) + + // Clear the cached lint verdict whenever the operator edits the draft — + // stale "passed" indicators would mislead about the CURRENT value. + const handleDraftChange = (v: string) => { + setDraft(v) + if (lintResult && lintResult.regex !== v) setLintResult(null) + } + + if (isLoading) { + return ( + }> + + + ) + } + + if (!data) { + return ( + }> + + + Failed to load current exclusion regex. + + + ) + } + + return ( + } + description="Requests whose URL matches this regex are NOT sent to the scoring Compute service. The default skips common static-asset extensions; override to scope scoring to specific traffic patterns." + > +
+
+ +