diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 66a9994e..00c0e2ae 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,6 +40,25 @@ jobs:
sudo curl -sL https://github.com/ysugimoto/falco/releases/latest/download/falco-linux-amd64 -o /usr/local/bin/falco
sudo chmod +x /usr/local/bin/falco
+ - name: Install gitleaks
+ # Same curl-binary-to-PATH pattern as falco above. Version pinned so
+ # a detector-rule change doesn't suddenly fail an unrelated PR; bump
+ # deliberately when wanted. Mirrors `.pre-commit-config.yaml`.
+ run: |
+ GITLEAKS_VERSION=8.30.1
+ sudo curl -sSfL "https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}/gitleaks_${GITLEAKS_VERSION}_linux_x64.tar.gz" \
+ | sudo tar -xz -C /usr/local/bin gitleaks
+ sudo chmod +x /usr/local/bin/gitleaks
+ gitleaks version
+
+ - name: Secret scan (gitleaks)
+ # Scans the checked-out git history (full history needs `fetch-depth: 0`
+ # on actions/checkout) against the .gitleaks.toml allowlist.
+ # `--exit-code 1` is the default; explicit for clarity. Anything the
+ # allowlist doesn't cover fails the build with a redacted diagnostic —
+ # see CONTRIBUTING.md / AGENTS.md for the suppression playbook.
+ run: gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1
+
- name: Install terraform
# Required by tests/utils/test_terraform_gen.py — runs `terraform fmt`
# against generator output and `validate` when TERRAFORM_VALIDATE=1.
diff --git a/.gitignore b/.gitignore
index 1e2088e1..33202558 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,10 @@ setup-state.json
*.duckdb
*.duckdb.wal
/configs/*
+# Security: the SSH known_hosts pin IS source-controlled — it's the
+# trust anchor for the reverse-tunnel host-key check. Override the
+# blanket configs/* ignore.
+!/configs/ssh_known_hosts
/data/*
/data/system/*
__pycache__/
@@ -36,6 +40,43 @@ node_modules/
/.antigravitycli
+# Local Swival tool state (REPL history, per-session audit dirs). Per-user.
+.swival/
+/audit-findings/
+
+# Ad-hoc debug screenshots dropped at the repo root by browser-driven
+# verification sessions. Intentional docs/assets/*.png are tracked; these
+# root-level ones are throwaway.
+/test_*.png
+
# Ad-hoc lint/test output dumps
frontend/*_out.txt
frontend/*_output.txt
+
+# Session-scoring fixture extracts contain real prod IPs / UAs / payloads.
+# Reproducible via scripts/scoring/extract_traces.py against local data.
+tests/fixtures/scoring/
+
+# Trained matrix.json carries real customer route names. Regenerable via
+# scripts/scoring/train.py against a fresh trace extract.
+compute/scorer/matrix.json
+
+# Rust build artifacts.
+compute/scorer/target/
+compute/scorer/bin/
+compute/scorer/pkg/
+
+# Per-deployment secrets: AES cookie keys, deploy-time IDs the service files
+# might reference. NEVER commit.
+.scoring/
+.aider*
+
+# Ad-hoc working directory for local profiling — HAR captures, per-page JSON
+# summaries, query trace dumps. The reusable harness scripts (profile.js,
+# split_per_page.py) live here for now; treat the whole tree as throwaway.
+/scratch/
+
+# Local-only VS Code config (file-watcher / Pylance excludes for the
+# regenerating .next + cache trees). Personal to each contributor's editor
+# setup — not promoted to the repo by default.
+.vscode/
diff --git a/.gitleaks.toml b/.gitleaks.toml
new file mode 100644
index 00000000..f09b2c4a
--- /dev/null
+++ b/.gitleaks.toml
@@ -0,0 +1,61 @@
+# gitleaks configuration — extends the built-in detector set with this
+# repo's allowlist for tracked test fixtures, Rust lockfile checksums,
+# and the public SSH host key.
+#
+# Run locally: make secret-scan
+# Pre-commit: installed via .pre-commit-config.yaml
+# CI: invoked by .github/workflows/ci.yml
+#
+# Suppression mechanisms in increasing scope:
+# - inline `#gitleaks:allow` on the offending line
+# - .gitleaksignore — fingerprint list for one-off historical findings
+# - this file's [allowlist] paths — for whole files / directories
+
+[extend]
+# Inherit gitleaks' built-in ruleset (~100 detectors: AWS, GCP, Azure,
+# GitHub, GitLab, Slack, Stripe, Twilio, Mailgun, Square, PyPI, npm,
+# generic-api-key, private-key, etc.). The default config also
+# allowlists npm/yarn/pnpm/poetry/go.mod/go.sum/node_modules/venv —
+# we add the gaps below.
+useDefault = true
+
+[allowlist]
+description = "fastly-log-analytics — tracked test fixtures, Rust lockfile, public keys"
+
+# Path-based allowlist. Regex matched against the file's path relative
+# to the repo root. Only entries for TRACKED files matter for the
+# default git-history scan; the `.next/`, `configs/`, and `data/` entries
+# below also keep ad-hoc `gitleaks detect --no-git` working-tree runs clean.
+paths = [
+ # Rust dependency lockfile — sha256 checksums look like generic
+ # API keys to gitleaks. Not covered by the built-in lockfile allowlist.
+ '''^compute/scorer/Cargo\.lock$''',
+
+ # Tracked, intentional test fixtures
+ '''^compute/scorer/fixtures/local-dictionary\.json$''', # placeholder AES key = 0x00..0x1f
+ '''^compute/scorer/src/cookie\.rs$''', # test-mode constants
+ '''^tests/scoring/.*\.py$''', # cookie/scoring test fixtures
+ '''^tests/repositories/test_alerts\.py$''', # zeros Slack webhook fixture
+ '''^tests/utils/test_sql_validator\.py$''', # blocked-function NAMES (e.g. "AWS_SECRET_ACCESS_KEY")
+
+ # Public SSH host key for localhost.run — sharing is the entire point
+ # (trust anchor for the reverse-tunnel host-key check).
+ '''^configs/ssh_known_hosts$''',
+
+ # Documentation: release notes and runbooks may reference example
+ # tokens / credentials in prose.
+ '''^docs/''',
+ '''^CHANGELOG\.md$''',
+ '''^AGENTS\.md$''',
+
+ # Working-tree-only artifacts (all gitignored; matter only for
+ # ad-hoc `--no-git` runs). gitleaks uses Go's RE2 engine, which
+ # doesn't support negative lookahead, so we list the per-service
+ # config filename pattern explicitly rather than "everything under
+ # configs/ except ssh_known_hosts".
+ '''^frontend/\.next/''', # Next.js build cache
+ '''^configs/.*\.json(\.bak.*)?$''', # real per-service Fastly configs (gitignored)
+ '''^data/''', # real SSH share key, share DB, runtime data
+ '''.*/__pycache__/''', # Python bytecode
+ '''\.pyc$''',
+]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 34317f91..5a150d76 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,6 +26,20 @@ repos:
- id: check-merge-conflict
- id: debug-statements
+ # Secret scanner. Blocks commits that introduce credentials / API keys /
+ # private keys / tokens. Configured via .gitleaks.toml at repo root
+ # (extends gitleaks' default ruleset with this repo's allowlist for
+ # tracked test fixtures and Rust lockfile checksums). Re-run with
+ # `make secret-scan` locally; CI runs the same invocation.
+ #
+ # If a legitimate placeholder trips the scanner, suppress with:
+ # - inline `#gitleaks:allow` on the line, OR
+ # - add the file/path glob to .gitleaks.toml [allowlist] paths
+ - repo: https://github.com/gitleaks/gitleaks
+ rev: v8.30.1
+ hooks:
+ - id: gitleaks
+
# Regenerate the committed OpenAPI snapshot + typed frontend client
# whenever the FastAPI surface or the generator script changes. If the
# regenerated files differ from the staged version, pre-commit fails
@@ -40,3 +54,9 @@ repos:
language: system
pass_filenames: false
entry: bash -c 'cd frontend && npm run --silent gen:types'
+ - id: typecheck-frontend
+ name: Typecheck frontend
+ files: ^frontend/.*\.(ts|tsx)$
+ language: system
+ pass_filenames: false
+ entry: bash -c 'cd frontend && npx tsc --noEmit'
diff --git a/AGENTS.md b/AGENTS.md
index 86234af6..605de0e6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -225,6 +225,19 @@ Removes the FOS logging endpoint from the Fastly service, the CDN VCL service, t
## Frontend Patterns
+> **REQUIRED READING before any frontend work:**
+> [`frontend/node_modules/next/dist/docs/`](frontend/node_modules/next/dist/docs/)
+> — the Next.js 16 App Router docs are vendored locally. Read the relevant
+> sections (loading.tsx, prefetching, streaming, instant-navigation, caching,
+> linking-and-navigating) BEFORE proposing or implementing changes to
+> components / pages / hooks. **Click-feel bugs are almost always a Next
+> conventions violation that the docs would have flagged.** Past failures
+> from skipping this: shipping pages without `loading.tsx`, blocking
+> layouts on uncached data, per-instance `setInterval` storms, missing
+> `signal` cancellation, polling intervals tuned for "live feel" not
+> backend cost. The conventions section below distills the rules but
+> defer to the docs for any pattern not listed.
+
**Stack:** Next.js 16 app router, React 19, TanStack Query v5, Zustand, shadcn/ui, Recharts, openapi-fetch.
**Type-safe client:**
@@ -254,6 +267,100 @@ A global middleware in [frontend/lib/api.ts](frontend/lib/api.ts) checks `respon
9. **`empty_schema_response(runner)`** in [_base.py](backend/repositories/_base.py) — return this when a repo function hits a service with no logs.
10. **`origin_latency_us_expr(actual_cols)`** in `_base.py` — don't hand-roll the `COALESCE("ottfb", "ttfb" * 1000000.0)` fragment.
+### Next.js navigation + loading conventions (READ BEFORE TOUCHING FRONTEND)
+
+Distilled from `frontend/node_modules/next/dist/docs/` — these are the
+rules to follow so click-to-render feels instant. Failure modes I've shipped
+before and you should not repeat:
+
+**1. Every navigable route MUST have a `loading.tsx`.** Without it, dynamic
+routes (all our `'use client'` pages) get NO prefetched fallback — the
+browser sits on the previous page until the destination's JS is ready and
+its useQueries have settled. With it, Next.js renders the skeleton the
+instant the user clicks. Use a variant from
+[components/skeletons/PageSkeleton.tsx](frontend/components/skeletons/PageSkeleton.tsx)
+— don't hand-roll Array.from + Skeleton inline.
+
+**2. Layouts MUST NOT block on uncached data.** If `app/layout.tsx` or any
+shared layout awaits a fetch / accesses cookies / etc. before rendering
+children, **`loading.tsx` will not show a fallback at all** — Next.js waits
+for the layout to settle first. The previous fix to `AppLayout` removed an
+`isLoading ? <Fallback /> : children` gate that was doing exactly this; any
+new layout-level data must use `useQuery` with `staleTime` so re-renders
+are cheap, and the layout must never short-circuit children behind a
+loading boolean.
+
+**3. Cancel in-flight queries on every route change.** AppLayout's
+`useEffect([pathname])` calls `queryClient.cancelQueries({ type: 'active' })`
+so the old page's leftover polls (e.g. SystemHealthCard's 10s health-snapshot
+poll) don't compete with the new page's mount work. **Always thread `signal`
+through queryFns** so cancellation actually aborts the network request —
+this hasn't been done universally yet, but new queryFns should follow:
+```typescript
+queryFn: async ({ signal }) => {
+ const { data } = await client.GET(..., { signal })
+ return data
+}
+```
+
+**4. Poll intervals must respect backend cost.** Default is 10s+. The
+SystemHealthCard fix bumped a 2s poll to 10s because the endpoint took 1-1.7s
+under load — at 2s polling that was constant backend pressure. If real-time
+updates matter, add a manual Refresh button, don't poll faster than 5s.
+Always set `refetchIntervalInBackground: false` so background tabs don't
+keep hammering.
+
+**5. NEVER spawn per-instance `setInterval` for visible-tick state.** If
+multiple components need a 1Hz "now" value (countdowns, "X seconds ago"
+displays), they share the single
+[useNowMs](frontend/hooks/useNowSeconds.ts) hook — one `setInterval` for
+the whole tree. Past offenders: SystemJobBox (10 instances × 1s tick on
+/admin), CronScheduleBox (5+ on /logs), useElapsedTime (per-consumer
+ticker). All now consume `useNowMs`. If a new component needs a ticker,
+use this hook; do not roll your own.
+
+**6. Async buttons need IMMEDIATE feedback.** Every button whose `onClick`
+does async work must render a spinner icon plus a pending label
+(`Stopping…`, `Saving…`, `Severing…`) while pending.
+`disabled={busy}` ALONE looks dead. Pattern lives in
+[ExcludeRegexCard](frontend/components/SessionScoring/ExcludeRegexCard.tsx);
+share-dashboard buttons follow the same shape after the recent fix.
+
+**7. Prefetch behavior:**
+ - Static routes → full route prefetched on Link viewport entry
+ - Dynamic routes (all our `'use client'` pages) → **partially prefetched
+ only if `loading.tsx` exists** (covers the shell to the loading
+ boundary). Without loading.tsx, NO prefetch happens.
+ - `<Link>` prefetching is on by default; use `prefetch={false}` only
+ in dense lists (infinite-scroll tables) where the link cardinality
+ would balloon the prefetch traffic.
+ - **Hover-prefetch data, not just bundle:** when a Link target needs an
+ API call to render meaningfully, add `onMouseEnter` that calls
+ `queryClient.prefetchQuery(...)`. Example: the Admin → Share Dashboard
+ link in [admin/page.tsx](frontend/app/admin/page.tsx#L791) warms the
+ share-status query so the destination renders real content
+ immediately instead of skeleton-then-swap.
+
+**8. Wrap `router.replace()` inside effects in `startTransition`.** A
+synchronous `router.replace()` inside `useEffect` causes a render cascade
+that blocks paint. Examples:
+[useUrlServiceSync](frontend/hooks/useUrlServiceSync.ts),
+[AppLayout redirect block](frontend/components/AppLayout.tsx#L163). All
+existing call sites are wrapped; new ones must follow.
+
+**9. React Query defaults are set in
+[QueryProvider](frontend/components/QueryProvider.tsx):** `staleTime: 30s`,
+`gcTime: 5min`, `refetchOnWindowFocus: false`. Don't override per-query
+unless you need to — and when you do, document why.
+
+**10. When a click feels slow, MEASURE before guessing.** A working Playwright
+reproducer (machine-local, at `/tmp/nav-perf-test2.mjs`) times each phase
+of a click (URL change, DOM ready, network idle, individual API requests).
+Run it against the live tunnel (`localhost:3001`) BEFORE proposing a fix.
+Click-feedback bugs are almost always about: (a) polls running while
+navigation is in flight, (b) heavy useQuery fan-out on mount, (c) layout
+re-renders triggered by store subscriptions. The trace shows which.
+
### Removed modules — don't recreate
- `backend/utils/audit_helpers.py` (referenced the long-removed DuckDB `_ingested_files` table)
@@ -361,8 +468,8 @@ The tunnel exposes the same FastAPI app to the public internet. Middleware class
### 21. `sync_data` orphan-cleanup vs local-compaction outputs
Local compaction writes merged rollups to three places: `/data/daily/`, `/data/weekly/`, and `/data/timestamp_hour=*/compacted_*.parquet`. None of these are tracked by the iceberg snapshot, so they are NOT in `cloud_files`/`active_paths`. The orphan-cleanup loop in [backend/core/iceberg.py](backend/core/iceberg.py) `sync_data()` walks the cache and deletes anything not in `active_paths`; without explicit allow-rules it nukes every compacted output, and the [`local_compacted_files` registry](backend/core/metadata_db.py) then blocks re-download of the source files — silently dropping rows from the view (production: 1.65M → 302K on 2026-05-31, then 1.66M → 1.62M on 2026-06-01 from the per-partition `compacted_*` variant). The fix is two-pronged: orphan-cleanup restricts its walk to `timestamp_hour=*` dirs AND skips `compacted_*.parquet` filenames. **If you add a new local-only output pattern, add it to both the dir skip and the file skip.** Integration coverage in [tests/core/test_local_compaction.py](tests/core/test_local_compaction.py)::`test_compaction_outputs_survive_iceberg_sync_orphan_cleanup` exercises the round-trip with real `compact_local_partitions` + real `sync_data`.
-### 22. `unattended-upgrades` OOMs the production VM
-The single-tenant 16 GB e2-standard-4 deploy runs backend + frontend + caddy at a steady-state working set around 10-13 GB. The Debian/Ubuntu nightly `apt-daily-upgrade.timer` forks a transient 1-2 GB downloader on top of that, and on 2026-06-01 it triggered an OOM kill that wedged the kernel (sshd died; needed `gcloud compute instances reset`). `~/restart.sh` on the VM re-asserts `systemctl mask apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service` on every restart so a re-image / apt-reinstall can't silently re-enable them. Trade-off: no automatic security patching — patch manually on a planned maintenance window with the backend container stopped. **If you bump the VM to a class with more RAM (e.g. `e2-custom-4-32768`), you may safely re-enable upgrades.** See `restart.sh` for the canonical incantation.
+### 22. `unattended-upgrades` can OOM a memory-tight VM
+A 16 GB Linux VM running backend + frontend + caddy holds a steady-state working set in the 10-13 GB range. The Debian/Ubuntu nightly `apt-daily-upgrade.timer` forks a transient 1-2 GB downloader on top of that, which can trip an OOM kill that wedges the kernel (sshd dies; needs a VM reset). The mitigation is to `systemctl mask apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service` on the host and re-assert it on every restart so a re-image / apt-reinstall can't silently re-enable them. Trade-off: no automatic security patching — patch manually on a planned maintenance window with the backend container stopped. **If you provision a VM with more RAM, you may safely re-enable upgrades.**
## AI Agent Directives
@@ -385,6 +492,18 @@ These apply to every change, regardless of scope.
10. **Keep Python imports at module level.** Conditional mid-function imports trigger `UnboundLocalError` (Trap #2).
11. **Run `ruff format` before committing** (or rely on `make ci`).
+### Secrets & sensitive data
+
+12. **Scan for committed secrets BEFORE every commit.** gitleaks runs in three places: the `secret-scan` Makefile target (chained into `make ci`), the pre-commit hook (`.pre-commit-config.yaml`), and CI (`.github/workflows/ci.yml`). Either run pre-commit (`uv run pre-commit run --all-files`) or `make secret-scan` before pushing; CI will also fail the build on a finding, but catching it locally is faster.
+13. **Allowlist suppression order** when a legitimate placeholder trips the scanner:
+ - **Inline** (single line): append `# gitleaks:allow` to the offending line. Cheapest for a one-off test fixture.
+ - **Fingerprint** (one-off historical): add the finding's `Fingerprint` value from the gitleaks report (`{commit}:{file}:{rule-id}:{start-line}`) as a line in `.gitleaksignore` at repo root.
+ - **Path** (entire file or directory): add a regex to the `[allowlist] paths` array in `.gitleaks.toml`. Use this when adding a new directory of test fixtures.
+14. **Never commit a real credential to suppress the scanner.** The point of the gate is exactly this. If a legitimate secret needs to live in the tree (e.g. an SSH public key used as a trust anchor), document why in a comment adjacent to the allowlist entry and explain why exposure is intentional.
+15. **Never put real customer values in code, scripts, tests, or docs.** This includes Fastly service IDs (use `` or `${FASTLY_SERVICE_ID:?}` env vars in scripts), bucket names, real domains, real IPs (Fastly edge ranges are fine — they're published), real email addresses (use `you@example.com`), or screenshots that show the above. Test fixtures use placeholders (`TestLogSvcABC123`, `FAKE_TOKEN`, `"FROM_CONFIG"`). Real deployment values come from env vars / per-host config that's gitignored.
+16. **Files that must never be committed** (covered by `.gitignore` — verify before any new directory of generated content lands):
+ - `.env` (real env), everything under `configs/` except `configs/ssh_known_hosts` (real per-service Fastly configs), `data/system/` (real SSH key + share DB), `.scoring/` (per-deployment AES keys), `tests/fixtures/scoring/` (real prod traces). The `.gitleaks.toml` allowlist also covers the `configs/*.json` and `data/` paths so a working-tree (`--no-git`) scan stays clean for ad-hoc local runs.
+
### Provisioning Wizard
12. The token entered in step 2 must be threaded to any API call needing Fastly credentials (including the NGWAF workspace fetch). Don't rely on stored-config fallback alone.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26cb882e..a912a4b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,116 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [1.1.0] - 2026-06-03
+
+Edge session scoring. Every request is classified in real-time at the edge by a Fastly Compute service that runs an L1 (cookie compliance + timing rules) + L2 (PageRank-trained transition matrix) scorer, returning a combined 0-100 score that lands in DuckDB for analyst review. Operators can label sessions, watch live ROC-AUC, retrain the matrix, roll back to a prior matrix, rotate the AES cookie key, and push a hard enforcement threshold that rejects flagged requests at the edge with an operator-chosen HTTP status code (default 429).
+
+### Highlights
+
+- **Edge scoring** — Fastly Compute scorer + 6-snippet VCL preflight pattern (recv/pass/fetch/deliver/miss/enforce), AES-GCM-encrypted session cookie carrying rotating sid + transition state, `fastly.ddos_detected` gate so Compute is bypassed under L7 attack.
+- **Admin UI** at `/admin/session-scoring` — StatusPanel with live AUC against accumulated labels, ScoringHealthCard with fire rate / score distribution / top reasons / matrix-staleness alert, ThresholdSlider with counterfactual flag/pass preview + precision/recall + commit-threshold persistence, RocPrCurves with ROC + Precision-Recall plots, TopFlaggedTable + LabelsTab with click-to-view-events per sid, RetrainButton (DuckDB traces → train.py → publish matrix to FOS), SinceHoursPicker driving all six cards on one shared time window.
+- **Labels CRUD** — POST/PATCH/DELETE per-sid labels (good/bad/neutral) feed `evaluate_from_persisted_scores` to compute live ROC-AUC. Min-samples gate (≥3 per class) prevents noisy display.
+- **ROC + PR curves** + per-reason AUC breakdown (split by L1/L2 rule: cookie-missing, impossibly-fast, robotic-consistency, rare-transition, low-transition-prob).
+- **Composite `/scoring/dashboard`** endpoint collapses the 8 per-card requests into one in-flight-collapsed payload; the existing per-card endpoints stay mounted for back-compat.
+- **`edge_score_reason` virtual field** — CSV-split via DuckDB `unnest(string_split(...))`, top-N cards + click-to-filter same as NGWAF signals.
+- **FOS matrix persistence** — `enable_scoring` publishes the trained matrix to FOS; backend auto-fetches on startup (no more per-host scp).
+- **Matrix version history + rollback** — every publish snapshots the prior matrix to `iceberg/meta/scoring_matrix_history/{version}.json`; new `/scoring/matrix-versions` lists them and `/scoring/matrix-versions/{v}/restore?confirm=true` copies a historical matrix back. AUC reflects the rollback immediately; Wasm at edge keeps the embedded matrix until `deploy_wasm.sh` re-runs (deploy_hint surfaced).
+- **Threshold enforcement (live blocking)** — operator commits a threshold, scorer reads it from `scoring_config` ConfigStore, emits `X-Edge-Score-Enforce: 1` when score≥threshold, the new `Session Scoring - Enforce` VCL snippet rejects those requests on the post-scoring restart. Effective at the edge within seconds. Confirm-dialog-gated PUT endpoint + LIVE warning chip in the slider UI. The response code defaults to 429 (Too Many Requests) and is operator-overridable per-service via a new `Enforce response code` selector (403 / 429 / 451 / 503; backend accepts any 4xx/5xx) — picks land via a focused `update_enforce_status_code` orchestrator that swaps only the enforce snippet (~5–10s end-to-end vs. the full enable_scoring flow). Audit-logged as `scoring_enforce_status_code_changed`.
+- **URL exclusion regex override** — operator-tunable per-service regex for "which URLs bypass the scorer". Defaults to the built-in static-asset extension list; the new `ExcludeRegexCard` on the Session Scoring page accepts a custom regex (e.g. exclude `/healthz`, exclude entire path prefixes, scope scoring to specific traffic). The PUT endpoint validates input through three layers before any VCL ships: (1) input policy — length cap, no quote / control chars, must compile under Python's `re`; (2) [falco](https://github.com/ysugimoto/falco) static analysis on the assembled recv snippet (catches regex+VCL composition errors that slip past Python's compiler); (3) Fastly's own VCL compiler at activate time. A focused `update_recv_exclusion_regex` orchestrator clones the active version, swaps only the recv snippet, and activates — ~5–15s end-to-end vs. the full enable_scoring flow. Confirm-dialog-gated. Audit-logged as `scoring_exclude_regex_changed`. Falco shipped in the backend Docker image; production sets `SCORING_REQUIRE_FALCO=1` so a missing binary fails closed instead of degrading to input-policy-only.
+- **AES key rotation** — `POST /scoring/rotate-key` mints a fresh 32-byte key, moves the prior to `previous_key_hex` (grace slot — Rust cookie codec falls back to it so in-flight cookies keep decoding through one rotation cycle).
+- **Cookie lifecycle bounds** — `SESSION_IDLE_EXPIRE_S` (30 min) + `SESSION_HARD_CAP_S` (24h) in the Rust scorer mint a fresh sid when either threshold is exceeded. Stolen cookies can't replay beyond their window; long-running sessions stop biasing the L1 variance estimator.
+- **Per-reason AUC breakdown UI** — `PerReasonAucCard` renders AUC split by which L1/L2 rule fired (cookie-missing, impossibly-fast, robotic-consistency, rare-transition, low-transition-prob).
+- **Operator audit log** — new `scoring_audit` table + `/scoring/audit` endpoint records every scoring_enabled, scoring_disabled, threshold_committed/cleared/enforced, matrix_retrained/restored, key_rotated event with actor + timestamp + details. Per-host, never mirrored via state_sync.
+
+### Reliability
+
+- **Cron-progress reliability** — `end_progress` auto-emits `done` when the last event isn't terminal; `list_active_runs` triple-guards (last-event filter + 5-min staleness + DB-status cross-check via `get_cron_run_status`); `reap_zombie_runs` called from every cron-tick cleanup. Fixed a production incident where 382 stale "sync" entries piled up on the System Health card.
+- **state_sync merge guards** — `import_admin_state` no longer overwrites scoring `custom_fields` with stale FOS payloads (root cause of a production data-loss incident); sibling fixes in `cli.handle_update_logs`, `provision.write_service_config`, and `api_service_log_fields_set` close every "remote-overwrites-code-managed-state" path.
+- **Defense-in-depth** — `enable_scoring` rollback + `disable_scoring` final-save reload cfg right before writing to close the 30-120s race window where concurrent writers got clobbered.
+- **Per-key in-flight collapse** in `_cached` so the dashboard's 8-card mount no longer queues queries behind one global lock.
+
+### Performance
+
+- `security/top-bots` consolidated UA + NGWAF onto one temp table (was 2 independent Iceberg scans per dashboard mount).
+- `dashboard/raw` uses `get_source_extent` for cached steady-state extent.
+- `usage/prefill` cached-status fast path skips DuckDB hop when the sync cron has populated it.
+- `get_enriched_services` 60s TTL cache on the recursive cache-dir `scandir` (was 200-1500ms per `/api/bootstrap`).
+- `loading.tsx` Suspense skeletons + dynamic imports (LabelsTab, ChoroplethMap) cut admin-page click lag.
+
+### Cleanup
+
+- Dropped dead `@daypicker/react` dep + dead `frontend/components/ui/calendar.tsx`.
+- Collapsed 7-site `cleanup_progress + reap` boilerplate into `cleanup_progress_and_reap()` helper.
+- Refactored `security.py`'s ad-hoc temp-table to use the existing `QueryRunner.temp_table()` context manager.
+- Narrowed `get_cron_run_status` exception scope to `sqlite3.Error` with DEBUG log so future triage isn't flying blind.
+
+### Security
+
+Capability-focused hardening across the FastAPI backend, Fastly VCL, Next.js frontend, and Rust scorer. All changes deployed and verified.
+
+- **Trust-boundary normalisation**:
+ - uvicorn runs with `--proxy-headers --forwarded-allow-ips=127.0.0.1` so `request.client.host` is the real client IP via Caddy's authoritative XFF rewrite.
+ - `is_request_remote()` reads `request.client.host` instead of the forgeable Host header; in-app leftmost-XFF parsing is gone.
+ - Caddyfile gates `Fastly-Client-IP → X-Forwarded-For` rewrite on `remote_ip` matching Fastly edge ranges. Startup assertion on `TRUSTED_PROXY_IPS` / `UVICORN_FORWARDED_ALLOW_IPS` + integration test prevent silent regression.
+ - Next.js `/admin` middleware gates on the Caddy-injected `X-Proxied-By-Caddy: true` marker instead of the forgeable Host header.
+- **Destructive-op auth**:
+ - `/api/provision/teardown` validates a caller-supplied Fastly token via `/tokens/self` for the `global` scope before any destructive op; never falls back to server-stored credentials. Frontend TeardownDialog prompts admin for the token.
+ - `/api/provision/ngwaf-workspaces` token-gated (constant-time stored-key match OR validated `global`-scope token); NGWAF workspace mutation enforces analyst-session scope.
+- **DuckDB user-SQL safety**:
+ - New `backend/utils/sql_validator.py` enforces a statement-type whitelist + recursive parse-tree walker with catalog blocklist (`duckdb_*` / `pg_*` prefixes, `information_schema` / `pg_catalog` / `system` schemas, non-`main` catalogs) + function denylist (`read_csv` / `read_parquet` / `iceberg_scan` / `glob` / `lsdir` / `getenv` / `current_setting` / `duckdb_secrets` / postgres / sqlite / mysql scanners) + fail-closed parse + audit logging + perf budget. Replaces a regex-based blocklist that missed `read_csv_auto`, `information_schema`, `duckdb_secrets`, `INSTALL/LOAD`, and `getenv`.
+ - `escape_sql_literal` helper applied at four ingest call sites; characterisation tests cover the PoC payload + multi-byte UTF-8 + backslash + empty + long-with-many-quotes.
+ - `time_range` validated via `dateutil.isoparse` before SQL interpolation.
+ - `get_con` / `get_meta_con` dropped the auto-query-param `read_only` flag.
+- **VCL header & cache discipline**:
+ - `vcl_recv` preamble unsets every internal `x-of-*` / `x-fos-edge-data` / `x-is-cluster-fetch` / `X-Edge-*` header on the inbound request.
+ - Origin-metric VCL fields: numeric regex gates + `json.escape` on string values (log-injection).
+ - VCL ua/referer keeps its `substr` cap.
+ - Fastly `vcl_hash` now keys on the full `req.url` (path + query), not just `req.url.path` — closes cross-query cache poisoning. Auth `key` querystring is already stripped earlier so no secrets leak into cache keys.
+- **Cross-tenant scope enforcement**:
+ - `/api/alerts/*` and `/api/views/*` enforce analyst-session scope on every read and mutation; pre-flight scope check on PATCH / DELETE via new `get_alert_by_id` / `get_view_by_id` helpers so unauthorised mutations never land.
+ - `/api/sources`, `/api/log-fields/catalog`, NGWAF workspace listing — analyst-scope filtering.
+ - Cache-layer audit confirmed every per-tenant cache (`session_scoring._cached`, iceberg, bot_sources) includes `service_id` in the key.
+- **Path-traversal cages**:
+ - `/api/download` path traversal: `realpath` + `commonpath` cage.
+ - Cache cleanup rejects bucket separators + `realpath` cage.
+ - `service_id` alphanumeric/dash/underscore validation in path helpers.
+- **Secret & data hygiene**:
+ - `claim_token` TOCTOU → atomic UPDATE with rowcount check.
+ - `share_db` quarantine narrowed to actual SQLite corruption signatures (was wiping the DB on transient `OperationalError`).
+ - Email-enumeration timing equalised via dummy scrypt on miss.
+ - `validate_session` re-syncs `pii_policy` / window / `service_ids` on every call so admin permission edits take effect immediately.
+ - `_StaticAssetLimiter` bounded at 10 k tracked IPs.
+ - `logging-settings/update` moved GET → POST/PATCH (CSRF).
+ - `query_errors` decorator logs traceback server-side, never in the response body; sweep fixture asserts no `trace` key leaks from any route.
+- **SSH host-key pinning**: `configs/ssh_known_hosts` pinned, source-controlled, and gitignore-excepted; tunnel manager refuses to start when the file is missing (fail-safe; no TOFU fallback).
+- **Scorer signal tightening**: Python + Rust parity — `L1_SCORE_COOKIE_TAMPERED = 100` (was capped at 75 with missing/expired); `L1_ROBOTIC_DWELL_LOW_S 0.5 → 0.20` (closes the 0.20s–0.50s robotic-bot threshold gap). Tracked follow-up: a sliding-window mean (needs cookie-schema v3). Until then, partial mitigations — `SESSION_IDLE_EXPIRE_S=30 min`, `SESSION_HARD_CAP_S=24h`, and session-max scoring — bound the practical attack window.
+
+### Tests
+
+- 3070 backend tests
+- 65 scorer Rust tests (+8)
+- 265 frontend vitest tests (+13)
+- `make ci` green: lint + format + mypy + pytest + vcl-test + verify-deps + typecheck-frontend + test-frontend + osv.
+
+### Infrastructure
+
+- Backend Docker image: `python:3.12-slim-bullseye` → `python:3.12-slim-bookworm` (cuts CVE-laden Debian 11 base; remaining 13 high CVEs are deep-dependency / OpenSSL CVEs every major Python base inherits). Frontend image's api-schema stage bumped to match.
+- Backend image now ships [`falco`](https://github.com/ysugimoto/falco) v2.3.0 (Fastly VCL static analyser) — required by the scoring-recv-snippet validator.
+- **Secret scanning** — [`gitleaks`](https://github.com/gitleaks/gitleaks) v8.30.1 wired in three places: `.pre-commit-config.yaml` (blocks accidentally-staged credentials at commit time), `make secret-scan` Makefile target chained into `make ci`, and a dedicated step in `.github/workflows/ci.yml` (fails the build on any non-allowlisted finding). Configuration in `.gitleaks.toml` extends the built-in ruleset and adds path allowlists for tracked test fixtures, Rust lockfile checksums, the public SSH host key, and (for working-tree-only scans) the gitignored real-config / `.next/` / `data/system/` directories. Verified clean against the full branch history. Policy + suppression playbook documented in **AGENTS.md** §Secrets.
+- **CDN cache-key hardening** — `backend/core/fastly/utils.py` `vcl_recv` now runs `querystring.filter_except` to drop all non-S3-API query parameters (caller-injected tracking params, marketing UTMs, session IDs) BEFORE the cache lookup, followed by `querystring.sort` to canonicalise the remaining param order. Composes with the `vcl_hash` fix: untrusted params can no longer fracture the cache OR leak the auth `key` into the cache key.
+- Dependency freshness sweep on all four ecosystems:
+ - **Python:** `aiohttp 3.13.5 → 3.14.0`, `cfn-lint 1.51.2 → 1.51.4`, `distlib 0.4.0 → 0.4.1`, `filelock 3.29.0 → 3.29.1`, `idna 3.17 → 3.18`, `joserfc 1.6.8 → 1.7.0`.
+ - **Frontend:** `@tanstack/react-query 5.100.14 → 5.101.0` (+ devtools), `@types/react 19.2.15 → 19.2.16`, `eslint-config-next 16.2.6 → 16.2.7`, `next 16.2.6 → 16.2.7`, `react/react-dom 19.2.6 → 19.2.7`.
+ - **Rust:** `bitflags 2.11.1 → 2.12.1`.
+ - **Deferred (major bumps reserved for 1.2):** TypeScript 5.9 → 6.0 (compiler-API breaking changes); Fastly Rust SDK 0.11 → 0.12 (Compute@Edge API changes); jsdom / eslint / vitest where we're already ahead of the npm "latest" tag.
+
+### Known limitations
+
+- Rate limiting at the edge is NOT included. The DDoS gate (`fastly.ddos_detected`) handles attack-scale traffic by bypassing Compute; sustained-low-rate abuse is left to the operator's existing WAF/NGWAF policies. A future rate-limiting feature is tracked separately.
+- When a matrix is rolled back via the UI, the edge Wasm continues to use its embedded matrix until `scripts/scoring/deploy_wasm.sh` re-runs. The Restore endpoint returns a `deploy_hint` with the exact command. See `docs/session_scoring_runbook.md`.
+
+[1.1.0]: https://github.com/fastly/fastly-log-analytics/releases/tag/v1.1.0
+
## [1.0.0] - 2026-06-01
Initial public release. Self-hosted dashboard for searching, filtering, and visualizing request-level Fastly logs streamed to Fastly Object Storage.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 922c88b2..d9e67b1d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,6 +8,18 @@ Contributions are welcome — bug reports, feature requests, and pull requests a
- Keep pull requests focused. One feature or fix per PR.
- Make sure the project builds and runs before submitting.
+## Rust scorer prerequisites
+
+The session scoring Compute@Edge service (`compute/scorer/`) requires:
+
+- Rust 1.90+ (pinned in `compute/scorer/rust-toolchain.toml`)
+- `wasm32-wasip1` target: `rustup target add wasm32-wasip1`
+- [viceroy](https://github.com/fastly/Viceroy) (Fastly's local Compute runtime) — optional, only needed for running the scorer locally
+- The scorer is rebuilt and deployed via:
+ ```
+ scripts/scoring/deploy_wasm.sh --service-id <SERVICE_ID> --token <FASTLY_API_TOKEN>
+ ```
+
## License
This project is licensed under the [Apache License 2.0](LICENSE). By submitting a pull request, you agree that your contribution will be licensed under the same terms.
diff --git a/Caddyfile b/Caddyfile
index f6d202bc..bb149eb2 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,15 +1,30 @@
# Caddyfile — plain HTTP origin behind Fastly, hardened.
#
# Fastly terminates TLS at the edge and proxies to this VM on port 80.
-# GCP firewall locks port 80 to Fastly's IP ranges. Backend and frontend
-# bind to 127.0.0.1 only (host network mode) — Caddy reaches them on
-# loopback, nothing else can.
+# Backend and frontend bind to 127.0.0.1 only (host network mode) — Caddy
+# reaches them on loopback, nothing else can.
+#
+# Trust topology (security #013/#029/#032/#034 and extra E1):
+# request_header directives rewrite X-Forwarded-For from the
+# Fastly-Client-IP header. That trust is only valid for
+# requests that actually came through Fastly — anyone connecting
+# directly to port 80 can set Fastly-Client-IP to whatever they want.
+# The @from_fastly_v4 remote_ip matcher gates the header rewrite on the
+# TCP peer being inside Fastly's published edge ranges. Direct callers
+# never match that rule, so request.client.host in uvicorn comes
+# from their real (untrusted) peer IP and the IP-based gates kick in.
#
# Routing:
# /api/* → backend directly (preserves Host header so the backend's
# DNS-rebinding gate matches the registered public_endpoint;
# peer = 127.0.0.1 from Caddy in host net mode).
# else → Next.js frontend.
+#
+# Note on edge IP list maintenance:
+# The Fastly CIDRs below are the published v4 ranges as of 2026-06-03
+# (https://api.fastly.com/public-ip-list). When Fastly adds a new edge
+# range, refresh this list. A stale list means legitimate traffic from a
+# new POP is treated as direct (untrusted) until Caddy reloads.
{
# No auto-HTTPS — Fastly handles TLS termination at the edge.
@@ -60,19 +75,48 @@
}
}
+ # Defense in depth (extra E1): never forward a client-supplied
+ # X-Forwarded-For. Exactly one of the two matcher-gated rules at the
+ # bottom of this section rewrites it on every request:
+ #
+ # Non-Fastly direct caller (@not_fastly_v4): XFF = their real TCP peer
+ # IP. uvicorn (with --proxy-headers --forwarded-allow-ips=127.0.0.1)
+ # sees Caddy at the loopback peer and trusts XFF, so
+ # request.client.host = the real attacker IP. Backend's DNS-rebinding
+ # and remote-host checks then fire correctly instead of
+ # misclassifying as admin.
+ # Fastly-edge caller (@from_fastly_v4): XFF = the client IP that
+ # Fastly's edge attached as Fastly-Client-IP.
+ #
+ # The two matchers are deliberately mutually exclusive rather than
+ # "set unconditionally, then override". The Caddyfile adapter sorts
+ # same-named directives so that those WITH a matcher run before those
+ # WITHOUT one, regardless of the order they are written in. An
+ # unconditional `request_header X-Forwarded-For ...` would therefore
+ # run last and overwrite the Fastly-Client-IP value with the Fastly
+ # edge's own address.
+
+ # Caddy-injected internal proxy marker (security #032): the frontend
+ # middleware blocks /admin requests when this header is present, while
+ # direct SSH-tunnel admin connections (which bypass Caddy) have no
+ # such header and reach the admin surface. Set unconditionally — there
+ # is no legitimate reason for an upstream to send this themselves.
+ request_header X-Proxied-By-Caddy "true"
+
+ # Named matcher: TCP peer is an actual Fastly edge IP.
+ @from_fastly_v4 {
+     remote_ip 23.235.32.0/20 43.249.72.0/22 103.244.50.0/24 103.245.222.0/23 103.245.224.0/24 104.156.80.0/20 140.248.64.0/18 140.248.128.0/17 146.75.0.0/17 151.101.0.0/16 157.52.64.0/18 167.82.0.0/17 167.82.128.0/20 167.82.160.0/20 167.82.224.0/20 172.111.64.0/18 185.31.16.0/22 199.27.72.0/21 199.232.0.0/16
+ }
+
+ # Exact complement of @from_fastly_v4: TCP peer is anything else.
+ # Keep this CIDR list identical to the one above when refreshing it.
+ @not_fastly_v4 {
+     not remote_ip 23.235.32.0/20 43.249.72.0/22 103.244.50.0/24 103.245.222.0/23 103.245.224.0/24 104.156.80.0/20 140.248.64.0/18 140.248.128.0/17 146.75.0.0/17 151.101.0.0/16 157.52.64.0/18 167.82.0.0/17 167.82.128.0/20 167.82.160.0/20 167.82.224.0/20 172.111.64.0/18 185.31.16.0/22 199.27.72.0/21 199.232.0.0/16
+ }
+
+ # Requests bypassing Fastly get XFF = {http.request.remote.host} (their
+ # real TCP peer), so a direct port-80 attacker cannot spoof their
+ # source IP regardless of what Fastly-Client-IP value they send.
+ request_header @not_fastly_v4 X-Forwarded-For {http.request.remote.host}
+
+ # When AND ONLY WHEN the request came from a Fastly edge, propagate the
+ # authoritative Fastly-Client-IP as X-Forwarded-For.
+ request_header @from_fastly_v4 X-Forwarded-For {http.request.header.Fastly-Client-IP}
+
+
# API → backend (preserve Host so backend's DNS-rebinding gate matches the
# registered public_endpoint).
@api path /api/*
reverse_proxy @api 127.0.0.1:8000 {
flush_interval -1
- # Replace X-Forwarded-For with Fastly's authoritative client-IP header.
- header_up X-Forwarded-For {http.request.header.Fastly-Client-IP}
}
# Everything else → Next.js frontend.
reverse_proxy 127.0.0.1:3000 {
flush_interval -1
- header_up X-Forwarded-For {http.request.header.Fastly-Client-IP}
}
# Detailed access log: JSON format with every request's client IP, host,
diff --git a/Makefile b/Makefile
index 1223acc8..fb9c995c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: test lint format typecheck ci install install-hooks clean gen-types verify-deps
+.PHONY: test lint format typecheck ci install install-hooks clean gen-types verify-deps secret-scan osv outdated
# Prevent a VIRTUAL_ENV from another project leaking into uv commands
unexport VIRTUAL_ENV
@@ -57,6 +57,28 @@ outdated:
osv:
uv run python scripts/check_osv.py
+# Secret scanner — gitleaks, configured via .gitleaks.toml at repo root.
+# Scans git history by default (no `--no-git`), so any committed credential
+# trips the gate even if later removed. Use `gitleaks detect --no-git`
+# locally to also scan the working tree (catches secrets in untracked /
+# unstaged files before you accidentally `git add` them).
+#
+# Suppression mechanisms in increasing scope:
+# - inline `#gitleaks:allow` on the offending line
+# - .gitleaksignore — fingerprint list for one-off historical findings
+# - .gitleaks.toml [allowlist] paths — for whole files / directories
+#
+# Skips cleanly with a loud warning if the binary isn't on PATH. Production
+# CI installs it via curl in .github/workflows/ci.yml (same pattern as falco).
+secret-scan:
+ @if command -v gitleaks > /dev/null; then \
+ gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1; \
+ else \
+ echo "⚠️ Skipping secret-scan: gitleaks not on PATH."; \
+ echo " Install: brew install gitleaks (or see https://github.com/gitleaks/gitleaks#installing)"; \
+ echo " Pre-commit + CI install it automatically — local dev is recommended."; \
+ fi
+
# Verify package.json + package-lock.json resolve cleanly under `npm ci`.
# Local `make ci` previously used the already-installed node_modules and
# silently tolerated peer-dep conflicts that would break GitHub Actions
@@ -71,7 +93,19 @@ vcl-test:
echo "Skipping VCL tests: falco linter not found in PATH"; \
fi
-ci: lint format-check typecheck test vcl-test verify-deps typecheck-frontend test-frontend osv outdated
+# Run the underlying targets in parallel with a -j2 cap. Backend pytest
+# (~26s) and frontend vitest (~35s) are the two long poles; running them
+# concurrently saves ~25-30s wall vs. sequential, and the -j2 cap keeps
+# them from oversubscribing the box (both invocations already parallelise
+# internally via pytest-xdist / vitest workers).
+#
+# Order matters here — make's scheduler picks leftmost-available targets
+# first, so the slow ones (`test`, `test-frontend`) are listed first to
+# claim the two parallel slots immediately. Lighter checks fill in as
+# slots free up.
+ci:
+ @$(MAKE) -j2 test test-frontend typecheck-frontend lint format-check typecheck vcl-test verify-deps secret-scan osv
+
clean:
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
diff --git a/README.md b/README.md
index d4d36c93..2000e9a5 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ You'll need:
- **Docker** (recommended) — or Python 3.10+ and Node.js 24+ for a manual install
- *Optional:* a Fastly API token with the **Billing** permission to power the [Usage & Cost page](docs/features.md#usage--cost-page)
- *Optional:* [`falco`](https://github.com/ysugimoto/falco) to validate VCL during provisioning (highly recommended; the app degrades gracefully without it)
+- *Optional:* **Rust 1.90+** with the `wasm32-wasip1` target (`rustup target add wasm32-wasip1`) — only needed if you plan to rebuild the [Session Scoring](docs/session_scoring_runbook.md) Compute Wasm scorer from source
---
@@ -94,6 +95,7 @@ You run the application as a central web-accessible server (either on a dedicate
- **Log field configuration** — built-in field groups (HTTP, network, geo, TLS, NGWAF) plus custom VCL expressions
- **Alerts** — threshold-based, webhook-delivered
- **Live dashboard sharing** — three modes (SSH tunnel, your own hostname, your own IP) with per-analyst passcode invites, IP allowlisting, and instant revoke
+- **Session scoring** — edge-computed 0-100 risk score per request combining cookie/timing signals with a PageRank transition matrix, with live threshold enforcement, audit logging, key rotation, and matrix version history. See the [runbook](docs/session_scoring_runbook.md) and [feature reference](docs/features.md)
See [docs/features.md](docs/features.md) for the full feature reference.
diff --git a/backend/Dockerfile b/backend/Dockerfile
index d5e3f209..d417d2ee 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -1,5 +1,5 @@
# --- Build Stage ---
-FROM python:3.12-slim-bullseye AS builder
+FROM python:3.12-slim-bookworm AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -23,7 +23,7 @@ COPY pyproject.toml uv.lock ./
RUN uv sync --no-dev --frozen --no-install-project
# --- Production Stage ---
-FROM python:3.12-slim-bullseye AS runner
+FROM python:3.12-slim-bookworm AS runner
# Set environment variables
ENV PYTHONUNBUFFERED=1 \
@@ -34,11 +34,34 @@ ENV PYTHONUNBUFFERED=1 \
WORKDIR /app
-# Install runtime dependencies (curl for healthcheck)
+# Install runtime dependencies (curl for healthcheck, tar for falco install).
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
+ tar \
&& rm -rf /var/lib/apt/lists/*
+# Falco — Fastly VCL static analyser (github.com/ysugimoto/falco).
+# Used by backend.utils.vcl_validator to lint the scoring recv snippet
+# before publishing a custom URL-exclusion regex. Required in
+# production: a missing binary triggers the validator's hard-fail
+# branch when SCORING_REQUIRE_FALCO=1 is set on the backend service.
+#
+# Pin to a specific release so a future falco-side change doesn't
+# surprise us; bump when you actively want a new version.
+ARG FALCO_VERSION=2.3.0
+RUN set -eux; \
+ arch="$(dpkg --print-architecture)"; \
+ case "$arch" in \
+ amd64|arm64) falco_arch="$arch" ;; \
+ *) echo "unsupported arch: $arch" >&2; exit 1 ;; \
+ esac; \
+ url="https://github.com/ysugimoto/falco/releases/download/v${FALCO_VERSION}/falco-linux-${falco_arch}.tar.gz"; \
+ curl -fsSL -o /tmp/falco.tar.gz "$url"; \
+ tar -xzf /tmp/falco.tar.gz -C /usr/local/bin/; \
+ chmod +x /usr/local/bin/falco; \
+ rm /tmp/falco.tar.gz; \
+ falco --version
+
# Copy the virtual environment from the builder
COPY --from=builder /app/.venv /app/.venv
@@ -48,6 +71,15 @@ COPY backend/ ./backend/
COPY pyproject.toml README.md uv.lock ./
COPY scripts/generate_openapi.py scripts/
+# Include the default empty scoring matrix. The trained matrix.json is
+# a build artifact (gitignored, produced by scripts/scoring/train.py)
+# so we can't bake it in at image build time. The backend's _load_matrix()
+# prefers matrix.json if present (operator dropped it in via a volume
+# mount or post-build copy) and falls back to matrix.default.json so
+# the /scoring/evaluation endpoint returns a meaningful "no signal"
+# response instead of erroring out.
+COPY compute/scorer/matrix.default.json ./compute/scorer/matrix.default.json
+
# Expose the backend port
EXPOSE 8000
diff --git a/backend/config.py b/backend/config.py
index 11644fd5..2d4e72ea 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -28,6 +28,7 @@
import json
import os
+import re
import sys
import tempfile
import threading
@@ -76,11 +77,46 @@ def _ensure_dirs():
_ensured_dirs.add(d)
+_SERVICE_ID_RE = re.compile(r"^[A-Za-z0-9_-]+$")
+_SERVICE_ID_MAX_LEN = 64
+
+
+def _validate_service_id(service_id: str) -> str:
+    """Validate ``service_id`` before it is used to build a filesystem path.
+
+    Security: defense in depth against path traversal in any helper that
+    builds a path from ``service_id``.
+
+    Real Fastly service IDs are opaque 22-char alphanumeric strings, but the
+    test suite and a handful of legacy provisioning paths use hyphenated
+    IDs like ``svc-1`` / ``test-service-id``. The regex therefore accepts
+    ``[A-Za-z0-9_-]+`` — every character allowed is safe inside a filename
+    and contains no path-separator / dot / null-byte. Without this,
+    ``service_id="/etc/passwd"`` or ``service_id="../../tmp/x"`` would
+    compose with ``pathlib`` semantics — absolute paths discard the base
+    entirely, relative ``..`` traverses out, and ``\\x00`` truncates on
+    some kernels.
+
+    Length cap (64) is well above the longest legitimate Fastly ID (22)
+    and bounds memory in error-logging paths.
+
+    Args:
+        service_id: Candidate identifier, typically caller-supplied.
+
+    Returns:
+        ``service_id`` unchanged when it is valid.
+
+    Raises:
+        ValueError: If ``service_id`` is not a ``str``, is empty, is longer
+            than ``_SERVICE_ID_MAX_LEN``, or contains any character outside
+            ``[A-Za-z0-9_-]``.
+    """
+    if not isinstance(service_id, str):
+        raise ValueError(f"invalid service_id type {type(service_id).__name__}: must be str (security)")
+    if not service_id or len(service_id) > _SERVICE_ID_MAX_LEN:
+        raise ValueError(
+            f"invalid service_id length {len(service_id)}: "
+            f"1..{_SERVICE_ID_MAX_LEN} characters required (security)"
+        )
+    # fullmatch, not match: with match(), the pattern's ``$`` also matches
+    # just before a trailing newline, so "svc\n" would slip through.
+    if not _SERVICE_ID_RE.fullmatch(service_id):
+        raise ValueError(f"invalid service_id {service_id!r}: must be alphanumeric / dash / underscore (security)")
+    return service_id
+
+
def config_path(service_id: str) -> Path:
+ """Return the path ``CONFIGS_DIR / "<service_id>.json"``.
+
+ Raises ``ValueError`` if ``_validate_service_id`` rejects ``service_id``.
+ """
+ _validate_service_id(service_id)
 return CONFIGS_DIR / f"{service_id}.json"
def duckdb_path(service_id: str) -> str:
+ """Return ``SERVICES_DATA_DIR / "<service_id>.duckdb"`` as a string.
+
+ Raises ``ValueError`` if ``_validate_service_id`` rejects ``service_id``.
+ """
+ _validate_service_id(service_id)
 return str(SERVICES_DATA_DIR / f"{service_id}.duckdb")
@@ -91,8 +127,18 @@ def load_config(service_id: str) -> dict | None:
result (e.g. update_status) won't poison the cache. The on-disk file is
revalidated via st_mtime_ns, so external edits and save_config writes
are picked up on the next call without explicit invalidation.
+
+ Returns ``None`` (not a raised exception) for invalid service IDs —
+ several call sites pass unsanitized input (e.g., a stale URL param,
+ an iteration over a stale config list) and rely on the None response
+ to mean "no config". Security's validation in ``config_path`` is
+ still what blocks the actual path-traversal attack; this just makes
+ the helper friendlier at call sites that don't pre-validate.
"""
- path = config_path(service_id)
+ try:
+ path = config_path(service_id)
+ except ValueError:
+ return None
try:
mtime_ns = path.stat().st_mtime_ns
except FileNotFoundError:
diff --git a/backend/core/data_migrations.py b/backend/core/data_migrations.py
new file mode 100644
index 00000000..db9a1438
--- /dev/null
+++ b/backend/core/data_migrations.py
@@ -0,0 +1,155 @@
+"""Data-migration framework for per-service one-time setup tasks.
+
+Background — why a second migration system?
+ ``backend.core.sqlite_migrations`` already exists for SCHEMA changes
+ (CREATE TABLE / ADD COLUMN) on the per-service metadata.db. Those run
+ synchronously inside ``_init_schema``, must be transactional, and are
+ cheap — a fresh DB has the latest ``_SCHEMA`` and migrations are
+ no-ops on it.
+
+ Data migrations are different: long-running, non-transactional setup
+ work that touches state OUTSIDE the metadata.db (e.g. the rollups
+ parquet files under ``<service-data-dir>/rollups/``). The rollups initial
+ backfill on a service with months of data can take many minutes; we
+ cannot block FastAPI startup behind it (containerised deploys kill
+ the boot loop on healthcheck timeout).
+
+Design:
+ * ``MIGRATIONS: list[Migration]`` — ordered registry, append-only. The
+ list order IS the run order.
+ * A row in the per-service ``applied_data_migrations`` table marks a
+ migration as done. Failed migrations leave NO row and retry on the
+ next boot.
+ * ``run_pending(service_id, source)`` diffs the registry against the
+ table, spawns ONE daemon thread per service to run the unapplied
+ migrations in sequence. Across services they parallelise.
+ * Each migration is a pure function ``(service_id, source) -> str | None``.
+ The return string is recorded in the ``notes`` column for audit.
+ Exceptions bubble up to the runner, which logs + skips the row write.
+
+Adding a migration:
+ 1. Write an idempotent function ``def _migrate_<name>(service_id,
+ source) -> str | None:`` somewhere appropriate (typically in the
+ module that owns the affected data — e.g. rollups migration lives
+ in ``backend.core.rollups``).
+ 2. Append ``Migration(...)`` to ``MIGRATIONS`` below with a stable
+ date-prefixed name (``"YYYY-MM-DD_short_description"``).
+ 3. The next service-boot picks it up automatically. No manual run-
+ once script needed.
+
+What this is NOT:
+ * Not a schema migration tool — use ``sqlite_migrations.py`` for DDL.
+ * Not a transactional system — individual migrations should write
+ their own progress markers (per-field stamps, etc.) so a crash
+ mid-run can be detected and partial work resumed on next attempt.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class Migration:
+ """One immutable entry in the ``MIGRATIONS`` registry."""
+
+ # Identifier compared against the applied-migration names for a service
+ # and written when a successful run is recorded.
+ name: str
+ # Human-readable summary; logged when the migration starts.
+ description: str
+ # Called as ``fn(service_id, source)``. Its return value (str or None)
+ # is passed as ``notes`` when the successful run is recorded.
+ fn: Callable[[str, dict], str | None]
+
+
+def _rollups_initial_backfill(service_id: str, source: dict) -> str | None:
+ """Build the initial hourly top-N rollups for the dashboard.
+
+ Delegates to ``rollups.ensure_field_backfills(service_id, source)`` and
+ returns a fixed note string for the audit record.
+
+ Idempotent: ``ensure_field_backfills`` checks per-field markers in
+ ``<service-data-dir>/rollups/backfill_markers.json`` and only re-runs the
+ COPY for fields without a marker. Safe to retry after a crash.
+ NOTE(review): the base-directory placeholder was missing from this
+ docstring — confirm the exact location against ``backend.core.rollups``.
+ """
+ # Imported lazily inside the function rather than at module import time.
+ from backend.core import rollups
+
+ rollups.ensure_field_backfills(service_id, source)
+ return "rollups: ensure_field_backfills complete"
+
+
+# Ordered registry. Append-only — never remove or reorder entries.
+# Names must be globally unique and stable; the DB matches by name.
+MIGRATIONS: list[Migration] = [
+ Migration(
+ name="2026-06-04_rollups_initial_backfill",
+ description="Build initial hourly top-N rollups for dashboard top-N queries",
+ fn=_rollups_initial_backfill,
+ ),
+]
+
+
+def list_pending(service_id: str) -> list[Migration]:
+    """List the registry entries not yet recorded as applied for ``service_id``.
+
+    The result keeps ``MIGRATIONS`` order, which is also the run order.
+    """
+    from backend.core import metadata_db
+
+    already_applied = metadata_db.list_applied_data_migrations(service_id)
+    return [migration for migration in MIGRATIONS if migration.name not in already_applied]
+
+
+def run_pending(service_id: str, source: dict) -> None:
+    """Start a background daemon thread that applies this service's pending migrations.
+
+    Non-blocking: the caller gets control back as soon as the thread is
+    started (or immediately when nothing is pending). Each call handles a
+    single service, so services with pending work proceed in parallel,
+    while the migrations of one service run one after another in registry
+    order.
+
+    NOTE(review): nothing here prevents a second call for the same service
+    from starting another thread while the first is still running —
+    confirm callers invoke this once per service per boot.
+    """
+    todo = list_pending(service_id)
+    if todo:
+        logger.info(
+            "[migrations] service %s: %d pending — %s",
+            service_id,
+            len(todo),
+            [m.name for m in todo],
+        )
+        worker = threading.Thread(
+            name=f"data-migrations-{service_id}",
+            target=_run_sequence,
+            args=(service_id, source, todo),
+            daemon=True,
+        )
+        worker.start()
+
+
+def _run_sequence(service_id: str, source: dict, migrations: list[Migration]) -> None:
+    """Run ``migrations`` in order for one service, recording each success.
+
+    Thread target used by ``run_pending``.
+
+    Args:
+        service_id: Service whose migrations are being applied.
+        source: Passed through unchanged to every migration function.
+        migrations: Migrations to run, already in the desired order.
+
+    Behaviour:
+        * A migration that raises is logged with its traceback, is NOT
+          recorded as applied, and stops the whole sequence.
+        * A migration that succeeds but cannot be recorded is logged as a
+          warning and the sequence continues with the next migration.
+    """
+    from backend.core import metadata_db
+
+    for mig in migrations:
+        # Monotonic clock for elapsed time: migrations can run for minutes,
+        # and a wall-clock adjustment mid-run must not skew (or negate) the
+        # recorded duration.
+        started = time.monotonic()
+        logger.info("[migrations] %s/%s: starting — %s", service_id, mig.name, mig.description)
+        try:
+            notes = mig.fn(service_id, source)
+        except Exception as e:
+            logger.exception(
+                "[migrations] %s/%s: FAILED after %.2fs — will retry next startup: %s",
+                service_id,
+                mig.name,
+                time.monotonic() - started,
+                e,
+            )
+            # Important: do NOT record this migration as applied. Returning
+            # here also halts the sequence — a later migration that depends
+            # on a failed predecessor must not be allowed to run.
+            return
+        duration = time.monotonic() - started
+        try:
+            metadata_db.record_applied_data_migration(
+                service_id, mig.name, duration_s=duration, status="success", notes=notes
+            )
+        except Exception as e:
+            # Recording failed but the migration itself succeeded. Next boot
+            # will re-run it; the migration is idempotent so this is safe,
+            # just wasted work. Loud warning so we can spot the divergence.
+            logger.warning(
+                "[migrations] %s/%s: applied but COULD NOT RECORD (will re-run next boot): %s",
+                service_id,
+                mig.name,
+                e,
+            )
+            continue
+        logger.info("[migrations] %s/%s: applied in %.2fs", service_id, mig.name, duration)
diff --git a/backend/core/duckdb.py b/backend/core/duckdb.py
index 2db7c2c7..cdeefff3 100644
--- a/backend/core/duckdb.py
+++ b/backend/core/duckdb.py
@@ -38,19 +38,7 @@
_ORPHAN_THRESHOLD_MINS = 5
-def _safe_iso(dt) -> str | None:
- """Normalise a DuckDB datetime or string to an ISO-8601 string ending in Z."""
- if dt is None:
- return None
- if hasattr(dt, "isoformat"):
- s = dt.isoformat()
- # DuckDB TIMESTAMP is timezone-naive but always represents UTC.
- # Append Z so JavaScript parses it as UTC instead of local time.
- if not s.endswith("Z") and "+" not in s and s.count("-") <= 2:
- s += "Z"
- return s
- return str(dt)
-
+from backend.utils.date_utils import safe_iso as _safe_iso # noqa: E402
# Cached per-process constants — computed once, reused on every connection open.
_cached_n_threads: int | None = None
@@ -258,28 +246,46 @@ def _configure_fos(con: duckdb.DuckDBPyConnection, source: dict):
# nested in CREATE SECRET, so the keys go in as a literal SQL
# fragment. Keys are a hardcoded set, never user input.
hdr_map_sql = "MAP {" + ", ".join(f"'{k}': ?" for k in headers) + "}"
- with _fos_proxy_secret_lock:
- con.execute(
- f"""
- CREATE OR REPLACE SECRET fos_proxy (
- TYPE S3,
- KEY_ID ?,
- SECRET ?,
- REGION ?,
- ENDPOINT ?,
- USE_SSL false,
- URL_STYLE 'path',
- EXTRA_HTTP_HEADERS {hdr_map_sql}
- )
- """,
- [
- source["access_key_id"],
- source["secret_access_key"],
- source["region"],
- proxy_ep,
- *headers.values(),
- ],
+ create_secret_sql = f"""
+ CREATE OR REPLACE SECRET fos_proxy (
+ TYPE S3,
+ KEY_ID ?,
+ SECRET ?,
+ REGION ?,
+ ENDPOINT ?,
+ USE_SSL false,
+ URL_STYLE 'path',
+ EXTRA_HTTP_HEADERS {hdr_map_sql}
)
+ """
+ secret_params = [
+ source["access_key_id"],
+ source["secret_access_key"],
+ source["region"],
+ proxy_ep,
+ *headers.values(),
+ ]
+ with _fos_proxy_secret_lock:
+ # _load_httpfs above runs INSTALL/LOAD httpfs, which starts an implicit
+ # transaction with a catalog snapshot taken BEFORE we acquired the lock.
+ # If another thread committed its own CREATE OR REPLACE SECRET while we
+ # were waiting, our stale snapshot trips a write-write conflict even
+ # though only one thread is inside this critical section. Rolling back
+ # discards the stale snapshot so CREATE OR REPLACE sees current catalog
+ # state. The retry handles the rare case where the rollback itself
+ # races with another commit (e.g. a third thread queued behind us).
+ for attempt in range(3):
+ try:
+ con.rollback()
+ except Exception:
+ pass
+ try:
+ con.execute(create_secret_sql, secret_params)
+ break
+ except Exception as e:
+ if "write-write conflict" in str(e).lower() and attempt < 2:
+ continue
+ raise
try:
con.execute("SET http_timeout=60;")
con.execute("SET http_retries=5;")
@@ -787,14 +793,22 @@ def get_connection(
global _cached_n_threads, _cached_mem_limit_gb
if _cached_n_threads is None:
_cached_n_threads = min(multiprocessing.cpu_count(), 8)
- if _cached_mem_limit_gb is None:
- try:
- _total_ram = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
- _cached_mem_limit_gb = max(1, int(_total_ram * 0.6 / (1024**3)))
- except (AttributeError, ValueError):
- _cached_mem_limit_gb = 4
con.execute(f"SET threads = {_cached_n_threads};")
- con.execute(f"SET memory_limit = '{_cached_mem_limit_gb}GB';")
+ # CRITICAL: only auto-derive memory_limit when DUCKDB_MEMORY_LIMIT is
+ # UNSET. Pre-fix, the env-based ``SET max_memory`` at line 762 was
+ # silently overridden here by ``SET memory_limit`` (they're aliases
+ # in DuckDB — the second SET wins). Container env DUCKDB_MEMORY_LIMIT=8GB
+ # was clobbered by ~60% of physical RAM (~9-10GB on the 16GB VM),
+ # leaving only ~6GB headroom for Python + pyiceberg + aiohttp + OS +
+ # frontend + caddy — recurring host OOM-kills followed.
+ if not os.getenv("DUCKDB_MEMORY_LIMIT"):
+ if _cached_mem_limit_gb is None:
+ try:
+ _total_ram = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
+ _cached_mem_limit_gb = max(1, int(_total_ram * 0.6 / (1024**3)))
+ except (AttributeError, ValueError):
+ _cached_mem_limit_gb = 4
+ con.execute(f"SET memory_limit = '{_cached_mem_limit_gb}GB';")
con.execute("SET checkpoint_threshold = '512MB';")
# ALWAYS update the view to ensure local buffer files
@@ -938,9 +952,18 @@ def log_cron_run(
service_id = source["name"]
cfg = svcconfig.load_config(service_id) or {}
prov = cfg.get("provisioning", {})
- cron_key = "cron_sync" if task == "sync" else "cron_compact"
- cron_cfg = prov.get(cron_key, {})
- log_enabled = cron_cfg.get("log_enabled", True)
+ # Map each cron task to the cfg block whose log_enabled flag governs it.
+ # Tasks not in the map always log — the prior ``"cron_sync" if task ==
+ # "sync" else "cron_compact"`` ternary silently coupled metadata_cleanup,
+ # optimize, expire, full_sync, gap_heal, alerts, ngwaf_sync, etc. to
+ # cron_compact's log_enabled. Setting cron_compact.log_enabled=false on
+ # a service would suppress success rows for every task except sync.
+ _TASK_TO_CRON_KEY = {
+ "sync": "cron_sync",
+ "local_compact": "cron_compact",
+ }
+ cron_key = _TASK_TO_CRON_KEY.get(task)
+ log_enabled = prov.get(cron_key, {}).get("log_enabled", True) if cron_key else True
if status == "success" and corrupt_rows and corrupt_rows > 0:
status = "partial_success"
@@ -1117,162 +1140,166 @@ def get_sync_status(
if m:
latest_ingested_file_at = f"{m.group(1)} {m.group(2).replace('-', ':').replace('.', ':')}"
- table_exists = (
- con.execute("SELECT 1 FROM information_schema.tables WHERE table_name = ?", [table_name]).fetchone() is not None
- )
-
# The iceberg view is always the source of truth for row counts.
# We fetch row counts and time extents if the table exists, even if skip_fos=True,
# because these are derived from local metadata (Iceberg manifests) and are
# relatively cheap. This allows the UI to auto-range correctly even during
# lightweight status polls.
+ #
+ # The split-path query inside the try block reads parquet DIRECTLY via
+ # read_parquet() and doesn't need the iceberg view to exist in the
+ # current connection.
+ # This matters because sync-status opens a fresh RO connection that
+ # doesn't yet have the per-session view; without this, every sync-
+ # status poll fell through to ingested_files.row_count (which sums
+ # raw FOS line counts BEFORE the timestamp filter and consistently
+ # over-reports ~2-3×).
latest_log_at = None
earliest_log_at = None
local_rows = local_rows_ingested
- if table_exists:
- try:
- # Fetch row count and time extents. The view is built with
- # read_parquet('cache//data/**/*.parquet') UNION ALL
- # read_parquet([buffer_paths]) — DuckDB opens every parquet
- # footer (~150 µs × 1.7 k data files = ~155 ms warm) plus the
- # cheap buffer side. Split the query: cache the data-side
- # count/min/max keyed by a data-dir mtime fingerprint (only
- # changes on commit/optimize), run the buffer side fresh each
- # call (~1 ms for <100 files), then merge. Cache hits go from
- # ~240 ms full-view query down to ~1 ms (data cached + buffer
- # query + fingerprint stat).
- stats = None
- data_fp = _data_stats_fingerprint(src)
- cache_key = src["name"]
- if data_fp is not None:
- try:
- with _data_stats_cache_lock:
- cached = _data_stats_cache.get(cache_key)
- if cached is not None and cached[0] == data_fp:
- d_count, d_min, d_max = cached[1], cached[2], cached[3]
- else:
- data_glob = os.path.join(_cache_dir(src), "data", "**", "*.parquet")
- d_row = con.execute(
- "SELECT count(*), min(timestamp), max(timestamp) "
- f"FROM read_parquet('{data_glob}', union_by_name=true, hive_partitioning=false)"
- ).fetchone()
- d_count = (d_row[0] or 0) if d_row else 0
- d_min = d_row[1] if d_row else None
- d_max = d_row[2] if d_row else None
- with _data_stats_cache_lock:
- _data_stats_cache[cache_key] = (data_fp, d_count, d_min, d_max)
-
- from backend.core import iceberg as _ice
-
- buf_paths = [p for p in _ice.buffer_files(src) if os.path.isfile(p)]
- if buf_paths:
- paths_sql = ", ".join(f"'{p}'" for p in buf_paths)
- b_row = con.execute(
- "SELECT count(*), min(timestamp), max(timestamp) "
- f"FROM read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)"
- ).fetchone()
- b_count = (b_row[0] or 0) if b_row else 0
- b_min = b_row[1] if b_row else None
- b_max = b_row[2] if b_row else None
- else:
- b_count, b_min, b_max = 0, None, None
-
- mins = [m for m in (d_min, b_min) if m is not None]
- maxs = [m for m in (d_max, b_max) if m is not None]
- stats = (
- d_count + b_count,
- min(mins) if mins else None,
- max(maxs) if maxs else None,
- )
- except Exception as split_err:
- # Bust the data cache so we don't pin a half-built result.
+ try:
+ # Fetch row count and time extents. The view is built with
+ # read_parquet('cache//data/**/*.parquet') UNION ALL
+ # read_parquet([buffer_paths]) — DuckDB opens every parquet
+ # footer (~150 µs × 1.7 k data files = ~155 ms warm) plus the
+ # cheap buffer side. Split the query: cache the data-side
+ # count/min/max keyed by a data-dir mtime fingerprint (only
+ # changes on commit/optimize), run the buffer side fresh each
+ # call (~1 ms for <100 files), then merge. Cache hits go from
+ # ~240 ms full-view query down to ~1 ms (data cached + buffer
+ # query + fingerprint stat).
+ stats = None
+ data_fp = _data_stats_fingerprint(src)
+ cache_key = src["name"]
+ if data_fp is not None:
+ try:
+ with _data_stats_cache_lock:
+ cached = _data_stats_cache.get(cache_key)
+ if cached is not None and cached[0] == data_fp:
+ d_count, d_min, d_max = cached[1], cached[2], cached[3]
+ else:
+ data_glob = os.path.join(_cache_dir(src), "data", "**", "*.parquet")
+ d_row = con.execute(
+ "SELECT count(*), min(timestamp), max(timestamp) "
+ f"FROM read_parquet('{data_glob}', union_by_name=true, hive_partitioning=false)"
+ ).fetchone()
+ d_count = (d_row[0] or 0) if d_row else 0
+ d_min = d_row[1] if d_row else None
+ d_max = d_row[2] if d_row else None
with _data_stats_cache_lock:
- _data_stats_cache.pop(cache_key, None)
- # Stale-cache failure modes ("No files found", missing
- # catalog entries) must flow to the outer view-rebuild
- # handler below — the cure is the same. Re-raise here
- # rather than swallowing, so the existing recovery path
- # still triggers clear_source_caches+update_iceberg_view.
- err_str = str(split_err)
- if (
- "No files found" in err_str
- or "Catalog Error: Table with name" in err_str
- or "does not exist" in err_str
- or "No such file or directory" in err_str
- ):
- raise
- logger.debug("[sync-status] split-stats query failed, falling back to view: %s", split_err)
-
- if stats is None:
+ _data_stats_cache[cache_key] = (data_fp, d_count, d_min, d_max)
+
+ from backend.core import iceberg as _ice
+
+ buf_paths = [p for p in _ice.buffer_files(src) if os.path.isfile(p)]
+ if buf_paths:
+ paths_sql = ", ".join(f"'{p}'" for p in buf_paths)
+ b_row = con.execute(
+ "SELECT count(*), min(timestamp), max(timestamp) "
+ f"FROM read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)"
+ ).fetchone()
+ b_count = (b_row[0] or 0) if b_row else 0
+ b_min = b_row[1] if b_row else None
+ b_max = b_row[2] if b_row else None
+ else:
+ b_count, b_min, b_max = 0, None, None
+
+ mins = [m for m in (d_min, b_min) if m is not None]
+ maxs = [m for m in (d_max, b_max) if m is not None]
+ stats = (
+ d_count + b_count,
+ min(mins) if mins else None,
+ max(maxs) if maxs else None,
+ )
+ except Exception as split_err:
+ # Bust the data cache so we don't pin a half-built result.
+ with _data_stats_cache_lock:
+ _data_stats_cache.pop(cache_key, None)
+ # Stale-cache failure modes ("No files found", missing
+ # catalog entries) must flow to the outer view-rebuild
+ # handler below — the cure is the same. Re-raise here
+ # rather than swallowing, so the existing recovery path
+ # still triggers clear_source_caches+update_iceberg_view.
+ err_str = str(split_err)
+ if (
+ "No files found" in err_str
+ or "Catalog Error: Table with name" in err_str
+ or "does not exist" in err_str
+ or "No such file or directory" in err_str
+ ):
+ raise
+ logger.debug("[sync-status] split-stats query failed, falling back to view: %s", split_err)
+
+ if stats is None:
+ stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone()
+ if stats:
+ view_rows = stats[0] if stats[0] is not None else 0
+ # When the view returns a real (non-zero) count, trust it
+ # as the source of truth — it reflects the rows actually
+ # queryable in Iceberg. ingested_files.row_count records
+ # the raw JSON line count from each FOS file BEFORE the
+ # `WHERE timestamp IS NOT NULL` filter and any time-range
+ # filter, and never reflects post-compaction dedup, so it
+ # consistently over-reports. Only fall back when the view
+ # itself is empty (the "WHERE false" transient-failure
+ # fallback) — there we degrade to the metadata sum so the
+ # header doesn't read 0 while we have data on disk.
+ if view_rows > 0:
+ local_rows = view_rows
+ earliest_log_at = stats[1]
+ latest_log_at = stats[2]
+ else:
+ local_rows = local_rows_ingested
+ except Exception as e:
+ if (
+ "No files found" in str(e)
+ or "Catalog Error: Table with name" in str(e)
+ or "does not exist" in str(e)
+ or "No such file or directory" in str(e)
+ ):
+ try:
+ from backend.core import iceberg
+
+ # Bust the cached view SQL FIRST. Without this, when ingest
+ # is mid-commit and holding the per-service lock,
+ # update_iceberg_view falls back to executing the cached
+ # SQL — which is exactly the stale SQL that referenced
+ # the missing parquet, looping us right back into the same
+ # error. Clearing the cache forces a real rebuild on the
+ # next view-update window (possibly the next poll).
+ #
+ # ``keep_snapshot_cache=True``: do NOT also wipe the
+ # snapshot/path cache. If we wipe both, then a transient
+ # catalog-load failure (FOS rate limit, network blip)
+ # causes update_iceberg_view to fall through to its
+ # empty-view branch — "WHERE false" — which then sticks
+ # in _view_cache and shows the user "Total Logs: 0"
+ # despite millions of rows being in the table.
+ iceberg.clear_source_caches(src.get("name", "default"), keep_snapshot_cache=True)
+ iceberg.update_iceberg_view(con, src)
stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone()
- if stats:
- view_rows = stats[0] if stats[0] is not None else 0
- # When the view returns a real (non-zero) count, trust it
- # as the source of truth — it reflects the rows actually
- # queryable in Iceberg. ingested_files.row_count records
- # the raw JSON line count from each FOS file BEFORE the
- # `WHERE timestamp IS NOT NULL` filter and any time-range
- # filter, and never reflects post-compaction dedup, so it
- # consistently over-reports. Only fall back when the view
- # itself is empty (the "WHERE false" transient-failure
- # fallback) — there we degrade to the metadata sum so the
- # header doesn't read 0 while we have data on disk.
- if view_rows > 0:
- local_rows = view_rows
+ if stats:
+ local_rows = stats[0] if stats[0] is not None else 0
earliest_log_at = stats[1]
latest_log_at = stats[2]
- else:
- local_rows = local_rows_ingested
- except Exception as e:
- if (
- "No files found" in str(e)
- or "Catalog Error: Table with name" in str(e)
- or "does not exist" in str(e)
- or "No such file or directory" in str(e)
- ):
- try:
- from backend.core import iceberg
-
- # Bust the cached view SQL FIRST. Without this, when ingest
- # is mid-commit and holding the per-service lock,
- # update_iceberg_view falls back to executing the cached
- # SQL — which is exactly the stale SQL that referenced
- # the missing parquet, looping us right back into the same
- # error. Clearing the cache forces a real rebuild on the
- # next view-update window (possibly the next poll).
- #
- # ``keep_snapshot_cache=True``: do NOT also wipe the
- # snapshot/path cache. If we wipe both, then a transient
- # catalog-load failure (FOS rate limit, network blip)
- # causes update_iceberg_view to fall through to its
- # empty-view branch — "WHERE false" — which then sticks
- # in _view_cache and shows the user "Total Logs: 0"
- # despite millions of rows being in the table.
- iceberg.clear_source_caches(src.get("name", "default"), keep_snapshot_cache=True)
- iceberg.update_iceberg_view(con, src)
- stats = con.execute(f"SELECT count(*), min(timestamp), max(timestamp) FROM {table_name}").fetchone()
- if stats:
- local_rows = stats[0] if stats[0] is not None else 0
- earliest_log_at = stats[1]
- latest_log_at = stats[2]
- except Exception as retry_e:
- # The fallback to ``local_rows_ingested`` below is the
- # designed degradation path — when the cache is mid-
- # rebuild and we couldn't acquire the lock, ``local_rows``
- # still reflects the row count we tracked at ingest time.
- # Demoted from print/warning to debug because the cascade
- # spams stderr on every sync-status poll until ingest
- # releases the lock; the bust above breaks the loop on
- # the next attempt regardless.
- logger.debug("[sync-status] log stats unavailable mid-rebuild: %s", retry_e)
- local_rows = local_rows_ingested
- else:
- # Unexpected exception — this one is worth keeping as a
- # warning since it doesn't match any of the known "stale
- # cache" patterns above and the fallback may hide real bugs.
- logger.warning("[sync-status] Failed to get log stats from view: %s", e)
+ except Exception as retry_e:
+ # The fallback to ``local_rows_ingested`` below is the
+ # designed degradation path — when the cache is mid-
+ # rebuild and we couldn't acquire the lock, ``local_rows``
+ # still reflects the row count we tracked at ingest time.
+ # Demoted from print/warning to debug because the cascade
+ # spams stderr on every sync-status poll until ingest
+ # releases the lock; the bust above breaks the loop on
+ # the next attempt regardless.
+ logger.debug("[sync-status] log stats unavailable mid-rebuild: %s", retry_e)
local_rows = local_rows_ingested
+ else:
+ # Unexpected exception — this one is worth keeping as a
+ # warning since it doesn't match any of the known "stale
+ # cache" patterns above and the fallback may hide real bugs.
+ logger.warning("[sync-status] Failed to get log stats from view: %s", e)
+ local_rows = local_rows_ingested
# Latest available filename mirrors latest_file_name since FOS LIST is
# not consulted here (comment above explains why). Reuse the summary's
diff --git a/backend/core/duckdb_pool.py b/backend/core/duckdb_pool.py
new file mode 100644
index 00000000..f8a90797
--- /dev/null
+++ b/backend/core/duckdb_pool.py
@@ -0,0 +1,386 @@
+"""Per-service DuckDB connection pool.
+
+Each API request previously opened a fresh DuckDB connection, ran ~10 PRAGMAs,
+configured S3 + iceberg, and called ``update_iceberg_view`` to bind the per-
+service view onto the new connection. Steady-state cost was ~50ms of setup
+plus another ~45ms on first-query overhead — paid by every request.
+
+This module caches read-only connections in a per-service pool. A request
+checks out a fully-configured connection, runs its queries, then returns it.
+The view binding is re-validated on checkout via the existing fast-path
+fingerprint (``_view_cache``); a cache hit is a few-µs dict lookup, so the
+hot path checkout is genuinely cheap.
+
+The pool is enabled by default; the ``DUCKDB_CONNECTION_POOL`` env var is the
+off switch — set it to ``"0"`` (or ``false``/``no``/``off``) to fall back to the
+always-fresh-connection path. It exists primarily so tests and ops have an
+emergency switch if a pooling regression slips through.
+
+Lifecycle:
+ * Pool is created lazily on first checkout for a service.
+ * Idle connections are stored in a LIFO queue (recently-used first, so the
+ OS page cache stays hot on the file descriptors that are currently warm).
+ * Pool size is bounded by ``max_size`` (default 8 per service). When the
+ pool is empty and ``in_use < max_size``, the next checkout creates a new
+ connection. When ``in_use == max_size``, waiters block on a Condition.
+ * If a request returns a connection that errored mid-query, the connection
+ is discarded (closed) rather than returned to the pool — the next
+ checkout creates a fresh one.
+ * On checkin, we DROP any temp tables the request created (sweep against
+ ``duckdb_tables()``) so a long-lived pool connection doesn't slowly
+ accumulate state across requests. A leaked temp table from a prior
+ request would otherwise show up as ``CATALOG ENTRY ALREADY EXISTS`` if a
+ later request happened to pick the same uuid (improbable, but
+ deterministic at scale).
+
+Concurrency:
+ * Multiple connections to the same DuckDB file on the same process are safe
+ — they share the in-memory database state.
+ * Read-only + read-only across pool connections is fine.
+ * Read-only pool + one read-write writer (ingest) is the project's existing
+ contract; ``get_connection`` already handles ``DBBusyError`` retries.
+
+Failure handling:
+ * If view rebind fails on checkout, we discard the connection and try a
+ fresh one. After ``max_retries`` consecutive failures we surface
+ ``DBBusyError`` to the caller (which becomes a 503 in deps.py).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import queue
+import threading
+import time
+from contextlib import contextmanager
+
+import duckdb
+
+logger = logging.getLogger(__name__)
+
+
+def _pool_enabled() -> bool:
+ return os.getenv("DUCKDB_CONNECTION_POOL", "1").lower() not in ("0", "false", "no", "off")
+
+
+def _pool_max_size() -> int:
+ raw = os.getenv("DUCKDB_POOL_MAX_SIZE", "8")
+ try:
+ return max(1, int(raw))
+ except (TypeError, ValueError):
+ return 8
+
+
+# Per-connection state tracking. DuckDB connection objects are slotted
+# C types — they don't accept arbitrary attribute assignment — so we
+# keep our metadata in a module-level dict keyed by id(con). Entries are
+# cleared when the connection is closed/discarded.
+#
+# Fingerprint = id() of the ``_view_cache`` tuple at the time the view
+# was last bound to this connection. The tuple is replaced (not mutated)
+# when the cache rotates, so identity is a sufficient fresh-check.
+_conn_state: dict[int, dict] = {}
+_conn_state_lock = threading.Lock()
+
+
+def _set_conn_state(con: duckdb.DuckDBPyConnection, **kv) -> None:
+ with _conn_state_lock:
+ state = _conn_state.setdefault(id(con), {})
+ state.update(kv)
+
+
+def _get_conn_state(con: duckdb.DuckDBPyConnection, key: str, default=None):
+ with _conn_state_lock:
+ return _conn_state.get(id(con), {}).get(key, default)
+
+
+def _forget_conn(con: duckdb.DuckDBPyConnection) -> None:
+ with _conn_state_lock:
+ _conn_state.pop(id(con), None)
+
+
+def _safe_buffer_mtime(src: dict | None) -> float | None:
+ """Return mtime of the service's buffer dir, or None if it can't be read.
+
+ Used as part of the pool's checkout fingerprint so that the sync cron
+ removing buffer parquet files (without touching ``_view_cache``) still
+ invalidates pooled connections. Any add/remove inside the dir bumps the
+ dir's own mtime — so a single stat is enough.
+ """
+ if src is None:
+ return None
+ try:
+ from backend.core.iceberg import _buffer_dir
+
+ path = _buffer_dir(src)
+ return os.path.getmtime(path)
+ except Exception:
+ return None
+
+
+class _Pool:
+ """Per-service pool. Not exposed directly — use ``checkout_connection``."""
+
+ def __init__(self, service_key: str, max_size: int):
+ self.service_key = service_key
+ self.max_size = max_size
+ # LIFO so the most-recently-used connection (warmest in any OS / DuckDB
+ # internal caches) is the next checkout.
+ self._idle: queue.LifoQueue = queue.LifoQueue(maxsize=max_size)
+ self._lock = threading.Lock()
+ # ``in_use`` is the count of connections currently checked out plus
+ # connections idle in the queue. Bounded by ``max_size``.
+ self._in_use = 0
+ self._cond = threading.Condition(self._lock)
+ # Cumulative counters for diagnostics — exposed via ``stats()``.
+ self._created_total = 0
+ self._reused_total = 0
+ self._discarded_total = 0
+
+ def acquire(self, src: dict, max_wait: float) -> duckdb.DuckDBPyConnection:
+     """Check out a connection; raise ``_PoolBusy`` if saturated past ``max_wait`` seconds."""
+     deadline = time.monotonic() + max_wait
+     idle_con = None
+     with self._cond:
+         while True:
+             # Fast path: idle connection available
+             try:
+                 idle_con = self._idle.get_nowait()
+                 self._reused_total += 1
+                 break  # revalidate below, after the lock is released
+             except queue.Empty:
+                 pass
+             # Capacity available: build a new one outside the lock
+             if self._in_use < self.max_size:
+                 self._in_use += 1
+                 self._created_total += 1
+                 break  # fall through to the unlocked build path
+             # Saturated: wait for a return
+             remaining = deadline - time.monotonic()
+             if remaining <= 0:
+                 raise _PoolBusy(f"pool for {self.service_key} saturated at {self.max_size}")
+             self._cond.wait(timeout=remaining)
+     if idle_con is not None:
+         # Must run WITHOUT the pool lock held: on failure _prepare_checkout
+         # calls _discard, which re-acquires the non-reentrant lock (deadlock).
+         return self._prepare_checkout(idle_con, src)
+     # Outside lock: build fresh. _in_use was already incremented; if the
+     # build raises we MUST decrement and notify a waiter, hence the try.
+     try:
+         from backend.core.duckdb import get_connection
+         con = get_connection(source=src, read_only=True, max_wait=max_wait)
+         _set_conn_state(con, service_key=self.service_key)
+         self._stamp_fingerprint(con, src)
+         return con
+     except Exception:
+         with self._cond:
+             self._in_use -= 1
+             self._cond.notify()
+         raise
+
+ def release(self, con: duckdb.DuckDBPyConnection, *, errored: bool = False) -> None:
+ """Return a connection to the pool. Pass ``errored=True`` to discard
+ instead — the next checkout will build fresh."""
+ if errored:
+ self._discard(con)
+ return
+ try:
+ self._cleanup_temp_tables(con)
+ except Exception as e:
+ # Cleanup failure means the connection is in unknown state — discard.
+ logger.debug("[pool] %s: cleanup failed, discarding: %s", self.service_key, e)
+ self._discard(con)
+ return
+ with self._cond:
+ try:
+ self._idle.put_nowait(con)
+ self._cond.notify()
+ return
+ except queue.Full:
+ # Pool already at max idle (shouldn't happen given in_use cap,
+ # but defensive). Close this one and free the slot.
+ pass
+ # Outside lock: close
+ try:
+ con.close()
+ except Exception:
+ pass
+ with self._cond:
+ self._in_use -= 1
+ self._cond.notify()
+
+ def _discard(self, con: duckdb.DuckDBPyConnection) -> None:
+ _forget_conn(con)
+ try:
+ con.close()
+ except Exception:
+ pass
+ with self._cond:
+ self._in_use -= 1
+ self._discarded_total += 1
+ self._cond.notify()
+
+ def _prepare_checkout(self, con: duckdb.DuckDBPyConnection, src: dict) -> duckdb.DuckDBPyConnection:
+ """Re-validate the view binding before handing the connection out.
+
+ Two checks make up the fingerprint:
+
+ 1. id() of the iceberg ``_view_cache`` tuple for this service.
+ The tuple is replaced (not mutated) when the cache rotates, so
+ identity is a sufficient check that the SQL we'd bind matches
+ what we bound last time.
+
+ 2. mtime of the buffer directory. The sync cron's commit step
+ DELETES buffer parquet files without calling update_iceberg_view —
+ so the view-cache tuple keeps looking "fresh" while the files
+ it references are gone. mtime catches that: any add/remove in
+ the dir bumps it. Cost ~1 syscall (~µs).
+
+ If either differs from what we last stamped, rebind. If the rebind
+ fails, discard the connection and let the caller retry.
+ """
+ try:
+ from backend.core import iceberg
+
+ current = iceberg._view_cache.get(self.service_key)
+ stamped_view = _get_conn_state(con, "view_fingerprint")
+ stamped_buf = _get_conn_state(con, "buffer_mtime")
+ current_buf = _safe_buffer_mtime(src)
+ if (
+ current is not None
+ and id(current) == stamped_view
+ and current_buf == stamped_buf
+ ):
+ # View AND underlying buffer set match what we bound last
+ # time — nothing to do.
+ return con
+ iceberg.update_iceberg_view(con, src)
+ self._stamp_fingerprint(con, src)
+ return con
+ except Exception as e:
+ logger.warning("[pool] %s: view refresh on checkout failed, discarding: %s", self.service_key, e)
+ self._discard(con)
+ raise
+
+ def _stamp_fingerprint(self, con: duckdb.DuckDBPyConnection, src: dict | None = None) -> None:
+ try:
+ from backend.core import iceberg
+
+ current = iceberg._view_cache.get(self.service_key)
+ buf_mtime = _safe_buffer_mtime(src) if src is not None else None
+ _set_conn_state(
+ con,
+ view_fingerprint=id(current) if current is not None else None,
+ buffer_mtime=buf_mtime,
+ )
+ except Exception:
+ _set_conn_state(con, view_fingerprint=None, buffer_mtime=None)
+
+ def _cleanup_temp_tables(self, con: duckdb.DuckDBPyConnection) -> None:
+ """Drop any ``t_``-style temp tables left behind by repositories.
+ A ``temp_table`` context manager that exits cleanly does the DROP
+ itself; this is belt-and-suspenders for the failure paths."""
+ try:
+ rows = con.execute(
+ "SELECT table_name FROM duckdb_tables() "
+ "WHERE schema_name = 'main' AND temporary = true"
+ ).fetchall()
+ except Exception:
+ return
+ for (name,) in rows:
+ try:
+ con.execute(f"DROP TABLE IF EXISTS {name}")
+ except Exception:
+ # Best-effort — if a single table fails to drop, keep going.
+ pass
+
+ def stats(self) -> dict:
+ with self._cond:
+ return {
+ "service": self.service_key,
+ "max_size": self.max_size,
+ "in_use": self._in_use,
+ "idle": self._idle.qsize(),
+ "created_total": self._created_total,
+ "reused_total": self._reused_total,
+ "discarded_total": self._discarded_total,
+ }
+
+
+class _PoolBusy(Exception):
+ """Raised when the pool is saturated and the wait deadline elapsed."""
+
+
+_pools: dict[str, _Pool] = {}
+_pools_lock = threading.Lock()
+
+
+def _get_pool(service_key: str, max_size: int | None = None) -> _Pool:
+ if max_size is None:
+ max_size = _pool_max_size()
+ with _pools_lock:
+ pool = _pools.get(service_key)
+ if pool is None:
+ pool = _Pool(service_key, max_size=max_size)
+ _pools[service_key] = pool
+ return pool
+
+
+@contextmanager
+def checkout_connection(src: dict, max_wait: float = 10.0):
+ """Yield a fully-configured DuckDB connection from the per-service pool.
+
+ Falls back to the legacy always-fresh path when ``DUCKDB_CONNECTION_POOL``
+ is disabled. Returns the connection to the pool on clean exit; discards
+ it on any exception so a poisoned connection doesn't get reused.
+ """
+ if not _pool_enabled():
+ from backend.core.duckdb import get_connection
+
+ con = get_connection(source=src, read_only=True, max_wait=max_wait)
+ try:
+ yield con
+ finally:
+ try:
+ con.close()
+ except Exception:
+ pass
+ return
+
+ service_key = src.get("name") or src.get("service_id") or "default"
+ pool = _get_pool(service_key)
+ con = pool.acquire(src, max_wait=max_wait)
+ errored = False
+ try:
+ yield con
+ except Exception:
+ errored = True
+ raise
+ finally:
+ pool.release(con, errored=errored)
+
+
+def get_all_stats() -> list[dict]:
+ """Diagnostics: return current pool state for every service."""
+ with _pools_lock:
+ return [pool.stats() for pool in _pools.values()]
+
+
+def shutdown_all() -> None:
+ """Close every idle connection across every pool. Called on app shutdown
+ so DuckDB releases its file handles cleanly."""
+ with _pools_lock:
+ pools = list(_pools.values())
+ _pools.clear()
+ for pool in pools:
+ while True:
+ try:
+ con = pool._idle.get_nowait()
+ except queue.Empty:
+ break
+ _forget_conn(con)
+ try:
+ con.close()
+ except Exception:
+ pass
diff --git a/backend/core/fastly/service.py b/backend/core/fastly/service.py
index 5e6b249e..5e6b16bb 100644
--- a/backend/core/fastly/service.py
+++ b/backend/core/fastly/service.py
@@ -25,22 +25,6 @@ def find_service_by_name(name: str, token: str) -> dict | None:
return None
-def find_dictionary_by_name(service_id: str, version: int, name: str, token: str) -> dict | None:
- try:
- dicts = fastly("GET", f"/service/{service_id}/version/{version}/dictionary", token=token)
- for d in dicts:
- if d.get("name") == name:
- return d
- except RuntimeError:
- pass
- return None
-
-
-def upsert_dictionary_items(service_id: str, dictionary_id: str, items: dict[str, str], token: str):
- payload = {"items": [{"item_key": k, "item_value": v} for k, v in items.items()]}
- return fastly("PATCH", f"/service/{service_id}/dictionary/{dictionary_id}/items", payload, token=token)
-
-
def find_condition(name: str, service_id: str, version: int, token: str) -> dict | None:
try:
conditions = fastly("GET", f"/service/{service_id}/version/{version}/condition", token=token)
diff --git a/backend/core/fastly/utils.py b/backend/core/fastly/utils.py
index a346d72e..e80eb238 100644
--- a/backend/core/fastly/utils.py
+++ b/backend/core/fastly/utils.py
@@ -1,5 +1,6 @@
import argparse
import re
+import secrets
# Candidate field names on Fastly's /stats/service response that carry the
# "log lines emitted" counter. Ordered: most-likely first. If all four miss
@@ -148,8 +149,25 @@ def load_vcl(rate_limiting: bool = True) -> str:
set req.http.Fastly-Client-IP = client.ip;
}
- # Block requests that do not provide the correct secret key (purges are exempt)
- if (req.method != "FASTLYPURGE" && req.restarts == 0 && fastly.ff.visits_this_service == 0 && subfield(req.url.qs, "key", "&") != table.lookup(cdn_auth, "secret", "") && req.http.x-fastly-key != table.lookup(cdn_auth, "secret", "")) {
+ # Handle FASTLYPURGE natively. Without this, an unsigned purge on a
+ # cache miss is forwarded to the FOS origin, which returns 403 — and
+ # Fastly caches that 403 for the object's TTL. An attacker can poison
+ # the cache for legitimate clients by issuing purges against arbitrary
+ # keys. ``return(purge)`` short-circuits the pipeline before any
+ # backend fetch happens.
+ if (req.method == "FASTLYPURGE") {
+ return(purge);
+ }
+
+ # Block requests that do not provide the correct secret key.
+ # NOTE on the auth fallback: the third argument to ``table.lookup`` is
+ # returned when ``cdn_auth.secret`` is absent from the edge dictionary.
+ # Defaulting to ``""`` is fail-open — an attacker who sends an empty
+ # ``key`` query param trivially matches. ``__FALLBACK_SECRET__`` is
+ # substituted in load_vcl() with ``secrets.token_hex(32)``, which is
+ # never knowable to an attacker and therefore fails closed when the
+ # dictionary is unprovisioned.
+ if (req.restarts == 0 && fastly.ff.visits_this_service == 0 && subfield(req.url.qs, "key", "&") != table.lookup(cdn_auth, "secret", "__FALLBACK_SECRET__") && req.http.x-fastly-key != table.lookup(cdn_auth, "secret", "__FALLBACK_SECRET__")) {
#RATELIMIT_BEGIN
declare local var.last_minute INTEGER;
set var.last_minute = ratelimit.ratecounter_increment(auth_fail_rc, req.http.Fastly-Client-IP, 1);
@@ -171,8 +189,26 @@ def load_vcl(rate_limiting: bool = True) -> str:
set req.enable_segmented_caching = true;
set segmented_caching.block_size = 20971520; # 20 MB, the maximum
- # Strip only the key from the URL before forwarding to Fastly Object Storage
- set req.url = querystring.filter(req.url, "key");
+ # Cache-key hardening (post-auth — auth check above still reads the
+ # `key` qs param from the original req.url):
+ # 1. querystring.filter_except keeps ONLY the S3-API parameters the
+ # FOS origin actually understands and strips everything else
+ # (including our auth `key` secret, any caller-injected tracking
+ # params, marketing UTM params, session IDs, etc.). Unexpected
+ # params no longer fracture the cache or leak into req.hash.
+ # 2. querystring.sort canonicalises the remaining param order so
+ # `?prefix=foo&max-keys=10` and `?max-keys=10&prefix=foo` resolve
+ # to one cache entry instead of two.
+ # Allow-list rationale (S3 API surface FOS exposes):
+ # - List objects v2: list-type, prefix, delimiter, continuation-token,
+ # start-after, max-keys, encoding-type, fetch-owner
+ # - List objects v1: marker
+ # - Get object: versionId, partNumber, response-content-type,
+ # response-content-disposition, response-cache-control
+ # Anything else is silently dropped. If a legitimate S3 param needs to
+ # pass through later, add it to this list and re-deploy.
+ set req.url = querystring.filter_except(req.url, "list-type,prefix,delimiter,continuation-token,start-after,max-keys,encoding-type,fetch-owner,marker,versionId,partNumber,response-content-type,response-content-disposition,response-cache-control");
+ set req.url = querystring.sort(req.url);
# Never cache admin_state.json — it changes on every mutation
if (req.url ~ "/iceberg/meta/admin_state\\.json$") {
@@ -195,7 +231,19 @@ def load_vcl(rate_limiting: bool = True) -> str:
return(lookup);
}
sub vcl_hash {
- set req.hash += req.url.path;
+ # Security: hash on the full URL (path + query string), not just
+ # req.url.path. Before this fix, two requests that differed only in
+ # query parameters (e.g. ListObjectsV2 with different ?prefix= values,
+ # or ?versionId= variants) shared a single cache entry — the second
+ # caller would receive the first caller's object listing. The CDN
+ # auth `key` querystring has already been stripped from req.url by
+ # the querystring.filter_except in vcl_recv, AND remaining params are
+ # sorted by querystring.sort, so the cache key (a) does NOT include
+ # the secret and (b) is normalised across param-order variants.
+ # Expect a one-time cache-hit-rate dip + origin egress spike on
+ # rollout while prior entries are stranded; the canary monitors
+ # those signals and auto-rolls back if they exceed v6 §6 thresholds.
+ set req.hash += req.url;
set req.hash += req.http.host;
#FASTLY hash
return(hash);
@@ -271,4 +319,12 @@ def load_vcl(rate_limiting: bool = True) -> str:
}"""
if not rate_limiting:
vcl = re.sub(r"\s*#RATELIMIT_BEGIN.*?#RATELIMIT_END", "", vcl, flags=re.DOTALL)
+ # Substitute the placeholder with a fresh random fallback secret so
+ # that when ``cdn_auth.secret`` is missing from the edge dictionary,
+ # the lookup returns an unguessable value and the auth check fails
+ # closed instead of allowing empty-key requests through. A new secret
+ # per load_vcl() call is fine: real auth uses the dictionary value
+ # (this fallback is never matched in steady state).
+ fallback_secret = secrets.token_hex(32)
+ vcl = vcl.replace("__FALLBACK_SECRET__", fallback_secret)
return vcl
diff --git a/backend/core/iceberg.py b/backend/core/iceberg.py
index 870a2b14..33c0bc2e 100644
--- a/backend/core/iceberg.py
+++ b/backend/core/iceberg.py
@@ -525,6 +525,7 @@ def _patched_open(self, path, mode="rb", **kwargs):
)
from backend.core.log_fields import LOG_FIELD_CATALOG
+from backend.utils.sql_validator import escape_sql_literal
# ---------------------------------------------------------------------------
# Iceberg Schema — derived from LOG_FIELD_CATALOG (single source of truth).
@@ -1095,11 +1096,14 @@ def _read_metadata_pointer(source: dict, identifier: tuple) -> str | None:
try:
from backend.core.duckdb import _get_fos_client
+ from backend.models.lake import _safe_cdn_url
s3 = _get_fos_client(source)
bucket = source["bucket"]
base_prefix = source.get("prefix", "").strip("/")
- cdn_url = (source.get("cdn_url") or "").rstrip("/")
+ # SSRF guard: only follow ``cdn_url`` when it parses as an https
+ # Fastly hostname. Otherwise fall through to the S3 SDK.
+ cdn_url = _safe_cdn_url((source.get("cdn_url") or "").rstrip("/"))
cdn_secret = source.get("cdn_secret") or ""
iceberg_root = f"{base_prefix}/iceberg" if base_prefix else "iceberg"
@@ -1807,7 +1811,7 @@ def optimize_table(source: dict, target_file_size_mb: int = 128, min_files_per_p
# Use DuckDB to read only these files (most efficient)
paths = [f.file_path for f in files]
- paths_sql = ", ".join(f"'{p}'" for p in paths)
+ paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in paths)
try:
# Read into PyArrow. Must materialise to a Table — pyiceberg's
@@ -2397,10 +2401,10 @@ def configure_duckdb_s3(con) -> None:
unmatched URLs and silently bypass telemetry.
"""
try:
- con.execute("INSTALL iceberg; INSTALL avro; INSTALL httpfs; INSTALL parquet;")
con.execute("LOAD iceberg; LOAD avro; LOAD httpfs; LOAD parquet;")
except Exception:
try:
+ con.execute("INSTALL iceberg; INSTALL avro; INSTALL httpfs; INSTALL parquet;")
con.execute("LOAD iceberg; LOAD avro; LOAD httpfs; LOAD parquet;")
except Exception:
pass
@@ -2724,13 +2728,24 @@ def get_last_view_stats(source: dict) -> dict:
def inject_view_debug(debug_list: list, source: dict):
stats = get_last_view_stats(source)
if stats and stats.get("sql"):
+ # Apply the same path-list compaction as the per-query recorder
+ # in repositories/_base. The view-build SQL is the WORST offender
+ # because it inlines every buffer file twice (in the UNION ALL
+ # RHS) — pre-compaction it accounted for ~30 KB on its own in
+ # the dashboard response.
+ from backend.repositories._base import _compact_sql_for_debug
+
mode = (
"FAST PATH (Local Cache / Buffer Match)"
if stats.get("was_fast_path")
else "SLOW PATH (S3 Read / Manifest Resolve)"
)
debug_list.insert(
- 0, {"sql": f"-- DuckDB Iceberg View Resolution [{mode}] --\n{stats['sql']}", "time_ms": stats["time_ms"]}
+ 0,
+ {
+ "sql": _compact_sql_for_debug(f"-- DuckDB Iceberg View Resolution [{mode}] --\n{stats['sql']}"),
+ "time_ms": stats["time_ms"],
+ },
)
@@ -2848,7 +2863,7 @@ def _rebuild_locked(con, source: dict, source_key: str) -> None:
del _rebuild_signals[source_key]
-def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0) -> None:
+def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0, force: bool = False) -> None:
"""Refresh the per-service DuckDB view over the Iceberg table + buffer.
``lock_timeout`` (default 5s) caps how long we wait on the per-service
@@ -2860,12 +2875,23 @@ def update_iceberg_view(con, source: dict, lock_timeout: float = 5.0) -> None:
match the pattern …/buffer/batch_*.parquet`` on the next read. Five
seconds is long enough to outlast a typical commit without making
sync-status polls feel sticky.
+
+ ``force=True`` skips the lock-free fast path and goes straight to a
+ full rebuild under the lock. The QueryRunner self-heal path uses
+ this: when a query already failed with a stale-view IOException,
+ the fast path can't help — its buf_set check might match cached
+ state that's still inconsistent with what the DuckDB query planner
+ just saw on disk, OR (the symptom-from-prod) the cached view SQL
+ has hardcoded file paths and re-executing it just re-binds the same
+ bad SQL. Force-rebuild reads disk fresh under the lock and
+ regenerates the SQL.
"""
source_key = source.get("name", "default")
# Lock-free fast path first. Parallel dashboard reads (6+ endpoints
# per page load) only need the lock when a real rebuild is required.
- if _try_fast_path_view(con, source):
+ # Skipped on ``force=True`` (see self-heal path in QueryRunner).
+ if not force and _try_fast_path_view(con, source):
return
lock = _get_service_lock(source_key)
@@ -2982,8 +3008,7 @@ def _update_iceberg_view_locked(con, source: dict) -> None:
dynamic_arrow_schema = get_arrow_schema(log_fields_config)
dynamic_schema_field_names = {f.name for f in dynamic_arrow_schema}
- logger.info("▶️ %s %s: View refresh started.", _ICE_PLAIN, source_key)
- logger.info("%s %s: Refreshing view...", _ICE, source_key)
+ logger.info("▶️ %s %s: View refresh started...", _ICE_PLAIN, source_key)
# Try to load from persistent cache if memory cache is empty
_load_persistent_cache(source)
@@ -3195,7 +3220,7 @@ def _strip_computed(read_parquet_expr: str) -> str:
# (a) plan_files() returned S3 URIs and no local files are cached yet, OR
# (b) plan_files() failed silently but iceberg_loc is known (avoids WHERE false view)
if iceberg_loc and not local_paths and (s3_paths or not local_iceberg_files):
- parts.append(_strip_computed(f"iceberg_scan('{iceberg_loc}', allow_moved_paths=true)"))
+ parts.append(_strip_computed(f"iceberg_scan('{escape_sql_literal(iceberg_loc)}', allow_moved_paths=true)"))
logger.info(
"%s Falling back to iceberg_scan for %s (s3_paths=%d, local_iceberg_files=%d).",
_ICE,
@@ -3204,7 +3229,13 @@ def _strip_computed(read_parquet_expr: str) -> str:
len(local_iceberg_files),
)
elif s3_paths:
- logger.info(
+ # Demoted from INFO to DEBUG (2026-06-01): this fires on every
+ # view refresh whenever the local cache lags the iceberg manifest
+ # (very common during catch-up / right after a commit). Useful for
+ # debugging stale-view issues, not useful as a routine signal —
+ # was spamming the GCE backend log every few seconds with no
+ # actionable content.
+ logger.debug(
"%s Skipping %d missing cloud files in view (local files present, CDN sync pending).",
_ICE,
len(s3_paths),
@@ -3215,7 +3246,7 @@ def _strip_computed(read_parquet_expr: str) -> str:
buf_files = [p for p in buf_files if os.path.isfile(p)]
if buf_files:
- paths_sql = ", ".join(f"'{p}'" for p in buf_files)
+ paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in buf_files)
parts.append(_strip_computed(f"read_parquet([{paths_sql}], union_by_name=true, hive_partitioning=false)"))
if not parts:
@@ -3258,11 +3289,30 @@ def _strip_computed(read_parquet_expr: str) -> str:
is_analyst = source.get("access_level") == "read_only"
if tr and (is_analyst or not source.get("provisioning", {}).get("cron_sync", {}).get("enabled", True)):
+ # Security: validate via isoparse before interpolation. Without
+ # this, an attacker-controlled tr["start"] / tr["end"] dict value
+ # (these come from saved-view JSON which originates from the
+ # frontend) is interpolated raw into DuckDB SQL — a payload like
+ # "2024-01-01'; ATTACH '/tmp/x.db' AS y; --"
+ # would execute multi-statement SQL against the connection.
+ # isoparse rejects anything that isn't a valid ISO-8601 timestamp;
+ # we then interpolate the canonical .isoformat() output, which
+ # contains only digits, ":", "-", "T", "+", and "." (never "Z").
+ import dateutil.parser as _dt
+
where_clauses = []
if tr.get("start"):
- where_clauses.append(f"timestamp >= '{tr['start']}'::TIMESTAMPTZ")
+ try:
+ start_iso = _dt.isoparse(str(tr["start"])).isoformat()
+ except (ValueError, TypeError) as e:
+ raise ValueError(f"invalid time_range start: {e}") from e
+ where_clauses.append(f"timestamp >= '{start_iso}'::TIMESTAMPTZ")
if tr.get("end"):
- where_clauses.append(f"timestamp <= '{tr['end']}'::TIMESTAMPTZ")
+ try:
+ end_iso = _dt.isoparse(str(tr["end"])).isoformat()
+ except (ValueError, TypeError) as e:
+ raise ValueError(f"invalid time_range end: {e}") from e
+ where_clauses.append(f"timestamp <= '{end_iso}'::TIMESTAMPTZ")
if where_clauses:
union_sql = f"SELECT * FROM ({union_sql}) WHERE {' AND '.join(where_clauses)}"
@@ -3329,8 +3379,7 @@ def _strip_computed(read_parquet_expr: str) -> str:
t_end = time.time()
duration_ms = (t_end - t_start) * 1000
- logger.info("%s %s: View refresh complete (%.0f ms).", _ICE, source_key, duration_ms)
- logger.info("⏹️ %s %s: View refresh finished.", _ICE_PLAIN, source_key)
+ logger.info("⏹️ %s %s: View refresh complete (%.0f ms).", _ICE_PLAIN, source_key, duration_ms)
_view_cache[source_key] = (
metadata_loc,
buf_set,
@@ -3431,6 +3480,17 @@ def _save_manifest_metadata_cache(source: dict, live_manifest_paths: list[str])
"files": m_files,
"size": m_size,
}
+ # Mirror the on-disk prune in memory. Pre-fix this dict was only
+ # ever appended to (lines 3428, 2656) — entries for manifests
+ # dropped by snapshot expiry or compaction stayed resident
+ # forever, growing into multi-hundred-MB RSS over days of uptime
+ # and compounding the host-OOM problem. Compute the live set
+ # ONCE outside the loop so the cost is O(live + cache) rather
+ # than O(live × cache).
+ live_set = set(live_manifest_paths)
+ dead_keys = [k for k in _manifest_metadata_cache if k not in live_set]
+ for k in dead_keys:
+ _manifest_metadata_cache.pop(k, None)
try:
tmp = cache_file + ".tmp"
diff --git a/backend/core/ingest.py b/backend/core/ingest.py
index d1586adf..172cbae7 100644
--- a/backend/core/ingest.py
+++ b/backend/core/ingest.py
@@ -21,6 +21,7 @@
)
from backend.core.log_fields import LOG_FIELD_CATALOG
from backend.utils import field_codes as fc
+from backend.utils.sql_validator import escape_sql_literal
logger = logging.getLogger(__name__)
@@ -463,6 +464,7 @@ def elapsed() -> str:
processed_count = 0
deleted = 0
successfully_processed_files = []
+ touched_hours: set[str] = set()
mem_con = None
# Increase parallelism for S3 deletions
@@ -546,7 +548,12 @@ def elapsed() -> str:
read_paths = [s3_to_local[p] for p in read_paths_s3]
mem_con.execute("DROP TABLE IF EXISTS _ingest_staging")
- paths_sql = ", ".join(f"'{p}'" for p in read_paths)
+ # Security: escape single quotes in each local path before
+ # interpolating into the SQL literal. The local paths inherit
+ # their basename from the attacker-controllable S3 object key,
+ # so a key like ``raw/'); ATTACH '...; --`` would otherwise
+ # break out of the literal and execute arbitrary DuckDB SQL.
+ paths_sql = ", ".join(f"'{escape_sql_literal(p)}'" for p in read_paths)
try:
_execute_query_with_retry(
@@ -563,13 +570,14 @@ def elapsed() -> str:
valid_paths = []
for i, read_path in enumerate(read_paths):
try:
- # Quick accessibility test: read 1 row without loading the whole file.
+ # Security: per-file isolation read also needs escaping.
+ safe_read_path = escape_sql_literal(read_path)
_execute_query_with_retry(
mem_con,
- f"SELECT 1 FROM read_json_auto('{read_path}', sample_size=1) LIMIT 1",
+ f"SELECT 1 FROM read_json_auto('{safe_read_path}', sample_size=1) LIMIT 1",
max_retries=2,
)
- valid_paths.append(f"'{read_path}'")
+ valid_paths.append(f"'{safe_read_path}'")
except Exception as file_err:
f_name = read_paths_s3[i].split("/")[-1]
err_msg = str(file_err)
@@ -601,7 +609,13 @@ def elapsed() -> str:
# Translate filename column from local→s3 so downstream
# count_map / _source_file / file_sizes all key on s3://.
if local_to_s3:
- path_map_rows = ", ".join(f"('{local}', '{s3}')" for local, s3 in local_to_s3.items())
+ # Security: same escaping treatment for the
+ # local→s3 mapping table — both halves originate from
+ # attacker-controllable object keys.
+ path_map_rows = ", ".join(
+ f"('{escape_sql_literal(local)}', '{escape_sql_literal(s3)}')"
+ for local, s3 in local_to_s3.items()
+ )
mem_con.execute("DROP TABLE IF EXISTS _ingest_path_map")
mem_con.execute(
f"CREATE TEMP TABLE _ingest_path_map AS SELECT * FROM (VALUES {path_map_rows}) AS t(local, s3)"
@@ -626,7 +640,12 @@ def elapsed() -> str:
filename_expr = '"filename"'
if svc_id:
- escaped = svc_id.replace("'", "''")
+ # Security: consistent escape helper across the
+ # ingest path. Functionally identical to the inline
+ # .replace but routes through escape_sql_literal so
+ # any future hardening on the canonical helper
+ # (e.g., extra char classes) flows here.
+ escaped = escape_sql_literal(svc_id)
backend_expr = f"regexp_replace(\"backend\", '^{escaped}--', '') AS \"backend\""
else:
backend_expr = '"backend"'
@@ -671,6 +690,16 @@ def elapsed() -> str:
arrow_table = _fetched.read_all() if hasattr(_fetched, "read_all") else _fetched
valid_rows = len(arrow_table)
+ if valid_rows > 0:
+ chunk_hours = {
+ r[0]
+ for r in mem_con.execute(
+ "SELECT DISTINCT strftime(timestamp, '%Y-%m-%d-%H') FROM _ingest_staging WHERE timestamp IS NOT NULL"
+ ).fetchall()
+ if r[0] is not None
+ }
+ touched_hours.update(chunk_hours)
+
total_rows_batch = mem_con.execute("SELECT count(*) FROM _ingest_staging").fetchone()[0]
corrupt_in_batch = total_rows_batch - valid_rows
@@ -692,7 +721,9 @@ def elapsed() -> str:
corrupt_s3_paths.append(s3_path)
if corrupt_read_paths:
- paths_sql_str = ", ".join(f"'{p}'" for p in corrupt_read_paths)
+ # Security: corrupt-file diagnostic path
+ # also needs escaping. Same vector as above.
+ paths_sql_str = ", ".join(f"'{escape_sql_literal(p)}'" for p in corrupt_read_paths)
q = f"""
SELECT filename, column0 FROM read_csv([{paths_sql_str}], header=false, sep='', quote='', escape='', columns={{'column0': 'VARCHAR'}}, filename=true)
WHERE NOT json_valid(column0) OR json_extract(column0, '$.timestamp') IS NULL
@@ -731,8 +762,9 @@ def elapsed() -> str:
# Apply the same transformation (decoding, filename
# normalization) and inject the original s3:// fname
- # so attribution stays consistent.
- safe_fname = fname.replace("'", "''")
+ # so attribution stays consistent. Security:
+ # escape via the shared helper.
+ safe_fname = escape_sql_literal(fname)
mem_con.execute(
f"""
INSERT INTO _ingest_staging BY NAME
@@ -903,4 +935,5 @@ def _do_delete(keys, bucket, client):
"corrupt_details": total_corrupt_details,
"deleted_files": deleted,
"message": f"Successfully ingested {processed_count} new files ({total_inserted} rows) and deleted {deleted} raw files.",
+ "touched_hours": list(touched_hours),
}
diff --git a/backend/core/log_fields.py b/backend/core/log_fields.py
index 1b7ab79d..a93b3c9e 100644
--- a/backend/core/log_fields.py
+++ b/backend/core/log_fields.py
@@ -221,7 +221,7 @@
"group": "A",
"label": "Host",
"description": "HTTP Host header (domain name) captured at the true client edge before any rewrites.",
- "vcl": '"host":"%{json.escape(if(req.http.x-fos-edge-data:host != "", req.http.x-fos-edge-data:host, req.http.Host))}V"',
+ "vcl": '"host":"%{json.escape(substr(if(req.http.x-fos-edge-data:host != "", req.http.x-fos-edge-data:host, req.http.Host), 0, 512))}V"',
"duckdb_type": "VARCHAR",
"typical_bytes": 22,
"required_by": ["new_probe_urls"],
@@ -829,6 +829,17 @@
"required_by": [],
},
# ── Group L — Origin Metrics ───────────────────────────────────────────
+ # Security: each origin-metric field interpolates the value of a
+ # client-spoofable internal header (``x-of-ttfb`` etc.). Without a
+ # regex guard on the value, an attacker who reached vcl_recv with a
+ # crafted header like ``x-of-ttfb: 0, "waf": 1`` would break out of
+ # the unquoted numeric slot and inject arbitrary JSON keys into the
+ # log line. The ``~ "^[0-9]+$"`` test gates each numeric field to
+ # digit-only values; ``x-of-oip`` (the only string field) gets
+ # ``json.escape(...)`` so quotes / backslashes / control bytes
+ # serialize as their JSON-escape equivalents instead of breaking
+ # out of the string literal. The earlier fix also unsets all
+ # these headers on inbound req, so this is belt-and-suspenders.
{
"id": "ottfb",
"group": "L",
@@ -836,7 +847,7 @@
"description": "µs from fetch start to first byte of origin/shield response headers. Null on HITs.",
"formatter": "number",
"unit": "µs",
- "vcl": '"ottfb":%{if(req.http.x-of-ttfb, req.http.x-of-ttfb, "null")}V',
+ "vcl": '"ottfb":%{if(req.http.x-of-ttfb ~ "^[0-9]+$", req.http.x-of-ttfb, "null")}V',
"duckdb_type": "UBIGINT",
"typical_bytes": 16,
"required_by": ["origin_latency_spike", "region_latency"],
@@ -848,7 +859,7 @@
"description": "µs from fetch start to full response body received. Null on HITs.",
"formatter": "number",
"unit": "µs",
- "vcl": '"ottlb":%{if(req.http.x-of-ttlb, req.http.x-of-ttlb, "null")}V',
+ "vcl": '"ottlb":%{if(req.http.x-of-ttlb ~ "^[0-9]+$", req.http.x-of-ttlb, "null")}V',
"duckdb_type": "UBIGINT",
"typical_bytes": 16,
"required_by": ["origin_latency_spike"],
@@ -859,7 +870,7 @@
"label": "Origin Status",
"description": "HTTP status returned by origin or shield. Null on HITs.",
"formatter": "status",
- "vcl": '"ost":%{if(req.http.x-of-status, req.http.x-of-status, "null")}V',
+ "vcl": '"ost":%{if(req.http.x-of-status ~ "^[0-9]+$", req.http.x-of-status, "null")}V',
"duckdb_type": "USMALLINT",
"typical_bytes": 10,
"required_by": ["origin_error_rate", "origin_ip_failure"],
@@ -869,7 +880,9 @@
"group": "L",
"label": "Origin Bytes",
"description": "Bytes written in the response (resp.bytes_written). Null on HITs. Same variable as resp_bytes but null-on-HIT makes it queryable as 'total bytes fetched from origin'.",
- "vcl": '"obytes":%{if(req.http.x-of-start, "" + resp.bytes_written, "null")}V',
+ # resp.bytes_written is a Fastly-internal counter (not from a header),
+ # so no JSON-injection risk; the x-of-start guard now requires a digits-only value.
+ "vcl": '"obytes":%{if(req.http.x-of-start ~ "^[0-9]+$", "" + resp.bytes_written, "null")}V',
"duckdb_type": "UBIGINT",
"typical_bytes": 15,
"required_by": [],
@@ -879,7 +892,10 @@
"group": "L",
"label": "Origin IP",
"description": "IP address of the backend server that handled the fetch. Null on HITs.",
- "vcl": '"oip":"%{if(req.http.x-of-oip, req.http.x-of-oip, "")}V"',
+ # json.escape converts the value to JSON-string-safe form so
+ # quotes / backslashes / control bytes get their JSON escapes
+ # instead of terminating the literal early.
+ "vcl": '"oip":"%{json.escape(if(req.http.x-of-oip, req.http.x-of-oip, ""))}V"',
"duckdb_type": "VARCHAR",
"typical_bytes": 15,
"required_by": ["origin_ip_failure"],
@@ -890,7 +906,7 @@
"label": "Origin Retries",
"description": "Backend connection retry count before success or failure. Null on HITs.",
"formatter": "number",
- "vcl": '"oretries":%{if(req.http.x-of-oretries, req.http.x-of-oretries, "null")}V',
+ "vcl": '"oretries":%{if(req.http.x-of-oretries ~ "^[0-9]+$", req.http.x-of-oretries, "null")}V',
"duckdb_type": "UTINYINT",
"typical_bytes": 13,
"required_by": ["origin_retries"],
@@ -1066,6 +1082,22 @@
"typical_bytes": 0,
"required_by": [],
},
+ {
+ "id": "edge_score_reason_ind",
+ "group": "VIRTUAL",
+ "label": "Score Reasons",
+ "description": (
+ "Individual scoring reasons extracted from the comma-separated "
+ "edge_score_reason field (e.g. 'cookie-missing', 'impossibly-fast', "
+ "'robotic-consistency', 'rare-transition'). Lets the dashboard "
+ "show top-N reason breakdowns and filter by a single reason "
+ "even when one request triggers multiple."
+ ),
+ "vcl": None,
+ "duckdb_type": "VARCHAR",
+ "typical_bytes": 0,
+ "required_by": [],
+ },
# ── Internal ──────────────────────────────────────────────────────────
{
"id": "_source_file",
@@ -1442,11 +1474,26 @@ def generate_log_format(log_fields_config: dict) -> str:
# Overwrite the static substr limit in the built-in VCL
vcl = vcl.replace("substr(req.url, 0, 2000)", f"substr(req.url, 0, {limit})")
elif field["id"] == "ua":
- # Strip the hardcoded substr since we now do it at the edge (in vcl_recv)
- vcl = '"ua":"%{json.escape(if(req.http.x-fos-edge-data:ua != "", req.http.x-fos-edge-data:ua, req.http.User-Agent))}V"'
+ # Security: keep the substr cap even when generating the
+ # alternative VCL variant. The edge-side substr (in vcl_recv)
+ # is a *first* truncation — but we never want a 100 KB header
+ # to slip through if the edge snippet is missing or fails to
+ # run (e.g., on a request that bypasses our snippet stack).
+ # An unbounded UA can truncate the entire JSON log line at
+ # the 16 KB Fastly limit, dropping the request from the audit
+ # trail entirely (repudiation attack).
+ ua_limit = limits.get("ua", 1000)
+ vcl = (
+ f'"ua":"%{{json.escape(substr(if(req.http.x-fos-edge-data:ua != "",'
+ f' req.http.x-fos-edge-data:ua, req.http.User-Agent), 0, {ua_limit}))}}V"'
+ )
elif field["id"] == "referer":
- # Strip the hardcoded substr since we now do it at the edge (in vcl_recv)
- vcl = '"referer":"%{json.escape(if(req.http.x-fos-edge-data:referer != "", req.http.x-fos-edge-data:referer, req.http.Referer))}V"'
+ # Same reasoning as above — keep the substr cap.
+ ref_limit = limits.get("referer", 1000)
+ vcl = (
+ f'"referer":"%{{json.escape(substr(if(req.http.x-fos-edge-data:referer != "",'
+ f' req.http.x-fos-edge-data:referer, req.http.Referer), 0, {ref_limit}))}}V"'
+ )
parts.append(vcl)
@@ -1459,6 +1506,44 @@ def generate_log_format(log_fields_config: dict) -> str:
stage = cf.get("collection_stage", "edge")
value_type = cf.get("value_type", "string")
+ if stage == "deliver":
+ # Deliver-stage fields (session-scoring) need TWO gates:
+ # 1. edge-only (fastly.ff.visits_this_service == 0) — the
+ # shield POP never ran our scoring snippets, so the
+ # req.http subfields don't exist there.
+ # 2. non-empty value — avoid breaking JSON.
+ # Combined into ONE if() with compound AND so we don't end up
+ # with nested if(if(...) != "", ...) which Fastly's parser
+ # rejects ("if() condition must be a simple expression, not a
+ # function call").
+ raw_expr = cf.get("vcl_log_expression") or f"req.http.x-fos-edge-data:{name}"
+ if value_type in ("numeric", "boolean"):
+ # 014: ``!= ""`` only rejects empty strings — any other
+ # text (``"true"``, ``"abc"``, ``"]"``) flows straight into
+ # the JSON log line unquoted and breaks the JSON
+ # structure, dropping the line from ingestion (log
+ # injection / repudiation). Match a strict numeric form
+ # so non-digit values fall through to ``"null"``.
+ vcl_macro = (
+ f"if(fastly.ff.visits_this_service == 0 && "
+ f'{raw_expr} ~ "^-?[0-9]+(\\.[0-9]+)?$", {raw_expr}, "null")'
+ )
+ entry = f'"{name}":%{{{vcl_macro}}}V'
+ else:
+ # 016: clamp the string-field value to a sane length
+ # (default 2000) BEFORE json.escape so a multi-megabyte
+ # attacker-controlled custom field cannot push the log
+ # line past Fastly's 16 KB limit and silently drop the
+ # whole entry. The substr is INSIDE json.escape so the
+ # encoded length stays bounded.
+ cf_limit = int(cf.get("byte_limit") or limits.get(name) or 2000)
+ vcl_macro = (
+ f'json.escape(if(fastly.ff.visits_this_service == 0, substr({raw_expr}, 0, {cf_limit}), ""))'
+ )
+ entry = f'"{name}":"%{{{vcl_macro}}}V"'
+ parts.append(entry)
+ continue
+
if stage == "edge":
expr = f"req.http.x-fos-edge-data:{name}"
elif stage == "origin":
@@ -1468,11 +1553,17 @@ def generate_log_format(log_fields_config: dict) -> str:
expr = f"req.http.x-fos-edge-data:{name}"
if value_type in ("numeric", "boolean"):
- # Avoid empty strings breaking JSON numbers
- vcl_macro = f'if({expr} != "", {expr}, "null")'
+ # 014: see deliver-stage comment above — strict numeric
+ # regex instead of ``!= ""`` so a custom-field header value
+ # like ``"]"`` cannot break out of the JSON log line.
+ vcl_macro = f'if({expr} ~ "^-?[0-9]+(\\.[0-9]+)?$", {expr}, "null")'
entry = f'"{name}":%{{{vcl_macro}}}V'
else:
- vcl_macro = f"json.escape({expr})"
+ # 016: substr-clamp the value before json.escape so an
+ # oversized custom string field cannot push the line past
+ # Fastly's 16 KB log-line limit.
+ cf_limit = int(cf.get("byte_limit") or limits.get(name) or 2000)
+ vcl_macro = f"json.escape(substr({expr}, 0, {cf_limit}))"
entry = f'"{name}":"%{{{vcl_macro}}}V"'
parts.append(entry)
diff --git a/backend/core/metadata_db.py b/backend/core/metadata_db.py
index 63364f95..ce98edeb 100644
--- a/backend/core/metadata_db.py
+++ b/backend/core/metadata_db.py
@@ -22,6 +22,7 @@
import os
import sqlite3
import threading
+import time
from datetime import UTC, datetime, timedelta
from backend.utils.date_utils import iso_z, iso_z_now
@@ -228,7 +229,23 @@ def teardown(service_id: str) -> None:
error_count INTEGER DEFAULT 0,
PRIMARY KEY (file_name, source_name)
)""",
- "CREATE INDEX IF NOT EXISTS idx_ingested_files_source ON ingested_files(source_name)",
+ # Covers `/usage/prefill`'s source+range narrowing
+ # (`WHERE source_name = ? AND ingested_at BETWEEN ? AND ?`) and the
+ # bounded `list_unbackfilled_fastly_edge_files` scan (see :1128). The
+ # previous `idx_ingested_files_source` indexed source_name alone — SQLite
+ # had to walk every row for the matching source and filter ingested_at
+ # in memory (~250ms per query on populated services). The composite
+ # satisfies the range scan directly and is a strict superset for
+ # source_name-only lookups (SQLite uses leading-column prefixes), so the
+ # old index is redundant and dropped here. Index name matches the
+ # by-name reference in `list_unbackfilled_fastly_edge_files`'s docstring.
+ "CREATE INDEX IF NOT EXISTS idx_ingested_files_source_ingested_at ON ingested_files(source_name, ingested_at)",
+ "DROP INDEX IF EXISTS idx_ingested_files_source",
+ # Earlier in this branch a redundant `idx_ingested_files_source_ts` was
+ # added under a different name before discovering the existing
+ # by-name reference above; clean it up so no service ends up with two
+ # functionally identical composites.
+ "DROP INDEX IF EXISTS idx_ingested_files_source_ts",
# Single-row-per-service rollup maintained by ``insert_ingested_files``.
# Without it, ``get_ingested_files_status_summary`` had to SUM(row_count)
# + SUM(file_size_bytes) across the whole table on every cron tick —
@@ -278,6 +295,13 @@ def teardown(service_id: str) -> None:
log_output TEXT
)""",
"CREATE INDEX IF NOT EXISTS idx_cron_task_started ON cron_runs(task, started_at)",
+ # Covers `/logs`'s unfiltered pagination
+ # (`ORDER BY started_at DESC LIMIT ? OFFSET ?` with no `WHERE task`) and
+ # `main.py`'s sync-status probe (`WHERE task='sync' AND status != 'running'
+ # ORDER BY started_at DESC LIMIT 1`). Without it, SQLite falls back to a
+ # TEMP B-TREE sort over the full table because `idx_cron_task_started`
+ # requires a leading-`task` predicate to satisfy the ORDER BY.
+ "CREATE INDEX IF NOT EXISTS idx_cron_started ON cron_runs(started_at DESC)",
"""CREATE TABLE IF NOT EXISTS asn_names (
asn INTEGER PRIMARY KEY,
name TEXT NOT NULL,
@@ -321,6 +345,44 @@ def teardown(service_id: str) -> None:
last_triggered_at TEXT,
created_at TEXT DEFAULT (datetime('now'))
)""",
+ # Admin-flagged sessions for the edge session-scoring system. Each row
+ # is one (service, sid) tuple labeled good/bad/neutral by the admin.
+ # Feeds backend.scoring.evaluate.evaluate() for matrix ROC-AUC; the
+ # neutral label is captured for UI completeness but excluded from the
+ # AUC computation (intentionally uncertain).
+ """CREATE TABLE IF NOT EXISTS scoring_labels (
+ id TEXT PRIMARY KEY,
+ service_id TEXT NOT NULL,
+ sid TEXT NOT NULL,
+ label TEXT NOT NULL CHECK (label IN ('good', 'bad', 'neutral')),
+ notes TEXT DEFAULT '',
+ flagged_by TEXT,
+ sample_ip TEXT,
+ sample_ua TEXT,
+ sample_url TEXT,
+ created_at TEXT DEFAULT (datetime('now')),
+ updated_at TEXT DEFAULT (datetime('now'))
+ )""",
+ "CREATE UNIQUE INDEX IF NOT EXISTS idx_scoring_labels_svc_sid ON scoring_labels(service_id, sid)",
+ "CREATE INDEX IF NOT EXISTS idx_scoring_labels_svc_label ON scoring_labels(service_id, label)",
+ # Operator audit log specifically for scoring-config mutations.
+ # Separate from audit_logs (which gets state_sync'd) because scoring-
+ # audit is per-host operator-attribution data that should NOT mirror
+ # to read_only analyst replicas.
+ """CREATE TABLE IF NOT EXISTS scoring_audit (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ timestamp TEXT NOT NULL DEFAULT (datetime('now')),
+ service_id TEXT NOT NULL,
+ action TEXT NOT NULL,
+ actor TEXT NOT NULL,
+ details TEXT
+ )""",
+ "CREATE INDEX IF NOT EXISTS idx_scoring_audit_svc_ts ON scoring_audit(service_id, timestamp DESC)",
+ # Plain timestamp index for the list_scoring_audit ORDER BY timestamp DESC
+ # path when the service_id predicate is already satisfied — keeps the sort
+ # itself indexed instead of falling back to a TEMP B-TREE on large audit
+ # tables.
+ "CREATE INDEX IF NOT EXISTS idx_scoring_audit_ts ON scoring_audit(timestamp DESC)",
"""CREATE TABLE IF NOT EXISTS usage_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT,
@@ -372,6 +434,44 @@ def teardown(service_id: str) -> None:
# 500-row page. Including (operation_class, count, bytes) makes the
# aggregate covering too (5× faster than non-covering on the same query).
"CREATE INDEX IF NOT EXISTS idx_usage_service_ts ON usage_log(service_id, timestamp, operation_class, count, bytes)",
+ # Hourly rollup of usage_log keyed by (service, hour-prefix of timestamp,
+ # operation_class, operation_type). Powers the /admin/usage-log aggregate
+ # GROUP BY which used to scan millions of usage_log rows (~600 ms steady
+ # state). With the rollup the aggregate becomes a small indexed sum over
+ # at most 24 hours × a few op-class/type pairs. Maintained by the
+ # AFTER INSERT trigger below (incremental, always-consistent) plus a
+ # backfill helper for services upgrading from a pre-rollup install.
+ """CREATE TABLE IF NOT EXISTS usage_log_hourly_summary (
+ service_id TEXT NOT NULL,
+ hour TEXT NOT NULL,
+ operation_class TEXT NOT NULL DEFAULT '',
+ operation_type TEXT NOT NULL DEFAULT '',
+ count INTEGER NOT NULL DEFAULT 0,
+ bytes INTEGER NOT NULL DEFAULT 0,
+ last_updated TEXT NOT NULL DEFAULT (datetime('now')),
+ PRIMARY KEY (service_id, hour, operation_class, operation_type)
+ )""",
+ "CREATE INDEX IF NOT EXISTS idx_usage_hourly_svc_hour ON usage_log_hourly_summary(service_id, hour)",
+ # AFTER INSERT trigger: every row added to usage_log bumps its hour bucket
+ # in the summary. Hour key = first 13 chars of timestamp ("YYYY-MM-DDTHH").
+ # COALESCE operation_class/operation_type to '' because rows can have
+ # NULLs; the rollup uses '' as a normalised sentinel. ON CONFLICT path
+ # supports the reconcile_fastly_stats compaction pattern where multiple
+ # rows for the same (hour, class, type) accumulate.
+ """CREATE TRIGGER IF NOT EXISTS trg_usage_log_summary_insert
+ AFTER INSERT ON usage_log
+ WHEN NEW.timestamp IS NOT NULL AND length(NEW.timestamp) >= 13 AND NEW.service_id IS NOT NULL
+ BEGIN
+ INSERT INTO usage_log_hourly_summary
+ (service_id, hour, operation_class, operation_type, count, bytes, last_updated)
+ VALUES (NEW.service_id, substr(NEW.timestamp, 1, 13),
+ COALESCE(NEW.operation_class, ''), COALESCE(NEW.operation_type, ''),
+ COALESCE(NEW.count, 1), COALESCE(NEW.bytes, 0), datetime('now'))
+ ON CONFLICT(service_id, hour, operation_class, operation_type)
+ DO UPDATE SET count = count + excluded.count,
+ bytes = bytes + excluded.bytes,
+ last_updated = excluded.last_updated;
+ END""",
# Tracks Iceberg parquet basenames that local_compaction merged into a
# bigger local file and then deleted from disk. WITHOUT this table the
# sync_data fast-path check sees the deletions as "missing local files"
@@ -383,28 +483,31 @@ def teardown(service_id: str) -> None:
file_name TEXT PRIMARY KEY,
compacted_at TEXT DEFAULT (datetime('now'))
)""",
+ # Tracking table for the data-migration framework
+ # (``backend.core.data_migrations``). Each row records one applied
+ # data-migration: long-running, one-time data setup tasks (e.g. the
+ # rollups initial backfill) that are NOT schema DDL changes. Schema
+ # migrations use ``PRAGMA user_version`` via ``sqlite_migrations.py``
+ # — these two systems are intentionally separate because schema
+ # changes must block startup, while data migrations run async on a
+ # daemon thread so a multi-hour backfill can't wedge the boot loop.
+ """CREATE TABLE IF NOT EXISTS applied_data_migrations (
+ name TEXT PRIMARY KEY,
+ applied_at TEXT NOT NULL DEFAULT (datetime('now')),
+ duration_s REAL,
+ status TEXT NOT NULL DEFAULT 'success',
+ notes TEXT
+ )""",
]
def _init_schema(con: sqlite3.Connection) -> None:
+ """Run every statement in ``_SCHEMA`` on ``con``, commit, then call
+ ``sqlite_migrations.apply_pending(con)``.
+ """
+ from backend.core import sqlite_migrations
+
for stmt in _SCHEMA:
con.execute(stmt)
con.commit()
- # Bring pre-migration-framework service DBs up to current. Migrations
- # are idempotent (each checks before mutating) so this is also safe to
- # call on fresh DBs that already have everything from ``_SCHEMA``.
- # On a healthy fresh install the loop exits on the first version check.
- from backend.core import sqlite_migrations
-
- applied = sqlite_migrations.apply_pending(con)
- if applied:
- logger.info("[metadata_db] applied %d pending migration(s)", applied)
- # New DBs leap straight to LATEST so the migration loop doesn't waste
- # a check on every open. Idempotency means doing the work first is
- # harmless, but skipping the inspection is cheaper at scale.
- if sqlite_migrations.get_current_version(con) < sqlite_migrations.LATEST_VERSION:
- con.execute(f"PRAGMA user_version = {sqlite_migrations.LATEST_VERSION}")
- con.commit()
+ sqlite_migrations.apply_pending(con)
# ── alerts ────────────────────────────────────────────────────────────────────
@@ -627,6 +730,44 @@ def replace_views_for_service(service_id: str, views: list[dict]) -> None:
con.commit()
+def upsert_views_for_service(service_id: str, views: list[dict]) -> None:
+    """Upsert saved views by id WITHOUT deleting local-only rows.
+
+    Used by state_sync.import_admin_state on read_only analyst hosts so
+    locally-created views (which the analyst created on their own pod) are
+    preserved through every metadata_sync cron tick. Without this, the
+    cron's wholesale DELETE+INSERT silently wiped any analyst-side view
+    that hadn't been mirrored back to FOS — and ``export_admin_state``
+    refuses to push from read_only hosts, so the loss was permanent.
+
+    Rows without an ``id`` are skipped: ``ON CONFLICT(id)`` can never match
+    a NULL key, so such a row would be inserted afresh on every call. A row
+    that omits ``service_id`` falls back to the ``service_id`` argument.
+
+    Args:
+        service_id: Service whose metadata DB receives the rows.
+        views: Remote view dicts keyed by the ``views`` column names.
+    """
+    if not views:
+        return
+    params = [
+        (
+            v.get("id"),
+            v.get("service_id") or service_id,
+            v.get("name"),
+            v.get("filters_json"),
+            v.get("time_range_type"),
+            v.get("start_time"),
+            v.get("end_time"),
+            v.get("page"),
+            v.get("created_at"),
+        )
+        for v in views
+        if v.get("id") is not None
+    ]
+    skipped = len(views) - len(params)
+    if skipped:
+        logger.debug(
+            "[metadata_db] upsert_views_for_service(%s): skipped %d view(s) without an id",
+            service_id,
+            skipped,
+        )
+    if not params:
+        return
+    con = get_con(service_id)
+    con.executemany(
+        "INSERT INTO views (id, service_id, name, filters_json, time_range_type, start_time, end_time, page, created_at) "
+        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) "
+        "ON CONFLICT(id) DO UPDATE SET "
+        "name=excluded.name, filters_json=excluded.filters_json, "
+        "time_range_type=excluded.time_range_type, start_time=excluded.start_time, "
+        "end_time=excluded.end_time, page=excluded.page, created_at=excluded.created_at",
+        params,
+    )
+    con.commit()
+
+
# ── audit_logs ────────────────────────────────────────────────────────────────
@@ -741,6 +882,37 @@ def replace_audit_for_service(service_id: str, rows: list[dict]) -> None:
con.commit()
+def merge_audit_for_service(service_id: str, rows: list[dict]) -> None:
+    """Insert audit log entries from remote without deleting local ones.
+
+    Used by state_sync.import_admin_state on read_only analyst hosts to
+    preserve local audit entries created by the analyst's own actions
+    (which the wholesale ``replace_audit_for_service`` would have wiped on
+    every cron tick).
+
+    Dedup key: (timestamp, source_name, event_type, actor) — a row with
+    those four fields equal to an existing row is considered the same
+    event and skipped. ``timestamp`` has second precision so collisions
+    between distinct events are improbable, and even if they happen the
+    audit log tolerates the missed insert.
+
+    The comparison uses SQLite's null-safe ``IS`` operator rather than
+    ``=``: with ``=`` a NULL key field never compares equal, so an entry
+    with e.g. no actor would be re-inserted on every call.
+
+    Args:
+        service_id: Service whose metadata DB receives the rows.
+        rows: Remote audit entries keyed by the ``audit_logs`` column names.
+    """
+    if not rows:
+        return
+    con = get_con(service_id)
+    for r in rows:
+        key = (r.get("source_name"), r.get("timestamp"), r.get("event_type"), r.get("actor"))
+        existing = con.execute(
+            "SELECT 1 FROM audit_logs WHERE source_name IS ? AND timestamp IS ? AND event_type IS ? AND actor IS ? LIMIT 1",
+            key,
+        ).fetchone()
+        if existing:
+            continue
+        con.execute(
+            "INSERT INTO audit_logs (timestamp, source_name, event_type, details, actor) VALUES (?, ?, ?, ?, ?)",
+            (r.get("timestamp"), r.get("source_name"), r.get("event_type"), r.get("details"), r.get("actor")),
+        )
+    con.commit()
+
+
# ── ingested_files ────────────────────────────────────────────────────────────
@@ -1503,6 +1675,114 @@ def purge_cron_runs(
con.commit()
+def record_scoring_audit(
+    service_id: str,
+    action: str,
+    *,
+    actor: str = "operator",
+    details: dict | None = None,
+) -> None:
+    """Write one operator-attribution entry into ``scoring_audit``.
+
+    Invoked by the endpoints that mutate scoring config (enable, disable,
+    threshold commit + enforce, retrain, rotate-key, matrix-rollback).
+    ``details`` is stored as JSON text, or NULL when empty/None.
+
+    Best-effort with respect to SQLite: a ``sqlite3.Error`` is logged at
+    DEBUG and swallowed so a busy WAL never blocks the operator action.
+    """
+    insert_sql = "INSERT INTO scoring_audit (service_id, action, actor, details) VALUES (?, ?, ?, ?)"
+    try:
+        db = get_con(service_id)
+        payload = None if not details else json.dumps(details)
+        db.execute(insert_sql, (service_id, action, actor, payload))
+        db.commit()
+    except sqlite3.Error as exc:
+        logger.debug("[metadata_db] record_scoring_audit(%s, %s) failed: %s", service_id, action, exc)
+
+
+def list_scoring_audit(
+    service_id: str,
+    *,
+    limit: int = 100,
+    since: str | None = None,
+) -> list[dict]:
+    """Return scoring_audit rows newest-first (by id), capped at ``limit``.
+
+    ``since`` optionally restricts the result to rows whose ``timestamp``
+    is at or after the given ISO string. ``details`` is JSON-decoded when
+    it parses and left as stored otherwise. Any ``sqlite3.Error`` is
+    logged at DEBUG and yields an empty list.
+    """
+    predicates = ["service_id = ?"]
+    args: list = [service_id]
+    if since:
+        predicates.append("timestamp >= ?")
+        args.append(since)
+    args.append(limit)
+    query = (
+        "SELECT id, timestamp, action, actor, details FROM scoring_audit "
+        "WHERE " + " AND ".join(predicates) + " ORDER BY id DESC LIMIT ?"
+    )
+    try:
+        fetched = get_con(service_id).execute(query, args).fetchall()
+        result = []
+        for record in fetched:
+            entry = dict(record)
+            raw_details = entry.get("details")
+            if raw_details:
+                try:
+                    entry["details"] = json.loads(raw_details)
+                except (ValueError, TypeError):
+                    # Not valid JSON — hand the stored text back untouched.
+                    pass
+            result.append(entry)
+        return result
+    except sqlite3.Error as exc:
+        logger.debug("[metadata_db] list_scoring_audit(%s) failed: %s", service_id, exc)
+        return []
+
+
+def prune_scoring_audit(service_id: str, *, keep_last: int = 10000) -> None:
+    """Keep only the newest ``keep_last`` scoring_audit rows for a service.
+
+    Growth guard: each scoring-config mutation appends a row. Best-effort:
+    a ``sqlite3.Error`` is logged at DEBUG and swallowed so trimming never
+    blocks the caller.
+    """
+    # Survivors are picked by timestamp DESC with id DESC as the tiebreak,
+    # so rows that share one `datetime('now')` second are ordered
+    # deterministically instead of SQLite choosing arbitrarily among ties.
+    prune_sql = (
+        "DELETE FROM scoring_audit WHERE service_id = ? AND id NOT IN ("
+        "SELECT id FROM scoring_audit WHERE service_id = ? ORDER BY timestamp DESC, id DESC LIMIT ?)"
+    )
+    try:
+        db = get_con(service_id)
+        db.execute(prune_sql, (service_id, service_id, keep_last))
+        db.commit()
+    except sqlite3.Error as exc:
+        logger.debug("[metadata_db] prune_scoring_audit(%s) failed: %s", service_id, exc)
+
+
+def get_cron_run_status(service_id: str, run_id: int) -> str | None:
+    """Look up the ``status`` of one ``cron_runs`` row.
+
+    Returns the status string, or None when no row has ``id == run_id``.
+    A ``sqlite3.Error`` (DB unreachable, table missing, locked) is logged
+    at DEBUG and also yields None, so a caller cross-checking in-memory
+    state against the DB can fall back to its in-memory signal.
+    """
+    try:
+        found = (
+            get_con(service_id)
+            .execute("SELECT status FROM cron_runs WHERE id = ?", (run_id,))
+            .fetchone()
+        )
+    except sqlite3.Error as exc:
+        logger.debug("[metadata_db] get_cron_run_status(%s, %s) failed: %s", service_id, run_id, exc)
+        return None
+    if not found:
+        return None
+    return found["status"]
+
+
def get_cron_runs(
service_id: str,
*,
@@ -2019,6 +2299,180 @@ def clear_usage_log(service_id: str) -> None:
con.commit()
+USAGE_LOG_HOURLY_BACKFILL_NAME = "2026-06-04_usage_log_hourly_summary_backfill"
+
+# Per-process guard so the in-process check doesn't hit SQLite on every read.
+# The DB-level marker (applied_data_migrations) is the source of truth across
+# restarts; this cache just trims redundant lookups within one process.
+_usage_log_backfilled: set[str] = set()
+_usage_log_backfill_lock = threading.Lock()
+
+
+def _ensure_usage_log_hourly_backfilled(con: sqlite3.Connection, service_id: str) -> None:
+    """Populate usage_log_hourly_summary for services upgrading from a
+    pre-trigger install. Idempotent; runs at most once per service.
+
+    Detection: presence of the named row in ``applied_data_migrations``. The
+    trigger handles all NEW inserts; this backfill catches the rows that
+    existed before the trigger was added. Synchronous so /admin/usage-log
+    returns correct data on first access.
+
+    Failure handling: the rebuild is DELETE + INSERT + marker inside one
+    implicit transaction. If any step raises, the transaction is rolled
+    back so the pending DELETE cannot be persisted by a later, unrelated
+    ``commit()`` on the same connection (which would leave the summary
+    empty with no marker). The service is still added to the per-process
+    cache so a persistently failing backfill is not retried on every read;
+    the missing DB marker means the next process start tries again.
+
+    Args:
+        con: Open connection to the service's metadata DB.
+        service_id: Service whose summary rows are rebuilt.
+    """
+    if service_id in _usage_log_backfilled:
+        return
+    with _usage_log_backfill_lock:
+        # Re-check under the lock: another thread may have finished first.
+        if service_id in _usage_log_backfilled:
+            return
+        try:
+            applied = con.execute(
+                "SELECT 1 FROM applied_data_migrations WHERE name = ?",
+                (USAGE_LOG_HOURLY_BACKFILL_NAME,),
+            ).fetchone()
+            if applied is None:
+                t0 = time.time()
+                logger.info("[usage_log] backfilling hourly summary for %s", service_id)
+                # Wipe any partial summary rows the trigger may have written
+                # for this service since boot — we're rebuilding from raw so
+                # the GROUP BY sum is exact, not double-counted on top of
+                # trigger-written rows.
+                con.execute("DELETE FROM usage_log_hourly_summary WHERE service_id = ?", (service_id,))
+                con.execute(
+                    """
+                    INSERT INTO usage_log_hourly_summary
+                        (service_id, hour, operation_class, operation_type, count, bytes, last_updated)
+                    SELECT service_id,
+                           substr(timestamp, 1, 13),
+                           COALESCE(operation_class, ''),
+                           COALESCE(operation_type, ''),
+                           SUM(COALESCE(count, 1)),
+                           SUM(COALESCE(bytes, 0)),
+                           datetime('now')
+                    FROM usage_log
+                    WHERE service_id = ?
+                      AND timestamp IS NOT NULL
+                      AND length(timestamp) >= 13
+                    GROUP BY 1, 2, 3, 4
+                    """,
+                    (service_id,),
+                )
+                con.execute(
+                    "INSERT OR REPLACE INTO applied_data_migrations "
+                    "(name, applied_at, duration_s, status, notes) VALUES (?, ?, ?, ?, ?)",
+                    (USAGE_LOG_HOURLY_BACKFILL_NAME, iso_z_now(), time.time() - t0, "success",
+                     "rebuilt usage_log_hourly_summary from raw"),
+                )
+                con.commit()
+                logger.info("[usage_log] hourly backfill complete for %s in %.2fs", service_id, time.time() - t0)
+        except Exception as e:
+            logger.warning("[usage_log] hourly summary backfill failed for %s: %s", service_id, e)
+            # Undo the half-applied rebuild so the trigger-maintained rows
+            # that were deleted above come back.
+            try:
+                con.rollback()
+            except sqlite3.Error as rollback_err:
+                logger.debug("[usage_log] rollback after failed backfill for %s failed: %s", service_id, rollback_err)
+        _usage_log_backfilled.add(service_id)
+
+def _query_usage_log_aggregate_rollup(
+    con: sqlite3.Connection,
+    service_id: str,
+    start: str,
+    end: str,
+    usage_type: str,
+) -> list[sqlite3.Row]:
+    """Compute the (operation_class, operation_type) totals using the hourly
+    rollup for fully-contained hours plus raw usage_log for the two boundary
+    hours (which usually aren't hour-aligned).
+
+    Falls back to one raw ``timestamp >= start AND timestamp <= end`` scan
+    whenever the three-part split would be wrong or pointless:
+
+    * ``start`` and ``end`` fall in the same hour (no interior hour);
+    * the range is inverted (start hour after end hour) — the boundary
+      scans would otherwise return rows from both hours for an empty range;
+    * either bound does not begin with a ``YYYY-MM-DDTHH`` prefix (empty,
+      truncated, space-separated) — the hour arithmetic is undefined there.
+
+    Args:
+        con: Open connection to the service's metadata DB.
+        service_id: Service to aggregate.
+        start: Inclusive lower bound, ISO timestamp string.
+        end: Inclusive upper bound, ISO timestamp string.
+        usage_type: ``""`` for all classes, one of ``CDN`` / ``FOS`` /
+            ``FOS-A`` / ``FOS-B``, or a literal operation_class value.
+
+    Returns:
+        Rows of ``(operation_class, operation_type, c, b)``.
+    """
+    # Hour bucket prefix is "YYYY-MM-DDTHH" (13 chars). Timestamps in
+    # usage_log are stored as ISO strings, so prefix comparison is correct.
+    start_hour = (start or "")[:13]
+    end_hour = (end or "")[:13]
+
+    known_filters = {
+        "CDN": "AND operation_class = 'CDN'",
+        "FOS-A": "AND operation_class = 'A'",
+        "FOS-B": "AND operation_class = 'B'",
+        "FOS": "AND operation_class IN ('A', 'B')",
+    }
+    class_filter = ""
+    class_params: list = []
+    if usage_type:
+        if usage_type in known_filters:
+            class_filter = known_filters[usage_type]
+        else:
+            class_filter = "AND operation_class = ?"
+            class_params = [usage_type]
+
+    def _parse_hour(hour_prefix: str) -> datetime | None:
+        """Parse a 13-char hour prefix; None when it isn't one."""
+        if len(hour_prefix) != 13:
+            return None
+        try:
+            return datetime.strptime(hour_prefix, "%Y-%m-%dT%H").replace(tzinfo=UTC)
+        except ValueError:
+            return None
+
+    start_dt = _parse_hour(start_hour)
+    end_dt = _parse_hour(end_hour)
+
+    # No fully-contained hour (same hour, inverted range) or unparseable
+    # bounds: a single raw range scan is exact.
+    if start_dt is None or end_dt is None or start_dt >= end_dt:
+        rows = con.execute(
+            f"""
+            SELECT operation_class, operation_type,
+                   SUM(count) AS c, SUM(COALESCE(bytes, 0)) AS b
+            FROM usage_log
+            WHERE service_id = ? AND timestamp >= ? AND timestamp <= ? {class_filter}
+            GROUP BY operation_class, operation_type
+            """,
+            [service_id, start, end] + class_params,
+        ).fetchall()
+        return rows
+
+    # Boundary comparisons are keyed on timestamp directly (not
+    # `substr(timestamp, 1, 13)`) so SQLite can run them as a pure range
+    # scan. The bounds are bare hour prefixes: every timestamp in hour H
+    # sorts >= "…TH" and < the next hour's prefix regardless of how the
+    # seconds / fraction / zone suffix is written, so each raw row lands in
+    # exactly one of the three parts.
+    next_hour_after_start = (start_dt + timedelta(hours=1)).strftime("%Y-%m-%dT%H")
+
+    # NOTE(review): the rollup trigger stores COALESCE(count, 1) and ''
+    # for NULL class/type, while the raw branches below read the columns
+    # as-is — confirm usage_log never holds NULLs in those columns.
+    rollup_class_filter = class_filter  # same syntax works against the rollup
+    rows = con.execute(
+        f"""
+        SELECT operation_class, operation_type,
+               SUM(c) AS c, SUM(b) AS b
+        FROM (
+            SELECT operation_class, operation_type, count AS c, bytes AS b
+            FROM usage_log_hourly_summary
+            WHERE service_id = ? AND hour > ? AND hour < ? {rollup_class_filter}
+            UNION ALL
+            SELECT operation_class, operation_type, count AS c, COALESCE(bytes, 0) AS b
+            FROM usage_log
+            WHERE service_id = ? AND timestamp >= ? AND timestamp < ? {class_filter}
+            UNION ALL
+            SELECT operation_class, operation_type, count AS c, COALESCE(bytes, 0) AS b
+            FROM usage_log
+            WHERE service_id = ? AND timestamp >= ? AND timestamp <= ? {class_filter}
+        )
+        GROUP BY operation_class, operation_type
+        """,
+        # Interior rollup params
+        [service_id, start_hour, end_hour] + class_params
+        # Start-boundary raw params: [start, next hour after start_hour)
+        + [service_id, start, next_hour_after_start] + class_params
+        # End-boundary raw params: [start of end_hour, end]
+        + [service_id, end_hour, end] + class_params,
+    ).fetchall()
+    return rows
+
+
def get_usage_logs(
service_id: str,
start: str,
@@ -2065,20 +2519,33 @@ def get_usage_logs(
)
entries = [dict(r) for r in cur.fetchall()]
- # One GROUP BY (operation_class, operation_type) does the work of both the
- # 5-CASE-WHEN totals query AND the per-class breakdown — they're the same
- # 800K-row scan over usage_log, just shaped differently. Doing both in
- # one query saves a full pass per Usage Log page load (~1s on prod).
- grouped = con.execute(
- f"""
- SELECT operation_class, operation_type,
- sum(count) AS c, sum(coalesce(bytes, 0)) AS b
- FROM usage_log
- WHERE {where}
- GROUP BY 1, 2
- """,
- params,
- ).fetchall()
+ # Aggregate path: prefer the usage_log_hourly_summary rollup when only the
+ # service+timestamp predicates are active (the common admin-page case). The
+ # rollup is maintained incrementally by trg_usage_log_summary_insert, so
+ # it's always consistent — no scheduler needed. We can only use it when no
+ # process_context / operation_type LIKE filters are present (the rollup
+ # doesn't carry those columns); the operation_class filter IS supported
+ # because the rollup stores it as a normalised key. Backfill of any
+ # service that predates the trigger happens lazily on first read.
+ rollup_eligible = not process_context and not operation_type
+ if rollup_eligible:
+ _ensure_usage_log_hourly_backfilled(con, service_id)
+ grouped = _query_usage_log_aggregate_rollup(con, service_id, start, end, usage_type)
+ else:
+ # One GROUP BY (operation_class, operation_type) does the work of both the
+ # 5-CASE-WHEN totals query AND the per-class breakdown — they're the same
+ # 800K-row scan over usage_log, just shaped differently. Doing both in
+ # one query saves a full pass per Usage Log page load (~1s on prod).
+ grouped = con.execute(
+ f"""
+ SELECT operation_class, operation_type,
+ sum(count) AS c, sum(coalesce(bytes, 0)) AS b
+ FROM usage_log
+ WHERE {where}
+ GROUP BY 1, 2
+ """,
+ params,
+ ).fetchall()
totals = {"A": 0, "B": 0, "CDN": 0}
bytes_by_class = {"A": 0, "B": 0, "CDN": 0}
@@ -2105,3 +2572,369 @@ def get_usage_logs(
}
return entries, total, res_agg
+
+
+# ── Metadata retention / cleanup ──────────────────────────────────────────────
+# usage_log and ingested_files are append-only and unbounded by default.
+# On a long-running deploy they grow without limit (witnessed: 5.7 GB
+# metadata.db with 8.25M usage_log rows + 2.35M ingested_files rows). The
+# UI doesn't need that history beyond a short window — Usage & Cost pages
+# query a configurable window; Data Management shows recent files; cron_runs
+# is a short audit trail. Trim by age; keep VACUUM gated to actual deletions
+# because a no-op VACUUM still rewrites the whole file.
+
+# Per-table retention windows (days). Override via cfg["metadata_retention"]
+# per service. 0 (or negative) disables cleanup for that table / artefact.
+#
+# rollups_days is not a SQLite table but a per-hour parquet tree under
+# ``/rollups/hour/field=X/hour=Y/``. The cleanup helper deletes
+# hour-dirs older than this window. Default 90d gives broad dashboard
+# query coverage while bounding disk; set to 0 to keep all history.
+DEFAULT_METADATA_RETENTION = {
+    "usage_log_days": 1,
+    "ingested_files_days": 1,
+    "cron_runs_days": 7,
+    "rollups_days": 90,
+}
+
+# Tables surfaced in the storage stats endpoint. Order matters for the UI.
+# Tables that don't exist in a given DB are omitted by
+# get_metadata_storage_stats, so listing an absent name is harmless.
+# NOTE(review): this module reads/writes ``views`` and ``audit_logs``; the
+# ``saved_views`` / ``audit_log`` spellings are kept only in case some
+# deployed schema has them — confirm and drop if not.
+_STATS_TABLES = (
+    "usage_log",
+    "ingested_files",
+    "cron_runs",
+    "alerts",
+    "views",
+    "saved_views",
+    "audit_logs",
+    "audit_log",
+    "in_flight_buffers",
+    "locally_compacted_files",
+)
+
+# (table, retention_key, timestamp_column) for each trimmable table.
+_CLEANUP_TABLES = (
+    ("usage_log", "usage_log_days", "timestamp"),
+    ("ingested_files", "ingested_files_days", "ingested_at"),
+    ("cron_runs", "cron_runs_days", "started_at"),
+)
+
+
+def get_metadata_storage_stats(service_id: str) -> dict:
+    """Report row counts and page bytes for this service's metadata.db.
+
+    For every name in ``_STATS_TABLES`` that can be counted, the result
+    carries ``{"rows": int, "bytes": int | None}``; a table whose count
+    query raises ``OperationalError`` is left out. Byte figures come from
+    the ``dbstat`` virtual table and are None when querying it fails.
+    ``db_bytes`` sums ``pgsize`` over the whole file, so it is not the
+    sum of the per-table figures.
+    """
+    con = get_con(service_id)
+
+    def _pgsize_sum(sql: str, args: tuple = ()) -> int | None:
+        # None signals "dbstat query failed"; 0 means it ran but found no pages.
+        try:
+            hit = con.execute(sql, args).fetchone()
+        except sqlite3.OperationalError:
+            return None
+        if hit and hit[0] is not None:
+            return int(hit[0])
+        return 0
+
+    per_table: dict[str, dict] = {}
+    for table in _STATS_TABLES:
+        try:
+            n_rows = con.execute(f"SELECT count(*) FROM {table}").fetchone()[0]
+        except sqlite3.OperationalError:
+            continue
+        per_table[table] = {
+            "rows": int(n_rows or 0),
+            "bytes": _pgsize_sum("SELECT sum(pgsize) FROM dbstat WHERE name = ?", (table,)),
+        }
+
+    db_bytes: int | None = _pgsize_sum("SELECT sum(pgsize) FROM dbstat")
+
+    return {
+        "tables": per_table,
+        "db_bytes": db_bytes,
+        "db_path": db_path(service_id),
+    }
+
+
+def is_ingested_files_dedup_active(service_id: str) -> bool:
+    """Return True when ``ingested_files`` is SAFE TO TRIM for this service.
+
+    NOTE: despite the name, the return value is True when
+    ``provisioning.cron_sync.delete_after`` is anything other than an
+    explicit ``False``, and False only when it is explicitly ``False``.
+    ``cleanup_metadata`` relies on exactly this polarity (it disables
+    ``ingested_files`` retention when this returns False), so the
+    behaviour is kept as-is and only documented accurately here.
+
+    Background: the sync's ``delete_after`` flag (default True) makes
+    ingest a destructive op: a successfully-ingested .gz is DELETEd from
+    FOS, so a future LIST can never re-discover it — the
+    ``ingested_files`` row is vestigial after that point. When
+    ``delete_after`` is set to False, the raw files stay in FOS forever
+    and the daily ``full_sync`` (cron) does a complete LIST; the only
+    thing stopping it from re-ingesting every prior file is a matching
+    entry in ``ingested_files``. In that mode the table CANNOT be trimmed
+    without causing re-ingestion storms.
+    """
+    from backend import config as svcconfig
+
+    cfg = svcconfig.load_config(service_id) or {}
+    # `or {}` at each level: a section that is present but empty/None
+    # must behave like a missing one instead of raising AttributeError.
+    provisioning = cfg.get("provisioning") or {}
+    cron_sync = provisioning.get("cron_sync") or {}
+    delete_after = cron_sync.get("delete_after", True)
+    # Treat anything other than an explicit False as safe-to-trim. None,
+    # missing, truthy strings — all default to the safe path.
+    return delete_after is not False
+
+
+def cleanup_metadata(
+    service_id: str,
+    retention: dict | None = None,
+    on_event=None,
+) -> dict:
+    """Delete rows older than the per-table retention window. VACUUM if any were deleted.
+
+    retention shape: ``{"usage_log_days": int, "ingested_files_days": int,
+    "cron_runs_days": int, "rollups_days": int}``. Missing keys fall back
+    to ``DEFAULT_METADATA_RETENTION``. A value of 0 (or negative) disables
+    cleanup for that table / artefact — useful for an analyst-only service
+    that wants to retain the full audit trail.
+
+    ``ingested_files_days`` is **force-overridden to 0** when
+    ``cron_sync.delete_after`` is False on this service — see
+    ``is_ingested_files_dedup_active``. The override is announced via an
+    ``on_event`` status message so the operator knows the configured
+    retention is being ignored.
+
+    Age comparison: the cutoff ``datetime('now', '-N days')`` is rendered
+    as ``YYYY-MM-DD HH:MM:SS``. Stored timestamps are normalised through
+    ``datetime()`` before comparing, because an ISO ``YYYY-MM-DDTHH:…Z``
+    value compared as raw text sorts after every same-day cutoff (``'T'``
+    > ``' '``) and would survive up to a day past its window. Values
+    SQLite cannot parse fall back to the raw text comparison.
+
+    ``on_event``: optional callable receiving event dicts at each milestone
+    (status messages, per-table delete results, VACUUM start/end). The
+    callback is invoked synchronously from the worker. Event shapes:
+
+        {"type": "status", "message": str}
+        {"type": "progress", "current": int, "total": int, "message": str}
+
+    With ``on_event=None`` no events are emitted; the per-table outcome is
+    still available in the returned dict.
+
+    Returns ``{"deleted": {table: count}, "before": {table: rows},
+    "after": {table: rows}, "vacuumed": bool, "rollups_deleted": int,
+    "duration_s": float}``.
+    """
+    import time as _t
+
+    def _emit(event: dict) -> None:
+        if on_event is None:
+            return
+        try:
+            on_event(event)
+        except Exception:
+            # Never let an event-sink failure abort the cleanup itself.
+            pass
+
+    cfg = {**DEFAULT_METADATA_RETENTION, **(retention or {})}
+
+    # Safety override: when cron_sync.delete_after is False, ingested_files
+    # is the dedup gate against re-LIST → re-ingest by the daily full_sync.
+    # Trimming it would re-ingest every aged-out file. Force-disable the
+    # ingested_files retention regardless of what cfg / caller passed,
+    # and surface the override so the operator sees why it didn't apply.
+    if not is_ingested_files_dedup_active(service_id):
+        configured = int(cfg.get("ingested_files_days") or 0)
+        if configured > 0:
+            _emit(
+                {
+                    "type": "status",
+                    "message": (
+                        f"ingested_files retention ({configured}d) ignored — "
+                        "cron_sync.delete_after=false makes this table the dedup gate. "
+                        "Trimming would cause full_sync to re-ingest aged-out files."
+                    ),
+                }
+            )
+        cfg["ingested_files_days"] = 0
+
+    con = get_con(service_id)
+    t0 = _t.time()
+
+    # Steps: 3 deletes + 1 vacuum + 1 post-count = 5. Set up the progress
+    # framing so the modal can render a determinate bar.
+    total_steps = len(_CLEANUP_TABLES) + 2
+
+    _emit({"type": "status", "message": "Reading current row counts…"})
+    before: dict[str, int] = {}
+    for table, _, _ in _CLEANUP_TABLES:
+        try:
+            before[table] = int(con.execute(f"SELECT count(*) FROM {table}").fetchone()[0] or 0)
+        except sqlite3.OperationalError:
+            before[table] = 0
+
+    deleted: dict[str, int] = {}
+    for idx, (table, key, ts_col) in enumerate(_CLEANUP_TABLES, start=1):
+        days = cfg.get(key)
+        try:
+            days_int = int(days) if days is not None else 0
+        except (TypeError, ValueError):
+            days_int = 0
+        if days_int <= 0:
+            deleted[table] = 0
+            _emit(
+                {
+                    "type": "progress",
+                    "current": idx,
+                    "total": total_steps,
+                    "message": f"{table}: retention disabled (0 days) — skipped",
+                }
+            )
+            continue
+        _emit({"type": "status", "message": f"Trimming {table} (older than {days_int}d)…"})
+        try:
+            # Normalise the stored value to SQLite's canonical datetime text
+            # so 'T'-separated ISO strings compare correctly against the
+            # space-separated cutoff; COALESCE keeps the raw-text comparison
+            # for anything datetime() can't parse.
+            cur = con.execute(
+                f"DELETE FROM {table} "
+                f"WHERE COALESCE(datetime({ts_col}), {ts_col}) < datetime('now', ?)",
+                (f"-{days_int} days",),
+            )
+            deleted[table] = int(cur.rowcount or 0)
+            con.commit()
+            _emit(
+                {
+                    "type": "progress",
+                    "current": idx,
+                    "total": total_steps,
+                    "message": f"{table}: deleted {deleted[table]:,} rows (kept rows ≤{days_int}d old)",
+                }
+            )
+        except sqlite3.OperationalError as e:
+            logger.warning("[metadata_cleanup] %s: skip %s — %s", service_id, table, e)
+            deleted[table] = 0
+            _emit(
+                {
+                    "type": "progress",
+                    "current": idx,
+                    "total": total_steps,
+                    "message": f"{table}: skipped ({e})",
+                }
+            )
+
+    vacuumed = False
+    if any(deleted.values()):
+        # VACUUM cannot run inside an open transaction. Commit + drop the
+        # Python wrapper's auto-BEGIN so the next execute() autocommits.
+        _emit(
+            {
+                "type": "status",
+                "message": "VACUUMing — rewrites the whole file, may take minutes on large DBs…",
+            }
+        )
+        con.commit()
+        old_iso = con.isolation_level
+        con.isolation_level = None
+        try:
+            con.execute("VACUUM")
+            vacuumed = True
+            _emit(
+                {
+                    "type": "progress",
+                    "current": len(_CLEANUP_TABLES) + 1,
+                    "total": total_steps,
+                    "message": "VACUUM complete — file shrunk to reflect deletions",
+                }
+            )
+        except sqlite3.OperationalError as e:
+            # Locked / busy — not fatal, the delete already shrank the row count.
+            logger.warning("[metadata_cleanup] %s: VACUUM skipped — %s", service_id, e)
+            _emit(
+                {
+                    "type": "progress",
+                    "current": len(_CLEANUP_TABLES) + 1,
+                    "total": total_steps,
+                    "message": f"VACUUM skipped ({e}) — row counts already reduced",
+                }
+            )
+        finally:
+            con.isolation_level = old_iso
+    else:
+        _emit(
+            {
+                "type": "progress",
+                "current": len(_CLEANUP_TABLES) + 1,
+                "total": total_steps,
+                "message": "Nothing deleted — VACUUM skipped (no-op rewrite would waste cycles)",
+            }
+        )
+
+    after: dict[str, int] = {}
+    for table, _, _ in _CLEANUP_TABLES:
+        try:
+            after[table] = int(con.execute(f"SELECT count(*) FROM {table}").fetchone()[0] or 0)
+        except sqlite3.OperationalError:
+            after[table] = 0
+    _emit(
+        {
+            "type": "progress",
+            "current": total_steps,
+            "total": total_steps,
+            "message": f"Final counts: {', '.join(f'{t}={n:,}' for t, n in after.items())}",
+        }
+    )
+
+    # Rollup parquet tree cleanup — independent of the SQLite tables. Skip
+    # silently when the rollups module / source aren't available; rollups
+    # are an optimisation, never a correctness dependency.
+    rollups_deleted = 0
+    try:
+        rollups_days = int(cfg.get("rollups_days") or 0)
+    except (TypeError, ValueError):
+        rollups_days = 0
+    if rollups_days > 0:
+        try:
+            from backend.core import rollups as _rollups
+            from backend.core.duckdb import get_source_for_service
+
+            src = get_source_for_service(service_id)
+            if src is not None:
+                rollups_deleted = _rollups.cleanup_old_rollups(service_id, src, rollups_days)
+                if rollups_deleted:
+                    _emit(
+                        {
+                            "type": "status",
+                            "message": f"Rollups: dropped {rollups_deleted} hour-dir(s) older than {rollups_days}d",
+                        }
+                    )
+        except Exception as e:
+            logger.warning("[metadata_cleanup] %s: rollups cleanup skipped — %s", service_id, e)
+
+    return {
+        "deleted": deleted,
+        "before": before,
+        "after": after,
+        "vacuumed": vacuumed,
+        "rollups_deleted": rollups_deleted,
+        "duration_s": round(_t.time() - t0, 3),
+    }
+
+
+# ── Data-migration tracking ───────────────────────────────────────────────────
+# See backend/core/data_migrations.py for the runner. These helpers exist here
+# (not in the runner module) so the runner can stay free of sqlite imports —
+# the per-service connection lifecycle lives entirely in this module.
+
+
+def list_applied_data_migrations(service_id: str) -> set[str]:
+    """Collect the names recorded in ``applied_data_migrations``.
+
+    Every stored name is returned regardless of its ``status`` column.
+    When the query raises ``OperationalError`` (e.g. the table does not
+    exist yet) the result is an empty set, i.e. "nothing applied".
+    """
+    db = get_con(service_id)
+    try:
+        cursor = db.execute("SELECT name FROM applied_data_migrations")
+        names = set()
+        for record in cursor.fetchall():
+            names.add(record["name"])
+        return names
+    except sqlite3.OperationalError:
+        # Schema not initialised on this connection yet.
+        return set()
+
+
+def record_applied_data_migration(
+    service_id: str,
+    name: str,
+    *,
+    duration_s: float,
+    status: str = "success",
+    notes: str | None = None,
+) -> None:
+    """Store (or overwrite) the outcome row for one data migration.
+
+    The row is keyed by ``name``; ``applied_at`` is stamped with
+    ``iso_z_now()`` at call time and ``duration_s`` is coerced to float.
+    """
+    upsert_sql = (
+        "INSERT OR REPLACE INTO applied_data_migrations (name, applied_at, duration_s, status, notes) "
+        "VALUES (?, ?, ?, ?, ?)"
+    )
+    values = (name, iso_z_now(), float(duration_s), status, notes)
+    db = get_con(service_id)
+    db.execute(upsert_sql, values)
+    db.commit()
diff --git a/backend/core/rollups.py b/backend/core/rollups.py
new file mode 100644
index 00000000..6b65ca09
--- /dev/null
+++ b/backend/core/rollups.py
@@ -0,0 +1,403 @@
+"""
+Hourly Top-N rollups for the dashboard.
+
+For each tracked field (e.g. ``ip``, ``country``, ``url``, custom fields), we
+keep one parquet file per hour at
+``<cache_dir>/rollups/hour/field=<field>/hour=<YYYY-MM-DD-HH>/compacted_*.parquet``
+holding the top-K most-common values for that field in that hour.
+
+The dashboard reads these instead of scanning the base ``logs`` view when no
+filters are active, which cuts the unfiltered 24h top-N from a multi-second
+scan to tens of milliseconds. The active hour is always served live off the
+base table (rollups don't include the in-progress hour).
+
+Writers:
+- ``recompute_touched_hours``: per sync tick, batched per-field COPY ...
+ PARTITION_BY (field, hour). Only re-computes the hours actually touched
+ by the new chunk.
+- ``backfill_rollups``: one-shot bulk build over all historical hours,
+ invoked at first-boot and when a new field is added.
+- ``cleanup_old_rollups``: drops per-hour directories older than the cfg
+ retention window. Called from the daily ``metadata_cleanup`` cron.
+
+Reader:
+- ``QueryRunner.execute_top_n_rollups`` in
+ ``backend/repositories/_base.py``.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import shutil
+import uuid
+from datetime import UTC, datetime, timedelta
+
+logger = logging.getLogger(__name__)
+
+# How many top values per (field, hour) we persist. Dashboards render
+# 10-25 at a time; 500 gives generous headroom for filter overlays and
+# the long-tail "Other" rollup.
+TOP_K = 500
+
+# SQL identifier safelist. Field names land verbatim inside ``"..."``
+# quoted identifiers and inside SELECT projections; service names land
+# in the table identifier ``logs_<name>``. Both come from cfg / DuckDB
+# schema and are PROBABLY already validated upstream — but a single
+# stray double-quote or backtick in either would break the query in a
+# way that's both a correctness bug and a privilege boundary (the
+# fields are derived from admin-controlled custom_field entries).
+# Defense in depth: this module rejects anything not matching the
+# pattern with a logged warning.
+_SAFE_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+
+def _is_safe_ident(name: str) -> bool:
+    return bool(name and _SAFE_IDENT_RE.match(name))  # empty / None is never a safe identifier
+
+
+def _safe_table_for(source: dict) -> str | None:
+    """Map ``source["name"]`` to ``logs_<name>``; log a warning and return None if the name is unsafe."""
+    service_name = source.get("name") or ""
+    if _is_safe_ident(service_name):
+        return f"logs_{service_name}"
+    logger.warning("[rollups] refusing to query unsafe service name: %r", service_name)
+    return None
+
+
+def _get_fields(src: dict) -> list[str]:
+    """List the field names that are eligible for hourly rollups.
+
+    Built-in ``FIELDS`` come first (virtual fields and names failing
+    ``_is_safe_ident`` removed), then every enabled, dashboard-visible custom
+    field whose name passes ``_is_safe_ident``. Unsafe custom names are logged
+    and left out instead of being fed into SQL.
+    """
+    from backend.repositories.dashboard import _VIRTUAL_FIELDS, FIELDS
+
+    customs: list[str] = []
+    for spec in (src.get("log_fields") or {}).get("custom_fields", []):
+        if spec.get("enabled", True) and spec.get("show_in_dashboard", True):
+            candidate = spec.get("name") or ""
+            if _is_safe_ident(candidate):
+                customs.append(candidate)
+            else:
+                logger.warning("[rollups] skipping custom field with unsafe name: %r", candidate)
+    # Virtual fields (e.g. waf_sig_ind) are computed views over CSV columns,
+    # not real column names, so they cannot be rolled up directly.
+    builtin = [f for f in FIELDS if f not in _VIRTUAL_FIELDS and _is_safe_ident(f)]
+    return builtin + customs
+
+
+def _rollups_root(source: dict) -> str:
+    from backend.core.duckdb import _cache_dir
+
+    cache_root = _cache_dir(source)
+    return os.path.join(cache_root, "rollups", "hour")  # root of the field=/hour= tree
+
+
+def _markers_path(source: dict) -> str:
+    """Location of the per-field backfill marker file.
+
+    JSON object ``{field: ISO timestamp}``. Supersedes the old single
+    ``.backfill_done`` marker, which could not tell a complete backfill
+    from one that predates a newly added custom field.
+    """
+    from backend.core.duckdb import _cache_dir
+
+    return os.path.join(os.path.join(_cache_dir(source), "rollups"), "backfill_markers.json")
+
+
+def _load_markers(source: dict) -> dict[str, str]:
+    """Read the backfill markers; a missing, unreadable or non-dict file yields ``{}``."""
+    marker_file = _markers_path(source)
+    loaded: object = None
+    if os.path.exists(marker_file):
+        try:
+            with open(marker_file) as fh:
+                loaded = json.load(fh)
+        except (OSError, json.JSONDecodeError) as err:
+            logger.warning("[rollups] could not read markers at %s: %s", marker_file, err)
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _save_markers(source: dict, markers: dict[str, str]) -> None:
+    """Persist ``markers`` as JSON via temp file + ``os.replace``; an ``OSError`` while writing is logged, not raised."""
+    target = _markers_path(source)
+    os.makedirs(os.path.dirname(target), exist_ok=True)
+    scratch = f"{target}.tmp.{uuid.uuid4().hex[:8]}"
+    try:
+        with open(scratch, "w") as out:
+            json.dump(markers, out)
+        os.replace(scratch, target)
+    except OSError as err:
+        logger.warning("[rollups] could not write markers to %s: %s", target, err)
+        try:
+            os.remove(scratch)
+        except OSError:
+            pass
+
+
+def _publish_field_partitions(tmp_field_dir: str, dst_root: str, field: str) -> int:
+    """Move per-hour parquet files from a temp PARTITION_BY tree into the
+    canonical ``rollups/hour/field=X/hour=Y/`` layout.
+
+    ``tmp_field_dir`` is expected to contain ``field=<field>/hour=<hour>/``
+    sub-directories; anything else is ignored. ``dst_root`` is the rollups
+    root that receives ``field=<field>/hour=<hour>/compacted_<hex>.parquet``.
+
+    The publish order is RENAME-then-UNLINK to close the race window where
+    a concurrent dashboard read could observe an empty hour directory.
+    Worst case after this change: a dashboard read briefly sees BOTH the
+    new and old parquet for the same hour and double-counts that hour
+    until the unlink lands — which is bounded and self-corrects on the
+    next refresh. Pre-fix, the dashboard could observe ZERO files for the
+    hour (undercount), which was indistinguishable from a real traffic dip.
+
+    Caller MUST hold the per-service iceberg lock around the whole call.
+    Returns the number of hour-dirs published. ``OSError`` from
+    ``os.makedirs`` / ``os.listdir`` / ``os.rename`` propagates; only the
+    unlink of stale files is best-effort.
+    """
+    field_dir = os.path.join(tmp_field_dir, f"field={field}")
+    if not os.path.isdir(field_dir):
+        # Nothing was written for this field.
+        return 0
+
+    published = 0
+    for hour_dirname in os.listdir(field_dir):
+        if not hour_dirname.startswith("hour="):
+            continue
+        src_hour_dir = os.path.join(field_dir, hour_dirname)
+        dst_hour_dir = os.path.join(dst_root, f"field={field}", hour_dirname)
+        os.makedirs(dst_hour_dir, exist_ok=True)
+
+        # 1. Rename new files into place first (overcounting window OK).
+        #    Each file gets a fresh random name so it cannot collide with
+        #    (and silently replace) a file that is already published.
+        new_names: set[str] = set()
+        for fname in os.listdir(src_hour_dir):
+            if not fname.endswith(".parquet"):
+                continue
+            new_name = f"compacted_{uuid.uuid4().hex[:12]}.parquet"
+            os.rename(os.path.join(src_hour_dir, fname), os.path.join(dst_hour_dir, new_name))
+            new_names.add(new_name)
+
+        # 2. Now unlink any pre-existing files that we didn't just write.
+        #    Skipped entirely when nothing new arrived, so an hour is never
+        #    emptied without a replacement.
+        if new_names:
+            for existing in os.listdir(dst_hour_dir):
+                if existing.endswith(".parquet") and existing not in new_names:
+                    try:
+                        os.remove(os.path.join(dst_hour_dir, existing))
+                    except OSError as e:
+                        logger.warning("[rollups] could not unlink stale %s: %s", existing, e)
+            published += 1
+
+    return published
+
+
+def _build_copy_query(table_ident: str, field: str, where_sql: str) -> str:
+    """Return the inner SELECT for one field; the caller wraps it in
+    ``COPY (...) TO ... (PARTITION_BY (field, hour))``.
+
+    The query groups rows matching ``where_sql`` by hour bucket and by the
+    field's value cast to VARCHAR, ranks the values inside each hour by
+    ``COUNT(*)`` descending and keeps the first ``TOP_K`` per hour.
+    NOTE(review): the ranking has no tie-breaker, so which values survive
+    at the ``TOP_K`` boundary is up to the engine when counts are equal.
+
+    Inputs must already be validated — this function does NO escaping.
+    Callers (recompute_touched_hours / backfill_rollups) gate via
+    ``_is_safe_ident`` and ``_safe_table_for``.
+    """
+    return f"""
+        SELECT field, hour, value, count FROM (
+            SELECT
+                '{field}' AS field,
+                strftime(timestamp, '%Y-%m-%d-%H') AS hour,
+                CAST("{field}" AS VARCHAR) AS value,
+                COUNT(*) AS count,
+                ROW_NUMBER() OVER (
+                    PARTITION BY strftime(timestamp, '%Y-%m-%d-%H')
+                    ORDER BY COUNT(*) DESC
+                ) AS rn
+            FROM {table_ident}
+            WHERE {where_sql}
+            GROUP BY 1, 2, 3
+        ) WHERE rn <= {TOP_K}
+    """
+
+
+def recompute_touched_hours(service_id: str, source: dict, hours: set[str]) -> None:
+    """Recompute rollups for all dashboard fields across the given hours.
+
+    Excludes the active (current UTC) hour — the dashboard serves the
+    in-progress hour live off the base table. One COPY query per field
+    handles all touched hours via PARTITION_BY, so the work is O(fields)
+    not O(fields × hours).
+
+    Each token in ``hours`` is parsed as ``%Y-%m-%d-%H`` and re-rendered in
+    that canonical form before use. ``strptime`` also accepts non-padded
+    input such as ``2024-1-5-7``; used verbatim, such a token would never
+    match the zero-padded ``strftime`` output in SQL and would slip past the
+    active-hour check. Unparseable tokens are logged and skipped.
+    """
+    if not hours:
+        return
+
+    hour_fmt = "%Y-%m-%d-%H"
+    active_hour = datetime.now(UTC).strftime(hour_fmt)
+    # Keyed by the canonical token, so only strftime output is ever
+    # interpolated into the SQL below and duplicates collapse.
+    parsed: dict[str, datetime] = {}
+    for h in hours:
+        try:
+            dt = datetime.strptime(h, hour_fmt).replace(tzinfo=UTC)
+        except ValueError:
+            logger.warning("[rollups] skipping malformed hour token: %r", h)
+            continue
+        parsed[dt.strftime(hour_fmt)] = dt
+    parsed.pop(active_hour, None)
+    if not parsed:
+        return
+
+    table_ident = _safe_table_for(source)
+    if not table_ident:
+        return
+
+    # The timestamp range bounds the scan; the IN list then selects the
+    # exact hours inside that range.
+    min_start = min(parsed.values())
+    max_end = max(parsed.values()) + timedelta(hours=1)
+    hour_list_sql = ", ".join(f"'{token}'" for token in parsed)
+    where_sql = (
+        f"timestamp >= '{min_start.isoformat()}' "
+        f"AND timestamp < '{max_end.isoformat()}' "
+        f"AND strftime(timestamp, '%Y-%m-%d-%H') IN ({hour_list_sql})"
+    )
+    _run_per_field_copy(service_id, source, table_ident, where_sql, _get_fields(source))
+
+
+def backfill_rollups(service_id: str, source: dict, fields: list[str] | None = None) -> None:
+    """One-shot bulk build for all historical hours up to (but not including)
+    the current hour.
+
+    ``fields``: if provided, only backfills the given subset (used when a
+    new custom field is added — see :func:`ensure_field_backfills`).
+    Defaults to all eligible fields.
+
+    Only the fields that ``_run_per_field_copy`` reports as completed are
+    stamped in the markers file. A field whose COPY failed, or a run whose
+    DESCRIBE failed, therefore stays unmarked and is picked up again by the
+    next :func:`ensure_field_backfills` call instead of being recorded as
+    done with no data behind it.
+    """
+    table_ident = _safe_table_for(source)
+    if not table_ident:
+        return
+
+    target_fields = fields if fields is not None else _get_fields(source)
+    if not target_fields:
+        return
+
+    dt_end = datetime.now(UTC).replace(minute=0, second=0, microsecond=0)
+    where_sql = f"timestamp < '{dt_end.isoformat()}'"
+    completed = _run_per_field_copy(service_id, source, table_ident, where_sql, target_fields)
+    if not completed:
+        return
+
+    # Stamp completion in the markers file so ensure_field_backfills can
+    # detect which fields still need a backfill on next startup / cfg change.
+    markers = _load_markers(source)
+    stamp = datetime.now(UTC).isoformat()
+    for f in completed:
+        markers[f] = stamp
+    _save_markers(source, markers)
+
+
+def ensure_field_backfills(service_id: str, source: dict) -> None:
+    """Backfill any eligible fields that don't yet have a marker entry.
+
+    Triggered at startup (full backfill if no markers) and by callers that
+    mutate the log_fields config (new field added). Idempotent — fields
+    already in the markers file are skipped.
+    """
+    markers = _load_markers(source)
+    eligible = _get_fields(source)
+    missing = [f for f in eligible if f not in markers]
+    if not missing:
+        return
+    logger.info(
+        "[rollups] service %s: backfilling %d new field(s): %s",
+        service_id,
+        len(missing),
+        missing,
+    )
+    backfill_rollups(service_id, source, fields=missing)
+
+
+def cleanup_old_rollups(service_id: str, source: dict, max_age_days: int) -> int:
+    """Delete per-hour rollup directories older than ``max_age_days``.
+
+    ``max_age_days <= 0`` disables cleanup (keep everything). Returns the
+    number of hour-dirs deleted. Only hours STRICTLY older than the cutoff
+    are candidates, so the current and recently written hours are never
+    touched. NOTE(review): ``backfill_rollups`` writes historical hours as
+    well, so a backfill running at the same time can still overlap with
+    this cleanup — confirm whether callers serialise the two.
+
+    Entries under the rollups root that are not directories are skipped; a
+    failure to delete one hour-dir is logged and the walk continues.
+    """
+    if max_age_days <= 0:
+        return 0
+    rollup_root = _rollups_root(source)
+    if not os.path.isdir(rollup_root):
+        return 0
+
+    cutoff = (datetime.now(UTC) - timedelta(days=max_age_days)).strftime("%Y-%m-%d-%H")
+    deleted = 0
+    try:
+        for field_entry in os.listdir(rollup_root):
+            if not field_entry.startswith("field="):
+                continue
+            field_dir = os.path.join(rollup_root, field_entry)
+            if not os.path.isdir(field_dir):
+                # A stray file named ``field=...`` would make the listdir
+                # below raise and abort the cleanup of every later field.
+                continue
+            for hour_entry in os.listdir(field_dir):
+                if not hour_entry.startswith("hour="):
+                    continue
+                hour = hour_entry[len("hour=") :]
+                # String compare works because the format is fixed-width
+                # YYYY-MM-DD-HH which sorts lexicographically by time.
+                if hour < cutoff:
+                    hour_dir = os.path.join(field_dir, hour_entry)
+                    try:
+                        shutil.rmtree(hour_dir)
+                        deleted += 1
+                    except OSError as e:
+                        logger.warning("[rollups] could not delete %s: %s", hour_dir, e)
+    except OSError as e:
+        logger.warning("[rollups] cleanup walk failed for %s: %s", service_id, e)
+    return deleted
+
+
+def _run_per_field_copy(
+    service_id: str,
+    source: dict,
+    table_ident: str,
+    where_sql: str,
+    fields: list[str],
+) -> list[str]:
+    """Shared core of recompute_touched_hours and backfill_rollups.
+
+    One COPY query per field, writing to a per-field temp directory via
+    PARTITION_BY (field, hour), then publishing each hour-dir under the
+    per-service iceberg lock.
+
+    Returns the fields that were handled without error: those whose COPY
+    and publish succeeded, plus those that are not columns of the table
+    (nothing to roll up). Fields with an unsafe name or a failed COPY are
+    left out, and a failed DESCRIBE yields an empty list. Callers that do
+    not need this may ignore the return value.
+    """
+    import duckdb
+
+    from backend.core.duckdb import _cache_dir, get_connection
+    from backend.core.iceberg import _get_service_lock
+
+    cache_root = _cache_dir(source)
+    rollups_dir = _rollups_root(source)
+    os.makedirs(rollups_dir, exist_ok=True)
+    lock_key = source.get("name", "default")
+
+    completed: list[str] = []
+    con = get_connection(source=source, read_only=True)
+    try:
+        try:
+            cols = {c[0] for c in con.execute(f"DESCRIBE {table_ident}").fetchall()}
+        except duckdb.Error as e:
+            logger.warning("[rollups] %s: could not describe %s: %s", service_id, table_ident, e)
+            return completed
+
+        for field in fields:
+            if not _is_safe_ident(field):
+                # Belt-and-suspenders — _get_fields already filters, but
+                # defend against direct callers passing raw names.
+                logger.warning("[rollups] skipping unsafe field name: %r", field)
+                continue
+            if field not in cols:
+                # No such column, hence nothing to roll up for this field.
+                completed.append(field)
+                continue
+
+            # Start from an empty temp dir so leftovers of an earlier run
+            # are never published.
+            tmp_field_dir = os.path.join(cache_root, "rollups", "tmp", field)
+            shutil.rmtree(tmp_field_dir, ignore_errors=True)
+            os.makedirs(tmp_field_dir, exist_ok=True)
+
+            inner = _build_copy_query(table_ident, field, where_sql)
+            query = (
+                f"COPY ({inner}) TO '{tmp_field_dir}' "
+                "(FORMAT PARQUET, PARTITION_BY (field, hour), OVERWRITE_OR_IGNORE, COMPRESSION ZSTD)"
+            )
+            try:
+                con.execute(query)
+            except duckdb.Error as e:
+                logger.warning("[rollups] %s: COPY failed for field=%s: %s", service_id, field, e)
+                shutil.rmtree(tmp_field_dir, ignore_errors=True)
+                continue
+
+            with _get_service_lock(lock_key):
+                _publish_field_partitions(tmp_field_dir, rollups_dir, field)
+            shutil.rmtree(tmp_field_dir, ignore_errors=True)
+            completed.append(field)
+        return completed
+    finally:
+        con.close()
diff --git a/backend/core/share_db.py b/backend/core/share_db.py
index 4d2569e2..74b9ead4 100644
--- a/backend/core/share_db.py
+++ b/backend/core/share_db.py
@@ -39,6 +39,8 @@
from datetime import UTC, datetime, timedelta
from typing import Any
+from backend.utils.date_utils import iso_z, iso_z_now
+
logger = logging.getLogger(__name__)
# ── Locations ────────────────────────────────────────────────────────────────
@@ -90,11 +92,50 @@ def get_safe_share_db_connection(path: str) -> sqlite3.Connection:
con.execute("SELECT 1").fetchone()
return con
except sqlite3.DatabaseError as exc:
+ # Security: ``DatabaseError`` is the parent of
+ # ``OperationalError``, which fires for transient conditions like
+ # "database is locked" / "disk I/O error" / FD exhaustion. The
+ # quarantine path renames the DB out from under any other open
+ # connections AND wipes all share state — running it on a transient
+ # error means a single lock-timeout under load can permanently
+ # delete every invite, session, and audit row in the share DB.
+ #
+ # Restrict the quarantine to actual file-corruption signatures from
+ # SQLite: "file is not a database" / "database disk image is malformed"
+ # / "unsupported file format". Anything else (lock timeout, I/O error,
+ # full disk, missing parent dir) is re-raised so the caller sees the
+ # real error instead of silently nuking the DB.
+ msg = str(exc).lower()
+ is_corruption = (
+ "malformed" in msg
+ or "not a database" in msg
+ or "unsupported file format" in msg
+ or "image is malformed" in msg
+ )
+ if not is_corruption:
+ # ERROR (not WARNING) so this near-miss is alertable from the
+ # existing log-error monitoring without needing a new metric
+ # plumbing — quarantine-skipped events should be rare; if we
+ # start seeing them at volume it's a signal that the
+ # is_corruption substrings need updating.
+ logger.error(
+ "[share_db] DatabaseError on open of %s NOT classified as corruption (err_type=%s); re-raising: %s",
+ path,
+ type(exc).__name__,
+ exc,
+ )
+ raise
+
epoch = int(time.time())
corrupt_path = f"{path}.corrupt-{epoch}"
try:
os.replace(path, corrupt_path)
- logger.error("[share_db] corrupt DB at %s quarantined to %s (%s)", path, corrupt_path, exc)
+ logger.error(
+ "[share_db] corrupt DB at %s quarantined to %s (reason=corruption, %s)",
+ path,
+ corrupt_path,
+ exc,
+ )
except OSError:
logger.exception("[share_db] failed to quarantine corrupt DB at %s", path)
raise
@@ -344,16 +385,7 @@ def apply_pending(con: sqlite3.Connection) -> int:
# ── Time helpers ─────────────────────────────────────────────────────────────
-
-
-def iso_z_now() -> str:
- return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-
-def iso_z(dt: datetime) -> str:
- if dt.tzinfo is None:
- dt = dt.replace(tzinfo=UTC)
- return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
+# Handled via backend.utils.date_utils imports above to avoid duplication.
# ── Passcode hashing (constant-time scrypt) ─────────────────────────────────
@@ -773,7 +805,15 @@ def get_remote_invites(*, con: sqlite3.Connection | None = None) -> list[dict]:
def get_remote_invite_by_email_passcode(
email: str, passcode: str, *, con: sqlite3.Connection | None = None
) -> dict | None:
- """Constant-time lookup. Returns the invite dict on success, else None."""
+ """Constant-time lookup. Returns the invite dict on success, else None.
+
+ Security: when no invite exists for ``email`` (e.g., email
+ enumeration attack), still run one scrypt verification against a dummy
+ hash with the same parameters so the response time matches the
+ invite-exists branch (~30 ms). Without this, an attacker measuring the
+ response latency can distinguish "email is registered, passcode wrong"
+ (slow) from "email never invited" (fast) and enumerate emails.
+ """
con = con or get_global_share_con()
norm_email = (email or "").strip().lower()
rows = con.execute(
@@ -790,12 +830,40 @@ def get_remote_invite_by_email_passcode(
if match is None:
match = dict(row)
if match is None:
+ # Equalize timing ONLY when the email has no invite at all. If
+ # rows existed (email present, passcode wrong) we already paid one
+ # scrypt per row inside the loop — running the dummy verification
+ # again would push the wrong-passcode branch to ``(N+1)×scrypt``
+ # while the no-email branch stays at ``1×scrypt``, recreating
+ # the 2× timing side-channel this function is meant to close.
+ if not rows:
+ _equalize_passcode_timing(passcode)
return None
match["pii_policy"] = json.loads(match.get("pii_policy") or '{"mask_ips": false}')
match["service_ids"] = get_remote_invite_services(match["id"], con=con)
return match
+_dummy_hash: str | None = None
+
+
+def _equalize_passcode_timing(passcode: str) -> None:
+    """Burn one scrypt verification against a throwaway hash.
+
+    Called on the "no invite for this email" branch so that it costs one
+    ``verify_passcode`` call, like the "email matched, passcode wrong"
+    branch does. The throwaway hash is produced by ``hash_passcode`` itself,
+    so it always carries whatever parameters real hashes use, and it is
+    built once per process and cached in ``_dummy_hash``.
+    """
+    global _dummy_hash
+    reference = _dummy_hash
+    if reference is None:
+        # First miss in this process: create and remember the dummy hash.
+        reference = _dummy_hash = hash_passcode("__dummy_for_timing_equalization__")
+    verify_passcode(passcode, reference)
+
+
def update_remote_invite_services(
invite_id: str, service_ids: list[str], *, con: sqlite3.Connection | None = None
) -> None:
@@ -1028,21 +1096,37 @@ def claim_token(token: str, ip: str, *, con: sqlite3.Connection | None = None) -
Returns the row dict on success; ``None`` if the token does not exist, is
expired, or was already claimed.
+
+ Security (TOCTOU): use a single atomic UPDATE with the
+ ``claimed_at IS NULL`` predicate baked into the WHERE clause. Earlier
+ versions ran SELECT-then-check-then-UPDATE under the same transaction,
+ but two concurrent claims could both pass the SELECT before either
+ UPDATE landed and end up double-redeeming. Now whichever transaction's
+ UPDATE commits first wins (rowcount == 1); the loser sees rowcount == 0
+ and returns None.
+
+ The SELECT after UPDATE re-reads the just-claimed row so we can return
+ the invite_id to the caller. Doing it inside the same ``with con:``
+ block keeps it in the same write transaction.
"""
con = con or get_global_share_con()
now = iso_z_now()
with con:
+ cur = con.execute(
+ """
+ UPDATE remote_invite_claim_tokens
+ SET claimed_at = ?, claimed_from_ip = ?
+ WHERE token = ?
+ AND claimed_at IS NULL
+ AND expires_at >= ?
+ """,
+ (now, ip, token, now),
+ )
+ if cur.rowcount != 1:
+ return None
row = con.execute("SELECT * FROM remote_invite_claim_tokens WHERE token=?", (token,)).fetchone()
if row is None:
return None
- if row["claimed_at"] is not None:
- return None
- if row["expires_at"] < now:
- return None
- con.execute(
- "UPDATE remote_invite_claim_tokens SET claimed_at=?, claimed_from_ip=? WHERE token=?",
- (now, ip, token),
- )
return dict(row)
@@ -1277,11 +1361,22 @@ def apply_pii_policy(obj, policy: dict):
return obj
masked_keys = {"ip", "ip_address", "client_ip", "remote_addr"}
- def _walk(node):
+ def _walk(node, parent_key=None):
if isinstance(node, dict):
- return {k: (mask_ip(v) if isinstance(v, str) and k in masked_keys else _walk(v)) for k, v in node.items()}
+ return {
+ k: (mask_ip(v) if isinstance(v, str) and k in masked_keys else _walk(v, parent_key=k))
+ for k, v in node.items()
+ }
if isinstance(node, list):
- return [_walk(x) for x in node]
+ # Array fields inherit the parent dict key for masking — e.g.
+ # ``{"client_ip": ["1.2.3.4", "5.6.7.8"]}`` must mask each string
+ # the same way the scalar form would. Without threading the
+ # parent key through, list-of-string IP fields slipped past the
+ # masker entirely.
+ return [
+ (mask_ip(x) if isinstance(x, str) and parent_key in masked_keys else _walk(x, parent_key=parent_key))
+ for x in node
+ ]
return node
return _walk(obj)
diff --git a/backend/cron_progress.py b/backend/cron_progress.py
index a2c8a86c..00176fcf 100644
--- a/backend/cron_progress.py
+++ b/backend/cron_progress.py
@@ -10,9 +10,121 @@
def start_progress(run_id: int, service_id: str = None, task: str = None):
with _lock:
if run_id not in _progress:
+ now = time.time()
_progress[run_id] = []
- _last_update[run_id] = time.time()
- _run_metadata[run_id] = {"service_id": service_id, "task": task}
+ _last_update[run_id] = now
+ _run_metadata[run_id] = {
+ "service_id": service_id,
+ "task": task,
+ "started_at": now,
+ }
+
+
+_STALE_AFTER_SECONDS = 300 # 5 min — covers slow syncs, kills zombie entries
+
+
+def list_active_runs() -> list[dict]:
+    """Return metadata for the runs that still look in flight.
+
+    A run is reported only when every one of these holds:
+      1. it has an entry in ``_run_metadata`` (``start_progress`` was called);
+      2. its most recent progress event is not terminal (``done``/``error``);
+      3. its ``_last_update`` is at most ``_STALE_AFTER_SECONDS`` old;
+      4. ``_db_status_is_terminal`` does not report it as finished.
+
+    (4) is the DB-truth backstop: when a worker ends (or is abandoned)
+    without the in-memory ``end_progress`` firing, the dicts keep showing
+    the run as active although the persisted row already says otherwise.
+    (3) covers the residual case where the DB write was skipped too — after
+    the staleness window without progress the run counts as a zombie.
+
+    Each returned dict is ``{"run_id": ...}`` merged with the stored metadata.
+    """
+    now = time.time()
+    terminal = ("done", "error")
+    with _lock:
+        pending = []
+        for run_id, meta in _run_metadata.items():
+            history = _progress.get(run_id) or []
+            finished = bool(history) and history[-1].get("type") in terminal
+            stale = now - _last_update.get(run_id, now) > _STALE_AFTER_SECONDS
+            if not (finished or stale):
+                pending.append((run_id, meta))
+
+    # The DB cross-check runs after the lock is released so a slow SQLite
+    # call cannot stall other progress operations.
+    return [
+        {"run_id": run_id, **meta}
+        for run_id, meta in pending
+        if not _db_status_is_terminal(meta.get("service_id"), run_id)
+    ]
+
+
+def _db_status_is_terminal(service_id: str | None, run_id: int) -> bool:
+    """Tell whether the persisted status of ``run_id`` is ``'success'`` or ``'error'``.
+
+    The status comes from ``metadata_db.get_cron_run_status``. The check is
+    best-effort: with no ``service_id``, or on any exception raised by the
+    lookup, the answer is False so the in-memory state keeps deciding
+    (showing one false in-flight run beats hiding a genuinely running one).
+    """
+    if not service_id:
+        return False
+    try:
+        from backend.core import metadata_db
+
+        return metadata_db.get_cron_run_status(service_id, run_id) in ("success", "error")
+    except Exception:
+        return False
+
+
+def reap_zombie_runs() -> int:
+    """Evict stale runs from ``_run_metadata`` and return how many were dropped.
+
+    Uses the same staleness rule as ``list_active_runs`` (no update for more
+    than ``_STALE_AFTER_SECONDS``) but mutates state instead of filtering.
+    A stale run without a terminal event first gets a synthetic ``error``
+    event appended, so anything reading its progress sees that it ended.
+
+    This complements ``cleanup_progress``: a run that died shortly after it
+    started is stale long before that function's TTL expires, and until it
+    is evicted it still shows up for any code that walks ``_run_metadata``.
+    """
+    now = time.time()
+    terminal = ("done", "error")
+    reaped = 0
+    with _lock:
+        stale_ids = [rid for rid in _run_metadata if now - _last_update.get(rid, now) > _STALE_AFTER_SECONDS]
+        for rid in stale_ids:
+            history = _progress.get(rid) or []
+            if not history or history[-1].get("type") not in terminal:
+                _progress.setdefault(rid, []).append(
+                    {"type": "error", "message": "scheduler reaped zombie cron (no progress in 5m)"}
+                )
+            _run_metadata.pop(rid, None)
+            reaped += 1
+    return reaped
def add_progress(run_id: int, event: dict):
@@ -57,13 +169,46 @@ def get_latest_progress_for_service(service_id: str) -> dict | None:
def end_progress(run_id: int, final_event: dict | None = None):
+ """Mark a cron run as ended.
+
+ AUTO-DONE: if no ``final_event`` is provided AND the run's last
+ event isn't already a terminal type ("done"/"error"), automatically
+ append a ``{"type": "done"}`` event so ``list_active_runs`` can
+ filter the run out. Without this, callers that emit only "status"
+ events during their lifetime (the sync path's view-refresh message
+ is the canonical example) leave the run "active" until the 1-hour
+ TTL — accumulating dozens of stale entries on the System Health card.
+
+ Explicit callers that want a richer terminal event can still pass
+ ``final_event={"type": "done", "rows": N}`` and the same append path
+ runs. The auto-emit only kicks in when the caller forgot.
+ """
with _lock:
if run_id in _progress:
+ events = _progress[run_id]
+ last_type = events[-1].get("type") if events else None
if final_event:
_progress[run_id].append(final_event)
+ elif last_type not in ("done", "error"):
+ _progress[run_id].append({"type": "done"})
_last_update[run_id] = time.time()
+def cleanup_progress_and_reap():
+    """Run ``cleanup_progress`` followed by ``reap_zombie_runs``.
+
+    Bundling the pair means a cron entrypoint cannot remember the cleanup
+    yet forget the reap, which would leave zombie entries behind.
+
+    Returns the count from ``reap_zombie_runs``; the result of
+    ``cleanup_progress`` is discarded.
+    """
+    cleanup_progress()
+    reaped = reap_zombie_runs()
+    return reaped
+
+
def cleanup_progress():
now = time.time()
with _lock:
diff --git a/backend/deps.py b/backend/deps.py
index 40ce522e..8e88b729 100644
--- a/backend/deps.py
+++ b/backend/deps.py
@@ -69,8 +69,16 @@ def get_source(service_id: str | None = Depends(get_service_id)) -> dict:
class _ConnectionHolder:
"""Holds a single DuckDB connection for the lifetime of one request.
- Used as a context-manager-style dependency so FastAPI closes the
- connection when the request finishes.
+ Read-only requests check out a pooled, pre-warmed connection via
+ ``duckdb_pool.checkout_connection`` (saves ~50ms per request of
+ pragma / S3 / iceberg-view setup). Write-mode connections still take
+ the always-fresh ``get_connection`` path because ingest holds the
+ write lock and pooling would defeat its lifecycle.
+
+ Used as a context-manager-style dependency so FastAPI returns the
+ connection to the pool (or closes the fresh one) when the request
+ finishes. On any exception the connection is discarded rather than
+ pooled so a poisoned connection doesn't get reused.
"""
def __init__(self, source: dict, skip_view_update: bool = False, read_only: bool = True):
@@ -78,37 +86,91 @@ def __init__(self, source: dict, skip_view_update: bool = False, read_only: bool
self._skip_view_update = skip_view_update
self._read_only = read_only
self.con: duckdb.DuckDBPyConnection | None = None
+        # True once __exit__ has seen an exception (``exc_type is not None``); stays False on a clean exit.
+ self._errored = False
+ # Used only on the pooled path so __exit__ can release.
+ self._pool_cm = None
def __enter__(self) -> duckdb.DuckDBPyConnection:
+ # Write mode + skip_view_update fall back to the fresh-connection
+ # path: the pool exists for the dominant read-only HTTP request
+ # workload, not for ingest's exclusive writer or for callers that
+ # explicitly opt out of view binding. The pool itself can also be
+ # disabled globally via DUCKDB_CONNECTION_POOL=0 (tests + emergency
+ # rollback); when disabled we go straight through ``get_connection``
+ # so behaviour matches the pre-pool design exactly.
+ from backend.core import duckdb_pool
+
+ use_pool = (
+ self._read_only
+ and not self._skip_view_update
+ and duckdb_pool._pool_enabled()
+ )
try:
- self.con = get_connection(
- source=self._source,
- max_wait=10, # Increased wait slightly for safety
- skip_view_update=self._skip_view_update,
- read_only=self._read_only,
- )
+ if use_pool:
+ self._pool_cm = duckdb_pool.checkout_connection(self._source, max_wait=10.0)
+ self.con = self._pool_cm.__enter__()
+ else:
+ self.con = get_connection(
+ source=self._source,
+ max_wait=10,
+ skip_view_update=self._skip_view_update,
+ read_only=self._read_only,
+ )
except DBBusyError as e:
raise HTTPException(
status_code=503, # 503 Service Unavailable so frontend fetch throws and React Query keeps cached data
detail={"error": str(e), "busy": True},
)
+ except Exception as e:
+ # Pool exhaustion (after wait timeout) surfaces as _PoolBusy.
+ # Translate to 503 so the frontend handles it the same as
+ # DBBusyError instead of throwing an opaque 500.
+ from backend.core.duckdb_pool import _PoolBusy
+
+ if isinstance(e, _PoolBusy):
+ raise HTTPException(
+ status_code=503,
+ detail={"error": str(e), "busy": True},
+ )
+ raise
return self.con
- def __exit__(self, *_):
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self._errored = exc_type is not None
+ if self._pool_cm is not None:
+ # Forward the exception to the pool context manager so it can
+ # mark the connection errored and discard.
+ try:
+ self._pool_cm.__exit__(exc_type, exc_val, exc_tb)
+ except Exception:
+ pass
+ self._pool_cm = None
+ self.con = None
+ return False
if self.con:
try:
self.con.close()
except Exception:
pass
self.con = None
+ return False
-def get_con(source: dict = Depends(get_source), read_only: bool = True) -> duckdb.DuckDBPyConnection:
+def get_con(source: dict = Depends(get_source)) -> duckdb.DuckDBPyConnection:
"""Dependency that yields a DuckDB connection and closes it after the request.
- Defaults to read_only=True for dashboard queries to prevent blocking on crons.
+ Always opens in read-only mode for HTTP request handlers — write-mode
+ connections are used only by the scheduler/cron pipeline, never by
+ user-facing routes.
+
+ Security: do NOT take ``read_only`` as a parameter. FastAPI converts
+ primitive-typed dependency parameters into query parameters, so any
+ request to a route using this dep could send ``?read_only=false`` and
+ force an exclusive write-lock acquisition that blocks readers and the
+ sync cron (503 DoS). The flag is hardcoded inside the holder instead.
"""
- holder = _ConnectionHolder(source, read_only=read_only)
+ holder = _ConnectionHolder(source, read_only=True)
with holder as con:
yield con
@@ -136,12 +198,51 @@ def __init__(
self.con = con
-def get_meta_con(source: dict = Depends(get_source), read_only: bool = True) -> duckdb.DuckDBPyConnection:
+# ── Tenant-scope enforcement (security) ─────────────
+
+
+def require_service_access(
+    request,
+    service_id: str | None = Depends(get_service_id),
+) -> str | None:
+    """Enforce analyst tenant scope for a single ``service_id``.
+
+    Local admin requests (``request.state.analyst_session`` missing or None)
+    bypass this check entirely and get ``service_id`` back unchanged.
+    Analysts must have the target ``service_id`` in their session's
+    ``service_ids`` list.
+
+    Use as a dependency on any route that returns or mutates per-service
+    data. Routes that take no ``service_id`` parameter and that expose a
+    list of services across the whole tenant must filter the list manually
+    using ``request.state.analyst_session.service_ids`` — this helper only
+    enforces the single-service case.
+
+    Args:
+        request: The incoming request; only ``request.state`` is read.
+            NOTE(review): this parameter is unannotated. FastAPI injects
+            the request object only for parameters annotated as
+            ``Request`` — confirm how this dependency is wired.
+        service_id: Requested service, resolved by ``get_service_id``.
+
+    Returns:
+        The authorised ``service_id``. When an analyst supplies none, the
+        first entry of the session's ``service_ids`` (None if it is empty).
+
+    Raises:
+        HTTPException: 403 ``service_not_authorized`` when an analyst asks
+            for a service outside their scope.
+    """
+    analyst_session = getattr(request.state, "analyst_session", None)
+    if analyst_session is None:
+        return service_id  # admin / local — unrestricted
+    # Use a list, not a set: iteration order of a set of strings is
+    # arbitrary, which would make the default service below differ between
+    # worker processes and restarts.
+    scoped = list(analyst_session.service_ids or [])
+    if service_id is None:
+        # Analyst calls with no explicit service default to the first of
+        # their scoped services (or None if the list is empty).
+        return scoped[0] if scoped else None
+    if service_id not in scoped:
+        raise HTTPException(
+            status_code=403,
+            detail={"error": "service_not_authorized", "service": service_id},
+        )
+    return service_id
+
+
+def get_meta_con(source: dict = Depends(get_source)) -> duckdb.DuckDBPyConnection:
"""Dependency that yields a DuckDB connection, skipping the Iceberg view update.
Use this for metadata routes (e.g. cron logs, admin settings) that don't
need to query the main logs table, to avoid blocking on S3 manifest reads.
+
+ Security: ``read_only`` is hardcoded True for the same reason as
+ ``get_con`` above.
"""
- holder = _ConnectionHolder(source, skip_view_update=True, read_only=read_only)
+ holder = _ConnectionHolder(source, skip_view_update=True, read_only=True)
with holder as con:
yield con
diff --git a/backend/main.py b/backend/main.py
index bc4691b2..239327e3 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -43,6 +43,8 @@
logging.getLogger("pyiceberg.io").setLevel(logging.WARNING)
logging.getLogger("apscheduler").setLevel(logging.WARNING)
+logger = logging.getLogger("backend.main")
+
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
@@ -98,6 +100,17 @@ def _initialize_service(cfg: dict):
if src:
_db.refresh_config_status(sid)
_ensure_persistent_view(sid, src)
+ # Data migrations: queues any pending one-time setup work
+ # (e.g. the initial rollups backfill) onto a daemon thread
+ # per service. Returns immediately so startup isn't gated
+ # on a potentially multi-minute backfill. See
+ # backend/core/data_migrations.py for the framework.
+ try:
+ from backend.core import data_migrations
+
+ data_migrations.run_pending(sid, src)
+ except Exception as e:
+ logging.warning("[fastapi] Service %s: could not queue data migrations: %s", sid, e)
logging.info("[fastapi] Service %s initialised.", sid)
except Exception as e:
logging.warning("[fastapi] Could not initialise service %s: %s", sid, e)
@@ -161,6 +174,50 @@ def _ensure_pop_cache():
logging.warning("[fastapi] Could not prefetch POP locations: %s", e)
+def _ensure_scoring_matrix():
+ """Pull the trained scoring matrix from FOS and write it to
+ ``_MATRIX_PATH``, using the first scoring-enabled service that has one.
+
+ Without this, the /scoring/evaluation endpoint falls back to the
+ bundled matrix.default.json (empty transitions → AUC ≈ 0.5) until
+ an operator manually drops compute/scorer/matrix.json into the
+ container. The fetch is best-effort: services without scoring
+ enabled or without a matrix are skipped, and any exception is
+ logged as a warning instead of propagating to the caller.
+ """
+ try:
+ from backend.provision.session_scoring_orchestrator import _MATRIX_PATH
+ from backend.state_sync import fetch_matrix_from_fos
+
+ for cfg in svcconfig.list_configs():
+ if not (cfg.get("scoring") or {}).get("enabled"):
+ continue
+ sid = cfg.get("service_id") or cfg.get("name")
+ try:
+ matrix = fetch_matrix_from_fos(sid)
+ if not matrix:
+ continue
+ _MATRIX_PATH.parent.mkdir(parents=True, exist_ok=True)
+ with _MATRIX_PATH.open("w") as f:
+ import json as _json
+
+ _json.dump(matrix, f)
+ logging.info(
+ "[fastapi] Pulled scoring matrix from FOS for %s (version=%s)",
+ sid,
+ matrix.get("version", "?"),
+ )
+ # First-write-wins: with multiple scoring-enabled services,
+ # the matrix file is global. They SHOULD all be the same
+ # matrix (one trainer, one deploy), but if they differ
+ # we keep whichever loaded first; the rest are never fetched.
+ break
+ except Exception as e:
+ logging.warning("[fastapi] Could not pull scoring matrix for %s: %s", sid, e)
+ except Exception as e:
+ logging.warning("[fastapi] _ensure_scoring_matrix failed: %s", e)
+
+
def _background_startup():
"""Run initialisation tasks that should not block the web server startup."""
# Tag everything done here so the s3fs/boto3 hooks attribute their
@@ -181,6 +238,7 @@ def _background_startup():
logging.warning("[fastapi] reload_default_source failed: %s", e)
_ensure_pop_cache()
+ _ensure_scoring_matrix()
try:
from backend.scheduler import get_scheduler
@@ -223,6 +281,56 @@ def _enforce_data_dir_mounted() -> None:
raise RuntimeError(msg)
+def _enforce_proxy_headers_configured() -> None:
+ """Security regression guard for the proxy-headers trust configuration.
+
+ The remote-access middleware reads ``request.client.host`` and trusts it as
+ the client's real IP. That only works if uvicorn is launched with
+ ``--proxy-headers --forwarded-allow-ips=<proxy-ip>`` (e.g. ``127.0.0.1``) —
+ without those flags the framework returns the loopback peer address for
+ every Caddy-proxied request and every IP-based gate (rate-limiting, admin
+ detection, whitelist) becomes ineffective.
+
+ Production sets ``TRUSTED_PROXY_IPS=127.0.0.1`` in docker-compose.prod.yml
+ alongside the uvicorn flags. If neither that env var nor
+ ``UVICORN_FORWARDED_ALLOW_IPS`` is set at boot, refuse to start in strict
+ mode, or otherwise emit a loud WARNING, so a future config refactor cannot
+ silently re-introduce the pre-patch vulnerability.
+
+ Strict mode is ``REQUIRE_PROXY_HEADERS=1`` or ``STRICT_DATA_DIR_CHECK=1``;
+ set it in production to make this a hard FATAL. Local dev / tests leave
+ all of these env vars unset, so the function only logs the WARNING.
+
+ Defense in depth: besides our own ``TRUSTED_PROXY_IPS`` we also read
+ uvicorn's ``UVICORN_FORWARDED_ALLOW_IPS`` env var (the env-equivalent
+ of the ``--forwarded-allow-ips`` CLI flag). Either one being non-empty
+ satisfies the guard.
+
+ Raises:
+ RuntimeError: strict mode is on and neither env var is set.
+ """
+ trusted = (os.environ.get("TRUSTED_PROXY_IPS") or "").strip()
+ uvicorn_trusted = (os.environ.get("UVICORN_FORWARDED_ALLOW_IPS") or "").strip()
+ require_strict = os.environ.get("REQUIRE_PROXY_HEADERS") == "1" or os.environ.get("STRICT_DATA_DIR_CHECK") == "1"
+ effective = trusted or uvicorn_trusted
+ if effective:
+ logging.info(
+ "[fastapi] proxy-headers trust set: TRUSTED_PROXY_IPS=%s UVICORN_FORWARDED_ALLOW_IPS=%s",
+ trusted or "(unset)",
+ uvicorn_trusted or "(unset)",
+ )
+ return
+ msg = (
+ "TRUSTED_PROXY_IPS is unset. uvicorn must be launched with "
+ "`--proxy-headers --forwarded-allow-ips=127.0.0.1` AND have "
+ "TRUSTED_PROXY_IPS=127.0.0.1 in its environment so the remote-access "
+ "middleware can read request.client.host as the real client IP. "
+ "Without this, leftmost-XFF spoofing becomes exploitable "
+ "and the admin Host-spoof bypass returns. See docker-compose.prod.yml."
+ )
+ if require_strict:
+ logging.critical("FATAL: %s", msg)
+ raise RuntimeError(msg)
+ logging.warning("[fastapi] %s", msg)
+
+
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup / shutdown lifecycle."""
@@ -232,6 +340,12 @@ async def lifespan(app: FastAPI):
# ingestion logic that would otherwise blindly write to the wrong path.
_enforce_data_dir_mounted()
+ # Proxy-headers regression guard (security). Production
+ # must have TRUSTED_PROXY_IPS set in env (mirrors the uvicorn
+ # --forwarded-allow-ips flag). Without it, IP-based gates become
+ # ineffective and the Host-spoof admin bypass returns.
+ _enforce_proxy_headers_configured()
+
# Verify dependencies
try:
import pyarrow # noqa: F401
@@ -289,7 +403,7 @@ async def lifespan(app: FastAPI):
app = FastAPI(
title="Fastly Log Analytics API",
- version="1.0.0",
+ version="1.1.0",
description=(
"FastAPI backend for the Fastly Log Analytics tool. "
"Serves the Next.js frontend and exposes an OpenAPI spec at /openapi.json."
@@ -380,13 +494,24 @@ async def telemetry_middleware(request: Request, call_next):
app.include_router(alerts.router)
app.include_router(origin.router)
-from backend.routers import admin, bootstrap, debug, provision, services, share_admin, share_auth, usage
+from backend.routers import (
+ admin,
+ bootstrap,
+ debug,
+ provision,
+ services,
+ session_scoring,
+ share_admin,
+ share_auth,
+ usage,
+)
app.include_router(bootstrap.router)
app.include_router(services.router)
app.include_router(usage.router)
app.include_router(admin.router)
app.include_router(provision.router)
+app.include_router(session_scoring.router)
app.include_router(debug.router)
app.include_router(share_auth.router)
app.include_router(share_admin.router)
diff --git a/backend/models/common.py b/backend/models/common.py
index 16ea6937..09c6b6c6 100644
--- a/backend/models/common.py
+++ b/backend/models/common.py
@@ -120,7 +120,9 @@ class DebugCall(BaseModel):
caller: str | None = None
-from pydantic import Field
+import os as _os
+
+from pydantic import Field, model_serializer
class HasDataMixin(BaseModel):
@@ -130,6 +132,25 @@ class HasDataMixin(BaseModel):
total: int = 0
+# 038: telemetry payloads (raw SQL + outbound API URL/timing) are useful
+# during development and incident response but they're an information-leak
+# surface in normal operation — every analyst dashboard fetch echoes the
+# server's internal SQL and the FOS object keys it touched. Gate inclusion
+# on a process-level ``DEBUG_RESPONSES`` env var so production
+# deployments default to "telemetry excluded from API responses" and an
+# operator who needs the debug panel during triage can flip the flag and
+# restart the process. The frontend DebugPanel reads ``_debug_queries`` /
+# ``_debug_calls`` via optional-chain access so a missing field renders
+# as an empty panel rather than throwing.
+#
+# Implementation uses ``model_serializer`` (not ``Field(exclude=...)``)
+# so the OpenAPI schema continues to describe the fields — keeps the
+# committed snapshot stable regardless of which mode the deployment
+# is running in, and avoids per-deployment frontend type drift.
+def _debug_responses_enabled() -> bool:
+ """Return True when the ``DEBUG_RESPONSES`` env var is ``1``, ``true`` or
+ ``yes`` (case-insensitive); unset or any other value means disabled.
+ """
+ return _os.getenv("DEBUG_RESPONSES", "").lower() in ("1", "true", "yes")
+
+
class BaseResponse(BaseModel):
"""Base response that automatically includes telemetry if present."""
@@ -137,6 +158,14 @@ class BaseResponse(BaseModel):
debug_calls: list[DebugCall] = Field(default_factory=list, serialization_alias="_debug_calls")
is_cached: bool = Field(default=False, serialization_alias="_is_cached")
+    @model_serializer(mode="wrap")
+    def _strip_debug_when_disabled(self, handler):
+        """Drop telemetry fields from the serialized payload unless
+        ``DEBUG_RESPONSES`` is enabled.
+
+        ``handler`` is the default serializer. Its output is keyed by
+        serialization alias when the dump is requested with ``by_alias``
+        and by field name otherwise, so both spellings are removed —
+        popping only the aliases would leak telemetry from a plain
+        ``model_dump()``.
+        """
+        data = handler(self)
+        if not _debug_responses_enabled():
+            # Field names are assumed to mirror the aliases without the
+            # leading underscore (``debug_calls`` is declared that way on
+            # this class; ``debug_queries`` presumably likewise).
+            for key in ("_debug_queries", "_debug_calls", "debug_queries", "debug_calls"):
+                data.pop(key, None)
+        return data
+
@classmethod
def with_telemetry(cls, **data):
"""Helper to create a response with context-local telemetry."""
diff --git a/backend/models/custom_fields.py b/backend/models/custom_fields.py
index 2326e9ab..f2855337 100644
--- a/backend/models/custom_fields.py
+++ b/backend/models/custom_fields.py
@@ -15,7 +15,7 @@ class CustomField(BaseModel):
label: str
description: str = ""
vcl_log_expression: str
- collection_stage: Literal["edge", "origin"] = "edge"
+ collection_stage: Literal["edge", "origin", "deliver"] = "edge"
origin_log_frequency: Literal["all", "miss_pass"] = "all"
duckdb_type: Literal["VARCHAR", "INTEGER", "BIGINT", "DOUBLE", "BOOLEAN"] = "VARCHAR"
value_type: Literal["string", "numeric", "boolean", "ip", "url"] = "string"
@@ -51,7 +51,7 @@ class CustomFieldUpdate(BaseModel):
label: str | None = None
description: str | None = None
vcl_log_expression: str | None = None
- collection_stage: Literal["edge", "origin"] | None = None
+ collection_stage: Literal["edge", "origin", "deliver"] | None = None
origin_log_frequency: Literal["all", "miss_pass"] | None = None
duckdb_type: Literal["VARCHAR", "INTEGER", "BIGINT", "DOUBLE", "BOOLEAN"] | None = None
value_type: Literal["string", "numeric", "boolean", "ip", "url"] | None = None
@@ -74,7 +74,7 @@ class CustomFieldsListResponse(BaseResponse):
class VclLintRequest(BaseModel):
vcl_log_expression: str
- collection_stage: Literal["edge", "origin"] = "edge"
+ collection_stage: Literal["edge", "origin", "deliver"] = "edge"
log_fields_config: dict | None = None
diff --git a/backend/models/lake.py b/backend/models/lake.py
index 8b2c53c6..2375ec54 100644
--- a/backend/models/lake.py
+++ b/backend/models/lake.py
@@ -3,6 +3,41 @@
from __future__ import annotations
import json
+import urllib.parse
+
+# Hostname suffixes allowed for ``cdn_url`` when the SSRF check below
+# decides whether to issue an outbound HTTP request. Any other hostname
+# (including bare IPs, ``localhost``, link-local addresses, or
+# attacker-supplied internal hostnames) is rejected — the field is
+# user-controlled at provision time and an attacker who can inject
+# ``http://169.254.169.254`` would otherwise turn fetch_lake_info into
+# an SSRF probe of the GCE metadata service.
+_CDN_URL_ALLOWED_HOST_SUFFIXES = (
+ ".fastly.net",
+ ".fastlystorage.app",
+)
+
+
+def _safe_cdn_url(cdn_url: str) -> str | None:
+ """Return ``cdn_url`` only if it's an https:// URL on an allowlisted
+ Fastly hostname, else None. Caller treats None as "skip the CDN
+ fast path and fall through to the SDK".
+
+ The hostname is compared case-insensitively against
+ ``_CDN_URL_ALLOWED_HOST_SUFFIXES``; an accepted URL is returned unmodified.
+ """
+ if not cdn_url:
+ return None
+ try:
+ parsed = urllib.parse.urlsplit(cdn_url)
+ except ValueError:
+ # Unparseable URL — treat it the same as a disallowed host.
+ return None
+ if parsed.scheme != "https":
+ return None
+ # ``hostname`` excludes userinfo and port, so ``user@host:port`` forms cannot smuggle a match.
+ hostname = (parsed.hostname or "").lower()
+ if not hostname:
+ return None
+ for suffix in _CDN_URL_ALLOWED_HOST_SUFFIXES:
+ # Every suffix starts with ".", so only subdomains of the allowed zones match.
+ if hostname.endswith(suffix):
+ return cdn_url
+ return None
def fetch_lake_info(source: dict, use_temp_cache: bool = False) -> dict:
@@ -29,9 +64,8 @@ def fetch_lake_info(source: dict, use_temp_cache: bool = False) -> dict:
namespace, table_name = db_iceberg._table_identifier(source)
summary_key = f"{iceberg_root}/{namespace}/{table_name}/table_summary.json"
- cdn_url = (source.get("cdn_url") or "").rstrip("/")
+ cdn_url = _safe_cdn_url((source.get("cdn_url") or "").rstrip("/"))
if cdn_url:
- import urllib.parse
import urllib.request
from backend.utils.telemetry import record_cdn_call
diff --git a/backend/provision/cli.py b/backend/provision/cli.py
index 38f66243..3c6eb13f 100644
--- a/backend/provision/cli.py
+++ b/backend/provision/cli.py
@@ -266,6 +266,27 @@ def handle_update_logs(args):
else (cfg.get("log_fields") or _build_log_fields_config(args))
)
+ # MERGE GUARD (sibling of state_sync.import_admin_state fix from
+ # 2026-06-02 incident): _build_log_fields_config(args) returns
+ # {schema_version, preset, groups, field_overrides} — it has NO
+ # custom_fields key. Assigning the result wholesale to
+ # cfg["log_fields"] would strip the 6 scoring custom_fields the
+ # orchestrator injected, the user's own custom_fields, and any
+ # format_hash/updated_at metadata. Preserve custom_fields from the
+ # on-disk cfg, then if scoring is enabled re-inject the canonical
+ # _SCORING_CUSTOM_FIELDS from code as the source of truth.
+ existing_lf = cfg.get("log_fields") or {}
+ existing_custom = list(existing_lf.get("custom_fields") or [])
+ if cfg.get("scoring", {}).get("enabled"):
+ from backend.provision.session_scoring_orchestrator import (
+ _SCORING_CUSTOM_FIELDS,
+ _SCORING_FIELD_NAMES,
+ )
+
+ existing_custom = [cf for cf in existing_custom if cf.get("name") not in _SCORING_FIELD_NAMES]
+ existing_custom.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS)
+ new_lf_config["custom_fields"] = existing_custom
+
if getattr(args, "dry_run", False):
print(lf.generate_log_format(new_lf_config))
return
diff --git a/backend/provision/fastly_api.py b/backend/provision/fastly_api.py
index 07091b96..2621dcc4 100644
--- a/backend/provision/fastly_api.py
+++ b/backend/provision/fastly_api.py
@@ -82,6 +82,68 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
custom_edge = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "edge"]
custom_origin = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "origin"]
+ # "deliver" stage: capture from response headers in vcl_deliver and
+ # promote into req.http.x-fos-edge-data:* so the same log-format
+ # consumer that handles edge-stage fields picks them up. Used by the
+ # session-scoring integration to capture X-Edge-Score* response headers
+ # from the scorer Compute backend.
+ custom_deliver = [cf for cf in enabled_custom if cf.get("collection_stage", "edge") == "deliver"]
+
+ # Security: scrub internal-routing headers a client could spoof.
+ # The cluster-fetch / edge-data headers are set by THIS service's own
+ # snippets on the origin-bound bereq (vcl_miss / vcl_pass) and must
+ # never appear on an inbound req. Without this scrub, a client header
+ # like ``x-is-cluster-fetch: 1`` makes the conditional in vcl_deliver
+ # incorrectly classify the response as internal-cluster traffic and
+ # SKIP the "strip internal headers" cleanup — leaking origin-side
+ # metric headers (x-of-oip = origin backend IP, x-of-ttfb, etc.) to
+ # the client. Run BEFORE the edge-capture conditional so even
+ # configurations without any group-L / custom fields get the scrub.
+ # 020: Build scrub as a list so we can append per-custom-field
+ # unsets. ``unset req.http.x-fos-edge-data;`` strips the bare
+ # header but does NOT strip arbitrary subfield variants
+ # (``req.http.x-fos-edge-data:my_field``) on Fastly VCL — those
+ # are independent header slots once the colon-subfield syntax is
+ # in play. A client that knows a custom-field name (and they often
+ # leak through CSP, error pages, or just by being mentioned in
+ # public docs) can pre-set ``x-fos-edge-data:<name>`` and have
+ # the log line read the spoofed value instead of the edge-captured
+ # one. Per-name scrubs close the gap.
+ scrub_lines = [
+ "# [security] strip client-supplied internal-routing headers",
+ "if (req.restarts == 0 && fastly.ff.visits_this_service == 0) {",
+ " unset req.http.x-is-cluster-fetch;",
+ " unset req.http.x-fos-edge-data;",
+ " unset req.http.x-fos-origin-data;",
+ " unset req.http.x-of-start;",
+ " unset req.http.x-of-ttfb;",
+ " unset req.http.x-of-ttlb;",
+ " unset req.http.x-of-ost;",
+ " unset req.http.x-of-oip;",
+ " unset req.http.x-of-oretries;",
+ " unset req.http.x-of-status;",
+ " unset req.http.x-edge-req-id;",
+ " # Session-scoring internal markers. X-Edge-Scoring-Pass=1 from a",
+ " # client would bypass scoring entirely; x-edge-score* / X-Edge-Sid",
+ " # from a client could forge a clean score / sid that the deliver",
+ " # subfields propagate into the log line. Scrub them all at the",
+ " # client edge regardless of whether scoring is currently enabled.",
+ " unset req.http.X-Edge-Scoring-Pass;",
+ " unset req.http.x-edge-score;",
+ " unset req.http.X-Edge-Score;",
+ " unset req.http.X-Edge-Score-Reason;",
+ " unset req.http.X-Edge-Score-Enforce;",
+ " unset req.http.X-Edge-Sid;",
+ " unset req.http.X-Edge-Score-Set-Cookie;",
+ ]
+ if enabled_custom:
+ scrub_lines.append(" # --- Per-custom-field subfield scrubs (020) ---")
+ for cf in enabled_custom:
+ name = cf["name"]
+ scrub_lines.append(f" unset req.http.x-fos-edge-data:{name};")
+ scrub_lines.append(f" unset req.http.x-fos-origin-data:{name};")
+ scrub_lines.append("}")
+ edge_header_scrub = "\n".join(scrub_lines)
# recv: edge capture + optional group-L request ID + custom edge fields
if required or custom_edge:
@@ -108,9 +170,9 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
recv_lines.append(f" set req.http.x-fos-edge-data:{cf['name']} = {cf['vcl_log_expression']};")
recv_lines.append("}")
- recv_vcl = "\n".join(recv_lines)
+ recv_vcl = edge_header_scrub + "\n" + "\n".join(recv_lines)
else:
- recv_vcl = "# No edge data capture required for current log configuration."
+ recv_vcl = edge_header_scrub + "\n# No edge data capture required for current log configuration."
if group_l:
recv_vcl += (
@@ -123,10 +185,22 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
# miss and pass: unset edge headers + optional group-L timing
base_unset = "if (req.backend.is_origin) {\n unset bereq.http.x-fos-edge-data;\n}"
+ # Session-scoring services route the first-pass request to the scorer
+ # Compute backend via `return(pass)` in vcl_recv. That triggers the
+ # PASS subroutine for the scorer fetch, which would otherwise capture
+ # x-of-start AT THE SCORER FETCH TIME — polluting the eventual TTFB/
+ # TTLB numbers with scorer-leg latency. The X-Edge-Scoring-Pass=="1"
+ # marker (set by session_scoring_vcl.recv_snippet just before the
+ # `return(pass)`) is our discriminator. Non-scoring services never set
+ # this header, so the guard is always true and timing fires normally.
+ _scoring_guard_open = 'if (req.http.X-Edge-Scoring-Pass != "1") {\n'
+ _scoring_guard_close = "}\n"
+
if group_l:
miss_vcl = base_unset + (
"\n# [group-L] Record timing start for origin fetch\n"
- "set req.http.x-of-start = time.elapsed.usec;\n"
+ + _scoring_guard_open
+ + "set req.http.x-of-start = time.elapsed.usec;\n"
"unset bereq.http.x-of-start;\n"
'set bereq.http.x-is-cluster-fetch = "1";\n'
"if (req.http.x-edge-req-id) {\n"
@@ -134,11 +208,12 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
"} else if (req.http.x-req-id) {\n"
" set bereq.http.x-edge-req-id = req.http.x-req-id;\n"
"}\n"
- "unset bereq.http.x-req-id;"
+ "unset bereq.http.x-req-id;\n" + _scoring_guard_close
)
pass_vcl = base_unset + (
"\n# [group-L] Record timing start for PASS fetch\n"
- "set req.http.x-of-start = time.elapsed.usec;\n"
+ + _scoring_guard_open
+ + "set req.http.x-of-start = time.elapsed.usec;\n"
"unset bereq.http.x-of-start;\n"
'set bereq.http.x-is-cluster-fetch = "1";\n'
"if (req.http.x-edge-req-id) {\n"
@@ -146,7 +221,7 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
"} else if (req.http.x-req-id) {\n"
" set bereq.http.x-edge-req-id = req.http.x-req-id;\n"
"}\n"
- "unset bereq.http.x-req-id;"
+ "unset bereq.http.x-req-id;\n" + _scoring_guard_close
)
else:
miss_vcl = base_unset
@@ -159,7 +234,9 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
if group_l:
fetch_lines.append(
"# [group-L] Record TTFB and capture origin metadata\n"
- 'if (req.http.x-of-start != "") {\n'
+ # Skip the scoring sub-fetch — we want TTFB for the real
+ # origin, not the scorer Compute backend.
+ 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n'
" declare local var.ttfb INTEGER;\n"
" set var.ttfb = std.atoi(time.elapsed.usec);\n"
" set var.ttfb -= std.atoi(req.http.x-of-start);\n"
@@ -182,7 +259,10 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
if group_l:
error_lines.append(
"# [group-L] Capture timing for failed origin fetches\n"
- 'if (req.http.x-of-start != "") {\n'
+ # Skip the scoring sub-fetch — a scorer error is fail-open
+ # handled by our session-scoring snippet and shouldn't
+ # pollute the customer's origin-error telemetry.
+ 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n'
" declare local var.ttfb INTEGER;\n"
" set var.ttfb = std.atoi(time.elapsed.usec);\n"
" set var.ttfb -= std.atoi(req.http.x-of-start);\n"
@@ -194,12 +274,14 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
)
snippets["error"] = "\n".join(error_lines)
- if group_l or custom_origin:
+ if group_l or custom_origin or custom_deliver:
deliver_lines = []
if group_l:
deliver_lines.append(
"# [group-L] Record TTLB, capture bytes, strip all internal headers\n"
- 'if (req.http.x-of-start != "") {\n'
+ # Skip scoring sub-fetch — don't capture scorer-leg TTLB
+ # into the real-request's telemetry.
+ 'if (req.http.X-Edge-Scoring-Pass != "1" && req.http.x-of-start != "") {\n'
" declare local var.ttlb INTEGER;\n"
" set var.ttlb = std.atoi(time.elapsed.usec);\n"
" set var.ttlb -= std.atoi(req.http.x-of-start);\n"
@@ -257,6 +339,20 @@ def generate_capture_vcl(log_fields_config: dict) -> dict[str, str]:
deliver_lines.append(f" unset resp.http.x-fos-origin-data:{name};")
deliver_lines.append("}")
+ if custom_deliver:
+ # Deliver-stage fields read from the RESPONSE headers
+ # (e.g. resp.http.X-Edge-Score after a Compute scorer sub-fetch
+ # returned). The expression in vcl_log_expression points at the
+ # ``req.http.*`` slot the upstream snippet copied it into — same
+ # final namespace as edge fields, just captured a stage later in
+ # the request lifecycle.
+ deliver_lines.append("# --- Custom Deliver Fields ---")
+ for cf in custom_deliver:
+ name = cf["name"]
+ deliver_lines.append(f'if ({cf["vcl_log_expression"]} != "") {{')
+ deliver_lines.append(f" set req.http.x-fos-edge-data:{name} = {cf['vcl_log_expression']};")
+ deliver_lines.append("}")
+
snippets["deliver"] = "\n".join(deliver_lines)
return snippets
@@ -303,6 +399,55 @@ def validate_log_format(log_fields_config: dict = None) -> list[str]:
return _validate_log_format_regex(raw)
+def install_capture_snippets(
+ service_id: str,
+ version: int,
+ log_fields_config: dict | None,
+ token: str,
+) -> None:
+ """Install the auto-generated "Fastly Log Analysis *" capture VCL
+ snippets on the given draft version. Idempotent via
+ ``ensure_vcl_snippet``'s content/priority diff.
+
+ Mapping table here is the single source of truth for which subroutine
+ each capture phase targets and at what priority. Both the
+ full-provisioning path (`ensure_logging_endpoint`) and the
+ session-scoring orchestrator (which installs onto an existing service
+ that already has a logging endpoint) call into this helper.
+
+ Note on the Origin Error snippet: a prior copy of this logic in
+ ``session_scoring_orchestrator.enable_scoring`` omitted the error
+ snippet install, so a service first provisioned via the orchestrator
+ silently lacked failed-origin TTFB capture. This helper closes that
+ drift by installing all phases via one loop.
+ """
+ snippets = generate_capture_vcl(log_fields_config)
+ # (snippet_name, subroutine_type, priority, required)
+ # 'required' phases ("recv", "miss", "pass") are always generated.
+ # Optional phases ("fetch", "deliver", "error") are only generated for
+ # some configurations (e.g. group L enabled) — guarded by `in snippets`.
+ install_plan = (
+ ("Fastly Log Analysis Capture", "recv", 1, True),
+ ("Fastly Log Analysis Miss", "miss", 100, True),
+ ("Fastly Log Analysis Pass", "pass", 100, True),
+ ("Fastly Log Analysis Origin Fetch", "fetch", 100, False),
+ ("Fastly Log Analysis Origin Deliver", "deliver", 100, False),
+ ("Fastly Log Analysis Origin Error", "error", 100, False),
+ )
+ for snip_name, kind, priority, required in install_plan:
+ # Optional phases missing from the generated set are skipped, not deleted.
+ if not required and kind not in snippets:
+ continue
+ ensure_vcl_snippet(
+ snip_name,
+ kind,
+ snippets[kind],
+ priority,
+ service_id,
+ version,
+ token,
+ )
+
+
def _validate_log_format_regex(raw: str) -> list[str]:
"""Regex-based fallback log format checks."""
errors = []
@@ -698,26 +843,7 @@ def ensure_logging_endpoint(cfg: dict, fos_access_key: str, fos_secret_key: str,
if status_cb:
status_cb("⏳ Deploying VCL snippets to capture edge values...")
- vcl_snippets = generate_capture_vcl(cfg.get("log_fields"))
- ensure_vcl_snippet("Fastly Log Analysis Capture", "recv", vcl_snippets["recv"], 1, service_id, new_ver, token)
- ensure_vcl_snippet("Fastly Log Analysis Miss", "miss", vcl_snippets["miss"], 100, service_id, new_ver, token)
- ensure_vcl_snippet("Fastly Log Analysis Pass", "pass", vcl_snippets["pass"], 100, service_id, new_ver, token)
- if "fetch" in vcl_snippets:
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Fetch", "fetch", vcl_snippets["fetch"], 100, service_id, new_ver, token
- )
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Error", "error", vcl_snippets["error"], 100, service_id, new_ver, token
- )
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Deliver",
- "deliver",
- vcl_snippets["deliver"],
- 100,
- service_id,
- new_ver,
- token,
- )
+ install_capture_snippets(service_id, new_ver, cfg.get("log_fields"), token)
ok("Logging endpoint and VCL snippets added to draft")
@@ -991,27 +1117,8 @@ def update_logging_endpoint(cfg: dict, token: str):
yield {"type": "progress", "current": 3, "total": total_steps}
- vcl_snippets = generate_capture_vcl(lf_config)
- ensure_vcl_snippet("Fastly Log Analysis Capture", "recv", vcl_snippets["recv"], 1, service_id, new_ver, token)
- ensure_vcl_snippet("Fastly Log Analysis Miss", "miss", vcl_snippets["miss"], 100, service_id, new_ver, token)
- ensure_vcl_snippet("Fastly Log Analysis Pass", "pass", vcl_snippets["pass"], 100, service_id, new_ver, token)
- if "fetch" in vcl_snippets:
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Fetch", "fetch", vcl_snippets["fetch"], 100, service_id, new_ver, token
- )
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Error", "error", vcl_snippets["error"], 100, service_id, new_ver, token
- )
- ensure_vcl_snippet(
- "Fastly Log Analysis Origin Deliver",
- "deliver",
- vcl_snippets["deliver"],
- 100,
- service_id,
- new_ver,
- token,
- )
- else:
+ install_capture_snippets(service_id, new_ver, lf_config, token)
+ if "fetch" not in generate_capture_vcl(lf_config):
for snip in [
"Fastly Log Analysis Origin Fetch",
"Fastly Log Analysis Origin Error",
diff --git a/backend/provision/orchestrator.py b/backend/provision/orchestrator.py
index a3993699..b9aad6d6 100644
--- a/backend/provision/orchestrator.py
+++ b/backend/provision/orchestrator.py
@@ -1,10 +1,13 @@
import json
+import logging
import os
import queue
import shutil
import threading
import time
+logger = logging.getLogger(__name__)
+
from backend.core import log_fields as lf
from backend.core.fastly.client import fastly
from backend.core.fastly.utils import (
@@ -37,18 +40,58 @@ def _sync_crontab():
def write_service_config(state: dict):
- """Write a service config JSON file to configs/{service_id}.json."""
+ """Write a service config JSON file to configs/{service_id}.json.
+
+ PRESERVE-ON-RE-RUN: this function is called from /api/provision/ingest
+ (analyst-join, wizard re-run, key rotation). The ``state`` dict is the
+ request body — it has no awareness of code-managed keys that
+ ``enable_scoring`` / ``ngwaf_workspace_id`` PATCH / log_fields PATCH
+ may have injected into the existing config. Without preserving those
+ keys, re-running the wizard silently strips ``cfg["scoring"]``,
+ ``cfg["log_fields"]["custom_fields"]``, and ``cfg["ngwaf_workspace_id"]``
+ — same bug class as the 2026-06-02 state_sync incident, just with the
+ request body as the stale-overwriter instead of FOS admin_state.json.
+ """
from backend import config as svcconfig
service_id = state.get("logging_service_id") or state.get("service_id")
db_path = svcconfig.duckdb_path(service_id)
+ # Snapshot the existing on-disk cfg so we can preserve code-managed
+ # keys that the request body doesn't carry. None on first-ever ingest
+ # (which is fine — there's nothing to preserve).
+ existing_cfg = svcconfig.load_config(service_id) or {}
+
fos_key = state.get("fos_access_key_id") or state.get("fos_access_key", "")
fos_secret = state.get("fos_secret_access_key") or state.get("fos_secret_key", "")
bucket = state.get("fos_bucket") or state.get("fos_bucket_name", "")
region = state.get("fos_region", "us-east-1")
cdn_url = state.get("cdn_url", "")
+ # Build log_fields: prefer the request body, but if the request body
+ # omits custom_fields (or sends an empty list) AND we have existing
+ # custom_fields on disk, preserve them. Then if scoring is enabled,
+ # re-inject the canonical _SCORING_CUSTOM_FIELDS from code.
+ incoming_lf = dict(state.get("log_fields") or {})
+ incoming_custom = incoming_lf.get("custom_fields")
+ existing_custom = list((existing_cfg.get("log_fields") or {}).get("custom_fields") or [])
+ if not incoming_custom and existing_custom:
+ incoming_lf["custom_fields"] = existing_custom
+ # Re-inject scoring fields from code when scoring is enabled in either
+ # the incoming state OR the existing cfg (the wizard re-run rarely
+ # carries scoring in the body).
+ scoring_block = state.get("scoring") or existing_cfg.get("scoring") or {}
+ if scoring_block.get("enabled"):
+ from backend.provision.session_scoring_orchestrator import (
+ _SCORING_CUSTOM_FIELDS,
+ _SCORING_FIELD_NAMES,
+ )
+
+ current_custom = list(incoming_lf.get("custom_fields") or [])
+ current_custom = [cf for cf in current_custom if cf.get("name") not in _SCORING_FIELD_NAMES]
+ current_custom.extend(dict(cf) for cf in _SCORING_CUSTOM_FIELDS)
+ incoming_lf["custom_fields"] = current_custom
+
cfg = {
"service_id": service_id,
"name": state.get("name") or state.get("service_name") or service_id,
@@ -66,9 +109,19 @@ def write_service_config(state: dict):
"fastly_api_key": state.get("fastly_api_key") or state.get("admin_token", ""),
"log_retention_days": int(state.get("log_retention_days", 30)),
"duckdb_path": db_path,
- "log_fields": state.get("log_fields", {}),
+ "log_fields": incoming_lf,
}
+ # Preserve code-managed top-level keys that the request body doesn't
+ # carry — ``scoring`` (set by enable_scoring) and
+ # ``ngwaf_workspace_id`` (set by the NGWAF-config PATCH). Only the keys
+ # listed in this loop are carried over here; a value in ``state`` wins.
+ for preserved_key in ("scoring", "ngwaf_workspace_id"):
+ if preserved_key not in state and preserved_key in existing_cfg:
+ cfg[preserved_key] = existing_cfg[preserved_key]
+ elif preserved_key in state:
+ cfg[preserved_key] = state[preserved_key]
+
if "log_period" in state:
cfg["log_period"] = state["log_period"]
elif "log_period" in state.get("provisioning", {}):
@@ -457,12 +510,38 @@ def cleanup_local_data(service_id: str, bucket: str = None, remove_data: bool =
pass
if bucket:
- # Look for cache dir in both common locations
- for base in [os.getcwd(), os.path.join(os.path.dirname(__file__), "..", "..")]:
- svc_cache_dir = os.path.join(base, "cache", bucket)
- if os.path.exists(svc_cache_dir):
- shutil.rmtree(svc_cache_dir)
- ok(f"Removed local cache: {svc_cache_dir}")
+ # Security: ``bucket`` is supplied via the provisioning
+ # API and historically had no path-shape validation. A payload
+ # like ``../../../tmp/anything`` would compose with
+ # os.path.join to produce a path outside the cache root and
+ # shutil.rmtree would happily wipe whatever lived there.
+ # Reject any separator/traversal token up front, then
+ # additionally verify the resolved path is a STRICT descendant
+ # of the resolved cache root (defense in depth — catches
+ # symlink escapes, and names such as ``.`` that contain no
+ # rejected token yet resolve to the cache root itself, which
+ # would otherwise delete every service's cache).
+ if any(c in bucket for c in ("/", "\\", "..", "\x00")):
+     logger.warning("[teardown] refusing to remove cache for bucket=%r with path-shape characters", bucket)
+ else:
+     for base in [os.getcwd(), os.path.join(os.path.dirname(__file__), "..", "..")]:
+         cache_root = os.path.realpath(os.path.join(base, "cache"))
+         svc_cache_dir = os.path.realpath(os.path.join(cache_root, bucket))
+         # A bucket that resolves to the cache root itself (e.g. ".")
+         # passes the commonpath test below, so reject it explicitly.
+         if svc_cache_dir == cache_root:
+             logger.warning(
+                 "[teardown] refusing to remove cache: bucket=%r resolves to the cache root %s",
+                 bucket,
+                 cache_root,
+             )
+             continue
+         # Reject anything that resolved outside the cache root —
+         # belt-and-suspenders for symlinks pointing elsewhere.
+         try:
+             common = os.path.commonpath([cache_root, svc_cache_dir])
+         except ValueError:
+             # Raised e.g. for paths on different drives; treat as "not ours".
+             continue
+         if common != cache_root:
+             logger.warning(
+                 "[teardown] refusing to remove cache: resolved path %s escapes %s",
+                 svc_cache_dir,
+                 cache_root,
+             )
+             continue
+         if os.path.exists(svc_cache_dir):
+             shutil.rmtree(svc_cache_dir)
+             ok(f"Removed local cache: {svc_cache_dir}")
_sync_crontab()
@@ -476,6 +555,15 @@ def generate_analyst_invite(service_id: str) -> dict:
if cfg.get("access_level") != "read_write":
raise RuntimeError("Invite generation requires a read_write service configuration")
api_token = cfg.get("fastly_api_key", "").strip()
+ # Fail fast when the stored token is missing. Without this, the Fastly
+ # API call below would go out with token="" and either time out or
+ # return an error envelope; either way the downstream key["access_key"]
+ # would raise an unhelpful KeyError instead of a clean 400-style message.
+ # Caller (route handler) wraps RuntimeError → HTTPException(400).
+ if not api_token:
+ raise RuntimeError(
+ f"Service {service_id} has no stored fastly_api_key. Rotate the credential before generating a viewer key."
+ )
bucket = cfg.get("fos_bucket", "")
region = cfg.get("fos_region", "us-east-1")
key = fastly(
@@ -488,6 +576,13 @@ def generate_analyst_invite(service_id: str) -> dict:
},
token=api_token,
)
+ # Defensive: a malformed Fastly response shouldn't bubble up as a raw
+ # KeyError on access_key / secret_key — surface a clear error instead.
+ if not isinstance(key, dict) or "access_key" not in key or "secret_key" not in key:
+ raise RuntimeError(
+ f"Fastly access-key API returned unexpected shape (keys={list(key.keys()) if isinstance(key, dict) else type(key).__name__}); "
+ "cannot generate analyst invite."
+ )
iceberg_metadata_location = None
try:
diff --git a/backend/provision/session_scoring_orchestrator.py b/backend/provision/session_scoring_orchestrator.py
new file mode 100644
index 00000000..340847a3
--- /dev/null
+++ b/backend/provision/session_scoring_orchestrator.py
@@ -0,0 +1,1064 @@
+"""End-to-end ``enable_scoring`` / ``disable_scoring`` for a single
+customer's logging service.
+
+This is the user-facing "turn on session scoring" flow. It composes the
+existing primitives:
+
+ - ``ensure_scoring_service`` / ``delete_scoring_service`` (Compute
+ service + ConfigStores + AES key + resource links, in
+ backend/provision/session_scoring_setup.py)
+ - ``scripts/scoring/deploy_wasm.sh`` (build + push the Wasm)
+ - ``ensure_vcl_snippet`` + ``ensure_condition`` (Fastly idempotent
+ helpers from backend/core/fastly/service.py)
+ - ``update_logging_endpoint`` (regenerate log format + push, from
+ backend/provision/fastly_api.py)
+
+The VCL mutation follows the same proven pattern as
+``ensure_logging_endpoint`` ([backend/provision/fastly_api.py:636](backend/provision/fastly_api.py#L636)):
+ get_active → clone → mutate draft → validate → activate
+ → on any exception, re-activate the prior version (leave the draft
+ dangling for debug) and re-raise.
+"""
+
+from __future__ import annotations
+
+import datetime as _dt
+import logging
+import subprocess
+import urllib.parse
+from pathlib import Path
+from typing import Any
+
+from backend import config as svcconfig
+from backend.core.fastly.client import fastly
+from backend.core.fastly.service import (
+ ensure_vcl_snippet,
+ get_active_version,
+ list_vcl_snippets,
+)
+from backend.provision.session_scoring_setup import (
+ delete_scoring_service,
+ ensure_scoring_service,
+)
+from backend.provision.session_scoring_vcl import (
+ SCORING_BACKEND_API_NAME,
+ SCORING_DELIVER_NAME,
+ SCORING_ENFORCE_NAME,
+ SCORING_FETCH_NAME,
+ SCORING_FETCH_PRIORITY,
+ SCORING_MISS_NAME,
+ SCORING_PASS_NAME,
+ SCORING_RECV_NAME,
+ SCORING_SNIPPET_PRIORITY,
+ generate_scoring_vcl,
+ scoring_snippet_names,
+)
+from backend.provision.utils import BOLD, _c, fail, info, ok, warn
+
+logger = logging.getLogger(__name__)
+
+# Locations of the matrix files relative to repo root.
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+_MATRIX_PATH = _REPO_ROOT / "compute" / "scorer" / "matrix.json"
+_DEPLOY_WASM_SCRIPT = _REPO_ROOT / "scripts" / "scoring" / "deploy_wasm.sh"
+
+# Custom-field definitions the orchestrator adds/removes when enabling/
+# disabling scoring. Kept as a single source of truth so disable_scoring
+# can find them by name to undo cleanly.
+# NOTE(review): the vcl_log_expression values below read
+# req.http.x-edge-score:<subfield>. This comment was originally written
+# about req.http.x-fos-edge-data:edge_* subfields (NOT the source
+# req.http.x-edge-data:* subfields) — confirm which header the recv
+# snippet populates and reconcile the comment with the table.
+# Stated rationale: subfield writes in vcl_recv propagate to the log
+# emitter; writes anywhere else don't. Our session_scoring recv snippet
+# (pass 2) copies x-edge-data:* into x-fos-edge-data:edge_* exactly so
+# this log format can read them.
+# collection_stage="deliver" is kept so the field shows up in the right
+# tab in the UI; the value is actually populated in recv pass 2 via the
+# manual promotion in session_scoring_vcl.recv_snippet.
+_SCORING_CUSTOM_FIELDS: list[dict[str, Any]] = [
+ {
+ "name": "edge_score",
+ "label": "Edge Score",
+ "description": "Combined session-anomaly score (0–100, quantized to nearest 5) from the edge scorer.",
+ "vcl_log_expression": "req.http.x-edge-score:score",
+ "collection_stage": "deliver",
+ "duckdb_type": "INTEGER",
+ "value_type": "numeric",
+ "bytes_estimate": 4,
+ "enabled": True,
+ },
+ {
+ "name": "edge_score_l1",
+ "label": "Edge Score (Layer 1)",
+ "description": "Layer-1 (universal behavioral) score contribution.",
+ "vcl_log_expression": "req.http.x-edge-score:l1",
+ "collection_stage": "deliver",
+ "duckdb_type": "INTEGER",
+ "value_type": "numeric",
+ "bytes_estimate": 4,
+ "enabled": True,
+ },
+ {
+ "name": "edge_score_l2",
+ "label": "Edge Score (Layer 2)",
+ "description": "Layer-2 (route transition) score contribution.",
+ "vcl_log_expression": "req.http.x-edge-score:l2",
+ "collection_stage": "deliver",
+ "duckdb_type": "INTEGER",
+ "value_type": "numeric",
+ "bytes_estimate": 4,
+ "enabled": True,
+ },
+ {
+ "name": "edge_cookie_compliance",
+ "label": "Cookie Compliance",
+ "description": "ok | missing | tampered | unknown.",
+ "vcl_log_expression": "req.http.x-edge-score:compliance",
+ "collection_stage": "deliver",
+ "duckdb_type": "VARCHAR",
+ "value_type": "string",
+ "bytes_estimate": 10,
+ "enabled": True,
+ },
+ {
+ "name": "edge_score_reason",
+ "label": "Score Reason",
+ "description": "Comma-separated list of fired scoring rules.",
+ "vcl_log_expression": "req.http.x-edge-score:reason",
+ "collection_stage": "deliver",
+ "duckdb_type": "VARCHAR",
+ "value_type": "string",
+ "bytes_estimate": 60,
+ "enabled": True,
+ },
+ {
+ "name": "edge_sid",
+ "label": "Session ID",
+ "description": (
+ "12-hex-char rotating session id from the edge scorer cookie. "
+ "Empty when the inbound request had no valid cookie. Used as "
+ "the key for admin session labels (good / bad / neutral)."
+ ),
+ "vcl_log_expression": "req.http.x-edge-score:sid",
+ "collection_stage": "deliver",
+ "duckdb_type": "VARCHAR",
+ "value_type": "string",
+ "bytes_estimate": 12,
+ "enabled": True,
+ },
+]
+_SCORING_FIELD_NAMES = {cf["name"] for cf in _SCORING_CUSTOM_FIELDS}
+
+
+def _deploy_wasm(scoring_service_id: str, token: str, status_cb=None) -> None:
+    """Invoke scripts/scoring/deploy_wasm.sh as a subprocess.
+
+    If the trained matrix exists (`compute/scorer/matrix.json` with
+    vocab_size > 0) it is passed via ``--matrix``; otherwise the script is
+    run without it and deploys with its default (empty) matrix, in which
+    case L2 self-disables. The script's `trap EXIT` restores the default
+    placeholder afterward so the working tree stays clean.
+
+    Args:
+        scoring_service_id: Fastly service id of the scoring Compute service.
+        token: Fastly API token, forwarded to the script via ``--token``.
+        status_cb: Optional callable that receives a progress message.
+
+    Raises:
+        RuntimeError: If the deploy script is missing or exits non-zero.
+            The script's stdout/stderr are included with the token redacted.
+    """
+    info("Building + deploying Wasm to the scoring Compute service")
+    if status_cb:
+        status_cb("⏳ Building + deploying Wasm to the scoring service...")
+
+    if not _DEPLOY_WASM_SCRIPT.exists():
+        raise RuntimeError(f"deploy script not found at {_DEPLOY_WASM_SCRIPT}")
+
+    cmd = [
+        str(_DEPLOY_WASM_SCRIPT),
+        "--service-id",
+        scoring_service_id,
+        "--token",
+        token,
+    ]
+    # Only pass --matrix if a trained one exists; otherwise the script
+    # rebuilds with whatever is in matrix.default.json (the tracked empty
+    # default). We pre-check vocab_size so a malformed or empty matrix
+    # sitting in the path never reaches the script's own vocab_size check.
+    if _MATRIX_PATH.exists():
+        import json as _json
+
+        try:
+            # JSON is UTF-8; don't depend on the platform's locale encoding.
+            with _MATRIX_PATH.open(encoding="utf-8") as f:
+                m = _json.load(f)
+            if m.get("vocab_size", 0) > 0:
+                cmd.extend(["--matrix", str(_MATRIX_PATH)])
+                info(f" using trained matrix (vocab_size={m['vocab_size']}, version={m.get('version')})")
+            else:
+                info(" trained matrix is empty; deploying with default-empty (L2 disabled)")
+        except Exception:
+            # Deliberately broad: any unreadable / mis-shaped matrix falls
+            # back to the default rather than blocking the deploy.
+            warn(" matrix.json present but unreadable; falling back to default-empty")
+
+    proc = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        cwd=str(_REPO_ROOT),
+    )
+    if proc.returncode != 0:
+
+        def _redact(text: str) -> str:
+            # The token travels in argv, so a script that echoes its command
+            # line would otherwise leak it into the error message. Guard the
+            # empty token: str.replace("", ...) would corrupt the text.
+            return text.replace(token, "***") if token else text
+
+        # Surface the script's output so the operator can see what failed.
+        raise RuntimeError(
+            f"deploy_wasm.sh failed (exit {proc.returncode}):\n"
+            f"--- stdout ---\n{_redact(proc.stdout)}\n--- stderr ---\n{_redact(proc.stderr)}"
+        )
+    ok("Wasm deployed to scoring service")
+
+
+def _add_scoring_backend(
+    logging_service_id: str,
+    version: int,
+    scoring_domain: str,
+    token: str,
+) -> None:
+    """Create or refresh the scoring backend on one service version.
+
+    The backend is registered under ``SCORING_BACKEND_API_NAME`` so the
+    recv snippet can reference it by name. If a backend with that name is
+    already on the version it is PUT-updated when any setting it reports
+    differs from the desired one, and left alone otherwise; if none is
+    found (or the listing call fails) a new one is POSTed.
+    """
+    backends_path = f"/service/{logging_service_id}/version/{version}/backend"
+    desired = {
+        "name": SCORING_BACKEND_API_NAME,
+        "address": scoring_domain,
+        "port": 443,
+        "use_ssl": True,
+        "ssl_cert_hostname": scoring_domain,
+        "ssl_sni_hostname": scoring_domain,
+        # Compute routes by Host header. Left alone, the upstream Host
+        # would be the customer's domain (e.g. www.example.com), which
+        # the scorer's edgecompute.app service can't dispatch even though
+        # TLS SNI matches — so pin Host to the scoring domain.
+        "override_host": scoring_domain,
+        # Author's rationale: the edgecompute.app cert comes from Fastly's
+        # internal CA and may not validate cleanly service-to-service, and
+        # the hop stays inside Fastly's network, so strict verification is
+        # traded for reliability.
+        "ssl_check_cert": False,
+        "auto_loadbalance": False,
+        # Deliberately tight budget: Wasm execution is ~600µs and the
+        # intra-Fastly hop ~5-20ms warm, so 50ms is ~2.5x a typical
+        # round-trip. Cold-start instances fail open at this budget —
+        # accepted over holding real users. Raise these if the fail-open
+        # rate climbs once per-POP latency distributions are known.
+        "connect_timeout": 50,
+        "first_byte_timeout": 50,
+        "between_bytes_timeout": 50,
+    }
+
+    # Look for a backend that already carries our name on this version.
+    current = None
+    try:
+        listing = fastly("GET", backends_path, token=token) or []
+        current = next(
+            (backend for backend in listing if backend.get("name") == SCORING_BACKEND_API_NAME),
+            None,
+        )
+    except RuntimeError:
+        # Listing failed: fall through and attempt the POST.
+        pass
+
+    if current is None:
+        fastly("POST", backends_path, desired, token=token)
+        ok(f"Added scoring backend {SCORING_BACKEND_API_NAME} ({scoring_domain})")
+        return
+
+    # Only settings the API actually reported take part in the comparison.
+    drifted_keys = [key for key, wanted in desired.items() if key in current and current[key] != wanted]
+    if not drifted_keys:
+        ok(f"Scoring backend already current on version {version}")
+        return
+
+    # Without this PUT, re-running enable on a version that already has
+    # the backend would silently keep stale settings (e.g. old timeouts).
+    quoted_name = urllib.parse.quote(SCORING_BACKEND_API_NAME, safe="")
+    fastly("PUT", f"{backends_path}/{quoted_name}", desired, token=token)
+    ok(f"Updated scoring backend {SCORING_BACKEND_API_NAME} (drifted settings)")
+
+
+def _remove_scoring_backend(logging_service_id: str, version: int, token: str) -> None:
+    """Delete the scoring backend from the given service version.
+
+    A ``RuntimeError`` whose text contains "404" is treated as "already
+    gone"; any other ``RuntimeError`` propagates to the caller.
+    """
+    quoted_name = urllib.parse.quote(SCORING_BACKEND_API_NAME, safe="")
+    backend_path = f"/service/{logging_service_id}/version/{version}/backend/{quoted_name}"
+    try:
+        fastly("DELETE", backend_path, token=token, expect_empty=True)
+        ok(f"Removed scoring backend {SCORING_BACKEND_API_NAME}")
+    except RuntimeError as exc:
+        if "404" not in str(exc):
+            raise
+        ok("Scoring backend already absent")
+
+
+def _remove_scoring_snippets(logging_service_id: str, version: int, token: str) -> None:
+    """Delete every scoring snippet that is present on the given version.
+
+    The names come from ``scoring_snippet_names()``; names not installed
+    on the version are skipped, and a "404" on delete is ignored, so the
+    call can be repeated safely. Other ``RuntimeError``s propagate.
+    """
+    installed = set(list_vcl_snippets(logging_service_id, version, token))
+    snippet_base = f"/service/{logging_service_id}/version/{version}/snippet"
+    for snippet in scoring_snippet_names():
+        if snippet in installed:
+            quoted = urllib.parse.quote(snippet, safe="")
+            try:
+                fastly("DELETE", f"{snippet_base}/{quoted}", token=token, expect_empty=True)
+                ok(f"Removed snippet {snippet}")
+            except RuntimeError as exc:
+                # Someone else removed it between the listing and now.
+                if "404" not in str(exc):
+                    raise
+
+
+def _add_scoring_custom_fields(cfg: dict) -> dict:
+    """Install the scoring custom_fields into ``cfg["log_fields"]``.
+
+    Entries already in ``custom_fields`` whose name matches a scoring
+    field are dropped and fresh copies of ``_SCORING_CUSTOM_FIELDS`` are
+    appended after the remaining ones, so re-runs pick up any tuning of
+    the canonical definitions. ``cfg`` is mutated and also returned.
+    """
+    log_fields = cfg.setdefault("log_fields", {})
+    merged = [
+        field for field in log_fields.setdefault("custom_fields", []) if field.get("name") not in _SCORING_FIELD_NAMES
+    ]
+    # Copy each definition so later edits to cfg never touch the constant.
+    merged.extend(dict(field) for field in _SCORING_CUSTOM_FIELDS)
+    log_fields["custom_fields"] = merged
+    return cfg
+
+
+def _remove_scoring_custom_fields(cfg: dict) -> dict:
+    """Drop the scoring custom_fields from ``cfg`` and return it.
+
+    Fields with any other name are kept in their original order. A config
+    without ``log_fields`` / ``custom_fields`` is returned unchanged.
+    """
+    if "log_fields" in cfg and "custom_fields" in cfg["log_fields"]:
+        log_fields = cfg["log_fields"]
+        log_fields["custom_fields"] = [
+            field for field in log_fields["custom_fields"] if field.get("name") not in _SCORING_FIELD_NAMES
+        ]
+    return cfg
+
+
+def _revert_scoring_config(logging_service_id: str, cfg: dict, prior_scoring: dict) -> None:
+    """Put the on-disk scoring state back to what it was before enable_scoring.
+
+    The config is re-loaded rather than trusting the in-memory ``cfg``:
+    the Fastly stages can take a while, and a concurrent writer may have
+    changed unrelated keys in the meantime. Only scoring-related keys are
+    touched. ``cfg`` is the fallback when the re-load fails or is empty.
+
+    If a scoring block existed before the call it is restored verbatim
+    (keeping its ``request_secret`` and operator overrides); otherwise the
+    block is removed. The scoring custom_fields are stripped only when
+    scoring was not enabled beforehand.
+    """
+    try:
+        fresh = svcconfig.load_config(logging_service_id) or cfg
+    except Exception:
+        fresh = cfg
+    if prior_scoring:
+        fresh["scoring"] = prior_scoring
+    else:
+        fresh.pop("scoring", None)
+    if not prior_scoring.get("enabled"):
+        _remove_scoring_custom_fields(fresh)
+    svcconfig.save_config(logging_service_id, fresh)
+
+
+def enable_scoring(
+    logging_service_id: str,
+    token: str,
+    *,
+    status_cb=None,
+) -> dict[str, Any]:
+    """Provision (or reuse) the Compute scoring service, deploy the Wasm,
+    then mutate the customer's VCL service to call it via the restart
+    pattern.
+
+    Safe to re-run, but a re-run is not a no-op: it always redeploys the
+    Wasm and clones + activates a new version of the logging service.
+
+    On any failure after the service config has been written, the
+    previously active version is re-activated (best effort), the on-disk
+    config is put back to its pre-call scoring state, and the original
+    exception is re-raised. The draft version is left in place.
+
+    Args:
+        logging_service_id: Fastly service id of the customer's logging
+            (VCL) service; also the key of the local service config.
+        token: Fastly API token used for every API call and the deploy.
+        status_cb: Optional callable that receives progress messages.
+
+    Returns:
+        The dict from ``ensure_scoring_service`` plus
+        ``logging_service_active_version``:
+        {
+            "scoring_service_id": "...",
+            "scoring_service_name": "Session Scoring Service for {id}",
+            "scoring_domain": "fos-...-session-scorer.edgecompute.app",
+            "scoring_keys_store_id": "...",
+            "scoring_config_store_id": "...",
+            "aes_key_hex": "..." (only on first creation),
+            "logging_service_active_version": int (post-activate),
+        }
+
+    Raises:
+        RuntimeError: No config for the service, no active version, no
+            keys store to heal the request secret into, a failed Wasm
+            deploy, a failed validation, or a Fastly API error.
+    """
+    cfg = svcconfig.load_config(logging_service_id)
+    if not cfg:
+        raise RuntimeError(f"No config found for logging service {logging_service_id}")
+
+    # ── Stage 1: Compute scoring service + AES key + ConfigStores. ──────────
+    info(f"Enabling session scoring for {_c(BOLD, logging_service_id)}")
+    if status_cb:
+        status_cb(f"⏳ Enabling session scoring for {logging_service_id}...")
+
+    # On a re-run we lose `aes_key_hex` and `request_secret` from
+    # ensure_scoring_service (they're write-only in the ConfigStore).
+    # Preserve whatever the prior provision stashed in cfg so VCL
+    # generation still has the secret available. If neither has one
+    # (e.g. the scoring service was provisioned before the secret
+    # feature existed), generate a fresh one and write it into the
+    # ConfigStore so this enable is self-healing.
+    # This snapshot is also what the rollback restores: cfg["scoring"] is
+    # REPLACED (not mutated) below, so this dict stays untouched.
+    prior_scoring = cfg.get("scoring") or {}
+
+    scoring_meta = ensure_scoring_service(logging_service_id, token, status_cb=status_cb)
+    scoring_service_id = scoring_meta["scoring_service_id"]
+    scoring_domain = scoring_meta["scoring_domain"]
+    request_secret = scoring_meta.get("request_secret") or prior_scoring.get("request_secret") or ""
+    if not request_secret:
+        import secrets as _secrets
+
+        request_secret = _secrets.token_hex(32)
+        keys_store_id = scoring_meta.get("scoring_keys_store_id") or prior_scoring.get("scoring_keys_store_id")
+        if not keys_store_id:
+            raise RuntimeError("Cannot heal missing request_secret: no scoring_keys_store_id available.")
+        # Upsert the secret. POST returns 409 if it already exists; in
+        # that case PATCH instead. We try POST first because the common
+        # case here is "no entry exists yet".
+        try:
+            fastly(
+                "POST",
+                f"/resources/stores/config/{keys_store_id}/item",
+                {"item_key": "request_secret", "item_value": request_secret},
+                token=token,
+            )
+        except RuntimeError:
+            fastly(
+                "PATCH",
+                f"/resources/stores/config/{keys_store_id}/item/request_secret",
+                {"item_value": request_secret},
+                token=token,
+            )
+        info("Healed missing request_secret in scoring_keys store")
+
+    # ── Stage 2: build + deploy Wasm. ───────────────────────────────────────
+    _deploy_wasm(scoring_service_id, token, status_cb=status_cb)
+
+    # ── Stage 3: write scoring metadata into the LOGGING service config. ────
+    # Preserve operator-tunable overrides across re-enables: the block is
+    # replaced wholesale, so exclude_url_regex and enforce_status_code are
+    # pulled off the pre-existing block first.
+    from backend.provision.session_scoring_vcl import DEFAULT_ASSET_EXT_REGEX
+
+    cfg["scoring"] = {
+        "enabled": True,
+        "scoring_service_id": scoring_service_id,
+        "scoring_service_name": scoring_meta["scoring_service_name"],
+        "scoring_domain": scoring_domain,
+        "scoring_keys_store_id": scoring_meta["scoring_keys_store_id"],
+        "scoring_config_store_id": scoring_meta["scoring_config_store_id"],
+        # Stash the secret here so re-runs of enable_scoring can recover
+        # it (the ConfigStore is write-only from our perspective). The
+        # config file is gitignored under /configs/* so it is not committed.
+        "request_secret": request_secret,
+        "enabled_at": _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds"),
+        # First-enable defaults: persist the actual value so the admin UI
+        # shows what's in use and a future change to the bundled default
+        # doesn't silently alter the per-service behaviour.
+        "exclude_url_regex": prior_scoring.get("exclude_url_regex") or DEFAULT_ASSET_EXT_REGEX,
+    }
+    # Carry an operator-set enforce_status_code across the block replace;
+    # absence means "use the bundled default" of the enforce snippet.
+    if prior_scoring.get("enforce_status_code") is not None:
+        cfg["scoring"]["enforce_status_code"] = prior_scoring["enforce_status_code"]
+    # Add the scoring custom_fields so the log format below includes them.
+    _add_scoring_custom_fields(cfg)
+    svcconfig.save_config(logging_service_id, cfg)
+    n_scoring = len(_SCORING_FIELD_NAMES)
+    ok(f"Stashed scoring metadata + {n_scoring} custom_fields into service config")
+
+    # From here on the config on disk says "scoring enabled", so every
+    # later step — including the version lookup and clone — runs under
+    # the rollback handler.
+    active_ver = None
+    try:
+        # ── Stage 4: clone the LOGGING service's active VCL version. ────────
+        active_ver = get_active_version(logging_service_id, token)
+        if active_ver is None:
+            raise RuntimeError(f"Logging service {logging_service_id} has no active version")
+        info(f"Logging service active version: {active_ver}")
+        if status_cb:
+            status_cb(f"🔄 Cloning version {active_ver} to add scoring...")
+        clone = fastly(
+            "PUT",
+            f"/service/{logging_service_id}/version/{active_ver}/clone",
+            token=token,
+        )
+        new_ver = int(clone["number"])
+        ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+        fastly(
+            "PUT",
+            f"/service/{logging_service_id}/version/{new_ver}",
+            {"comment": f"Enable session scoring (scorer={scoring_service_id}) {ts}"},
+            token=token,
+        )
+        ok(f"Draft version: {new_ver}")
+
+        # ── Stage 5: add the scoring Compute service as a backend. ──────────
+        _add_scoring_backend(logging_service_id, new_ver, scoring_domain, token)
+
+        # ── Stage 6: install the six scoring VCL snippets. ──────────────────
+        info("Installing 6 scoring VCL snippets (recv / pass / fetch / deliver / miss / enforce)")
+        if status_cb:
+            status_cb("⏳ Installing scoring VCL snippets...")
+        # Pick up the operator's overrides (if any) so a re-enable carries
+        # the customised exclusion regex AND enforce-status-code forward.
+        scoring_cfg = cfg.get("scoring") or {}
+        exclude_url_regex = scoring_cfg.get("exclude_url_regex")
+        enforce_status_code = scoring_cfg.get("enforce_status_code")
+        vcl_snippets = generate_scoring_vcl(
+            logging_service_id,
+            request_secret,
+            exclude_url_regex=exclude_url_regex,
+            enforce_status_code=enforce_status_code,
+        )
+        for snip_name, vcl_type, prio in (
+            (SCORING_RECV_NAME, "recv", SCORING_SNIPPET_PRIORITY),
+            (SCORING_PASS_NAME, "pass", SCORING_SNIPPET_PRIORITY),
+            # Fetch has its own priority so `return(deliver)` for the
+            # scorer backend fires before other fetch-stage snippets.
+            (SCORING_FETCH_NAME, "fetch", SCORING_FETCH_PRIORITY),
+            (SCORING_DELIVER_NAME, "deliver", SCORING_SNIPPET_PRIORITY),
+            (SCORING_MISS_NAME, "miss", SCORING_SNIPPET_PRIORITY),
+            # The enforce snippet is a recv snippet ordered one priority
+            # step after the main recv snippet; it rejects requests the
+            # scorer flagged via X-Edge-Score-Enforce. Off by default —
+            # fires only when the operator commits an enforce_threshold
+            # via the admin UI.
+            (SCORING_ENFORCE_NAME, "recv", SCORING_SNIPPET_PRIORITY + 1),
+        ):
+            ensure_vcl_snippet(
+                snip_name,
+                vcl_type,
+                vcl_snippets[snip_name],
+                prio,
+                logging_service_id,
+                new_ver,
+                token,
+            )
+        ok("Installed 6 scoring VCL snippets")
+
+        # ── Stage 7: regenerate the capture VCL + log format for the new
+        #    custom_fields. update_logging_endpoint doesn't accept a version
+        #    argument, so calling it would mean a second clone + activation.
+        #    Instead the capture snippets and the endpoint format are
+        #    written directly onto OUR draft.
+        info("Regenerating log format + capture VCL for scoring fields")
+        if status_cb:
+            status_cb("⏳ Updating log format to include score fields...")
+        from backend.provision.fastly_api import install_capture_snippets
+
+        install_capture_snippets(logging_service_id, new_ver, cfg.get("log_fields"), token)
+
+        # Update the logging endpoint's format string on the draft version.
+        # The s3 logging endpoint is expected to exist already (provisioned
+        # at setup); if it doesn't, warn and skip rather than fail.
+        from backend.core.fastly.service import list_s3_endpoints
+        from backend.provision.fastly_api import load_log_format
+
+        endpoint_name = cfg.get("provisioning", {}).get("endpoint_name", "Fastly Object Storage Logs")
+        existing_endpoints = list_s3_endpoints(logging_service_id, new_ver, token)
+        if endpoint_name not in existing_endpoints:
+            warn(
+                f"Logging endpoint {endpoint_name!r} not found on draft v{new_ver} — "
+                "skipping format update. Score fields will land in resp headers but not the log line."
+            )
+        else:
+            new_format = load_log_format(cfg.get("log_fields"))
+            encoded = urllib.parse.quote(endpoint_name, safe="")
+            fastly(
+                "PUT",
+                f"/service/{logging_service_id}/version/{new_ver}/logging/s3/{encoded}",
+                {"format": new_format, "format_version": 2},
+                token=token,
+            )
+            ok(f"Updated logging endpoint format to include {len(_SCORING_FIELD_NAMES)} score fields")
+
+        # ── Stage 8: validate ──────────────────────────────────────────────
+        info(f"Validating draft version {new_ver}")
+        if status_cb:
+            status_cb(f"⏳ Validating draft version {new_ver}...")
+        result = fastly(
+            "GET",
+            f"/service/{logging_service_id}/version/{new_ver}/validate",
+            token=token,
+        )
+        if result.get("status") != "ok":
+            raise RuntimeError(f"Validation failed: {result.get('errors') or result}")
+        ok("Draft validated")
+
+        # ── Stage 9: activate ──────────────────────────────────────────────
+        info(f"Activating version {new_ver}")
+        if status_cb:
+            status_cb(f"⏳ Activating version {new_ver}...")
+        fastly(
+            "PUT",
+            f"/service/{logging_service_id}/version/{new_ver}/activate",
+            token=token,
+        )
+        ok(f"Version {new_ver} active")
+        if status_cb:
+            status_cb(f"✅ Session scoring enabled (active version {new_ver}).")
+
+        scoring_meta["logging_service_active_version"] = new_ver
+
+        # Publish the new custom_fields list to FOS's admin_state.json so
+        # read_only analyst hosts (and the GCE prod backend) pick them up
+        # on their next import_admin_state tick. Without this, a stale
+        # admin_state.json from before scoring was enabled would silently
+        # strip the scoring custom_fields on every metadata_sync — the
+        # 2026-06-02 incident that motivated the import_admin_state merge
+        # fix in backend/state_sync.py. Best effort: failure only warns.
+        try:
+            from backend.state_sync import export_admin_state
+
+            export_admin_state(logging_service_id)
+            ok("Published custom_fields to FOS admin_state.json")
+        except Exception as exc:
+            warn(f"Could not export admin_state to FOS (non-fatal): {exc}")
+
+        # Also publish the trained scoring matrix to FOS so analyst hosts
+        # (and any fresh backend container) see the same matrix file that
+        # the deploy step read. Without this, the /scoring/evaluation
+        # endpoint falls back to the default-empty matrix on read_only
+        # hosts and reports AUC ≈ 0.5 even though the live scorer is
+        # using a real trained one. Best effort: failure only warns.
+        try:
+            from backend.state_sync import publish_matrix_to_fos
+
+            if _MATRIX_PATH.exists():
+                import json as _json
+
+                with _MATRIX_PATH.open(encoding="utf-8") as f:
+                    matrix = _json.load(f)
+                publish_matrix_to_fos(logging_service_id, matrix)
+                ok(f"Published scoring matrix to FOS (version={matrix.get('version', '?')})")
+        except Exception as exc:
+            warn(f"Could not publish scoring matrix to FOS (non-fatal): {exc}")
+
+        return scoring_meta
+
+    except Exception as exc:
+        # ── Stage 10: rollback ─────────────────────────────────────────────
+        fail(f"enable_scoring failed: {exc}")
+        # Nothing to re-activate when the failure happened before the
+        # active version was even determined.
+        if active_ver is not None:
+            info(f"Rolling back — re-activating version {active_ver}")
+            try:
+                fastly(
+                    "PUT",
+                    f"/service/{logging_service_id}/version/{active_ver}/activate",
+                    token=token,
+                )
+            except RuntimeError as reactivate_exc:
+                # Best effort — the original failure is what gets re-raised
+                # — but don't hide that re-activation did not go through.
+                warn(f"Could not re-activate version {active_ver} (non-fatal): {reactivate_exc}")
+        # Revert the on-disk config to its pre-call scoring state. A failed
+        # RE-enable must not wipe the prior block: the re-activated version
+        # still runs scoring with the stored request_secret, and
+        # disable_scoring needs the block to tear things down later.
+        _revert_scoring_config(logging_service_id, cfg, prior_scoring)
+        raise
+
+
+def disable_scoring(
+ logging_service_id: str,
+ token: str,
+ *,
+ status_cb=None,
+) -> None:
+ """Tear down session scoring for this customer.
+
+ Reverse of enable_scoring: clone active VCL → remove the 6 scoring
+ snippets + scoring backend → strip the 6 custom_fields → regenerate
+ log format → validate → activate → delete the scoring Compute
+ service + ConfigStores. Idempotent — 404s tolerated everywhere.
+
+ Args:
+ logging_service_id: Fastly service id of the customer's logging
+ (VCL) service whose config carries the ``scoring`` block.
+ token: Fastly API token passed to every API call.
+ status_cb: Optional callable invoked with human-readable progress
+ strings; skipped when falsy.
+
+ Returns early (after a warning) when the stored config does not have
+ ``scoring.enabled`` set.
+
+ Raises:
+ RuntimeError: no config exists for the service, the service has
+ no active version, or the cloned version fails validation.
+ Any exception raised during the VCL phase is re-raised after a
+ best-effort re-activation of the previously active version.
+ Exceptions from ``delete_scoring_service`` propagate as-is; in
+ that case the ``scoring`` block is NOT removed from the config.
+ """
+ cfg = svcconfig.load_config(logging_service_id)
+ if not cfg:
+ raise RuntimeError(f"No config found for logging service {logging_service_id}")
+
+ scoring = cfg.get("scoring") or {}
+ if not scoring.get("enabled"):
+ warn("Session scoring is not enabled for this service — nothing to disable")
+ if status_cb:
+ status_cb("✅ Session scoring already disabled.")
+ return
+
+ # Capture the resource ids now: the scoring block is only dropped from
+ # the config at the very end, but these are needed for the teardown.
+ scoring_service_id = scoring.get("scoring_service_id", "")
+ scoring_keys_store_id = scoring.get("scoring_keys_store_id", "")
+ scoring_config_store_id = scoring.get("scoring_config_store_id", "")
+
+ info(f"Disabling session scoring for {_c(BOLD, logging_service_id)}")
+ if status_cb:
+ status_cb(f"⏳ Disabling session scoring for {logging_service_id}...")
+
+ # ── Stage 1: clone active version. ──────────────────────────────────────
+ active_ver = get_active_version(logging_service_id, token)
+ if active_ver is None:
+ raise RuntimeError(f"Logging service {logging_service_id} has no active version")
+ clone = fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{active_ver}/clone",
+ token=token,
+ )
+ new_ver = int(clone["number"])
+ # Stamp the draft with a comment so the version history explains itself.
+ ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{new_ver}",
+ {"comment": f"Disable session scoring {ts}"},
+ token=token,
+ )
+
+ try:
+ # ── Stage 2: remove scoring VCL bits. ───────────────────────────────
+ _remove_scoring_snippets(logging_service_id, new_ver, token)
+ _remove_scoring_backend(logging_service_id, new_ver, token)
+
+ # ── Stage 3: drop the 6 custom_fields + regen log format. ───────────
+ # NOTE(review): the stripped config is saved here, before the new
+ # version is validated/activated. The except path below re-activates
+ # the old version but does not restore the saved config — confirm
+ # that a failed disable leaving custom_fields stripped is intended.
+ _remove_scoring_custom_fields(cfg)
+ svcconfig.save_config(logging_service_id, cfg)
+
+ from backend.core.fastly.service import list_s3_endpoints
+ from backend.provision.fastly_api import generate_capture_vcl, load_log_format
+
+ capture = generate_capture_vcl(cfg.get("log_fields"))
+ # Re-install (or remove if no fields left) the capture VCL.
+ ensure_vcl_snippet(
+ "Fastly Log Analysis Capture",
+ "recv",
+ capture["recv"],
+ 1,
+ logging_service_id,
+ new_ver,
+ token,
+ )
+
+ # Only rewrite the log format when the configured S3 logging
+ # endpoint actually exists on the cloned version.
+ endpoint_name = cfg.get("provisioning", {}).get("endpoint_name", "Fastly Object Storage Logs")
+ existing_endpoints = list_s3_endpoints(logging_service_id, new_ver, token)
+ if endpoint_name in existing_endpoints:
+ new_format = load_log_format(cfg.get("log_fields"))
+ # The endpoint name is a URL path segment; escape everything.
+ encoded = urllib.parse.quote(endpoint_name, safe="")
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{new_ver}/logging/s3/{encoded}",
+ {"format": new_format, "format_version": 2},
+ token=token,
+ )
+
+ # ── Stage 4: validate + activate. ──────────────────────────────────
+ result = fastly(
+ "GET",
+ f"/service/{logging_service_id}/version/{new_ver}/validate",
+ token=token,
+ )
+ if result.get("status") != "ok":
+ raise RuntimeError(f"Validation failed: {result.get('errors') or result}")
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{new_ver}/activate",
+ token=token,
+ )
+ ok(f"Logging service version {new_ver} active (scoring stripped)")
+ except Exception as exc:
+ fail(f"disable_scoring VCL phase failed: {exc}")
+ # Best-effort rollback: re-activate the version that was live when we
+ # started. A failure here is swallowed so the original error is the
+ # one that propagates.
+ try:
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{active_ver}/activate",
+ token=token,
+ )
+ except RuntimeError:
+ pass
+ raise
+
+ # ── Stage 5: tear down the Compute service + stores. ───────────────────
+ # Runs only after the logging service no longer references the scorer.
+ delete_scoring_service(
+ scoring_service_id,
+ scoring_keys_store_id=scoring_keys_store_id,
+ scoring_config_store_id=scoring_config_store_id,
+ token=token,
+ status_cb=status_cb,
+ )
+
+ # ── Stage 6: clear the scoring block from config. ──────────────────────
+ # DEFENSE-IN-DEPTH: re-load cfg right before the final save. The
+ # Fastly + Compute teardown stages above can take 60-120s; the
+ # in-memory cfg loaded at the top of this function is a stale snapshot
+ # that would clobber any concurrent writer mutations (metadata_sync
+ # tick, custom_fields PATCH, ngwaf_workspace_id update). Same load-
+ # mutate-save-just-the-target-keys pattern as the enable_scoring
+ # rollback. If the re-load fails or returns nothing, fall back to the
+ # in-memory copy rather than skipping the save.
+ try:
+ fresh = svcconfig.load_config(logging_service_id) or cfg
+ except Exception:
+ fresh = cfg
+ fresh.pop("scoring", None)
+ svcconfig.save_config(logging_service_id, fresh)
+
+ # Publish the new custom_fields list (now without scoring) so analyst
+ # boxes stop seeing the scoring entries on their next metadata_sync.
+ # Non-fatal: a failed export only produces a warning.
+ try:
+ from backend.state_sync import export_admin_state
+
+ export_admin_state(logging_service_id)
+ except Exception as exc:
+ warn(f"Could not export admin_state to FOS after disable (non-fatal): {exc}")
+
+ if status_cb:
+ status_cb("✅ Session scoring disabled.")
+ ok("Session scoring disabled")
+
+
+def update_recv_exclusion_regex(
+ logging_service_id: str,
+ token: str,
+ *,
+ new_regex: str,
+) -> dict[str, Any]:
+ """Re-publish ONLY the recv VCL snippet with a new exclusion regex.
+
+ Lighter-weight than running ``enable_scoring`` end-to-end: we keep
+ the existing Compute service / ConfigStores / Wasm / log-format
+ untouched, and ONLY clone the active VCL version → swap the recv
+ snippet body → activate. Takes ~5-10s in practice.
+
+ ``new_regex`` is the operator's pre-validated regex string (already
+ passed through ``backend.utils.vcl_validator.validate_url_exclusion_regex``
+ + falco lint by the API layer). Empty string means "use the default"
+ and persists as ``None`` in cfg so a future default change auto-picks-up.
+ Surrounding whitespace is stripped before the value is used.
+
+ Returns:
+ {
+ "effective_regex": str, # what got interpolated
+ "is_default": bool,
+ "logging_service_active_version": int, # post-activate
+ }
+
+ Raises ``RuntimeError`` when no config exists, scoring is not enabled,
+ ``scoring.request_secret`` is missing, the service has no active
+ version, or validation of the cloned version fails. Any exception
+ raised while swapping/validating/activating triggers a best-effort
+ re-activation of the prior version before being re-raised. The
+ override persisted to cfg is NOT reverted in that case (deliberate —
+ see the comment above the save below).
+ """
+ cfg = svcconfig.load_config(logging_service_id)
+ if not cfg:
+ raise RuntimeError(f"No config found for logging service {logging_service_id}")
+ scoring = cfg.get("scoring") or {}
+ if not scoring.get("enabled"):
+ raise RuntimeError(
+ f"Session scoring is not enabled for {logging_service_id}; "
+ "run enable_scoring first before customising the recv exclusion regex."
+ )
+ # All preconditions are checked before anything is written to disk.
+ request_secret = scoring.get("request_secret")
+ if not request_secret:
+ raise RuntimeError(
+ "Cannot re-publish recv snippet without request_secret in cfg; "
+ "the snippet bodies for peer snippets depend on it. Re-run enable_scoring."
+ )
+
+ # Persist the override first — that way even if the Fastly activation
+ # below fails, a future enable_scoring run picks up the new value.
+ # None is the canonical "use default" representation so the JSON cfg
+ # file doesn't end up with an empty-string sentinel.
+ cleaned = (new_regex or "").strip()
+ scoring["exclude_url_regex"] = cleaned or None
+ cfg["scoring"] = scoring
+ svcconfig.save_config(logging_service_id, cfg)
+
+ # Generate the recv snippet body with the new regex.
+ from backend.provision.session_scoring_vcl import (
+ DEFAULT_ASSET_EXT_REGEX,
+ recv_snippet,
+ resolve_exclude_url_regex,
+ )
+
+ effective_regex = resolve_exclude_url_regex(cleaned or None)
+ # An explicit override that happens to equal the default also counts
+ # as "default" here, because the comparison is on the resolved value.
+ is_default = effective_regex == DEFAULT_ASSET_EXT_REGEX
+ new_recv_body = recv_snippet(logging_service_id, request_secret, exclude_url_regex=cleaned or None)
+
+ # Clone → swap → activate.
+ active_ver = get_active_version(logging_service_id, token)
+ if active_ver is None:
+ raise RuntimeError(f"Logging service {logging_service_id} has no active version")
+ info(f"Cloning version {active_ver} to update recv-snippet exclusion regex")
+ clone = fastly("PUT", f"/service/{logging_service_id}/version/{active_ver}/clone", token=token)
+ new_ver = int(clone["number"])
+ # Stamp the draft with a comment so the version history explains itself.
+ ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{new_ver}",
+ {"comment": f"Update scoring recv exclusion regex {ts}"},
+ token=token,
+ )
+
+ try:
+ ensure_vcl_snippet(
+ SCORING_RECV_NAME,
+ "recv",
+ new_recv_body,
+ SCORING_SNIPPET_PRIORITY,
+ logging_service_id,
+ new_ver,
+ token,
+ )
+ # Validate the draft before activating it.
+ result = fastly("GET", f"/service/{logging_service_id}/version/{new_ver}/validate", token=token)
+ if result.get("status") != "ok":
+ raise RuntimeError(f"Validation failed: {result.get('errors') or result}")
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{new_ver}/activate",
+ token=token,
+ )
+ ok(f"Logging service version {new_ver} active (recv exclusion regex updated)")
+ except Exception as exc:
+ fail(f"update_recv_exclusion_regex failed: {exc}")
+ # Re-activate the prior version so the service isn't left on the
+ # half-updated draft. Best-effort — if this fails too, the draft
+ # is left for the operator to clean up manually.
+ try:
+ fastly(
+ "PUT",
+ f"/service/{logging_service_id}/version/{active_ver}/activate",
+ token=token,
+ )
+ except RuntimeError:
+ pass
+ raise
+
+ return {
+ "effective_regex": effective_regex,
+ "is_default": is_default,
+ "logging_service_active_version": new_ver,
+ }
+
+
+def update_enforce_status_code(
+    logging_service_id: str,
+    token: str,
+    *,
+    new_status_code: int | None,
+) -> dict[str, Any]:
+    """Re-publish ONLY the enforce VCL snippet with a new status code.
+
+    Mirrors ``update_recv_exclusion_regex``: check preconditions, persist
+    the override, clone the active version, swap the enforce snippet body,
+    validate, activate. Takes ~5-10s.
+
+    ``new_status_code`` is the operator's pre-validated int (400-599) or
+    ``None`` to reset to the default 429. The PUT endpoint validates the
+    range BEFORE calling here; this function defends with
+    ``resolve_enforce_status_code`` but trusts its caller.
+
+    Returns:
+        {
+            "effective_status_code": int,
+            "is_default": bool,
+            "logging_service_active_version": int,
+        }
+
+    Raises ``RuntimeError`` when no config exists, scoring is not enabled,
+    ``scoring.request_secret`` is missing, the service has no active
+    version, or validation of the cloned version fails. Precondition
+    failures raise BEFORE anything is written to the config. Any exception
+    raised while swapping/validating/activating triggers a best-effort
+    re-activation of the prior version before being re-raised; the
+    persisted override is deliberately kept in that case.
+    """
+    cfg = svcconfig.load_config(logging_service_id)
+    if not cfg:
+        raise RuntimeError(f"No config found for logging service {logging_service_id}")
+    scoring = cfg.get("scoring") or {}
+    if not scoring.get("enabled"):
+        raise RuntimeError(
+            f"Session scoring is not enabled for {logging_service_id}; "
+            "run enable_scoring first before customising the enforce status code."
+        )
+
+    # 034: enforce_snippet now bakes the request_secret into its shield-auth
+    # boundary check. Re-publishing without the secret would emit invalid
+    # VCL — fail loudly here rather than letting the activation fail later.
+    # Checked before the config is saved (same order as
+    # update_recv_exclusion_regex) so a precondition failure never leaves a
+    # half-applied override on disk.
+    request_secret = scoring.get("request_secret")
+    if not request_secret:
+        raise RuntimeError(
+            "Cannot re-publish enforce snippet without request_secret in cfg; "
+            "run enable_scoring first or restore scoring.request_secret."
+        )
+
+    from backend.provision.session_scoring_vcl import (
+        DEFAULT_ENFORCE_STATUS_CODE,
+        enforce_snippet,
+        resolve_enforce_status_code,
+    )
+
+    effective_code = resolve_enforce_status_code(new_status_code)
+    is_default = effective_code == DEFAULT_ENFORCE_STATUS_CODE
+
+    # Persist the override first so a future enable_scoring re-bake also
+    # picks it up even if the activation below fails. None is the canonical
+    # "use default" representation (mirrors exclude_url_regex shape).
+    scoring["enforce_status_code"] = None if is_default else effective_code
+    cfg["scoring"] = scoring
+    svcconfig.save_config(logging_service_id, cfg)
+
+    new_enforce_body = enforce_snippet(request_secret, effective_code)
+
+    # Clone → swap → activate.
+    active_ver = get_active_version(logging_service_id, token)
+    if active_ver is None:
+        raise RuntimeError(f"Logging service {logging_service_id} has no active version")
+    info(f"Cloning version {active_ver} to update enforce-snippet status code → {effective_code}")
+    clone = fastly("PUT", f"/service/{logging_service_id}/version/{active_ver}/clone", token=token)
+    new_ver = int(clone["number"])
+    # Stamp the draft with a comment so the version history explains itself.
+    ts = _dt.datetime.now(_dt.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+    fastly(
+        "PUT",
+        f"/service/{logging_service_id}/version/{new_ver}",
+        {"comment": f"Update scoring enforce status code → {effective_code} ({ts})"},
+        token=token,
+    )
+
+    try:
+        ensure_vcl_snippet(
+            SCORING_ENFORCE_NAME,
+            "recv",
+            new_enforce_body,
+            SCORING_SNIPPET_PRIORITY + 1,
+            logging_service_id,
+            new_ver,
+            token,
+        )
+        # Validate the draft before activating it.
+        result = fastly("GET", f"/service/{logging_service_id}/version/{new_ver}/validate", token=token)
+        if result.get("status") != "ok":
+            raise RuntimeError(f"Validation failed: {result.get('errors') or result}")
+        fastly(
+            "PUT",
+            f"/service/{logging_service_id}/version/{new_ver}/activate",
+            token=token,
+        )
+        ok(f"Logging service version {new_ver} active (enforce status code → {effective_code})")
+    except Exception as exc:
+        fail(f"update_enforce_status_code failed: {exc}")
+        # Re-activate the prior version so the service isn't left on the
+        # half-updated draft. Best-effort: a failure here is swallowed so
+        # the original error is the one that propagates.
+        try:
+            fastly(
+                "PUT",
+                f"/service/{logging_service_id}/version/{active_ver}/activate",
+                token=token,
+            )
+        except RuntimeError:
+            pass
+        raise
+
+    return {
+        "effective_status_code": effective_code,
+        "is_default": is_default,
+        "logging_service_active_version": new_ver,
+    }
diff --git a/backend/provision/session_scoring_setup.py b/backend/provision/session_scoring_setup.py
new file mode 100644
index 00000000..36a85984
--- /dev/null
+++ b/backend/provision/session_scoring_setup.py
@@ -0,0 +1,387 @@
+"""Provision + tear down a per-customer session-scoring Compute service.
+
+Pattern mirrors ``ensure_cdn_service`` / ``delete_cdn_service`` in
+``backend.provision.fastly_api`` — primitive args in, status callback for
+SSE progress, idempotent in both directions, no implicit state outside
+the Fastly API + returned dict.
+
+Naming convention (from the research doc):
+ - service name: ``Session Scoring Service for {logging_service_id}``
+ - domain: ``fos-{logging_service_id.lower()}-session-scorer.edgecompute.app``
+ - keys store: ``scoring_keys_{compute_service_id}``
+ - config store: ``scoring_config_{compute_service_id}``
+
+The Wasm deploy itself (``fastly compute deploy``) is NOT done here —
+that's the matrix-deploy concern owned by ``scripts/scoring/deploy_wasm.sh``
+and gets invoked separately after a training run produces a matrix. This
+keeps the fast provisioner (~5s of API calls) and the slow deploy (~30s
+build + upload) as distinct lifecycle stages.
+"""
+
+from __future__ import annotations
+
+import secrets
+from typing import Any
+
+from backend.core.fastly.client import fastly
+from backend.provision.utils import BOLD, _c, info, ok, warn
+
+# Naming templates. The service name and domain are derived from the
+# logging service id; the two store names are derived from the scoring
+# (Compute) service id.
+SCORING_SERVICE_NAME_PREFIX = "Session Scoring Service for "
+SCORING_DOMAIN_TEMPLATE = "fos-{sid_lower}-session-scorer.edgecompute.app"
+KEYS_STORE_NAME_TEMPLATE = "scoring_keys_{sid}"
+CONFIG_STORE_NAME_TEMPLATE = "scoring_config_{sid}"
+
+# Resource-link names match the ConfigStore::open() arguments in
+# compute/scorer/src/main.rs. Both must be edited in lockstep.
+KEYS_RESOURCE_LINK_NAME = "scoring_keys"
+CONFIG_RESOURCE_LINK_NAME = "scoring_config"
+
+# Item keys and initial values written to the config stores.
+DEBUG_LOG_KEY = "debug_logging_enabled"
+DEBUG_LOG_DEFAULT = "0"
+CURRENT_KEY_HEX = "current_key_hex"
+PREVIOUS_KEY_HEX = "previous_key_hex" # blank until first rotation
+# Shared secret VCL → Compute. The customer's VCL service embeds this
+# secret in the X-Edge-Scorer-Auth request header before calling the
+# scorer; the scorer rejects requests without a matching value. This
+# stops anyone who happens to find the scorer's edgecompute.app hostname
+# from calling the scorer directly.
+REQUEST_SECRET_KEY = "request_secret"
+
+
+def _scoring_service_name(logging_service_id: str) -> str:
+    """Return the display name of the scoring service for a logging service."""
+    return "{}{}".format(SCORING_SERVICE_NAME_PREFIX, logging_service_id)
+
+
+def _scoring_domain(logging_service_id: str) -> str:
+    """Return the edgecompute.app domain of the scoring service.
+
+    The logging service id is lower-cased before it is substituted into
+    the domain template.
+    """
+    sid_lower = logging_service_id.lower()
+    return SCORING_DOMAIN_TEMPLATE.format(sid_lower=sid_lower)
+
+
+def _find_scoring_service(logging_service_id: str, token: str) -> dict | None:
+    """Look up the scoring service that belongs to a logging service.
+
+    Lists all services visible to ``token`` and returns the first one
+    whose name equals the derived scoring-service name, or ``None`` when
+    there is no match. A ``RuntimeError`` from the listing call is also
+    reported as ``None``.
+
+    This is the idempotency lever: ``ensure_scoring_service`` reuses a
+    service found here instead of failing on a duplicate name.
+    """
+    wanted = _scoring_service_name(logging_service_id)
+    try:
+        listing = fastly("GET", "/service", token=token)
+    except RuntimeError:
+        return None
+    candidates = listing or []
+    return next((entry for entry in candidates if entry.get("name") == wanted), None)
+
+
+def _find_config_store(store_name: str, token: str) -> dict | None:
+    """Return the ConfigStore named ``store_name``, or ``None`` if absent.
+
+    Lists the account's config stores and returns the first entry whose
+    ``name`` matches. A ``RuntimeError`` from the listing call, an empty
+    or ``None`` response, and a response without usable ``data`` are all
+    reported as ``None`` rather than raising.
+    """
+    try:
+        resp = fastly("GET", "/resources/stores/config", token=token)
+    except RuntimeError:
+        return None
+    # Fastly's list endpoint returns either a list or {"data": [...]} depending
+    # on the version; tolerate both. Also tolerate an empty/None response and
+    # a null "data" value so a sparse reply cannot raise here.
+    if isinstance(resp, list):
+        items = resp
+    else:
+        items = (resp or {}).get("data") or []
+    for item in items:
+        if item.get("name") == store_name:
+            return item
+    return None
+
+
+def ensure_scoring_service(
+ logging_service_id: str,
+ token: str,
+ *,
+ status_cb=None,
+) -> dict[str, Any]:
+ """Create (or reuse) the per-customer session-scoring Compute service,
+ its two ConfigStores, the AES-256 key, and resource links from v1 of
+ the service to the stores.
+
+ Args:
+ logging_service_id: id of the customer's logging service; the
+ scoring service name and domain are derived from it.
+ token: Fastly API token passed to every API call.
+ status_cb: Optional callable invoked with human-readable progress
+ strings; skipped when falsy.
+
+ Returns a dict suitable for stashing into the customer's config:
+
+ {
+ "scoring_service_id": "...",
+ "scoring_service_name": "Session Scoring Service for ...",
+ "scoring_domain": "fos-...-session-scorer.edgecompute.app",
+ "scoring_keys_store_id": "...",
+ "scoring_config_store_id": "...",
+ "aes_key_hex": "...", # only populated on first creation
+ "request_secret": "..." # only populated on first creation
+ }
+
+ Idempotent: re-running against an existing scoring service no-ops the
+ create steps. The returned ``aes_key_hex`` and ``request_secret`` are
+ empty when reusing an existing service (we don't have a way to read
+ back the values once they're in the store). On reuse the store ids
+ are looked up by name and come back as empty strings when a store is
+ not found.
+
+ Errors from the create-path API calls are not caught here and
+ propagate to the caller; steps already completed are not undone.
+ """
+ name = _scoring_service_name(logging_service_id)
+ domain = _scoring_domain(logging_service_id)
+
+ info(f"Ensuring scoring service {_c(BOLD, name)}")
+ if status_cb:
+ status_cb(f"⏳ Ensuring scoring service '{name}'...")
+
+ # NOTE(review): _find_scoring_service returns None both when no service
+ # matches and when the listing call fails, so a failed listing falls
+ # through to the create path below — confirm that is acceptable.
+ existing = _find_scoring_service(logging_service_id, token)
+ if existing:
+ ok(f"Scoring service already exists ({existing['id']})")
+ if status_cb:
+ status_cb(f"✅ Scoring service '{name}' already exists.")
+ scoring_service_id = existing["id"]
+ # Store names are namespaced by the scoring service id, so they can
+ # be recomputed and looked up without any saved state.
+ keys_store = _find_config_store(KEYS_STORE_NAME_TEMPLATE.format(sid=scoring_service_id), token)
+ cfg_store = _find_config_store(CONFIG_STORE_NAME_TEMPLATE.format(sid=scoring_service_id), token)
+ return {
+ "scoring_service_id": scoring_service_id,
+ "scoring_service_name": name,
+ "scoring_domain": domain,
+ "scoring_keys_store_id": (keys_store or {}).get("id", ""),
+ "scoring_config_store_id": (cfg_store or {}).get("id", ""),
+ "aes_key_hex": "",
+ # On reuse, neither secret is readable back from the store.
+ # The orchestrator falls back to whatever it stashed in
+ # cfg["scoring"]["request_secret"] on a prior provision.
+ "request_secret": "",
+ }
+
+ # 1. Create the wasm Compute service.
+ svc = fastly("POST", "/service", {"name": name, "type": "wasm"}, token=token)
+ scoring_service_id = svc["id"]
+ ok(f"Created scoring service {scoring_service_id}")
+ if status_cb:
+ status_cb(f"✅ Created scoring service '{name}'.")
+
+ # 2. Add the domain to version 1 (auto-created with the service).
+ fastly(
+ "POST",
+ f"/service/{scoring_service_id}/version/1/domain",
+ {"name": domain},
+ token=token,
+ )
+ ok(f"Added domain {domain}")
+ if status_cb:
+ status_cb(f"✅ Added domain '{domain}'.")
+
+ # 3. Add a placeholder backend (Compute services require at least one).
+ # The scorer never calls it; it's just to make the service version
+ # valid.
+ fastly(
+ "POST",
+ f"/service/{scoring_service_id}/version/1/backend",
+ {
+ "name": "placeholder_origin",
+ "address": "127.0.0.1",
+ "port": 80,
+ "override_host": "example.com",
+ },
+ token=token,
+ )
+ ok("Added placeholder backend")
+
+ # 4. Create the two ConfigStores, namespaced by the scoring service id.
+ keys_store_name = KEYS_STORE_NAME_TEMPLATE.format(sid=scoring_service_id)
+ cfg_store_name = CONFIG_STORE_NAME_TEMPLATE.format(sid=scoring_service_id)
+
+ keys_store = fastly("POST", "/resources/stores/config", {"name": keys_store_name}, token=token)
+ cfg_store = fastly("POST", "/resources/stores/config", {"name": cfg_store_name}, token=token)
+ ok(f"Created config stores {keys_store_name}, {cfg_store_name}")
+ if status_cb:
+ status_cb("✅ Created config stores.")
+
+ # 5. Generate the AES-256 key + request secret and write both to
+ # scoring_keys. The request secret is the shared-secret header
+ # value that VCL embeds in X-Edge-Scorer-Auth so the Compute
+ # service can reject requests not coming from "our" VCL.
+ # token_hex(32) yields 32 random bytes as 64 hex characters.
+ aes_key_hex = secrets.token_hex(32)
+ request_secret = secrets.token_hex(32)
+ fastly(
+ "POST",
+ f"/resources/stores/config/{keys_store['id']}/item",
+ {"item_key": CURRENT_KEY_HEX, "item_value": aes_key_hex},
+ token=token,
+ )
+ fastly(
+ "POST",
+ f"/resources/stores/config/{keys_store['id']}/item",
+ {"item_key": REQUEST_SECRET_KEY, "item_value": request_secret},
+ token=token,
+ )
+ # The config store starts with debug logging set to its default.
+ fastly(
+ "POST",
+ f"/resources/stores/config/{cfg_store['id']}/item",
+ {"item_key": DEBUG_LOG_KEY, "item_value": DEBUG_LOG_DEFAULT},
+ token=token,
+ )
+ ok("Populated config stores")
+
+ # 6. Link both stores to the service version so the Wasm can open them
+ # by the short ResourceLink names (scoring_keys / scoring_config).
+ fastly(
+ "POST",
+ f"/service/{scoring_service_id}/version/1/resource",
+ {"name": KEYS_RESOURCE_LINK_NAME, "resource_id": keys_store["id"]},
+ token=token,
+ )
+ fastly(
+ "POST",
+ f"/service/{scoring_service_id}/version/1/resource",
+ {"name": CONFIG_RESOURCE_LINK_NAME, "resource_id": cfg_store["id"]},
+ token=token,
+ )
+ ok("Linked stores to service v1")
+ if status_cb:
+ status_cb("✅ Linked config stores to service v1.")
+
+ # This is the only point at which the generated secrets are available
+ # to the caller; they are returned so the caller can persist them.
+ return {
+ "scoring_service_id": scoring_service_id,
+ "scoring_service_name": name,
+ "scoring_domain": domain,
+ "scoring_keys_store_id": keys_store["id"],
+ "scoring_config_store_id": cfg_store["id"],
+ "aes_key_hex": aes_key_hex,
+ "request_secret": request_secret,
+ }
+
+
+def rotate_aes_key(
+    scoring_keys_store_id: str,
+    *,
+    token: str,
+) -> dict:
+    """Rotate the AES-GCM cookie-state encryption key for a scoring service.
+
+    Pulls the current ``current_key_hex`` from the scoring_keys
+    ConfigStore, moves it to ``previous_key_hex``, generates a fresh
+    32-byte key, writes it as the new ``current_key_hex``. The Rust
+    scorer's cookie codec tries current first then previous, so cookies
+    issued under the old key keep decoding through one full rotation
+    cycle (typically the cookie idle-expire window, ~hours).
+
+    NOT idempotent: every call generates a new key, so calling twice
+    rotates twice and the previous-previous key is dropped (only one
+    rotation grace level by design). Items are written with PATCH
+    (update) and fall back to POST (create) when the update fails.
+
+    Args:
+        scoring_keys_store_id: id of the scoring_keys ConfigStore.
+        token: Fastly API token passed to every API call.
+
+    Returns:
+        ``{"current_key_hex": <new key>, "previous_key_hex": <old key or "">,
+        "rotated_at": <ISO-8601 UTC timestamp>}`` so the caller can audit.
+
+    Raises:
+        ValueError: ``scoring_keys_store_id`` is empty.
+        RuntimeError: reading the current key failed for any reason other
+            than "not found", or a write to the store failed. The read
+            failure is raised BEFORE anything is written, so a transient
+            API error can no longer discard the old key and skip the
+            grace window.
+    """
+    import datetime as _dt
+    import secrets
+
+    if not scoring_keys_store_id:
+        raise ValueError("scoring_keys_store_id is required")
+
+    item_base = f"/resources/stores/config/{scoring_keys_store_id}/item"
+
+    # Fetch current to move it into previous_key_hex slot. Only a missing
+    # item (404, same convention as delete_scoring_service) is treated as
+    # "no previous key"; any other failure aborts the rotation, because
+    # proceeding would overwrite the current key without preserving it.
+    try:
+        cur_item = fastly("GET", f"{item_base}/{CURRENT_KEY_HEX}", token=token)
+    except RuntimeError as exc:
+        if "404" not in str(exc):
+            raise
+        cur_item = None
+    prev_value = (cur_item or {}).get("item_value", "") or ""
+
+    new_key = secrets.token_hex(32)
+    rotated_at = _dt.datetime.now(_dt.UTC).isoformat(timespec="seconds")
+
+    # PATCH updates an existing item. If previous_key_hex doesn't exist
+    # yet (first rotation ever), PATCH 404s — fall back to POST. If the
+    # POST fails as well, its error propagates to the caller.
+    def _upsert_item(key: str, value: str) -> None:
+        try:
+            fastly(
+                "PATCH",
+                f"{item_base}/{key}",
+                {"item_value": value},
+                token=token,
+            )
+        except Exception:
+            fastly(
+                "POST",
+                item_base,
+                {"item_key": key, "item_value": value},
+                token=token,
+            )
+
+    # Write previous before current so the old key is already in its grace
+    # slot by the time the new key becomes current.
+    if prev_value:
+        _upsert_item(PREVIOUS_KEY_HEX, prev_value)
+    _upsert_item(CURRENT_KEY_HEX, new_key)
+    ok(f"Rotated AES key at {rotated_at} (previous_key preserved for grace window)")
+
+    return {
+        "current_key_hex": new_key,
+        "previous_key_hex": prev_value,
+        "rotated_at": rotated_at,
+    }
+
+
+def delete_scoring_service(
+    scoring_service_id: str,
+    *,
+    scoring_keys_store_id: str = "",
+    scoring_config_store_id: str = "",
+    token: str,
+    status_cb=None,
+) -> None:
+    """Tear down the Compute service AND both ConfigStores. Idempotent —
+    deleting an already-deleted resource is a no-op.
+
+    Order: service first (deactivate → delete), then stores. Service must
+    go first because the resource-link tying the stores to the service
+    will block store-deletion otherwise.
+
+    Args:
+        scoring_service_id: id of the scoring Compute service. When empty
+            the function warns and returns without doing anything.
+        scoring_keys_store_id: id of the scoring_keys store; skipped when
+            empty.
+        scoring_config_store_id: id of the scoring_config store; skipped
+            when empty.
+        token: Fastly API token passed to every API call.
+        status_cb: Optional callable invoked with human-readable progress
+            strings; skipped when falsy.
+
+    Raises:
+        RuntimeError: deleting the service failed with anything other
+            than a 404. Store-deletion failures are only warned about.
+    """
+    if not scoring_service_id:
+        warn("delete_scoring_service called with empty service id — nothing to do")
+        return
+
+    info(f"Tearing down scoring service {_c(BOLD, scoring_service_id)}")
+    if status_cb:
+        status_cb(f"⏳ Tearing down scoring service '{scoring_service_id}'...")
+
+    # 1. Deactivate any active versions so we can delete the service.
+    try:
+        versions = fastly("GET", f"/service/{scoring_service_id}/version", token=token) or []
+        for v in versions:
+            if v.get("active"):
+                if status_cb:
+                    status_cb(f"⏳ Deactivating version {v['number']}...")
+                fastly(
+                    "PUT",
+                    f"/service/{scoring_service_id}/version/{v['number']}/deactivate",
+                    token=token,
+                )
+    except RuntimeError as exc:
+        # A 404 means the service is probably already gone. Do NOT return
+        # here: a previous run may have deleted the service but not the
+        # stores, and returning early would orphan them. Step 2 tolerates
+        # the 404 and step 3 still cleans up the stores.
+        if "404" not in str(exc):
+            # fall through; delete still might work
+            warn(f"Failed to deactivate versions (will try delete anyway): {exc}")
+
+    # 2. Delete the service.
+    try:
+        fastly("DELETE", f"/service/{scoring_service_id}", token=token, expect_empty=True)
+        ok("Scoring service deleted")
+    except RuntimeError as exc:
+        if "404" in str(exc):
+            ok("Scoring service already deleted")
+        else:
+            raise
+
+    # 3. Delete the config stores. Each delete is tolerant of "already
+    #    deleted" so this is safe to re-run; other failures are downgraded
+    #    to warnings so one stuck store does not block the other.
+    for label, store_id in (
+        ("scoring_keys", scoring_keys_store_id),
+        ("scoring_config", scoring_config_store_id),
+    ):
+        if not store_id:
+            continue
+        try:
+            fastly(
+                "DELETE",
+                f"/resources/stores/config/{store_id}",
+                token=token,
+                expect_empty=True,
+            )
+            ok(f"Deleted {label} store ({store_id})")
+        except RuntimeError as exc:
+            if "404" in str(exc):
+                ok(f"{label} store already deleted")
+            else:
+                warn(f"Could not delete {label} store {store_id}: {exc}")
+
+    if status_cb:
+        status_cb("✅ Scoring service torn down.")
diff --git a/backend/provision/session_scoring_vcl.py b/backend/provision/session_scoring_vcl.py
new file mode 100644
index 00000000..bc3dbf4b
--- /dev/null
+++ b/backend/provision/session_scoring_vcl.py
@@ -0,0 +1,478 @@
+"""VCL snippet generator for the session-scoring restart pattern.
+
+Adapted from the canonical Fastly preflight pattern (fiddle 4b1a74ee).
+Six snippets — recv / pass / fetch / miss / deliver / enforce — coordinate to:
+
+ 1. recv: on first pass, route to the scorer Compute backend with
+ X-Edge-Scoring-Pass=1, return(pass).
+ 2. pass: inject the auth + service-id headers on bereq for the
+ upcoming scorer sub-fetch (pass is the correct subroutine
+ for bereq header mutations under return(pass)).
+ 3. fetch: when the backend is the scorer, return(deliver) to skip
+ cache + go straight to deliver with the scorer response.
+ 4. deliver: pass-1 captures all seven scorer values (score, l1, l2,
+ compliance, reason, sid, enforce) + the rotated Set-Cookie
+ into subfields of req.http.x-edge-score (single consolidated
+ header — eight subfields total), unsets the resp.http
+ .x-edge-* leaks, and issues a naked `restart`. pass-2 emits
+ the rotated cookie via `add resp.http.Set-Cookie` (additive
+ — preserves any origin cookies).
+ 5. miss: unset bereq.http.x-edge-score + X-Edge-Scoring-Pass so
+ neither leaks to the real origin on pass 2.
+  6. enforce: on the post-scoring restart, error with the configured
+               status (default 429) when the scorer emitted
+               X-Edge-Score-Enforce=1 (enforce_threshold committed and met).
+
+**Storage strategy.** All seven scoring values (plus the rotated
+Set-Cookie) are stored as SUBFIELDS of ``req.http.x-edge-score`` —
+single consolidated header keeps the per-request header budget small.
+Log format reads the subfields via ``req.http.x-edge-score:score`` etc.
+
+**Why restart from vcl_deliver.** Empirically (v440), req.http
+modifications made in vcl_fetch before `return(restart)` are invisible
+to Fastly's log-format evaluator. Restarting from vcl_deliver after
+writing the subfields is the working pattern.
+
+Fail-open contract: any error reaching the scorer (5xx, timeout, DNS
+failure) sets ``req.http.x-edge-score:score = "0"`` and
+``req.http.x-edge-score:compliance = "unknown"`` so the request flows
+normally to origin and the log line still has populated score fields
+(vs. NULLs that look like a misconfiguration).
+"""
+
+from __future__ import annotations
+
+# Backend name handling. Fastly's API creates a backend whose VCL-visible
+# name is "F_" + the raw name you submitted. So:
+# - SCORING_BACKEND_API_NAME is what we POST to /backend → "session_scorer"
+# - SCORING_BACKEND_VCL_NAME is what VCL sees → "F_session_scorer"
+SCORING_BACKEND_API_NAME = "session_scorer"
+SCORING_BACKEND_VCL_NAME = f"F_{SCORING_BACKEND_API_NAME}"
+
+# Snippet names. Stable string constants so disable_scoring can find and
+# remove the exact snippets by name. Fastly only accepts
+# [A-Za-z0-9_. -] in snippet names — no colons, slashes, or other
+# punctuation.
+SCORING_RECV_NAME = "Session Scoring - Recv"
+SCORING_PASS_NAME = "Session Scoring - Pass"
+SCORING_FETCH_NAME = "Session Scoring - Fetch"
+SCORING_DELIVER_NAME = "Session Scoring - Deliver"
+SCORING_MISS_NAME = "Session Scoring - Miss"
+SCORING_ENFORCE_NAME = "Session Scoring - Enforce"
+
+# Snippet priority — lower runs first. 100 is the "after everything
+# else" slot used by most user snippets on this service.
+SCORING_SNIPPET_PRIORITY = 100
+
+# vcl_fetch needs a low priority specifically — when the backend is
+# the scorer, we want `return(deliver)` to fire IMMEDIATELY, before
+# any other fetch-stage snippet (group-L timing, custom origin field
+# captures, etc.) gets a chance to run against the scorer's response.
+# Priority 1 puts us first in the fetch subroutine.
+SCORING_FETCH_PRIORITY = 1
+
+
+# Default asset-extension regex: requests whose URL matches this regex
+# bypass the scorer entirely. Static assets carry no session signal and
+# routing them through Compute is wasted cost + capacity.
+#
+# This is the DEFAULT. Operators can override it per-service via the
+# Session Scoring admin page; the operator-supplied value lives in the
+# service config under ``scoring.exclude_url_regex`` and is interpolated
+# into the recv snippet by ``recv_snippet`` below. An empty / unset
+# override falls back to this default.
+DEFAULT_ASSET_EXT_REGEX = (
+ # Anchored at the start AND restricted to the path segment via
+ # ``[^?]*`` (any non-``?`` chars). Without the anchor + path-only
+ # restriction, ``/api/login?file=.png`` would also match — the
+ # extension test would see ``.png`` in the query string and skip
+ # scoring entirely, letting an attacker bypass session scoring on
+ # any dynamic endpoint by appending an asset extension to the
+ # query string. The fix bounds the match to the URL path.
+ r"^[^?]*"
+ r"\.(aif|aiff|au|avi|bin|bmp|cab|carb|cct|cdf|class|css|dcr|doc|"
+ r"dtd|exe|flv|gcf|gff|gif|grv|hdml|hqx|ico|ini|jpeg|jpg|js|mov|"
+ r"mp3|mp4|nc|pct|pdf|png|ppc|pws|svg|swa|swf|txt|vbs|w32|wav|"
+ r"wbmp|wml|wmlc|wmls|wmlsc|xsd|zip|webp|woff|woff2|ttf|bz2|gz|"
+ r"tgz|tar|pem|cer|sql|xml|dat|pub|log|json|md|bak|rar|eml|lzma|"
+ r"war|bz|7z|ts|m3u8)($|\?)"
+)
+
+# Backwards-compat alias for tests / external callers that referenced
+# the old name before the override path landed.
+_ASSET_EXT_REGEX = DEFAULT_ASSET_EXT_REGEX
+
+
+def resolve_exclude_url_regex(operator_override: str | None) -> str:
+    """Return the effective URL-exclusion regex for the recv snippet.
+
+    A non-blank override wins (returned stripped); ``None``, ``""`` and
+    whitespace-only overrides all yield ``DEFAULT_ASSET_EXT_REGEX``.
+    """
+    if operator_override is not None:
+        trimmed = operator_override.strip()
+        if trimmed:
+            return trimmed
+    return DEFAULT_ASSET_EXT_REGEX
+
+
+# Default HTTP status code returned by the enforce snippet when the scorer
+# flags a request. Operator-overridable via cfg.scoring.enforce_status_code;
+# bake-into-VCL at deploy so each change does a snippet swap (see
+# update_enforce_status_code orchestrator) rather than needing a
+# ConfigStore-to-VCL binding for a value that changes rarely.
+DEFAULT_ENFORCE_STATUS_CODE = 429
+
+# Allowed range — anything outside 4xx/5xx makes no sense for "reject".
+_ENFORCE_STATUS_CODE_MIN = 400
+_ENFORCE_STATUS_CODE_MAX = 599
+
+
+def enforce_reason_phrase(status_code: int) -> str:
+    """HTTP reason phrase for the enforce snippet's synthetic body.
+
+    Delegates to Python's ``http.HTTPStatus`` so any IANA-registered code
+    yields its canonical phrase (403 → "Forbidden", 451 → "Unavailable
+    For Legal Reasons", 511 → "Network Authentication Required", …).
+    Non-standard codes the operator might pick (419, 444, 530, 599) fall
+    back to ``"Blocked"`` — keeps the synthetic body meaningful even when
+    the stdlib map doesn't know the code."""
+    import http
+
+    try:
+        return http.HTTPStatus(status_code).phrase
+    except ValueError:
+        return "Blocked"
+
+
+def resolve_enforce_status_code(operator_override: int | None) -> int:
+    """Pick the effective enforce status code. None / invalid → default.
+
+    The PUT endpoint validates input before persistence, so a value that
+    is non-numeric, non-finite (``int(inf)`` raises OverflowError) or
+    outside 400-599 means a stale/corrupted cfg — use the safe default."""
+    if operator_override is None:
+        return DEFAULT_ENFORCE_STATUS_CODE
+    try:
+        code = int(operator_override)
+    except (TypeError, ValueError, OverflowError):
+        return DEFAULT_ENFORCE_STATUS_CODE
+    if not (_ENFORCE_STATUS_CODE_MIN <= code <= _ENFORCE_STATUS_CODE_MAX):
+        return DEFAULT_ENFORCE_STATUS_CODE
+    return code
+
+
+def recv_snippet(
+    logging_service_id: str,
+    request_secret: str,
+    *,
+    exclude_url_regex: str | None = None,
+) -> str:
+    """vcl_recv snippet: at the EDGE on the first pass (no shield hop yet
+    AND req.restarts == 0 AND scoring-pass marker not set AND
+    fastly.ddos_detected false AND URL doesn't match the exclusion
+    regex), route to the scorer Compute backend with
+    X-Edge-Scoring-Pass=1, `return(pass)`. After the scoring restart
+    (req.restarts == 1, score captured), re-enable shielding for the
+    real-origin pass so the object can be served from the shield POP.
+
+    ``exclude_url_regex`` is the operator-supplied regex of URLs to
+    SKIP from scoring. None / blank falls back to ``DEFAULT_ASSET_EXT_REGEX``.
+    It is matched against ``std.tolower(req.url)``, so write it lowercase.
+    The caller (orchestrator) must have validated it via
+    backend.utils.vcl_validator BEFORE getting here — this function
+    substitutes it verbatim into the VCL boolean expression.
+
+    ``request_secret`` is also baked into the edge/shield boundary check
+    (``req.http.X-Edge-Shield-Auth != "{request_secret}"``). The original
+    boundary used ``fastly.ff.visits_this_service == 0``, which an
+    attacker could flip by setting their own ``Fastly-FF`` header
+    (Fastly's edge propagates the value verbatim to the next hop). The
+    secret-comparison form fails closed: only the edge's own pass/miss
+    subroutines (which know the secret because it's literally baked into
+    their VCL bodies) can set the header to a value that satisfies the
+    check.
+
+    Note: `logging_service_id` is unused here; it is kept as an argument
+    for symmetry with peer snippet generators."""
+    _ = logging_service_id
+    effective_regex = resolve_exclude_url_regex(exclude_url_regex)
+    return f"""# Session Scoring: client-edge header scrub (anti-spoofing).
+# Edge-only — see X-Edge-Shield-Auth note below — so any client-supplied
+# X-Edge-* gets stripped before it can be forged into a clean score.
+# Also strip the X-Edge-Shield-Auth header itself so a client cannot
+# pre-set it and skip our edge-only protections.
+if (req.restarts == 0 && req.http.X-Edge-Shield-Auth != "{request_secret}") {{
+  unset req.http.X-Edge-Shield-Auth;
+  unset req.http.X-Edge-Scoring-Pass;
+  unset req.http.X-Edge-Score;
+  unset req.http.X-Edge-Score-Reason;
+  unset req.http.X-Edge-Score-Enforce;
+  unset req.http.X-Edge-Sid;
+  unset req.http.X-Edge-Score-Set-Cookie;
+}}
+
+# Session Scoring: route the first-pass dynamic request to the scorer.
+# Edge-only — the pass/miss snippets set X-Edge-Shield-Auth on the
+# bereq going to the shield, so the shield's vcl_recv reads back a
+# matching secret and skips this block. An attacker who tries to spoof
+# the boundary by sending their own Fastly-FF header cannot satisfy the
+# secret comparison, so the edge-only logic still runs on their hop and
+# they get scored / scrubbed normally.
+#
+# DDoS bypass (fastly.ddos_detected): when Fastly's L7 DDoS detection
+# flags this request, do NOT route to Compute. Two reasons:
+#   1. Cost ceiling — under attack, Compute invocations scale linearly
+#      with attack volume. Skipping flagged requests caps the blast
+#      radius while NGWAF / Fastly's mitigation handles the actual block.
+#   2. Signal quality — the scorer's L2 transition matrix learns from
+#      benign traffic shapes; feeding attack traffic in pollutes the
+#      matrix even though those scores wouldn't be acted on.
+# See: https://www.fastly.com/documentation/reference/vcl/variables/miscellaneous/fastly-ddos-detected/
+if (req.http.X-Edge-Shield-Auth != "{request_secret}" && req.restarts == 0 && req.http.X-Edge-Scoring-Pass != "1" && !fastly.ddos_detected && std.tolower(req.url) !~ "{effective_regex}") {{
+  set req.backend = {SCORING_BACKEND_VCL_NAME};
+  set req.http.X-Edge-Scoring-Pass = "1";
+  # PASS — skip cache for the scoring sub-fetch. On the post-restart
+  # pass the scoring snippet doesn't re-fire because X-Edge-Scoring-Pass
+  # got unset in pass-1 deliver and req.restarts is now 1.
+  return(pass);
+}}
+
+# Post-scoring restart: we captured the score in pass-1 deliver and the
+# request flow is now headed for the real origin. Without this block,
+# the previous `return(pass)` would have permanently disabled shielding
+# for this request — re-enable it so the real-origin fetch can land on
+# the shield POP normally. `var.fastly_req_do_shield` is the magic
+# variable Fastly's auto-generated main VCL reads to decide whether to
+# shield the request.
+if (req.restarts == 1 && req.http.x-edge-score) {{
+  set var.fastly_req_do_shield = true;
+}}"""
+
+
+def pass_snippet(logging_service_id: str, request_secret: str) -> str:
+    """vcl_pass snippet: when this is the scoring sub-fetch (backend ==
+    scorer), inject the auth + service-id headers on bereq and drop the
+    internal X-Edge-Scoring-Pass marker. Unconditionally, also unset
+    bereq.http.x-edge-score so an attacker-supplied inbound value is not
+    echoed to the backend.
+
+    Also stamps ``bereq.http.X-Edge-Shield-Auth = "{request_secret}"``
+    on every pass — this is what the shield POP's vcl_recv reads back to
+    decide "skip edge-only blocks because this hop already ran them on
+    the edge". The secret is only ever set by pass_snippet /
+    miss_snippet (compiled into our VCL, never sent to clients).
+    NOTE(review): the stamp is outside the scorer ``if``, so it rides on
+    every pass bereq, whatever the backend — confirm that is intended."""
+    return f"""# Session Scoring: inject auth + service-id on the scorer sub-fetch.
+# vcl_pass is the right subroutine for bereq mutations when recv used
+# return(pass).
+if (req.backend == {SCORING_BACKEND_VCL_NAME}) {{
+  set bereq.http.X-Edge-Service-Id = "{logging_service_id}";
+  # Shared-secret header — the scorer compares this to the
+  # request_secret ConfigStore entry and 401s on mismatch. Embedded
+  # literally in VCL which is compiled and never sent to clients.
+  set bereq.http.X-Edge-Scorer-Auth = "{request_secret}";
+  # X-Edge-Scoring-Pass is an internal marker; the scorer doesn't need
+  # to see it and we don't want it polluting any downstream telemetry.
+  unset bereq.http.X-Edge-Scoring-Pass;
+}}
+# Strip any inbound x-edge-score header an attacker may have set; the
+# real one is built by us in vcl_deliver after the scorer responds.
+unset bereq.http.x-edge-score;
+# Shield-auth marker — the shield POP's vcl_recv reads this back via
+# req.http.X-Edge-Shield-Auth and skips the edge-only branches when it
+# matches. Unspoofable from outside because the secret is baked into
+# the compiled VCL and never returned to clients.
+set bereq.http.X-Edge-Shield-Auth = "{request_secret}";"""
+
+
+def fetch_snippet() -> str:
+    """Build the vcl_fetch snippet that short-circuits the scorer sub-fetch.
+
+    When ``req.backend`` is the scorer the snippet issues ``return(deliver)``
+    (recv's return(pass) already prevents caching; this is the preflight shape)."""
+    scorer_backend = SCORING_BACKEND_VCL_NAME
+    return f"""# Session Scoring: skip cache handling for the scorer sub-fetch.
+if (req.backend == {scorer_backend}) {{
+  return(deliver);
+}}"""
+
+
+def deliver_snippet(request_secret: str) -> str:
+    """vcl_deliver snippet — the heart of the pattern.
+
+    PASS 1 (X-Edge-Scoring-Pass == "1"): on a 200 from the scorer, stash
+    its seven values (score, l1, l2, compliance, reason, sid, enforce)
+    into req.http.x-edge-score subfields; on any other status fail open
+    with score/l1/l2 "0", compliance "unknown" and reason
+    "compute-unavailable" (sid and enforce are not written). Both
+    branches then stash resp.http.Set-Cookie into the :set-cookie
+    subfield, scrub resp.http.x-edge-* (anti-leak) and naked `restart`.
+
+    PASS 2 (X-Edge-Scoring-Pass already gone): at the edge only (no
+    matching ``request_secret`` in X-Edge-Shield-Auth), the stashed
+    cookie is emitted via `add resp.http.Set-Cookie` (additive).
+
+    The subfield writes persist on req.http across the restart; the log
+    format reads req.http.x-edge-score:score etc."""
+    return f"""# Session Scoring: pass-1 stash + naked restart; pass-2 emit cookie.
+
+# ── PASS 1: capture scorer response into req.http.x-edge-score subfields ──
+if (req.http.X-Edge-Scoring-Pass == "1") {{
+  unset req.http.X-Edge-Scoring-Pass;
+  if (resp.status == 200) {{
+    set req.http.x-edge-score:score = resp.http.x-edge-score;
+    set req.http.x-edge-score:l1 = resp.http.x-edge-score-l1;
+    set req.http.x-edge-score:l2 = resp.http.x-edge-score-l2;
+    set req.http.x-edge-score:compliance = resp.http.X-Edge-Cookie-Compliance;
+    set req.http.x-edge-score:reason = resp.http.x-edge-score-reason;
+    # Hex-encoded 6-byte session id. Used by the admin labeling UI to
+    # target individual sessions; the scorer issues a fresh sid when
+    # the inbound cookie is missing/tampered.
+    set req.http.x-edge-score:sid = resp.http.x-edge-sid;
+    # Enforcement signal — set by the Rust scorer when the operator
+    # has committed enforce_threshold to the scoring_config ConfigStore
+    # AND the request's score met it. Captured here so the recv-
+    # restart-2 Enforce snippet can read it via subfield.
+    set req.http.x-edge-score:enforce = resp.http.x-edge-score-enforce;
+  }} else {{
+    # Scorer returned non-200 — fail open. No cookie to rotate.
+    set req.http.x-edge-score:score = "0";
+    set req.http.x-edge-score:l1 = "0";
+    set req.http.x-edge-score:l2 = "0";
+    set req.http.x-edge-score:compliance = "unknown";
+    set req.http.x-edge-score:reason = "compute-unavailable";
+  }}
+  # Stash the rotated cookie as a subfield too; pass-2 reads it back
+  # and emits via add resp.http.Set-Cookie.
+  set req.http.x-edge-score:set-cookie = resp.http.Set-Cookie;
+  # Anti-leak: strip the scorer's resp.http.x-edge-* headers so they
+  # don't reach the client even if the restart path were to short-
+  # circuit somehow.
+  unset resp.http.x-edge-score;
+  unset resp.http.x-edge-score-l1;
+  unset resp.http.x-edge-score-l2;
+  unset resp.http.x-edge-score-reason;
+  unset resp.http.x-edge-sid;
+  unset resp.http.X-Edge-Cookie-Compliance;
+  unset resp.http.X-Edge-Matrix-Version;
+  unset resp.http.x-edge-score-enforce;
+  restart;
+}}
+
+# ── PASS 2: real origin response — emit the rotated cookie additively ──
+# Only emit at the EDGE. We detect "this is the edge hop" via the
+# absence of the shield-auth secret on req.http.X-Edge-Shield-Auth;
+# the shield POP receives that header from us (set in pass/miss), so
+# the shield sees the match and skips this block. A spoofed
+# Fastly-FF header cannot fake the secret, so attacker-induced
+# duplicate Set-Cookie emission is no longer possible.
+if (req.http.X-Edge-Shield-Auth != "{request_secret}" && req.http.x-edge-score:set-cookie != "") {{
+  add resp.http.Set-Cookie = req.http.x-edge-score:set-cookie;
+}}"""
+
+
+def enforce_snippet(request_secret: str, status_code: int = DEFAULT_ENFORCE_STATUS_CODE) -> str:
+    """vcl_recv snippet that rejects requests the scorer flagged as over-threshold.
+
+    The emitted VCL fires only when all three hold: X-Edge-Shield-Auth
+    does not equal ``request_secret`` (edge hop), ``req.restarts == 1``
+    (the post-scoring restart), and the ``enforce`` subfield of
+    req.http.x-edge-score is "1" (captured by the deliver snippet from
+    the scorer's X-Edge-Score-Enforce header). The shield-auth
+    comparison replaces the original ``fastly.ff.visits_this_service
+    == 0`` check, which an attacker could flip by sending their own
+    ``Fastly-FF`` header.
+
+    It uses ``error <code> "<reason>"`` rather than a `synth`, which
+    keeps the door open for a custom vcl_error page later.
+
+    ``status_code`` defaults to 429 (Too Many Requests); it is passed
+    through ``resolve_enforce_status_code`` (out-of-range → 429) and the
+    reason phrase comes from ``enforce_reason_phrase``."""
+    effective_code = resolve_enforce_status_code(status_code)
+    phrase = enforce_reason_phrase(effective_code)
+    vcl_lines = [
+        "# Session Scoring: enforce committed threshold by erroring flagged requests.",
+        f"# Status code ({effective_code} {phrase}) is operator-configurable via",
+        "# cfg.scoring.enforce_status_code; the update_enforce_status_code",
+        "# orchestrator swaps this snippet on change. Default 429.",
+        "# Fires only on the post-scoring restart (req.restarts == 1) when the",
+        "# deliver pass-1 captured X-Edge-Score-Enforce=1 from the scorer.",
+        "# Edge-only (unspoofable shield-auth comparison) so shield hops don't double-enforce.",
+        f'if (req.http.X-Edge-Shield-Auth != "{request_secret}" && req.restarts == 1 && req.http.x-edge-score:enforce == "1") {{',
+        f'  error {effective_code} "{phrase}";',
+        "}",
+    ]
+    return "\n".join(vcl_lines)
+
+
+def miss_snippet(request_secret: str) -> str:
+    """vcl_miss snippet: defensive unsets. Strip inbound x-edge-score
+    (attacker could try to forge it) and X-Edge-Scoring-Pass (don't
+    leak the internal marker to the real origin on pass-2 fetch).
+
+    Also stamps ``bereq.http.X-Edge-Shield-Auth = "{request_secret}"``
+    on every miss-driven backend fetch so the shield POP's vcl_recv
+    sees the secret and skips edge-only blocks; otherwise a cacheable
+    miss flow would reach the shield unmarked and be double-scored.
+    NOTE(review): the stamp is unconditional, whatever the fetch target
+    — confirm the secret may also be sent to the real origin."""
+    return f"""# Session Scoring: strip internal scoring headers before forwarding to
+# the real origin. x-edge-score could be attacker-supplied; the
+# X-Edge-Scoring-Pass marker is internal-only.
+unset bereq.http.x-edge-score;
+unset bereq.http.X-Edge-Scoring-Pass;
+# Shield-auth marker — match pass_snippet so the shield POP recognises
+# this hop as edge-originated and skips re-running edge-only blocks.
+set bereq.http.X-Edge-Shield-Auth = "{request_secret}";"""
+
+
+def generate_scoring_vcl(
+    logging_service_id: str,
+    request_secret: str,
+    *,
+    exclude_url_regex: str | None = None,
+    enforce_status_code: int | None = None,
+) -> dict[str, str]:
+    """Build every session-scoring snippet, keyed by its snippet name.
+
+    Returns a ``{snippet_name: vcl_body}`` dict with six entries (recv,
+    pass, fetch, deliver, miss, enforce). The caller hands each pair to
+    ``ensure_vcl_snippet`` individually.
+
+    ``logging_service_id`` is forwarded to the recv and pass generators.
+    ``request_secret`` is the shared secret baked into the recv, pass,
+    deliver, miss and enforce bodies.
+    ``exclude_url_regex`` is the operator's override of the recv
+    snippet's URL-exclusion regex (None / "" → default); it is expected
+    to be pre-validated by backend.utils.vcl_validator at the API layer.
+    ``enforce_status_code`` is the operator's override of the enforce
+    snippet's HTTP status; None / out-of-range → default (429).
+    """
+    snippets: dict[str, str] = {}
+    snippets[SCORING_RECV_NAME] = recv_snippet(
+        logging_service_id, request_secret, exclude_url_regex=exclude_url_regex
+    )
+    snippets[SCORING_PASS_NAME] = pass_snippet(logging_service_id, request_secret)
+    snippets[SCORING_FETCH_NAME] = fetch_snippet()
+    snippets[SCORING_DELIVER_NAME] = deliver_snippet(request_secret)
+    snippets[SCORING_MISS_NAME] = miss_snippet(request_secret)
+    effective_status = resolve_enforce_status_code(enforce_status_code)
+    snippets[SCORING_ENFORCE_NAME] = enforce_snippet(request_secret, effective_status)
+    return snippets
+
+
+def scoring_snippet_names() -> list[str]:
+    """List the six installed snippet names (disable_scoring removes by name)."""
+    installed = (
+        SCORING_RECV_NAME,
+        SCORING_PASS_NAME,
+        SCORING_FETCH_NAME,
+        SCORING_DELIVER_NAME,
+        SCORING_MISS_NAME,
+        SCORING_ENFORCE_NAME,
+    )
+    return list(installed)
diff --git a/backend/repositories/_base.py b/backend/repositories/_base.py
index d818fdd8..2795f369 100644
--- a/backend/repositories/_base.py
+++ b/backend/repositories/_base.py
@@ -8,10 +8,39 @@
from __future__ import annotations
import contextlib
+import re
import time
+from typing import Any
import duckdb
+# Compiled once at import; used on every ``runner.execute`` invocation.
+_PARQUET_LIST_RE = re.compile(r"read_parquet\(\[\s*('[^']+'\s*(?:,\s*'[^']+'\s*)*)\]")
+
+
+def _compact_sql_for_debug(sql: str) -> str:
+    """Shrink explicit ``read_parquet([...])`` path lists for the debug panel.
+
+    Every quoted-path list matched by ``_PARQUET_LIST_RE`` is rewritten
+    to ``read_parquet([N files]``, where N is the number of quoted
+    paths. Everything else — including whatever follows the closing
+    ``]`` — is returned untouched; SQL without such a list comes back
+    unchanged.
+
+    Rationale: the dashboard's per-request SQL embeds hundreds of
+    buffer/rollup parquet paths in a single ``read_parquet`` call.
+    Shipped verbatim, they made ``_debug_queries`` ~220 KB of the
+    response (60% of total) whenever ``DEBUG_RESPONSES=true``. A human
+    reading the debug panel wants the file count and the SQL shape,
+    not the path list.
+    """
+
+    def _file_count(hit: re.Match) -> int:
+        # Each path contributes exactly one pair of quotes.
+        return hit.group(1).count("'") // 2
+
+    return _PARQUET_LIST_RE.sub(lambda hit: f"read_parquet([{_file_count(hit)} files]", sql)
+
@contextlib.contextmanager
def _attach_sqlite(con: duckdb.DuckDBPyConnection, sqlite_path: str, alias: str):
@@ -81,17 +110,7 @@ def _get_schema(con: duckdb.DuckDBPyConnection, src: dict) -> list[dict]:
return get_schema(con, src)
-def safe_iso(dt) -> str | None:
- """Normalise a datetime or string to an ISO-8601 string ending in Z."""
- if dt is None:
- return None
- if hasattr(dt, "isoformat"):
- s = dt.isoformat()
- # Append Z for naive UTC datetimes that lack a tz suffix
- if not s.endswith("Z") and "+" not in s and s.count("-") <= 2:
- s += "Z"
- return s
- return str(dt)
+from backend.utils.date_utils import safe_iso # noqa: E402, F401 — re-export
def _is_stale_view_error(e: Exception) -> bool:
@@ -236,10 +255,45 @@ def debug_calls(self) -> list[dict]:
return []
def execute(self, q: str, p: list | None = None):
-        """Execute a query and track its execution time."""
+        """Execute a query and track its execution time.
+
+        Self-heals on stale-view errors: if the connection's bound view
+        references a buffer parquet file that no longer exists (the sync
+        cron deleted it between the view bind and this query), refresh
+        the view once and retry. Belt-and-suspenders alongside the pool's
+        checkout fingerprint — that catches the common case, this catches
+        the race where a commit lands while a query is in flight.
+
+        ``execute_with_retry`` below also does this, but most callers use
+        plain ``execute()``, so the retry lives here too. Nothing stale →
+        one Python try/except, no extra SQL. On a retry the recorded
+        ``time_ms`` spans the failed attempt, the refresh, and the re-run.
+        """
t0 = time.time()
-        res = self.con.execute(q, p if p is not None else [])
-        self.debug_queries.append({"sql": q.strip(), "time_ms": round((time.time() - t0) * 1000, 2)})
+        try:
+            res = self.con.execute(q, p if p is not None else [])
+        except Exception as e:
+            if not _is_stale_view_error(e):
+                raise
+            try:
+                from backend.core import iceberg as db_iceberg
+
+                # force=True skips the fast path. We're already in an
+                # error state because the view's cached SQL referenced a
+                # file that no longer exists on disk; the fast path
+                # would re-execute that same cached SQL (binding it,
+                # which succeeds — but the next query against the view
+                # would re-raise the same IOException). Force-rebuild
+                # reads disk under the lock and regenerates the SQL.
+                db_iceberg.update_iceberg_view(self.con, self.src, force=True)
+            except Exception:
+                # Refresh itself failed — surface the ORIGINAL error so
+                # callers see the real symptom, not the rebind side-effect.
+                raise e
+            res = self.con.execute(q, p if p is not None else [])
+        self.debug_queries.append(
+            {"sql": _compact_sql_for_debug(q.strip()), "time_ms": round((time.time() - t0) * 1000, 2)}
+        )
return res
def get_schema_cols(self) -> list[str]:
@@ -356,6 +410,185 @@ def temp_table(
except Exception:
pass
+    def execute_top_n_rollups(
+        self,
+        fields: list[str],
+        start_time: str | None,
+        end_time: str | None,
+        limit: int = 10,
+    ) -> tuple[list[tuple[str, Any, int]], list[str]]:
+        """Top-N values per field from hourly rollups plus the live hour.
+
+        Completed hours overlapping ``[start_time, end_time)`` are summed
+        from the ``rollups/hour`` parquet files; the current UTC hour comes
+        from the base table when it overlaps the window. Both reads are
+        best-effort (a failure contributes no rows). Returns ``(rows,
+        fields)``; rows are ``(field, value, count)``, ≤ ``limit`` per field.
+        """
+        import os
+        from datetime import UTC, datetime, timedelta
+
+        from backend.core.duckdb import _cache_dir
+        from backend.core.rollups import _is_safe_ident, _safe_table_for
+        from backend.utils.date_utils import parse_iso_utc
+
+        cache_dir = _cache_dir(self.src)
+        rollup_dir = os.path.join(cache_dir, "rollups", "hour")
+        if not os.path.exists(rollup_dir):
+            return [], fields
+
+        # Defense-in-depth: field names land in rollup directory paths
+        # (``field=<name>``) AND the service name lands in the base-table
+        # identifier. Both should already be safe (FIELDS + validate_custom_field
+        # constrain custom names; service IDs are Fastly-format slugs), but
+        # we re-validate here so a future caller can't pierce the boundary.
+        safe_fields = [f for f in fields if _is_safe_ident(f)]
+        if not safe_fields:
+            return [], fields
+        base_table = _safe_table_for(self.src)
+        if not base_table:
+            # Service name failed the identifier safelist; refuse to query.
+            return [], fields
+
+        # Parse bounds
+        st_dt = parse_iso_utc(start_time) if start_time else None
+        et_dt = parse_iso_utc(end_time) if end_time else None
+
+        active_dt = datetime.now(UTC).replace(minute=0, second=0, microsecond=0)
+        active_dt_end = active_dt + timedelta(hours=1)
+        active_str = active_dt.strftime("%Y-%m-%d-%H")
+
+        # Glob `rollups/hour/**/*.parquet` was the obvious shape but it has
+        # DuckDB enumerate every file under the tree before the WHERE clause
+        # can prune ANYTHING. On a service with N fields × H hours of rollups
+        # that's N*H file stats up front, dominating wall time (witnessed
+        # 2026-06-04: ~2.8s on 18,648 files for a 24h query that should be
+        # reading ~1,700). Hive-partition pruning kicks in AFTER the glob
+        # expands, not before.
+        #
+        # Instead: enumerate the exact (field, hour) combinations we want in
+        # Python (cheap directory listdir per field, bounded by safe_fields ×
+        # hours-in-window), then pass DuckDB an explicit file list. Skips
+        # the glob, hands DuckDB only the files it needs.
+        st_str_floor = st_dt.strftime("%Y-%m-%d-%H") if st_dt else None
+        # Half-open semantics: a request ending exactly on an hour
+        # boundary (e.g. ``end_time=2026-06-04T15:00:00``) should
+        # EXCLUDE the 15:00 hour rollup (which covers [15:00, 16:00)).
+        # Subtracting 1 microsecond before strftime keeps mid-hour
+        # ends inclusive of the surrounding hour while making exact
+        # boundaries exclusive — consistent with the live-hour gate
+        # below, which skips the active hour when ``et_dt <= active_dt``.
+        if et_dt:
+            et_str_floor = (et_dt - timedelta(microseconds=1)).strftime("%Y-%m-%d-%H")
+        else:
+            et_str_floor = None
+
+        target_paths: list[str] = []
+        for field in safe_fields:
+            field_dir = os.path.join(rollup_dir, f"field={field}")
+            if not os.path.isdir(field_dir):
+                continue
+            try:
+                hour_entries = os.listdir(field_dir)
+            except OSError:
+                continue
+            for hour_entry in hour_entries:
+                if not hour_entry.startswith("hour="):
+                    continue
+                hour = hour_entry[len("hour=") :]
+                # Lexicographic string compare is correct here because the
+                # YYYY-MM-DD-HH format is fixed-width.
+                if st_str_floor and hour < st_str_floor:
+                    continue
+                if et_str_floor and hour > et_str_floor:
+                    continue
+                if hour >= active_str:
+                    # Active hour is served live, not from rollups.
+                    continue
+                hour_dir = os.path.join(field_dir, hour_entry)
+                try:
+                    for fname in os.listdir(hour_dir):
+                        if fname.endswith(".parquet"):
+                            target_paths.append(os.path.join(hour_dir, fname))
+                except OSError:
+                    continue
+
+        if not target_paths:
+            rolled_res: list = []
+        else:
+            # Inline the explicit path list as a SQL array literal. DuckDB
+            # handles thousands of paths fine in a single statement; the
+            # SQL string size is ~80 bytes/path × few-thousand = a few MB
+            # at worst, well within parser limits. hive_partitioning=1
+            # still lets DuckDB read `field` from the path so the SELECT's
+            # `field` column resolves; `value`/`count` come from parquet
+            # content.
+            paths_sql = ", ".join("'" + p.replace("'", "''") + "'" for p in target_paths)
+            q = f"""
+                SELECT field, value, SUM(count) AS c
+                FROM read_parquet([{paths_sql}], hive_partitioning=1)
+                GROUP BY field, value
+            """
+            try:
+                rolled_res = self.execute(q).fetchall()
+            except Exception:
+                rolled_res = []
+
+        # We also need to get the live active hour stats from the base table
+        live_res = []
+
+        live_where = f"timestamp >= '{active_dt.isoformat()}' AND timestamp < '{active_dt_end.isoformat()}'"
+        # We only query the active hour if it overlaps with the requested time window
+        should_query_live = True
+        if et_dt and et_dt <= active_dt:
+            should_query_live = False
+        if st_dt and st_dt >= active_dt_end:
+            should_query_live = False
+
+        if should_query_live:
+            # Live hour via execute_top_n_batch. NOTE(review): passes ``fields``, not ``safe_fields`` — confirm.
+            try:
+                actual_cols = self.get_schema_cols()
+                from backend.core.duckdb import _get_schema
+
+                schema_types = {col["name"]: col["type"] for col in _get_schema(self.con, self.src)}
+
+                # To prevent creating a massive UNION, we'll create a temp table for just the live hour
+                tmp_name = self.create_filtered_temp_table(fields, actual_cols, base_table, live_where)
+                if tmp_name:
+                    try:
+                        live_res, _ = self.execute_top_n_batch(fields, tmp_name, actual_cols, schema_types, limit=limit)
+                    finally:
+                        try:
+                            self.execute(f"DROP TABLE IF EXISTS {tmp_name}")
+                        except Exception:
+                            pass
+            except Exception:
+                pass
+
+        # Combine rolled and live
+        combined = {}
+        for field, value, count in rolled_res:
+            key = (field, value)
+            combined[key] = combined.get(key, 0) + count
+
+        for field, value, count in live_res:
+            key = (field, value)
+            combined[key] = combined.get(key, 0) + count
+
+        # Bucket by field once, then rank each bucket — avoids rescanning
+        # every (field, value) pair for each requested field.
+        per_field: dict[str, list[tuple[Any, int]]] = {}
+        for (field, value), count in combined.items():
+            per_field.setdefault(field, []).append((value, count))
+
+        top_results = []
+        for field in fields:
+            ranked = sorted(per_field.get(field, []), key=lambda item: item[1], reverse=True)
+            top_results.extend((field, value, count) for value, count in ranked[:limit])
+
+        return top_results, fields
+
def execute_top_n_batch(
self, fields: list[str], table_name: str, actual_cols: list[str], schema_types: dict[str, str], limit: int = 10
) -> tuple[list[tuple], list[str]]:
diff --git a/backend/repositories/alerts.py b/backend/repositories/alerts.py
index 4525fcb9..04de4a4b 100644
--- a/backend/repositories/alerts.py
+++ b/backend/repositories/alerts.py
@@ -45,6 +45,25 @@ def _find_alert_service(alert_id: str) -> str | None:
return None
+def get_alert_by_id(alert_id: str) -> dict | None:
+    """Return the alert row whose id matches ``alert_id``, or ``None``.
+
+    Scans the alerts of every id yielded by ``_all_service_ids()`` (one
+    ``metadata_db.list_alerts`` call per service) and returns a copy of the
+    first row whose ``id`` equals ``alert_id``. The copy always carries a
+    ``service_id`` key: when the stored row lacks one, the id of the service
+    it was found under is stamped on, so a caller's scope check never has
+    to deal with a missing owner (same contract as
+    ``views.get_view_by_id``).
+
+    Security (defense-in-depth): the router-level cross-tenant scope gate
+    (``backend/routers/alerts.py:delete_alert``) is meant to call this
+    BEFORE mutating, so an analyst attempting a cross-tenant delete gets
+    403 and the underlying row stays untouched. The middleware already
+    blocks DELETE on /api/alerts for analysts; this lookup backs the
+    secondary, router-level gate.
+    """
+    for sid in _all_service_ids():
+        for alert in metadata_db.list_alerts(sid, filter_service_id=sid):
+            if alert.get("id") == alert_id:
+                # Copy so stamping the owner never mutates the stored row.
+                found = dict(alert)
+                found.setdefault("service_id", sid)
+                return found
+    return None
+
+
def toggle_alert(alert_id: str, enabled: bool, service_id_hint: str | None = None) -> dict:
"""Toggle an alert. ``service_id_hint`` (from request context) avoids the
cross-service scan when known; falls back to scan when not provided."""
diff --git a/backend/repositories/dashboard.py b/backend/repositories/dashboard.py
index 0fb74532..7d12b93d 100644
--- a/backend/repositories/dashboard.py
+++ b/backend/repositories/dashboard.py
@@ -27,16 +27,31 @@
from backend.repositories.utils.pagination import calc_offset
# ── In-memory caches ──────────────────────────────────────────────────────────
+# Bounded + actively-reaped: dashboard responses can be 30-240MB per entry,
+# and diverse filter/time-range/interval combinations mint a distinct key
+# each. The previous plain-dict version had a 30s TTL but only checked it
+# on hit — stale entries were never evicted, so the cache grew unboundedly
+# across hours of dashboard use (a primary OOM contributor on the 16GB VM).
+# 500 entries × ~30MB ≈ 15GB at the low end of that per-entry range (more at
+# 240MB); the working set is usually far smaller. The cap bounds entry count, not bytes.
+from backend.utils.bounded_cache import BoundedTTLCache
-_dashboard_cache: dict[str, tuple[float, Any]] = {}
DASHBOARD_CACHE_TTL = 30 # seconds
+_dashboard_cache: BoundedTTLCache = BoundedTTLCache(maxsize=500, ttl_seconds=DASHBOARD_CACHE_TTL)
# ── aggregates ────────────────────────────────────────────────────────────────
from backend.core.log_fields import LOG_FIELD_CATALOG
-FIELDS = [f["id"] for f in LOG_FIELD_CATALOG if f["id"] != "_source_file"] + ["waf_sig_ind"]
+# Virtual fields are catalog ids whose value is computed by exploding a
+# real backing column (CSV string) into individual rows via DuckDB's
+# unnest(string_split(...)). They live in the FIELDS list so the dashboard
+# top-N machinery picks them up, but the cross-cutting loops below skip
+# them in batch-stats / column-need passes (their backing column is what
+# actually goes into the temp table).
+_VIRTUAL_FIELDS = ("waf_sig_ind", "edge_score_reason_ind")
+FIELDS = [f["id"] for f in LOG_FIELD_CATALOG if f["id"] != "_source_file"] + list(_VIRTUAL_FIELDS)
def _add_bot_columns(actual_cols: set[str], columns: list[str], select_cols: list[str]) -> tuple[bool, bool]:
@@ -89,9 +104,13 @@ def get_aggregates(
)
cache_key = hashlib.sha256(f"{_key_payload}:{source_name}".encode()).hexdigest()
now = time.time()
- if DASHBOARD_CACHE_TTL > 0 and cache_key in _dashboard_cache:
- cached_at, cached_res = _dashboard_cache[cache_key]
- if now - cached_at < DASHBOARD_CACHE_TTL:
+ if DASHBOARD_CACHE_TTL > 0:
+ # BoundedTTLCache's ``__contains__`` / ``[]`` already enforce TTL
+ # internally, so an entry that reads as present is by definition
+ # still fresh — no need for the legacy ``now - cached_at`` check.
+ cached_entry = _dashboard_cache.get(cache_key)
+ if cached_entry is not None:
+ cached_at, cached_res = cached_entry
cached_res = cached_res.copy()
cached_res["_is_cached"] = True
return cached_res
@@ -122,7 +141,13 @@ def get_aggregates(
if "timestamp" in actual_cols:
needed_cols.add('"timestamp"')
for field in fields:
- if field == "waf_sig_ind":
+ if field in _VIRTUAL_FIELDS:
+ # Virtual fields are exploded from a backing column further
+ # down; make sure the backing column is in the temp table.
+ if field == "waf_sig_ind" and "waf_sig" in actual_cols:
+ needed_cols.add('"waf_sig"')
+ elif field == "edge_score_reason_ind" and "edge_score_reason" in actual_cols:
+ needed_cols.add('"edge_score_reason"')
continue
if field in actual_cols:
needed_cols.add(f'"{field}"')
@@ -146,60 +171,126 @@ def get_aggregates(
needed_cols.add(f'"{mc}"')
cols_str = ", ".join(needed_cols) if needed_cols else "*"
- # Use TEMP TABLE instead of TEMP VIEW to materialize the filtered results in memory.
- # This prevents DuckDB from re-scanning the underlying files for every branch of the UNION ALL.
- temp_table = f"t_{uuid.uuid4().hex}"
- sql = f"CREATE TEMP TABLE {temp_table} AS SELECT {cols_str} FROM {table_name} WHERE {where_clause}"
- if not runner.create_temp_table(sql, params):
- empty = {f: {"top": [], "total": 0} for f in fields}
- return {
- "data": empty,
- "time_series": [],
- "map_data": [],
- "where_clause": "1=1",
- "interval": interval,
- "metric": "requests",
- "total_rows": 0,
- "total_rows_total": 0,
- **runner.telemetry(),
- }
-
- # All subsequent queries use the temp table
- table_name = temp_table
- where_clause = "1=1"
- params = []
+ # Only take the rollup fast-path when no filters AND a populated
+ # rollups tree actually exists on disk. Without the existence check
+ # the dashboard routed unfiltered queries to execute_top_n_rollups
+ # on services where the initial backfill hadn't completed (or in
+ # tests with no rollups built), producing an empty top-N — the field
+ # totals stayed at their zero-initialisers since the populate loop
+ # is gated on a non-empty top-N. Witnessed in
+ # test_get_aggregates_with_data 2026-06-04: 60 mock logs seeded,
+ # field_totals["url"] computed correctly via Q2, but results["url"]
+ # ["total"] stuck at 0 because no rollup row arrived to trigger the
+ # populate path. The temp-table fallback always populates totals.
+ from backend.core.duckdb import _cache_dir as _cache_dir_for_rollups
+
+ rollup_dir = os.path.join(_cache_dir_for_rollups(src), "rollups", "hour")
+ use_rollups = not filters and os.path.isdir(rollup_dir)
+
+ if use_rollups:
+ table_name = _safe_table(source_name)
+ else:
+ # Use TEMP TABLE instead of TEMP VIEW to materialize the filtered results in memory.
+ # This prevents DuckDB from re-scanning the underlying files for every branch of the UNION ALL.
+ temp_table = f"t_{uuid.uuid4().hex}"
+ sql = f"CREATE TEMP TABLE {temp_table} AS SELECT {cols_str} FROM {table_name} WHERE {where_clause}"
+ if not runner.create_temp_table(sql, params):
+ empty = {f: {"top": [], "total": 0} for f in fields}
+ return {
+ "data": empty,
+ "time_series": [],
+ "map_data": [],
+ "where_clause": "1=1",
+ "interval": interval,
+ "metric": "requests",
+ "total_rows": 0,
+ "total_rows_total": 0,
+ **runner.telemetry(),
+ }
+ # All subsequent queries use the temp table
+ table_name = temp_table
+ where_clause = "1=1"
+ params = []
results: dict[str, Any] = {f: {"top": [], "total": 0} for f in fields}
try:
- # Optimization: Combine count(*) and field counts into a single scan
- count_cols: list[str] = [CANONICAL_METRICS["requests"]]
- valid_fields: list[str] = []
- for field in fields:
- if field == "waf_sig_ind":
- continue
- if field in actual_cols:
- count_cols.append(f"count({resolve_col(field, actual_cols)})")
- valid_fields.append(field)
field_totals: dict[str, int] = {}
total_rows = 0
earliest_log_at = None
latest_log_at = None
- if count_cols:
- count_res = runner.execute(f"SELECT {', '.join(count_cols)} FROM {table_name}").fetchone()
- total_rows = count_res[0]
- for i, field in enumerate(valid_fields):
- field_totals[field] = count_res[i + 1]
+
+ if use_rollups:
+ # When the rollup fast-path is active, skip the wide per-column
+ # COUNT entirely. Two reasons it dominated wall time before:
+ # 1. 72 count(col) calls in one statement force DuckDB to
+ # touch every column for every row in the window — ~1s on
+ # prod's 24h × 3M-row view (witnessed 2026-06-04: Q2 was
+ # 1063ms of a 3194ms dashboard).
+ # 2. The output of all 72 counts is reconstructible from the
+ # rollup query's (field, value, count) rows: SUM by field
+ # across the result IS field_totals[field] for any field
+ # the user displays. We pay for it once via the rollup
+ # read instead of twice.
+ #
+ # Caveat: TOP_K per (field, hour) caps the rollup to the 500
+ # most-frequent values per hour. For high-cardinality fields
+ # (timestamp at per-second granularity, or unique-per-request
+ # ids) the SUM under-counts vs the true non-null count. In
+ # practice the dashboard shows top-10 with their percentages;
+ # mild under-counting of the denominator is acceptable for
+ # the perf win. If we ever need exact per-field totals here,
+ # add a `__total__` aggregate row to each rollup parquet.
+ try:
+ total_rows = runner.execute(
+ f"SELECT {CANONICAL_METRICS['requests']} FROM {table_name} WHERE {where_clause}", params
+ ).fetchone()[0]
+ except Exception:
+ total_rows = 0
+ else:
+ # Non-rollups path keeps the wide COUNT — we have the
+ # filtered temp table loaded; one combined scan is cheaper
+ # than re-counting per field downstream.
+ count_cols: list[str] = [CANONICAL_METRICS["requests"]]
+ valid_fields: list[str] = []
+ for field in fields:
+ if field in _VIRTUAL_FIELDS:
+ continue
+ if field in actual_cols:
+ count_cols.append(f"count({resolve_col(field, actual_cols)})")
+ valid_fields.append(field)
+ if count_cols:
+ count_res = runner.execute(
+ f"SELECT {', '.join(count_cols)} FROM {table_name} WHERE {where_clause}", params
+ ).fetchone()
+ total_rows = count_res[0]
+ for i, field in enumerate(valid_fields):
+ field_totals[field] = count_res[i + 1]
orig_table_name = _safe_table(source_name)
total_rows_total, earliest_log_at, latest_log_at = get_source_extent(runner, src, orig_table_name)
schema_types = {col["name"]: col["type"] for col in _get_schema(con, src)}
- batch_fields = [f for f in fields if f != "waf_sig_ind" and f in field_totals]
- all_top_res, field_order = runner.execute_top_n_batch(
- batch_fields, table_name, actual_cols, schema_types, limit=10
- )
+ # When use_rollups=True, field_totals is empty here — populate it
+ # below from the rollup query results. Use the full eligible field
+ # list (anything non-virtual + in schema) as batch_fields; the
+ # rollup helper silently skips fields it has no data for.
+ if use_rollups:
+ batch_fields = [f for f in fields if f not in _VIRTUAL_FIELDS and f in actual_cols]
+ else:
+ batch_fields = [f for f in fields if f not in _VIRTUAL_FIELDS and f in field_totals]
+ if use_rollups:
+ all_top_res, field_order = runner.execute_top_n_rollups(batch_fields, start_time, end_time, limit=10)
+ # Derive field_totals from the rollup result (cheap Python sum).
+ # Each row is (field, value, count). NOTE(review): the helper is called with
+ # limit=10, so this sums only the returned top values per field — verify.
+ for f_name, _f_val, f_count in all_top_res:
+ field_totals[f_name] = field_totals.get(f_name, 0) + int(f_count)
+ else:
+ all_top_res, field_order = runner.execute_top_n_batch(
+ batch_fields, table_name, actual_cols, schema_types, limit=10
+ )
if all_top_res:
# Group results back by field
@@ -228,33 +319,44 @@ def get_aggregates(
results[f_name]["top"].append(entry)
- # Special handling for individual WAF signals (remains separate due to unnest overhead)
- if "waf_sig_ind" in FIELDS:
- if "waf_sig" in actual_cols:
- q = f"""
- WITH split_data AS (
- SELECT trim(signal) AS signal
- FROM (
- SELECT unnest(string_split("waf_sig", ',')) AS signal
- FROM {table_name}
- WHERE "waf_sig" IS NOT NULL AND "waf_sig" != ''
- )
- WHERE trim(signal) != ''
- ),
- total_count AS (SELECT {CANONICAL_METRICS["requests"]} AS tc FROM split_data),
- top_values AS (
- SELECT signal AS value, {CANONICAL_METRICS["requests"]} AS c
- FROM split_data GROUP BY 1 ORDER BY 2 DESC LIMIT 10
+ # Virtual fields: explode comma-separated CSV columns into individual
+ # rows via unnest(string_split(...)). Generalized helper handles both
+ # waf_sig_ind (backed by waf_sig) and edge_score_reason_ind (backed
+ # by edge_score_reason) — same pattern, different backing columns.
+        def _exploded_top_n(virtual_id: str, backing_col: str) -> None:
+            """Fill ``results[virtual_id]`` with the top-10 values of a CSV column.
+
+            ``backing_col`` holds comma-separated values; each row is split
+            with ``unnest(string_split(...))``, items are trimmed, and empty
+            items are dropped before grouping. ``total`` is the requests
+            metric evaluated over all exploded items, not just the top 10.
+
+            Does nothing when ``virtual_id`` was not requested in ``fields``;
+            writes an empty entry when ``backing_col`` is not in the schema.
+            """
+            if virtual_id not in fields:
+                return
+            if backing_col not in actual_cols:
+                results[virtual_id] = {"top": [], "total": 0}
+                return
+            q = f"""
+                WITH split_data AS (
+                    SELECT trim(signal) AS signal
+                    FROM (
+                        SELECT unnest(string_split("{backing_col}", ',')) AS signal
+                        FROM {table_name}
+                        WHERE "{backing_col}" IS NOT NULL AND "{backing_col}" != '' AND {where_clause}
                     )
-                    SELECT tv.value, tv.c, tc.tc FROM top_values tv CROSS JOIN total_count tc
-                """
-                res = runner.execute(q).fetchall()
-                if res:
-                    results["waf_sig_ind"] = {"top": [{"value": r[0], "count": r[1]} for r in res], "total": res[0][2]}
-                else:
-                    results["waf_sig_ind"] = {"top": [], "total": 0}
+                    WHERE trim(signal) != ''
+                ),
+                total_count AS (SELECT {CANONICAL_METRICS["requests"]} AS tc FROM split_data),
+                top_values AS (
+                    SELECT signal AS value, {CANONICAL_METRICS["requests"]} AS c
+                    FROM split_data GROUP BY 1 ORDER BY 2 DESC LIMIT 10
+                )
+                SELECT tv.value, tv.c, tc.tc FROM top_values tv CROSS JOIN total_count tc
+            """
+            # The statement embeds {where_clause}; on the rollup path that is
+            # the caller's original filter, which may carry bind placeholders.
+            # Pass ``params`` like the other queries that embed the clause do
+            # (it is an empty list on the temp-table path, so this is a no-op
+            # there).
+            res = runner.execute(q, params).fetchall()
+            if res:
+                results[virtual_id] = {
+                    "top": [{"value": r[0], "count": r[1]} for r in res],
+                    "total": res[0][2],
+                }
             else:
-                results["waf_sig_ind"] = {"top": [], "total": 0}
+                results[virtual_id] = {"top": [], "total": 0}
+
+ _exploded_top_n("waf_sig_ind", "waf_sig")
+ _exploded_top_n("edge_score_reason_ind", "edge_score_reason")
# Special handling for conn_requests (bucketed histogram)
if "conn_requests" in actual_cols:
@@ -268,7 +370,7 @@ def get_aggregates(
END AS bucket,
{CANONICAL_METRICS["requests"]} AS c
FROM {table_name}
- WHERE "conn_requests" IS NOT NULL AND "conn_requests" > 0
+ WHERE "conn_requests" IS NOT NULL AND "conn_requests" > 0 AND {where_clause}
GROUP BY 1
ORDER BY MIN("conn_requests")
"""
@@ -296,7 +398,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["5xx_rate"]} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric == "4xx" and "status" in actual_cols:
@@ -305,7 +407,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["4xx_rate"]} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric == "hit_rate" and ("cache" in actual_cols or "resp_state" in actual_cols):
@@ -317,7 +419,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{hit_rate_expr} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric.endswith("_latency") and ("elapsed" in actual_cols or "elapsed_us" in actual_cols):
@@ -331,7 +433,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{percentile_ms_expr(sql_elapsed, percentile)} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL AND {sql_elapsed} IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {sql_elapsed} IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric == "throughput" and "resp_bytes" in actual_cols and "elapsed" in actual_cols:
@@ -343,7 +445,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["throughput"].format(cache_col=sql_cache, elapsed_col=sql_elapsed_val, resp_bytes_col=sql_resp_bytes)} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric == "req_size" and any(c in actual_cols for c in ["req_header_bytes", "req_bytes"]):
@@ -354,7 +456,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["req_size"].format(header_bytes_col=header_col, req_bytes_col=body_col)} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
elif chart_metric == "ttfb" and "ttfb" in actual_cols:
@@ -363,7 +465,7 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["ttfb_ms"]} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
else:
@@ -372,11 +474,11 @@ def get_aggregates(
SELECT {time_bucket_select(interval)},
{CANONICAL_METRICS["requests"]} AS value
FROM {table_name}
- WHERE timestamp IS NOT NULL
+ WHERE timestamp IS NOT NULL AND {where_clause}
GROUP BY 1 ORDER BY 1
"""
- ts_res = runner.execute(ts_q, []).fetchall()
+ ts_res = runner.execute(ts_q, params).fetchall()
for r in ts_res:
if r[0] is None:
continue
@@ -388,13 +490,32 @@ def get_aggregates(
# Map data
map_data: list[dict] = []
if "country" in actual_cols:
- map_q = f"""
- SELECT "country" AS country, {CANONICAL_METRICS["requests"]} AS count
- FROM {table_name}
- WHERE "country" IS NOT NULL
- GROUP BY 1
- """
- map_data = [{"country": r[0], "count": r[1]} for r in runner.execute(map_q, []).fetchall()]
+ # When use_rollups is active AND the request asked for country
+ # in its top-N field set, we already have the per-country counts
+ # in all_top_res from the rollup read — re-running the same
+ # GROUP BY on the base view was costing ~140ms of pure
+ # duplication on prod (witnessed 2026-06-04: Q8 = 138ms of a
+ # 1687ms backend total). Derive map_data from all_top_res
+ # instead. NOTE(review): the rollup parquet caps at TOP_K=500 per
+ # (field, hour), but all_top_res was fetched with limit=10, so if the
+ # helper truncates per field this yields at most 10 countries, not
+ # the full distribution — verify before relying on the choropleth.
+ derived = False
+ if use_rollups and any(f == "country" for f, _, _ in all_top_res):
+ country_counts: dict[str, int] = {}
+ for f_name, f_val, f_count in all_top_res:
+ if f_name == "country" and f_val is not None:
+ country_counts[f_val] = country_counts.get(f_val, 0) + int(f_count)
+ map_data = [{"country": k, "count": v} for k, v in country_counts.items()]
+ derived = True
+ if not derived:
+ map_q = f"""
+ SELECT "country" AS country, {CANONICAL_METRICS["requests"]} AS count
+ FROM {table_name}
+ WHERE "country" IS NOT NULL AND {where_clause}
+ GROUP BY 1
+ """
+ map_data = [{"country": r[0], "count": r[1]} for r in runner.execute(map_q, params).fetchall()]
payload: dict[str, Any] = {
"data": results,
@@ -414,10 +535,11 @@ def get_aggregates(
return payload
finally:
- try:
- con.execute(f"DROP TABLE IF EXISTS {temp_table}")
- except Exception:
- pass
+ if not use_rollups:
+ try:
+ con.execute(f"DROP TABLE IF EXISTS {temp_table}")
+ except Exception:
+ pass
# ── raw ───────────────────────────────────────────────────────────────────────
@@ -519,27 +641,16 @@ def get_raw(
records = filtered_records
col_names = columns
+ # Total-rows + extent come from get_source_extent which itself
+ # prefers the cached config status (populated by the sync cron) and
+ # only falls back to a live aggregate when the cache is missing.
+ # The previous inline COUNT/min/max scanned the whole Iceberg
+ # manifest on every dashboard mount — get_source_extent caches the
+ # warm path and skips the scan entirely in steady state.
try:
- from backend import config as svcconfig
-
- cached_status = svcconfig.get_status(src["name"])
- if cached_status:
- total_rows_total = cached_status.get("local_rows", 0)
- earliest_log_at = cached_status.get("earliest_log_at")
- latest_log_at = cached_status.get("latest_log_at")
- else:
- agg_res = runner.execute(
- f"SELECT {CANONICAL_METRICS['requests']}, min(timestamp), max(timestamp) FROM {table_name}"
- ).fetchone()
- if agg_res:
- total_rows_total = agg_res[0]
- earliest_log_at = safe_iso(agg_res[1])
- latest_log_at = safe_iso(agg_res[2])
+ total_rows_total, earliest_log_at, latest_log_at = get_source_extent(runner, src, table_name)
except Exception:
- try:
- total_rows_total = runner.execute(f"SELECT {CANONICAL_METRICS['requests']} FROM {table_name}").fetchone()[0]
- except Exception:
- pass
+ pass
return {
"columns": col_names,
@@ -716,14 +827,21 @@ def get_field_values(
sorted_vals = sorted(bot_counts.values(), key=lambda x: x["count"], reverse=True)
return {"values": sorted_vals[:limit], "field": field, **runner.telemetry()}
- is_signals_individual = field == "waf_sig_ind"
- backing_col = "waf_sig" if is_signals_individual else clean_field
+ # Virtual fields that explode a CSV backing column: filter-lookup
+ # routes through the same unnest path so click-to-filter on a
+ # specific signal / reason works the same as native columns.
+ _VIRTUAL_BACKING = {
+ "waf_sig_ind": "waf_sig",
+ "edge_score_reason_ind": "edge_score_reason",
+ }
+ is_signals_individual = field in _VIRTUAL_BACKING
+ backing_col = _VIRTUAL_BACKING[field] if is_signals_individual else clean_field
if backing_col not in actual_cols:
raise LookupError(f"Field '{field}' not found")
search_params = list(params)
- if is_signals_individual or clean_field == "waf_sig":
+ if is_signals_individual or clean_field in ("waf_sig", "edge_score_reason"):
search_cond = ""
if search:
search_cond = "AND trim(signal) ILIKE ?"
diff --git a/backend/repositories/insights/repository.py b/backend/repositories/insights/repository.py
index addecd5c..47c456c2 100644
--- a/backend/repositories/insights/repository.py
+++ b/backend/repositories/insights/repository.py
@@ -18,7 +18,14 @@
# ── Caches ────────────────────────────────────────────────────────────────────
INSIGHTS_CACHE_TTL = 300 # seconds
-_insights_cache: dict = {}
+# Bounded + lazy-reaped. Pre-migration this was a plain dict; entries
+# were time-bucketed by ``int(time.time() / TTL)`` so each TTL window
+# minted distinct keys but old buckets were never removed. Across hours
+# of admin use the bucket-count grew linearly. 500 entries × insights
+# payload (~100KB) caps this around ~50MB.
+from backend.utils.bounded_cache import BoundedTTLCache as _BoundedTTLCache
+
+_insights_cache: _BoundedTTLCache = _BoundedTTLCache(maxsize=500, ttl_seconds=INSIGHTS_CACHE_TTL)
_insights_cache_lock = threading.Lock()
diff --git a/backend/repositories/query.py b/backend/repositories/query.py
index bd70f1da..c9a585bc 100644
--- a/backend/repositories/query.py
+++ b/backend/repositories/query.py
@@ -9,22 +9,14 @@
import duckdb
-from backend.repositories._base import _get_schema, _safe_table
-from backend.utils.telemetry import get_tracked_calls
-
-_BLOCKED_KEYWORDS = (
- "DROP",
- "DELETE",
- "UPDATE",
- "INSERT",
- "ALTER",
- "TRUNCATE",
- "CREATE",
- "ATTACH",
- "COPY",
- "EXPORT",
- "IMPORT",
+from backend.repositories._base import _compact_sql_for_debug, _get_schema, _safe_table
+from backend.utils.sql_validator import (
+ SQLValidationError,
+ apply_user_query_limits,
+ has_limit_clause,
+ validate_user_sql,
)
+from backend.utils.telemetry import get_tracked_calls
def execute_query(
@@ -33,16 +25,41 @@ def execute_query(
sql: str,
max_rows: int,
want_explain: bool,
+ *,
+ session_id: str | None = None,
+ service_id: str | None = None,
) -> dict:
if src:
table_name = _safe_table(src["name"])
if table_name != "logs":
sql = re.sub(r"\blogs\b", table_name, sql, flags=re.IGNORECASE)
- sql_upper = sql.upper()
- for kw in _BLOCKED_KEYWORDS:
- if re.search(rf"\b{kw}\b", sql_upper):
- raise PermissionError(f"Only read-only queries are allowed (blocked keyword: {kw})")
+ # Security (Decision B): run the user SQL through the
+ # parse-tree validator. The previous regex-based ``_BLOCKED_KEYWORDS``
+ # check missed:
+ # - read_csv_auto / read_parquet / iceberg_scan family (arbitrary
+ # file/S3 read via table functions)
+ # - getenv / current_setting / duckdb_secrets (env/secret exfil)
+ # - information_schema.* (introspection bypass via non-prefix name)
+ # - INSTALL / LOAD (which don't contain any blocked keyword)
+ # The validator runs ``json_serialize_sql`` and walks the resulting
+ # parse tree so every nested subquery / CTE / table-function is
+ # inspected. See backend/utils/sql_validator.py for the policy.
+ try:
+ validate_user_sql(
+ sql,
+ parser_con=con,
+ session_id=session_id,
+ service_id=service_id,
+ )
+ except SQLValidationError as exc:
+ # PermissionError is what the route handler maps to HTTP 403.
+ raise PermissionError(exc.message) from exc
+
+ # Execution-side defense-in-depth: cap memory and timeout on the
+ # connection before running the user query. Independent of parse
+ # validation — a legal query can still scan 100M rows.
+ apply_user_query_limits(con)
_debug_queries: list[dict] = []
if src:
@@ -55,7 +72,9 @@ def execute_query(
t_exp = time.monotonic()
plan_rows = con.execute(f"EXPLAIN {sql}").fetchall()
explain_plan = "\n".join(r[1] for r in plan_rows if r[1])
- _debug_queries.append({"sql": f"EXPLAIN {sql}", "time_ms": round((time.monotonic() - t_exp) * 1000, 2)})
+ _debug_queries.append(
+ {"sql": _compact_sql_for_debug(f"EXPLAIN {sql}"), "time_ms": round((time.monotonic() - t_exp) * 1000, 2)}
+ )
# Auto-apply LIMIT max_rows+1 when the query doesn't already have one.
# Without this, `SELECT * FROM logs ORDER BY timestamp DESC` materializes
@@ -67,9 +86,15 @@ def execute_query(
# result sets where the LIMIT semantics differ or aren't supported.
exec_sql = sql
sql_stripped_upper = sql.strip().upper().lstrip("(")
- is_simple_select = sql_stripped_upper.startswith(("SELECT", "WITH", "FROM", "VALUES", "TABLE")) and not re.search(
- r"\bLIMIT\b", sql_upper
- )
+ # 026: ``re.search(r"\bLIMIT\b", sql)`` matches inside string
+ # literals (``WHERE name = 'WITHOUT LIMIT'``) and inside SQL
+ # comments — both false positives that cause the auto-wrap to
+ # SKIP wrapping, leaving the query unbounded. The AST-aware
+ # check inspects the parse tree so strings/comments are out of
+ # scope.
+ is_simple_select = sql_stripped_upper.startswith(
+ ("SELECT", "WITH", "FROM", "VALUES", "TABLE")
+ ) and not has_limit_clause(sql, parser_con=con)
if is_simple_select:
# Strip trailing semicolon so the wrapper LIMIT lands in the same statement.
inner = sql.rstrip().rstrip(";")
@@ -79,7 +104,7 @@ def execute_query(
result = con.execute(exec_sql)
df = result.fetchdf()
elapsed_ms = round((time.monotonic() - t0) * 1000, 2)
- _debug_queries.append({"sql": exec_sql.strip(), "time_ms": elapsed_ms})
+ _debug_queries.append({"sql": _compact_sql_for_debug(exec_sql.strip()), "time_ms": elapsed_ms})
fetched_rows = len(df)
if is_simple_select:
diff --git a/backend/repositories/security.py b/backend/repositories/security.py
index 1cfaec6f..4d65f9cd 100644
--- a/backend/repositories/security.py
+++ b/backend/repositories/security.py
@@ -34,66 +34,90 @@ def get_top_bots(
params, where_clause = build_where_clause(start_time, end_time, filters, actual_cols, inline_params=True)
- # ── Arcjet UA-matched bots ────────────────────────────────────────────────
arcjet_bots: list[dict] = []
+ # ── Single filtered TEMP TABLE shared across arcjet UA + NGWAF JOIN ─────
+ # Previously the function ran TWO independent scans over the same
+ # filtered window: a UA TopN (LIMIT 2000) for arcjet classification
+ # then a SECOND scan with an NGWAF JOIN for waf bot names. With the
+ # dashboard's security panel mounted, both ran on every request.
+ # Materializing one filtered temp table with the columns BOTH passes
+ # need (ua + waf_req_id) collapses the scan to one Iceberg manifest
+ # walk and keeps both downstream queries reading from memory.
+ cols_needed: list[str] = []
if "ua" in actual_cols:
- try:
- from backend.utils.bot_sources import build_matcher, get_bot_regex_pattern
-
- pattern = get_bot_regex_pattern(200)
- ua_filter = f"AND regexp_matches(ua, '{pattern.replace(chr(39), chr(39) * 2)}')" if pattern else ""
-
- q = f"""
- SELECT ua, count(*) AS cnt
- FROM {table_name}
- WHERE {where_clause} AND ua IS NOT NULL {ua_filter}
- GROUP BY ua
- ORDER BY cnt DESC
- LIMIT 2000
- """
- rows = runner.execute(q).fetchall()
-
- match_ua = build_matcher()
- bot_counts: dict[str, dict] = {}
- for ua_val, cnt in rows:
- for entry in match_ua(ua_val):
- bot_id = entry.get("id", "unknown")
- if bot_id not in bot_counts:
- cats = entry.get("categories", [])
- bot_counts[bot_id] = {
- "id": bot_id,
- "name": bot_id.replace("-", " ").title(),
- "category": cats[0] if cats else "unknown",
- "request_count": 0,
- }
- bot_counts[bot_id]["request_count"] += cnt
-
- arcjet_bots = sorted(bot_counts.values(), key=lambda x: x["request_count"], reverse=True)[:n]
- except Exception as e:
- logging.getLogger(__name__).error("[security] arcjet top bots failed: %s", e)
+ cols_needed.append("ua")
+ if "waf_req_id" in actual_cols:
+ cols_needed.append("waf_req_id")
+ # If the schema has neither (very minimal log_fields preset), skip
+ # both passes — there's nothing to classify.
+ if not cols_needed:
+ return {"bots": [], "ngwaf_bots": []}
+
+ # Use QueryRunner.temp_table context manager so the DROP runs even
+ # if an intermediate query raises (was a manual try/finally before).
+ with runner.temp_table(cols_needed, actual_cols, table_name, where_clause, params) as temp_table:
+ if temp_table is None:
+ return {"bots": [], "ngwaf_bots": []}
+ if "ua" in actual_cols:
+ try:
+ from backend.utils.bot_sources import build_matcher, get_bot_regex_pattern
- # ── NGWAF cache bot names ─────────────────────────────────────────────────
- ngwaf_bots: list[dict] = []
- from backend.repositories._base import attach_ngwaf_cache
+ pattern = get_bot_regex_pattern(200)
+ ua_filter = f"AND regexp_matches(ua, '{pattern.replace(chr(39), chr(39) * 2)}')" if pattern else ""
- with attach_ngwaf_cache(con, actual_cols, alias="ngwaf_top") as attached:
- if attached:
- try:
q = f"""
- SELECT nb.bot_name, nb.category, count(*) AS cnt
- FROM {table_name}
- INNER JOIN ngwaf_top.ngwaf_bots nb USING (waf_req_id)
- WHERE {where_clause} AND nb.bot_name IS NOT NULL
- GROUP BY 1, 2
- ORDER BY 3 DESC
- LIMIT {n}
+ SELECT ua, count(*) AS cnt
+ FROM {temp_table}
+ WHERE ua IS NOT NULL {ua_filter}
+ GROUP BY ua
+ ORDER BY cnt DESC
+ LIMIT 2000
"""
- res = runner.execute(q).fetchall()
- ngwaf_bots = [{"name": r[0], "category": r[1], "request_count": r[2]} for r in res]
+ rows = runner.execute(q).fetchall()
+
+ match_ua = build_matcher()
+ bot_counts: dict[str, dict] = {}
+ for ua_val, cnt in rows:
+ for entry in match_ua(ua_val):
+ bot_id = entry.get("id", "unknown")
+ if bot_id not in bot_counts:
+ cats = entry.get("categories", [])
+ bot_counts[bot_id] = {
+ "id": bot_id,
+ "name": bot_id.replace("-", " ").title(),
+ "category": cats[0] if cats else "unknown",
+ "request_count": 0,
+ }
+ bot_counts[bot_id]["request_count"] += cnt
+
+ arcjet_bots = sorted(bot_counts.values(), key=lambda x: x["request_count"], reverse=True)[:n]
except Exception as e:
- logging.getLogger(__name__).error("[security] NGWAF top bots failed: %s", e)
-
- return {"bots": arcjet_bots, "ngwaf_bots": ngwaf_bots}
+ logging.getLogger(__name__).error("[security] arcjet top bots failed: %s", e)
+
+ # ── NGWAF cache bot names ─────────────────────────────────────────────
+ ngwaf_bots: list[dict] = []
+ from backend.repositories._base import attach_ngwaf_cache
+
+ with attach_ngwaf_cache(con, actual_cols, alias="ngwaf_top") as attached:
+ if attached:
+ try:
+ # Join against the temp table instead of re-scanning the
+ # source view — same filter window, no second manifest walk.
+ q = f"""
+ SELECT nb.bot_name, nb.category, count(*) AS cnt
+ FROM {temp_table} t
+ INNER JOIN ngwaf_top.ngwaf_bots nb USING (waf_req_id)
+ WHERE nb.bot_name IS NOT NULL
+ GROUP BY 1, 2
+ ORDER BY 3 DESC
+ LIMIT {n}
+ """
+ res = runner.execute(q).fetchall()
+ ngwaf_bots = [{"name": r[0], "category": r[1], "request_count": r[2]} for r in res]
+ except Exception as e:
+ logging.getLogger(__name__).error("[security] NGWAF top bots failed: %s", e)
+
+ return {"bots": arcjet_bots, "ngwaf_bots": ngwaf_bots, **runner.telemetry()}
def get_security_aggregates(
diff --git a/backend/repositories/sessions.py b/backend/repositories/sessions.py
index 2f3aaa56..7ec69299 100644
--- a/backend/repositories/sessions.py
+++ b/backend/repositories/sessions.py
@@ -171,8 +171,7 @@ def get_sessions(
data_sql = f"""
{sessions_cte}
- SELECT *, ({flag_expr}) AS flagged,
- COUNT(*) OVER () AS _total_count
+ SELECT *, ({flag_expr}) AS flagged
FROM sessions_agg
{flagged_filter}
ORDER BY {sort_by} {sort_dir}
@@ -181,16 +180,14 @@ def get_sessions(
rows = runner.execute(data_sql, params).fetchall()
col_names = [desc[0] for desc in con.description]
- total = 0
sessions: list[dict] = []
for row in rows:
d = dict(zip(col_names, row))
- total = int(d.pop("_total_count", 0))
for k in ("session_start", "session_end"):
if d.get(k) is not None:
d[k] = str(d[k])
- # Ensure we have ua and ja4 if requested in group_cols
sessions.append(d)
+ total = len(sessions)
if not rows and offset > 0:
count_sql = f"""
diff --git a/backend/repositories/views.py b/backend/repositories/views.py
index b26c0476..9e6155cc 100644
--- a/backend/repositories/views.py
+++ b/backend/repositories/views.py
@@ -30,6 +30,27 @@ def _find_view_service(view_id: str) -> str | None:
return None
+def get_view_by_id(view_id: str) -> dict | None:
+    """Find a saved view by id across all configured services.
+
+    Walks ``svcconfig.list_configs()`` in order, skips configs that have
+    no ``service_id``, and returns a copy of the first view whose ``id``
+    equals ``view_id``. The copy gains a ``service_id`` key naming the
+    service it was found under (unless the row already carries one), so a
+    caller's scope check can compare ownership without scanning again.
+    Returns ``None`` when no service holds a matching view.
+    """
+    for config in svcconfig.list_configs():
+        owner_id = config.get("service_id")
+        if owner_id:
+            found = next(
+                (row for row in metadata_db.list_views(owner_id) if row.get("id") == view_id),
+                None,
+            )
+            if found is not None:
+                # Copy before stamping so the stored row is never mutated.
+                stamped = dict(found)
+                stamped.setdefault("service_id", owner_id)
+                return stamped
+    return None
+
+
def delete_view(view_id: str, service_id_hint: str | None = None) -> dict:
sid = service_id_hint or _find_view_service(view_id)
if not sid:
diff --git a/backend/routers/admin.py b/backend/routers/admin.py
index feff30aa..f2d7fe1b 100644
--- a/backend/routers/admin.py
+++ b/backend/routers/admin.py
@@ -186,7 +186,7 @@ def ingest_endpoint(
from fastapi import HTTPException
from backend.core.duckdb import start_cron_run
- from backend.cron_progress import _run_metadata, start_progress
+ from backend.cron_progress import list_active_runs, start_progress
from backend.repositories.dashboard import _dashboard_cache
from backend.scheduler import _run_metadata_sync, _run_service_cron
@@ -207,9 +207,9 @@ def ingest_endpoint(
t.start()
except RuntimeError as e:
run_id = None
- for rid, meta in _run_metadata.items():
- if meta.get("service_id") == source["name"] and meta.get("task") == "metadata_sync":
- run_id = rid
+ for entry in list_active_runs():
+ if entry.get("service_id") == source["name"] and entry.get("task") == "metadata_sync":
+ run_id = entry["run_id"]
break
if run_id is None:
raise HTTPException(status_code=503, detail={"error": str(e), "busy": True})
@@ -235,9 +235,9 @@ def ingest_endpoint(
t.start()
except RuntimeError as e:
run_id = None
- for rid, meta in _run_metadata.items():
- if meta.get("service_id") == src["name"] and meta.get("task") == "sync":
- run_id = rid
+ for entry in list_active_runs():
+ if entry.get("service_id") == src["name"] and entry.get("task") == "sync":
+ run_id = entry["run_id"]
break
if run_id is None:
raise HTTPException(status_code=503, detail={"error": str(e), "busy": True})
@@ -354,7 +354,32 @@ def download_file(
if not key:
raise HTTPException(status_code=400, detail={"error": "Missing key parameter"})
- local_path = os.path.abspath(os.path.join(_cache_dir(source), key))
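+ # Cross-tenant guard: a single FOS bucket can host multiple services
+ # separated by per-source prefixes. The path-traversal cage below
+ # bounds local cache reads, but a sibling-tenant key like
+ # ``other_tenant/file.log`` would still mint a presigned URL or CDN
+ # redirect for that object. Require the key to start with this
+ # service's prefix before any FOS / CDN URL minting. This is a plain
+ # string-prefix match: it only isolates tenants when the configured
+ # prefix ends with a separator (prefix ``svc`` also matches
+ # ``svc2/...``), and it is skipped entirely when the prefix is empty.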
+ src_prefix = source.get("prefix", "")
+ if src_prefix and not key.startswith(src_prefix):
+ raise HTTPException(status_code=400, detail={"error": "invalid_key"})
+
+ # Security: ``os.path.join(base, key)`` returns ``key`` when
+ # ``key`` is absolute, which a malicious caller exploits by passing
+ # ``key=/etc/passwd``. Resolve both paths and require commonpath ==
+ # cache_dir so a path-traversal payload (absolute path or
+ # ``../../../etc/passwd``) is rejected at the boundary.
+ cache_dir = os.path.realpath(_cache_dir(source))
+ candidate = os.path.realpath(os.path.join(cache_dir, key))
+ try:
+ common = os.path.commonpath([cache_dir, candidate])
+ except ValueError:
+ # commonpath raises ValueError when paths have different drives /
+ # mixed absolute/relative. Treat as path-escape and reject.
+ raise HTTPException(status_code=400, detail={"error": "invalid_key"})
+ if common != cache_dir:
+ raise HTTPException(status_code=400, detail={"error": "invalid_key"})
+ local_path = candidate
if os.path.exists(local_path):
return FileResponse(local_path, filename=os.path.basename(local_path))
@@ -448,7 +473,10 @@ def zip_worker(q: queue.Queue):
cdn = src.get("cdn_url", "").rstrip("/")
fos_client = _db._get_fos_client(src)
paginator = fos_client.get_paginator("list_objects_v2", caller_hint="download_all")
- pages = paginator.paginate(Bucket=src["bucket"])
+ # Cross-tenant guard: scope to this service's prefix
+ # so a shared bucket with multiple services doesn't
+ # leak sibling data into the zip.
+ pages = paginator.paginate(Bucket=src["bucket"], Prefix=src.get("prefix", ""))
for page in pages:
if "Contents" not in page:
@@ -472,7 +500,33 @@ def zip_worker(q: queue.Queue):
return StreamingResponse(_stream_from_worker(zip_worker), media_type="application/zip", headers=headers)
+_DIR_SIZE_CACHE: dict[str, tuple[float, int]] = {}
+_DIR_SIZE_TTL_S = 30.0
+
+
def _get_dir_size(path: str) -> int:
+    """Return the byte size of the tree under ``path``, cached per path.
+
+    A result younger than ``_DIR_SIZE_TTL_S`` seconds (measured on the
+    monotonic clock) is served from ``_DIR_SIZE_CACHE``; otherwise the
+    tree is re-walked with ``_scan_dir_size`` and the cache entry is
+    replaced.
+
+    Why cache: the walk is O(files-in-tree), and the original author
+    measured roughly 700ms per uncached walk once the per-service cache
+    grew to ~19k files, a cost SyncStatusBadge's 15s poll paid on every
+    refresh. With the cache, a given path is walked at most once per TTL
+    window. The trade-off is that reported disk usage can lag by up to
+    that window, which is acceptable for a poll endpoint.
+    """
+    import time as _t
+
+    stamp = _t.monotonic()
+    entry = _DIR_SIZE_CACHE.get(path)
+    if entry is None or (stamp - entry[0]) >= _DIR_SIZE_TTL_S:
+        # Missing or expired: rescan, timestamped at the start of the walk.
+        entry = (stamp, _scan_dir_size(path))
+        _DIR_SIZE_CACHE[path] = entry
+    return entry[1]
+
+
+def _scan_dir_size(path: str) -> int:
total = 0
if not os.path.exists(path):
return 0
@@ -482,7 +536,7 @@ def _get_dir_size(path: str) -> int:
if entry.is_file():
total += entry.stat().st_size
elif entry.is_dir():
- total += _get_dir_size(entry.path)
+ total += _scan_dir_size(entry.path)
except Exception:
pass
return total
@@ -512,13 +566,28 @@ def sync_status(
return SyncStatusResponse.with_telemetry(configured=False)
try:
- from backend.core.duckdb import get_connection
+ # Fast path: skip_fos=true callers (FilterBar polling, badge in
+ # the page header, etc.) only need the cached snapshot that the
+ # sync cron refreshes every minute. Return it without grabbing a
+ # DuckDB connection, so that a busy dashboard load — agg/raw/
+ # bots all racing for connections — doesn't starve sync-status
+ # and trigger 503s when its max_wait expires.
+ cached_status = svcconfig.get_status(src["name"]) if skip_fos and not force else None
+ # get_status returns {} (not None) when no status has been
+ # persisted yet — fall through to the DB path in that case.
+ if cached_status:
+ cached_status["access_level"] = src.get("access_level", "read_write")
+ cached_status["storage_mode"] = _db.STORAGE_MODE
+ cached_status["configured"] = True
+ status = cached_status
+ else:
+ from backend.core.duckdb import get_connection
- _con = get_connection(source=src, max_wait=5, skip_view_update=True)
- try:
- status = get_sync_status(_con, src, skip_fos=skip_fos, force=force)
- finally:
- _con.close()
+ _con = get_connection(source=src, max_wait=5, skip_view_update=True)
+ try:
+ status = get_sync_status(_con, src, skip_fos=skip_fos, force=force)
+ finally:
+ _con.close()
db_path = src.get("duckdb_path") or svcconfig.duckdb_path(service_id)
db_exists = os.path.exists(db_path)
@@ -604,6 +673,195 @@ def compaction_stats(source: dict = Depends(get_source)):
return _lc.compaction_stats(source)
+@router.patch("/admin/metadata-retention")
+def update_metadata_retention(body: dict, source: dict = Depends(get_source)):
+    """Update the per-service ``metadata_retention`` config block.
+
+    Body shape: any subset of ``{usage_log_days, ingested_files_days,
+    cron_runs_days}``; other keys are ignored. Each supplied value is
+    coerced with ``int()``. Anything that cannot be coerced (non-numeric
+    strings, ``None``, NaN, or an infinite float such as the JSON number
+    ``1e999``) and any negative value is stored as 0, which the cleanup
+    helper is documented to treat as "cleanup disabled for that table".
+    Keys absent from the body keep their current value.
+
+    Returns ``{"retention": ...}`` with the saved values merged over
+    ``DEFAULT_METADATA_RETENTION`` so the UI can confirm what was saved.
+
+    Raises:
+        HTTPException: 404 when the service has no stored config.
+    """
+    import logging
+
+    from backend import config as svcconfig
+    from backend.core import metadata_db as _mdb
+    from backend.core.metadata_db import (
+        DEFAULT_METADATA_RETENTION,
+        is_ingested_files_dedup_active,
+    )
+
+    service_id = source["name"]
+    cfg = svcconfig.load_config(service_id)
+    if cfg is None:
+        raise HTTPException(status_code=404, detail={"error": "Service not found"})
+
+    current = dict(cfg.get("metadata_retention") or {})
+    for key in ("usage_log_days", "ingested_files_days", "cron_runs_days"):
+        if key in body:
+            try:
+                v = int(body[key])
+            except (TypeError, ValueError, OverflowError):
+                # OverflowError: int(float("inf")). A JSON body can carry
+                # an infinite float (e.g. ``1e999``), which previously
+                # escaped this handler and surfaced as a 500.
+                v = 0
+            current[key] = max(0, v)
+
+    # Mirror the cleanup helper's safety override at the write layer:
+    # refuse to persist a non-zero ingested_files_days when the helper
+    # would ignore it, so the stored value never misleads the operator.
+    # NOTE(review): the override fires when is_ingested_files_dedup_active()
+    # is False — confirm that polarity matches the helper's name/intent.
+    if not is_ingested_files_dedup_active(service_id) and int(current.get("ingested_files_days") or 0) > 0:
+        current["ingested_files_days"] = 0
+
+    cfg["metadata_retention"] = current
+    svcconfig.save_config(service_id, cfg)
+    try:
+        _mdb.record_audit(
+            service_id=service_id,
+            event_type="metadata_retention_update",
+            details=current,
+        )
+    except Exception:
+        # Auditing is best-effort: the config is already saved, so a failed
+        # audit write must not fail the request — but leave a trace.
+        logging.getLogger(__name__).warning(
+            "[admin] metadata_retention audit write failed for %s",
+            service_id,
+            exc_info=True,
+        )
+
+    return {"retention": {**DEFAULT_METADATA_RETENTION, **current}}
+
+
+@router.get("/admin/metadata-storage")
+def metadata_storage(source: dict = Depends(get_source)):
+    """Report metadata.db storage stats plus the resolved retention policy.
+
+    The response is whatever ``get_metadata_storage_stats`` returns for
+    this service, extended with two keys:
+
+    * ``retention`` — the service's ``metadata_retention`` config merged
+      over ``DEFAULT_METADATA_RETENTION``.
+    * ``ingested_files_locked`` — ``True`` when
+      ``is_ingested_files_dedup_active`` is false for the service. The
+      original author describes this as the case where the cleanup helper
+      force-disables trimming of ``ingested_files`` (the table acts as the
+      dedup gate), so the UI disables the input and explains the override.
+    """
+    from backend import config as svcconfig
+    from backend.core.metadata_db import (
+        DEFAULT_METADATA_RETENTION,
+        get_metadata_storage_stats,
+        is_ingested_files_dedup_active,
+    )
+
+    service_id = source["name"]
+    payload = {**get_metadata_storage_stats(service_id)}
+
+    # A missing config or an empty retention block both fall back to defaults.
+    overrides = (svcconfig.load_config(service_id) or {}).get("metadata_retention") or {}
+    payload["retention"] = {**DEFAULT_METADATA_RETENTION, **overrides}
+    payload["ingested_files_locked"] = not is_ingested_files_dedup_active(service_id)
+    return payload
+
+
+@router.post("/admin/metadata-cleanup")
+def metadata_cleanup_now(source: dict = Depends(get_source)):
+    """Trigger an immediate metadata cleanup, streaming progress as SSE.
+
+    Equivalent to the daily ``metadata_cleanup`` cron at 03:15 UTC but
+    on-demand. The DELETE phase is fast; VACUUM rewrites the whole file
+    and on a multi-GB metadata.db can take minutes. Streaming gives the
+    operator real-time feedback instead of a long hang behind a spinning
+    button.
+
+    Event shapes (JSON payload of each SSE ``data:`` line):
+
+        {"type": "status", "message": str}
+        {"type": "progress", "current": int, "total": int, "message": str}
+        {"type": "done", "message": str, "result": {...}}
+        {"type": "error", "message": str}
+
+    The run is recorded through ``start_cron_run`` / ``log_cron_run``
+    with task ``metadata_cleanup`` so it appears alongside the scheduled
+    cron's runs.
+
+    The worker always ends by queueing the ``None`` end-of-stream
+    sentinel — including when ``start_cron_run`` or the summary step
+    raises — so the response generator can never block forever on
+    ``events.get()``.
+    """
+    import json as _json
+    import queue as _queue
+    import threading
+    import time as _t
+
+    from backend import config as svcconfig
+    from backend.core.duckdb import log_cron_run, start_cron_run
+    from backend.core.metadata_db import cleanup_metadata
+
+    service_id = source["name"]
+    cfg = svcconfig.load_config(service_id) or {}
+    retention = cfg.get("metadata_retention") or {}
+
+    # Bridge cleanup_metadata's on_event callback to the SSE generator via
+    # a thread-safe queue. The worker thread runs the cleanup synchronously
+    # and pushes events as they happen; the streaming generator consumes
+    # them and yields SSE frames. Sentinel ``None`` marks end-of-stream.
+    events: _queue.Queue = _queue.Queue()
+
+    def worker():
+        started = _t.time()
+        try:
+            try:
+                run_id = start_cron_run(source, "metadata_cleanup")
+            except Exception as e:
+                # No run was opened, so there is nothing to close out;
+                # just tell the client why the cleanup never started.
+                events.put({"type": "error", "message": f"Cleanup failed: {e}"})
+                return
+
+            try:
+                result = cleanup_metadata(service_id, retention, on_event=events.put)
+                # Summarise inside the guarded region: a malformed result
+                # must surface as an error event, not a silent thread death.
+                total_deleted = sum(result["deleted"].values())
+                if total_deleted:
+                    parts = [f"{t}={n}" for t, n in result["deleted"].items() if n]
+                    summary = (
+                        f"Trimmed {total_deleted:,} rows ({', '.join(parts)}). "
+                        f"VACUUM={'yes' if result['vacuumed'] else 'skipped'}."
+                    )
+                else:
+                    summary = "No rows older than retention windows."
+            except Exception as e:
+                err = str(e)
+                events.put({"type": "error", "message": f"Cleanup failed: {err}"})
+                log_cron_run(
+                    source,
+                    "metadata_cleanup",
+                    _t.time() - started,
+                    "error",
+                    error_message=err,
+                    summary=f"cleanup failed: {err}",
+                    run_id=run_id,
+                )
+                return
+
+            try:
+                log_cron_run(
+                    source,
+                    "metadata_cleanup",
+                    _t.time() - started,
+                    "success",
+                    summary=summary,
+                    rows_ingested=total_deleted,
+                    run_id=run_id,
+                )
+            finally:
+                # The cleanup itself succeeded; report it even if writing
+                # the cron_runs row failed.
+                events.put({"type": "done", "message": summary, "result": result})
+        finally:
+            # Always release the consumer, whatever happened above.
+            events.put(None)
+
+    threading.Thread(target=worker, daemon=True, name=f"metadata-cleanup-{service_id}").start()
+
+    def stream():
+        # Pre-pad with an SSE comment line to defeat any reverse-proxy /
+        # browser buffering before the first real event arrives.
+        yield ":" + " " * 2048 + "\n\n"
+        while True:
+            event = events.get()
+            if event is None:
+                break
+            yield f"data: {_json.dumps(event)}\n\n"
+
+    return StreamingResponse(
+        stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache, no-transform",
+            "X-Accel-Buffering": "no",
+            "Connection": "keep-alive",
+        },
+    )
+
+
@router.get("/admin/health-snapshot")
def health_snapshot():
"""One-shot health snapshot for the admin page system health card.
@@ -664,17 +922,22 @@ def health_snapshot():
out[label] = None
# ── In-flight cron runs ──────────────────────────────────────────
+ # Use list_active_runs() (which filters out runs whose last event is
+ # done/error) instead of iterating _run_metadata directly. The dict
+ # holds entries for an hour after completion (the cleanup TTL), so the
+ # raw iteration was showing dozens of stale "sync" entries in the
+ # System Health card.
try:
- from backend.cron_progress import _run_metadata
+ from backend.cron_progress import list_active_runs
in_flight = []
- for run_id, meta in list(_run_metadata.items()):
+ for entry in list_active_runs():
in_flight.append(
{
- "run_id": run_id,
- "service_id": meta.get("service_id"),
- "task": meta.get("task"),
- "started_at": meta.get("started_at"),
+ "run_id": entry["run_id"],
+ "service_id": entry.get("service_id"),
+ "task": entry.get("task"),
+ "started_at": entry.get("started_at"),
}
)
out["in_flight_runs"] = in_flight
@@ -730,15 +993,16 @@ def _fetch_fastly_log_counts(
uses (`YYYY-MM-DDTHH` for hour, `YYYY-MM-DD` for day) so the outer-join
in api_log_accounting can key on string equality directly.
"""
- import json
import logging
- import urllib.request
from datetime import UTC, datetime
- url = f"https://api.fastly.com/stats/service/{logging_svc_id}?by={by}&from={from_ts}&to={to_ts}"
- req = urllib.request.Request(url, headers={"Fastly-Key": api_key, "Accept": "application/json"})
- with urllib.request.urlopen(req, timeout=30) as resp:
- payload = json.loads(resp.read().decode())
+ from backend.core.fastly.client import fastly
+
+ payload = fastly(
+ "GET",
+ f"/stats/service/{logging_svc_id}?by={by}&from={from_ts}&to={to_ts}",
+ token=api_key,
+ )
width = 13 if by == "hour" else 10
records = payload.get("data", []) or []
@@ -1013,12 +1277,12 @@ def iceberg_commit_endpoint(source: dict = Depends(get_source)):
return {"ok": True, "message": "Commit started.", "run_id": run_id}
except RuntimeError as e:
- from backend.cron_progress import _run_metadata
+ from backend.cron_progress import list_active_runs
run_id = None
- for rid, meta in _run_metadata.items():
- if meta.get("service_id") == source["name"] and meta.get("task") == "commit":
- run_id = rid
+ for entry in list_active_runs():
+ if entry.get("service_id") == source["name"] and entry.get("task") == "commit":
+ run_id = entry["run_id"]
break
if run_id is None:
raise HTTPException(status_code=503, detail={"error": str(e), "busy": True})
diff --git a/backend/routers/alerts.py b/backend/routers/alerts.py
index a08ec953..7b23f46b 100644
--- a/backend/routers/alerts.py
+++ b/backend/routers/alerts.py
@@ -5,7 +5,7 @@
from datetime import UTC
import duckdb
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel
from backend.deps import get_con, get_service_id
@@ -16,16 +16,44 @@
router = APIRouter(prefix="/api/alerts", tags=["alerts"])
+def _analyst_allowed_services(request: Request) -> set[str] | None:
+    """Resolve the caller's service scope from ``request.state``.
+
+    Returns ``None`` when ``request.state`` carries no ``analyst_session``
+    (treated as an admin request with no scope restriction). Otherwise
+    returns the session's ``service_ids`` as a set, which is empty when
+    the session lists none.
+
+    Security: every read / mutation on the alerts collection is expected
+    to filter by this set, so an analyst scoped to ``svc-A`` cannot
+    enumerate or modify ``svc-B``'s alerts.
+    """
+    session = getattr(request.state, "analyst_session", None)
+    # No analyst session means admin, i.e. unrestricted.
+    return None if session is None else set(session.service_ids or [])
+
+
@router.get("/", response_model=AlertListResponse)
-def list_all_alerts():
+def list_all_alerts(request: Request):
+ """Return alerts visible to the caller.
+
+ Admin: every alert across every service. Analyst: only alerts for
+ services in their invite's scope (security).
+ """
+ # ``allowed`` is None for admin requests and a set (possibly empty)
+ # for analyst sessions.
+ allowed = _analyst_allowed_services(request)
alerts = repo.get_alerts()
+ # Filtering happens after the full fetch; an empty set therefore
+ # hides every alert from the analyst.
+ if allowed is not None:
+ alerts = [a for a in alerts if a.get("service_id") in allowed]
from datetime import datetime
return AlertListResponse.with_telemetry(data=alerts, evaluated_at=datetime.now(UTC).isoformat())
@router.get("/{service_id}", response_model=AlertListResponse)
-def list_service_alerts(service_id: str):
+def list_service_alerts(service_id: str, request: Request):
+ """Return alerts for one service. Analyst gets 403 if the service
+ isn't in their invite (security)."""
+ allowed = _analyst_allowed_services(request)
+ if allowed is not None and service_id not in allowed:
+ raise HTTPException(status_code=403, detail={"error": "service_not_authorized", "service": service_id})
alerts = repo.get_alerts(service_id)
from datetime import datetime
@@ -33,20 +61,44 @@ def list_service_alerts(service_id: str):
@router.post("/", response_model=AlertResponse)
-def create_alert(alert: Alert):
+def create_alert(alert: Alert, request: Request):
+ """Create an alert. Analyst can only create alerts for services in
+ their invite scope (security). The Phase-1 analyst middleware
+ also blocks POSTs on /api/alerts for analysts entirely (not in the
+ _ANALYST_ALLOWED_WRITE_PREFIXES list), so this is defense-in-depth
+ for the admin-impersonating-analyst case."""
+ # Scope check runs before anything is saved, so a rejected request
+ # leaves no trace in the alert store.
+ allowed = _analyst_allowed_services(request)
+ if allowed is not None and alert.service_id not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": alert.service_id},
+ )
res = repo.save_alert(alert)
sync_admin_state(alert.service_id)
+ # NOTE(review): the route declares response_model=AlertResponse but the
+ # payload is built with AlertPreviewResponse — confirm this is intended.
return AlertPreviewResponse.with_telemetry(data=res)
@router.post("/preview", response_model=AlertPreviewResponse)
-def preview_alert(alert: Alert, lookback_hours: int = 24, con: duckdb.DuckDBPyConnection = Depends(get_con)):
+def preview_alert(
+ alert: Alert,
+ request: Request,
+ lookback_hours: int = 24,
+ con: duckdb.DuckDBPyConnection = Depends(get_con),
+):
import datetime
- from fastapi import HTTPException
-
from backend.core.duckdb import _safe_table_name, get_source_for_service
+ # Security: analyst can only preview alerts against their scoped
+ # services. Without this an analyst could compose an Alert against
+ # another tenant's service_id and read its time-series data.
+ allowed = _analyst_allowed_services(request)
+ if allowed is not None and alert.service_id not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": alert.service_id},
+ )
+
src = get_source_for_service(alert.service_id)
if not src:
raise HTTPException(status_code=404, detail="Service not found")
@@ -137,14 +189,47 @@ class _ToggleBody(BaseModel):
@router.patch("/{alert_id}/enabled", response_model=AlertResponse)
-def toggle_alert_enabled(alert_id: str, body: _ToggleBody, service_id: str | None = Depends(get_service_id)):
+def toggle_alert_enabled(
+ alert_id: str,
+ body: _ToggleBody,
+ request: Request,
+ service_id: str | None = Depends(get_service_id),
+):
+ # Security: pre-flight scope check BEFORE the mutation. Earlier
+ # implementation toggled first and then 403'd on the result, so a
+ # cross-tenant write would still land and the analyst would just see
+ # an error after the fact. Now the toggle never runs for an
+ # unauthorized session.
+ allowed = _analyst_allowed_services(request)
+ if allowed is not None:
+ existing = repo.get_alert_by_id(alert_id)
+ # NOTE(review): when the lookup returns nothing the check is skipped
+ # and the toggle below still runs with the caller-supplied
+ # ``service_id`` hint, which is not compared against ``allowed`` —
+ # confirm toggle_alert cannot reach an alert this lookup missed.
+ if existing and existing.get("service_id") not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": existing.get("service_id")},
+ )
res = repo.toggle_alert(alert_id, body.enabled, service_id_hint=service_id)
sync_admin_state(res.get("service_id"))
+ # NOTE(review): the route declares response_model=AlertResponse but the
+ # payload is built with AlertPreviewResponse — confirm this is intended.
return AlertPreviewResponse.with_telemetry(data=res)
@router.delete("/{alert_id}", response_model=AlertResponse)
-def delete_alert(alert_id: str, service_id: str | None = Depends(get_service_id)):
+def delete_alert(
+ alert_id: str,
+ request: Request,
+ service_id: str | None = Depends(get_service_id),
+):
+ # Pre-flight scope check: look up the alert's service_id before
+ # deleting so an out-of-scope analyst is rejected before any row
+ # is removed (no delete-then-403).
+ # NOTE(review): the 403 detail below echoes the owning service_id of
+ # the out-of-scope alert back to the caller — confirm that disclosure
+ # is acceptable.
+ allowed = _analyst_allowed_services(request)
+ if allowed is not None:
+ existing = repo.get_alert_by_id(alert_id)
+ # NOTE(review): when the lookup returns nothing the check is skipped
+ # and the delete below still runs with the caller-supplied
+ # ``service_id`` hint, which is not compared against ``allowed``.
+ if existing and existing.get("service_id") not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": existing.get("service_id")},
+ )
res = repo.delete_alert(alert_id, service_id_hint=service_id)
sync_admin_state(res.get("service_id"))
+ # NOTE(review): the route declares response_model=AlertResponse but the
+ # payload is built with AlertPreviewResponse — confirm this is intended.
return AlertPreviewResponse.with_telemetry(data=res)
diff --git a/backend/routers/bootstrap.py b/backend/routers/bootstrap.py
index 94954a6e..e38f5b0b 100644
--- a/backend/routers/bootstrap.py
+++ b/backend/routers/bootstrap.py
@@ -3,7 +3,7 @@
from __future__ import annotations
import duckdb
-from fastapi import APIRouter, Depends, Request
+from fastapi import APIRouter, Depends, HTTPException, Request
from backend.deps import get_meta_con, get_service_id, get_source
from backend.models.common import BootstrapResponse
@@ -18,7 +18,7 @@ def bootstrap(
service_id: str | None = Depends(get_service_id),
):
from backend.core import duckdb as _db
- from backend.core.duckdb import STORAGE_MODE, get_schema
+ from backend.core.duckdb import STORAGE_MODE
from backend.services.service_manager import get_enriched_services
from backend.utils.countries import COUNTRY_MAP
from backend.utils.pop_utils import get_pop_lat_lon_map
@@ -80,20 +80,15 @@ def bootstrap(
if active_svc and active_svc.get("status"):
schema = active_svc["status"].get("schema", [])
- if not schema and valid_active_id:
- src = _db.get_source_for_service(valid_active_id)
- if src:
- try:
- from backend.core.duckdb import get_connection
-
- # read_only: schema lookup only, no writes.
- con = get_connection(source=src, max_wait=3, skip_view_update=True, read_only=True)
- try:
- schema = get_schema(con, src)
- finally:
- con.close()
- except Exception:
- pass
+ # NOTE: the previous fallback opened a read-only DuckDB connection here
+ # and ran get_schema() against the source on cold-cache loads. That call
+ # acquired the per-service lock + did a parquet glob, costing 1-3s on
+ # the very first /api/bootstrap after a backend restart and blocking
+ # the whole admin UI from rendering. With the status-refresh cron
+ # populating active_svc["status"]["schema"], the cache is the source
+ # of truth — drop the fallback. If schema is empty here, the dashboard
+ # renders without a hint banner; the user can refresh once the cron
+ # has run (typically <60s after startup).
pops = get_pop_lat_lon_map()
@@ -147,13 +142,26 @@ def bootstrap(
@router.get("/sources")
@query_errors(status_code=500)
-def sources_endpoint():
+def sources_endpoint(request: Request):
+ """Return storage metadata (endpoint / bucket / prefix / region) for the
+ configured sources the caller is authorized to see.
+
+ Security: filter by analyst session scope. Without this, an
+ authenticated analyst can enumerate every service's S3 bucket / endpoint
+ / prefix configuration, including ones not in their invite. Admin
+ requests (no analyst_session on request.state) see the full list.
+ """
from backend import config as svcconfig
from backend.core.duckdb import _safe_table_name
+ analyst_session = getattr(request.state, "analyst_session", None)
+ allowed: set[str] | None = set(analyst_session.service_ids or []) if analyst_session else None
+
configs = svcconfig.list_configs()
sources = []
for cfg in configs:
+ if allowed is not None and cfg.get("service_id") not in allowed:
+ continue
src = svcconfig.config_to_source(cfg)
sources.append(
{
@@ -171,12 +179,25 @@ def sources_endpoint():
@router.get("/schema")
@query_errors(status_code=500)
def schema_endpoint(
+ request: Request,
source: dict = Depends(get_source),
con: duckdb.DuckDBPyConnection = Depends(get_meta_con),
):
from backend import config as svcconfig
from backend.core.duckdb import _safe_table_name, get_schema
+ # Cross-tenant guard: an analyst session scoped to ``svc-A`` must not
+ # be able to read ``svc-B``'s schema (custom-field names, types, and
+ # PII flags). Mirrors the check in ``log_fields_catalog``.
+ analyst_session = getattr(request.state, "analyst_session", None)
+ if analyst_session is not None:
+ allowed = set(analyst_session.service_ids or [])
+ if source.get("name") not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": source.get("name")},
+ )
+
# Try cache first
cached_status = svcconfig.get_status(source["name"])
if cached_status and "schema" in cached_status:
@@ -187,10 +208,29 @@ def schema_endpoint(
@router.get("/log-fields/catalog")
@query_errors(status_code=500)
-def log_fields_catalog(service_id: str | None = Depends(get_service_id)):
+def log_fields_catalog(
+ request: Request,
+ service_id: str | None = Depends(get_service_id),
+):
+ """Return the log-fields catalog for the requested service.
+
+ Security: enforce analyst session scope on the requested
+ ``service_id``. Without this, an analyst scoped to ``svc-A`` can pass
+ ``?service_id=svc-B`` and read svc-B's custom field configuration
+ (including PII-related field configs).
+ """
from backend.core import log_fields as lf
from backend.core.log_fields import INSIGHT_DEFINITIONS
+ analyst_session = getattr(request.state, "analyst_session", None)
+ if analyst_session is not None and service_id is not None:
+ allowed = set(analyst_session.service_ids or [])
+ if service_id not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": service_id},
+ )
+
# Try to load existing limits
field_limits = {}
if service_id:
@@ -225,11 +265,24 @@ def log_fields_catalog(service_id: str | None = Depends(get_service_id)):
@router.get("/insight-availability", response_model=InsightsAvailabilityResponse)
@query_errors(status_code=500)
def insight_availability(
+ request: Request,
source: dict = Depends(get_source),
con: duckdb.DuckDBPyConnection = Depends(get_meta_con),
):
from backend.core.duckdb import get_schema
+ # Cross-tenant guard: insight availability discloses which fields are
+ # populated (presence/absence of optional columns), so it needs the
+ # same scope check as the schema endpoint.
+ analyst_session = getattr(request.state, "analyst_session", None)
+ if analyst_session is not None:
+ allowed = set(analyst_session.service_ids or [])
+ if source.get("name") not in allowed:
+ raise HTTPException(
+ status_code=403,
+ detail={"error": "service_not_authorized", "service": source.get("name")},
+ )
+
actual_cols = {col["name"] for col in get_schema(con, source)}
from backend.core.log_fields import INSIGHT_DEFINITIONS
diff --git a/backend/routers/debug.py b/backend/routers/debug.py
index 028df86a..ab507db1 100644
--- a/backend/routers/debug.py
+++ b/backend/routers/debug.py
@@ -33,3 +33,20 @@ def clear_sqlite():
"""Drain the SQLite ring buffer. Manual reset for the Debug Panel."""
sqlite_profiler.clear()
return {"ok": True, **sqlite_profiler.buffer_stats()}
+
+
+@router.get("/state")
+def debug_state():
+    """Tell the UI whether API responses will carry debug arrays.
+
+    Returns ``{"debug_responses_enabled": bool}`` as computed by
+    ``backend.models.common._debug_responses_enabled``. Per the original
+    author, that flag follows the process-level ``DEBUG_RESPONSES`` env
+    var (OFF by default in production, ON in local-dev ``.env``) and
+    governs the ``_debug_queries`` / ``_debug_calls`` arrays. The admin
+    page uses the answer to dim its "Query debugging panel" and "API call
+    panel" toggles, with a tooltip explaining why, rather than offering a
+    switch that silently does nothing.
+    """
+    from backend.models.common import _debug_responses_enabled
+
+    enabled = _debug_responses_enabled()
+    return {"debug_responses_enabled": enabled}
diff --git a/backend/routers/provision.py b/backend/routers/provision.py
index 54cce078..f5efed14 100644
--- a/backend/routers/provision.py
+++ b/backend/routers/provision.py
@@ -178,25 +178,45 @@ def provision_check_fos(
return {"ok": False, "error": err_msg, "_debug_calls": get_tracked_calls()}
-@router.get("/teardown")
-def provision_teardown(
- token: str = Query(default=""),
- service_id: str | None = Query(default=None),
- remove_logging: bool = Query(default=True),
- remove_cdn: bool = Query(default=True),
- remove_bucket: bool = Query(default=True),
- remove_cache: bool = Query(default=True),
- remove_cron: bool = Query(default=False),
-):
+@router.post("/teardown")
+def provision_teardown(body: dict | None = None):
+ """Destructive service teardown over SSE.
+
+ Switched from ``GET`` to ``POST`` to defend against CSRF: a GET
+ endpoint with side effects can be triggered by any cross-origin
+ ````, ````, or ``