diff --git a/docker/base-image/Dockerfile b/docker/base-image/Dockerfile index c0593b18..9f294afd 100644 --- a/docker/base-image/Dockerfile +++ b/docker/base-image/Dockerfile @@ -130,6 +130,13 @@ RUN mkdir -p /workspace /data /logs && \ RUN mkdir -p /tmp/secure && \ chmod 1777 /tmp/secure +# #1089: writable-layer dir for the subscription-token hot-reload override +# (/var/lib/trinity/oauth-token). Owned by the agent (UID 1000) so the +# agent-server process can write it; on the writable layer (NOT /home/developer) +# so it survives a plain stop+start but is wiped on recreate. +RUN mkdir -p /var/lib/trinity && \ + chown developer:developer /var/lib/trinity + USER developer CMD ["/app/startup.sh"] diff --git a/docker/base-image/agent_server/models.py b/docker/base-image/agent_server/models.py index 4fa19d96..c570eca2 100644 --- a/docker/base-image/agent_server/models.py +++ b/docker/base-image/agent_server/models.py @@ -261,3 +261,20 @@ class CredentialInjectResponse(BaseModel): """Response from credential injection""" status: str # "success" files_written: List[str] + + +class TokenReloadRequest(BaseModel): + """Request to hot-reload the subscription OAuth token (#1089). + + Surgical alternative to a container recreate: mutates the agent-server + process env so the NEXT claude subprocess uses the rotated token while + in-flight turns keep their already-inherited old token and finish. + """ + token: str # CLAUDE_CODE_OAUTH_TOKEN value to apply + remove_api_key: bool = False # also drop ANTHROPIC_API_KEY from env + + +class TokenReloadResponse(BaseModel): + """Response from a subscription token hot-reload""" + status: str # "success" + reloaded: bool diff --git a/docker/base-image/agent_server/routers/credentials.py b/docker/base-image/agent_server/routers/credentials.py index cb0a40db..04e770a3 100644 --- a/docker/base-image/agent_server/routers/credentials.py +++ b/docker/base-image/agent_server/routers/credentials.py @@ -15,6 +15,8 @@ CredentialReadResponse, CredentialInjectRequest, CredentialInjectResponse, + TokenReloadRequest, + TokenReloadResponse, ) from ..state import agent_state from ..services.trinity_mcp import inject_trinity_mcp_if_configured @@ -132,6 +134,60 @@ async def update_credentials(request: CredentialUpdateRequest): raise HTTPException(status_code=500, detail=f"Credential update failed: {str(e)}") +# Writable-layer override path (#1089). Deliberately NOT under /home/developer — +# that path is the persistent agent-{name}-workspace volume which +# `recreate_container_with_updated_config` preserves, so a token written there +# would survive a recreate and shadow the freshly-baked Config.Env (DB token). +# The writable layer instead survives a plain stop+start (same container) but is +# wiped on recreate (new container, fresh layer) — self-reconciling by Docker +# semantics, no marker logic needed. The directory is created + chowned to UID +# 1000 in the base-image Dockerfile (before the USER switch). +_TOKEN_OVERRIDE = Path("/var/lib/trinity/oauth-token") + + +@router.post("/api/credentials/reload-token", response_model=TokenReloadResponse) +async def reload_subscription_token(request: TokenReloadRequest): + """Hot-reload CLAUDE_CODE_OAUTH_TOKEN for the NEXT claude subprocess (#1089). + + Mutates the agent-server process env so the next `subprocess.Popen` for + `claude` inherits the rotated token; in-flight subprocesses keep their + already-inherited old token and finish. Also persists the token to the + writable-layer override so it survives a plain stop+start (fleet restart + bypasses `start_agent_internal`, which would otherwise revert to the old + Config.Env token — F2). + + Deliberately does NOT rewrite .env / .mcp.json or re-inject Trinity MCP: the + subscription token is not a .env credential, and the `/update` / `/inject` + endpoints destructively rewrite whole files. + """ + if not request.token: + raise HTTPException(status_code=400, detail="token is required") + + os.environ["CLAUDE_CODE_OAUTH_TOKEN"] = request.token + if request.remove_api_key: + os.environ.pop("ANTHROPIC_API_KEY", None) + + # Persist to the writable-layer override. Parent dir is created + chowned in + # the Dockerfile, so the agent (UID 1000) can write here. Create the file + # atomically with 0600 via os.open() rather than write_text()+chmod(): the + # latter creates the file under the process umask (typically 0644) and leaves + # it world-readable until the follow-up chmod, a brief but avoidable window. + # The mode arg only applies on *creation*, so also fchmod the fd — a + # pre-existing override (older write path / tampering) keeps its own perms + # through O_CREAT|O_TRUNC, and we must still force it back to 0600. + fd = os.open(_TOKEN_OVERRIDE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + with os.fdopen(fd, "w") as f: + os.fchmod(f.fileno(), 0o600) + f.write(request.token) + + # Add the new token to the log-redaction set (drops the old exact-match + # value; OAuth tokens stay caught by the sk-ant value regex regardless). + refresh_credential_values() + + logger.info("Hot-reloaded CLAUDE_CODE_OAUTH_TOKEN (next subprocess; in-flight turns unaffected)") + return TokenReloadResponse(status="success", reloaded=True) + + @router.get("/api/credentials/status") async def get_credentials_status(): """ diff --git a/docker/base-image/startup.sh b/docker/base-image/startup.sh index 59d0f7a0..4f566159 100644 --- a/docker/base-image/startup.sh +++ b/docker/base-image/startup.sh @@ -343,6 +343,20 @@ if [ -d "/config/mcp-servers" ]; then done fi +# === Rotated subscription token: durable override (#1089) === +# A hot-reload (POST /api/credentials/reload-token) persists the rotated +# CLAUDE_CODE_OAUTH_TOKEN to this writable-layer path so it survives a plain +# stop+start. The container's baked Config.Env still holds the OLD token and a +# fleet restart (ops.py) does a raw stop+start that bypasses start_agent_internal +# — so export the override (when present and non-empty) BEFORE launching the +# agent server, so the rotated token wins. The file is wiped on recreate (fresh +# writable layer), so a DB-driven recreate cleanly reverts to the freshly-baked +# Config.Env token — no marker logic needed. +if [ -s /var/lib/trinity/oauth-token ]; then + export CLAUDE_CODE_OAUTH_TOKEN="$(cat /var/lib/trinity/oauth-token)" + echo "Applied rotated subscription token from durable override" +fi + # Start Agent Web Server (self-contained UI) if [ "${ENABLE_AGENT_UI}" = "true" ]; then echo "Starting Agent Web UI on port ${AGENT_SERVER_PORT:-8000}..." diff --git a/docs/memory/architecture.md b/docs/memory/architecture.md index 264bd53f..c3314a65 100644 --- a/docs/memory/architecture.md +++ b/docs/memory/architecture.md @@ -276,12 +276,15 @@ Vector 0.43.1 (`timberio/vector:0.43.1-alpine`). Captures all container stdout/s **Internal server** `agent-server.py` (FastAPI, port 8000): - `/api/chat` - Claude Code execution (messages persisted to database) - `/health` - Health check. Returns `{status}` plus `active_tasks` (concurrent executions across `/api/chat` + `/api/task`), `last_task_at`, `consecutive_failures` (reset on success — consumed by the dispatch breaker #526 and fleet health #307) and the #333 `diagnostics` gauges (#1020). `mailbox_depth` intentionally NOT emitted — no agent-side mailbox until the actor model (#945); the backend derives queue depth from `CapacityManager`. Counters live in `agent_server/state.py`; backend reads them in `monitoring_service.py` with graceful defaults for older images. -- `/api/credentials/update` - Hot-reload credentials +- `/api/credentials/update` - Hot-reload credentials (rewrites `.env`/`.mcp.json`) +- `/api/credentials/reload-token` - Surgical subscription-token hot-reload (#1089): mutates the agent-server process `os.environ["CLAUDE_CODE_OAUTH_TOKEN"]` so the NEXT claude subprocess uses the rotated token while in-flight subprocesses keep theirs; persists to the writable-layer override `/var/lib/trinity/oauth-token` (0600). Does NOT touch `.env`/`.mcp.json`. See [Subscription Token Rotation](#subscription-token-rotation-via-hot-reload-1089) - `/api/chat/session` - Context window stats - `/api/files`, `/api/files/download` (100MB limit), `/api/files/mkdir` (workspace-confined, #37) The agent server also runs two loops: the 15-min git `auto_sync` heartbeat (see [Git Sync Health](#git-sync-health-389390)) and the 5s liveness heartbeat (see [Heartbeat Liveness](#heartbeat-liveness-reliability-004-307)). +**Durable subscription-token override (#1089):** `startup.sh` exports `CLAUDE_CODE_OAUTH_TOKEN` from `/var/lib/trinity/oauth-token` (when present, non-empty) **before** launching the agent server, so a token rotated via hot-reload survives a plain stop+start (a fleet restart via `routers/ops.py` does a raw `container_stop`+`container_start` that bypasses `start_agent_internal` and would otherwise revert to the baked `Config.Env` token). The path is deliberately on the writable layer, **not** under the persisted `/home/developer` volume: it survives `stop`→`start` (same container) but is wiped on recreate (fresh layer), so a DB-driven recreate cleanly re-bakes `Config.Env` from the DB and the stale override is gone — self-reconciling, no marker logic. Dir created+chowned to UID 1000 in the base-image Dockerfile. + **Template-supplied pre-check** (SCHED-COND-001, #454): if the template ships an executable `~/.trinity/pre-check`, the backend's internal endpoint `POST /api/internal/agents/{name}/pre-check` runs it via `docker exec` before a cron-triggered chat. Language-agnostic — interpreter selected by shebang. The hook's stdout becomes the chat message; empty stdout + exit 0 records a skipped execution (Claude never invoked). Uses the same `execute_command_in_container` primitive as `git_service.py`, `ssh_service.py`, and the agent terminal — no agent-server HTTP endpoint. **Persistent chat:** all chat messages auto-saved to SQLite (`chat_sessions`, `chat_messages`) with full observability (costs, context, tool calls, execution time); sessions survive container restarts/deletions; users see only their own messages (admins see all). @@ -367,6 +370,12 @@ agent:heartbeat:misses:{name} → STRING(int), ~60s TTL. Consecutive-miss counte Trigger-boundary dedup — policy in Architectural Invariant #18, table DDL under `idempotency_keys`. `services/idempotency_service.py` (key derivation + `begin`/`complete`/`fail`) over `db/idempotency.py`. The `(scope, key)` PRIMARY KEY is the atomic claim: `claim()` INSERTs an `in_flight` row; a concurrent loser catches `IntegrityError` and reads the surviving row — cross-process safe across uvicorn workers and the standalone scheduler (shared SQLite file). Lifecycle: `claim` → (`attach_execution`) → `complete` (stores `response_snapshot` for replay) or `release` (deletes the in_flight row so a failed attempt can retry; never deletes a `completed` row). Rows older than 24h are treated as expired and re-claimed; the cleanup service purges them (`idempotency_purge_expired`). Duplicates within 24h short-circuit with the original result + `X-Idempotent-Replay: true`; an in-flight duplicate returns 409. Fail-open — a key never blocks a real execution. +### Subscription Token Rotation via Hot-Reload (#1089) + +Rotating an agent's subscription token used to recreate the container, making "rotate a credential" and "kill every in-flight turn" the same operation (#1037). Token rotation now hot-reloads the running container; recreate is reserved for image/template/auth-**mode** changes (TARGET_ARCHITECTURE §Agent Runtime). The agent server authenticates Claude purely from `CLAUDE_CODE_OAUTH_TOKEN` (no `.credentials.json`) and is a single uvicorn worker, so mutating its process env makes the **next** subprocess use the new token while in-flight subprocesses finish on the old one. + +Backend orchestration in `services/subscription_auto_switch.py`: `_hot_reload_subscription_token(agent_name)` POSTs the agent's current DB token to the agent-server `POST /api/credentials/reload-token`, falling back to `_restart_agent` on a 404 (old base image), transport failure, or missing token (`no_container`/`not_running` short-circuit otherwise). Three producer paths converted, all under the #799 `agent_switch_lock`: **auto-switch** (`_perform_auto_switch`, SUB-003), **manual sub→sub reassignment** (`PUT /api/subscriptions/agents/{name}` — auth-mode changes none/api-key→sub still recreate), and **key rollover** (`reload_subscription_for_all_agents(sub_id)` fans a best-effort reload across every running agent on a re-registered subscription). Durable override (`/var/lib/trinity/oauth-token`) + `startup.sh` read make a rotation survive a plain restart — see the agent-server [Durable subscription-token override](#agent-containers) note. Agent-server endpoint mirroring follows Invariant #5. + ### Real-time Delivery (RELIABILITY-003, #306) **Transport** (`event_bus.py`): Redis Streams. `ConnectionManager`/`FilteredWebSocketManager` are thin shims that `XADD` to the MAXLEN-trimmed `trinity:events` stream; one `StreamDispatcher` per backend process runs `XREAD BLOCK` and fans out to registered clients, evicting a client after 3 consecutive delivery failures. New broadcast sites keep calling `manager.broadcast(...)` / `filtered_manager.broadcast_filtered(...)` — never publish to the stream directly (Invariant #10). diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md index 27c8b2d4..6bee3954 100644 --- a/docs/memory/feature-flows.md +++ b/docs/memory/feature-flows.md @@ -12,6 +12,7 @@ | Date | ID | Feature | Flow | |------|-----|---------|------| | 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) | +| 2026-06-13 | #1089 | feat: subscription token rotation via **hot-reload, not container recreate** — a dedicated agent-server `POST /api/credentials/reload-token` mutates the running container's `CLAUDE_CODE_OAUTH_TOKEN` env so the next claude subprocess uses the rotated token while in-flight turns finish (closes the #1037 collateral-kill class). Three producer paths converted under the #799 `agent_switch_lock`: auto-switch (SUB-003), manual sub→sub reassignment (auth-mode changes still recreate), and key-rollover fan-out on `POST /api/subscriptions` upsert. Durable writable-layer override (`/var/lib/trinity/oauth-token` + `startup.sh` read) survives a plain restart; recreate self-reconciles to the DB token. Falls back to the old `_restart_agent` recreate on a 404 (old base image). | [subscription-auto-switch.md](feature-flows/subscription-auto-switch.md) | | 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) | | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) | | 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) | diff --git a/docs/memory/feature-flows/subscription-auto-switch.md b/docs/memory/feature-flows/subscription-auto-switch.md index a1abe8fd..8e416a08 100644 --- a/docs/memory/feature-flows/subscription-auto-switch.md +++ b/docs/memory/feature-flows/subscription-auto-switch.md @@ -1,8 +1,8 @@ # Feature Flow: Subscription Auto-Switch (SUB-003) > **Requirement**: `docs/requirements/SUB-003-subscription-auto-switch.md` -> **Issue**: #153, threshold + scope update #441 -> **Status**: Implemented (2026-03-21), updated 2026-04-25 (#441) +> **Issue**: #153, threshold + scope update #441, hot-reload #1089 +> **Status**: Implemented (2026-03-21), updated 2026-04-25 (#441), 2026-06-13 (#1089 — switch hot-reloads instead of recreating the container) ## Overview @@ -34,11 +34,29 @@ Find best alternative subscription (fewest agents, not rate-limited in last 2h) ↓ No alternative? → return None (log warning) ↓ Found -Switch: DB update + container restart + log activity + send notification +Switch: DB update + token HOT-RELOAD (not container recreate, #1089) + log activity + send notification ↓ Return switch result → caller surfaces 429/503 with auto_switch info + retry hint ``` +## Token Application: Hot-Reload, not Recreate (#1089) + +The switch step (and the manual `PUT /api/subscriptions/agents/{name}` sub→sub +path, and the `POST /api/subscriptions` key-rollover upsert) applies the new +token via `_hot_reload_subscription_token(agent_name)` — a POST to the +agent-server `POST /api/credentials/reload-token` that mutates the running +container's `CLAUDE_CODE_OAUTH_TOKEN` env. The **next** Claude subprocess uses +the new token while **in-flight** turns finish on the old one, so a rotation no +longer kills every parallel execution (#1037). Falls back to the previous +`_restart_agent` recreate on a 404 (old base image), transport failure, or a +missing token. Durability across a plain restart is handled by the +`/var/lib/trinity/oauth-token` writable-layer override that `startup.sh` reads +before launching the agent server. The override is created **atomically at mode +`0600`** via `os.open(..., O_CREAT, 0o600)` — not `write_text()`+`chmod()`, which +would leave the token file briefly world-readable under the process umask between +create and chmod. Canonical home: architecture.md +§"Subscription Token Rotation via Hot-Reload". + ## Trigger Surface | Layer | Signal | Failure kind | @@ -69,7 +87,9 @@ import from `backend.services`. Keep the two in sync when editing either. | Router | `src/backend/routers/chat.py` | 429 interception in chat proxy + background tasks | | Frontend | `src/frontend/src/views/Settings.vue` | Toggle in Subscriptions section | | Tests | `tests/test_subscription_auto_switch.py` | Smoke tests | -| Tests | `tests/unit/test_subscription_auto_switch_pingpong.py` | Unit regression for #444 ping-pong prevention; `TestRateLimitAging` (#476) pins 2h-window correctness | +| Tests | `tests/unit/test_subscription_auto_switch_pingpong.py` | Unit regression for #444 ping-pong prevention; `TestRateLimitAging` (#476) pins 2h-window correctness; `TestHotReloadSwitch` + `TestKeyRolloverFanOut` (#1089) pin the hot-reload helper, auto-switch wire-in, and key-rollover fan-out | +| Tests | `tests/unit/test_subscription_reassign_hotreload.py` | #1089 — manual sub→sub hot-reload under the lock (no `container_stop`), mode-change still recreates, register/upsert key-rollover fan-out, and the admin-only gate on `register_subscription` (non-admin → 403 before any create or fan-out) | +| Tests | `tests/unit/test_reload_token_endpoint.py` | #1089 — agent-server `POST /api/credentials/reload-token`: sets env, atomically writes the `/var/lib/trinity/oauth-token` override at `0600`, no `.env` write, `remove_api_key` pops `ANTHROPIC_API_KEY`, empty token → 400 | | Tests | `tests/unit/test_subscription_auto_switch_no_cred_import.py` | Chain-level regression for #606 — pins `_restart_agent → start_agent_internal → inject_assigned_credentials` reaches the `lifecycle.py:155` `subscription_mode` short-circuit and never re-enters file-based credential import | | Tests | `tests/unit/test_iso_cutoff.py` | Format parity between `iso_cutoff(N)` and `utc_now_iso()` (#476) | | Util | `src/backend/utils/helpers.py::iso_cutoff` | Canonical cutoff helper for ISO-Z TEXT comparisons (#476) | @@ -137,5 +157,5 @@ anyway, so the omission was silent. - **All subscriptions exhausted**: No switch, error surfaces as normal 429/503. `_perform_auto_switch` does **not** clear rate-limit events for the old subscription — those events are the signal that keeps `is_subscription_rate_limited()` truthful, so the just-drained sub is not offered as a candidate on the next cycle (issue #444). - **API key agents**: Auto-switch only applies to subscription-based agents - **Flip-flopping** (#441 update): the 2h skip-list (`is_subscription_rate_limited` ∧ `select_best_alternative_subscription`) is now the only thrash guard. Pre-#441 the threshold also required 2 consecutive 429s before switching, but that gated user-visible failures unnecessarily — the skip-list alone is sufficient because a just-drained sub stays flagged for 2h post-switch. -- **Concurrent switches**: SQLite serialization prevents races +- **Concurrent switches** (#799/#1089): a per-agent `agent_switch_lock` serializes the assign+apply window so a manual `PUT /api/subscriptions/agents/{name}` reassignment can't interleave with a concurrent auto-switch. The `old_sub_id` snapshot is taken **inside** that lock, immediately before the DB assign — a concurrent switch therefore can't change the agent's subscription between the read and the assign (TOCTOU). Without this, a sub→sub swap could be mis-classified as an auth-mode change (or vice-versa) and routed into a needless container recreate instead of a hot-reload. - **Cleanup**: Records older than 24h are pruned hourly by `CleanupService` (phase 6, #476); the 2h "is rate-limited" window drives candidate filtering independently of cleanup diff --git a/docs/memory/requirements.md b/docs/memory/requirements.md index 78d44834..20e741f9 100644 --- a/docs/memory/requirements.md +++ b/docs/memory/requirements.md @@ -1560,6 +1560,7 @@ All subsections 18.1–18.10 were deleted with the code. Flow docs archived at ` - `src/backend/routers/chat.py` - 429 interception hooks - `src/frontend/src/views/Settings.vue` - Toggle UI - **Negative markers on `is_auth_failure` (#904, 2026-05-21)**: substring match on `AUTH_INDICATORS` now short-circuits to False when the error message also contains an unambiguous signal-kill / OOM / timeout marker (`sigkill`, `sigterm`, `sigint`, `exit code -9`, `exit code 137`, `exit code 143`, `out of memory`, `oom`, `memory cgroup`, `terminated by`, `killed by`). Prevents the SUB-003 trigger from firing on cgroup OOM kills whose detail string happens to contain a word like "token" or "authentication" via downstream wrapping. The same exclusion list lives in `src/scheduler/service.py:_is_auth_failure` to keep the two surfaces from drifting (see §10.4.1). +- **Hot-reload, not recreate (#1089, 2026-06-13)**: the auto-switch no longer recreates the container — `_perform_auto_switch` hot-reloads the new token in place so in-flight turns on the agent survive. See §20.6. ### 20.5 Per-Subscription Usage Tracking (SUB-004) - **Status**: ✅ Implemented (2026-04-01) @@ -1580,6 +1581,32 @@ All subsections 18.1–18.10 were deleted with the code. Flow docs archived at ` - `src/backend/db/chat.py` - Session creation with subscription_id - `src/frontend/src/views/Settings.vue` - Usage display (if applicable) +### 20.6 Credential Rotation via Hot-Reload, not Container Recreate (#1089) +- **Status**: ✅ Implemented (2026-06-13) +- **GitHub Issue**: #1089 +- **Extends**: SUB-002 / SUB-003 +- **Priority**: HIGH (`theme-reliability`) +- **Builds on**: #799 (per-agent `agent_switch_lock`) +- **Description**: Rotating an agent's subscription token used to **recreate the container**, making "rotate a credential" and "kill every in-flight turn" the same operation (#1037 collateral kills — one 429 on a shared subscription would auto-switch and destroy every parallel execution). Token rotation now goes through a surgical hot-reload of the running container; recreate is reserved for image/template/auth-**mode** changes. This removes the credential↔execution collision class structurally (TARGET_ARCHITECTURE §Agent Runtime). +- **Mechanism**: the agent server spawns Claude via `subprocess.Popen(..., env={**os.environ, ...})` and authenticates purely from the `CLAUDE_CODE_OAUTH_TOKEN` env var (no `.credentials.json` write); it is a single uvicorn worker. Mutating the agent-server process `os.environ["CLAUDE_CODE_OAUTH_TOKEN"]` makes the **next** Claude subprocess use the new token; **in-flight** subprocesses keep their already-inherited old token and finish. +- **Rotation paths converted to hot-reload**: + 1. **Auto-switch** (SUB-003): `_perform_auto_switch` hot-reloads instead of `_restart_agent` (runs inside the #799 `agent_switch_lock`). + 2. **Manual reassignment** (`PUT /api/subscriptions/agents/{name}`): a sub→sub swap hot-reloads under the lock; an auth-**mode** change (none/api-key → subscription) still recreates so `ANTHROPIC_API_KEY` is dropped and the OAuth token is baked into `Config.Env`. + 3. **Key rollover** (`POST /api/subscriptions` upsert): re-registering a subscription's token fans a best-effort hot-reload out to every running agent on that subscription (one agent's failure never fails the upsert nor blocks the others). +- **Key Features**: + - Agent-server endpoint `POST /api/credentials/reload-token` (`{token, remove_api_key}`) — mutates `os.environ` + persists the token to the writable-layer override; does **not** rewrite `.env`/`.mcp.json` or re-inject Trinity MCP. + - **Durable override (F2)**: the token is written to `/var/lib/trinity/oauth-token` (0600), deliberately **not** under `/home/developer` (the persisted workspace volume). `startup.sh` exports it before launching the agent server, so a plain fleet restart (`ops.py` raw stop+start, which bypasses `start_agent_internal`) keeps the rotated token. **Self-reconciling by Docker semantics**: the writable layer survives `stop`→`start` but is wiped on recreate (fresh layer), so a DB-driven recreate re-bakes `Config.Env` (DB token) and the stale override is gone — no marker logic. + - **Back-compat fallback**: running containers on an older base image return **404** for the endpoint → the backend falls back to `_restart_agent` (identical to pre-#1089 behavior). Per #1037, recreate stays out of scope; the fallback inherits whatever #1037 lands. An agent only gains the endpoint once recreated onto a rebuilt base image (no automatic fleet-wide adoption). +- **Backend helpers** (`services/subscription_auto_switch.py`): `_hot_reload_subscription_token(agent_name)` (POST + restart fallback on 404/transport/no-token; `no_container`/`not_running` short-circuits) and `reload_subscription_for_all_agents(subscription_id)` (key-rollover fan-out under the lock). +- **Files**: + - `docker/base-image/agent_server/routers/credentials.py` - `reload-token` endpoint + writable-layer override write + - `docker/base-image/agent_server/models.py` - `TokenReloadRequest`/`TokenReloadResponse` + - `docker/base-image/Dockerfile` - `mkdir+chown /var/lib/trinity` (Invariant #17 non-root) + - `docker/base-image/startup.sh` - export override token before agent-server launch + - `src/backend/services/subscription_auto_switch.py` - hot-reload helper + fan-out + auto-switch wire-in + - `src/backend/routers/subscriptions.py` - manual sub→sub under lock + key-rollover fan-out +- **Known limitations**: cross-worker race on the process-local `agent_switch_lock` (prod `--workers 2`) is flagged for #1166/#799 (escalate to Redis `SETNX`); a bulk `delete_subscription` still leaves the deleted token live until next start (pre-existing, out of scope). Both self-heal via the durable override / `check_api_key_env_matches` reconciliation. + --- ## Non-Functional Requirements diff --git a/docs/planning/TARGET_ARCHITECTURE.md b/docs/planning/TARGET_ARCHITECTURE.md index 7eb107f6..07637faf 100644 --- a/docs/planning/TARGET_ARCHITECTURE.md +++ b/docs/planning/TARGET_ARCHITECTURE.md @@ -275,7 +275,7 @@ When a single container's `max_parallel_tasks` ceiling — bounded by container **Post-execution hooks**: companion to the existing pre-check hook. `~/.trinity/post-check` runs after every task completion (language-agnostic, shebang-selected). Enables custom alerting, output validation, or state transitions defined by the agent template. -**Credential rotation via hot-reload, not recreate**: rotating an agent's token (subscription auto-switch, key rollover) goes through the existing `/api/credentials/update` hot-reload endpoint and does **not** recreate the container. "Rotate a credential" and "kill every in-flight turn" must stop being the same operation (#1037) — container recreate is reserved for image/template changes, where the pre-recreate "stop pulling, finish in-flight" handshake applies. This removes the credential↔execution collision class structurally instead of recovering from it after the fact. +**Credential rotation via hot-reload, not recreate** ✅ *Implemented (#1089, 2026-06-13)*: rotating an agent's subscription token (auto-switch, manual sub→sub reassignment, key rollover) goes through a dedicated agent-server `POST /api/credentials/reload-token` endpoint and does **not** recreate the container. (A new surgical endpoint, not the existing `/api/credentials/update` — the latter destructively rewrites `.env`/`.mcp.json`; the token is an env-only credential.) "Rotate a credential" and "kill every in-flight turn" are no longer the same operation (#1037) — container recreate is reserved for image/template/auth-**mode** changes, where the pre-recreate "stop pulling, finish in-flight" handshake applies. A writable-layer durable override (`/var/lib/trinity/oauth-token`, read by `startup.sh`) keeps a rotation across a plain restart; recreate self-reconciles to the DB token (fresh layer wipes the override). This removes the credential↔execution collision class structurally instead of recovering from it after the fact. See architecture.md §"Subscription Token Rotation via Hot-Reload". --- diff --git a/src/backend/routers/subscriptions.py b/src/backend/routers/subscriptions.py index 4b673dde..263359bb 100644 --- a/src/backend/routers/subscriptions.py +++ b/src/backend/routers/subscriptions.py @@ -89,6 +89,21 @@ async def register_subscription( ) logger.info(f"Registered subscription '{request.name}' by {current_user.username}") + + # #1089 (F1): a re-register (upsert) is a key rollover — fan a best-effort + # hot-reload out to every running agent on this subscription so they pick + # up the new token without a recreate. Swallowed on failure: the fan-out + # must never fail the registration. (No-op on first registration — no + # agents are assigned yet.) + try: + from services.subscription_auto_switch import reload_subscription_for_all_agents + await reload_subscription_for_all_agents(subscription.id) + except Exception as e: + logger.error( + f"[#1089] key-rollover hot-reload fan-out failed for " + f"subscription '{request.name}': {e}" + ) + return subscription except HTTPException: @@ -228,9 +243,15 @@ async def assign_subscription_to_agent( """ Assign a subscription to an agent. - Owner access required. If the agent is running, it will be restarted - so the container is recreated with `CLAUDE_CODE_OAUTH_TOKEN` env var - and `ANTHROPIC_API_KEY` removed. + Owner access required. The token is applied to a running agent based on the + kind of change (#1089): + - sub → sub swap: hot-reloaded in place via /api/credentials/reload-token, + so in-flight turns survive (no container recreate); + - none/api-key → subscription (an auth-MODE change): the container is + recreated so `ANTHROPIC_API_KEY` is dropped and `CLAUDE_CODE_OAUTH_TOKEN` + is baked into Config.Env. + Both run under the #799 per-agent switch lock so a manual reassignment can't + interleave with a concurrent auto-switch on the same agent. """ # Owner or admin only — shared users must not mutate subscription assignments if not db.can_user_share_agent(current_user.username, agent_name): @@ -242,42 +263,66 @@ async def assign_subscription_to_agent( raise HTTPException(status_code=404, detail=f"Subscription '{subscription_name}' not found") try: - db.assign_subscription_to_agent(agent_name, subscription.id) - - logger.info( - f"Assigned subscription '{subscription_name}' to agent '{agent_name}' " - f"by {current_user.username}" + from services.subscription_auto_switch import ( + agent_switch_lock, + _hot_reload_subscription_token, ) - # If agent is running, restart it so the container is recreated - # with CLAUDE_CODE_OAUTH_TOKEN env var and without ANTHROPIC_API_KEY - from services.docker_service import get_agent_container, get_agent_status_from_container - from services.docker_utils import container_stop - from services.agent_service import start_agent_internal - - container = get_agent_container(agent_name) - restart_result = None - injection_result = None - - if container: - agent_status = get_agent_status_from_container(container) - if agent_status.status == "running": - try: - await container_stop(container) - await start_agent_internal(agent_name) - restart_result = "success" - injection_result = {"status": "success"} - logger.info( - f"Restarted agent '{agent_name}' to apply subscription token" - ) - except Exception as e: - logger.error(f"Failed to restart agent '{agent_name}' for subscription: {e}") - restart_result = f"failed: {e}" - injection_result = {"status": "failed", "error": str(e)} + # #799/#1089: serialize the assign + apply window per agent so a manual + # reassignment can't interleave with a concurrent auto-switch. + async with await agent_switch_lock(agent_name): + # #1089: snapshot the agent's CURRENT subscription under the lock, + # before reassigning, so a concurrent auto-switch can't change it + # between the read and the assign (TOCTOU). A sub→sub swap + # (old_sub_id is not None) hot-reloads the token in place; an + # auth-mode change (old_sub_id is None) still needs the container + # recreated. + old_sub_id = db.get_agent_subscription_id(agent_name) + db.assign_subscription_to_agent(agent_name, subscription.id) + + logger.info( + f"Assigned subscription '{subscription_name}' to agent '{agent_name}' " + f"by {current_user.username}" + ) + + restart_result = None + injection_result = None + + if old_sub_id is not None: + # sub → sub: hot-reload the token without recreating the container. + # The helper itself short-circuits ("not_running"/"no_container") + # for stopped agents and falls back to a recreate on 404 / transport + # failure / missing token. + restart_result = await _hot_reload_subscription_token(agent_name) + injection_result = {"status": restart_result} else: - injection_result = {"status": "agent_not_running"} - else: - injection_result = {"status": "agent_not_running"} + # none/api-key → subscription: an auth-MODE change still requires a + # recreate so ANTHROPIC_API_KEY is dropped and the OAuth token is + # baked into Config.Env. + from services.docker_service import get_agent_container, get_agent_status_from_container + from services.docker_utils import container_stop + from services.agent_service import start_agent_internal + + container = get_agent_container(agent_name) + if container: + agent_status = get_agent_status_from_container(container) + if agent_status.status == "running": + try: + await container_stop(container) + await start_agent_internal(agent_name) + restart_result = "success" + injection_result = {"status": "success"} + logger.info( + f"Restarted agent '{agent_name}' to apply subscription token" + ) + except Exception as e: + logger.error(f"Failed to restart agent '{agent_name}' for subscription: {e}") + restart_result = f"failed: {e}" + injection_result = {"status": "failed", "error": str(e)} + else: + injection_result = {"status": "agent_not_running"} + else: + injection_result = {"status": "agent_not_running"} return { "success": True, diff --git a/src/backend/services/subscription_auto_switch.py b/src/backend/services/subscription_auto_switch.py index 7ab05767..41abd1ac 100644 --- a/src/backend/services/subscription_auto_switch.py +++ b/src/backend/services/subscription_auto_switch.py @@ -270,8 +270,11 @@ async def _perform_auto_switch( # utils/helpers.py, issue #476) and the 24h cleanup in # services/cleanup_service.py removes them from disk. - # Restart agent container to apply new subscription token - restart_result = await _restart_agent(agent_name) + # Rotate the subscription token on the running container via hot-reload so + # in-flight turns survive the switch (#1089). Falls back to a full restart on + # a 404 (old base image without the endpoint), transport failure, or when no + # token is resolvable — identical to the previous recreate behavior. + restart_result = await _hot_reload_subscription_token(agent_name) # Log activity event from services.activity_service import activity_service @@ -355,3 +358,98 @@ async def _restart_agent(agent_name: str) -> str: except Exception as e: logger.error(f"[SUB-003] Failed to restart agent '{agent_name}': {e}") return f"failed: {e}" + + +async def _hot_reload_subscription_token(agent_name: str) -> str: + """Push the agent's current DB subscription token to the running container + via ``POST /api/credentials/reload-token`` (#1089). + + The agent server mutates its own ``os.environ["CLAUDE_CODE_OAUTH_TOKEN"]``, + so the NEXT claude subprocess uses the rotated token while in-flight turns + keep their already-inherited old token and finish — "rotate a credential" + is no longer the same operation as "kill every running turn". + + Falls back to the full ``_restart_agent`` recreate path (today's behavior, + no regression) on: + - a 404 — an old base image that predates the endpoint, + - any transport / circuit failure (``AgentClientError`` family), or + - no resolvable token for the agent's current subscription. + Returns ``"no_container"`` / ``"not_running"`` when the agent is not a + running container, mirroring ``_restart_agent``. + """ + try: + from services.docker_service import ( + get_agent_container, + get_agent_status_from_container, + ) + from services.agent_client import get_agent_client, AgentClientError + + container = get_agent_container(agent_name) + if not container: + return "no_container" + if get_agent_status_from_container(container).status != "running": + return "not_running" + + sub_id = db.get_agent_subscription_id(agent_name) + token = db.get_subscription_token(sub_id) if sub_id else None + if not token: + # No token to push (e.g. assignment cleared mid-flight). Fall back to + # the recreate path, which re-bakes Config.Env from the DB. + return await _restart_agent(agent_name) + + client = get_agent_client(agent_name) + try: + # remove_api_key=False is intentional: subscription-backed agents never + # carry ANTHROPIC_API_KEY in env (popped at create time, lifecycle.py). + # The param is kept for a future mode-change-via-hot-reload caller. + resp = await client.post( + "/api/credentials/reload-token", + json={"token": token, "remove_api_key": False}, + timeout=10.0, + ) + except AgentClientError as e: + logger.warning( + f"[SUB-003] hot-reload transport failure for '{agent_name}': {e}; " + f"falling back to restart" + ) + return await _restart_agent(agent_name) + + if resp.status_code >= 400: # 404 = old base image without the endpoint + logger.info( + f"[SUB-003] hot-reload returned HTTP {resp.status_code} for " + f"'{agent_name}'; falling back to restart" + ) + return await _restart_agent(agent_name) + + logger.info(f"[SUB-003] Hot-reloaded subscription token for '{agent_name}' (no recreate)") + return "hot_reloaded" + except Exception as e: + logger.error( + f"[SUB-003] hot-reload error for '{agent_name}': {e}; falling back to restart" + ) + return await _restart_agent(agent_name) + + +async def reload_subscription_for_all_agents(subscription_id: str) -> dict[str, str]: + """Hot-reload the subscription token on every running agent assigned to + `subscription_id` (#1089 key rollover — re-registering a subscription's + token via the `/api/subscriptions` upsert). + + Best-effort per agent, each under the #799 per-agent switch lock so a + rollout can't interleave with a concurrent auto-switch: a failure on one + agent is logged and does NOT abort the fan-out or block the others. Stopped + agents are skipped by the helper (`not_running`) — they pick up the new + token on next start (Config.Env is re-baked from the DB on recreate). + Returns ``{agent_name: result}`` for observability. + """ + results: dict[str, str] = {} + for agent_name in db.get_agents_by_subscription(subscription_id): + try: + async with await agent_switch_lock(agent_name): + results[agent_name] = await _hot_reload_subscription_token(agent_name) + except Exception as e: + logger.error( + f"[SUB-003] key-rollover hot-reload failed for '{agent_name}': {e}" + ) + results[agent_name] = f"failed: {e}" + return results diff --git a/tests/unit/test_reload_token_endpoint.py b/tests/unit/test_reload_token_endpoint.py new file mode 100644 index 00000000..22fa8368 --- /dev/null +++ b/tests/unit/test_reload_token_endpoint.py @@ -0,0 +1,140 @@ +""" +Unit tests for the agent-server hot-reload-token endpoint (#1089). + +``POST /api/credentials/reload-token`` mutates the live agent-server process +env so the NEXT claude subprocess uses the rotated subscription token (in-flight +turns keep their already-inherited old token and finish), and persists the token +to the writable-layer override (``/var/lib/trinity/oauth-token``, 0600) so it +survives a plain stop+start (F2 durability). It must NOT rewrite ``.env`` / +``.mcp.json`` or re-inject Trinity MCP — those are the destructive whole-file +flows owned by ``/api/credentials/update`` and ``/api/credentials/inject``. + +Module: docker/base-image/agent_server/routers/credentials.py + +`agent_server` is registered as a namespace package by tests/unit/conftest.py +(``_preload_real_agent_server``), so the real base-image router imports directly. +""" + +import os +import stat + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from agent_server.routers import credentials as cred_router + + +@pytest.fixture +def client(tmp_path, monkeypatch): + """TestClient over the credentials router with the writable-layer override + redirected to a tmp path (the host has no /var/lib/trinity), the sanitizer + refresh stubbed (don't read the host ~/.env), and the MCP re-inject spied so + we can assert the destructive whole-file flow is never triggered.""" + override = tmp_path / "oauth-token" + monkeypatch.setattr(cred_router, "_TOKEN_OVERRIDE", override) + + refresh_calls: list[int] = [] + monkeypatch.setattr( + cred_router, "refresh_credential_values", lambda: refresh_calls.append(1) + ) + inject_calls: list[int] = [] + monkeypatch.setattr( + cred_router, + "inject_trinity_mcp_if_configured", + lambda: (inject_calls.append(1), False)[1], + ) + + # The endpoint mutates os.environ directly (not via monkeypatch); snapshot + # the two keys it touches and restore them so nothing leaks across tests. + saved = {k: os.environ.get(k) for k in ("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY")} + + app = FastAPI() + app.include_router(cred_router.router) + c = TestClient(app) + c._override = override # type: ignore[attr-defined] + c._refresh_calls = refresh_calls # type: ignore[attr-defined] + c._inject_calls = inject_calls # type: ignore[attr-defined] + try: + yield c + finally: + for k, v in saved.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + +def test_reload_sets_env_and_writes_durable_override(client): + """Happy path: env mutated for the next subprocess + durable override + written 0600 + sanitizer refreshed + NO destructive MCP re-inject.""" + os.environ.pop("CLAUDE_CODE_OAUTH_TOKEN", None) + + resp = client.post( + "/api/credentials/reload-token", json={"token": "sk-ant-oat01-rotated"} + ) + + assert resp.status_code == 200 + assert resp.json() == {"status": "success", "reloaded": True} + # env mutated so the NEXT claude subprocess inherits the new token + assert os.environ["CLAUDE_CODE_OAUTH_TOKEN"] == "sk-ant-oat01-rotated" + # durable override written (survives a plain stop+start) with 0600 perms + assert client._override.read_text() == "sk-ant-oat01-rotated" + assert stat.S_IMODE(client._override.stat().st_mode) == 0o600 + # sanitizer redaction set refreshed; the whole-file MCP re-inject NOT done + assert client._refresh_calls == [1] + assert client._inject_calls == [] + + +def test_override_retightened_when_preexisting_world_readable(client): + """#1089 hardening: if the override already exists with loose perms (e.g. + 0644 left by an older write path or tampering), a reload re-tightens it to + 0600. ``os.open(..., 0o600)`` only applies its mode on *creation* — for an + existing file the mode arg is ignored — so the atomic create is paired with + an fchmod to enforce 0600 on the existing fd too (the old write_text()+chmod() + always re-tightened; the os.open() refinement must not silently lose that).""" + client._override.write_text("stale-token") + client._override.chmod(0o644) + + resp = client.post( + "/api/credentials/reload-token", json={"token": "sk-ant-oat01-retighten"} + ) + + assert resp.status_code == 200 + assert client._override.read_text() == "sk-ant-oat01-retighten" + assert stat.S_IMODE(client._override.stat().st_mode) == 0o600 + + +def test_reload_does_not_write_env_or_other_files(client): + """The endpoint writes ONLY the override — no sibling .env / .mcp.json + (proves it is not reusing the destructive /update or /inject flow).""" + client.post("/api/credentials/reload-token", json={"token": "tok"}) + + siblings = {p.name for p in client._override.parent.iterdir()} + assert siblings == {"oauth-token"} + + +def test_remove_api_key_true_pops_anthropic_key(client, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api-should-go") + + resp = client.post( + "/api/credentials/reload-token", + json={"token": "tok", "remove_api_key": True}, + ) + + assert resp.status_code == 200 + assert "ANTHROPIC_API_KEY" not in os.environ + + +def test_remove_api_key_defaults_false_preserves_key(client, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api-stays") + + resp = client.post("/api/credentials/reload-token", json={"token": "tok"}) + + assert resp.status_code == 200 + assert os.environ["ANTHROPIC_API_KEY"] == "sk-ant-api-stays" + + +def test_empty_token_returns_400(client): + resp = client.post("/api/credentials/reload-token", json={"token": ""}) + assert resp.status_code == 400 diff --git a/tests/unit/test_subscription_auto_switch_pingpong.py b/tests/unit/test_subscription_auto_switch_pingpong.py index 649aa2af..7436767e 100644 --- a/tests/unit/test_subscription_auto_switch_pingpong.py +++ b/tests/unit/test_subscription_auto_switch_pingpong.py @@ -534,3 +534,249 @@ async def test_no_switch_when_agent_has_no_subscription(self, svc): ) assert result is None assert svc._spy_calls == [] + + +# ============================================================================= +# #1089: hot-reload subscription token (rotate without recreating the container) +# ============================================================================= +# +# `_hot_reload_subscription_token` pushes the agent's current DB subscription +# token to the running container via POST /api/credentials/reload-token, so the +# NEXT claude subprocess uses the new token while in-flight turns keep their +# already-inherited old token and finish. It falls back to the full +# `_restart_agent` path (today's behavior) on: +# - transport failure (AgentClientError / AgentNotReachableError), +# - HTTP >= 400 (a 404 means an old base image without the endpoint), or +# - no resolvable token. +# Early-returns `no_container` / `not_running` exactly like `_restart_agent`. + +import asyncio # noqa: E402 (used by the hot-reload + key-rollover tests below) +import types as _types # noqa: E402 (module-level helpers for the tests below) +from unittest.mock import AsyncMock # noqa: E402 + + +def _docker_stub(*, container: object = object(), status: str = "running"): + """Fake `services.docker_service` exposing the two helper lookups.""" + mod = _types.ModuleType("services.docker_service") + mod.get_agent_container = lambda name: container + _status = _types.SimpleNamespace(status=status) + mod.get_agent_status_from_container = lambda c: _status + return mod + + +class _StubAgentClientError(Exception): + pass + + +class _StubAgentNotReachableError(_StubAgentClientError): + pass + + +def _agent_client_stub(*, post): + """Fake `services.agent_client`. `post` is bound to `client.post` + (an AsyncMock or coroutine fn). `AgentClientError` is the base the helper + catches; `AgentNotReachableError` subclasses it (transport-failure case).""" + mod = _types.ModuleType("services.agent_client") + mod.AgentClientError = _StubAgentClientError + mod.AgentNotReachableError = _StubAgentNotReachableError + client = _types.SimpleNamespace(post=post) + mod.get_agent_client = lambda name: client + return mod + + +class TestHotReloadSwitch: + """#1089 — the auto-switch path hot-reloads the token instead of recreating + the container; it falls back to restart on 404 / transport error / no token, + and short-circuits when the agent isn't a running container.""" + + @pytest.fixture + def auto_switch(self, monkeypatch): + import importlib + + stub_db = _install_database_stub() + # Token resolution defaults: agent on sub-a, sub-a token present. + stub_db.get_agent_subscription_id.return_value = "sub-a" + stub_db.get_subscription_token.return_value = "sk-ant-oat01-new-token" + + import services.subscription_auto_switch as mod + importlib.reload(mod) + + # Spy the fallback so we can assert when it IS / IS NOT taken. + restart_calls: list[str] = [] + + async def _restart_spy(agent_name): + restart_calls.append(agent_name) + return "restarted_fallback" + + monkeypatch.setattr(mod, "_restart_agent", _restart_spy) + mod._restart_calls = restart_calls # type: ignore[attr-defined] + mod._stub_db = stub_db # type: ignore[attr-defined] + return mod + + @pytest.mark.asyncio + async def test_happy_path_posts_reload_token_no_recreate(self, auto_switch, monkeypatch): + post = AsyncMock(return_value=_types.SimpleNamespace(status_code=200)) + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub()) + monkeypatch.setitem(sys.modules, "services.agent_client", _agent_client_stub(post=post)) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "hot_reloaded" + assert auto_switch._restart_calls == [] # NO container recreate — in-flight turns survive + post.assert_awaited_once() + args, kwargs = post.call_args + assert args[0] == "/api/credentials/reload-token" + assert kwargs["json"] == {"token": "sk-ant-oat01-new-token", "remove_api_key": False} + + @pytest.mark.asyncio + async def test_falls_back_to_restart_on_404(self, auto_switch, monkeypatch): + """404 = old base image without the endpoint → restart fallback.""" + post = AsyncMock(return_value=_types.SimpleNamespace(status_code=404)) + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub()) + monkeypatch.setitem(sys.modules, "services.agent_client", _agent_client_stub(post=post)) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "restarted_fallback" + assert auto_switch._restart_calls == ["agent-x"] + + @pytest.mark.asyncio + async def test_falls_back_to_restart_on_transport_error(self, auto_switch, monkeypatch): + post = AsyncMock(side_effect=_StubAgentNotReachableError("connection refused")) + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub()) + monkeypatch.setitem(sys.modules, "services.agent_client", _agent_client_stub(post=post)) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "restarted_fallback" + assert auto_switch._restart_calls == ["agent-x"] + + @pytest.mark.asyncio + async def test_falls_back_to_restart_when_no_token(self, auto_switch, monkeypatch): + auto_switch._stub_db.get_subscription_token.return_value = None + post = AsyncMock() + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub()) + monkeypatch.setitem(sys.modules, "services.agent_client", _agent_client_stub(post=post)) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "restarted_fallback" + assert auto_switch._restart_calls == ["agent-x"] + post.assert_not_awaited() # never reached the POST + + @pytest.mark.asyncio + async def test_no_container_short_circuits(self, auto_switch, monkeypatch): + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub(container=None)) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "no_container" + assert auto_switch._restart_calls == [] + + @pytest.mark.asyncio + async def test_not_running_short_circuits(self, auto_switch, monkeypatch): + monkeypatch.setitem(sys.modules, "services.docker_service", _docker_stub(status="stopped")) + + result = await auto_switch._hot_reload_subscription_token("agent-x") + + assert result == "not_running" + assert auto_switch._restart_calls == [] + + @pytest.mark.asyncio + async def test_perform_auto_switch_hot_reloads_not_restarts(self, auto_switch, monkeypatch): + """The auto-switch wire-in: `_perform_auto_switch` routes through the + hot-reload helper, so `restart_result == "hot_reloaded"` and the + recreate path (`_restart_agent`) is never taken.""" + # Stub the heavy local-import targets in `_perform_auto_switch`. + act_mod = _types.ModuleType("services.activity_service") + act_svc = MagicMock() + act_svc.track_activity = AsyncMock(return_value="act-1") + act_svc.complete_activity = AsyncMock(return_value=None) + act_mod.activity_service = act_svc + monkeypatch.setitem(sys.modules, "services.activity_service", act_mod) + + models_mod = _types.ModuleType("models") + models_mod.ActivityType = _types.SimpleNamespace(SCHEDULE_END="schedule_end") + models_mod.ActivityState = _types.SimpleNamespace(COMPLETED="completed", FAILED="failed") + monkeypatch.setitem(sys.modules, "models", models_mod) + + hot_calls: list[str] = [] + + async def _hot_spy(agent_name): + hot_calls.append(agent_name) + return "hot_reloaded" + + monkeypatch.setattr(auto_switch, "_hot_reload_subscription_token", _hot_spy) + + new_sub = MagicMock() + new_sub.id = "sub-b" + new_sub.name = "sub-B" + + result = await auto_switch._perform_auto_switch( + agent_name="agent-x", + old_subscription_id="sub-a", + old_subscription_name="sub-A", + new_subscription=new_sub, + failure_kind="rate_limit", + event_count=1, + ) + + assert result["switched"] is True + assert result["restart_result"] == "hot_reloaded" + assert hot_calls == ["agent-x"] # hot-reload used + assert auto_switch._restart_calls == [] # recreate path NOT taken + + +class TestKeyRolloverFanOut: + """#1089 (F1) — re-registering a subscription's token fans a best-effort + hot-reload out to every running agent on that subscription. One agent's + failure must not abort the fan-out nor block the others.""" + + @pytest.fixture + def auto_switch(self, monkeypatch): + import importlib + + stub_db = _install_database_stub() + import services.subscription_auto_switch as mod + importlib.reload(mod) + mod._stub_db = stub_db # type: ignore[attr-defined] + return mod + + @pytest.mark.asyncio + async def test_fan_out_attempts_every_agent_despite_one_failure(self, auto_switch, monkeypatch): + auto_switch._stub_db.get_agents_by_subscription.return_value = ["a1", "a2", "a3"] + + seen: list[str] = [] + + async def _hot(name): + seen.append(name) + if name == "a2": + raise RuntimeError("boom") + return "hot_reloaded" + + monkeypatch.setattr(auto_switch, "_hot_reload_subscription_token", _hot) + + async def _lock(name): + return asyncio.Lock() + + monkeypatch.setattr(auto_switch, "agent_switch_lock", _lock) + + results = await auto_switch.reload_subscription_for_all_agents("sub-a") + + assert seen == ["a1", "a2", "a3"] # all attempted, fan-out not aborted + assert results["a1"] == "hot_reloaded" + assert results["a3"] == "hot_reloaded" + assert results["a2"].startswith("failed:") + + @pytest.mark.asyncio + async def test_fan_out_no_agents_is_noop(self, auto_switch, monkeypatch): + auto_switch._stub_db.get_agents_by_subscription.return_value = [] + + async def _hot(name): + raise AssertionError("must not be called when no agents are assigned") + + monkeypatch.setattr(auto_switch, "_hot_reload_subscription_token", _hot) + + results = await auto_switch.reload_subscription_for_all_agents("sub-a") + + assert results == {} diff --git a/tests/unit/test_subscription_reassign_hotreload.py b/tests/unit/test_subscription_reassign_hotreload.py new file mode 100644 index 00000000..585c9849 --- /dev/null +++ b/tests/unit/test_subscription_reassign_hotreload.py @@ -0,0 +1,319 @@ +""" +Unit tests for subscription hot-reload on the router paths (#1089). + +Two producer paths beyond the SUB-003 auto-switch: + - Manual reassignment (PUT /api/subscriptions/agents/{name}, T4): a sub→sub + swap hot-reloads the token in place (in-flight turns survive, no + container_stop) under the #799 per-agent lock; an auth-MODE change + (none/api-key → subscription) still recreates so ANTHROPIC_API_KEY is + dropped and the OAuth token is baked into Config.Env. + - Key rollover (POST /api/subscriptions upsert, T5): re-registering a + subscription's token fans a best-effort hot-reload out to every running + agent on that subscription; one agent's failure never fails the upsert nor + blocks the others. + +Module: src/backend/routers/subscriptions.py + src/backend/services/subscription_auto_switch.py + +NOTE: `routers.subscriptions` (and `db_models`) are imported lazily inside the +fixtures rather than at module top. tests/unit/test_subscription_auto_switch_pingpong.py +pops `utils` from sys.modules at collection time; importing the full `routers` +package (which transitively needs `utils.url_validation`) at MODULE TOP here +would then fail collection if that file is collected first. The conftest's +autouse `_restore_unit_sys_modules` restores `utils` before each test runs, so +a fixture-time (test-run) import is safe regardless of collection order. +""" + +import asyncio +import sys +import types +from unittest.mock import MagicMock + +import pytest + + +def _live_auto_switch(): + """Return the live ``services.subscription_auto_switch`` object the endpoint + actually calls into, resolved via ``sys.modules`` — NOT the ``services`` + package attribute. + + ``routers/subscriptions.py`` reaches the hot-reload helpers through a + function-local ``from services.subscription_auto_switch import (...)``, which + binds from ``sys.modules["services.subscription_auto_switch"]``. A plain + ``import services.subscription_auto_switch as x`` instead binds ``x`` from the + ``services`` *package attribute*. The conftest's autouse #762 fixture restores + ``sys.modules["services"]`` before/after every test, and under some + pytest-randomly orderings (seed 99999) the package attribute and the + ``sys.modules`` submodule entry drift to two different module objects. Patching + the package-attribute object then misses the one the endpoint calls, so the + real helper runs and the test flakes (#1089). Resolving via ``sys.modules`` + keeps the fixture's patch in lockstep with the endpoint regardless of order. + """ + import services.subscription_auto_switch # noqa: F401 — ensure it is imported + return sys.modules["services.subscription_auto_switch"] + + +@pytest.fixture +def owner_user(): + u = MagicMock() + u.username = "owner" + u.role = "user" + return u + + +@pytest.fixture +def admin_user(): + u = MagicMock() + u.username = "admin" + u.role = "admin" + return u + + +@pytest.fixture +def manual_env(monkeypatch): + """Stub the db + the local-import targets the manual-reassign endpoint + reaches, and spy both the hot-reload helper and the recreate path so each + test can assert exactly which one ran.""" + import routers.subscriptions as rs # lazy: see module docstring + + fake_db = MagicMock() + fake_db.can_user_share_agent.return_value = True + sub = MagicMock() + sub.id = "sub-b" + sub.name = "sub-B" + fake_db.get_subscription_by_name.return_value = sub + monkeypatch.setattr(rs, "db", fake_db) + + container = object() + docker_service = types.ModuleType("services.docker_service") + docker_service.get_agent_container = lambda name: container + docker_service.get_agent_status_from_container = ( + lambda c: types.SimpleNamespace(status="running") + ) + monkeypatch.setitem(sys.modules, "services.docker_service", docker_service) + + stop_calls: list = [] + docker_utils = types.ModuleType("services.docker_utils") + + async def _stop(c): + stop_calls.append(c) + + docker_utils.container_stop = _stop + monkeypatch.setitem(sys.modules, "services.docker_utils", docker_utils) + + start_calls: list = [] + agent_service = types.ModuleType("services.agent_service") + + async def _start(name): + start_calls.append(name) + + agent_service.start_agent_internal = _start + monkeypatch.setitem(sys.modules, "services.agent_service", agent_service) + + auto_switch = _live_auto_switch() + + hot_calls: list = [] + + async def _hot(name): + hot_calls.append(name) + return "hot_reloaded" + + monkeypatch.setattr(auto_switch, "_hot_reload_subscription_token", _hot) + + lock_acquired: list = [] + + async def _lock(name): + lock_acquired.append(name) + return asyncio.Lock() + + monkeypatch.setattr(auto_switch, "agent_switch_lock", _lock) + + return types.SimpleNamespace( + rs=rs, + db=fake_db, + container=container, + stop_calls=stop_calls, + start_calls=start_calls, + hot_calls=hot_calls, + lock_acquired=lock_acquired, + ) + + +class TestManualReassignHotReload: + """T4 — PUT /api/subscriptions/agents/{name}.""" + + @pytest.mark.asyncio + async def test_sub_to_sub_hot_reloads_no_container_stop(self, manual_env, owner_user): + """A sub→sub swap on a running agent hot-reloads the token under the + per-agent lock — the container is NOT stopped/recreated.""" + manual_env.db.get_agent_subscription_id.return_value = "sub-a" # already on a sub + + result = await manual_env.rs.assign_subscription_to_agent( + agent_name="agent-x", + subscription_name="sub-B", + current_user=owner_user, + ) + + assert manual_env.hot_calls == ["agent-x"] # hot-reload taken + assert manual_env.stop_calls == [] # NO container recreate + assert manual_env.start_calls == [] + assert manual_env.lock_acquired == ["agent-x"] # under the #799 lock + assert result["restart_result"] == "hot_reloaded" + # DB switched before applying the token + manual_env.db.assign_subscription_to_agent.assert_called_once_with("agent-x", "sub-b") + + @pytest.mark.asyncio + async def test_mode_change_none_to_sub_still_recreates(self, manual_env, owner_user): + """An auth-mode change (no prior subscription → subscription) keeps the + recreate path so ANTHROPIC_API_KEY is dropped and the token is baked in.""" + manual_env.db.get_agent_subscription_id.return_value = None # api-key/none → sub + + result = await manual_env.rs.assign_subscription_to_agent( + agent_name="agent-x", + subscription_name="sub-B", + current_user=owner_user, + ) + + assert manual_env.hot_calls == [] # hot-reload NOT taken + assert manual_env.stop_calls == [manual_env.container] # recreate path + assert manual_env.start_calls == ["agent-x"] + assert manual_env.lock_acquired == ["agent-x"] # still serialized + assert result["restart_result"] == "success" + + @pytest.mark.asyncio + async def test_old_sub_snapshot_read_under_lock(self, manual_env, owner_user, monkeypatch): + """#1089 TOCTOU: the agent's CURRENT subscription is snapshotted AFTER the + per-agent switch lock is entered, never before. Reading it outside the + lock lets a concurrent auto-switch change the assignment between the read + and the assign, so the recreate-vs-hot-reload branch could be chosen + against a stale `old_sub_id`.""" + auto_switch = _live_auto_switch() + + order: list[str] = [] + + class _RecordingLock: + async def __aenter__(self): + order.append("lock_enter") + return self + + async def __aexit__(self, *exc): + order.append("lock_exit") + return False + + async def _lock(name): + return _RecordingLock() + + monkeypatch.setattr(auto_switch, "agent_switch_lock", _lock) + + def _read_sub(name): + order.append("read_sub") + return "sub-a" # already on a sub → hot-reload branch + + manual_env.db.get_agent_subscription_id.side_effect = _read_sub + + await manual_env.rs.assign_subscription_to_agent( + agent_name="agent-x", + subscription_name="sub-B", + current_user=owner_user, + ) + + # the snapshot read happens strictly INSIDE the lock window + assert order == ["lock_enter", "read_sub", "lock_exit"] + assert manual_env.hot_calls == ["agent-x"] # branch chosen off the in-lock read + + @pytest.mark.asyncio + async def test_non_owner_rejected(self, manual_env, owner_user): + """Owner/admin gate is unchanged — a non-owner gets 403 before any switch.""" + from fastapi import HTTPException + + manual_env.db.can_user_share_agent.return_value = False + + with pytest.raises(HTTPException) as exc: + await manual_env.rs.assign_subscription_to_agent( + agent_name="agent-x", + subscription_name="sub-B", + current_user=owner_user, + ) + assert exc.value.status_code == 403 + assert manual_env.hot_calls == [] + assert manual_env.stop_calls == [] + + +@pytest.fixture +def register_env(monkeypatch): + """Stub the db + the key-rollover fan-out for the register/upsert endpoint.""" + import routers.subscriptions as rs # lazy: see module docstring + + fake_db = MagicMock() + fake_db.get_user_by_username.return_value = {"id": 1} + created = MagicMock() + created.id = "sub-x" + created.name = "sub-X" + fake_db.create_subscription.return_value = created + monkeypatch.setattr(rs, "db", fake_db) + + # register_subscription 503s without an encryption key configured. + monkeypatch.setenv("CREDENTIAL_ENCRYPTION_KEY", "0" * 64) + + auto_switch = _live_auto_switch() + + fanout_calls: list = [] + + async def _fanout(sub_id): + fanout_calls.append(sub_id) + return {} + + monkeypatch.setattr(auto_switch, "reload_subscription_for_all_agents", _fanout) + + return types.SimpleNamespace( + rs=rs, db=fake_db, created=created, fanout_calls=fanout_calls, auto_switch=auto_switch + ) + + +class TestRegisterKeyRollover: + """T5 / F1 — POST /api/subscriptions upsert fans a hot-reload out to every + running agent on that subscription, best-effort.""" + + @pytest.mark.asyncio + async def test_upsert_fans_out_hot_reload(self, register_env, admin_user): + from db_models import SubscriptionCredentialCreate + + request = SubscriptionCredentialCreate(name="sub-X", token="sk-ant-oat01-rolled") + + result = await register_env.rs.register_subscription(request, current_user=admin_user) + + assert result is register_env.created + assert register_env.fanout_calls == ["sub-x"] # fanned out to the upserted sub id + + @pytest.mark.asyncio + async def test_fan_out_failure_does_not_fail_upsert(self, register_env, admin_user, monkeypatch): + """Best-effort: a fan-out blow-up is logged and swallowed — the upsert + still succeeds and returns the registered subscription.""" + from db_models import SubscriptionCredentialCreate + + async def _boom(sub_id): + raise RuntimeError("redis down") + + monkeypatch.setattr(register_env.auto_switch, "reload_subscription_for_all_agents", _boom) + + request = SubscriptionCredentialCreate(name="sub-X", token="sk-ant-oat01-rolled") + + result = await register_env.rs.register_subscription(request, current_user=admin_user) + + assert result is register_env.created # upsert NOT failed by the fan-out error + + @pytest.mark.asyncio + async def test_non_admin_rejected(self, register_env, owner_user): + """register_subscription is admin-only — a non-admin gets 403 before any + create or key-rollover fan-out (mirrors the owner gate on reassign).""" + from fastapi import HTTPException + from db_models import SubscriptionCredentialCreate + + request = SubscriptionCredentialCreate(name="sub-X", token="sk-ant-oat01-rolled") + + with pytest.raises(HTTPException) as exc: + await register_env.rs.register_subscription(request, current_user=owner_user) + + assert exc.value.status_code == 403 + assert register_env.fanout_calls == [] # never reached the rollover fan-out + register_env.db.create_subscription.assert_not_called()