Abilityai · AndriiPasternak31 · Jun 11, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/docker/base-image/Dockerfile b/docker/base-image/Dockerfile
@@ -130,6 +130,13 @@ RUN mkdir -p /workspace /data /logs && \
 RUN mkdir -p /tmp/secure && \
     chmod 1777 /tmp/secure
 
+# #1089: dir for the subscription-token hot-reload override
+# (/var/lib/trinity/oauth-token). Owned by the agent (UID 1000) so the
+# agent-server can write it. NOT under the /home/developer volume, so the file
+# lands in the container's writable layer: survives stop+start, wiped on recreate.
+RUN mkdir -p /var/lib/trinity && \
+    chown developer:developer /var/lib/trinity
+
 USER developer
 
 CMD ["/app/startup.sh"]

diff --git a/docker/base-image/agent_server/models.py b/docker/base-image/agent_server/models.py
@@ -261,3 +261,20 @@ class CredentialInjectResponse(BaseModel):
     """Response from credential injection"""
     status: str  # "success"
     files_written: List[str]
+
+
+class TokenReloadRequest(BaseModel):
+    """Request to hot-reload the subscription OAuth token (#1089).
+
+    Surgical alternative to a container recreate: mutates the agent-server
+    process env so the NEXT claude subprocess uses the rotated token while
+    in-flight turns keep their already-inherited old token and finish.
+    """
+    token: str  # CLAUDE_CODE_OAUTH_TOKEN value to apply; an empty value is rejected with 400
+    remove_api_key: bool = False  # when True, also drop ANTHROPIC_API_KEY from the process env
+
+
+class TokenReloadResponse(BaseModel):
+    """Response from a subscription token hot-reload"""
+    status: str  # "success"
+    reloaded: bool  # the reload-token endpoint returns True once the token has been applied
diff --git a/docker/base-image/agent_server/routers/credentials.py b/docker/base-image/agent_server/routers/credentials.py
@@ -15,6 +15,8 @@
     CredentialReadResponse,
     CredentialInjectRequest,
     CredentialInjectResponse,
+    TokenReloadRequest,
+    TokenReloadResponse,
 )
 from ..state import agent_state
 from ..services.trinity_mcp import inject_trinity_mcp_if_configured
@@ -132,6 +134,60 @@ async def update_credentials(request: CredentialUpdateRequest):
         raise HTTPException(status_code=500, detail=f"Credential update failed: {str(e)}")
 
 
+# Writable-layer override path (#1089). Deliberately NOT under /home/developer —
+# that path is the persistent agent-{name}-workspace volume which
+# `recreate_container_with_updated_config` preserves, so a token written there
+# would survive a recreate and shadow the freshly-baked Config.Env (DB token).
+# The writable layer instead survives a plain stop+start (same container) but is
+# wiped on recreate (new container, fresh layer) — self-reconciling by Docker
+# semantics, no marker logic needed. The directory is created + chowned to UID
+# 1000 in the base-image Dockerfile (before the USER switch).
+_TOKEN_OVERRIDE = Path("/var/lib/trinity/oauth-token")
+
+
+@router.post("/api/credentials/reload-token", response_model=TokenReloadResponse)
+async def reload_subscription_token(request: TokenReloadRequest):
+    """Hot-reload CLAUDE_CODE_OAUTH_TOKEN for the NEXT claude subprocess (#1089).
+
+    Mutates the agent-server process env so the next `subprocess.Popen` for
+    `claude` inherits the rotated token; in-flight subprocesses keep their
+    already-inherited old token and finish. Also persists the token to the
+    writable-layer override so it survives a plain stop+start (fleet restart
+    bypasses `start_agent_internal`, which would otherwise revert to the old
+    Config.Env token — F2).
+
+    Deliberately does NOT rewrite .env / .mcp.json or re-inject Trinity MCP: the
+    subscription token is not a .env credential, and the `/update` / `/inject`
+    endpoints destructively rewrite whole files.
+    """
+    if not request.token:
+        raise HTTPException(status_code=400, detail="token is required")
+
+    os.environ["CLAUDE_CODE_OAUTH_TOKEN"] = request.token
+    if request.remove_api_key:
+        os.environ.pop("ANTHROPIC_API_KEY", None)
+
+    # Persist to the writable-layer override. Parent dir is created + chowned in
+    # the Dockerfile, so the agent (UID 1000) can write here. Create the file
+    # atomically with 0600 via os.open() rather than write_text()+chmod(): the
+    # latter creates the file under the process umask (typically 0644) and leaves
+    # it world-readable until the follow-up chmod, a brief but avoidable window.
+    # The mode arg only applies on *creation*, so also fchmod the fd — a
+    # pre-existing override (older write path / tampering) keeps its own perms
+    # through O_CREAT|O_TRUNC, and we must still force it back to 0600.
+    fd = os.open(_TOKEN_OVERRIDE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as f:
+        os.fchmod(f.fileno(), 0o600)
+        f.write(request.token)
+
+    # Add the new token to the log-redaction set (drops the old exact-match
+    # value; OAuth tokens stay caught by the sk-ant value regex regardless).
+    refresh_credential_values()
+
+    logger.info("Hot-reloaded CLAUDE_CODE_OAUTH_TOKEN (next subprocess; in-flight turns unaffected)")
+    return TokenReloadResponse(status="success", reloaded=True)
+
+
 @router.get("/api/credentials/status")
 async def get_credentials_status():
     """

diff --git a/docker/base-image/startup.sh b/docker/base-image/startup.sh
@@ -343,6 +343,20 @@ if [ -d "/config/mcp-servers" ]; then
     done
 fi
 
+# === Rotated subscription token: durable override (#1089) ===
+# A hot-reload (POST /api/credentials/reload-token) persists the rotated
+# CLAUDE_CODE_OAUTH_TOKEN to this writable-layer path so it survives a plain
+# stop+start. The container's baked Config.Env still holds the OLD token and a
+# fleet restart (ops.py) does a raw stop+start that bypasses start_agent_internal
+# — so export the override (when present and non-empty) BEFORE launching the
+# agent server, so the rotated token wins. The file is wiped on recreate (fresh
+# writable layer), so a DB-driven recreate cleanly reverts to the freshly-baked
+# Config.Env token — no marker logic needed.
+if [ -s /var/lib/trinity/oauth-token ]; then
+    export CLAUDE_CODE_OAUTH_TOKEN="$(cat /var/lib/trinity/oauth-token)"  # $(...) strips trailing newlines
+    echo "Applied rotated subscription token from durable override"
+fi
+
 # Start Agent Web Server (self-contained UI)
 if [ "${ENABLE_AGENT_UI}" = "true" ]; then
     echo "Starting Agent Web UI on port ${AGENT_SERVER_PORT:-8000}..."

diff --git a/docs/memory/architecture.md b/docs/memory/architecture.md
@@ -276,12 +276,15 @@ Vector 0.43.1 (`timberio/vector:0.43.1-alpine`). Captures all container stdout/s
 **Internal server** `agent-server.py` (FastAPI, port 8000):
 - `/api/chat` - Claude Code execution (messages persisted to database)
 - `/health` - Health check. Returns `{status}` plus `active_tasks` (concurrent executions across `/api/chat` + `/api/task`), `last_task_at`, `consecutive_failures` (reset on success — consumed by the dispatch breaker #526 and fleet health #307) and the #333 `diagnostics` gauges (#1020). `mailbox_depth` intentionally NOT emitted — no agent-side mailbox until the actor model (#945); the backend derives queue depth from `CapacityManager`. Counters live in `agent_server/state.py`; backend reads them in `monitoring_service.py` with graceful defaults for older images.
-- `/api/credentials/update` - Hot-reload credentials
+- `/api/credentials/update` - Hot-reload credentials (rewrites `.env`/`.mcp.json`)
+- `/api/credentials/reload-token` - Surgical subscription-token hot-reload (#1089): mutates the agent-server process `os.environ["CLAUDE_CODE_OAUTH_TOKEN"]` so the NEXT claude subprocess uses the rotated token while in-flight subprocesses keep theirs; persists to the writable-layer override `/var/lib/trinity/oauth-token` (0600). Does NOT touch `.env`/`.mcp.json`. See [Subscription Token Rotation](#subscription-token-rotation-via-hot-reload-1089)
 - `/api/chat/session` - Context window stats
 - `/api/files`, `/api/files/download` (100MB limit), `/api/files/mkdir` (workspace-confined, #37)
 
 The agent server also runs two loops: the 15-min git `auto_sync` heartbeat (see [Git Sync Health](#git-sync-health-389390)) and the 5s liveness heartbeat (see [Heartbeat Liveness](#heartbeat-liveness-reliability-004-307)).
 
+**Durable subscription-token override (#1089):** `startup.sh` exports `CLAUDE_CODE_OAUTH_TOKEN` from `/var/lib/trinity/oauth-token` (when present, non-empty) **before** launching the agent server, so a token rotated via hot-reload survives a plain stop+start (a fleet restart via `routers/ops.py` does a raw `container_stop`+`container_start` that bypasses `start_agent_internal` and would otherwise revert to the baked `Config.Env` token). The path is deliberately on the writable layer, **not** under the persisted `/home/developer` volume: it survives `stop`→`start` (same container) but is wiped on recreate (fresh layer), so a DB-driven recreate cleanly re-bakes `Config.Env` from the DB and the stale override is gone — self-reconciling, no marker logic. Dir created+chowned to UID 1000 in the base-image Dockerfile.
+
 **Template-supplied pre-check** (SCHED-COND-001, #454): if the template ships an executable `~/.trinity/pre-check`, the backend's internal endpoint `POST /api/internal/agents/{name}/pre-check` runs it via `docker exec` before a cron-triggered chat. Language-agnostic — interpreter selected by shebang. The hook's stdout becomes the chat message; empty stdout + exit 0 records a skipped execution (Claude never invoked). Uses the same `execute_command_in_container` primitive as `git_service.py`, `ssh_service.py`, and the agent terminal — no agent-server HTTP endpoint.
 
 **Persistent chat:** all chat messages auto-saved to SQLite (`chat_sessions`, `chat_messages`) with full observability (costs, context, tool calls, execution time); sessions survive container restarts/deletions; users see only their own messages (admins see all).
@@ -367,6 +370,12 @@ agent:heartbeat:misses:{name} → STRING(int), ~60s TTL. Consecutive-miss counte
 
 Trigger-boundary dedup — policy in Architectural Invariant #18, table DDL under `idempotency_keys`. `services/idempotency_service.py` (key derivation + `begin`/`complete`/`fail`) over `db/idempotency.py`. The `(scope, key)` PRIMARY KEY is the atomic claim: `claim()` INSERTs an `in_flight` row; a concurrent loser catches `IntegrityError` and reads the surviving row — cross-process safe across uvicorn workers and the standalone scheduler (shared SQLite file). Lifecycle: `claim` → (`attach_execution`) → `complete` (stores `response_snapshot` for replay) or `release` (deletes the in_flight row so a failed attempt can retry; never deletes a `completed` row). Rows older than 24h are treated as expired and re-claimed; the cleanup service purges them (`idempotency_purge_expired`). Duplicates within 24h short-circuit with the original result + `X-Idempotent-Replay: true`; an in-flight duplicate returns 409. Fail-open — a key never blocks a real execution.
 
+### Subscription Token Rotation via Hot-Reload (#1089)
+
+Rotating an agent's subscription token used to recreate the container, making "rotate a credential" and "kill every in-flight turn" the same operation (#1037). Token rotation now hot-reloads the running container; recreate is reserved for image/template/auth-**mode** changes (TARGET_ARCHITECTURE §Agent Runtime). The agent server authenticates Claude purely from `CLAUDE_CODE_OAUTH_TOKEN` (no `.credentials.json`) and is a single uvicorn worker, so mutating its process env makes the **next** subprocess use the new token while in-flight subprocesses finish on the old one.
+
+Backend orchestration in `services/subscription_auto_switch.py`: `_hot_reload_subscription_token(agent_name)` POSTs the agent's current DB token to the agent-server `POST /api/credentials/reload-token`. It falls back to `_restart_agent` on a 404 (old base image), a transport failure, or a missing token; the `no_container`/`not_running` cases short-circuit instead. Three producer paths were converted, all under the #799 `agent_switch_lock`: **auto-switch** (`_perform_auto_switch`, SUB-003), **manual sub→sub reassignment** (`PUT /api/subscriptions/agents/{name}` — auth-mode changes none/api-key→sub still recreate), and **key rollover** (`reload_subscription_for_all_agents(sub_id)` fans a best-effort reload across every running agent on a re-registered subscription). The durable override (`/var/lib/trinity/oauth-token`) plus the `startup.sh` read makes a rotation survive a plain restart — see the agent-server [Durable subscription-token override](#agent-containers) note. Agent-server endpoint mirroring follows Invariant #5.
+
 ### Real-time Delivery (RELIABILITY-003, #306)
 
 **Transport** (`event_bus.py`): Redis Streams. `ConnectionManager`/`FilteredWebSocketManager` are thin shims that `XADD` to the MAXLEN-trimmed `trinity:events` stream; one `StreamDispatcher` per backend process runs `XREAD BLOCK` and fans out to registered clients, evicting a client after 3 consecutive delivery failures. New broadcast sites keep calling `manager.broadcast(...)` / `filtered_manager.broadcast_filtered(...)` — never publish to the stream directly (Invariant #10).

diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md
@@ -12,6 +12,7 @@
 | Date | ID | Feature | Flow |
 |------|-----|---------|------|
 | 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) |
+| 2026-06-13 | #1089 | feat: subscription token rotation via **hot-reload, not container recreate** — a dedicated agent-server `POST /api/credentials/reload-token` mutates the running container's `CLAUDE_CODE_OAUTH_TOKEN` env so the next claude subprocess uses the rotated token while in-flight turns finish (closes the #1037 collateral-kill class). Three producer paths converted under the #799 `agent_switch_lock`: auto-switch (SUB-003), manual sub→sub reassignment (auth-mode changes still recreate), and key-rollover fan-out on `POST /api/subscriptions` upsert. Durable writable-layer override (`/var/lib/trinity/oauth-token` + `startup.sh` read) survives a plain restart; recreate self-reconciles to the DB token. Falls back to the old `_restart_agent` recreate on a 404 (old base image). | [subscription-auto-switch.md](feature-flows/subscription-auto-switch.md) |
 | 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) |
 | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) |
 | 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) |