Skip to content
Open
7 changes: 7 additions & 0 deletions docker/base-image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ RUN mkdir -p /workspace /data /logs && \
RUN mkdir -p /tmp/secure && \
chmod 1777 /tmp/secure

# #1089: writable-layer dir for the subscription-token hot-reload override
# (/var/lib/trinity/oauth-token). Owned by the agent (UID 1000) so the
# agent-server process can write it; on the writable layer (NOT /home/developer)
# so it survives a plain stop+start but is wiped on recreate.
# Must stay ABOVE the `USER developer` switch below: the directory is created
# and chowned while the build still has the privileges to do so.
RUN mkdir -p /var/lib/trinity && \
chown developer:developer /var/lib/trinity

USER developer

CMD ["/app/startup.sh"]
Expand Down
17 changes: 17 additions & 0 deletions docker/base-image/agent_server/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,20 @@ class CredentialInjectResponse(BaseModel):
"""Response from credential injection"""
status: str # "success"
files_written: List[str]


class TokenReloadRequest(BaseModel):
    """Request to hot-reload the subscription OAuth token (#1089).

    Surgical alternative to a container recreate: mutates the agent-server
    process env so the NEXT claude subprocess uses the rotated token while
    in-flight turns keep their already-inherited old token and finish.
    """
    # Value applied to CLAUDE_CODE_OAUTH_TOKEN. The reload endpoint rejects an
    # empty value with HTTP 400.
    token: str
    # When true, the endpoint also drops ANTHROPIC_API_KEY from the
    # agent-server process env. Defaults to leaving it untouched.
    remove_api_key: bool = False


class TokenReloadResponse(BaseModel):
    """Response from a subscription token hot-reload"""
    # "success" when the reload endpoint completed.
    status: str
    # True when the new token was applied to the agent-server process env.
    reloaded: bool
56 changes: 56 additions & 0 deletions docker/base-image/agent_server/routers/credentials.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
CredentialReadResponse,
CredentialInjectRequest,
CredentialInjectResponse,
TokenReloadRequest,
TokenReloadResponse,
)
from ..state import agent_state
from ..services.trinity_mcp import inject_trinity_mcp_if_configured
Expand Down Expand Up @@ -132,6 +134,60 @@ async def update_credentials(request: CredentialUpdateRequest):
raise HTTPException(status_code=500, detail=f"Credential update failed: {str(e)}")


# Writable-layer override path (#1089). Deliberately NOT under /home/developer —
# that path is the persistent agent-{name}-workspace volume which
# `recreate_container_with_updated_config` preserves, so a token written there
# would survive a recreate and shadow the freshly-baked Config.Env (DB token).
# The writable layer instead survives a plain stop+start (same container) but is
# wiped on recreate (new container, fresh layer) — self-reconciling by Docker
# semantics, no marker logic needed. The directory is created + chowned to UID
# 1000 in the base-image Dockerfile (before the USER switch).
# startup.sh reads this same literal path before launching the agent server,
# so the two must be kept in sync if the location ever changes.
_TOKEN_OVERRIDE = Path("/var/lib/trinity/oauth-token")


def _persist_token_override(token: str) -> None:
    """Atomically write *token* to the writable-layer override with mode 0600.

    The token is written to a sibling temp file and then renamed over the
    override, so a reader (startup.sh on the next boot) sees either the previous
    token or the complete new one, never a truncated file.

    Raises:
        OSError: if the temp file cannot be created/written or the rename fails.
    """
    tmp_path = _TOKEN_OVERRIDE.with_name(_TOKEN_OVERRIDE.name + ".tmp")

    # Create with 0600 via os.open() rather than write_text()+chmod(): the
    # latter creates the file under the process umask (typically 0644) and
    # leaves it world-readable until the follow-up chmod.
    fd = os.open(tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            # The mode arg only applies on *creation*; a leftover temp file
            # keeps its own perms through O_CREAT|O_TRUNC, so force 0600.
            os.fchmod(f.fileno(), 0o600)
            f.write(token)
        # Same-directory rename: atomic on POSIX, and it replaces any
        # pre-existing override (whatever its perms) with the 0600 file.
        os.replace(tmp_path, _TOKEN_OVERRIDE)
    except OSError:
        # Best-effort cleanup so a failed attempt leaves no stray secret behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise


@router.post("/api/credentials/reload-token", response_model=TokenReloadResponse)
async def reload_subscription_token(request: TokenReloadRequest):
    """Hot-reload CLAUDE_CODE_OAUTH_TOKEN for the NEXT claude subprocess (#1089).

    Mutates the agent-server process env so the next `subprocess.Popen` for
    `claude` inherits the rotated token; in-flight subprocesses keep their
    already-inherited old token and finish. Also persists the token to the
    writable-layer override so it survives a plain stop+start (fleet restart
    bypasses `start_agent_internal`, which would otherwise revert to the old
    Config.Env token — F2).

    Deliberately does NOT rewrite .env / .mcp.json or re-inject Trinity MCP: the
    subscription token is not a .env credential, and the `/update` / `/inject`
    endpoints destructively rewrite whole files.

    Args:
        request: the new token, plus whether to also drop ANTHROPIC_API_KEY.

    Returns:
        TokenReloadResponse with status "success" and reloaded=True.

    Raises:
        HTTPException: 400 if the token is empty/whitespace-only or contains a
            NUL byte; 500 if the durable override cannot be written (in which
            case the process env is left untouched).
    """
    # Surrounding whitespace is never part of a token. Stripping it also keeps
    # the live env identical to what startup.sh restores after a stop+start,
    # since `$(cat ...)` drops trailing newlines.
    token = request.token.strip()
    if not token:
        raise HTTPException(status_code=400, detail="token is required")
    if "\x00" in token:
        # os.environ rejects embedded NUL bytes; fail as a client error before
        # anything has been written.
        raise HTTPException(status_code=400, detail="token contains an invalid character")

    # Persist BEFORE touching the process env so a write failure leaves the
    # agent fully on the old token instead of half-applied (new token live,
    # old token restored on the next stop+start).
    try:
        _persist_token_override(token)
    except OSError as e:
        logger.error("Failed to persist subscription token override: %s", e)
        raise HTTPException(status_code=500, detail=f"Token reload failed: {str(e)}") from e

    os.environ["CLAUDE_CODE_OAUTH_TOKEN"] = token
    if request.remove_api_key:
        os.environ.pop("ANTHROPIC_API_KEY", None)

    # Add the new token to the log-redaction set (drops the old exact-match
    # value; OAuth tokens stay caught by the sk-ant value regex regardless).
    refresh_credential_values()

    logger.info("Hot-reloaded CLAUDE_CODE_OAUTH_TOKEN (next subprocess; in-flight turns unaffected)")
    return TokenReloadResponse(status="success", reloaded=True)


@router.get("/api/credentials/status")
async def get_credentials_status():
"""
Expand Down
14 changes: 14 additions & 0 deletions docker/base-image/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,20 @@ if [ -d "/config/mcp-servers" ]; then
done
fi

# === Rotated subscription token: durable override (#1089) ===
# A hot-reload (POST /api/credentials/reload-token) persists the rotated
# CLAUDE_CODE_OAUTH_TOKEN to this writable-layer path so it survives a plain
# stop+start. The container's baked Config.Env still holds the OLD token and a
# fleet restart (ops.py) does a raw stop+start that bypasses start_agent_internal
# — so export the override (when present and non-empty) BEFORE launching the
# agent server, so the rotated token wins. The file is wiped on recreate (fresh
# writable layer), so a DB-driven recreate cleanly reverts to the freshly-baked
# Config.Env token — no marker logic needed.
if [ -s /var/lib/trinity/oauth-token ]; then
    # Read into a variable first and export only a non-empty result. `-s` only
    # proves the file has bytes: if `cat` fails (e.g. unreadable file) or the
    # file holds nothing but newlines, exporting "$(cat ...)" directly would
    # replace the valid baked Config.Env token with an empty string.
    rotated_token="$(cat /var/lib/trinity/oauth-token 2>/dev/null)" || rotated_token=""
    if [ -n "$rotated_token" ]; then
        export CLAUDE_CODE_OAUTH_TOKEN="$rotated_token"
        echo "Applied rotated subscription token from durable override"
    else
        echo "WARNING: subscription token override is unreadable or blank; keeping Config.Env token" >&2
    fi
    # Do not leave a second copy of the secret in the shell environment.
    unset rotated_token
fi

# Start Agent Web Server (self-contained UI)
if [ "${ENABLE_AGENT_UI}" = "true" ]; then
echo "Starting Agent Web UI on port ${AGENT_SERVER_PORT:-8000}..."
Expand Down
11 changes: 10 additions & 1 deletion docs/memory/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,12 +276,15 @@ Vector 0.43.1 (`timberio/vector:0.43.1-alpine`). Captures all container stdout/s
**Internal server** `agent-server.py` (FastAPI, port 8000):
- `/api/chat` - Claude Code execution (messages persisted to database)
- `/health` - Health check. Returns `{status}` plus `active_tasks` (concurrent executions across `/api/chat` + `/api/task`), `last_task_at`, `consecutive_failures` (reset on success — consumed by the dispatch breaker #526 and fleet health #307) and the #333 `diagnostics` gauges (#1020). `mailbox_depth` intentionally NOT emitted — no agent-side mailbox until the actor model (#945); the backend derives queue depth from `CapacityManager`. Counters live in `agent_server/state.py`; backend reads them in `monitoring_service.py` with graceful defaults for older images.
- `/api/credentials/update` - Hot-reload credentials
- `/api/credentials/update` - Hot-reload credentials (rewrites `.env`/`.mcp.json`)
- `/api/credentials/reload-token` - Surgical subscription-token hot-reload (#1089): mutates the agent-server process `os.environ["CLAUDE_CODE_OAUTH_TOKEN"]` so the NEXT claude subprocess uses the rotated token while in-flight subprocesses keep theirs; persists to the writable-layer override `/var/lib/trinity/oauth-token` (0600). Does NOT touch `.env`/`.mcp.json`. See [Subscription Token Rotation](#subscription-token-rotation-via-hot-reload-1089)
- `/api/chat/session` - Context window stats
- `/api/files`, `/api/files/download` (100MB limit), `/api/files/mkdir` (workspace-confined, #37)

The agent server also runs two loops: the 15-min git `auto_sync` heartbeat (see [Git Sync Health](#git-sync-health-389390)) and the 5s liveness heartbeat (see [Heartbeat Liveness](#heartbeat-liveness-reliability-004-307)).

**Durable subscription-token override (#1089):** `startup.sh` exports `CLAUDE_CODE_OAUTH_TOKEN` from `/var/lib/trinity/oauth-token` (when present, non-empty) **before** launching the agent server, so a token rotated via hot-reload survives a plain stop+start (a fleet restart via `routers/ops.py` does a raw `container_stop`+`container_start` that bypasses `start_agent_internal` and would otherwise revert to the baked `Config.Env` token). The path is deliberately on the writable layer, **not** under the persisted `/home/developer` volume: it survives `stop`→`start` (same container) but is wiped on recreate (fresh layer), so a DB-driven recreate cleanly re-bakes `Config.Env` from the DB and the stale override is gone — self-reconciling, no marker logic. Dir created+chowned to UID 1000 in the base-image Dockerfile.

**Template-supplied pre-check** (SCHED-COND-001, #454): if the template ships an executable `~/.trinity/pre-check`, the backend's internal endpoint `POST /api/internal/agents/{name}/pre-check` runs it via `docker exec` before a cron-triggered chat. Language-agnostic — interpreter selected by shebang. The hook's stdout becomes the chat message; empty stdout + exit 0 records a skipped execution (Claude never invoked). Uses the same `execute_command_in_container` primitive as `git_service.py`, `ssh_service.py`, and the agent terminal — no agent-server HTTP endpoint.

**Persistent chat:** all chat messages auto-saved to SQLite (`chat_sessions`, `chat_messages`) with full observability (costs, context, tool calls, execution time); sessions survive container restarts/deletions; users see only their own messages (admins see all).
Expand Down Expand Up @@ -367,6 +370,12 @@ agent:heartbeat:misses:{name} → STRING(int), ~60s TTL. Consecutive-miss counte

Trigger-boundary dedup — policy in Architectural Invariant #18, table DDL under `idempotency_keys`. `services/idempotency_service.py` (key derivation + `begin`/`complete`/`fail`) over `db/idempotency.py`. The `(scope, key)` PRIMARY KEY is the atomic claim: `claim()` INSERTs an `in_flight` row; a concurrent loser catches `IntegrityError` and reads the surviving row — cross-process safe across uvicorn workers and the standalone scheduler (shared SQLite file). Lifecycle: `claim` → (`attach_execution`) → `complete` (stores `response_snapshot` for replay) or `release` (deletes the in_flight row so a failed attempt can retry; never deletes a `completed` row). Rows older than 24h are treated as expired and re-claimed; the cleanup service purges them (`idempotency_purge_expired`). Duplicates within 24h short-circuit with the original result + `X-Idempotent-Replay: true`; an in-flight duplicate returns 409. Fail-open — a key never blocks a real execution.

### Subscription Token Rotation via Hot-Reload (#1089)

Rotating an agent's subscription token used to recreate the container, making "rotate a credential" and "kill every in-flight turn" the same operation (#1037). Token rotation now hot-reloads the running container; recreate is reserved for image/template/auth-**mode** changes (TARGET_ARCHITECTURE §Agent Runtime). The agent server authenticates Claude purely from `CLAUDE_CODE_OAUTH_TOKEN` (no `.credentials.json`) and is a single uvicorn worker, so mutating its process env makes the **next** subprocess use the new token while in-flight subprocesses finish on the old one.

Backend orchestration in `services/subscription_auto_switch.py`: `_hot_reload_subscription_token(agent_name)` POSTs the agent's current DB token to the agent-server `POST /api/credentials/reload-token`, falling back to `_restart_agent` on a 404 (old base image), transport failure, or missing token (`no_container`/`not_running` short-circuit otherwise). Three producer paths converted, all under the #799 `agent_switch_lock`: **auto-switch** (`_perform_auto_switch`, SUB-003), **manual sub→sub reassignment** (`PUT /api/subscriptions/agents/{name}` — auth-mode changes none/api-key→sub still recreate), and **key rollover** (`reload_subscription_for_all_agents(sub_id)` fans a best-effort reload across every running agent on a re-registered subscription). Durable override (`/var/lib/trinity/oauth-token`) + `startup.sh` read make a rotation survive a plain restart — see the agent-server [Durable subscription-token override](#agent-containers) note. Agent-server endpoint mirroring follows Invariant #5.

### Real-time Delivery (RELIABILITY-003, #306)

**Transport** (`event_bus.py`): Redis Streams. `ConnectionManager`/`FilteredWebSocketManager` are thin shims that `XADD` to the MAXLEN-trimmed `trinity:events` stream; one `StreamDispatcher` per backend process runs `XREAD BLOCK` and fans out to registered clients, evicting a client after 3 consecutive delivery failures. New broadcast sites keep calling `manager.broadcast(...)` / `filtered_manager.broadcast_filtered(...)` — never publish to the stream directly (Invariant #10).
Expand Down
1 change: 1 addition & 0 deletions docs/memory/feature-flows.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
| Date | ID | Feature | Flow |
|------|-----|---------|------|
| 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) |
| 2026-06-13 | #1089 | feat: subscription token rotation via **hot-reload, not container recreate** — a dedicated agent-server `POST /api/credentials/reload-token` mutates the running container's `CLAUDE_CODE_OAUTH_TOKEN` env so the next claude subprocess uses the rotated token while in-flight turns finish (closes the #1037 collateral-kill class). Three producer paths converted under the #799 `agent_switch_lock`: auto-switch (SUB-003), manual sub→sub reassignment (auth-mode changes still recreate), and key-rollover fan-out on `POST /api/subscriptions` upsert. Durable writable-layer override (`/var/lib/trinity/oauth-token` + `startup.sh` read) survives a plain restart; recreate self-reconciles to the DB token. Falls back to the old `_restart_agent` recreate on a 404 (old base image). | [subscription-auto-switch.md](feature-flows/subscription-auto-switch.md) |
| 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) |
| 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) |
| 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) |
Expand Down
Loading
Loading