diff --git a/config/agent-templates/test-codex/CLAUDE.md b/config/agent-templates/test-codex/CLAUDE.md new file mode 100644 index 00000000..9959decc --- /dev/null +++ b/config/agent-templates/test-codex/CLAUDE.md @@ -0,0 +1,45 @@ +# Test Codex Agent + +You are a test agent running on **OpenAI's Codex CLI** (`codex exec`), Trinity's +third agent runtime alongside Claude Code and Gemini. + +> Trinity mirrors this file to `AGENTS.md` at startup — Codex reads `AGENTS.md`, +> not `CLAUDE.md`. + +## Your Purpose + +Validate that Trinity's Codex runtime works correctly: +- Codex CLI integration (`codex exec --json`) +- The `-o` durable result record (authoritative response) +- MCP tool access (Trinity MCP wired via `config.toml`) +- Cost tracking (estimated from `turn.completed.usage` tokens) +- Chat continuity (`codex exec resume `) +- Sandbox safety (`danger-full-access` — the Trinity container is the boundary; `read-only` when the agent is read-only) + +## Key Differences from Claude Code + +1. **Instructions file:** You read `AGENTS.md` (Trinity mirrors `CLAUDE.md` → `AGENTS.md`). +2. **Cost:** No native cost field — Trinity estimates it from token counts. +3. **Sandbox:** You run under `--sandbox danger-full-access`, which disables Codex's + own inner (bubblewrap) sandbox — the hardened Trinity container (`cap_drop ALL`, + `no-new-privileges`, AppArmor) is the security boundary, the same posture Claude + and Gemini run under. A read-only agent runs `--sandbox read-only`. +4. **Provider:** OpenAI (not Anthropic). +5. **Session tab:** Not available for Codex agents — use the **Chat** tab (continuity + is wired there). The Session tab's cached-UUID `--resume` model is Claude-specific. + +## Authentication + +Codex authenticates with `OPENAI_API_KEY`, injected via the agent's `.env` +(Quick Inject → `OPENAI_API_KEY`). Codex agents are not assigned a Claude +subscription. + +## Testing Commands + +When asked to test, verify: +- `/test` — basic functionality +- Tool calling works (shell commands, web search) +- MCP servers are accessible +- Cost / token tracking reports correctly + +Report any differences in behavior compared to Claude Code agents. diff --git a/config/agent-templates/test-codex/template.yaml b/config/agent-templates/test-codex/template.yaml new file mode 100644 index 00000000..b7e11098 --- /dev/null +++ b/config/agent-templates/test-codex/template.yaml @@ -0,0 +1,34 @@ +name: test-codex +display_name: Test Codex Agent +description: Test agent using OpenAI's Codex CLI runtime for validation (#1187) +version: "1.0.0" +author: Trinity Platform +priority: 10 # Lower = higher in list (after system templates) + +type: business-assistant + +# Use the OpenAI Codex runtime instead of Claude Code +runtime: + type: codex + model: gpt-5.1-codex + +resources: + cpu: "2" + memory: "2g" + +capabilities: + - chat + - code-generation + - web-search + +mcp_servers: [] + +# Codex authenticates with an OpenAI API key read from the agent's .env +# (CRED-002). Declaring it here makes the Quick Inject UI prompt for it. +credentials: + env_file: + - OPENAI_API_KEY + +slash_commands: + - name: /test + description: Test command to verify the Codex runtime is working diff --git a/docker/base-image/Dockerfile b/docker/base-image/Dockerfile index c0593b18..aae9533c 100644 --- a/docker/base-image/Dockerfile +++ b/docker/base-image/Dockerfile @@ -50,6 +50,13 @@ RUN npm install -g @anthropic-ai/claude-code@latest # Install Gemini CLI for multi-runtime support RUN npm install -g @google/gemini-cli +# Install OpenAI Codex CLI for multi-runtime support (#1187). +# Pinned (not @latest) so base-image rebuilds are reproducible and a +# breaking Codex release can't silently change the agent runtime. The npm +# package resolves the platform-specific Rust binary via optional deps; +# the linux-x64 build lands here. Bump deliberately after testing. +RUN npm install -g @openai/codex@0.139.0 + RUN useradd -m -s /bin/bash -u 1000 developer && \ echo "developer:developer" | chpasswd && \ usermod -aG sudo developer && \ diff --git a/docker/base-image/agent_server/models.py b/docker/base-image/agent_server/models.py index 4fa19d96..c148154f 100644 --- a/docker/base-image/agent_server/models.py +++ b/docker/base-image/agent_server/models.py @@ -109,6 +109,12 @@ class ExecutionMetadata(BaseModel): compact_events: List[CompactEvent] = [] # Auto-compact events observed mid-turn recovered_from_jsonl: bool = False # Stdout race + JSONL fallback fired (response from disk, not stream) model_name: Optional[str] = None # Actual model id from assistant.message.model (e.g., "claude-sonnet-4-5") — #678 + # #1187: typed terminal-result seed (the #945 taxonomy). Populated by + # newer runtimes (Codex) and currently UNUSED by the backend in the MVP — + # the backend still infers AUTH from the HTTP status. A fast-follow makes + # the backend read error_code directly and retire status-inference. + status: Optional[str] = None # "success" | "error" + error_code: Optional[str] = None # "AUTH" | "RATE_LIMIT" | "TIMEOUT" | "AGENT_ERROR" | "RUNTIME_UNAVAILABLE" # ============================================================================ diff --git a/docker/base-image/agent_server/services/claude_code.py b/docker/base-image/agent_server/services/claude_code.py index 4725b5f4..c85acddb 100644 --- a/docker/base-image/agent_server/services/claude_code.py +++ b/docker/base-image/agent_server/services/claude_code.py @@ -40,7 +40,7 @@ ) from .headless_executor import _attempt_empty_result_recovery, execute_headless_task from .process_registry import get_process_registry -from .runtime_adapter import AgentRuntime +from .runtime_adapter import AgentRuntime, RuntimeCapabilities from .stream_parser import process_stream_line from .subprocess_lifecycle import ( _capture_pgid, @@ -73,6 +73,18 @@ class ClaudeCodeRuntime(AgentRuntime): """Claude Code implementation of AgentRuntime interface.""" + @classmethod + def capabilities(cls) -> RuntimeCapabilities: + # Claude is the reference runtime: full continuity (--continue), the + # Session tab's cached-UUID --resume machinery, MCP, and native cost + # reporting (Claude Code emits total_cost_usd directly). (#1187) + return RuntimeCapabilities( + chat_continuity=True, + session_tab_resume=True, + mcp_support=True, + cost_reporting="native", + ) + def is_available(self) -> bool: """Check if Claude Code CLI is installed.""" try: diff --git a/docker/base-image/agent_server/services/codex_runtime.py b/docker/base-image/agent_server/services/codex_runtime.py new file mode 100644 index 00000000..69463a84 --- /dev/null +++ b/docker/base-image/agent_server/services/codex_runtime.py @@ -0,0 +1,1059 @@ +"""OpenAI Codex CLI execution service (#1187). + +Implements the :class:`AgentRuntime` interface for OpenAI's Codex CLI, the third +Trinity agent runtime alongside Claude Code and Gemini. + +Built **independently** on the existing per-runtime primitives (process +registry, concurrency-safe orphan drain, activity tracking, credential +sanitizer) rather than on a shared subprocess helper — see #1187 decision 4. +That keeps Codex from inheriting Gemini's blanket ``kill_cgroup_orphans()`` +(which SIGKILLs sibling executions in the same cgroup); Codex uses the +concurrency-safe ``_drain_bounded`` path that preserves other in-flight work. + +Safety parity with the Claude path (#1187 decision 8, Phase C): + * **System prompt / identity** — the backend's effective ``system_prompt`` + is prepended to every turn (Codex ``exec`` has no ``--append-system-prompt``); + persistent identity comes from ``AGENTS.md`` (startup copies ``CLAUDE.md``). + * **Read-only mode** — when ``~/.trinity/read-only-config.json`` is enabled, + Codex runs with ``--sandbox read-only`` (the Claude hook can't apply here). + * **Guardrails** — read-only is honored via the sandbox; ``disallowed_tools`` + that have no Codex equivalent are SURFACED in the logs, never silently + dropped. + * **Credential sanitization** — every stdout line, the final response, and + stderr pass through ``utils.credential_sanitizer`` exactly as the Claude / + headless paths do. + +Codex specifics: + * Non-interactive: ``codex exec [PROMPT]``; ``--json`` emits a JSONL event + stream; ``-o/--output-last-message FILE`` is the durable result record + (#548/#333) — read-then-delete in ``finally``. + * Continuity: ``codex exec resume `` replays a prior thread. + * No native cost — derived from ``turn.completed.usage`` token counts. +""" +from __future__ import annotations + +import asyncio +import json +import logging +import os +import re +import subprocess +import uuid +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from fastapi import HTTPException + +from ..models import ExecutionLogEntry, ExecutionMetadata +from ..state import agent_state +from ..utils.credential_sanitizer import ( + sanitize_dict, + sanitize_subprocess_line, + sanitize_text, +) +from ..utils.subprocess_pgroup import EXECUTION_TAG_NAME +from ._runtime_config import _DEFAULT_EXECUTION_TIMEOUT_SEC, _load_guardrails +from .activity_tracking import complete_tool_execution, start_tool_execution +from .process_registry import get_process_registry +from .runtime_adapter import AgentRuntime, RuntimeCapabilities +from .subprocess_lifecycle import ( + _capture_pgid, + _drain_bounded, + _safe_close_pipes, + _terminate_process_group, +) + +logger = logging.getLogger(__name__) + +# One long-lived reader-thread worker (mirrors claude_code.py / gemini_runtime.py). +# A fresh ThreadPoolExecutor per call relies on CPython's non-deterministic +# weakref cleanup of the worker thread under load (#333 hardening). +_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="codex-subproc") + +# GPT-5 context window (input). Cosmetic — drives the context gauge only. +CODEX_CONTEXT_WINDOW = 272000 + +# Codex / GPT-5 pricing per 1K tokens (USD). Codex reports no cost; we derive +# it from token counts. ``cached`` is the discounted rate for cached input +# tokens. Bump deliberately when OpenAI pricing changes (#1137-style). +CODEX_PRICING: Dict[str, Dict[str, float]] = { + "gpt-5.1-codex": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, + "gpt-5.1-codex-max": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, + "gpt-5-codex": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, + "gpt-5.1": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, + "gpt-5": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, + "gpt-5-mini": {"input": 0.00025, "cached": 0.000025, "output": 0.002}, + "gpt-5-nano": {"input": 0.00005, "cached": 0.000005, "output": 0.0004}, + # Unknown / future model → GPT-5 standard pricing. + "default": {"input": 0.00125, "cached": 0.000125, "output": 0.01}, +} + + +def _resolve_pricing(model: Optional[str]) -> Dict[str, float]: + """Pricing for ``model`` — exact key first, then longest matching prefix, + then the ``default`` fallback (never KeyErrors).""" + if not model: + return CODEX_PRICING["default"] + key = model.lower() + if key in CODEX_PRICING: + return CODEX_PRICING[key] + # Longest-prefix match so "gpt-5.1-codex-2025-xx" resolves to the codex rate. + candidates = [k for k in CODEX_PRICING if k != "default" and key.startswith(k)] + if candidates: + return CODEX_PRICING[max(candidates, key=len)] + return CODEX_PRICING["default"] + + +def calculate_codex_cost( + input_tokens: int, + cached_input_tokens: int, + output_tokens: int, + model: Optional[str] = None, +) -> float: + """Estimated USD cost for a Codex turn. + + ``reasoning_output_tokens`` is a SUBSET of ``output_tokens`` — bill + ``output_tokens`` once, never ``output_tokens + reasoning_output_tokens``. + Cached input tokens bill at the cheaper cached rate; only the uncached + remainder bills at the full input rate. + """ + pricing = _resolve_pricing(model) + uncached_input = max(0, input_tokens - cached_input_tokens) + cached = max(0, cached_input_tokens) + input_cost = (uncached_input / 1000) * pricing["input"] + ( + cached / 1000 + ) * pricing["cached"] + output_cost = (output_tokens / 1000) * pricing["output"] + return round(input_cost + output_cost, 6) + + +# --------------------------------------------------------------------------- +# Credentials, sandbox, CODEX_HOME (parity wiring — #1187 Phase C/T4) +# --------------------------------------------------------------------------- + +_API_KEY_VARS = ("OPENAI_API_KEY", "CODEX_API_KEY") +_AGENT_HOME = "/home/developer" +_READ_ONLY_CONFIG = Path(_AGENT_HOME) / ".trinity" / "read-only-config.json" + + +def _parse_env_value(raw_value: str) -> str: + """Extract a value from a ``.env`` ``KEY=VALUE`` right-hand side. + + Handles the shapes a human SSH-editing ``.env`` would produce that Trinity's + own plain ``KEY=VALUE`` writer never emits: a quoted value (the quotes are + stripped and an interior ``#`` is kept), and an unquoted value with a + trailing ``# inline comment`` (dropped at the first whitespace-``#``). + """ + value = raw_value.strip() + if value[:1] in ('"', "'"): + quote = value[0] + end = value.find(quote, 1) + return value[1:end] if end != -1 else value[1:] + comment = value.find(" #") + if comment != -1: + value = value[:comment].rstrip() + return value + + +def _load_openai_api_key() -> Optional[str]: + """Resolve the OpenAI/Codex API key. + + The per-agent ``.env`` (CRED-002) is copied to ``/home/developer/.env`` by + startup.sh but is NOT exported into the agent-server process — so unlike the + Claude/Gemini key (a container env var), the Codex key must be read from the + process env (if present) OR parsed out of ``.env`` (the cold-start path the + outside-voice review flagged). Accepts either OPENAI_API_KEY or CODEX_API_KEY. + """ + for var in _API_KEY_VARS: + value = os.environ.get(var) + if value: + return value + env_path = Path(_AGENT_HOME) / ".env" + try: + for raw in env_path.read_text().splitlines(): + line = raw.strip() + if not line or line.startswith("#") or "=" not in line: + continue + # Tolerate `export KEY=VALUE` (a hand-edited .env), not just KEY=VALUE. + if line.startswith("export "): + line = line[len("export "):].lstrip() + key, _, value = line.partition("=") + if key.strip() in _API_KEY_VARS: + cleaned = _parse_env_value(value) + if cleaned: + return cleaned + except (IOError, OSError): + pass + return None + + +def _codex_home() -> str: + """Non-workspace home for Codex state + the ``-o`` result file. + + Codex defaults ``CODEX_HOME`` to ``~/.codex`` — inside the git-tracked agent + repo, which would dirty auto-sync. Relocate it under ``$TMPDIR`` (the + disk-backed ``/home/developer/.tmp`` scratch dir, #1098) which startup.sh + gitignores for Codex agents. + """ + explicit = os.environ.get("CODEX_HOME") + if explicit: + return explicit + tmpdir = os.environ.get("TMPDIR") or os.path.join(_AGENT_HOME, ".tmp") + return os.path.join(tmpdir, "codex") + + +def _ensure_codex_home() -> str: + home = _codex_home() + try: + os.makedirs(home, exist_ok=True) + except OSError as exc: # pragma: no cover - defensive + logger.warning("[Codex] could not create CODEX_HOME %s: %s", home, exc) + return home + + +def _is_read_only() -> bool: + """True when the backend has put this agent in read-only mode. + + The signal is the same JSON file the Claude read-only *hook* consumes + (``~/.trinity/read-only-config.json`` → ``enabled``). Codex can't run Claude + hooks, so we read the file directly and translate it to ``--sandbox + read-only`` (a sandbox-native, non-cooperative enforcement). + + An absent file ⇒ not read-only (the normal writable-agent state — silent). + A present-but-unreadable/corrupt file fails OPEN **and logs**, matching the + reference hook (``read-only-guard.py`` logs ``read_only_config_load_error`` + and allows). Diverging one runtime to fail-closed would make read-only + enforcement inconsistent across runtimes (CSO #1187 finding 3); if the + platform wants fail-closed, change both loaders together in a dedicated + issue. + """ + try: + raw = _READ_ONLY_CONFIG.read_text() + except FileNotFoundError: + return False + except OSError as exc: + logger.warning( + "[Codex] read-only config unreadable (%s); treating as not read-only", exc + ) + return False + try: + return bool(json.loads(raw).get("enabled")) + except json.JSONDecodeError as exc: + logger.warning( + "[Codex] read-only config malformed (%s); treating as not read-only", exc + ) + return False + + +def _resolve_sandbox_mode() -> str: + """Map Trinity's mode to a Codex ``--sandbox`` value. + + Normal (writable) agents run with ``danger-full-access``, which DISABLES + Codex's own bubblewrap sandbox. ``workspace-write``/``read-only`` both invoke + ``bwrap`` to create a user namespace, which the hardened Trinity container + forbids (``bwrap: No permissions to create a new namespace``) — so any + in-sandbox mode blocks EVERY shell tool. The Trinity container is already the + security boundary (``cap_drop ALL`` + AppArmor + ``no-new-privileges``), + exactly the posture Claude and Gemini run under (no internal sandbox), so + dropping Codex's redundant inner sandbox weakens nothing. + + Read-only mode is the deliberate exception: it keeps ``--sandbox read-only`` + (sandbox-native write protection) as the interim enforcement. A fail-closed + read-only enforcement story for Codex is a fast-follow. + """ + return "read-only" if _is_read_only() else "danger-full-access" + + +def _surface_unmapped_guardrails(allowed_tools: Optional[List[str]]) -> None: + """Honor what maps to Codex's control surface; SURFACE (never silently + drop) the rest (#1187 decision 8 + the unresolved-decision caveat). + + Read-only is enforced via the sandbox. Claude ``disallowed_tools`` names + (Bash, Write, Edit, WebSearch, …) have no 1:1 Codex ``exec`` CLI toggle in + the MVP, so we log them at WARNING for operator visibility rather than + pretending they're enforced. + """ + guardrails = _load_guardrails() + disallowed = guardrails.get("disallowed_tools") or [] + if disallowed: + logger.warning( + "[Codex] guardrails disallow %s — Codex exec has no per-tool CLI " + "toggle in the MVP; only read-only (sandbox) and network access are " + "enforced. Tracking finer-grained Codex tool gating as a fast-follow.", + disallowed, + ) + if allowed_tools: + logger.info( + "[Codex] allowed_tools=%s requested; Codex exec runs its full tool " + "set under the sandbox (no allowlist CLI flag in the MVP).", + allowed_tools, + ) + + +def _compose_prompt(system_prompt: Optional[str], prompt: str) -> str: + """Codex ``exec`` has no system-prompt flag, so the effective platform + prompt (platform instructions + execution context + caller prompt, always + sent by the backend) is prepended to the user message. Persistent identity + additionally comes from AGENTS.md.""" + if system_prompt: + return f"{system_prompt}\n\n---\n\n{prompt}" + return prompt + + +def _ensure_within(base: str, path: str) -> str: + """Resolve ``path`` and confirm it stays within ``base``; raise otherwise. + + Defense-in-depth at the filesystem sink. The result filename is already + reduced to a safe token by ``_safe_result_token`` + a fixed ``-last.txt`` + suffix, so this never trips in practice — but anchoring the containment + check at the ``open``/``unlink`` sink keeps the safety property local to the + operation that actually touches the filesystem (and is the barrier static + analysis recognizes).""" + base_real = os.path.realpath(base) + target = os.path.realpath(path) + if target != base_real and not target.startswith(base_real + os.sep): + raise ValueError(f"result path escapes codex_home: {path!r}") + return target + + +def _read_and_consume_result_file(path: str, base: str) -> Optional[str]: + """Read the ``-o`` durable result file. Deletion is the caller's ``finally`` + (read-then-delete, happy + error path — #1187 decision 5). ``base`` anchors + the sink-side containment guard (see ``_ensure_within``).""" + try: + with open(_ensure_within(base, path), "r", encoding="utf-8", errors="replace") as fh: + return fh.read() + except ValueError: + # Containment guard tripped — must never happen in practice; surface it + # rather than masking a genuine path bug as a benign missing file. + logger.warning("[Codex] refusing to read result file outside codex_home: %r", path) + return None + except (IOError, OSError): + return None + + +def _safe_unlink(path: str, base: str) -> None: + try: + os.unlink(_ensure_within(base, path)) + except ValueError: + logger.warning("[Codex] refusing to unlink result file outside codex_home: %r", path) + except OSError: + pass + + +def _safe_result_token(execution_id: str) -> str: + """Filesystem-safe token for the ``-o`` result filename. ``execution_id`` is + system-generated today (uuid4 fallback / backend urlsafe token), but never + build a path from it unguarded: reduce it to a basename and a conservative + charset so a '/' or '..' can't escape CODEX_HOME (defense-in-depth — CSO + #1187 finding 2).""" + token = re.sub(r"[^A-Za-z0-9_.-]", "_", os.path.basename(execution_id)) + return token or "codex" + + +def _resolve_returned_session_id(metadata: ExecutionMetadata) -> Optional[str]: + """The thread id to cache for chat continuity (review I4). + + Codex emits ``thread.started`` on every ``exec``; if it somehow didn't, + return ``None`` so the next turn degrades to a fresh run — NOT a fabricated + id (e.g. the random ``execution_id``), which would make the next + ``codex exec resume `` fail hard and repeat every turn. + """ + return metadata.session_id + + +def _finalize_codex_response( + result_file_text: Optional[str], response_parts: List[str] +) -> str: + """The ``-o`` file is the authoritative response; JSONL ``agent_message`` + parts are the fallback when the file is missing/empty (#1187 decision 5).""" + if result_file_text and result_file_text.strip(): + return result_file_text.strip() + return "\n".join(response_parts).strip() + + +# --------------------------------------------------------------------------- +# JSONL event parsing +# --------------------------------------------------------------------------- + +# item.type values that represent tool/command activity (vs. agent_message / +# reasoning / todo_list). Confirmed against codex exec_events.rs ThreadItemDetails. +_CODEX_TOOL_ITEM_TYPES = { + "command_execution", + "file_change", + "mcp_tool_call", + "web_search", +} + +_CODEX_TOOL_DISPLAY = { + "command_execution": "Shell", + "file_change": "FileChange", + "mcp_tool_call": "McpTool", + "web_search": "WebSearch", +} + + +@dataclass +class _CodexParseState: + """Mutable accumulators threaded through per-event parsing.""" + + execution_log: List[ExecutionLogEntry] + metadata: ExecutionMetadata + response_parts: List[str] + model: Optional[str] = None + seen_tool_ids: set = field(default_factory=set) + + +def _tool_display_name(item: dict, item_type: str) -> str: + if item_type == "mcp_tool_call": + tool = item.get("tool") or item.get("name") + server = item.get("server") + if tool: + return f"{server}.{tool}" if server else str(tool) + return _CODEX_TOOL_DISPLAY.get(item_type, item_type) + + +def _tool_input(item: dict, item_type: str) -> dict: + if item_type == "command_execution": + return {"command": item.get("command")} + if item_type == "web_search": + return {"query": item.get("query")} + if item_type == "file_change": + return {"changes": item.get("changes")} + if item_type == "mcp_tool_call": + return {"arguments": item.get("arguments")} + return {} + + +def _tool_output(item: dict, item_type: str) -> str: + for key in ("aggregated_output", "output", "result", "stdout"): + value = item.get(key) + if isinstance(value, str) and value: + return value + return "" + + +def _record_tool_use(state: _CodexParseState, tool_id: str, item: dict, item_type: str) -> None: + if tool_id in state.seen_tool_ids: + return + state.seen_tool_ids.add(tool_id) + name = _tool_display_name(item, item_type) + tool_input = _tool_input(item, item_type) + state.execution_log.append( + ExecutionLogEntry( + id=tool_id, + type="tool_use", + tool=name, + input=tool_input, + timestamp=datetime.now().isoformat(), + ) + ) + try: + start_tool_execution(tool_id, name, tool_input) + except Exception: # noqa: BLE001 - activity tracking is best-effort + logger.debug("[Codex] start_tool_execution failed for %s", tool_id, exc_info=True) + + +def _record_tool_result(state: _CodexParseState, tool_id: str, item: dict, item_type: str) -> None: + name = _tool_display_name(item, item_type) + output = _tool_output(item, item_type) + status = item.get("status") + exit_code = item.get("exit_code") + is_error = status == "failed" or (isinstance(exit_code, int) and exit_code != 0) + state.execution_log.append( + ExecutionLogEntry( + id=tool_id, + type="tool_result", + tool=name, + output=output or None, + success=not is_error, + timestamp=datetime.now().isoformat(), + ) + ) + try: + complete_tool_execution(tool_id, not is_error, output) + except Exception: # noqa: BLE001 + logger.debug("[Codex] complete_tool_execution failed for %s", tool_id, exc_info=True) + + +def _process_codex_event(event: dict, state: _CodexParseState) -> None: + """Update ``state`` from one parsed Codex JSONL event. Tolerant of unknown + event/item types and missing fields — the ``-o`` file is authoritative for + the response, so a best-effort parser here only affects tokens, tool + activity, and error classification.""" + event_type = event.get("type") + + if event_type == "thread.started": + state.metadata.session_id = event.get("thread_id") or state.metadata.session_id + + elif event_type == "turn.completed": + usage = event.get("usage") or {} + input_tokens = int(usage.get("input_tokens") or 0) + cached = int(usage.get("cached_input_tokens") or 0) + output_tokens = int(usage.get("output_tokens") or 0) + # reasoning_output_tokens is a subset of output_tokens — do NOT add it. + state.metadata.input_tokens = input_tokens + state.metadata.output_tokens = output_tokens + state.metadata.cache_read_tokens = cached + state.metadata.cost_usd = calculate_codex_cost( + input_tokens, cached, output_tokens, state.model + ) + + elif event_type == "turn.failed": + error = event.get("error") or {} + state.metadata.error_type = "turn_failed" + state.metadata.error_message = ( + error.get("message") if isinstance(error, dict) else str(error) + ) or "Codex turn failed" + + elif event_type == "error": + state.metadata.error_type = "error" + state.metadata.error_message = event.get("message") or "Codex error" + + elif event_type in ("item.started", "item.updated", "item.completed"): + item = event.get("item") or {} + item_type = item.get("type") or (item.get("details") or {}).get("type") + if not item_type: + return + item_id = item.get("id") or str(uuid.uuid4()) + + if item_type == "agent_message": + if event_type == "item.completed": + text = item.get("text") or item.get("message") or "" + if text: + state.response_parts.append(text) + elif item_type in _CODEX_TOOL_ITEM_TYPES: + if event_type == "item.started": + _record_tool_use(state, item_id, item, item_type) + elif event_type == "item.completed": + _record_tool_use(state, item_id, item, item_type) # no-op if seen + _record_tool_result(state, item_id, item, item_type) + elif item_type == "error": + state.metadata.error_type = "error" + state.metadata.error_message = ( + item.get("message") or state.metadata.error_message or "Codex item error" + ) + + +def parse_codex_jsonl( + lines: List[str], model: Optional[str] = None +) -> Tuple[str, List[ExecutionLogEntry], ExecutionMetadata, List[Dict]]: + """Parse a full Codex ``--json`` line stream (unit-test entrypoint). + + Returns ``(response_text, execution_log, metadata, raw_messages)`` where + ``response_text`` is the JSONL-assembled fallback (the live path overrides + it with the ``-o`` file).""" + metadata = ExecutionMetadata() + metadata.context_window = CODEX_CONTEXT_WINDOW + state = _CodexParseState(execution_log=[], metadata=metadata, response_parts=[], model=model) + raw_messages: List[Dict] = [] + for line in lines: + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(event, dict): + raw_messages.append(event) + _process_codex_event(event, state) + metadata.tool_count = len([e for e in state.execution_log if e.type == "tool_use"]) + response_text = "\n".join(state.response_parts).strip() + return response_text, state.execution_log, metadata, raw_messages + + +# --------------------------------------------------------------------------- +# Error classification (return-code path) +# --------------------------------------------------------------------------- + +# AUTH detection is anchored, not bare-substring. Bare "401"/"api key" are +# over-broad — a non-auth failure whose output merely contains "401" (e.g. an +# upstream MCP/tool returning 401) must NOT be read as an auth failure, because +# 503 is the backend's AUTH signal and the dispatch breaker counts AUTH only +# (#1187 decision 3, review I1). Each pattern names an actual auth condition and +# uses word boundaries so it won't fire on an incidental token. +_AUTH_PATTERNS = ( + re.compile(r"\bunauthorized\b", re.IGNORECASE), + re.compile(r"\b401\s+unauthorized\b", re.IGNORECASE), + re.compile(r"\b(?:invalid|incorrect|missing|no)[ _]api[ _]key\b", re.IGNORECASE), + re.compile(r"\bnot\s+authenticated\b", re.IGNORECASE), + re.compile(r"\bauthentication\s+(?:failed|error)\b", re.IGNORECASE), +) +_RATE_MARKERS = ("429", "rate limit", "rate_limit", "quota", "too many requests") + + +def _classify_codex_failure( + return_code: int, stderr: str, metadata: ExecutionMetadata +) -> Tuple[int, str]: + """Map a non-zero Codex exit (+ stderr + parsed error) to an HTTP status. + + auth → 503, rate-limit → 429, everything else → 500 (runtime-unavailable). + Crucially a generic runtime failure is 500, NOT 503 — 503 is the backend's + AUTH signal and the dispatch breaker counts AUTH only (#1187 decision 3).""" + haystack = " ".join( + s for s in (stderr or "", metadata.error_message or "") if s + ) + haystack_lower = haystack.lower() + if any(marker in haystack_lower for marker in _RATE_MARKERS): + return 429, f"Codex rate limit: {(stderr or metadata.error_message or '')[:300]}" + if any(pattern.search(haystack) for pattern in _AUTH_PATTERNS): + return 503, ( + f"Codex authentication failure: {(stderr or metadata.error_message or '')[:300]}. " + "Check OPENAI_API_KEY." + ) + detail = stderr.strip() or metadata.error_message or "see agent logs" + return 500, f"Codex execution failed (exit code {return_code}): {detail[:300]}" + + +# --------------------------------------------------------------------------- +# Runtime +# --------------------------------------------------------------------------- + +class CodexRuntime(AgentRuntime): + """OpenAI Codex CLI implementation of AgentRuntime.""" + + def __init__(self) -> None: + # Codex thread id for the interactive chat session (continuity). The + # singleton instance persists across /api/chat calls in a container. + self._chat_thread_id: Optional[str] = None + + # -- capability declaration (#1187 Phase G) -------------------------------- + @classmethod + def capabilities(cls) -> RuntimeCapabilities: + return RuntimeCapabilities( + chat_continuity=True, # codex exec resume + session_tab_resume=False, # MVP: Session tab stays Claude/Gemini + mcp_support=True, # codex mcp add + cost_reporting="estimated", # no native cost → derived from tokens + ) + + def is_available(self) -> bool: + try: + result = subprocess.run( + ["codex", "--version"], capture_output=True, text=True, timeout=5 + ) + return result.returncode == 0 + except Exception: + return False + + def get_default_model(self) -> str: + return "gpt-5.1-codex" + + def get_context_window(self, model: Optional[str] = None) -> int: + return CODEX_CONTEXT_WINDOW + + def configure_mcp(self, mcp_servers: Dict) -> bool: + """Delegate to the shared Codex MCP configuration in trinity_mcp.py.""" + from .trinity_mcp import _configure_codex_mcp_servers + + return _configure_codex_mcp_servers(mcp_servers) + + # -- command construction -------------------------------------------------- + def _build_codex_command( + self, + *, + model: Optional[str], + sandbox_mode: str, + result_file: str, + agent_home: str, + resume_thread_id: Optional[str], + ) -> List[str]: + cmd = ["codex", "exec"] + # Exec-level flags belong to `codex exec`, NOT to the `resume` + # sub-subcommand. In codex 0.139.0, `exec resume [OPTIONS] [SESSION_ID] + # [PROMPT]` has a NARROWER option set and rejects -C/--sandbox/--json/-o + # ("error: unexpected argument '-C' found", exit 2 — breaks every + # turn-2+ continuity call). So they MUST be emitted BEFORE `resume`. + cmd += [ + "--json", + "--skip-git-repo-check", + "-C", + agent_home, + "--sandbox", + sandbox_mode, + "-o", + result_file, + ] + # Normal mode is `danger-full-access` (no inner sandbox; the Trinity + # container is the boundary — see _resolve_sandbox_mode), which already + # permits network access, so no `sandbox_workspace_write.network_access` + # override is needed. Read-only stays `read-only`. We no longer emit + # `workspace-write` at all. + if model: + cmd += ["-m", model] + # Continuity: `codex exec resume ` replays a prior + # thread. Emitted AFTER the exec-level flags above (narrower arg set). + if resume_thread_id: + cmd += ["resume", resume_thread_id] + # End-of-options separator (review I3): the caller appends the prompt as + # the next (positional) token — for a resume it is resume's PROMPT arg — + # so a prompt starting with "-"/"--" can never be reparsed as a flag + # (worst case weakening the sandbox). + cmd.append("--") + return cmd + + # -- core subprocess execution (stubbed in unit tests) --------------------- + async def _execute_codex( + self, + *, + prompt: str, + model: Optional[str], + system_prompt: Optional[str], + resume_thread_id: Optional[str], + timeout_seconds: int, + allowed_tools: Optional[List[str]], + execution_id: Optional[str], + concurrent_reader: bool = False, + ) -> Tuple[str, List[ExecutionLogEntry], ExecutionMetadata, List[Dict], Optional[str]]: + execution_id = execution_id or str(uuid.uuid4()) + + api_key = _load_openai_api_key() + if not api_key: + raise HTTPException( + status_code=503, + detail=( + "OpenAI API key not configured in agent container. Inject " + "OPENAI_API_KEY via credentials." + ), + ) + + codex_home = _ensure_codex_home() + result_file = os.path.join(codex_home, f"{_safe_result_token(execution_id)}-last.txt") + sandbox_mode = _resolve_sandbox_mode() + _surface_unmapped_guardrails(allowed_tools) + composed_prompt = _compose_prompt(system_prompt, prompt) + + cmd = self._build_codex_command( + model=model, + sandbox_mode=sandbox_mode, + result_file=result_file, + agent_home=_AGENT_HOME, + resume_thread_id=resume_thread_id, + ) + cmd.append(composed_prompt) + + env = { + **os.environ, + EXECUTION_TAG_NAME: execution_id, + "CODEX_HOME": codex_home, + # Inject under both names — the ecosystem standard is OPENAI_API_KEY; + # some Codex builds also read CODEX_API_KEY. Defensive (verified in + # /verify-local). + "OPENAI_API_KEY": api_key, + "CODEX_API_KEY": api_key, + } + + metadata = ExecutionMetadata() + metadata.context_window = self.get_context_window(model) + metadata.execution_id = execution_id + execution_log: List[ExecutionLogEntry] = [] + raw_messages: List[Dict] = [] + response_parts: List[str] = [] + state = _CodexParseState( + execution_log=execution_log, + metadata=metadata, + response_parts=response_parts, + model=model, + ) + stderr_lines: List[str] = [] + + registry = get_process_registry() + logger.info( + "[Codex] exec sandbox=%s resume=%s model=%s execution_id=%s", + sandbox_mode, bool(resume_thread_id), model or "(default)", execution_id, + ) + + # stdin=DEVNULL: the prompt is a positional arg, so Codex must not block + # waiting on stdin. start_new_session=True isolates the process group so + # cleanup signals only Codex's descendants, never sibling executions. + process = subprocess.Popen( + cmd, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + start_new_session=True, + env=env, + ) + process_pgid = _capture_pgid(process) + registry.register( + execution_id, process, metadata={"type": "codex", "pgid": process_pgid} + ) + + import threading + + def read_stdout() -> None: + try: + for line in iter(process.stdout.readline, ""): + if not line: + break + try: + sanitized = sanitize_subprocess_line(line) + try: + event = json.loads(sanitized.strip()) + except json.JSONDecodeError: + continue + if isinstance(event, dict): + event = sanitize_dict(event) + raw_messages.append(event) + try: + registry.publish_log_entry(execution_id, event) + except Exception as pub_err: # noqa: BLE001 + logger.debug( + "[Codex] publish_log_entry failed (continuing): %s", + pub_err, + ) + _process_codex_event(event, state) + except Exception as line_err: # noqa: BLE001 + logger.debug( + "[Codex] per-line processing error (continuing): %s", + line_err, + ) + except Exception as exc: # noqa: BLE001 + logger.error("[Codex] error reading stdout: %s", exc) + + def read_stderr() -> None: + try: + for line in iter(process.stderr.readline, ""): + if not line: + break + stderr_lines.append(line) + except Exception as exc: # noqa: BLE001 + logger.error("[Codex] error reading stderr: %s", exc) + + def read_subprocess_output() -> Tuple[str, int]: + stdout_thread = threading.Thread(target=read_stdout, daemon=True) + stderr_thread = threading.Thread(target=read_stderr, daemon=True) + stdout_thread.start() + stderr_thread.start() + try: + return_code = process.wait(timeout=timeout_seconds) + except subprocess.TimeoutExpired: + logger.error( + "[Codex] execution %s timed out after %ss — killing group", + execution_id, timeout_seconds, + ) + _terminate_process_group( + process, graceful_timeout=5, pgid=process_pgid, + execution_tag=execution_id, + ) + _drain_bounded( + process, stdout_thread, stderr_thread, grace=3, + pgid=process_pgid, execution_tag=execution_id, + ) + raise + _drain_bounded( + process, stdout_thread, stderr_thread, grace=5, + pgid=process_pgid, execution_tag=execution_id, + ) + stderr = "".join(stderr_lines) + return (sanitize_text(stderr) if stderr else stderr), return_code + + # The lock-serialized chat path uses the bounded single-worker executor; + # the concurrent /api/task path uses the loop's default executor so + # parallel task readers don't serialize behind one worker (review I2, + # parity with Claude's headless path). None → default executor. + reader_executor = None if concurrent_reader else _executor + + loop = asyncio.get_event_loop() + try: + try: + stderr_output, return_code = await asyncio.wait_for( + loop.run_in_executor(reader_executor, read_subprocess_output), + timeout=timeout_seconds + 60, + ) + except asyncio.TimeoutError: + logger.error( + "[Codex] outer timeout on %s — killing group as last resort", + execution_id, + ) + await loop.run_in_executor( + None, + lambda: _terminate_process_group( + process, graceful_timeout=2, pgid=process_pgid, + execution_tag=execution_id, + ), + ) + await loop.run_in_executor(None, _safe_close_pipes, process) + raise HTTPException( + status_code=504, + detail=f"Codex execution timed out after {timeout_seconds} seconds", + ) + except subprocess.TimeoutExpired: + raise HTTPException( + status_code=504, + detail=f"Codex execution timed out after {timeout_seconds} seconds", + ) + + if return_code != 0: + status_code, detail = _classify_codex_failure( + return_code, stderr_output, metadata + ) + # NOTE: no metadata.status write here — this path raises + # HTTPException and the local metadata is discarded, so the + # backend reads the failure from the HTTP status, not metadata. + logger.error("[Codex] %s", detail) + raise HTTPException(status_code=status_code, detail=detail) + + # -o file is authoritative; JSONL parts are the fallback. + result_text = _read_and_consume_result_file(result_file, codex_home) + response_text = _finalize_codex_response(result_text, response_parts) + response_text = sanitize_text(response_text) + + tool_use_count = len([e for e in execution_log if e.type == "tool_use"]) + metadata.tool_count = tool_use_count + if not response_text: + response_text = ( + "(Task completed)" if tool_use_count else "(No response from Codex)" + ) + metadata.status = "success" + session_id = _resolve_returned_session_id(metadata) + logger.info( + "[Codex] done execution_id=%s cost=$%s tokens=%s/%s tools=%s", + execution_id, metadata.cost_usd, metadata.input_tokens, + metadata.output_tokens, metadata.tool_count, + ) + return response_text, execution_log, metadata, raw_messages, session_id + finally: + # Read-then-delete in finally — happy + error path (#1187 decision 5). + _safe_unlink(result_file, codex_home) + registry.unregister(execution_id) + + # -- public interface ------------------------------------------------------ + async def execute( + self, + prompt: str, + model: Optional[str] = None, + continue_session: bool = False, + stream: bool = False, + system_prompt: Optional[str] = None, + execution_id: Optional[str] = None, + ) -> Tuple[str, List[ExecutionLogEntry], ExecutionMetadata, List[Dict]]: + if not self.is_available(): + raise HTTPException( + status_code=503, + detail="Codex CLI is not available in this container", + ) + + resume_thread_id: Optional[str] = None + if continue_session and agent_state.session_started and self._chat_thread_id: + resume_thread_id = self._chat_thread_id + else: + agent_state.session_started = True + self._chat_thread_id = None + + guardrails = _load_guardrails() + timeout_seconds = int( + guardrails.get("execution_timeout_sec") or _DEFAULT_EXECUTION_TIMEOUT_SEC + ) + + try: + response, log, metadata, raw, session_id = await self._execute_codex( + prompt=prompt, + model=model, + system_prompt=system_prompt, + resume_thread_id=resume_thread_id, + timeout_seconds=timeout_seconds, + allowed_tools=None, + execution_id=execution_id, + concurrent_reader=False, # chat is lock-serialized → bounded reader + ) + except HTTPException: + raise + except TimeoutError as exc: + raise HTTPException(status_code=504, detail=str(exc)) + except (BrokenPipeError, ConnectionResetError) as pipe_err: + logger.info("[Codex] subprocess pipe closed before completion: %s", pipe_err) + raise HTTPException( + status_code=502, + detail="Agent subprocess closed before the chat could complete", + ) + except Exception as exc: # noqa: BLE001 + logger.error("[Codex] execution error: %s", exc) + raise HTTPException(status_code=500, detail=f"Execution error: {exc}") + + # Track thread id for the next continue_session turn. + if session_id: + self._chat_thread_id = session_id + agent_state.session_started = True + + # Update session rollups (mirrors the Gemini path). + if metadata.cost_usd: + agent_state.session_total_cost += metadata.cost_usd + agent_state.session_total_output_tokens += metadata.output_tokens + if metadata.input_tokens > agent_state.session_context_tokens: + agent_state.session_context_tokens = metadata.input_tokens + agent_state.session_context_window = metadata.context_window + return response, log, metadata, raw + + async def execute_headless( + self, + prompt: str, + model: Optional[str] = None, + allowed_tools: Optional[List[str]] = None, + system_prompt: Optional[str] = None, + timeout_seconds: int = 900, + max_turns: Optional[int] = None, + execution_id: Optional[str] = None, + resume_session_id: Optional[str] = None, + persist_session: bool = False, + images: Optional[List[Dict]] = None, + ) -> Tuple[str, List[ExecutionLogEntry], ExecutionMetadata, Optional[str]]: + if not self.is_available(): + raise HTTPException( + status_code=503, + detail="Codex CLI is not available in this container", + ) + if images: + logger.warning("[Codex] images are not supported in the MVP — ignoring") + if max_turns is not None: + logger.info( + "[Codex] max_turns=%s requested; Codex exec has no turn cap CLI " + "flag — relying on the %ss wall-clock timeout.", + max_turns, timeout_seconds, + ) + + try: + response, log, metadata, raw, session_id = await self._execute_codex( + prompt=prompt, + model=model, + system_prompt=system_prompt, + resume_thread_id=resume_session_id, + timeout_seconds=timeout_seconds, + allowed_tools=allowed_tools, + execution_id=execution_id, + concurrent_reader=True, # /api/task runs concurrently → default reader + ) + except HTTPException: + raise + except TimeoutError as exc: + raise HTTPException(status_code=504, detail=str(exc)) + except (BrokenPipeError, ConnectionResetError) as pipe_err: + # 502 (not 503) so the SUB-003 auth-switch isn't tripped by an early + # child exit — parity with the Claude/Gemini headless paths (#474). + logger.info("[Codex] subprocess pipe closed before completion: %s", pipe_err) + raise HTTPException( + status_code=502, + detail="Agent subprocess closed before task could complete", + ) + except Exception as exc: # noqa: BLE001 + logger.error("[Codex] task execution error: %s", exc) + raise HTTPException(status_code=500, detail=f"Task execution error: {exc}") + + return response, log, metadata, session_id + + +# Global Codex runtime instance (singleton, mirrors claude/gemini). +_codex_runtime: Optional[CodexRuntime] = None + + +def get_codex_runtime() -> CodexRuntime: + global _codex_runtime + if _codex_runtime is None: + _codex_runtime = CodexRuntime() + return _codex_runtime diff --git a/docker/base-image/agent_server/services/gemini_runtime.py b/docker/base-image/agent_server/services/gemini_runtime.py index ec8b1503..7d619069 100644 --- a/docker/base-image/agent_server/services/gemini_runtime.py +++ b/docker/base-image/agent_server/services/gemini_runtime.py @@ -21,7 +21,7 @@ from ..utils.subprocess_pgroup import EXECUTION_TAG_NAME from ..utils.orphan_sweep import kill_cgroup_orphans from .activity_tracking import start_tool_execution, complete_tool_execution -from .runtime_adapter import AgentRuntime +from .runtime_adapter import AgentRuntime, RuntimeCapabilities logger = logging.getLogger(__name__) @@ -90,6 +90,18 @@ def calculate_gemini_cost(input_tokens: int, output_tokens: int, model: Optional class GeminiRuntime(AgentRuntime): """Gemini CLI implementation of AgentRuntime interface.""" + @classmethod + def capabilities(cls) -> RuntimeCapabilities: + # Gemini supports chat continuity (--resume) and MCP, but NOT the + # Session-tab cached-UUID resume (execute_headless ignores + # resume_session_id), and cost is derived from tokens. (#1187) + return RuntimeCapabilities( + chat_continuity=True, + session_tab_resume=False, + mcp_support=True, + cost_reporting="estimated", + ) + def is_available(self) -> bool: """Check if Gemini CLI is installed.""" try: diff --git a/docker/base-image/agent_server/services/runtime_adapter.py b/docker/base-image/agent_server/services/runtime_adapter.py index e40e19a6..a91bc1b6 100644 --- a/docker/base-image/agent_server/services/runtime_adapter.py +++ b/docker/base-image/agent_server/services/runtime_adapter.py @@ -7,6 +7,7 @@ import os import logging from abc import ABC, abstractmethod +from dataclasses import dataclass, asdict from typing import List, Dict, Optional, Tuple from datetime import datetime @@ -15,6 +16,24 @@ logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class RuntimeCapabilities: + """What a runtime supports, so callers gate on a capability instead of + branching on the runtime name (#1187). + + ``cost_reporting`` is a string, not a bool: ``"native"`` means the CLI + reports a real cost (Claude Code), ``"estimated"`` means Trinity derives + it from token counts (Gemini, Codex). + """ + chat_continuity: bool = False + session_tab_resume: bool = False + mcp_support: bool = False + cost_reporting: str = "estimated" # "native" | "estimated" + + def to_dict(self) -> Dict[str, object]: + return asdict(self) + + class AgentRuntime(ABC): """ Abstract base class for agent execution runtimes. @@ -142,28 +161,60 @@ async def execute_headless( """ pass + @classmethod + def capabilities(cls) -> RuntimeCapabilities: + """Declare what this runtime supports. + + Conservative by default (#1187, AC2): a runtime that forgets to + override this is treated as the least-capable — no Session-tab + resume, no assumed MCP, estimated cost. Override per runtime to + declare real support. + """ + return RuntimeCapabilities() + + +# Accepted AGENT_RUNTIME values (lowercased). Unknown values fail loudly +# rather than silently selecting Claude (#1187 Phase D). +_CLAUDE_RUNTIMES = frozenset({"claude-code", "claude"}) +_GEMINI_RUNTIMES = frozenset({"gemini-cli", "gemini"}) +_CODEX_RUNTIMES = frozenset({"codex"}) +KNOWN_RUNTIMES = _CLAUDE_RUNTIMES | _GEMINI_RUNTIMES | _CODEX_RUNTIMES + def get_runtime() -> AgentRuntime: """ Factory function to get the appropriate runtime based on configuration. Reads AGENT_RUNTIME environment variable to determine which runtime to use. - Defaults to Claude Code for backward compatibility. + Defaults to Claude Code (env unset) for backward compatibility, but an + explicitly-set UNKNOWN value raises instead of silently falling back to + Claude — a typo'd runtime should fail loudly, not run the wrong engine + (#1187 Phase D). Returns: - AgentRuntime instance (ClaudeCodeRuntime or GeminiRuntime) + AgentRuntime instance (ClaudeCodeRuntime, GeminiRuntime, or CodexRuntime) + + Raises: + ValueError: if AGENT_RUNTIME is set to an unrecognized value. """ runtime_type = os.getenv("AGENT_RUNTIME", "claude-code").lower() - if runtime_type == "gemini-cli" or runtime_type == "gemini": + if runtime_type in _GEMINI_RUNTIMES: from .gemini_runtime import get_gemini_runtime - runtime = get_gemini_runtime() logger.info("Using Gemini CLI runtime") - return runtime - else: - # Default to Claude Code + return get_gemini_runtime() + if runtime_type in _CODEX_RUNTIMES: + from .codex_runtime import get_codex_runtime + logger.info("Using OpenAI Codex runtime") + return get_codex_runtime() + if runtime_type in _CLAUDE_RUNTIMES: from .claude_code import get_claude_runtime - runtime = get_claude_runtime() logger.info("Using Claude Code runtime") - return runtime + return get_claude_runtime() + + raise ValueError( + f"Unknown AGENT_RUNTIME={runtime_type!r}. " + f"Known runtimes: {sorted(KNOWN_RUNTIMES)}. " + "Refusing to silently fall back to Claude Code." + ) diff --git a/docker/base-image/agent_server/services/trinity_mcp.py b/docker/base-image/agent_server/services/trinity_mcp.py index c398ef9c..0fd24dcf 100644 --- a/docker/base-image/agent_server/services/trinity_mcp.py +++ b/docker/base-image/agent_server/services/trinity_mcp.py @@ -7,6 +7,7 @@ import json import logging import subprocess +import tomllib # py3.11+; agent base image is python 3.13 from pathlib import Path logger = logging.getLogger(__name__) @@ -31,6 +32,8 @@ def inject_trinity_mcp_if_configured() -> bool: runtime = os.getenv("AGENT_RUNTIME", "claude-code").lower() + if runtime == "codex": + return _inject_codex_mcp(trinity_mcp_url, trinity_mcp_api_key) if runtime == "gemini-cli": return _inject_gemini_mcp(trinity_mcp_url, trinity_mcp_api_key) else: @@ -143,6 +146,8 @@ def configure_mcp_servers(mcp_servers: dict) -> bool: runtime = os.getenv("AGENT_RUNTIME", "claude-code").lower() + if runtime == "codex": + return _configure_codex_mcp_servers(mcp_servers) if runtime == "gemini-cli": return _configure_gemini_mcp_servers(mcp_servers) else: @@ -211,3 +216,224 @@ def _configure_gemini_mcp_servers(mcp_servers: dict) -> bool: logger.info(f"Configured {success_count}/{len(mcp_servers)} MCP servers for Gemini CLI") return success_count > 0 or len(mcp_servers) == 0 + + +# --------------------------------------------------------------------------- +# Codex CLI MCP configuration (#1187 Phase F) +# +# Codex reads MCP servers from ``$CODEX_HOME/config.toml`` under +# ``[mcp_servers.]``. We write that file DIRECTLY (the same approach the +# Gemini path uses for its settings.json — deterministic, avoids `codex mcp +# add` CLI-syntax drift) and MERGE so the Trinity-MCP injection and the +# template-MCP configuration (two separate calls) don't clobber each other. +# +# CODEX_HOME is the relocated, gitignored scratch path (see codex_runtime.py); +# both this config writer and the runtime resolve it via the same helper so the +# file we write is the file Codex reads. +# --------------------------------------------------------------------------- + +def _codex_config_path() -> Path: + from .codex_runtime import _codex_home # lazy: avoid an import cycle + + return Path(_codex_home()) / "config.toml" + + +def _read_codex_config(path: Path) -> dict: + try: + with open(path, "rb") as fh: + return tomllib.load(fh) + except (IOError, OSError): + return {} + except tomllib.TOMLDecodeError as exc: + # Do NOT silently reset on a decode error. If we returned {} here, the + # next _upsert_codex_mcp_servers would rewrite the file from {} and + # drop every previously-written server — including the Trinity MCP + # wiring — with no trace. Back the bad file up and log loudly so the + # corruption is recoverable and visible; the caller re-injects its + # servers onto a clean slate on this run. + try: + backup = path.with_name(path.name + ".corrupt") + path.replace(backup) + logger.error( + "Codex config.toml is malformed (%s); backed it up to %s and " + "starting from an empty config. MCP servers are re-written this " + "run.", + exc, backup, + ) + except OSError as backup_err: + logger.error( + "Codex config.toml is malformed (%s) and the backup also failed " + "(%s); rewriting from an empty config.", + exc, backup_err, + ) + return {} + + +# Bare TOML keys are limited to ASCII letters, digits, '_' and '-'. Anything +# else (space, '.', ']', '#', control chars) must be a quoted basic-string key. +_BARE_KEY_CHARS = frozenset( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-" +) + +# Basic-string escapes with TOML shorthand. Everything else < 0x20 (plus 0x7F) +# becomes a \uXXXX escape; otherwise an out-of-band character (e.g. a newline +# in a server name or env value) yields invalid TOML. +_TOML_SHORTHAND_ESCAPES = { + "\\": "\\\\", + '"': '\\"', + "\b": "\\b", + "\t": "\\t", + "\n": "\\n", + "\f": "\\f", + "\r": "\\r", +} + + +def _toml_escape(value: str) -> str: + out: list[str] = [] + for ch in value: + shorthand = _TOML_SHORTHAND_ESCAPES.get(ch) + if shorthand is not None: + out.append(shorthand) + elif ord(ch) < 0x20 or ord(ch) == 0x7F: + out.append(f"\\u{ord(ch):04X}") + else: + out.append(ch) + return "".join(out) + + +def _toml_key(key: str) -> str: + """Render a TOML key segment: bare when it is a valid bare key, otherwise a + quoted basic-string key. Used for both ``key = ...`` lines and the dotted + segments of ``[table.header]`` lines so a server name or env key with a + space/dot/special char can't produce unparseable TOML.""" + if key and all(c in _BARE_KEY_CHARS for c in key): + return key + return f'"{_toml_escape(key)}"' + + +def _toml_scalar(value) -> str: + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, list): + # A list of dicts is a TOML array-of-tables ([[name]]), which this + # writer never emits. Stringifying the dicts would silently corrupt a + # pre-existing config on round-trip. Raise instead: the caller + # (_upsert_codex_mcp_servers) serializes BEFORE write_text, so a raise + # leaves the original file intact and logs a warning — a safe no-op + # rather than a mangled rewrite (#1187 review). + if any(isinstance(item, dict) for item in value): + raise TypeError( + "codex config writer does not support TOML array-of-tables; " + "refusing to serialize to avoid corrupting the existing file" + ) + return "[" + ", ".join(_toml_scalar(item) for item in value) + "]" + if isinstance(value, dict): + # _serialize_table routes dicts to sub-tables, so a dict reaching here + # is an unexpected nesting. Raise rather than emit a stringified dict. + raise TypeError( + "codex config writer received a dict where a scalar was expected" + ) + return f'"{_toml_escape(str(value))}"' + + +def _serialize_table(path: list[str], table: dict, lines: list[str]) -> None: + """Recursively emit a TOML table. ``path`` is the header segments (empty for + the document root). Scalar keys are always emitted before any nested-table + headers (TOML requires it). A table with only sub-tables is left as an + implicit super-table (no redundant ``[parent]`` header), matching the + hand-written output this replaced.""" + scalars = {k: v for k, v in table.items() if not isinstance(v, dict)} + sub_tables = {k: v for k, v in table.items() if isinstance(v, dict)} + # Emit a header for a non-root table that has its own scalar keys, or that + # is entirely empty (so an explicit empty table round-trips). Skip it for a + # pure super-table whose only contents are nested tables. + emit_header = bool(path) and (bool(scalars) or not sub_tables) + if emit_header: + lines.append("[" + ".".join(_toml_key(seg) for seg in path) + "]") + for key, value in scalars.items(): + lines.append(f"{_toml_key(key)} = {_toml_scalar(value)}") + if emit_header or (scalars and sub_tables): + lines.append("") + for key, sub in sub_tables.items(): + _serialize_table(path + [key], sub, lines) + + +def _serialize_codex_config(config: dict) -> str: + """Serialize the codex config we manage to TOML: arbitrary top-level scalars + and tables (preserved on round-trip) plus the ``[mcp_servers.]`` + tables we write. Table/key names and string values are quoted/escaped so a + special character can never yield unparseable TOML.""" + lines: list[str] = [] + _serialize_table([], config, lines) + return "\n".join(lines).rstrip() + "\n" + + +def _upsert_codex_mcp_servers(servers: dict) -> bool: + """Merge ``servers`` into ``$CODEX_HOME/config.toml`` ``[mcp_servers.*]``, + preserving any existing servers + top-level settings.""" + path = _codex_config_path() + try: + path.parent.mkdir(parents=True, exist_ok=True) + config = _read_codex_config(path) + config.setdefault("mcp_servers", {}) + config["mcp_servers"].update(servers) + path.write_text(_serialize_codex_config(config)) + return True + except Exception as e: # noqa: BLE001 + logger.warning(f"Failed to write codex config.toml: {e}") + return False + + +def _inject_codex_mcp(trinity_mcp_url: str, trinity_mcp_api_key: str) -> bool: + """Wire the Trinity HTTP MCP server into the codex config. + + The bearer token is referenced by ENV VAR (``bearer_token_env_var``), NOT + written as a literal — the secret stays in the agent's environment and is + never persisted to config.toml (#1187 Phase F). + """ + # trinity_mcp_api_key is intentionally unused: Codex reads it from the + # TRINITY_MCP_API_KEY env var at run time. Accepting it keeps the + # _inject_*_mcp signatures uniform across runtimes. + del trinity_mcp_api_key + server = { + "url": trinity_mcp_url, + "bearer_token_env_var": "TRINITY_MCP_API_KEY", + } + if _upsert_codex_mcp_servers({"trinity": server}): + logger.info("Injected Trinity MCP server into codex config.toml") + return True + return False + + +def _configure_codex_mcp_servers(mcp_servers: dict) -> bool: + """Configure template-supplied MCP servers for Codex via config.toml. + + Stdio servers (command + args) are supported, matching the Gemini path's + scope. A server with no command is skipped with a warning. + """ + servers: dict = {} + for server_name, config in mcp_servers.items(): + command = config.get("command", "") + if not command: + logger.warning(f"Skipping MCP server '{server_name}': no command specified") + continue + entry: dict = {"command": command} + args = config.get("args") + if args: + entry["args"] = args + env = config.get("env") + if isinstance(env, dict) and env: + entry["env"] = env + servers[server_name] = entry + + if not servers: + return len(mcp_servers) == 0 + + ok = _upsert_codex_mcp_servers(servers) + logger.info( + f"Configured {len(servers)}/{len(mcp_servers)} MCP servers for Codex" + ) + return ok diff --git a/docker/base-image/startup.sh b/docker/base-image/startup.sh index 59d0f7a0..06f80fe4 100644 --- a/docker/base-image/startup.sh +++ b/docker/base-image/startup.sh @@ -307,6 +307,31 @@ if [ -d "/generated-creds" ]; then echo "Credential files copied" fi +# === Codex runtime setup (#1187) === +# Codex is the third agent runtime. Two Codex-specific quirks are fixed here: +# 1. Identity: Codex reads AGENTS.md (NOT CLAUDE.md). Mirror the agent's +# instructions so a Codex agent gets the same platform identity Claude does +# (the per-turn system prompt is additionally prepended by codex_runtime.py). +# 2. CODEX_HOME defaults to ~/.codex — inside the git-tracked repo, which would +# dirty auto-sync. Relocate it onto the disk-backed scratch dir (#1098). +# NOTE: startup.sh must NOT write to .gitignore (#953) — the canonical list +# (`_GITIGNORE_PATTERNS` in src/backend/services/git_service.py, applied on git +# init/push by `_build_gitignore_merge_command`) carries `.tmp/`, so the +# relocated CODEX_HOME under $TMPDIR is excluded from git without a shell write. +if [ "${AGENT_RUNTIME}" = "codex" ]; then + echo "Configuring Codex runtime..." + + if [ -f "/home/developer/CLAUDE.md" ] && [ ! -f "/home/developer/AGENTS.md" ]; then + cp /home/developer/CLAUDE.md /home/developer/AGENTS.md 2>/dev/null && \ + echo " Mirrored CLAUDE.md -> AGENTS.md for Codex" || \ + echo " Warning: could not create AGENTS.md" + fi + + CODEX_HOME_DIR="${CODEX_HOME:-${AGENT_TMPDIR}/codex}" + mkdir -p "${CODEX_HOME_DIR}" 2>/dev/null && chmod 700 "${CODEX_HOME_DIR}" 2>/dev/null || \ + echo " Warning: could not create CODEX_HOME ${CODEX_HOME_DIR}" +fi + # Ensure core agent-server dependencies are installed correctly # This prevents template repos from breaking the agent server with incompatible packages echo "Verifying agent-server dependencies..." diff --git a/docs/TRINITY_COMPATIBLE_AGENT_GUIDE.md b/docs/TRINITY_COMPATIBLE_AGENT_GUIDE.md index 84325114..6f523c1c 100644 --- a/docs/TRINITY_COMPATIBLE_AGENT_GUIDE.md +++ b/docs/TRINITY_COMPATIBLE_AGENT_GUIDE.md @@ -156,6 +156,7 @@ credentials.json .npm/ .ssh/ .trinity/ +.tmp/ # Large generated content - DO NOT COMMIT content/ diff --git a/docs/memory/architecture.md b/docs/memory/architecture.md index 3cd7d970..d53d1ff9 100644 --- a/docs/memory/architecture.md +++ b/docs/memory/architecture.md @@ -325,6 +325,16 @@ Services that run continuously in the backend process: Canonical home for each multi-component feature. Endpoint signatures live in [API Endpoints](#api-endpoints); table DDL in [Database Schema](#database-schema). +### Agent Runtimes — multi-runtime / "harness == runtime" (#1187) + +A Trinity **harness IS an `AgentRuntime`** — the pluggable execution engine inside the agent container. Three ship today: **Claude Code** (default), **Gemini CLI**, and **OpenAI Codex** (#1187). `AGENT_RUNTIME` (container env, set from `template.yaml runtime:` via `crud.py`; also a `trinity.agent-runtime` label) selects one; `runtime_adapter.get_runtime()` is the factory — it **validates** the value against `KNOWN_RUNTIMES` and raises on an unknown one rather than silently defaulting to Claude. + +**ABC** (`agent_server/services/runtime_adapter.py`): `execute` (chat), `execute_headless` (stateless task), `configure_mcp`, `is_available`, `get_default_model`, `get_context_window`, plus a non-abstract `capabilities()` classmethod returning a `RuntimeCapabilities` dataclass (`chat_continuity`, `session_tab_resume`, `mcp_support`, `cost_reporting: "native"|"estimated"`) — conservative by default so a new runtime that forgets to override is treated as least-capable. Each runtime is a singleton (`get__runtime()`). + +**Codex** (`codex_runtime.py`, built independently on the per-runtime primitives — NOT a shared helper, so it never inherits Gemini's blanket `kill_cgroup_orphans()`): `codex exec --json` → JSONL events (`thread.started`→session id, `turn.completed.usage`→tokens where `reasoning_output_tokens` is a SUBSET of `output_tokens`, `item.completed`→response/tool activity, `turn.failed`/`error`); `-o/--output-last-message` is the authoritative result (read-then-delete in `finally`); `codex exec resume ` for chat continuity; cost estimated via `CODEX_PRICING`. Concurrency-safe orphan cleanup via `_drain_bounded` (`kill_cgroup_orphans(extra_pids=…)` preserves sibling executions). Error→HTTP: auth→503, rate→429, runtime-unavailable→**500** (not 503 — avoids the AUTH collision), pipe-drop→**502** (SUB-003 guard). + +**Parity surface** (every runtime must wire these — see the [Harness Authoring Guide](harness-authoring-guide.md)): platform **system prompt** (Codex prepends it + mirrors `CLAUDE.md`→`AGENTS.md` at startup), **sandbox** (`_resolve_sandbox_mode`: normal mode → `--sandbox danger-full-access` — Codex's own bubblewrap sandbox can't create a user namespace inside the hardened agent container (`bwrap: No permissions to create a new namespace`), which blocks every shell tool, so it's dropped and the Trinity container is the sole boundary, same posture as Claude/Gemini; **read-only mode** → `--sandbox read-only`, read from `~/.trinity/read-only-config.json` since the Claude PreToolUse hook doesn't apply — a fail-closed read-only enforcement story for Codex is a fast-follow), **guardrails** (`_load_guardrails()`; unmappable Claude tool-names are surfaced in logs, not silently dropped), and **credential sanitization** (`utils/credential_sanitizer` over response + logs). Codex credentials: `OPENAI_API_KEY` from process env else parsed from `/home/developer/.env` (CRED-002; not exported into the agent-server process), injected into the subprocess; `CODEX_HOME` is relocated under `$TMPDIR` (gitignored) so codex state never dirties the repo. Codex agents skip Claude-subscription auto-assign in `crud.py`/`lifecycle.py` (`is_claude_runtime`). Backend reads nothing runtime-specific in MVP: it still infers AUTH from HTTP 503; `ExecutionMetadata.status`/`error_code` ship unused (fast-follow). The **Session tab** is gated off for runtimes lacking `session_tab_resume` (one backend constant `RUNTIMES_WITHOUT_SESSION_TAB_RESUME` in `sessions.py` runs a stateless turn; frontend hides the tab). MCP: `_configure_codex_mcp_servers`/`_inject_codex_mcp` write `$CODEX_HOME/config.toml` directly, the Trinity HTTP MCP referencing the token via `bearer_token_env_var` (never persisted as a literal). The platform prompt is **runtime-aware** (`platform_prompt_service.get_platform_system_prompt(runtime=…)`/`compose_system_prompt(runtime=…)`, threaded from `routers/chat.py` + `task_execution_service.py` via the `trinity.agent-runtime` label resolved best-effort by `docker_service.get_agent_runtime`): for Codex it strips the Claude-only `mcp__trinity__` tool-name prefix (which otherwise made Codex emit `unknown MCP server`) and references the auto-discovered `trinity` tools by bare name; Claude/Gemini/unknown keep the canonical naming. Frontend: `RuntimeBadge.vue` codex case, `AgentDetail.vue` default model + terminal map, `AgentTerminal.vue` `codex` mode. + ### Capacity & Backlog (#428) `CapacityManager` (CAPACITY-CONSOLIDATE) is the single public API for admit/release/status across `/chat` (`max_concurrent=max_parallel_tasks`, `queue_in_memory` policy) and `/task` (`queue_persistent` policy). It composes two private internals — `slot_service.py` (atomic N-ary counter, Redis ZSET `agent:slots:{name}`, dynamic per-agent TTL) and `backlog_service.py` (SQLite FIFO over `schedule_executions.status='queued'`, drain-on-release) — and owns the in-memory overflow store (Redis LIST, depth 3). diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md index 27c8b2d4..919e99b3 100644 --- a/docs/memory/feature-flows.md +++ b/docs/memory/feature-flows.md @@ -11,6 +11,7 @@ | Date | ID | Feature | Flow | |------|-----|---------|------| +| 2026-06-14 | #1187 | feat(agent-runtime): **OpenAI Codex** as the third agent runtime ("harness == runtime") alongside Claude Code + Gemini. New `codex_runtime.py` implements `AgentRuntime` on the existing per-runtime primitives (NOT a shared helper → never inherits Gemini's blanket `kill_cgroup_orphans()`): `codex exec --json` JSONL parse (`thread.started`/`turn.completed.usage`/`item.completed`; `reasoning_output_tokens` ⊂ `output_tokens`, no double-count), `-o/--output-last-message` authoritative result (read-then-delete in `finally`), `codex exec resume ` continuity, estimated cost (`CODEX_PRICING`), concurrency-safe `_drain_bounded` orphan cleanup, error→HTTP (auth **503** / rate **429** / runtime-unavailable **500**-not-503 / pipe-drop **502**). **Safety parity** (blocking): platform system-prompt prepended + `CLAUDE.md`→`AGENTS.md` mirror, **sandbox** normal→`--sandbox danger-full-access` (drops Codex's inner bwrap sandbox — it can't create a user namespace in the hardened container, which blocks every tool — leaving the Trinity container as the sole boundary, same as Claude) / read-only→`--sandbox read-only` (reads `~/.trinity/read-only-config.json`; fail-closed enforcement a fast-follow), guardrails honored/surfaced, credential sanitizer over output+logs. **Runtime-aware platform prompt** (E2E fix): Codex strips the Claude-only `mcp__trinity__` tool prefix (else the model emits `unknown MCP server`) via `platform_prompt_service.{get_platform_system_prompt,compose_system_prompt}(runtime=…)`, threaded from `chat.py` + `task_execution_service.py` through the `trinity.agent-runtime` label (`docker_service.get_agent_runtime`). New `RuntimeCapabilities` dataclass + per-runtime `capabilities()`; `get_runtime()` validates `AGENT_RUNTIME` and fails loud on unknown. Codex skips Claude-subscription assign (`is_claude_runtime`, crud+lifecycle); `OPENAI_API_KEY` read from `.env`; `CODEX_HOME` relocated under `$TMPDIR` + gitignored. MCP via `$CODEX_HOME/config.toml` (`bearer_token_env_var` — token never persisted). Session tab gated off via one backend constant (`RUNTIMES_WITHOUT_SESSION_TAB_RESUME`); frontend `RuntimeBadge`/default-model/terminal map + hidden Session tab; `test-codex` template; `@openai/codex@0.139.0` pinned. E2E-verified live (boot/chat/cost/resume/read-only/no-leak); resume arg-order bug fixed (exec flags must precede the `resume` sub-subcommand). | [codex-runtime.md](feature-flows/codex-runtime.md) | | 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) | | 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) | | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) | @@ -272,6 +273,7 @@ | Telegram Integration | [telegram-integration.md](feature-flows/telegram-integration.md) | Per-agent Telegram bots with webhook transport, group chat support, and `/login` email verification (TELEGRAM-001, TGRAM-GROUP) | | Unified Channel Access Control | [unified-channel-access-control.md](feature-flows/unified-channel-access-control.md) | Cross-channel access gate keyed on verified email — policy, /login, access requests (#311) | | VoIP Telephony | [voip-telephony.md](feature-flows/voip-telephony.md) | Outbound phone calls over the Gemini Live bridge via Twilio Media Streams; per-agent voice binding, ticket-authed WS, post-call transcript processing. Flag-gated default OFF (VOIP-001, #1056) | +| OpenAI Codex Runtime | [codex-runtime.md](feature-flows/codex-runtime.md) | Third agent runtime ("harness == runtime") — `codex exec` engine with full safety parity (system prompt, read-only sandbox, guardrails, sanitization), `RuntimeCapabilities`, Session-tab gate, MCP via config.toml. See also the [Harness Authoring Guide](harness-authoring-guide.md) (#1187) | | Nevermined x402 Payments | [nevermined-payments.md](feature-flows/nevermined-payments.md) | Per-agent paid API via x402 payment protocol (NVM-001) | ### Mobile & PWA diff --git a/docs/memory/feature-flows/codex-runtime.md b/docs/memory/feature-flows/codex-runtime.md new file mode 100644 index 00000000..9dea36ba --- /dev/null +++ b/docs/memory/feature-flows/codex-runtime.md @@ -0,0 +1,119 @@ +# Feature: OpenAI Codex Runtime (#1187) + +## Overview + +A Trinity **harness IS an `AgentRuntime`** — the pluggable execution engine inside +the agent container. **Codex** is the third runtime, alongside Claude Code +(default) and Gemini CLI. An agent runs on Codex when its template declares +`runtime: { type: codex, model: gpt-5.1-codex }`; the backend creates the +container with `AGENT_RUNTIME=codex`, and `codex_runtime.py` implements the ABC. + +The hard part is **not** wiring a new CLI — it's achieving full parity with +Trinity's Claude-specific safety layer (system prompt, read-only mode, +guardrails, credential sanitization), which a naive port silently bypasses. + +This is an MVP (follow-up to spike #854). For adding a *fourth* runtime, see the +[Harness Authoring Guide](../harness-authoring-guide.md). + +## Flow: UI → API → Runtime → Side Effects + +1. **Create.** Template `runtime.type: codex` → `crud.py` sets `AGENT_RUNTIME=codex` + env + `trinity.agent-runtime=codex` label. Codex agents **skip** Claude-subscription + auto-assign (`is_claude_runtime()` gate) — no `CLAUDE_CODE_OAUTH_TOKEN`, no + persisted `subscription_id`. `lifecycle.py` mirrors the skip on recreate. +2. **Startup** (`startup.sh`). Mirrors `CLAUDE.md` → `AGENTS.md` (Codex reads + `AGENTS.md`), creates `CODEX_HOME` under `$TMPDIR` (off the git-tracked repo), + gitignores `.tmp/`. MCP config (`trinity_mcp.py`) writes `$CODEX_HOME/config.toml`. +3. **Chat / Task.** `POST /api/chat` → `runtime.execute()`; `POST /api/task` → + `runtime.execute_headless()`. Both build `codex exec --json --skip-git-repo-check + -C /home/developer --sandbox -o /-last.txt + [-m model] [resume ] -- ` and stream the JSONL. + `` = `danger-full-access` normally (no inner bwrap sandbox — see Safety + parity), `read-only` when the agent is read-only. Exec-level flags **precede** + `resume` (the `resume` sub-subcommand rejects them — the turn-2+ continuity + fix); `--` ends options so a `-`-leading prompt can't be reparsed as a flag. +4. **Parse.** `thread.started`→session id; `turn.completed.usage`→tokens (cost + estimated; `reasoning_output_tokens` ⊂ `output_tokens`, never double-counted); + `item.completed`→agent message / tool activity; `turn.failed`/`error`→error. + The `-o` file is the **authoritative** response (read-then-delete in `finally`); + JSONL `agent_message` is the fallback. +5. **Return.** `(response_text, execution_log, ExecutionMetadata, …)` — same shape + as Claude/Gemini, so the backend treats Codex executions identically. + +## Safety parity (Phase C — blocking) + +| Control | Claude | Codex | +|---------|--------|-------| +| System prompt | `--append-system-prompt` | prepended to the prompt; `AGENTS.md` for identity; MCP-tool naming made runtime-aware (no `mcp__trinity__` prefix — see MCP) | +| Sandbox | none (container is the boundary) | normal → `--sandbox danger-full-access` (Codex's own bwrap sandbox can't create a user namespace in the hardened container — `bwrap: No permissions…` — which blocks every tool; drop it, the container stays the boundary, same as Claude) | +| Read-only | PreToolUse hook on `~/.trinity/read-only-config.json` | reads the same file → `--sandbox read-only` (Codex has no PreToolUse hook; a fail-closed enforcement story is a fast-follow) | +| Guardrails | `--disallowedTools` + turn caps | sandbox + network; unmappable tool-names **logged** (not dropped) | +| Credential redaction | sanitizer over response + logs | identical sanitizer calls | + +## Error → HTTP mapping + +auth (missing/invalid key, 401) → **503**; rate-limit → **429**; +runtime-unavailable → **500** (NOT 503 — 503 is the backend's AUTH signal, and the +dispatch breaker counts AUTH only); early pipe drop → **502** (SUB-003 guard). +Generic failures staying at 500 keep the AUTH path and SUB-003 auto-switch inert +for Codex; the #678 reader-race retry never matches a Codex 502 (no +`recovery_attempted` marker). + +## Capabilities & Session tab + +`CodexRuntime.capabilities()` → `chat_continuity=True` (`codex exec resume`), +`session_tab_resume=False`, `mcp_support=True`, `cost_reporting="estimated"`. +Because `session_tab_resume=False`, the backend gates the Session-tab cached-UUID +`--resume` turn off (one constant `RUNTIMES_WITHOUT_SESSION_TAB_RESUME` in +`sessions.py` → stateless turn) and the frontend hides the Session tab. The Chat +tab (with continuity) stays. + +## MCP + +`_inject_codex_mcp` / `_configure_codex_mcp_servers` write `$CODEX_HOME/config.toml` +directly (merging, idempotent — same approach the Gemini path uses for its +settings.json). The Trinity HTTP MCP references the token via `bearer_token_env_var` += `TRINITY_MCP_API_KEY` — **the literal secret is never persisted** to config.toml. + +Registering the server is only half of MCP working. `PLATFORM_INSTRUCTIONS` +documents the tools with Claude's `mcp__trinity__` prefix; Codex +auto-discovers MCP tools by bare name and answers `unknown MCP server` to a +prefixed call. So the platform prompt is **runtime-aware**: +`platform_prompt_service.get_platform_system_prompt(runtime=…)` / +`compose_system_prompt(runtime=…)` strip the prefix and add a Codex orientation +note for `runtime="codex"` (Claude/Gemini/unknown unchanged). The runtime is +threaded from `routers/chat.py` + `services/task_execution_service.py`, resolved +best-effort + lazily from the `trinity.agent-runtime` label +(`docker_service.get_agent_runtime`). + +## Key files + +| Layer | File | +|-------|------| +| Base image | `docker/base-image/Dockerfile` (`@openai/codex@0.139.0`), `startup.sh` (AGENTS.md, CODEX_HOME) | +| Runtime | `docker/base-image/agent_server/services/codex_runtime.py` | +| Contract | `runtime_adapter.py` (`RuntimeCapabilities`, factory + validation), `models.py` (`ExecutionMetadata.status/error_code`) | +| MCP | `agent_server/services/trinity_mcp.py`, `services/platform_prompt_service.py` (runtime-aware tool naming) | +| Backend | `services/agent_service/{crud,lifecycle,helpers,terminal}.py`, `routers/sessions.py`, `routers/chat.py` + `services/task_execution_service.py` (thread runtime), `services/docker_service.py` (`get_agent_runtime`) | +| Frontend | `components/RuntimeBadge.vue`, `components/AgentTerminal.vue`, `views/AgentDetail.vue` | +| Template | `config/agent-templates/test-codex/` | + +## Tests + +Unit (`tests/unit/test_codex_*`, `test_runtime_*`, `test_session_tab_gate_codex`, +`test_platform_prompt_runtime`): JSONL parser → metadata, cost (no reasoning +double-count + cached pricing + default), error→status (pipe-drop 502, generic +500-not-503), capabilities matrix, factory + unknown-runtime validation, +subscription skip, MCP config (+ no dup on restart + token-not-persisted), +**sandbox resolution** (normal → `danger-full-access`, read-only stays, no dead +`network_access` flag), **runtime-aware prompt** (codex omits `mcp__trinity__` + +gets the orientation note, Claude/Gemini unchanged), resume arg-order guard, +Session-tab gate + backend inertness. E2E in `/verify-local`: a real +`AGENT_RUNTIME=codex` agent with an injected `OPENAI_API_KEY`, one `/api/chat` + +one `/api/task` turn (tools create+read a file; MCP `list_agents`). + +## Out of scope (fast-follow) + +Shared subprocess-helper DRY extraction; Session-tab cached-UUID resume for Codex; +backend reading `ExecutionMetadata.error_code` directly; Codex SSE streaming; +vision/images; a post-creation runtime-switch endpoint. diff --git a/docs/memory/harness-authoring-guide.md b/docs/memory/harness-authoring-guide.md new file mode 100644 index 00000000..0aa8fd78 --- /dev/null +++ b/docs/memory/harness-authoring-guide.md @@ -0,0 +1,165 @@ +# Harness Authoring Guide — adding an agent runtime + +> A Trinity **harness IS an `AgentRuntime`**: the pluggable execution engine that +> runs inside the agent container. This guide is the checklist for adding a +> fourth runtime, using the OpenAI **Codex** runtime (#1187) as the worked +> example. Claude Code is the reference implementation; Gemini and Codex are the +> two ports. + +The work spans **four surfaces** (Invariant #13 — keep them in sync): the agent +server (the runtime + parity), the backend (credentials + Session-tab gate), the +MCP config, and the frontend. Do them in this order — the runtime + capability +contract first, then everything that depends on it. + +--- + +## 1. Install the CLI in the base image + +`docker/base-image/Dockerfile` — `RUN npm install -g @` +**before** the `USER developer` switch (global installs need root). **Pin** the +version (not `@latest`) so base-image rebuilds are reproducible. Verify +` --version` in the booted image during `/verify-local`. + +## 2. Implement the `AgentRuntime` ABC + +New `docker/base-image/agent_server/services/_runtime.py`. Implement all +abstract methods: `execute` (chat, with continuity), `execute_headless` +(stateless task), `configure_mcp`, `is_available`, `get_default_model`, +`get_context_window`. Build on the **existing per-runtime primitives** — do NOT +extract a shared helper as part of this PR, and do NOT copy Gemini's blanket +`kill_cgroup_orphans()`: + +| Concern | Reuse | +|---------|-------| +| Cancellation / process tracking | `process_registry` (`register`/`unregister`/`active_execution_pids`) | +| Live SSE logs | `registry.publish_log_entry(execution_id, entry)` | +| Activity timeline | `activity_tracking.start_tool_execution` / `complete_tool_execution(id, success, output)` | +| Orphan tag | `utils/subprocess_pgroup.EXECUTION_TAG_NAME` on the subprocess env | +| **Concurrency-safe** orphan cleanup | `subprocess_lifecycle._capture_pgid` + `_terminate_process_group(pgid=…)` + `_drain_bounded(...)` — the latter runs `kill_cgroup_orphans(extra_pids=active_execution_pids())`, preserving sibling executions | +| Credential redaction | `utils/credential_sanitizer.sanitize_text` / `sanitize_dict` / `sanitize_subprocess_line` | + +Spawn with `start_new_session=True` and `env={**os.environ, EXECUTION_TAG_NAME: execution_id}`. +Parse the CLI's machine output into `ExecutionMetadata` + `ExecutionLogEntry`. +Map errors to HTTP status: **auth → 503**, **rate-limit → 429**, +**runtime-unavailable → 500** (NOT 503 — 503 is the backend's AUTH signal and the +dispatch breaker counts AUTH only), **early pipe drop → 502** (the SUB-003 guard). +Add a singleton `get__runtime()`. + +## 3. Override `capabilities()` + +`RuntimeCapabilities` (in `runtime_adapter.py`) is how callers gate on a feature +instead of branching on the runtime name. Declare honest values: + +```python +@classmethod +def capabilities(cls) -> RuntimeCapabilities: + return RuntimeCapabilities( + chat_continuity=True, # the runtime can resume a conversation + session_tab_resume=False, # NOT the Session tab's cached-UUID --resume model + mcp_support=True, + cost_reporting="estimated", # "native" only if the CLI reports real cost + ) +``` + +The ABC default is conservative, so forgetting this leaves the runtime +least-capable (safe) rather than over-claiming. + +## 4. Register + validate in the factory + +`runtime_adapter.get_runtime()` — add the branch and the value to +`KNOWN_RUNTIMES`. The factory **validates** `AGENT_RUNTIME` and **raises** on an +unknown value (it must never silently fall back to Claude). Test: factory +selects your runtime; an unknown value raises. + +## 5. Wire the parity layer (BLOCKING — do not ship without it) + +A naive port silently bypasses Trinity's Claude-specific safety layer. Every +runtime MUST wire all four: + +1. **System prompt / identity** — the backend always sends an effective + `system_prompt` (`task_execution_service.py`). If your CLI has no + `--append-system-prompt` equivalent, **prepend** it to the prompt. Map the + agent's persistent identity file if the CLI reads a different one (Codex reads + `AGENTS.md`, so `startup.sh` mirrors `CLAUDE.md` → `AGENTS.md`). +2. **Sandbox / read-only mode** — the read-only signal is + `~/.trinity/read-only-config.json` (`enabled`), the same file Claude's + PreToolUse hook reads. Your runtime can't run Claude hooks, so read the file + and translate. **Caution:** if your CLI ships an *internal* sandbox that needs + user namespaces (Codex's bubblewrap), it will fail inside the hardened agent + container (`bwrap: No permissions to create a new namespace`) and block every + tool. The Trinity container is already the boundary (cap_drop ALL + AppArmor + + no-new-privileges), so disable the inner sandbox for normal mode (Codex → + `--sandbox danger-full-access`) and reserve the sandboxed setting for read-only + (Codex → `--sandbox read-only`). Read-only *enforcement* for a CLI without + hooks is an open question — surface it for a platform decision rather than + assuming. +3. **Guardrails** — `_runtime_config._load_guardrails()` yields `disallowed_tools` + + turn caps. Honor what maps to your CLI's controls; **surface (log) what + doesn't** — never silently drop a guardrail. +4. **Credential sanitization** — run the sanitizer over the response text AND the + raw log/stderr, exactly as the Claude/headless paths do. + +## 6. Credentials + subscription (backend) + +If the runtime authenticates with its own key (not a Claude subscription): +- The key arrives in the agent's `.env` (CRED-002), which is copied to disk but + NOT exported into the agent-server process — so the runtime must read it from + the process env OR parse `/home/developer/.env`, then inject it into the + subprocess env. +- Skip the Claude-subscription auto-assign in `crud.py` and the OAuth juggle in + `lifecycle.py` via `agent_service/helpers.is_claude_runtime(...)`, so the + runtime gets no `CLAUDE_CODE_OAUTH_TOKEN` and no persisted `subscription_id`. +- Relocate any CLI home/state dir off the git-tracked workspace (Codex: + `CODEX_HOME` under `$TMPDIR`, gitignored) and read-then-delete transient result + files in a `finally`. + +## 7. MCP (`trinity_mcp.py`) + +Add a branch in both `inject_trinity_mcp_if_configured` (the Trinity HTTP MCP + +bearer token) and `configure_mcp_servers` (template MCP servers). **Do not fall +through to the Claude path.** Write the runtime's native config; reference the +bearer token by env var, never persist the literal secret. (Codex writes +`$CODEX_HOME/config.toml` with `bearer_token_env_var`.) + +**Tool-name naming in the platform prompt** — registering the MCP server is only +half the job. `PLATFORM_INSTRUCTIONS` documents the tools with Claude's +`mcp__trinity__` prefix; a CLI that names MCP tools differently (Codex +auto-discovers them by bare name) will emit `unknown MCP server` if it copies the +prefixed call. Make the prompt runtime-aware in +`services/platform_prompt_service.py` (`get_platform_system_prompt(runtime=…)` / +`compose_system_prompt(runtime=…)`), thread the runtime from `routers/chat.py` + +`services/task_execution_service.py` via the `trinity.agent-runtime` label +(`docker_service.get_agent_runtime`, resolved lazily + guarded), and add your +runtime to the non-Claude branch. + +## 8. Session-tab gate (backend, if `session_tab_resume=False`) + +Add the runtime to the single constant `RUNTIMES_WITHOUT_SESSION_TAB_RESUME` in +`routers/sessions.py`. The turn endpoint then runs a plain stateless turn (drops +the cached UUID) for it. Verify the #678 reader-race retry stays inert for your +runtime's 502 shape and a generic failure never sets `error_code=AUTH`. + +## 9. Frontend + +- `components/RuntimeBadge.vue` — add the runtime's `isRuntime` computed, + label, tooltip, color, and icon. +- `views/AgentDetail.vue` — `defaultModel` branch; hide the Session tab if + `session_tab_resume` is false (`agent.runtime !== ''`). +- `components/AgentTerminal.vue` — the `cli` terminal-mode mapping; add the + matching backend mode in `services/agent_service/terminal.py`. + +## 10. Template, tests, docs + +- A `config/agent-templates//` template (`runtime: { type: , model }`) + declaring required credentials under `credentials.env_file` so the inject UI + prompts for them. Mirror `test-codex/` / `test-gemini/`. +- Unit tests: parser → metadata, cost (no double-count), error→status mapping, + capabilities, factory + unknown-runtime validation, MCP config (+ no dup on + restart), subscription skip, Session-tab gate + backend inertness. **Parity + tests:** read-only blocks writes, guardrails honored/surfaced, secret redacted, + system prompt reaches the runtime, 2-turn chat continuity. +- E2E (`/verify-local`): a real `AGENT_RUNTIME=` agent, one `/api/chat` + + one `/api/task` turn → non-empty response, cost/tokens populated, cold-start + auth works. +- Update `requirements.md`, `architecture.md` (the [Agent Runtimes](architecture.md#agent-runtimes--multi-runtime--harness--runtime-1187) + subsystem block), and a `feature-flows/` entry. diff --git a/docs/memory/requirements.md b/docs/memory/requirements.md index 78d44834..4d2c284e 100644 --- a/docs/memory/requirements.md +++ b/docs/memory/requirements.md @@ -2876,6 +2876,59 @@ Standalone mobile-friendly admin page for managing agents on the go. Designed as --- +## 40. Multi-Runtime Harnesses — OpenAI Codex (#1187) + +### 40.1 Codex CLI Execution Engine (#1187 — MVP) + +Trinity agents may run on the **OpenAI Codex CLI** as a third execution runtime +("harness == runtime") alongside Claude Code and Gemini. A template selects it +via `runtime: { type: codex, model: gpt-5.1-codex }`; the container is created +with `AGENT_RUNTIME=codex` and `codex_runtime.py` implements the `AgentRuntime` +ABC. Follow-up to spike #854. + +**Functional requirements:** +- **FR-1 — Execution:** `/api/chat` and `/api/task` run via `codex exec --json`; + the `-o/--output-last-message` file is the authoritative response (read-then-delete); + JSONL `agent_message` is the fallback. Tokens/cost from `turn.completed.usage` + (estimated cost — Codex has no native cost; `reasoning_output_tokens` is a subset + of `output_tokens`, never double-counted). +- **FR-2 — Chat continuity:** `codex exec resume ` continues the Chat-tab + conversation. The Session tab's cached-UUID `--resume` model is NOT supported in + the MVP (gated off for codex; chat continuity lives in the Chat tab). +- **FR-3 — Safety parity (blocking):** the platform system prompt reaches Codex + (prepended per-turn; `CLAUDE.md`→`AGENTS.md` mirrored at startup for identity); + read-only mode maps to `--sandbox read-only`; guardrails are honored where they + map to Codex's control surface and surfaced (logged) where they don't; Codex + output + logs pass through the credential sanitizer. +- **FR-4 — Sandbox + network:** normal (writable) agents run `--sandbox danger-full-access`, + which DISABLES Codex's own bubblewrap sandbox — `workspace-write`/`read-only` both invoke + `bwrap` to create a user namespace, which the hardened Trinity container forbids + (`bwrap: No permissions to create a new namespace`), blocking every shell tool. The Trinity + container is already the boundary (`cap_drop ALL` + AppArmor + `no-new-privileges`), the same + posture Claude/Gemini run under, so dropping the redundant inner sandbox weakens nothing. + Read-only agents keep `--sandbox read-only` (sandbox-native write protection) as the interim + enforcement — a fail-closed read-only enforcement story for Codex is a fast-follow. +- **FR-5 — Credentials:** `OPENAI_API_KEY` from the agent's `.env` (CRED-002), + loaded into the subprocess env; Codex agents are NOT assigned a Claude subscription. +- **FR-6 — MCP:** Trinity HTTP MCP + template MCP servers wired via `$CODEX_HOME/config.toml`; + the bearer token is referenced by env var, never persisted as a literal. +- **FR-7 — Capabilities:** each runtime declares `RuntimeCapabilities` + (`chat_continuity`, `session_tab_resume`, `mcp_support`, `cost_reporting`); + `get_runtime()` validates `AGENT_RUNTIME` and fails loudly on unknown values. + +**Non-functional:** concurrency-safe orphan cleanup (must not kill sibling +executions); `CODEX_HOME` relocated off the git-tracked workspace; error→HTTP +mapping keeps non-auth failures at 500 (never 503) so the dispatch breaker's +AUTH-only counting and the SUB-003 auth switch stay inert for Codex. + +**Out of scope (fast-follow):** shared subprocess-helper DRY extraction; Session-tab +cached-UUID resume for Codex; backend reading `ExecutionMetadata.error_code` +directly; Codex SSE streaming; vision/images; a post-creation runtime-switch +endpoint. See the [Harness Authoring Guide](harness-authoring-guide.md) for adding +a fourth runtime. + +--- + ## Out of Scope - Multi-tenant deployment (single org only) diff --git a/docs/security-reports/cso-diff-2026-06-18-1187.md b/docs/security-reports/cso-diff-2026-06-18-1187.md new file mode 100644 index 00000000..7840969d --- /dev/null +++ b/docs/security-reports/cso-diff-2026-06-18-1187.md @@ -0,0 +1,101 @@ +# CSO Security Audit — Codex Runtime MVP (#1187 / PR #1203) + +**Mode**: diff +**Scope**: `origin/dev...HEAD` — branch `AndriiPasternak31/issue-1187-plan` (42 files, +4423 / −41) +**Date**: 2026-06-18 + +OpenAI Codex added as Trinity's third agent runtime ("harness == runtime"). New +attack surface audited: subprocess execution of the Codex CLI inside the agent +container (`codex_runtime.py`, 1059 lines), a new credential type (OpenAI key), +generation of `$CODEX_HOME/config.toml`, backend runtime resolution / credential +isolation / prompt adaptation / session-tab gating, and a sandbox-posture +decision. + +--- + +## Summary + +| Category | CRITICAL | HIGH | MEDIUM | LOW | +|----------|----------|------|--------|-----| +| Secrets | 0 | 0 | 0 | 0 | +| Dependencies | 0 | 0 | 0 | 0 (1 INFO) | +| Auth Boundaries | 0 | 0 | 0 | 0 | +| Injection | 0 | 0 | 0 | 0 | +| Platform Patterns | 0 | 0 | 0 | 0 | +| Configuration | 0 | 0 | 1 | 1 | + +## Findings + +### CRITICAL / HIGH +None. + +### MEDIUM — Codex read-only enforcement is unverified +`docker/base-image/agent_server/services/codex_runtime.py:217-267` + +Read-only agents map to `--sandbox read-only`; `_is_read_only()` fails **open** +on a corrupt/unreadable config (matches the existing Claude `read-only-guard.py` +reference, by deliberate design — diverging one runtime to fail-closed would make +read-only inconsistent across runtimes). The PR itself flags Codex read-only +enforcement as an "open discussion." Because `read-only` invokes `bwrap` (which +the hardened Trinity container forbids — `bwrap: No permissions to create a new +namespace`), the actual behavior is undetermined: fail-closed-by-breaking-all- +tools vs. silently unenforced. This is an opt-in feature, and the worst case is +not a write leak past the container boundary (the container's `cap_drop ALL` + +`no-new-privileges` + AppArmor still hold) — but it should be verified before any +read-only Codex agent is relied on. **Track as a fast-follow.** + +### LOW — Sandbox documentation understates actual access (doc drift) +`config/agent-templates/test-codex/CLAUDE.md` (+ feature docs) + +Implementation uses `--sandbox danger-full-access` for normal agents (inner +sandbox disabled; the Trinity container is the only boundary). But the template +and docs describe `--sandbox workspace-write` with "writes outside the workspace +are blocked." The docs **understate** the real access level. The posture itself +is sound (identical to Claude/Gemini — no inner sandbox), but operator-facing +text should match the code. (This is the known `/validate-pr 1203` wording nit.) + +### INFORMATIONAL +- New pinned npm global `@openai/codex@0.139.0` in the base image — a new + third-party CLI that runs inside agent containers with full access. Pinning + (not `@latest`) is correct for reproducibility; flag for periodic CVE review. +- Normal-mode `danger-full-access` is a deliberate, documented posture matching + Claude/Gemini (container = the boundary). Not a regression. + +## What was verified clean + +- **Secrets**: only hit is `trinity_mcp_SUPERSECRETVALUE` — a fake test fixture + (`tests/unit/test_codex_mcp_config.py`). No real keys, no `.env` in the diff. +- **Subprocess injection**: `subprocess.Popen` is list-form (no `shell=True`); + the `--` end-of-options separator stops a `-`-leading prompt being reparsed as + a flag (e.g. weakening `--sandbox`); `stdin=DEVNULL`; `start_new_session=True` + isolates the process group so cleanup never SIGKILLs sibling executions. +- **Path traversal**: the `-o` result filename is reduced by `_safe_result_token` + (charset + basename), and the open/unlink sink is guarded by `_ensure_within` + containment against `$CODEX_HOME`. +- **TOML injection**: the `config.toml` writer escapes keys/values (`_toml_escape` + / `_toml_key`, `_toml_scalar` raises on array-of-tables/nested dicts), so a + hostile MCP server name or env value cannot inject TOML tables; a malformed + pre-existing file is backed up, not silently reset. +- **Credential handling**: the OpenAI key is read from env or `.env`, injected + only into the Codex subprocess, never logged; the sanitizer already covers it + (`OPENAI_.*`, `sk-…`, `sk-proj-…`, `Bearer …`, `TRINITY_MCP.*`). The Trinity MCP + bearer token is referenced via `bearer_token_env_var` — **never** written as a + literal to `config.toml` (stronger than the Claude/Gemini literal-token files). +- **Cross-runtime credential isolation**: non-Claude runtimes never receive an + injected Claude OAuth subscription token — enforced at both create + (`crud.py`, `is_claude_runtime` gate) and recreate + (`lifecycle.py`, `env_vars.pop('CLAUDE_CODE_OAUTH_TOKEN')`). +- **CODEX_HOME** relocated to gitignored `$TMPDIR` (`.tmp/` added to + `_GITIGNORE_PATTERNS` in `git_service.py`) so generated config + result files + can't be auto-synced into the agent's git repo. +- **Auth boundaries**: no new routers/endpoints; session/chat changes preserve + `AuthorizedAgent` + per-user ownership; `validate_runtime` adds a clean 400 for + a bad `runtime:`; `get_runtime()` now fails loud on an unknown runtime instead + of silently defaulting to Claude. +- **No new outbound HTTP/socket sinks** in the agent-server diff. + +## Recommendation + +**CLEAR** — no CRITICAL or HIGH findings; merge is not blocked on security. The +diff is consistently security-conscious. Two non-blocking follow-ups: (1) MEDIUM — +verify Codex read-only behavior; (2) LOW — fix the sandbox doc wording. diff --git a/src/backend/routers/chat.py b/src/backend/routers/chat.py index e3dfc7a1..7fe44204 100644 --- a/src/backend/routers/chat.py +++ b/src/backend/routers/chat.py @@ -610,6 +610,15 @@ async def _run_chat_and_finalize( if request.model: payload["model"] = request.model # Inject platform instructions + execution context (#171) into every chat request. + # Resolve the agent runtime (best-effort, never raises) so the MCP-tool + # naming in the platform prompt matches the harness (#1187 F-MCP). Lazy + + # guarded import so a re-import under a stubbed services.docker_service + # (unit tests) can't break dispatch; Claude default on any failure. + try: + from services.docker_service import get_agent_runtime + agent_runtime = get_agent_runtime(name) + except Exception: + agent_runtime = "claude-code" try: exec_ctx = ExecutionContext( agent_name=name, @@ -623,10 +632,11 @@ async def _run_chat_and_finalize( payload["system_prompt"] = compose_system_prompt( execution_context=exec_ctx, include_execution_context=is_execution_context_enabled(), + runtime=agent_runtime, ) except Exception as e: logger.warning(f"[Chat] execution context build failed, falling back: {e}") - payload["system_prompt"] = get_platform_system_prompt() + payload["system_prompt"] = get_platform_system_prompt(runtime=agent_runtime) # Pass execution ID so agent registers process under the same ID (enables termination) if task_execution_id: payload["execution_id"] = task_execution_id diff --git a/src/backend/routers/sessions.py b/src/backend/routers/sessions.py index 990cacfd..cb18eccb 100644 --- a/src/backend/routers/sessions.py +++ b/src/backend/routers/sessions.py @@ -26,7 +26,7 @@ from db_models import WebFileUpload, SessionMessageInsert from dependencies import AuthorizedAgent, get_current_user from models import User -from services.docker_service import get_agent_container +from services.docker_service import get_agent_container, get_agent_status_from_container from services.session_cleanup_service import get_session_cleanup_service from services.settings_service import is_session_tab_enabled from services.task_execution_service import get_task_execution_service @@ -81,6 +81,33 @@ def _enabled_or_404() -> None: raise HTTPException(status_code=404, detail="Not Found") +# #1187 Phase H: runtimes whose Session-tab turns must NOT use the cached-UUID +# `--resume` machinery. They support plain chat continuity (in the Chat tab) but +# not the Session tab's Claude-JSONL resume/fallback/reaping model, so we run a +# stateless turn for them instead. ONE backend constant (decision 1) — keep in +# sync with the agent-side `RuntimeCapabilities.session_tab_resume`. +RUNTIMES_WITHOUT_SESSION_TAB_RESUME = {"codex"} + + +def _supports_session_tab_resume(agent_name: str) -> bool: + """False for runtimes (e.g. Codex) that lack the cached-UUID `--resume` + machinery. Defaults to True on any lookup failure — assume Claude-like + so a transient Docker hiccup never silently downgrades a real session.""" + try: + container = get_agent_container(agent_name) + if container is None: + return True + status = get_agent_status_from_container(container) + runtime = (status.runtime or "claude-code").lower() + return runtime not in RUNTIMES_WITHOUT_SESSION_TAB_RESUME + except Exception: + logger.warning( + "[Session] runtime lookup failed for %s; assuming resume-capable", + agent_name, exc_info=True, + ) + return True + + def _session_or_404(session_id: str, user: User, agent_name: str): """Resolve a session row and enforce per-user ownership. @@ -611,6 +638,17 @@ async def send_session_message( async with _InflightSentinel(session.id, lock_ttl): cached_uuid = db.get_cached_claude_session_id(session.id) + # #1187 Phase H: for runtimes lacking the cached-UUID --resume model + # (Codex), run a plain stateless turn — the Claude-specific resume + + # JSONL-not-found fallback below would mis-handle their failure markers. + # Only pay the runtime lookup when there's actually a cached UUID to drop. + if cached_uuid and not _supports_session_tab_resume(name): + logger.info( + "[Session] agent=%s runtime lacks session-tab resume — " + "running a stateless turn (no --resume)", name, + ) + cached_uuid = None + service = get_task_execution_service() fallback_fired = False fallback_reason: Optional[str] = None diff --git a/src/backend/services/agent_service/crud.py b/src/backend/services/agent_service/crud.py index 8d9ca8b8..fb426086 100644 --- a/src/backend/services/agent_service/crud.py +++ b/src/backend/services/agent_service/crud.py @@ -33,7 +33,7 @@ from services.settings_service import get_anthropic_api_key, get_github_pat, get_agent_full_capabilities, get_agent_quota_for_role, get_agent_default_resources, get_agent_default_require_email from services.github_service import GitHubService, GitHubError from utils.helpers import sanitize_agent_name, utc_now_iso -from .helpers import validate_base_image +from .helpers import validate_base_image, is_claude_runtime, validate_runtime from .lifecycle import RESTRICTED_CAPABILITIES, FULL_CAPABILITIES from .capabilities import AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR @@ -376,6 +376,11 @@ async def create_agent_internal( except Exception as e: logger.warning(f"Error loading template config: {e}") + # #1187: runtime is final here (request value, possibly overridden by the + # template). Reject an unknown one now (clear 400) instead of letting the + # agent container crash-loop on boot when get_runtime() can't resolve it. + validate_runtime(config.runtime) + if config.port is None: config.port = get_next_available_port() @@ -493,21 +498,33 @@ async def create_agent_internal( import json as _json env_vars['AGENT_GUARDRAILS'] = _json.dumps(_guardrails) - # Auto-assign subscription (round-robin) — #74 + # Auto-assign subscription (round-robin) — #74. + # Subscriptions are Claude-OAuth tokens (CLAUDE_CODE_OAUTH_TOKEN) and apply + # ONLY to the Claude Code runtime. Non-Claude runtimes (Gemini, Codex) bring + # their own credentials via .env (CRED-002), so skip the assign entirely — + # otherwise a Codex agent would get a persisted subscription_id + # (has_subscription=True) and a spurious Claude token injected on every + # create/recreate (#1187 decision 7). auto_assigned_subscription_id = None - try: - least_used = db.get_least_used_subscription() - if least_used: - token = db.get_subscription_token(least_used.id) - if token: - env_vars['CLAUDE_CODE_OAUTH_TOKEN'] = token - env_vars.pop('ANTHROPIC_API_KEY', None) - auto_assigned_subscription_id = least_used.id - logger.info(f"Auto-assigned subscription '{least_used.name}' to agent {config.name}") - else: - logger.warning(f"Failed to decrypt subscription '{least_used.name}' token, using platform API key") - except Exception as e: - logger.warning(f"Subscription auto-assign failed for {config.name}: {e}") + if is_claude_runtime(config.runtime): + try: + least_used = db.get_least_used_subscription() + if least_used: + token = db.get_subscription_token(least_used.id) + if token: + env_vars['CLAUDE_CODE_OAUTH_TOKEN'] = token + env_vars.pop('ANTHROPIC_API_KEY', None) + auto_assigned_subscription_id = least_used.id + logger.info(f"Auto-assigned subscription '{least_used.name}' to agent {config.name}") + else: + logger.warning(f"Failed to decrypt subscription '{least_used.name}' token, using platform API key") + except Exception as e: + logger.warning(f"Subscription auto-assign failed for {config.name}: {e}") + else: + logger.info( + f"Skipping subscription auto-assign for agent {config.name} " + f"(runtime={(config.runtime or 'claude-code')!r} is non-Claude — uses its own .env credentials)" + ) # Add Google API key if using Gemini runtime # Gemini CLI expects GEMINI_API_KEY environment variable diff --git a/src/backend/services/agent_service/helpers.py b/src/backend/services/agent_service/helpers.py index 7c6ab49c..7cd38372 100644 --- a/src/backend/services/agent_service/helpers.py +++ b/src/backend/services/agent_service/helpers.py @@ -31,6 +31,49 @@ "trinity-agent-base:*", ] +# #1187: runtime classification. Subscriptions are Claude-OAuth tokens, so the +# auto-assign (crud) and the CLAUDE_CODE_OAUTH_TOKEN / ANTHROPIC_API_KEY juggle +# (lifecycle recreate) apply ONLY to the Claude Code runtime. Gemini and Codex +# bring their own credentials via .env (CRED-002). +CLAUDE_RUNTIME_NAMES = frozenset({"claude-code", "claude"}) + + +def is_claude_runtime(runtime: Optional[str]) -> bool: + """True for the Claude Code runtime, INCLUDING the default (unset/empty → + claude-code). Non-Claude runtimes (gemini-cli, gemini, codex) return False. + """ + return (runtime or "claude-code").lower() in CLAUDE_RUNTIME_NAMES + + +# #1187: every runtime the platform can launch. Mirrors the agent-side +# `runtime_adapter.KNOWN_RUNTIMES` (which lives in the base image and isn't +# importable from the backend). Keep the two in sync when adding a runtime. +KNOWN_RUNTIME_NAMES = CLAUDE_RUNTIME_NAMES | frozenset({"gemini-cli", "gemini", "codex"}) + + +def validate_runtime(runtime: Optional[str]) -> None: + """Reject an unknown ``runtime`` at create time. + + The agent-side ``get_runtime()`` already fails loud on a bad + ``AGENT_RUNTIME``, but only when the container boots — by then the agent + exists and the failure is a cryptic crash-loop. Catching a typo'd + ``runtime:`` here turns it into a clear 400 at creation. Unset/empty is + valid (defaults to claude-code). + + Raises: + HTTPException(400): if ``runtime`` is set to an unrecognized value. + """ + if not runtime: + return + if runtime.lower() not in KNOWN_RUNTIME_NAMES: + raise HTTPException( + status_code=400, + detail=( + f"Unknown runtime {runtime!r}. " + f"Supported runtimes: {sorted(KNOWN_RUNTIME_NAMES)}." + ), + ) + def validate_base_image(image: str) -> None: """ diff --git a/src/backend/services/agent_service/lifecycle.py b/src/backend/services/agent_service/lifecycle.py index 859f9d07..a398e03a 100644 --- a/src/backend/services/agent_service/lifecycle.py +++ b/src/backend/services/agent_service/lifecycle.py @@ -25,7 +25,7 @@ from services.agent_service.helpers import validate_base_image from services.settings_service import get_anthropic_api_key, get_github_pat, get_agent_full_capabilities, get_agent_default_resources from services.skill_service import skill_service -from .helpers import check_shared_folder_mounts_match, check_api_key_env_matches, check_github_pat_env_matches, check_resource_limits_match, check_full_capabilities_match, check_guardrails_env_matches +from .helpers import check_shared_folder_mounts_match, check_api_key_env_matches, check_github_pat_env_matches, check_resource_limits_match, check_full_capabilities_match, check_guardrails_env_matches, is_claude_runtime from .file_sharing import check_public_folder_mount_matches from .read_only import inject_read_only_hooks, remove_read_only_hooks @@ -362,11 +362,26 @@ async def recreate_container_with_updated_config(agent_name: str, old_container, # Claude Code prioritizes ANTHROPIC_API_KEY over CLAUDE_CODE_OAUTH_TOKEN, # so when a subscription is assigned we must remove the API key and set # the token env var instead. + # + # This whole juggle is Claude-only: subscriptions are Claude-OAuth tokens. + # Non-Claude runtimes (Gemini, Codex) authenticate from their own .env + # (CRED-002) and must NEVER receive a Claude subscription token on recreate, + # even if a subscription row somehow exists for them (#1187 decision 7). + _runtime = ( + env_vars.get('AGENT_RUNTIME') + or labels.get('trinity.agent-runtime') + or 'claude-code' + ) + _is_claude_runtime = is_claude_runtime(_runtime) subscription_id = db.get_agent_subscription_id(agent_name) has_subscription = subscription_id is not None use_platform_key = db.get_use_platform_api_key(agent_name) - if has_subscription: + if not _is_claude_runtime: + # Non-Claude: leave the agent's own credentials in place; never inject a + # Claude token. + env_vars.pop('CLAUDE_CODE_OAUTH_TOKEN', None) + elif has_subscription: # Subscription assigned — inject token, remove API key token = db.get_subscription_token(subscription_id) if token: diff --git a/src/backend/services/agent_service/terminal.py b/src/backend/services/agent_service/terminal.py index e7b17f76..9b4d3695 100644 --- a/src/backend/services/agent_service/terminal.py +++ b/src/backend/services/agent_service/terminal.py @@ -185,7 +185,8 @@ async def handle_terminal_session( session_start = datetime.utcnow() # Step 5: Create exec with TTY - # Support multiple terminal modes: claude (Claude Code), gemini (Gemini CLI), bash + # Support multiple terminal modes: claude (Claude Code), gemini + # (Gemini CLI), codex (OpenAI Codex CLI, #1187), bash if mode == "claude": cmd = ["claude", "--dangerously-skip-permissions"] if model: @@ -194,6 +195,10 @@ async def handle_terminal_session( cmd = ["gemini"] if model: cmd.extend(["--model", model]) + elif mode == "codex": + cmd = ["codex"] + if model: + cmd.extend(["-m", model]) else: cmd = ["/bin/bash"] diff --git a/src/backend/services/docker_service.py b/src/backend/services/docker_service.py index dd9e9b9a..393db235 100644 --- a/src/backend/services/docker_service.py +++ b/src/backend/services/docker_service.py @@ -28,6 +28,24 @@ def get_agent_container(name: str): return None +def get_agent_runtime(name: str) -> str: + """Best-effort resolve an agent's execution runtime from its Docker label. + + Reads the ``trinity.agent-runtime`` label (written at create time by + ``crud.py``). Used to make the platform system prompt runtime-aware (#1187 + F-MCP). Never raises and never blocks dispatch — any failure (no Docker + client, container gone, missing label) falls back to ``"claude-code"``, which + preserves the historical Claude/Gemini prompt naming. + """ + container = get_agent_container(name) + if container is None: + return "claude-code" + try: + return container.labels.get("trinity.agent-runtime", "claude-code") or "claude-code" + except Exception: + return "claude-code" + + def get_agent_status_from_container(container) -> AgentStatus: """Convert a Docker container to AgentStatus using container labels.""" labels = container.labels @@ -151,7 +169,11 @@ def list_all_agents_fast() -> List[AgentStatus]: }, container_id=container.id, template=labels.get("trinity.template", None) or None, - runtime=labels.get("trinity.runtime", "claude-code"), # From label instead of env vars + # Label written at create time is `trinity.agent-runtime` (crud.py); + # `trinity.runtime` is never written, so reading it always reported + # claude-code and broke the RuntimeBadge for Codex/Gemini agents + # in every fast-path view (#1187 review I6). + runtime=labels.get("trinity.agent-runtime", "claude-code"), base_image_version=labels.get("trinity.base-image-version"), # Label only, no image lookup ) agents.append(agent) diff --git a/src/backend/services/git_service.py b/src/backend/services/git_service.py index d72de311..9ea028de 100644 --- a/src/backend/services/git_service.py +++ b/src/backend/services/git_service.py @@ -679,6 +679,7 @@ def delete_agent_git_config(agent_name: str) -> bool: ".npm/", ".ssh/", ".trinity/", + ".tmp/", # #1098 disk-backed scratch (TMPDIR); #1187 relocated CODEX_HOME # Large generated content "content/", # Claude Code runtime — commit commands/skills/agents, exclude runtime data diff --git a/src/backend/services/platform_prompt_service.py b/src/backend/services/platform_prompt_service.py index c87afdf7..53d98c1a 100644 --- a/src/backend/services/platform_prompt_service.py +++ b/src/backend/services/platform_prompt_service.py @@ -141,6 +141,42 @@ - Only available during user-facing sessions (public link, Slack, Telegram, WhatsApp). The tool returns an error if called from a scheduled task or agent-to-agent call.""" +# --------------------------------------------------------------------------- +# Runtime-aware MCP tool naming (#1187 F-MCP) +# --------------------------------------------------------------------------- + +# Claude Code exposes Trinity MCP tools as ``mcp__trinity__``; the +# PLATFORM_INSTRUCTIONS above document them that way. Codex auto-discovers MCP +# tools from the configured ``trinity`` server and invokes them by their bare +# names, so the Claude-only prefix must be stripped for Codex agents — otherwise +# the model emits ``mcp__trinity`` and Codex answers "unknown MCP server". +# Mirrors runtime_adapter._CODEX_RUNTIMES (the only non-Claude-named surface in +# the MVP); Gemini and unknown runtimes keep the canonical Claude naming. +_CODEX_RUNTIMES = frozenset({"codex"}) + +# Prepended to the Codex variant. Intentionally avoids the literal +# ``mcp__trinity__`` token so the stripped prompt contains it nowhere. +_CODEX_MCP_ORIENTATION = ( + "## MCP Tools (Codex runtime)\n\n" + "A Trinity MCP server named `trinity` is configured for you. Call its tools " + "by the bare names documented below — `list_agents`, `chat_with_agent`, " + "`share_file`, `write_user_memory` — exactly as your client auto-discovers " + "them. Do not add any vendor-specific tool-name prefix.\n\n---\n\n" +) + + +def _adapt_instructions_for_runtime(instructions: str, runtime: str) -> str: + """Rewrite the MCP-tool references in ``instructions`` for ``runtime``. + + Codex → strip the Claude-only ``mcp__trinity__`` prefix and prepend a short + orientation note. Claude/Gemini/unknown → return the text unchanged (the + plan's ``default claude-code`` behavior). Pure — never mutates the input. + """ + if (runtime or "").lower() in _CODEX_RUNTIMES: + return _CODEX_MCP_ORIENTATION + instructions.replace("mcp__trinity__", "") + return instructions + + def format_user_memory_block(memory_record: dict) -> Optional[str]: """Format a user-memory record into a system-prompt block for injection. @@ -274,17 +310,23 @@ async def summarize_user_memory_background( ) -def get_platform_system_prompt() -> str: +def get_platform_system_prompt(runtime: str = "claude-code") -> str: """ Build the full platform system prompt. Combines static platform instructions with the operator's custom prompt from the trinity_prompt database setting. + Args: + runtime: the agent's execution runtime (``trinity.agent-runtime`` label). + Codex gets MCP-tool references without the Claude-only + ``mcp__trinity__`` prefix; Claude/Gemini/unknown keep the canonical + naming (#1187 F-MCP). + Returns: Combined system prompt string """ - parts = [PLATFORM_INSTRUCTIONS] + parts = [_adapt_instructions_for_runtime(PLATFORM_INSTRUCTIONS, runtime)] # Append custom prompt from database setting (operator-configurable) custom_prompt = db.get_setting_value("trinity_prompt", default=None) @@ -519,14 +561,18 @@ def compose_system_prompt( caller_prompt: Optional[str] = None, *, include_execution_context: bool = True, + runtime: str = "claude-code", ) -> str: """Compose the full system prompt: platform instructions + execution context + caller prompt. Single composition entry point. Keeps ordering and defaults in one place (invariant #15). Callers should use this instead of concatenating prompt fragments themselves. + + ``runtime`` is threaded to :func:`get_platform_system_prompt` so the MCP-tool + naming matches the agent's harness (Codex vs. Claude/Gemini, #1187 F-MCP). """ - parts: List[str] = [get_platform_system_prompt()] + parts: List[str] = [get_platform_system_prompt(runtime=runtime)] if include_execution_context and execution_context is not None: # Auto-fill collaborators and platform URL without mutating the caller's diff --git a/src/backend/services/task_execution_service.py b/src/backend/services/task_execution_service.py index d32f04dd..5954c9b9 100644 --- a/src/backend/services/task_execution_service.py +++ b/src/backend/services/task_execution_service.py @@ -47,6 +47,23 @@ is_execution_context_enabled, ) + +def _resolve_agent_runtime(agent_name: str) -> str: + """Best-effort resolve an agent's runtime for the platform prompt (#1187). + + Lazy + guarded import: a top-level ``from services.docker_service import + get_agent_runtime`` would make a *re-import* of this module fail when a unit + test has stubbed ``services.docker_service`` with a partial stub that lacks + the symbol (the conftest pops + re-imports this module between tests). Resolve + to the Claude default on any failure — never block dispatch. + """ + try: + from services.docker_service import get_agent_runtime + + return get_agent_runtime(agent_name) + except Exception: + return "claude-code" + logger = logging.getLogger(__name__) @@ -737,6 +754,9 @@ async def execute_task( # ---- 4. Call agent with retry -------------------------------- # Compose platform prompt + execution context (#171) + caller system_prompt. # Never let context-building fail the request. + # Resolve the agent runtime (best-effort, never raises) so the + # MCP-tool naming matches the harness (#1187 F-MCP). + agent_runtime = _resolve_agent_runtime(agent_name) try: exec_ctx = ExecutionContext( agent_name=agent_name, @@ -757,12 +777,13 @@ async def execute_task( execution_context=exec_ctx, caller_prompt=system_prompt, include_execution_context=is_execution_context_enabled(), + runtime=agent_runtime, ) except Exception as e: logger.warning( f"[TaskExecService] execution context build failed, falling back: {e}" ) - platform_prompt = get_platform_system_prompt() + platform_prompt = get_platform_system_prompt(runtime=agent_runtime) effective_system_prompt = ( platform_prompt + "\n\n" + system_prompt if system_prompt else platform_prompt ) diff --git a/src/frontend/src/components/AgentTerminal.vue b/src/frontend/src/components/AgentTerminal.vue index 1f0420c8..d1f9079e 100644 --- a/src/frontend/src/components/AgentTerminal.vue +++ b/src/frontend/src/components/AgentTerminal.vue @@ -136,7 +136,12 @@ const errorMessage = ref('') // Computed - Runtime-aware labels const isGemini = computed(() => props.runtime === 'gemini-cli' || props.runtime === 'gemini') -const cliModeLabel = computed(() => isGemini.value ? 'Gemini CLI' : 'Claude Code') +const isCodex = computed(() => props.runtime === 'codex') +const cliModeLabel = computed(() => { + if (isCodex.value) return 'Codex CLI' + if (isGemini.value) return 'Gemini CLI' + return 'Claude Code' +}) // Terminal and WebSocket instances let terminal = null @@ -325,8 +330,9 @@ function connect() { // Build WebSocket URL - use agent-specific terminal endpoint const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:' - // Map 'cli' mode to the appropriate runtime command (claude or gemini) - const terminalMode = selectedMode.value === 'cli' ? (isGemini.value ? 'gemini' : 'claude') : 'bash' + // Map 'cli' mode to the appropriate runtime command (claude, gemini, or codex) + const cliRuntimeMode = isCodex.value ? 'codex' : (isGemini.value ? 'gemini' : 'claude') + const terminalMode = selectedMode.value === 'cli' ? cliRuntimeMode : 'bash' // Include model parameter if specified (for gemini --model or claude --model flags) const modelParam = props.model ? `&model=${encodeURIComponent(props.model)}` : '' const wsUrl = `${protocol}//${location.host}/api/agents/${encodeURIComponent(props.agentName)}/terminal?mode=${terminalMode}${modelParam}` diff --git a/src/frontend/src/components/RuntimeBadge.vue b/src/frontend/src/components/RuntimeBadge.vue index 159390b6..24d076fd 100644 --- a/src/frontend/src/components/RuntimeBadge.vue +++ b/src/frontend/src/components/RuntimeBadge.vue @@ -50,6 +50,21 @@ + + + + + + {{ label }} @@ -86,15 +101,21 @@ const isGeminiRuntime = computed(() => { return props.runtime === 'gemini-cli' || props.runtime === 'gemini' }) +const isCodexRuntime = computed(() => { + return props.runtime === 'codex' +}) + const label = computed(() => { if (isClaudeRuntime.value) return 'Claude' if (isGeminiRuntime.value) return 'Gemini' + if (isCodexRuntime.value) return 'Codex' return props.runtime }) const tooltipText = computed(() => { if (isClaudeRuntime.value) return 'Anthropic Claude Code Runtime' if (isGeminiRuntime.value) return 'Google Gemini CLI Runtime' + if (isCodexRuntime.value) return 'OpenAI Codex CLI Runtime' return `Runtime: ${props.runtime}` }) @@ -105,6 +126,9 @@ const badgeClasses = computed(() => { if (isGeminiRuntime.value) { return 'bg-brand-gemini-50 dark:bg-brand-gemini-950/50 text-brand-gemini-700 dark:text-brand-gemini-300 border border-brand-gemini-200 dark:border-brand-gemini-800' } + if (isCodexRuntime.value) { + return 'bg-emerald-50 dark:bg-emerald-950/50 text-emerald-700 dark:text-emerald-300 border border-emerald-200 dark:border-emerald-800' + } return 'bg-gray-100 dark:bg-gray-700 text-gray-700 dark:text-gray-300 border border-gray-200 dark:border-gray-600' }) diff --git a/src/frontend/src/views/AgentDetail.vue b/src/frontend/src/views/AgentDetail.vue index c0e41e67..4616aea1 100644 --- a/src/frontend/src/views/AgentDetail.vue +++ b/src/frontend/src/views/AgentDetail.vue @@ -545,6 +545,9 @@ const defaultModel = computed(() => { if (runtime === 'gemini-cli' || runtime === 'gemini') { return 'gemini-2.5-flash' } + if (runtime === 'codex') { + return 'gpt-5.1-codex' // OpenAI Codex default (#1187) + } return 'sonnet' // Claude default }) @@ -650,7 +653,10 @@ const visibleTabs = computed(() => { // Session tab — SESSION_TAB_2026-04. Sits between Chat and the rest; // gated on the platform feature flag so it's invisible until enabled. - if (sessionsStore.sessionTabEnabled) { + // Hidden for runtimes without cached-UUID --resume (Codex, #1187): they lack + // the Session tab's resume machinery, so the backend runs stateless turns and + // the tab would be misleading. Chat (with continuity) stays available. + if (sessionsStore.sessionTabEnabled && agent.value?.runtime !== 'codex') { tabs.push({ id: 'session', label: 'Session' }) } diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt index 9af7fb03..cff32ad2 100644 --- a/tests/requirements-test.txt +++ b/tests/requirements-test.txt @@ -43,7 +43,10 @@ twilio>=9.10.0 Pillow>=11.1.0 tenacity>=9.0.0 passlib[bcrypt]>=1.7.4 -bcrypt>=4.2.0 +# Cap bcrypt <5: bcrypt 5.0.0 removed the `__about__` shim passlib 1.7.4 reads at +# import, so an uncapped fresh venv breaks every test at conftest DB-init. Pin +# until passlib ships a 5.x-compatible release. +bcrypt>=4.2.0,<5 pydantic-settings>=2.0.0 google-genai>=1.0.0 payments-py>=1.0.0 diff --git a/tests/unit/test_agent_readiness_probe.py b/tests/unit/test_agent_readiness_probe.py index 147d30ab..591ddaad 100644 --- a/tests/unit/test_agent_readiness_probe.py +++ b/tests/unit/test_agent_readiness_probe.py @@ -62,6 +62,7 @@ def wait_for_agent_ready(monkeypatch): check_resource_limits_match=None, check_full_capabilities_match=None, check_guardrails_env_matches=None, + is_claude_runtime=None, # added to lifecycle.py import (#1187) ), "services.settings_service": types.SimpleNamespace( get_anthropic_api_key=None, get_github_pat=None, diff --git a/tests/unit/test_codex_backend_inertness.py b/tests/unit/test_codex_backend_inertness.py new file mode 100644 index 00000000..556c5da2 --- /dev/null +++ b/tests/unit/test_codex_backend_inertness.py @@ -0,0 +1,50 @@ +"""Backend inertness for Codex-shaped failures (#1187 Phase H / decision 3). + +Two backend behaviors must NOT misfire for Codex: + + 1. The #678 stdout reader-race auto-retry must stay inert. It fires only on a + Claude-specific 502 dict body (recovery_attempted + raw_message_count==0 + + parse_failure_count==0 + num_turns<5 + "result message"). A Codex 502 + (early child exit) carries a plain string detail, so the signature check + must return False — no spurious same-execution_id retry. + + 2. A generic Codex failure must surface as 500, never 503. The backend infers + error_code=AUTH only from a 503, and the dispatch breaker counts AUTH only, + so a 500 keeps Codex failures out of the AUTH path. (The 500 mapping itself + is asserted in test_codex_runtime.test_codex_generic_failure_not_auth.) +""" + +from __future__ import annotations + +from services.task_execution_service import _is_reader_race_signature + + +def test_reader_race_inert_for_codex_pipe_drop_string_detail(): + # Codex 502 detail is a plain string, not the reader-race dict. + assert _is_reader_race_signature( + "Agent subprocess closed before task could complete" + ) is False + + +def test_reader_race_inert_for_codex_dict_without_recovery_marker(): + # Even a dict-shaped Codex error lacks recovery_attempted / the zeroed + # counters, so it can never trip the Claude reader-race retry. + codex_like = { + "message": "Codex execution failed (exit code 1): boom", + "raw_message_count": 4, + "parse_failure_count": 0, + } + assert _is_reader_race_signature(codex_like) is False + + +def test_reader_race_still_fires_for_genuine_claude_signature(): + """Guard the guard: the real Claude reader-race body still matches, so the + inertness above is specificity, not a broken predicate.""" + claude_sig = { + "recovery_attempted": True, + "raw_message_count": 0, + "parse_failure_count": 0, + "metadata": {"num_turns": 1}, + "message": "No result message received from Claude Code", + } + assert _is_reader_race_signature(claude_sig) is True diff --git a/tests/unit/test_codex_mcp_config.py b/tests/unit/test_codex_mcp_config.py new file mode 100644 index 00000000..eae96a4e --- /dev/null +++ b/tests/unit/test_codex_mcp_config.py @@ -0,0 +1,291 @@ +"""Codex MCP configuration tests (#1187 Phase F). + +Codex reads MCP servers from ``$CODEX_HOME/config.toml``. Trinity writes that +file directly and merges, so: + * the Trinity HTTP MCP server references the bearer token by ENV VAR and never + persists the literal secret to disk, + * template stdio servers are written as command + args, + * re-running (agent restart) merges idempotently — no duplicate tables, + * the dispatcher routes AGENT_RUNTIME=codex to the codex writer. +""" + +from __future__ import annotations + +import tomllib + +import pytest + +from agent_server.services import trinity_mcp # noqa: E402 + + +def test_codex_trinity_mcp_uses_env_var_not_literal_token(tmp_path, monkeypatch): + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + secret = "trinity_mcp_SUPERSECRETVALUE" + + assert trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", secret) is True + + raw = (tmp_path / "config.toml").read_text() + cfg = tomllib.loads(raw) + trinity = cfg["mcp_servers"]["trinity"] + assert trinity["url"] == "http://mcp-server:8080/mcp" + assert trinity["bearer_token_env_var"] == "TRINITY_MCP_API_KEY" + # SECURITY: the literal token must NEVER appear in config.toml. + assert secret not in raw + + +def test_codex_template_mcp_servers(tmp_path, monkeypatch): + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + servers = { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + } + } + assert trinity_mcp._configure_codex_mcp_servers(servers) is True + + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + github = cfg["mcp_servers"]["github"] + assert github["command"] == "npx" + assert github["args"] == ["-y", "@modelcontextprotocol/server-github"] + + +def test_codex_mcp_config_merges_no_dup_on_restart(tmp_path, monkeypatch): + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + + trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", "tok1") + trinity_mcp._configure_codex_mcp_servers( + {"github": {"command": "npx", "args": ["x"]}} + ) + # Simulate a restart re-injecting Trinity MCP — must NOT duplicate. + trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", "tok2") + + raw = (tmp_path / "config.toml").read_text() + cfg = tomllib.loads(raw) + assert set(cfg["mcp_servers"].keys()) == {"trinity", "github"} + assert raw.count("[mcp_servers.trinity]") == 1 + assert raw.count("[mcp_servers.github]") == 1 + + +def test_codex_template_server_env_table(tmp_path, monkeypatch): + """A per-server env table serializes as a nested [mcp_servers.x.env] table + and round-trips as valid TOML.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + servers = { + "custom": {"command": "run-it", "env": {"FOO": "bar"}}, + } + assert trinity_mcp._configure_codex_mcp_servers(servers) is True + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + assert cfg["mcp_servers"]["custom"]["command"] == "run-it" + assert cfg["mcp_servers"]["custom"]["env"] == {"FOO": "bar"} + + +def test_codex_dispatch_routes_to_codex(tmp_path, monkeypatch): + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + monkeypatch.setenv("AGENT_RUNTIME", "codex") + monkeypatch.setenv("TRINITY_MCP_URL", "http://mcp-server:8080/mcp") + monkeypatch.setenv("TRINITY_MCP_API_KEY", "key") + + assert trinity_mcp.inject_trinity_mcp_if_configured() is True + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + assert "trinity" in cfg["mcp_servers"] + + +# --------------------------------------------------------------------------- +# M1 — TOML writer hardening (#1187 review): a server name or value with +# special characters must NOT produce unparseable TOML that silently drops +# every server on the next merge. +# --------------------------------------------------------------------------- + +def test_codex_server_name_with_space_is_valid_toml(tmp_path, monkeypatch): + """A template MCP server whose name contains a space must serialize to a + quoted table key that round-trips — not a raw `[mcp_servers.my server]` + header that tomllib rejects.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + servers = {"my server": {"command": "run-it"}} + + assert trinity_mcp._configure_codex_mcp_servers(servers) is True + + raw = (tmp_path / "config.toml").read_text() + cfg = tomllib.loads(raw) # must not raise TOMLDecodeError + assert cfg["mcp_servers"]["my server"]["command"] == "run-it" + + +def test_codex_server_name_with_dot_does_not_misnest(tmp_path, monkeypatch): + """A dotted server name must be a single quoted key, not silently nested + as `mcp_servers.a.b`.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + servers = {"a.b": {"command": "x"}} + + assert trinity_mcp._configure_codex_mcp_servers(servers) is True + + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + assert "a.b" in cfg["mcp_servers"] + assert cfg["mcp_servers"]["a.b"]["command"] == "x" + + +def test_codex_value_with_control_chars_is_escaped(tmp_path, monkeypatch): + """A command/env value containing a newline/tab must be escaped so the + file stays valid TOML and the value round-trips intact.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + servers = {"weird": {"command": "run", "env": {"K": "line1\nline2\ttab"}}} + + assert trinity_mcp._configure_codex_mcp_servers(servers) is True + + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + assert cfg["mcp_servers"]["weird"]["env"]["K"] == "line1\nline2\ttab" + + +def test_codex_malformed_config_is_backed_up_not_silently_reset(tmp_path, monkeypatch): + """A malformed config.toml must be backed up (and logged), never silently + swallowed — otherwise the next merge rewrites from {} and drops every + previously-written server including the Trinity MCP wiring.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + config_path = tmp_path / "config.toml" + config_path.write_text("this is { not valid toml [[[") + + # The upsert must still succeed and write the new server... + assert trinity_mcp._configure_codex_mcp_servers( + {"github": {"command": "npx"}} + ) is True + cfg = tomllib.loads(config_path.read_text()) + assert "github" in cfg["mcp_servers"] + # ...and the corrupt original must be preserved for recovery, not lost. + backups = list(tmp_path.glob("config.toml.corrupt*")) + assert backups, "malformed config.toml was not backed up" + + +def test_codex_preserves_foreign_top_level_table_on_merge(tmp_path, monkeypatch): + """If Codex (or a user) writes its own top-level scalars/tables to + config.toml, a Trinity MCP merge must preserve them — not drop everything + that isn't `mcp_servers`.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + config_path = tmp_path / "config.toml" + config_path.write_text( + 'model = "gpt-5.1-codex"\n\n[history]\npersistence = "save-all"\n' + ) + + assert trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", "tok") is True + + cfg = tomllib.loads(config_path.read_text()) + assert cfg["model"] == "gpt-5.1-codex" + assert cfg["history"]["persistence"] == "save-all" + assert cfg["mcp_servers"]["trinity"]["url"] == "http://mcp-server:8080/mcp" + + +def test_codex_array_of_tables_preserved_not_corrupted(tmp_path, monkeypatch): + """The hand-rolled TOML writer doesn't emit array-of-tables. Rather than + silently stringify one in a pre-existing config (corruption), the merge must + fail safe: leave the original file untouched (#1187 review). + """ + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + config_path = tmp_path / "config.toml" + original = ( + '[[shell_environment_policy.experimental]]\nname = "a"\n\n' + '[[shell_environment_policy.experimental]]\nname = "b"\n' + ) + config_path.write_text(original) + + # The merge bails (returns False) rather than rewriting a mangled file. + assert trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", "tok") is False + + # The original config is preserved byte-for-byte and still parses with the + # array-of-tables intact — NOT corrupted into stringified dicts. + after = config_path.read_text() + assert after == original + rt = tomllib.loads(after) + assert rt["shell_environment_policy"]["experimental"] == [ + {"name": "a"}, + {"name": "b"}, + ] + + +# --------------------------------------------------------------------------- +# Added coverage (#1187 follow-up): the template-server dispatcher, the TOML +# scalar/escape primitives, and the no-command / empty-input edges. +# --------------------------------------------------------------------------- + +def test_configure_mcp_servers_dispatcher_routes_codex(tmp_path, monkeypatch): + """configure_mcp_servers() (template servers, distinct from the Trinity-MCP + injector) routes AGENT_RUNTIME=codex to the codex writer.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + monkeypatch.setenv("AGENT_RUNTIME", "codex") + assert trinity_mcp.configure_mcp_servers({"gh": {"command": "npx"}}) is True + cfg = tomllib.loads((tmp_path / "config.toml").read_text()) + assert "gh" in cfg["mcp_servers"] + + +def test_configure_mcp_servers_empty_is_noop_true(monkeypatch): + """No servers → True without touching disk (the early-return guard).""" + monkeypatch.setenv("AGENT_RUNTIME", "codex") + assert trinity_mcp.configure_mcp_servers({}) is True + + +def test_configure_codex_skips_server_without_command(tmp_path, monkeypatch, caplog): + """A template server with no command is skipped with a warning; when it is + the only server, nothing is written and the call reports False (no servers + configured).""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + import logging + + caplog.set_level(logging.WARNING, logger="agent_server.services.trinity_mcp") + result = trinity_mcp._configure_codex_mcp_servers({"broken": {"args": ["x"]}}) + assert result is False # one server in, zero written → not all-empty input + assert any("no command specified" in r.getMessage() for r in caplog.records) + # Nothing was written for an all-skipped batch. + assert not (tmp_path / "config.toml").exists() + + +def test_configure_codex_empty_input_returns_true(tmp_path, monkeypatch): + """An empty server dict is a no-op success (len == 0 → True).""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + assert trinity_mcp._configure_codex_mcp_servers({}) is True + + +# -- TOML scalar / escape primitives --------------------------------------- + +def test_toml_escape_encodes_control_characters(): + """A control char (< 0x20, or DEL 0x7F) with no shorthand becomes a \\uXXXX + escape so the file stays valid TOML.""" + out = trinity_mcp._toml_escape("a\x01b\x7fc") + assert "\\u0001" in out + assert "\\u007F" in out + # Shorthand escapes still win for the common control chars. + assert trinity_mcp._toml_escape("x\ty\nz") == "x\\ty\\nz" + + +def test_toml_scalar_bool_int_float(): + assert trinity_mcp._toml_scalar(True) == "true" + assert trinity_mcp._toml_scalar(False) == "false" + assert trinity_mcp._toml_scalar(7) == "7" + assert trinity_mcp._toml_scalar(1.5) == "1.5" + + +def test_toml_scalar_list_of_scalars(): + assert trinity_mcp._toml_scalar(["a", "b"]) == '["a", "b"]' + + +def test_toml_scalar_rejects_dict(): + """A dict reaching _toml_scalar is unexpected nesting — it must raise rather + than emit a stringified dict (which would corrupt the file).""" + with pytest.raises(TypeError): + trinity_mcp._toml_scalar({"k": "v"}) + + +def test_toml_scalar_rejects_array_of_tables(): + with pytest.raises(TypeError): + trinity_mcp._toml_scalar([{"name": "a"}]) + + +def test_codex_preserves_foreign_bool_and_int_scalars(tmp_path, monkeypatch): + """Foreign top-level bool/int settings survive a Trinity MCP merge + round-trip (exercises the bool/int branches of the scalar writer).""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + config_path = tmp_path / "config.toml" + config_path.write_text("[tui]\nenabled = true\nmax_width = 120\n") + + assert trinity_mcp._inject_codex_mcp("http://mcp-server:8080/mcp", "tok") is True + + cfg = tomllib.loads(config_path.read_text()) + assert cfg["tui"]["enabled"] is True + assert cfg["tui"]["max_width"] == 120 + assert cfg["mcp_servers"]["trinity"]["url"] == "http://mcp-server:8080/mcp" diff --git a/tests/unit/test_codex_runtime.py b/tests/unit/test_codex_runtime.py new file mode 100644 index 00000000..a0e44df7 --- /dev/null +++ b/tests/unit/test_codex_runtime.py @@ -0,0 +1,1409 @@ +"""Unit tests for the Codex runtime (#1187). + +Covers the pure, deterministic pieces of ``codex_runtime.py``: + + * ``calculate_codex_cost`` — token→cost with cached-input pricing and the + "reasoning is a subset of output, don't double-count" rule. + * ``parse_codex_jsonl`` — Codex ``--json`` event stream → ExecutionMetadata, + response text, tool activity log, and error classification. + * ``_finalize_codex_response`` — the ``-o`` output file is authoritative; the + JSONL ``agent_message`` is only a fallback. + * ``CodexRuntime.capabilities`` — the conservative-but-honest capability flags. + * error→HTTP mapping in ``execute_headless`` — a dropped subprocess pipe is a + 502 (NOT 503, which collides with the SUB-003 auth switch); a generic + runtime failure is a 500 (NOT 503 — it must not be read as AUTH). + +The async subprocess itself is stubbed; these tests never spawn ``codex``. +""" + +from __future__ import annotations + +import json +import logging +import os + +import pytest +from fastapi import HTTPException + +# conftest wires the real docker/base-image/agent_server as `agent_server`. +from agent_server.services import codex_runtime # noqa: E402 +from agent_server.services.codex_runtime import ( # noqa: E402 + CodexRuntime, + calculate_codex_cost, + parse_codex_jsonl, + _finalize_codex_response, +) +from agent_server.services.runtime_adapter import RuntimeCapabilities # noqa: E402 + + +# --------------------------------------------------------------------------- +# calculate_codex_cost +# --------------------------------------------------------------------------- + +def test_calculate_codex_cost_no_reasoning_double_count(): + """reasoning_output_tokens is a SUBSET of output_tokens — cost must be + output_tokens * rate, never output_tokens + reasoning_output_tokens.""" + model = "gpt-5.1-codex" + rates = codex_runtime.CODEX_PRICING[model] + + # 0 cached so the input side is unambiguous. + cost = calculate_codex_cost( + input_tokens=1000, + cached_input_tokens=0, + output_tokens=500, # of which 300 were reasoning + model=model, + ) + expected = (1000 / 1000) * rates["input"] + (500 / 1000) * rates["output"] + assert cost == pytest.approx(round(expected, 6)) + + # Sanity: a (buggy) double-count would be strictly larger. + double_counted = (1000 / 1000) * rates["input"] + ( + (500 + 300) / 1000 + ) * rates["output"] + assert cost < double_counted + + +def test_calculate_codex_cost_cached_pricing(): + """Cached input tokens bill at the cheaper cached rate; only the + uncached remainder bills at the full input rate.""" + model = "gpt-5.1-codex" + rates = codex_runtime.CODEX_PRICING[model] + + cost = calculate_codex_cost( + input_tokens=1000, + cached_input_tokens=400, + output_tokens=0, + model=model, + ) + expected = (600 / 1000) * rates["input"] + (400 / 1000) * rates["cached"] + assert cost == pytest.approx(round(expected, 6)) + # Cached must be cheaper than billing all 1000 at the full input rate. + assert cost < (1000 / 1000) * rates["input"] + + +def test_calculate_codex_cost_default_fallback(): + """An unknown model falls back to the 'default' pricing, never KeyErrors.""" + cost = calculate_codex_cost( + input_tokens=1000, + cached_input_tokens=0, + output_tokens=1000, + model="some-future-model-x", + ) + rates = codex_runtime.CODEX_PRICING["default"] + expected = (1000 / 1000) * rates["input"] + (1000 / 1000) * rates["output"] + assert cost == pytest.approx(round(expected, 6)) + + +def test_calculate_codex_cost_cached_never_exceeds_input(): + """Defensive: cached_input_tokens > input_tokens must not produce a + negative uncached charge.""" + cost = calculate_codex_cost( + input_tokens=100, + cached_input_tokens=500, # nonsensical, but must not go negative + output_tokens=0, + model="gpt-5.1-codex", + ) + assert cost >= 0 + + +# --------------------------------------------------------------------------- +# parse_codex_jsonl +# --------------------------------------------------------------------------- + +def _events_to_lines(events) -> list[str]: + return [json.dumps(e) for e in events] + + +def test_codex_runtime_parser_thread_and_usage(): + """thread.started → session_id; turn.completed.usage → tokens + cost + (cached → cache_read_tokens; reasoning NOT double-counted).""" + events = [ + {"type": "thread.started", "thread_id": "thr_abc123"}, + {"type": "turn.started"}, + { + "type": "turn.completed", + "usage": { + "input_tokens": 2000, + "cached_input_tokens": 500, + "output_tokens": 800, + "reasoning_output_tokens": 300, + }, + }, + ] + response, log, metadata, raw = parse_codex_jsonl( + _events_to_lines(events), model="gpt-5.1-codex" + ) + + assert metadata.session_id == "thr_abc123" + assert metadata.input_tokens == 2000 + assert metadata.output_tokens == 800 + assert metadata.cache_read_tokens == 500 + assert metadata.cost_usd is not None and metadata.cost_usd > 0 + # raw transcript captured every line + assert len(raw) == len(events) + + +def test_codex_runtime_parser_agent_message_response(): + """item.completed/agent_message text becomes the response (JSONL fallback).""" + events = [ + {"type": "thread.started", "thread_id": "thr_x"}, + { + "type": "item.completed", + "item": {"id": "item_1", "type": "agent_message", "text": "Hello world"}, + }, + {"type": "turn.completed", "usage": {"input_tokens": 10, "output_tokens": 5}}, + ] + response, log, metadata, raw = parse_codex_jsonl(_events_to_lines(events)) + assert "Hello world" in response + + +def test_codex_runtime_parser_tool_items(): + """command_execution / web_search items produce tool activity log entries + and bump the tool count.""" + events = [ + {"type": "thread.started", "thread_id": "thr_y"}, + { + "type": "item.completed", + "item": { + "id": "cmd_1", + "type": "command_execution", + "command": "ls -la", + "exit_code": 0, + "status": "completed", + "aggregated_output": "a\nb\n", + }, + }, + { + "type": "item.completed", + "item": { + "id": "ws_1", + "type": "web_search", + "query": "trinity agent platform", + }, + }, + {"type": "turn.completed", "usage": {"input_tokens": 1, "output_tokens": 1}}, + ] + response, log, metadata, raw = parse_codex_jsonl(_events_to_lines(events)) + tool_uses = [e for e in log if e.type == "tool_use"] + assert len(tool_uses) >= 2 + tools = {e.tool for e in tool_uses} + # command_execution maps to a shell-ish tool; web_search to a search tool + assert any("ash" in t or "hell" in t or "ommand" in t for t in tools) + assert any("earch" in t for t in tools) + + +def test_codex_runtime_parser_turn_failed(): + """turn.failed → error_type/error_message captured on metadata.""" + events = [ + {"type": "thread.started", "thread_id": "thr_z"}, + {"type": "turn.failed", "error": {"message": "model is overloaded"}}, + ] + response, log, metadata, raw = parse_codex_jsonl(_events_to_lines(events)) + assert metadata.error_message and "overloaded" in metadata.error_message + + +def test_codex_runtime_parser_tolerates_garbage_lines(): + """A non-JSON line must not kill the parser — later events still parse.""" + lines = [ + "not json at all", + json.dumps({"type": "thread.started", "thread_id": "thr_g"}), + "{ partial", + json.dumps( + {"type": "turn.completed", "usage": {"input_tokens": 5, "output_tokens": 2}} + ), + ] + response, log, metadata, raw = parse_codex_jsonl(lines) + assert metadata.session_id == "thr_g" + assert metadata.input_tokens == 5 + + +# --------------------------------------------------------------------------- +# -o output file is authoritative +# --------------------------------------------------------------------------- + +def test_finalize_response_prefers_output_file(): + """The -o file content (durable record) wins over JSONL-assembled parts.""" + assert ( + _finalize_codex_response("FROM FILE", ["from jsonl"]) == "FROM FILE" + ) + + +def test_finalize_response_falls_back_to_jsonl(): + """When the -o file is empty/missing, fall back to JSONL response parts.""" + assert _finalize_codex_response(None, ["part a", "part b"]) == "part a\npart b" + assert _finalize_codex_response("", ["only jsonl"]) == "only jsonl" + + +# --------------------------------------------------------------------------- +# capabilities +# --------------------------------------------------------------------------- + +def test_codex_capabilities(): + caps = CodexRuntime.capabilities() + assert isinstance(caps, RuntimeCapabilities) + assert caps.chat_continuity is True + assert caps.session_tab_resume is False # MVP: Session tab stays Claude/Gemini + assert caps.mcp_support is True + assert caps.cost_reporting == "estimated" # no native cost → derived from tokens + + +# --------------------------------------------------------------------------- +# error → HTTP status mapping (execute_headless) +# --------------------------------------------------------------------------- + +@pytest.fixture +def available_runtime(monkeypatch): + """A CodexRuntime whose is_available() is forced True so the early 503 + availability guard doesn't fire.""" + rt = CodexRuntime() + monkeypatch.setattr(rt, "is_available", lambda: True) + return rt + + +@pytest.mark.asyncio +async def test_codex_runtime_pipe_drop(available_runtime, monkeypatch): + """A BrokenPipeError from the subprocess collector → HTTP 502, NOT 503. + 503 collides with the SUB-003 auth-switch; an early child exit is not auth.""" + + async def _raise_pipe(**_kw): + raise BrokenPipeError(32, "Broken pipe") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_pipe) + + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute_headless(prompt="hello") + assert exc_info.value.status_code == 502 + + +@pytest.mark.asyncio +async def test_codex_generic_failure_not_auth(available_runtime, monkeypatch, caplog): + """A generic runtime failure surfaces as 500 (runtime-unavailable), NEVER + 503 — so the backend never infers error_code=AUTH for a non-auth Codex + failure (#1187 decision 3).""" + + async def _raise_runtime(**_kw): + raise RuntimeError("codex blew up for an unrelated reason") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_runtime) + + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute_headless(prompt="hello") + assert exc_info.value.status_code == 500 + assert exc_info.value.status_code != 503 + + +@pytest.mark.asyncio +async def test_codex_unavailable_raises_503(monkeypatch): + """When the codex CLI isn't installed, execute_headless fails fast 503.""" + rt = CodexRuntime() + monkeypatch.setattr(rt, "is_available", lambda: False) + with pytest.raises(HTTPException) as exc_info: + await rt.execute_headless(prompt="hi") + assert exc_info.value.status_code == 503 + + +# --------------------------------------------------------------------------- +# I1 — _classify_codex_failure: AUTH detection must be anchored, not a bare +# substring match (a non-auth failure that merely contains "401" → 500, not 503) +# --------------------------------------------------------------------------- + +from agent_server.models import ExecutionMetadata # noqa: E402 + + +def _classify(stderr: str = "", error_message=None, return_code: int = 1): + meta = ExecutionMetadata() + meta.error_message = error_message + return codex_runtime._classify_codex_failure(return_code, stderr, meta) + + +def test_classify_bare_401_is_not_auth(): + """A non-auth failure whose output merely contains '401' (e.g. an upstream + tool returning 401) must map to 500, NOT 503 — 503 is the backend AUTH + signal the dispatch breaker counts (#1187 decision 3).""" + status, _ = _classify(stderr="tool exited: upstream returned 401 Not Found") + assert status == 500 + + +def test_classify_bare_api_key_phrase_is_not_auth(): + """A failure mentioning 'api key' in passing (no auth verb) is not an auth + failure — drop the over-broad bare 'api key' marker.""" + status, _ = _classify(stderr="please set the weather api key in your tool config") + assert status == 500 + + +def test_classify_real_unauthorized_is_auth(): + status, detail = _classify(stderr="Error: 401 Unauthorized") + assert status == 503 + assert "OPENAI_API_KEY" in detail + + +def test_classify_invalid_api_key_is_auth(): + status, _ = _classify(error_message="invalid_api_key: the provided key is wrong") + assert status == 503 + + +def test_classify_rate_limit_still_429(): + status, _ = _classify(stderr="429 Too Many Requests: rate limit exceeded") + assert status == 429 + + +def test_classify_generic_failure_is_500(): + status, _ = _classify(stderr="segfault in tool runner", return_code=139) + assert status == 500 + + +# --------------------------------------------------------------------------- +# I3 — end-of-options separator: the built command ends with "--" so a prompt +# starting with "-"/"--" is parsed as the positional prompt, never as a flag. +# --------------------------------------------------------------------------- + +def test_build_command_ends_with_end_of_options_separator(): + rt = CodexRuntime() + cmd = rt._build_codex_command( + model="gpt-5.1-codex", + sandbox_mode="danger-full-access", + result_file="/tmp/out.txt", + agent_home="/home/developer", + resume_thread_id=None, + ) + # The positional prompt is appended by _execute_codex right after this list; + # "--" must be the final token so the prompt can never be read as options. + assert cmd[-1] == "--" + + +# --------------------------------------------------------------------------- +# F-TOOLS (#1187 E2E): codex's own bubblewrap sandbox cannot create a user +# namespace inside the hardened Trinity container ("bwrap: No permissions to +# create a new namespace"), which blocks EVERY shell tool. Normal (writable) +# agents must therefore run with --sandbox danger-full-access (no internal +# sandbox; the Trinity container is the boundary, same posture as Claude/Gemini). +# Read-only mode keeps --sandbox read-only. +# --------------------------------------------------------------------------- + +def test_resolve_sandbox_normal_is_danger_full_access(monkeypatch): + """Normal (writable) mode maps to danger-full-access so codex skips its own + bwrap sandbox (which can't run in the hardened container).""" + monkeypatch.setattr(codex_runtime, "_is_read_only", lambda: False) + assert codex_runtime._resolve_sandbox_mode() == "danger-full-access" + + +def test_resolve_sandbox_read_only_stays_read_only(monkeypatch): + """Read-only mode keeps the sandbox-native read-only enforcement.""" + monkeypatch.setattr(codex_runtime, "_is_read_only", lambda: True) + assert codex_runtime._resolve_sandbox_mode() == "read-only" + + +def test_build_command_normal_mode_has_no_network_access_override(): + """danger-full-access already permits network — the old + `-c sandbox_workspace_write.network_access=true` override is dead and must + not be emitted (it would be ignored at best, confusing at worst).""" + rt = CodexRuntime() + cmd = rt._build_codex_command( + model="gpt-5.1-codex", + sandbox_mode="danger-full-access", + result_file="/tmp/out.txt", + agent_home="/home/developer", + resume_thread_id=None, + ) + joined = " ".join(cmd) + assert "danger-full-access" in cmd + assert "network_access" not in joined + assert "sandbox_workspace_write" not in joined + + +def test_build_command_resume_still_ends_with_separator(): + rt = CodexRuntime() + cmd = rt._build_codex_command( + model=None, + sandbox_mode="read-only", + result_file="/tmp/out.txt", + agent_home="/home/developer", + resume_thread_id="thr_123", + ) + assert "resume" in cmd and "thr_123" in cmd + assert cmd[-1] == "--" + # Arg ORDER is load-bearing: the `resume` sub-subcommand (codex 0.139.0) has + # a narrower option set and rejects exec-level flags that trail it + # ("unexpected argument '-C' found", exit 2). They MUST precede `resume`, and + # `resume ` must be the last tokens before the "--" separator. Guards the + # turn-2+ continuity regression. + resume_at = cmd.index("resume") + for exec_flag in ("--json", "-C", "--sandbox", "-o", "--skip-git-repo-check"): + assert cmd.index(exec_flag) < resume_at, f"{exec_flag} must precede resume" + assert cmd[resume_at + 1] == "thr_123" + assert cmd[-2:] == ["thr_123", "--"] + + +# --------------------------------------------------------------------------- +# I4 — thread-id fallback: when Codex emits no thread.started, degrade to a +# fresh next turn (None), never a fabricated id that would wedge resume. +# --------------------------------------------------------------------------- + +def test_resolve_session_id_none_when_thread_missing(): + meta = ExecutionMetadata() # session_id stays None + assert codex_runtime._resolve_returned_session_id(meta) is None + + +def test_resolve_session_id_is_thread_id_when_present(): + meta = ExecutionMetadata() + meta.session_id = "thr_real" + assert codex_runtime._resolve_returned_session_id(meta) == "thr_real" + + +@pytest.mark.asyncio +async def test_chat_does_not_cache_fake_thread_id(available_runtime, monkeypatch): + """A chat turn that returns no thread id must leave _chat_thread_id None so + the next turn runs fresh rather than `codex exec resume `.""" + + async def _fake(**_kw): + return ("resp", [], ExecutionMetadata(), [], None) + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + await available_runtime.execute("hello") + assert available_runtime._chat_thread_id is None + + +# --------------------------------------------------------------------------- +# I5 — .env key parser tolerates `export ` prefix and inline comments. +# --------------------------------------------------------------------------- + +def _write_env(tmp_path, monkeypatch, contents: str): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("CODEX_API_KEY", raising=False) + monkeypatch.setattr(codex_runtime, "_AGENT_HOME", str(tmp_path)) + (tmp_path / ".env").write_text(contents) + + +def test_load_api_key_tolerates_export_prefix(tmp_path, monkeypatch): + _write_env(tmp_path, monkeypatch, "export OPENAI_API_KEY=sk-exported\n") + assert codex_runtime._load_openai_api_key() == "sk-exported" + + +def test_load_api_key_strips_inline_comment(tmp_path, monkeypatch): + _write_env(tmp_path, monkeypatch, "OPENAI_API_KEY=sk-plain # primary key\n") + assert codex_runtime._load_openai_api_key() == "sk-plain" + + +def test_load_api_key_quoted_value_keeps_hash(tmp_path, monkeypatch): + """A '#' inside a quoted value is part of the value, not a comment.""" + _write_env(tmp_path, monkeypatch, 'OPENAI_API_KEY="sk-a#b"\n') + assert codex_runtime._load_openai_api_key() == "sk-a#b" + + +def test_load_api_key_plain_still_works(tmp_path, monkeypatch): + _write_env(tmp_path, monkeypatch, "CODEX_API_KEY=ck-123\n") + assert codex_runtime._load_openai_api_key() == "ck-123" + + +# --------------------------------------------------------------------------- +# I2 — reader-executor selection: the concurrent /api/task path uses the +# unbounded default executor; the lock-serialized chat path keeps the bounded +# single-worker executor (parity with Claude's headless path). +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_headless_requests_concurrent_reader(available_runtime, monkeypatch): + captured = {} + + async def _fake(**kw): + captured.update(kw) + return ("r", [], ExecutionMetadata(), [], "thr") + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + await available_runtime.execute_headless(prompt="hi") + assert captured.get("concurrent_reader") is True + + +@pytest.mark.asyncio +async def test_chat_requests_serialized_reader(available_runtime, monkeypatch): + captured = {} + + async def _fake(**kw): + captured.update(kw) + return ("r", [], ExecutionMetadata(), [], "thr") + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + await available_runtime.execute("hi") + assert captured.get("concurrent_reader") is False + + +# --------------------------------------------------------------------------- +# CSO #1187 finding 2 — the -o result filename is derived from execution_id; +# a '/' or '..' must never let it escape CODEX_HOME (defense-in-depth). +# --------------------------------------------------------------------------- + +def test_safe_result_token_blocks_traversal(tmp_path): + codex_home = str(tmp_path) + for hostile in ("../../etc/passwd", "a/b/c", "..", "../sibling"): + token = codex_runtime._safe_result_token(hostile) + assert "/" not in token and "\\" not in token + path = os.path.join(codex_home, f"{token}-last.txt") + # The result file stays directly inside CODEX_HOME — no escape. + assert os.path.dirname(os.path.realpath(path)) == os.path.realpath(codex_home) + + +def test_safe_result_token_passes_clean_id_through(): + clean = "exec_AbC-123_def.456" + assert codex_runtime._safe_result_token(clean) == clean + + +def test_safe_result_token_never_empty(): + # A pathological all-separator id still yields a usable filename token. + assert codex_runtime._safe_result_token("/") == "codex" + assert codex_runtime._safe_result_token("") == "codex" + + +# --------------------------------------------------------------------------- +# CSO #1187 finding 3 — read-only detection: an absent config is the normal +# writable state (silent); a present-but-corrupt config fails OPEN but LOGS +# (parity with the Claude reference hook, which also fails open + logs). +# --------------------------------------------------------------------------- + +def test_is_read_only_missing_file_is_silent(tmp_path, monkeypatch, caplog): + monkeypatch.setattr(codex_runtime, "_READ_ONLY_CONFIG", tmp_path / "ro.json") + caplog.set_level(logging.WARNING, logger="agent_server.services.codex_runtime") + assert codex_runtime._is_read_only() is False + assert not any("read-only config" in r.getMessage() for r in caplog.records) + + +def test_is_read_only_enabled_true(tmp_path, monkeypatch): + cfg = tmp_path / "ro.json" + cfg.write_text(json.dumps({"enabled": True})) + monkeypatch.setattr(codex_runtime, "_READ_ONLY_CONFIG", cfg) + assert codex_runtime._is_read_only() is True + + +def test_is_read_only_enabled_false(tmp_path, monkeypatch): + cfg = tmp_path / "ro.json" + cfg.write_text(json.dumps({"enabled": False})) + monkeypatch.setattr(codex_runtime, "_READ_ONLY_CONFIG", cfg) + assert codex_runtime._is_read_only() is False + + +def test_is_read_only_malformed_fails_open_and_logs(tmp_path, monkeypatch, caplog): + cfg = tmp_path / "ro.json" + cfg.write_text("{not valid json") + monkeypatch.setattr(codex_runtime, "_READ_ONLY_CONFIG", cfg) + caplog.set_level(logging.WARNING, logger="agent_server.services.codex_runtime") + assert codex_runtime._is_read_only() is False + assert any("malformed" in r.getMessage() for r in caplog.records) + + +def test_is_read_only_unreadable_fails_open_and_logs(tmp_path, monkeypatch, caplog): + """A present-but-unreadable config (e.g. it is a directory → OSError, not + FileNotFoundError) fails OPEN and LOGS — parity with the malformed path.""" + cfg = tmp_path / "ro_is_a_dir.json" + cfg.mkdir() # reading a directory as a file raises OSError (not FileNotFound) + monkeypatch.setattr(codex_runtime, "_READ_ONLY_CONFIG", cfg) + caplog.set_level(logging.WARNING, logger="agent_server.services.codex_runtime") + assert codex_runtime._is_read_only() is False + assert any("unreadable" in r.getMessage() for r in caplog.records) + + +# =========================================================================== +# Added coverage (#1187 follow-up): pure helpers, getters, parser branches, +# the execute() chat path, and the _execute_codex orchestration body — all +# previously exercised only indirectly or not at all. +# =========================================================================== + +import threading # noqa: E402 +from pathlib import Path # noqa: E402 + +from agent_server.services.codex_runtime import ( # noqa: E402 + _compose_prompt, + _read_and_consume_result_file, + _resolve_pricing, + _safe_unlink, +) + + +# --------------------------------------------------------------------------- +# _resolve_pricing — longest-prefix match (the documented "gpt-5.1-codex-2025-xx +# resolves to the codex rate" behavior, distinct from exact-key and default). +# --------------------------------------------------------------------------- + +def test_resolve_pricing_longest_prefix_match(): + """A versioned/suffixed model name resolves to its base model's rate via the + longest-prefix branch, NOT the default fallback.""" + rates = _resolve_pricing("gpt-5.1-codex-2025-11-01") + assert rates is codex_runtime.CODEX_PRICING["gpt-5.1-codex"] + + +def test_resolve_pricing_prefers_longest_prefix(): + """When several keys are prefixes, the LONGEST wins ('gpt-5-mini-2025' → + the mini rate, not the broader 'gpt-5' rate).""" + rates = _resolve_pricing("gpt-5-mini-2025-xx") + assert rates is codex_runtime.CODEX_PRICING["gpt-5-mini"] + + +def test_resolve_pricing_none_and_unknown_use_default(): + assert _resolve_pricing(None) is codex_runtime.CODEX_PRICING["default"] + assert _resolve_pricing("anthropic-claude") is codex_runtime.CODEX_PRICING["default"] + + +# --------------------------------------------------------------------------- +# _compose_prompt — Codex exec has no system-prompt flag, so the effective +# platform prompt is PREPENDED with a "---" separator; no system prompt passes +# the user message through unchanged. +# --------------------------------------------------------------------------- + +def test_compose_prompt_prepends_system_prompt(): + out = _compose_prompt("PLATFORM RULES", "do the thing") + assert out == "PLATFORM RULES\n\n---\n\ndo the thing" + + +def test_compose_prompt_passthrough_without_system_prompt(): + assert _compose_prompt(None, "just the user message") == "just the user message" + assert _compose_prompt("", "user msg") == "user msg" + + +# --------------------------------------------------------------------------- +# _read_and_consume_result_file — reads the -o file; missing file → None. +# NOTE the read does NOT delete (deletion is the caller's finally). +# --------------------------------------------------------------------------- + +def test_read_result_file_reads_content(tmp_path): + f = tmp_path / "out.txt" + f.write_text("the durable answer") + assert _read_and_consume_result_file(str(f), str(tmp_path)) == "the durable answer" + # The reader itself must not delete — finally owns deletion. + assert f.exists() + + +def test_read_result_file_missing_returns_none(tmp_path): + assert _read_and_consume_result_file(str(tmp_path / "nope.txt"), str(tmp_path)) is None + + +def test_safe_unlink_removes_and_tolerates_missing(tmp_path): + f = tmp_path / "gone.txt" + f.write_text("x") + _safe_unlink(str(f), str(tmp_path)) + assert not f.exists() + # Second unlink (already gone) must not raise. + _safe_unlink(str(f), str(tmp_path)) + + +def test_ensure_within_rejects_escape(tmp_path): + """Sink-side containment guard: a path resolving outside ``base`` is + rejected, and the two wrappers degrade safely (no read, no unlink) rather + than touching the out-of-bounds file.""" + base = tmp_path / "codex_home" + base.mkdir() + outside = tmp_path / "outside.txt" + outside.write_text("secret") + + with pytest.raises(ValueError): + codex_runtime._ensure_within(str(base), str(outside)) + + # Reader refuses → returns None and leaves the file untouched. + assert _read_and_consume_result_file(str(outside), str(base)) is None + assert outside.exists() + # Unlink refuses → no-op, must not raise, file still present. + _safe_unlink(str(outside), str(base)) + assert outside.exists() + + +# --------------------------------------------------------------------------- +# _surface_unmapped_guardrails — Codex exec has no per-tool CLI toggle in the +# MVP, so disallowed tools are SURFACED (logged), never silently dropped. +# --------------------------------------------------------------------------- + +def test_surface_unmapped_guardrails_logs_disallowed(monkeypatch, caplog): + monkeypatch.setattr( + codex_runtime, "_load_guardrails", lambda: {"disallowed_tools": ["Bash", "Write"]} + ) + caplog.set_level(logging.WARNING, logger="agent_server.services.codex_runtime") + # Must not raise — surfacing is best-effort. + codex_runtime._surface_unmapped_guardrails(allowed_tools=None) + msg = " ".join(r.getMessage() for r in caplog.records) + assert "Bash" in msg and "Write" in msg + + +def test_surface_unmapped_guardrails_logs_allowed_tools(monkeypatch, caplog): + monkeypatch.setattr(codex_runtime, "_load_guardrails", lambda: {}) + caplog.set_level(logging.INFO, logger="agent_server.services.codex_runtime") + codex_runtime._surface_unmapped_guardrails(allowed_tools=["Read", "Grep"]) + assert any("allowed_tools" in r.getMessage() for r in caplog.records) + + +# --------------------------------------------------------------------------- +# _load_openai_api_key — process env wins; otherwise parse .env; nothing +# anywhere → None (drives the upstream "key not configured" 503). +# --------------------------------------------------------------------------- + +def test_load_api_key_process_env_wins(tmp_path, monkeypatch): + """The container env var is the fast path — used before .env is even read.""" + monkeypatch.setenv("OPENAI_API_KEY", "sk-from-env") + # Point _AGENT_HOME at an empty dir so a stray real .env can't interfere. + monkeypatch.setattr(codex_runtime, "_AGENT_HOME", str(tmp_path)) + assert codex_runtime._load_openai_api_key() == "sk-from-env" + + +def test_load_api_key_skips_blank_and_comment_lines(tmp_path, monkeypatch): + _write_env( + tmp_path, + monkeypatch, + "# a comment\n\nNOISE_WITHOUT_EQUALS\nOPENAI_API_KEY=sk-after-noise\n", + ) + assert codex_runtime._load_openai_api_key() == "sk-after-noise" + + +def test_load_api_key_none_when_absent_everywhere(tmp_path, monkeypatch): + """No env var and no .env file → None (not a crash, not a sentinel).""" + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("CODEX_API_KEY", raising=False) + monkeypatch.setattr(codex_runtime, "_AGENT_HOME", str(tmp_path)) # no .env here + assert codex_runtime._load_openai_api_key() is None + + +# --------------------------------------------------------------------------- +# Trivial-but-load-bearing getters + is_available probe. +# --------------------------------------------------------------------------- + +def test_get_default_model_and_context_window(): + rt = CodexRuntime() + assert rt.get_default_model() == "gpt-5.1-codex" + assert rt.get_context_window() == codex_runtime.CODEX_CONTEXT_WINDOW + # Model arg is cosmetic for the window — always the GPT-5 input window. + assert rt.get_context_window("gpt-5-nano") == codex_runtime.CODEX_CONTEXT_WINDOW + + +def test_is_available_true_when_version_succeeds(monkeypatch): + class _OK: + returncode = 0 + + monkeypatch.setattr(codex_runtime.subprocess, "run", lambda *a, **k: _OK()) + assert CodexRuntime().is_available() is True + + +def test_is_available_false_on_nonzero_and_exception(monkeypatch): + class _Fail: + returncode = 1 + + monkeypatch.setattr(codex_runtime.subprocess, "run", lambda *a, **k: _Fail()) + assert CodexRuntime().is_available() is False + + def _boom(*a, **k): + raise FileNotFoundError("codex not installed") + + monkeypatch.setattr(codex_runtime.subprocess, "run", _boom) + assert CodexRuntime().is_available() is False + + +def test_configure_mcp_delegates_to_trinity_writer(tmp_path, monkeypatch): + """configure_mcp() routes to the shared Codex config writer, which writes + $CODEX_HOME/config.toml.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + rt = CodexRuntime() + assert rt.configure_mcp({"github": {"command": "npx"}}) is True + assert (tmp_path / "config.toml").exists() + + +# --------------------------------------------------------------------------- +# parse_codex_jsonl — additional event/item shapes not covered by the base set: +# file_change + mcp_tool_call items, the mcp "server.tool" display name, started +# →completed dedup, tool failure, top-level + item-level error events, and the +# blank-line skip. +# --------------------------------------------------------------------------- + +def test_parser_mcp_and_file_change_tools(): + events = [ + {"type": "thread.started", "thread_id": "thr_m"}, + { + "type": "item.completed", + "item": { + "id": "mcp_1", + "type": "mcp_tool_call", + "server": "trinity", + "tool": "list_agents", + "arguments": {"q": "x"}, + "status": "completed", + }, + }, + { + "type": "item.completed", + "item": { + "id": "fc_1", + "type": "file_change", + "changes": [{"path": "a.py", "kind": "modify"}], + "status": "completed", + }, + }, + ] + response, log, metadata, raw = parse_codex_jsonl(_events_to_lines(events)) + tool_uses = {e.tool for e in log if e.type == "tool_use"} + # mcp_tool_call with a server renders "server.tool". + assert "trinity.list_agents" in tool_uses + assert "FileChange" in tool_uses + assert metadata.tool_count == 2 + + +def test_parser_tool_started_then_completed_dedup(): + """An item.started followed by item.completed for the SAME tool id yields a + single tool_use (dedup), plus one tool_result.""" + events = [ + {"type": "thread.started", "thread_id": "thr_d"}, + { + "type": "item.started", + "item": {"id": "cmd_dup", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd_dup", + "type": "command_execution", + "command": "ls", + "exit_code": 0, + "status": "completed", + "aggregated_output": "ok", + }, + }, + ] + response, log, metadata, raw = parse_codex_jsonl(_events_to_lines(events)) + tool_uses = [e for e in log if e.type == "tool_use"] + tool_results = [e for e in log if e.type == "tool_result"] + assert len(tool_uses) == 1 # not 2 — the started/completed pair is deduped + assert len(tool_results) == 1 + assert tool_results[0].success is True + + +def test_parser_tool_failure_marks_result_unsuccessful(): + events = [ + {"type": "thread.started", "thread_id": "thr_f"}, + { + "type": "item.completed", + "item": { + "id": "cmd_bad", + "type": "command_execution", + "command": "false", + "exit_code": 1, + "status": "failed", + }, + }, + ] + _, log, _, _ = parse_codex_jsonl(_events_to_lines(events)) + results = [e for e in log if e.type == "tool_result"] + assert results and results[0].success is False + + +def test_parser_top_level_error_event(): + events = [ + {"type": "thread.started", "thread_id": "thr_e"}, + {"type": "error", "message": "stream aborted"}, + ] + _, _, metadata, _ = parse_codex_jsonl(_events_to_lines(events)) + assert metadata.error_message == "stream aborted" + + +def test_parser_item_level_error(): + events = [ + {"type": "thread.started", "thread_id": "thr_ie"}, + {"type": "item.completed", "item": {"id": "x", "type": "error", "message": "boom"}}, + ] + _, _, metadata, _ = parse_codex_jsonl(_events_to_lines(events)) + assert metadata.error_message == "boom" + + +def test_parser_skips_blank_lines(): + lines = ["", " ", json.dumps({"type": "thread.started", "thread_id": "thr_b"})] + _, _, metadata, raw = parse_codex_jsonl(lines) + assert metadata.session_id == "thr_b" + assert len(raw) == 1 # blank lines never reach raw_messages + + +# --------------------------------------------------------------------------- +# execute() — the interactive chat path (mirrors the execute_headless error +# mapping, plus continuity, the fresh-session reset, and the session rollups). +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_execute_unavailable_raises_503(monkeypatch): + rt = CodexRuntime() + monkeypatch.setattr(rt, "is_available", lambda: False) + with pytest.raises(HTTPException) as exc_info: + await rt.execute("hi") + assert exc_info.value.status_code == 503 + + +@pytest.mark.asyncio +async def test_execute_pipe_drop_is_502(available_runtime, monkeypatch): + async def _raise_pipe(**_kw): + raise ConnectionResetError("peer reset") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_pipe) + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute("hi") + assert exc_info.value.status_code == 502 + + +@pytest.mark.asyncio +async def test_execute_generic_failure_is_500(available_runtime, monkeypatch): + async def _raise_runtime(**_kw): + raise RuntimeError("unrelated") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_runtime) + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute("hi") + assert exc_info.value.status_code == 500 + + +@pytest.mark.asyncio +async def test_execute_continuity_uses_cached_thread_id(available_runtime, monkeypatch): + """A continue_session turn with a prior thread id resumes it.""" + captured = {} + + async def _fake(**kw): + captured.update(kw) + return ("r", [], ExecutionMetadata(), [], "thr_next") + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + monkeypatch.setattr(codex_runtime.agent_state, "session_started", True) + available_runtime._chat_thread_id = "thr_prev" + + await available_runtime.execute("hello", continue_session=True) + assert captured.get("resume_thread_id") == "thr_prev" + # The returned thread id is cached for the NEXT turn. + assert available_runtime._chat_thread_id == "thr_next" + + +@pytest.mark.asyncio +async def test_execute_fresh_session_does_not_resume(available_runtime, monkeypatch): + """Without continue_session the turn runs fresh (resume_thread_id None) and + marks the session started.""" + captured = {} + + async def _fake(**kw): + captured.update(kw) + return ("r", [], ExecutionMetadata(), [], None) + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + available_runtime._chat_thread_id = "thr_stale" + await available_runtime.execute("hello", continue_session=False) + assert captured.get("resume_thread_id") is None + assert codex_runtime.agent_state.session_started is True + + +@pytest.mark.asyncio +async def test_execute_updates_session_rollups(available_runtime, monkeypatch): + """A successful chat turn folds cost/tokens/context into agent_state.""" + meta = ExecutionMetadata() + meta.cost_usd = 0.05 + meta.output_tokens = 340 + meta.input_tokens = 9_000 + meta.context_window = codex_runtime.CODEX_CONTEXT_WINDOW + + async def _fake(**_kw): + return ("answer", [], meta, [], "thr_roll") + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + # Force the context-tokens high-water comparison to take the update branch. + monkeypatch.setattr(codex_runtime.agent_state, "session_context_tokens", 0) + before_cost = codex_runtime.agent_state.session_total_cost + before_out = codex_runtime.agent_state.session_total_output_tokens + + response, _, _, _ = await available_runtime.execute("hi") + assert response == "answer" + assert codex_runtime.agent_state.session_total_cost == pytest.approx(before_cost + 0.05) + assert codex_runtime.agent_state.session_total_output_tokens == before_out + 340 + assert codex_runtime.agent_state.session_context_tokens == 9_000 + assert codex_runtime.agent_state.session_context_window == codex_runtime.CODEX_CONTEXT_WINDOW + + +# --------------------------------------------------------------------------- +# _execute_codex — the orchestration body, end-to-end, with subprocess.Popen +# stubbed. Never spawns the real codex CLI; the fake writes the -o result file +# and emits a JSONL event stream the real reader threads + parser consume. +# --------------------------------------------------------------------------- + +class _FakePipe: + """Minimal stdout/stderr stand-in: readline() yields each line then ''.""" + + def __init__(self, lines): + self._it = iter(lines) + + def readline(self): + return next(self._it, "") + + def close(self): + pass + + +class _FakeRegistry: + def __init__(self): + self.registered = [] + self.unregistered = [] + + def register(self, execution_id, process, metadata=None): + self.registered.append(execution_id) + + def unregister(self, execution_id): + self.unregistered.append(execution_id) + + def publish_log_entry(self, execution_id, event): + pass + + +def _install_fake_codex( + monkeypatch, + *, + result_text, + stdout_events, + returncode=0, + extra_raw_lines=(), + wait_exc=None, +): + """Wire codex_runtime so _execute_codex runs its real body against a fake + subprocess. Returns the registry so the caller can assert register/unregister. + + ``extra_raw_lines`` are appended to stdout verbatim (un-JSON-encoded) to + exercise the reader's malformed-line tolerance. ``wait_exc``, when set, is + raised by ``process.wait`` to exercise the subprocess-timeout path. + """ + monkeypatch.setattr(codex_runtime, "_load_openai_api_key", lambda: "sk-test") + monkeypatch.setattr(codex_runtime, "_is_read_only", lambda: False) + monkeypatch.setattr(codex_runtime, "_load_guardrails", lambda: {}) + # Neutralize OS-level process-group operations — there is no real pgid. + monkeypatch.setattr(codex_runtime, "_capture_pgid", lambda proc: None) + monkeypatch.setattr(codex_runtime, "_terminate_process_group", lambda *a, **k: None) + monkeypatch.setattr(codex_runtime, "_safe_close_pipes", lambda *a, **k: None) + + def _drain(process, t_out, t_err, grace=5, pgid=None, execution_tag=None): + # Join the reader threads so parsed state is settled before we read it. + t_out.join(timeout=5) + t_err.join(timeout=5) + + monkeypatch.setattr(codex_runtime, "_drain_bounded", _drain) + + registry = _FakeRegistry() + monkeypatch.setattr(codex_runtime, "get_process_registry", lambda: registry) + + class _FakePopen: + def __init__(self, cmd, **kwargs): + self.cmd = cmd + self.pid = 4242 + self.returncode = returncode + # The real codex writes the -o file; emulate that for the happy path. + if result_text is not None: + oidx = cmd.index("-o") + Path(cmd[oidx + 1]).write_text(result_text) + self.stdout = _FakePipe( + [json.dumps(e) + "\n" for e in stdout_events] + list(extra_raw_lines) + ) + self.stderr = _FakePipe([]) + + def wait(self, timeout=None): + if wait_exc is not None: + raise wait_exc + return self.returncode + + def poll(self): + return self.returncode + + monkeypatch.setattr(codex_runtime.subprocess, "Popen", _FakePopen) + return registry + + +@pytest.mark.asyncio +async def test_execute_codex_body_happy_path(tmp_path, monkeypatch): + """The -o file is the authoritative response; tokens/cost/session_id come + from the JSONL stream; the result file is unlinked and the registry is + register/unregister-balanced.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + events = [ + {"type": "thread.started", "thread_id": "thr_live_42"}, + { + "type": "turn.completed", + "usage": { + "input_tokens": 1200, + "cached_input_tokens": 200, + "output_tokens": 340, + }, + }, + ] + registry = _install_fake_codex( + monkeypatch, result_text="FINAL ANSWER FROM -o FILE", stdout_events=events + ) + + rt = CodexRuntime() + response, log, metadata, raw, session_id = await rt._execute_codex( + prompt="do it", + model="gpt-5.1-codex", + system_prompt="PLATFORM", + resume_thread_id=None, + timeout_seconds=30, + allowed_tools=None, + execution_id="exec_body_1", + concurrent_reader=True, + ) + + assert response == "FINAL ANSWER FROM -o FILE" + assert metadata.input_tokens == 1200 + assert metadata.output_tokens == 340 + assert metadata.cache_read_tokens == 200 + assert metadata.cost_usd and metadata.cost_usd > 0 + assert metadata.status == "success" + assert session_id == "thr_live_42" + # finally: result file consumed, registry balanced. + assert not (tmp_path / "codex" / "exec_body_1-last.txt").exists() + assert registry.registered == ["exec_body_1"] + assert registry.unregistered == ["exec_body_1"] + + +@pytest.mark.asyncio +async def test_execute_codex_body_nonzero_exit_classifies_failure(tmp_path, monkeypatch): + """A non-zero exit routes through _classify_codex_failure → HTTPException, + and the result file is still cleaned up in finally.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + registry = _install_fake_codex( + monkeypatch, + result_text=None, # codex wrote nothing + stdout_events=[{"type": "thread.started", "thread_id": "thr_x"}], + returncode=1, + ) + rt = CodexRuntime() + with pytest.raises(HTTPException) as exc_info: + await rt._execute_codex( + prompt="boom", + model=None, + system_prompt=None, + resume_thread_id=None, + timeout_seconds=30, + allowed_tools=None, + execution_id="exec_body_2", + concurrent_reader=False, + ) + # Generic failure (no auth/rate markers) → 500, never 503. + assert exc_info.value.status_code == 500 + assert registry.unregistered == ["exec_body_2"] + + +@pytest.mark.asyncio +async def test_execute_codex_body_missing_key_is_503(tmp_path, monkeypatch): + """No API key resolvable → 503 before any subprocess is spawned.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + monkeypatch.setattr(codex_runtime, "_load_openai_api_key", lambda: None) + rt = CodexRuntime() + with pytest.raises(HTTPException) as exc_info: + await rt._execute_codex( + prompt="x", + model=None, + system_prompt=None, + resume_thread_id=None, + timeout_seconds=30, + allowed_tools=None, + execution_id="exec_body_3", + concurrent_reader=False, + ) + assert exc_info.value.status_code == 503 + + +@pytest.mark.asyncio +async def test_execute_headless_happy_path_through_real_body(tmp_path, monkeypatch): + """execute_headless() drives the real _execute_codex body (concurrent reader) + and returns the -o response + thread id.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + monkeypatch.setattr(CodexRuntime, "is_available", lambda self: True) + _install_fake_codex( + monkeypatch, + result_text="HEADLESS RESULT", + stdout_events=[ + {"type": "thread.started", "thread_id": "thr_headless"}, + {"type": "turn.completed", "usage": {"input_tokens": 5, "output_tokens": 2}}, + ], + ) + rt = CodexRuntime() + response, log, metadata, session_id = await rt.execute_headless( + prompt="task", execution_id="exec_hl_1", timeout_seconds=30 + ) + assert response == "HEADLESS RESULT" + assert session_id == "thr_headless" + + +@pytest.mark.asyncio +async def test_execute_headless_warns_on_images_and_max_turns(available_runtime, monkeypatch, caplog): + """images are unsupported (warned + ignored) and max_turns has no CLI flag + (info-logged), but neither aborts the run.""" + captured = {} + + async def _fake(**kw): + captured.update(kw) + return ("ok", [], ExecutionMetadata(), [], "thr") + + monkeypatch.setattr(available_runtime, "_execute_codex", _fake) + caplog.set_level(logging.INFO, logger="agent_server.services.codex_runtime") + await available_runtime.execute_headless( + prompt="t", images=[{"data": "x"}], max_turns=5 + ) + text = " ".join(r.getMessage() for r in caplog.records) + assert "images are not supported" in text + assert "max_turns" in text + + +# --------------------------------------------------------------------------- +# CODEX_HOME resolution — explicit env wins; else under $TMPDIR; else the home +# default. Kept out of the git-tracked repo so codex state never dirties sync. +# --------------------------------------------------------------------------- + +def test_codex_home_prefers_explicit_env(monkeypatch): + monkeypatch.setenv("CODEX_HOME", "/somewhere/codex-home") + assert codex_runtime._codex_home() == "/somewhere/codex-home" + + +def test_codex_home_falls_back_to_tmpdir(monkeypatch): + monkeypatch.delenv("CODEX_HOME", raising=False) + monkeypatch.setenv("TMPDIR", "/scratch/tmp") + assert codex_runtime._codex_home() == "/scratch/tmp/codex" + + +def test_codex_home_falls_back_to_agent_home_tmp(monkeypatch): + monkeypatch.delenv("CODEX_HOME", raising=False) + monkeypatch.delenv("TMPDIR", raising=False) + # _AGENT_HOME/.tmp/codex when neither env is set. + assert codex_runtime._codex_home().endswith("/.tmp/codex") + + +def test_ensure_codex_home_creates_dir(tmp_path, monkeypatch): + target = tmp_path / "ch" + monkeypatch.setenv("CODEX_HOME", str(target)) + assert codex_runtime._ensure_codex_home() == str(target) + assert target.is_dir() + + +# --------------------------------------------------------------------------- +# Parser/tracking resilience: an item with no resolvable type is skipped; a +# failure inside best-effort activity tracking never breaks parsing. +# --------------------------------------------------------------------------- + +def test_parser_item_without_type_is_skipped(): + events = [ + {"type": "thread.started", "thread_id": "thr_nt"}, + {"type": "item.completed", "item": {"id": "x"}}, # no 'type' / details.type + {"type": "turn.completed", "usage": {"input_tokens": 1, "output_tokens": 1}}, + ] + _, log, metadata, _ = parse_codex_jsonl(_events_to_lines(events)) + assert metadata.tool_count == 0 + assert not log + + +def test_parser_survives_activity_tracking_exceptions(monkeypatch): + """start/complete_tool_execution are best-effort — a raise inside them is + swallowed and the tool log entries are still recorded.""" + + def _boom(*a, **k): + raise RuntimeError("activity sink down") + + monkeypatch.setattr(codex_runtime, "start_tool_execution", _boom) + monkeypatch.setattr(codex_runtime, "complete_tool_execution", _boom) + events = [ + {"type": "thread.started", "thread_id": "thr_at"}, + { + "type": "item.completed", + "item": { + "id": "cmd_at", + "type": "command_execution", + "command": "ls", + "exit_code": 0, + "status": "completed", + "aggregated_output": "ok", + }, + }, + ] + _, log, _, _ = parse_codex_jsonl(_events_to_lines(events)) # must not raise + assert any(e.type == "tool_use" for e in log) + assert any(e.type == "tool_result" for e in log) + + +# --------------------------------------------------------------------------- +# _execute_codex — additional body branches: malformed stdout lines are +# tolerated; an empty -o file with no JSONL parts yields a sentinel response; +# a subprocess wall-clock timeout maps to 504. +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_execute_codex_body_tolerates_malformed_stdout(tmp_path, monkeypatch): + """A non-JSON line and a non-dict JSON line on stdout are skipped; the run + still completes off the -o file.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + _install_fake_codex( + monkeypatch, + result_text="STILL FINE", + stdout_events=[{"type": "thread.started", "thread_id": "thr_mal"}], + extra_raw_lines=["this is not json\n", "12345\n"], # garbage + non-dict + ) + rt = CodexRuntime() + response, _, _, _, _ = await rt._execute_codex( + prompt="p", model=None, system_prompt=None, resume_thread_id=None, + timeout_seconds=30, allowed_tools=None, execution_id="exec_mal", + concurrent_reader=False, + ) + assert response == "STILL FINE" + + +@pytest.mark.asyncio +async def test_execute_codex_body_empty_response_sentinel(tmp_path, monkeypatch): + """Empty -o file + no agent_message parts + no tools → '(No response from + Codex)' rather than an empty string.""" + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + _install_fake_codex( + monkeypatch, + result_text="", # codex produced an empty result file + stdout_events=[ + {"type": "thread.started", "thread_id": "thr_empty"}, + {"type": "turn.completed", "usage": {"input_tokens": 1, "output_tokens": 0}}, + ], + ) + rt = CodexRuntime() + response, _, _, _, _ = await rt._execute_codex( + prompt="p", model=None, system_prompt=None, resume_thread_id=None, + timeout_seconds=30, allowed_tools=None, execution_id="exec_empty", + concurrent_reader=False, + ) + assert response == "(No response from Codex)" + + +@pytest.mark.asyncio +async def test_execute_codex_body_subprocess_timeout_is_504(tmp_path, monkeypatch): + """A subprocess wall-clock timeout (process.wait raises TimeoutExpired) maps + to HTTP 504 and still unregisters the execution.""" + import subprocess as _sp + + monkeypatch.setenv("CODEX_HOME", str(tmp_path)) + registry = _install_fake_codex( + monkeypatch, + result_text=None, + stdout_events=[{"type": "thread.started", "thread_id": "thr_to"}], + wait_exc=_sp.TimeoutExpired(cmd="codex", timeout=1), + ) + rt = CodexRuntime() + with pytest.raises(HTTPException) as exc_info: + await rt._execute_codex( + prompt="p", model=None, system_prompt=None, resume_thread_id=None, + timeout_seconds=1, allowed_tools=None, execution_id="exec_to", + concurrent_reader=False, + ) + assert exc_info.value.status_code == 504 + assert registry.unregistered == ["exec_to"] + + +@pytest.mark.asyncio +async def test_execute_chat_timeout_maps_to_504(available_runtime, monkeypatch): + """execute() maps a bare TimeoutError from the body to HTTP 504.""" + + async def _raise_timeout(**_kw): + raise TimeoutError("slow") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_timeout) + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute("hi") + assert exc_info.value.status_code == 504 + + +@pytest.mark.asyncio +async def test_execute_headless_timeout_maps_to_504(available_runtime, monkeypatch): + async def _raise_timeout(**_kw): + raise TimeoutError("slow") + + monkeypatch.setattr(available_runtime, "_execute_codex", _raise_timeout) + with pytest.raises(HTTPException) as exc_info: + await available_runtime.execute_headless(prompt="hi") + assert exc_info.value.status_code == 504 diff --git a/tests/unit/test_codex_skips_subscription_assign.py b/tests/unit/test_codex_skips_subscription_assign.py new file mode 100644 index 00000000..dd6529b7 --- /dev/null +++ b/tests/unit/test_codex_skips_subscription_assign.py @@ -0,0 +1,41 @@ +"""Non-Claude runtimes must not get a Claude subscription (#1187 decision 7). + +Subscriptions are Claude-OAuth tokens (CLAUDE_CODE_OAUTH_TOKEN). Both the +create-time auto-assign (``crud.py``) and the recreate-time auth juggle +(``lifecycle.py``) gate on ``is_claude_runtime`` — the shared decision tested +here. If this returns True for ``codex``/``gemini`` a Codex agent would get a +persisted ``subscription_id`` (``has_subscription=True``) and a spurious Claude +token injected on every create/recreate. + +The full create/recreate flow is exercised end-to-end in /verify-local; this +unit test pins the gating predicate that both call sites share. +""" + +from __future__ import annotations + +from services.agent_service.helpers import is_claude_runtime + + +def test_claude_variants_are_claude(): + assert is_claude_runtime("claude-code") is True + assert is_claude_runtime("claude") is True + # case-insensitive — labels/env may vary in case + assert is_claude_runtime("Claude-Code") is True + + +def test_unset_or_empty_defaults_to_claude(): + """Back-compat: an unset/empty runtime is the Claude default, so existing + agents keep their subscription behavior.""" + assert is_claude_runtime(None) is True + assert is_claude_runtime("") is True + + +def test_codex_is_not_claude(): + """The load-bearing assertion: a Codex agent must NOT be assigned a Claude + subscription / injected a Claude OAuth token.""" + assert is_claude_runtime("codex") is False + + +def test_gemini_is_not_claude(): + assert is_claude_runtime("gemini-cli") is False + assert is_claude_runtime("gemini") is False diff --git a/tests/unit/test_platform_prompt_runtime.py b/tests/unit/test_platform_prompt_runtime.py new file mode 100644 index 00000000..53625f3f --- /dev/null +++ b/tests/unit/test_platform_prompt_runtime.py @@ -0,0 +1,105 @@ +"""Runtime-aware platform prompt tests (#1187 F-MCP). + +``PLATFORM_INSTRUCTIONS`` documents the Trinity MCP tools with Claude Code's +``mcp__trinity__`` naming. Codex auto-discovers MCP tools from the +configured ``trinity`` server and calls them by their bare names — the +Claude-only prefix makes a Codex model emit ``mcp__trinity`` → "unknown MCP +server". So the platform prompt must be runtime-aware: + + * ``runtime="codex"`` → no ``mcp__trinity__`` prefix anywhere, plus a short + Codex-specific orientation note. + * ``runtime="claude-code"`` (and Gemini / unknown / default) → the canonical + text, unchanged. +""" + +from __future__ import annotations + +import pytest + +from services import platform_prompt_service +from services.platform_prompt_service import ( + PLATFORM_INSTRUCTIONS, + compose_system_prompt, + get_platform_system_prompt, +) + + +@pytest.fixture(autouse=True) +def _no_custom_prompt(monkeypatch): + """Pin the operator-configurable ``trinity_prompt`` to empty so these tests + exercise only the runtime branching, independent of DB state.""" + monkeypatch.setattr( + platform_prompt_service.db, "get_setting_value", lambda *a, **k: None + ) + + +# --------------------------------------------------------------------------- +# get_platform_system_prompt +# --------------------------------------------------------------------------- + +def test_claude_prompt_keeps_mcp_trinity_prefix(): + """The default (Claude) prompt is unchanged — it still documents the tools + with the canonical mcp__trinity__ prefix.""" + prompt = get_platform_system_prompt("claude-code") + assert "mcp__trinity__list_agents" in prompt + assert "mcp__trinity__chat_with_agent" in prompt + assert "mcp__trinity__share_file" in prompt + assert "mcp__trinity__write_user_memory" in prompt + # No-arg call preserves the historical default behavior. + assert get_platform_system_prompt() == prompt + + +def test_codex_prompt_omits_mcp_trinity_prefix(): + """The Codex prompt must contain NO mcp__trinity__ token anywhere — that + prefix is what made Codex emit `unknown MCP server`.""" + prompt = get_platform_system_prompt("codex") + assert "mcp__trinity__" not in prompt + # The bare tool names survive so the model still knows what to call. + assert "list_agents" in prompt + assert "chat_with_agent" in prompt + assert "share_file" in prompt + assert "write_user_memory" in prompt + + +def test_codex_prompt_includes_codex_orientation(): + """A Codex-specific orientation note is present so the model knows the + `trinity` MCP server is configured and tools are called by bare name.""" + prompt = get_platform_system_prompt("codex") + assert "Codex" in prompt + assert "trinity" in prompt # references the configured MCP server name + # The claude prompt must NOT carry the Codex-only orientation. + assert "Codex" not in get_platform_system_prompt("claude-code") + + +def test_unknown_and_gemini_runtimes_keep_claude_naming(): + """Gemini and any unrecognized runtime fall back to the canonical Claude + naming (the plan's `default claude-code` behavior).""" + for runtime in ("gemini-cli", "gemini", "something-new", ""): + prompt = get_platform_system_prompt(runtime) + assert "mcp__trinity__list_agents" in prompt + + +# --------------------------------------------------------------------------- +# compose_system_prompt threads runtime through +# --------------------------------------------------------------------------- + +def test_compose_threads_runtime_to_platform_prompt(): + composed = compose_system_prompt(runtime="codex") + assert "mcp__trinity__" not in composed + + +def test_compose_default_runtime_is_claude(): + composed = compose_system_prompt() + assert "mcp__trinity__list_agents" in composed + + +# --------------------------------------------------------------------------- +# The source constant itself is Claude-flavored (transformation is non-mutating) +# --------------------------------------------------------------------------- + +def test_source_constant_unchanged_after_codex_render(): + """Rendering the Codex prompt must not mutate the shared module constant.""" + before = PLATFORM_INSTRUCTIONS + get_platform_system_prompt("codex") + assert PLATFORM_INSTRUCTIONS == before + assert "mcp__trinity__" in PLATFORM_INSTRUCTIONS diff --git a/tests/unit/test_runtime_capabilities_matrix.py b/tests/unit/test_runtime_capabilities_matrix.py new file mode 100644 index 00000000..3a6c9e44 --- /dev/null +++ b/tests/unit/test_runtime_capabilities_matrix.py @@ -0,0 +1,58 @@ +"""Cross-runtime capability matrix (#1187 Phase G). + +Each runtime declares honest capabilities so callers (Session-tab gate, frontend +tab visibility, cost-label rendering) can gate on a capability instead of +branching on the runtime name. The ABC default is conservative so a future +runtime that forgets to override is treated as the least-capable. +""" + +from __future__ import annotations + +from agent_server.services.runtime_adapter import AgentRuntime +from agent_server.services.codex_runtime import CodexRuntime +from agent_server.services.claude_code import ClaudeCodeRuntime +from agent_server.services.gemini_runtime import GeminiRuntime + + +def test_default_capabilities_are_conservative(): + caps = AgentRuntime.capabilities() + assert caps.chat_continuity is False + assert caps.session_tab_resume is False + assert caps.mcp_support is False + assert caps.cost_reporting == "estimated" + + +def test_claude_is_the_reference_runtime(): + caps = ClaudeCodeRuntime.capabilities() + assert caps.chat_continuity is True + assert caps.session_tab_resume is True # the Session tab is Claude's machinery + assert caps.mcp_support is True + assert caps.cost_reporting == "native" # Claude emits total_cost_usd + + +def test_gemini_has_continuity_but_no_session_resume(): + caps = GeminiRuntime.capabilities() + assert caps.chat_continuity is True + assert caps.session_tab_resume is False # execute_headless ignores resume + assert caps.cost_reporting == "estimated" + + +def test_codex_matches_gemini_shape_for_resume_and_cost(): + caps = CodexRuntime.capabilities() + assert caps.chat_continuity is True # codex exec resume + assert caps.session_tab_resume is False # MVP: Session tab stays Claude/Gemini + assert caps.mcp_support is True + assert caps.cost_reporting == "estimated" + + +def test_capabilities_to_dict_is_serializable_for_callers(): + """to_dict() is what the backend/frontend serialize to gate UI — every flag + must round-trip as a plain JSON-friendly dict.""" + caps = CodexRuntime.capabilities() + d = caps.to_dict() + assert d == { + "chat_continuity": True, + "session_tab_resume": False, + "mcp_support": True, + "cost_reporting": "estimated", + } diff --git a/tests/unit/test_runtime_factory_codex.py b/tests/unit/test_runtime_factory_codex.py new file mode 100644 index 00000000..0708f12b --- /dev/null +++ b/tests/unit/test_runtime_factory_codex.py @@ -0,0 +1,43 @@ +"""Runtime factory tests for the Codex runtime + unknown-runtime validation (#1187). + +``get_runtime()`` must: + * return a CodexRuntime when AGENT_RUNTIME=codex, + * keep defaulting to Claude when AGENT_RUNTIME is unset (back-compat), + * but RAISE on an explicitly-set unknown value instead of silently selecting + Claude — a typo'd runtime should fail loudly, not run the wrong engine. +""" + +from __future__ import annotations + +import pytest + +from agent_server.services.runtime_adapter import ( # noqa: E402 + KNOWN_RUNTIMES, + get_runtime, +) +from agent_server.services.codex_runtime import CodexRuntime # noqa: E402 + + +def test_runtime_factory_codex(monkeypatch): + monkeypatch.setenv("AGENT_RUNTIME", "codex") + assert isinstance(get_runtime(), CodexRuntime) + + +def test_runtime_factory_unknown_raises(monkeypatch): + monkeypatch.setenv("AGENT_RUNTIME", "totally-bogus-runtime") + with pytest.raises(ValueError) as exc_info: + get_runtime() + # The error must name the offending value and not silently fall back. + assert "totally-bogus-runtime" in str(exc_info.value) + + +def test_runtime_factory_default_is_claude(monkeypatch): + """Env unset → Claude Code (unchanged back-compat). The empty-default path + must NOT raise.""" + monkeypatch.delenv("AGENT_RUNTIME", raising=False) + runtime = get_runtime() + assert runtime.__class__.__name__ == "ClaudeCodeRuntime" + + +def test_codex_is_a_known_runtime(): + assert "codex" in KNOWN_RUNTIMES diff --git a/tests/unit/test_runtime_label_fast_path.py b/tests/unit/test_runtime_label_fast_path.py new file mode 100644 index 00000000..c95c719b --- /dev/null +++ b/tests/unit/test_runtime_label_fast_path.py @@ -0,0 +1,64 @@ +"""Fast-path runtime-label read (#1187 review I6). + +``list_all_agents_fast()`` builds AgentStatus from container LABELS only (no +``container.attrs`` inspect). It must read the runtime from the label that is +actually written at create time (``trinity.agent-runtime``, see +``crud.py``) — not ``trinity.runtime``, which is never written and so always +reports ``claude-code``, making the RuntimeBadge wrong for Codex/Gemini agents +in every fast-path view (monitoring, ops, telemetry). +""" + +from __future__ import annotations + +from types import SimpleNamespace + +from services import docker_service + + +class _FakeContainers: + def __init__(self, containers): + self._containers = containers + + def list(self, **_kwargs): + return self._containers + + +class _FakeClient: + def __init__(self, containers): + self.containers = _FakeContainers(containers) + + +def _fake_container(labels: dict): + return SimpleNamespace( + labels=labels, + name="agent-demo", + status="running", + id="container123", + ) + + +def test_fast_path_reads_agent_runtime_label(monkeypatch): + labels = { + "trinity.platform": "agent", + "trinity.agent-type": "custom", + "trinity.agent-runtime": "codex", + } + monkeypatch.setattr( + docker_service, "docker_client", _FakeClient([_fake_container(labels)]) + ) + + agents = docker_service.list_all_agents_fast() + + assert len(agents) == 1 + assert agents[0].runtime == "codex" + + +def test_fast_path_defaults_to_claude_when_label_absent(monkeypatch): + labels = {"trinity.platform": "agent", "trinity.agent-type": "custom"} + monkeypatch.setattr( + docker_service, "docker_client", _FakeClient([_fake_container(labels)]) + ) + + agents = docker_service.list_all_agents_fast() + + assert agents[0].runtime == "claude-code" diff --git a/tests/unit/test_runtime_resolution_backend.py b/tests/unit/test_runtime_resolution_backend.py new file mode 100644 index 00000000..05e5f156 --- /dev/null +++ b/tests/unit/test_runtime_resolution_backend.py @@ -0,0 +1,87 @@ +"""Backend runtime resolution (#1187 F-MCP). + +The backend resolves an agent's runtime to make the platform system prompt +runtime-aware (Codex strips the Claude-only ``mcp__trinity__`` tool-name +prefix). Two seams cover this, both **best-effort, never-raise, Claude-default**: + + * ``docker_service.get_agent_runtime(name)`` reads the ``trinity.agent-runtime`` + container label, and + * ``task_execution_service._resolve_agent_runtime(name)`` wraps it behind a + guarded local import so a unit-test stub of ``services.docker_service`` (or a + missing symbol / Docker outage) can never block dispatch. + +Neither may raise or block dispatch — any failure falls back to ``claude-code``, +preserving the historical Claude/Gemini prompt naming. +""" + +from __future__ import annotations + +import sys +import types +from types import SimpleNamespace + +from services import docker_service +from services.task_execution_service import _resolve_agent_runtime + + +# --------------------------------------------------------------------------- +# docker_service.get_agent_runtime +# --------------------------------------------------------------------------- + +def test_get_agent_runtime_reads_label(monkeypatch): + container = SimpleNamespace(labels={"trinity.agent-runtime": "codex"}) + monkeypatch.setattr(docker_service, "get_agent_container", lambda name: container) + assert docker_service.get_agent_runtime("demo") == "codex" + + +def test_get_agent_runtime_defaults_when_label_absent(monkeypatch): + container = SimpleNamespace(labels={"trinity.platform": "agent"}) + monkeypatch.setattr(docker_service, "get_agent_container", lambda name: container) + assert docker_service.get_agent_runtime("demo") == "claude-code" + + +def test_get_agent_runtime_defaults_when_container_missing(monkeypatch): + monkeypatch.setattr(docker_service, "get_agent_container", lambda name: None) + assert docker_service.get_agent_runtime("ghost") == "claude-code" + + +def test_get_agent_runtime_never_raises(monkeypatch): + """A Docker hiccup mid-read (labels access throws) falls back to claude-code, + never propagating — runtime resolution must not block dispatch.""" + + class _RaisingLabels: + def get(self, *a, **k): + raise RuntimeError("docker hiccup") + + monkeypatch.setattr( + docker_service, + "get_agent_container", + lambda name: SimpleNamespace(labels=_RaisingLabels()), + ) + assert docker_service.get_agent_runtime("demo") == "claude-code" + + +# --------------------------------------------------------------------------- +# task_execution_service._resolve_agent_runtime — the guarded wrapper +# --------------------------------------------------------------------------- + +def test_resolve_agent_runtime_delegates_to_docker_service(monkeypatch): + # _resolve_agent_runtime does a *local* `from services.docker_service import + # get_agent_runtime` at call time, reading sys.modules. Other unit tests + # (e.g. test_voice_*) install a MagicMock at sys.modules["services.docker_ + # service"] that the conftest restore doesn't clear, so patching a module + # reference captured at import time misses what the function actually reads. + # Control sys.modules directly (auto-restored) — bulletproof regardless of + # any leaked stub (the documented module-identity gotcha). + fake_mod = types.SimpleNamespace(get_agent_runtime=lambda name: "codex") + monkeypatch.setitem(sys.modules, "services.docker_service", fake_mod) + assert _resolve_agent_runtime("demo") == "codex" + + +def test_resolve_agent_runtime_falls_back_on_error(monkeypatch): + """If the docker_service lookup fails (a partial stub lacking the symbol, or + a Docker outage), resolution degrades to claude-code rather than raising.""" + # A module object with NO get_agent_runtime → the local from-import raises + # ImportError, which the guard swallows (the "partial stub" scenario). + monkeypatch.setitem(sys.modules, "services.docker_service", types.SimpleNamespace()) + assert _resolve_agent_runtime("demo") == "claude-code" diff --git a/tests/unit/test_session_tab_gate_codex.py b/tests/unit/test_session_tab_gate_codex.py new file mode 100644 index 00000000..920dff30 --- /dev/null +++ b/tests/unit/test_session_tab_gate_codex.py @@ -0,0 +1,90 @@ +"""Session-tab runtime gate (#1187 Phase H). + +The cached-UUID ``--resume`` turn is gated so a Codex agent runs a stateless +turn instead. The gate must: + * recognize codex as a non-resume runtime, + * leave Claude (and Gemini, in the MVP) resume-capable, + * fail safe (assume resume-capable) on any Docker lookup hiccup. +""" + +from __future__ import annotations + +import importlib.util +import types +from pathlib import Path + + +# `from routers import sessions` would execute routers/__init__.py, which eagerly +# imports all 50+ routers — including routers/agents.py → `from +# services.agent_service import get_agents_by_prefix`. Sibling unit tests +# (e.g. test_inject_assigned_credentials.py) install a stub `services.agent_service` +# in sys.modules at collection time, so under `-p randomly` that broad import +# raises ImportError while *this* module is being collected (the #1187 +# regression-diff failure). Exec'ing sessions.py in isolation — whose own +# dependency chain never reaches services.agent_service — sidesteps the +# pollution without mutating sys.modules. Absolute imports inside sessions.py +# resolve via sys.path (conftest puts src/backend on it). +def _load_sessions_router() -> types.ModuleType: + for base in ( + Path(__file__).resolve().parents[2] / "src" / "backend", # host / CI + Path("/app"), # trinity-backend container + ): + path = base / "routers" / "sessions.py" + if path.exists(): + spec = importlib.util.spec_from_file_location( + "routers_sessions_under_test", str(path) + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) # type: ignore[union-attr] + return module + raise RuntimeError("Cannot locate routers/sessions.py") + + +sessions = _load_sessions_router() + + +def _status(runtime): + return types.SimpleNamespace(runtime=runtime) + + +def test_codex_in_no_resume_constant(): + assert "codex" in sessions.RUNTIMES_WITHOUT_SESSION_TAB_RESUME + + +def test_supports_resume_false_for_codex(monkeypatch): + monkeypatch.setattr(sessions, "get_agent_container", lambda name: object()) + monkeypatch.setattr( + sessions, "get_agent_status_from_container", lambda c: _status("codex") + ) + assert sessions._supports_session_tab_resume("a") is False + + +def test_supports_resume_true_for_claude(monkeypatch): + monkeypatch.setattr(sessions, "get_agent_container", lambda name: object()) + monkeypatch.setattr( + sessions, "get_agent_status_from_container", lambda c: _status("claude-code") + ) + assert sessions._supports_session_tab_resume("a") is True + + +def test_supports_resume_true_for_gemini_in_mvp(monkeypatch): + """Only codex is gated in the MVP — Gemini keeps its (existing) Session tab.""" + monkeypatch.setattr(sessions, "get_agent_container", lambda name: object()) + monkeypatch.setattr( + sessions, "get_agent_status_from_container", lambda c: _status("gemini-cli") + ) + assert sessions._supports_session_tab_resume("a") is True + + +def test_supports_resume_true_when_container_missing(monkeypatch): + monkeypatch.setattr(sessions, "get_agent_container", lambda name: None) + assert sessions._supports_session_tab_resume("a") is True + + +def test_supports_resume_defaults_true_on_lookup_failure(monkeypatch): + def _boom(name): + raise RuntimeError("docker socket down") + + monkeypatch.setattr(sessions, "get_agent_container", _boom) + # Must not raise, and must fail safe to resume-capable. + assert sessions._supports_session_tab_resume("a") is True diff --git a/tests/unit/test_validate_runtime.py b/tests/unit/test_validate_runtime.py new file mode 100644 index 00000000..4e818fb2 --- /dev/null +++ b/tests/unit/test_validate_runtime.py @@ -0,0 +1,90 @@ +"""Create-time runtime validation (#1187 review). + +``create_agent_internal`` calls ``validate_runtime(config.runtime)`` after the +runtime is finalized (request value, possibly overridden by the template). A +known runtime (or the unset default) passes; a typo'd one raises HTTP 400 here +instead of letting the agent container crash-loop on boot when the agent-side +``get_runtime()`` can't resolve ``AGENT_RUNTIME``. +""" + +from __future__ import annotations + +import sys + +import pytest +from fastapi import HTTPException + +# Cross-file sys.modules hygiene (#1187). Sibling unit modules +# (test_inject_assigned_credentials, test_start_agent_skip_inject, +# test_subscription_auto_switch_no_cred_import) replace +# ``services.agent_service[.helpers]`` with Mocks in ``sys.modules`` at +# module-collection time and never restore them; the unit conftest's +# baseline-restore covers neither key (not in ``_SYS_MODULES_BASELINE_VALUES`` +# nor ``_POP_PREFIXES``). Under pytest-randomly that Mock can be resident when +# THIS module is imported, making the ``from``-import below bind +# ``KNOWN_RUNTIME_NAMES``/``validate_runtime`` to Mock attributes ("'Mock' +# object is not iterable"). Evict the stubbed subtree first so we always bind +# the real module from disk, independent of collection order. +_STUBBED_MODULE_NAMES = [ + "services.agent_service.helpers", + "services.agent_service.read_only", + "services.agent_service.file_sharing", + "services.agent_service", +] + +for _stub in _STUBBED_MODULE_NAMES: # import-time eviction (monkeypatch can't reach) + sys.modules.pop(_stub, None) + +from services.agent_service.helpers import ( # noqa: E402 (must follow the stub eviction above) + KNOWN_RUNTIME_NAMES, + validate_runtime, +) + + +@pytest.fixture(autouse=True) +def _restore_sys_modules(): + """Snapshot/restore the stubbed subtree around each test. + + Pairs with ``_STUBBED_MODULE_NAMES`` to form the sanctioned snapshot/restore + escape hatch the sys.modules lint recognizes (precedent: + tests/unit/test_telegram_webhook_backfill.py), and prevents this module's + import-time eviction from leaking the unstubbed real modules into the + collection state other files depend on. + """ + saved = {name: sys.modules.get(name) for name in _STUBBED_MODULE_NAMES} + try: + yield + finally: + for name, value in saved.items(): + if value is None: + sys.modules.pop(name, None) + else: + sys.modules[name] = value + + +def test_known_runtimes_pass(): + for runtime in KNOWN_RUNTIME_NAMES: + validate_runtime(runtime) # must not raise + # case-insensitive — template/env casing varies + validate_runtime("Codex") + validate_runtime("Claude-Code") + + +def test_unset_or_empty_is_valid_default(): + """Back-compat: unset/empty is the Claude default, not an error.""" + validate_runtime(None) + validate_runtime("") + + +def test_unknown_runtime_raises_400(): + with pytest.raises(HTTPException) as exc: + validate_runtime("codez") # typo + assert exc.value.status_code == 400 + assert "codez" in exc.value.detail + + +def test_codex_is_a_known_runtime(): + """Guard against the backend list drifting from the agent-side + ``runtime_adapter.KNOWN_RUNTIMES`` — codex must be accepted.""" + assert "codex" in KNOWN_RUNTIME_NAMES + validate_runtime("codex")