diff --git a/docker/base-image/agent_server/services/headless_executor.py b/docker/base-image/agent_server/services/headless_executor.py index 3171d244..01f8d9ae 100644 --- a/docker/base-image/agent_server/services/headless_executor.py +++ b/docker/base-image/agent_server/services/headless_executor.py @@ -187,6 +187,32 @@ def _attempt_empty_result_recovery( return (status_code, body) +def _timeout_504_detail( + ctx: "HeadlessRunContext", + message: str, + termination_reason: str, + stalled_tool: Optional[str] = None, +) -> Dict: + """Structured 504 body for an agent-side timeout kill (#1094, #1201). + + Carries the telemetry accumulated before the kill (cost / context / tool + calls) under ``metadata`` — the same sanitized + ``ExecutionMetadata.model_dump()`` shape the #678 empty-result body uses — + so the backend's HTTPError salvage (`task_execution_service.py` HTTPError + branch) persists cost/context onto the timed-out FAILED row instead of + writing it bare. ``termination_reason`` ("max_duration" | "stall_no_output") + and the timeout-failure semantics are unchanged. + """ + detail: Dict = { + "message": message, + "termination_reason": termination_reason, + "metadata": sanitize_dict(ctx.metadata.model_dump()), + } + if stalled_tool is not None: + detail["stalled_tool"] = stalled_tool + return detail + + @dataclass class HeadlessRunContext: """State carrier for the headless execution lifecycle. @@ -1035,10 +1061,11 @@ async def execute_headless_task( # termination_reason see budget timeouts from either path. raise HTTPException( status_code=504, - detail={ - "message": f"Task execution timed out after {ctx.effective_timeout} seconds", - "termination_reason": "max_duration", - }, + detail=_timeout_504_detail( + ctx, + f"Task execution timed out after {ctx.effective_timeout} seconds", + "max_duration", + ), ) except subprocess.TimeoutExpired: # Inner process.wait() bounded out; tree has already been killed. @@ -1054,22 +1081,24 @@ async def execute_headless_task( ) raise HTTPException( status_code=504, - detail={ - "message": ( + detail=_timeout_504_detail( + ctx, + ( f"Killed: tool '{ctx.stalled_tool}' produced no output " f"for {_STALL_LIMIT_S:.0f}s (stall watchdog)" ), - "termination_reason": "stall_no_output", - "stalled_tool": ctx.stalled_tool, - }, + "stall_no_output", + stalled_tool=ctx.stalled_tool, + ), ) logger.error(f"[Headless Task] Task {ctx.task_session_id} timed out after {ctx.effective_timeout}s") raise HTTPException( status_code=504, - detail={ - "message": f"Task execution timed out after {ctx.effective_timeout} seconds", - "termination_reason": "max_duration", - }, + detail=_timeout_504_detail( + ctx, + f"Task execution timed out after {ctx.effective_timeout} seconds", + "max_duration", + ), ) except RuntimeError as e: # Permission mode validation failure — fast-fail with actionable error diff --git a/tests/unit/test_1201_timeout_telemetry_salvage.py b/tests/unit/test_1201_timeout_telemetry_salvage.py new file mode 100644 index 00000000..23968c1d --- /dev/null +++ b/tests/unit/test_1201_timeout_telemetry_salvage.py @@ -0,0 +1,113 @@ +"""Unit tests for #1201: agent-side timeout (504) must carry execution +telemetry so the backend can salvage cost/context onto the FAILED row. + +Before #1201, the 504 raised on an agent-side budget timeout / stall-kill +(`headless_executor.py`) carried only ``{message, termination_reason}``. The +cost / context / tool-call telemetry parsed before the kill (``ctx.metadata``) +was discarded, so the backend wrote a bare FAILED row — zero cost accounting +for exactly the long, tool-heavy runs that reach the timeout. + +Fix: the 504 detail now includes ``metadata`` = sanitized +``ExecutionMetadata.model_dump()`` (the #678 structured-error shape). The +backend's existing HTTPError salvage branch reads ``detail["metadata"]`` and +persists cost / context_used / context_max onto the FAILED row — unchanged. + +These tests pin the agent-side contract (`_timeout_504_detail`): the body +carries the telemetry in the exact shape the backend salvage consumes. + +Module under test: + docker/base-image/agent_server/services/headless_executor.py +""" +from __future__ import annotations + +import pytest + +# conftest.py preloads the real agent_server namespace package. +from agent_server.services import headless_executor as he # noqa: E402 +from agent_server.models import ExecutionMetadata # noqa: E402 + + +def _ctx(metadata: ExecutionMetadata) -> "he.HeadlessRunContext": + return he.HeadlessRunContext( + cmd=["claude", "--print"], + task_session_id="t-1201", + task_start_iso="2026-01-01T00:00:00Z", + effective_timeout=2400, + images=None, + prompt="hello", + metadata=metadata, + ) + + +def _populated_metadata() -> ExecutionMetadata: + return ExecutionMetadata( + cost_usd=0.4242, + input_tokens=1234, + output_tokens=99, + cache_read_tokens=500, + cache_creation_tokens=100, + context_window=200000, + tool_count=7, + num_turns=12, + ) + + +def test_max_duration_504_carries_metadata(): + ctx = _ctx(_populated_metadata()) + detail = he._timeout_504_detail( + ctx, "Task execution timed out after 2400 seconds", "max_duration" + ) + + assert detail["termination_reason"] == "max_duration" + assert detail["message"] == "Task execution timed out after 2400 seconds" + assert "stalled_tool" not in detail # only set for stall kills + + md = detail["metadata"] + assert md["cost_usd"] == pytest.approx(0.4242) # cost accounting preserved + assert md["input_tokens"] == 1234 + assert md["context_window"] == 200000 + assert md["tool_count"] == 7 + + +def test_stall_504_carries_metadata_and_tool(): + ctx = _ctx(_populated_metadata()) + detail = he._timeout_504_detail( + ctx, + "Killed: tool 'mcp__x__hang' produced no output for 300s (stall watchdog)", + "stall_no_output", + stalled_tool="mcp__x__hang", + ) + + assert detail["termination_reason"] == "stall_no_output" + assert detail["stalled_tool"] == "mcp__x__hang" + assert detail["metadata"]["cost_usd"] == pytest.approx(0.4242) + + +def test_metadata_shape_matches_backend_salvage_contract(): + """The metadata dict must carry every key the backend HTTPError salvage + reads (task_execution_service.py): cost_usd + the token fields + _compute_context_used consumes + context_window. Guards against an + ExecutionMetadata rename silently breaking salvage.""" + detail = he._timeout_504_detail(_ctx(_populated_metadata()), "m", "max_duration") + md = detail["metadata"] + for key in ( + "cost_usd", + "input_tokens", + "cache_read_tokens", + "cache_creation_tokens", + "context_window", + ): + assert key in md, f"backend salvage reads metadata['{key}']" + + # Mirror _compute_context_used: cache_read + cache_creation wins when > 0. + expected_context_used = md["cache_read_tokens"] + md["cache_creation_tokens"] + assert expected_context_used == 600 + + +def test_empty_metadata_still_well_formed(): + """A timeout before any telemetry parsed (default ExecutionMetadata) still + produces a valid body — salvage just yields null cost / None context.""" + detail = he._timeout_504_detail(_ctx(ExecutionMetadata()), "m", "max_duration") + assert detail["metadata"]["cost_usd"] is None + assert detail["metadata"]["input_tokens"] == 0 + assert detail["metadata"]["context_window"] == 200000 # model default