[feat][eval] Add Codex fallback and JSONL-based cost tracking for ACW modes

ayazhankadessova · ayazhankadessova · commit 817456af5ee7 · 2026-03-05T05:17:16.000+03:00
session.py: Add fallback_backend parameter to run_prompt() -- when primary
backend (e.g. Codex) exhausts retries, automatically retry with fallback
backend (e.g. Claude Opus), dropping provider-specific extra_flags.

pipeline.py: Restore consensus default to Codex (gpt-5.2-codex) with
Claude Opus as fallback_backend. Add Codex flags and fallback to
run_consensus_stage() for parity. Prevents empty-consensus failures
when Codex API key is missing or expired.

eval_harness.py: Add _snapshot_jsonl_usage() and _diff_jsonl_usage()
helpers that read ~/.claude/projects/**/*.jsonl session files to compute
token usage deltas. run_full_impl() now snapshots before/after the
pipeline to track cost for both full and impl modes, which previously
reported $0.00 because ACW does not return token data.
diff --git a/python/agentize/eval/eval_harness.py b/python/agentize/eval/eval_harness.py
@@ -103,6 +103,79 @@ def _parse_claude_usage(stdout: str, model: str) -> dict:
     return result
 
 
+# ---------------------------------------------------------------------------
+# JSONL-based cost tracking for ACW modes
+# ---------------------------------------------------------------------------
+
+def _snapshot_jsonl_usage() -> dict:
+    """Snapshot cumulative token totals from all Claude JSONL session files.
+
+    Reads ~/.claude/projects/**/*.jsonl and sums usage over every entry of
+    type "assistant", so before/after snapshots can be diffed to measure ACW
+    subprocess calls. Unreadable files and malformed entries are skipped.
+    Returns a dict with keys: input_tokens, output_tokens, cost_usd.
+    """
+    from agentize.usage import match_model_pricing
+
+    totals = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0.0}
+    projects_dir = Path.home() / ".claude" / "projects"
+    if not projects_dir.exists():
+        return totals
+
+    for jsonl_path in projects_dir.glob("**/*.jsonl"):
+        try:
+            with open(jsonl_path, "r", encoding="utf-8", errors="ignore") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        entry = json.loads(line)
+                        if entry.get("type") != "assistant":
+                            continue
+                        message = entry.get("message") or {}
+                        usage = message.get("usage") or {}
+                        inp = usage.get("input_tokens") or 0
+                        out = usage.get("output_tokens") or 0
+                        if max(inp, out) <= 0:  # also type-checks both counts
+                            continue
+                        totals["input_tokens"] += inp
+                        totals["output_tokens"] += out
+                        rates = match_model_pricing(message.get("model") or "")
+                        if rates:
+                            # input_tokens excludes cached tokens, so bill it in full.
+                            cache_read = usage.get("cache_read_input_tokens") or 0
+                            cache_write = usage.get("cache_creation_input_tokens") or 0
+                            totals["cost_usd"] += (
+                                inp * rates["input"] + out * rates["output"]
+                                + cache_read * rates["cache_read"]
+                                + cache_write * rates["cache_write"]
+                            ) / 1_000_000
+                    except (ValueError, KeyError, TypeError, AttributeError):
+                        continue  # malformed or unexpectedly shaped entry
+        except OSError:
+            continue
+
+    return totals
+
+
+def _diff_jsonl_usage(before: dict, after: dict) -> dict:
+    """Return the usage accrued between two JSONL snapshots.
+
+    Each of input_tokens, output_tokens and cost_usd is the after-minus-before
+    difference clamped at zero; tokens is the sum of the two token deltas.
+    """
+    def _grown(key, floor):
+        # Clamp so a shrinking total never yields a negative delta.
+        return max(floor, after[key] - before[key])
+
+    delta = {"input_tokens": _grown("input_tokens", 0)}
+    delta["output_tokens"] = _grown("output_tokens", 0)
+    delta["tokens"] = delta["input_tokens"] + delta["output_tokens"]
+    delta["cost_usd"] = _grown("cost_usd", 0.0)
+    return delta
+
+
 # ---------------------------------------------------------------------------
 # Task loading
 # ---------------------------------------------------------------------------
@@ -587,10 +660,10 @@ def run_full_impl(
     start_time = time.time()
     result = _make_result(instance_id)
 
-    # Estimate planning cost for full mode (ACW doesn't return token data).
-    # Uses model config: understander=sonnet, bold/critique/reducer/consensus=opus.
+    # Snapshot JSONL usage before running to measure ACW cost via delta
+    usage_before = _snapshot_jsonl_usage()
     if not skip_planning:
-        result["cost_note"] = "planning costs estimated; ACW subprocess has no token tracking"
+        result["cost_note"] = "planning+impl cost estimated via JSONL session delta"
 
     # Run the pipeline body in a daemon thread so we can enforce a timeout.
     # A daemon thread is killed when the main thread moves on; this is
@@ -626,6 +699,14 @@ def _run_pipeline():
         result["status"] = status_bucket[0] if status_bucket else "error"
         result["wall_time"] = time.time() - start_time
 
+    # Compute cost from JSONL delta
+    usage_after = _snapshot_jsonl_usage()
+    delta = _diff_jsonl_usage(usage_before, usage_after)
+    result["input_tokens"] = delta["input_tokens"]
+    result["output_tokens"] = delta["output_tokens"]
+    result["tokens"] = delta["tokens"]
+    result["cost_usd"] = delta["cost_usd"]
+
     return result
 
 
diff --git a/python/agentize/workflow/api/session.py b/python/agentize/workflow/api/session.py
@@ -165,6 +165,7 @@ def run_prompt(
         retry_delay: float = 0,
         input_path: str | Path | None = None,
         output_path: str | Path | None = None,
+        fallback_backend: tuple[str, str] | None = None,
     ) -> StageResult:
         input_path_resolved, output_path_resolved = self._resolve_paths(
             name, input_path, output_path
@@ -201,6 +202,36 @@ def run_prompt(
                 if attempt <= retry and retry_delay > 0:
                     time.sleep(retry_delay)
 
+        # If primary backend exhausted retries and a fallback is configured, try it
+        if fallback_backend and fallback_backend != backend:
+            self._log(
+                f"Stage '{name}' failed with {backend[0]}:{backend[1]}, "
+                f"falling back to {fallback_backend[0]}:{fallback_backend[1]}"
+            )
+            try:
+                self._write_prompt(prompt, input_path_resolved)
+                process = self._run_stage(
+                    name,
+                    fallback_backend,
+                    input_path_resolved,
+                    output_path_resolved,
+                    tools=tools,
+                    permission_mode=permission_mode,
+                    timeout=timeout,
+                    extra_flags=None,  # drop provider-specific flags
+                )
+                self._validate_output(name, output_path_resolved, process)
+                if self._log_output_dump:
+                    self._log(f"{name} dumped to {output_path_resolved}")
+                return StageResult(
+                    stage=name,
+                    input_path=input_path_resolved,
+                    output_path=output_path_resolved,
+                    process=process,
+                )
+            except Exception as exc:
+                last_error = exc
+
         raise PipelineError(name, attempts, last_error)
 
     def stage(
diff --git a/python/agentize/workflow/planner/pipeline.py b/python/agentize/workflow/planner/pipeline.py
@@ -37,7 +37,7 @@
     "bold": ("claude", "opus"),
     "critique": ("claude", "opus"),
     "reducer": ("claude", "opus"),
-    "consensus": ("claude", "opus"),
+    "consensus": ("codex", "gpt-5.2-codex"),
 }
 
 # Tool configurations per stage (Claude provider only)
@@ -268,6 +268,7 @@ def _write_consensus_prompt(path: Path) -> str:
         tools=STAGE_TOOLS.get("consensus"),
         permission_mode=STAGE_PERMISSION_MODE.get("consensus"),
         extra_flags=codex_flags,
+        fallback_backend=("claude", "opus"),
     )
 
     return results
@@ -315,12 +316,20 @@ def _write_consensus_prompt(path: Path) -> str:
         log_acw_command=True,
         log_output_dump=log_output_dump,
     )
+    consensus_provider = stage_backends["consensus"][0]
+    codex_flags = (
+        ["-s", "read-only", "--enable", "web_search_request",
+         "-c", "model_reasoning_effort=xhigh"]
+        if consensus_provider == "codex" else None
+    )
     return session.run_prompt(
         "consensus",
         _write_consensus_prompt,
         stage_backends["consensus"],
         tools=STAGE_TOOLS.get("consensus"),
         permission_mode=STAGE_PERMISSION_MODE.get("consensus"),
+        extra_flags=codex_flags,
+        fallback_backend=("claude", "opus"),
         input_path=input_path,
         output_path=output_path,
     )