Skip to content

Commit 817456a

Browse files
[feat][eval] Add Codex fallback and JSONL-based cost tracking for ACW modes
session.py: Add fallback_backend parameter to run_prompt() -- when primary backend (e.g. Codex) exhausts retries, automatically retry with fallback backend (e.g. Claude Opus), dropping provider-specific extra_flags. pipeline.py: Restore consensus default to Codex (gpt-5.2-codex) with Claude Opus as fallback_backend. Add Codex flags and fallback to run_consensus_stage() for parity. Prevents empty-consensus failures when Codex API key is missing or expired. eval_harness.py: Add _snapshot_jsonl_usage() and _diff_jsonl_usage() helpers that read ~/.claude/projects/**/*.jsonl session files to compute token usage deltas. run_full_impl() now snapshots before/after the pipeline to track cost for both full and impl modes, which previously reported $0.00 because ACW does not return token data.
1 parent 2bd2162 commit 817456a

3 files changed

Lines changed: 125 additions & 4 deletions

File tree

python/agentize/eval/eval_harness.py

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,79 @@ def _parse_claude_usage(stdout: str, model: str) -> dict:
103103
return result
104104

105105

106+
# ---------------------------------------------------------------------------
107+
# JSONL-based cost tracking for ACW modes
108+
# ---------------------------------------------------------------------------
109+
110+
def _snapshot_jsonl_usage() -> dict:
    """Snapshot cumulative token totals from all Claude JSONL session files.

    Walks ``~/.claude/projects/**/*.jsonl`` and sums the ``usage`` block of
    every entry whose ``type`` is ``"assistant"``. Taking one snapshot before
    a run and one after, then diffing them, measures the cost of ACW
    subprocess calls.

    The scan is best-effort. Unreadable files, lines that are not valid JSON,
    and entries with an unexpected shape (non-object line, missing or
    non-dict ``message``/``usage``, non-integer token counts) are skipped
    instead of aborting the whole snapshot.

    Because every session file under the directory is summed, a before/after
    delta also includes any other session written there in the meantime.

    Returns:
        A dict with keys ``input_tokens``, ``output_tokens`` and ``cost_usd``.
        All values are zero when the projects directory does not exist.
        Entries whose model has no pricing match contribute tokens but no
        cost.
    """
    totals = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0.0}
    projects_dir = Path.home() / ".claude" / "projects"
    if not projects_dir.exists():
        return totals

    # Imported only once there is something to scan, so the empty case does
    # not depend on the pricing module.
    from agentize.usage import match_model_pricing

    def _token_count(value) -> int:
        """Return ``value`` as a non-negative int; anything else counts as 0."""
        if isinstance(value, bool) or not isinstance(value, int):
            return 0
        return max(0, value)

    for jsonl_path in projects_dir.glob("**/*.jsonl"):
        try:
            with open(jsonl_path, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # A valid JSON line may still be a list, string or null.
                    if not isinstance(entry, dict):
                        continue
                    if entry.get("type") != "assistant":
                        continue
                    message = entry.get("message")
                    if not isinstance(message, dict):
                        continue
                    usage = message.get("usage")
                    if not isinstance(usage, dict):
                        continue

                    inp = _token_count(usage.get("input_tokens"))
                    out = _token_count(usage.get("output_tokens"))
                    if inp == 0 and out == 0:
                        continue

                    totals["input_tokens"] += inp
                    totals["output_tokens"] += out

                    rates = match_model_pricing(message.get("model", ""))
                    if not rates:
                        continue

                    cache_read = _token_count(
                        usage.get("cache_read_input_tokens")
                    )
                    cache_write = _token_count(
                        usage.get("cache_creation_input_tokens")
                    )
                    try:
                        # In Anthropic usage blocks ``input_tokens`` already
                        # excludes cache reads/writes, so it is billed in
                        # full at the input rate and the cache counters are
                        # billed separately (not subtracted from it).
                        totals["cost_usd"] += (
                            inp * rates["input"] / 1_000_000
                            + out * rates["output"] / 1_000_000
                            + cache_read * rates["cache_read"] / 1_000_000
                            + cache_write * rates["cache_write"] / 1_000_000
                        )
                    except KeyError:
                        # Pricing entry lacks a rate; keep the token counts
                        # but skip the cost for this message.
                        continue
        except OSError:
            continue

    return totals
160+
161+
162+
def _diff_jsonl_usage(before: dict, after: dict) -> dict:
    """Return how much usage grew between two JSONL usage snapshots.

    Both arguments are dicts carrying ``input_tokens``, ``output_tokens``
    and ``cost_usd``. Each difference is clamped at zero, so a snapshot that
    shrank never produces a negative delta.

    Returns:
        A dict with the result-compatible keys ``input_tokens``,
        ``output_tokens``, ``tokens`` (input + output) and ``cost_usd``.
    """
    # Pair each key with its zero so the clamp keeps int/float types intact.
    floors = (("input_tokens", 0), ("output_tokens", 0), ("cost_usd", 0.0))
    growth = {
        key: max(floor, after[key] - before[key]) for key, floor in floors
    }
    return {
        "input_tokens": growth["input_tokens"],
        "output_tokens": growth["output_tokens"],
        "tokens": growth["input_tokens"] + growth["output_tokens"],
        "cost_usd": growth["cost_usd"],
    }
177+
178+
106179
# ---------------------------------------------------------------------------
107180
# Task loading
108181
# ---------------------------------------------------------------------------
@@ -587,10 +660,10 @@ def run_full_impl(
587660
start_time = time.time()
588661
result = _make_result(instance_id)
589662

590-
# Estimate planning cost for full mode (ACW doesn't return token data).
591-
# Uses model config: understander=sonnet, bold/critique/reducer/consensus=opus.
663+
# Snapshot JSONL usage before running to measure ACW cost via delta
664+
usage_before = _snapshot_jsonl_usage()
592665
if not skip_planning:
593-
result["cost_note"] = "planning costs estimated; ACW subprocess has no token tracking"
666+
result["cost_note"] = "planning+impl cost estimated via JSONL session delta"
594667

595668
# Run the pipeline body in a daemon thread so we can enforce a timeout.
596669
# A daemon thread is killed when the main thread moves on; this is
@@ -626,6 +699,14 @@ def _run_pipeline():
626699
result["status"] = status_bucket[0] if status_bucket else "error"
627700
result["wall_time"] = time.time() - start_time
628701

702+
# Compute cost from JSONL delta
703+
usage_after = _snapshot_jsonl_usage()
704+
delta = _diff_jsonl_usage(usage_before, usage_after)
705+
result["input_tokens"] = delta["input_tokens"]
706+
result["output_tokens"] = delta["output_tokens"]
707+
result["tokens"] = delta["tokens"]
708+
result["cost_usd"] = delta["cost_usd"]
709+
629710
return result
630711

631712

python/agentize/workflow/api/session.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def run_prompt(
165165
retry_delay: float = 0,
166166
input_path: str | Path | None = None,
167167
output_path: str | Path | None = None,
168+
fallback_backend: tuple[str, str] | None = None,
168169
) -> StageResult:
169170
input_path_resolved, output_path_resolved = self._resolve_paths(
170171
name, input_path, output_path
@@ -201,6 +202,36 @@ def run_prompt(
201202
if attempt <= retry and retry_delay > 0:
202203
time.sleep(retry_delay)
203204

205+
# If primary backend exhausted retries and a fallback is configured, try it
206+
if fallback_backend and fallback_backend != backend:
207+
self._log(
208+
f"Stage '{name}' failed with {backend[0]}:{backend[1]}, "
209+
f"falling back to {fallback_backend[0]}:{fallback_backend[1]}"
210+
)
211+
try:
212+
self._write_prompt(prompt, input_path_resolved)
213+
process = self._run_stage(
214+
name,
215+
fallback_backend,
216+
input_path_resolved,
217+
output_path_resolved,
218+
tools=tools,
219+
permission_mode=permission_mode,
220+
timeout=timeout,
221+
extra_flags=None, # drop provider-specific flags
222+
)
223+
self._validate_output(name, output_path_resolved, process)
224+
if self._log_output_dump:
225+
self._log(f"{name} dumped to {output_path_resolved}")
226+
return StageResult(
227+
stage=name,
228+
input_path=input_path_resolved,
229+
output_path=output_path_resolved,
230+
process=process,
231+
)
232+
except Exception as exc:
233+
last_error = exc
234+
204235
raise PipelineError(name, attempts, last_error)
205236

206237
def stage(

python/agentize/workflow/planner/pipeline.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
"bold": ("claude", "opus"),
3838
"critique": ("claude", "opus"),
3939
"reducer": ("claude", "opus"),
40-
"consensus": ("claude", "opus"),
40+
"consensus": ("codex", "gpt-5.2-codex"),
4141
}
4242

4343
# Tool configurations per stage (Claude provider only)
@@ -268,6 +268,7 @@ def _write_consensus_prompt(path: Path) -> str:
268268
tools=STAGE_TOOLS.get("consensus"),
269269
permission_mode=STAGE_PERMISSION_MODE.get("consensus"),
270270
extra_flags=codex_flags,
271+
fallback_backend=("claude", "opus"),
271272
)
272273

273274
return results
@@ -315,12 +316,20 @@ def _write_consensus_prompt(path: Path) -> str:
315316
log_acw_command=True,
316317
log_output_dump=log_output_dump,
317318
)
319+
consensus_provider = stage_backends["consensus"][0]
320+
codex_flags = (
321+
["-s", "read-only", "--enable", "web_search_request",
322+
"-c", "model_reasoning_effort=xhigh"]
323+
if consensus_provider == "codex" else None
324+
)
318325
return session.run_prompt(
319326
"consensus",
320327
_write_consensus_prompt,
321328
stage_backends["consensus"],
322329
tools=STAGE_TOOLS.get("consensus"),
323330
permission_mode=STAGE_PERMISSION_MODE.get("consensus"),
331+
extra_flags=codex_flags,
332+
fallback_backend=("claude", "opus"),
324333
input_path=input_path,
325334
output_path=output_path,
326335
)

0 commit comments

Comments
 (0)