From b2064b7fc41c5991ffb8e7c72ba00c006ad9e72b Mon Sep 17 00:00:00 2001
From: Marcello Alarcon <marce@MTA-digital-MacBook-Pro.local>
Date: Thu, 14 May 2026 10:47:45 -0300
Subject: [PATCH] fix(heartbeat-runner): capture cost_usd and token counts from
 Claude CLI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

step7_invoke_claude was hardcoding tokens_in/tokens_out/cost_usd to None
with a comment claiming "Claude CLI doesn't expose token counts easily".
That comment is outdated — the runner already passes `--output-format json`
to the CLI, which emits a single-line JSON object containing:

  - `total_cost_usd` (top level)
  - `usage.input_tokens`, `usage.output_tokens`
  - `usage.cache_creation_input_tokens`, `usage.cache_read_input_tokens`
  - `modelUsage.<model>.{costUSD,inputTokens,outputTokens,...}`

Result: the heartbeat_runs.cost_usd column was always NULL despite the
schema, making cost tracking and budget analysis impossible.

Sourcery review rounds 1, 2, and 3 incorporated (2026-05-14):

Round 1:
  - Narrowed the catch from blanket Exception to specific failure modes
    (json.JSONDecodeError + IndexError + TypeError + AttributeError).
  - Added warning logging on parse failure (silent data loss was the
    original problem; swallowing the new parse failure would be
    self-defeating).
  - Rounded total_cost_usd to 6 decimal places (micro-dollar) before
    returning so aggregated SUM queries don't accumulate float drift.

Round 2:
  - Switched from `print(file=sys.stderr)` to `logging.getLogger(__name__)`
    + `log.warning(...)` — consistent control plane, no new dependency.
  - Extracted `_parse_claude_cli_usage(output)` helper so step7 stays
    focused on subprocess orchestration. Helper is pure and unit-testable.
  - SECURITY: removed the 200-char raw output preview from the failure
    log. CLI output can contain PII or sensitive model-generated content
    from the user's prompt. Log now contains ONLY the exception class
    and the output length.

Round 3:
  - Accept stringified `total_cost_usd` defensively (forward-compat with
    a possible future Decimal-serializer wrap on the CLI side). Today
    CLI 2.1.141 always emits float, but the cost is one isinstance check
    and a try/float() — cheap forward-compat for monetary data.
  - Non-numeric / dict / list / None all still fall back to NULL safely.

Round 3 also raised two points we didn't change:
  - "json.loads with no import in the diff" → false positive. `json` is
    already imported at module top (line 21 of upstream/main, unchanged
    by this PR, hence not in the diff). No NameError.
  - "Pass status into the helper" → the helper is only called when
    status == 'success' (one-line ternary in the caller). Logging
    status inside the helper would be redundant.

## Why this matters now

Anthropic announced (2026-05-14) that starting 2026-06-15, Max 5x plan
subscribers get a dedicated $100/month credit for Agent SDK / claude -p
usage, separate from interactive subscription limits. To decide whether
the credit covers the workload (and how much extra-usage cap to set),
operators need per-run cost data — which is exactly what this fix
restores. With the fix deployed today, 30 days of real data accumulate
before the credit launches.

## Test plan

- [x] CLI shape verified against Claude CLI 2.1.141 (single-line JSON,
  `total_cost_usd` top-level, `usage.input_tokens`, `usage.output_tokens`).
- [x] Helper sanity scenarios:
  - Float cost (today's reality): `(tokens, tokens, 0.524192)` ✓
  - String cost (forward-compat): `(1, 2, 1.23e-4)` ✓
  - Garbage string cost: `(1, 2, None)` — ValueError caught ✓
  - Dict cost (defensive): `(3, 4, None)` — falls to None ✓
  - Missing `usage`: `(None, None, 1.0)` ✓
  - Malformed JSON: `(None, None, None)`, log.warning fires ✓
  - Empty output: `(None, None, None)`, no log ✓
- [x] Failure path unchanged: status != "success" → helper not invoked,
  all three fields remain None (existing contract preserved).
- [x] SECURITY: failure log contains only exception class + output
  length, never the raw output content.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 dashboard/backend/heartbeat_runner.py | 57 +++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)
diff --git a/dashboard/backend/heartbeat_runner.py b/dashboard/backend/heartbeat_runner.py
index 9966643b..eb82b99f 100644
--- a/dashboard/backend/heartbeat_runner.py
+++ b/dashboard/backend/heartbeat_runner.py
@@ -19,6 +19,7 @@
 
 import argparse
 import json
+import logging
 import os
 import signal
 import subprocess
@@ -28,6 +29,8 @@
 from datetime import datetime, timezone
 from pathlib import Path
 
+log = logging.getLogger(__name__)
+
 # Workspace root
 WORKSPACE = Path(__file__).resolve().parent.parent.parent
 sys.path.insert(0, str(Path(__file__).resolve().parent))
@@ -284,17 +287,65 @@ def step7_invoke_claude(
 
     duration_ms = int((time.time() - start_time) * 1000)
 
+    tokens_in, tokens_out, cost_usd = _parse_claude_cli_usage(output) if status == "success" else (None, None, None)
+
     return {
         "status": status,
         "output": output,
         "error": error,
         "duration_ms": duration_ms,
-        "tokens_in": None,   # Claude CLI doesn't expose token counts easily
-        "tokens_out": None,
-        "cost_usd": None,
+        "tokens_in": tokens_in,
+        "tokens_out": tokens_out,
+        "cost_usd": cost_usd,
     }
 
 
+def _parse_claude_cli_usage(output: str) -> tuple[int | None, int | None, float | None]:
+    """Extract (tokens_in, tokens_out, cost_usd) from a Claude CLI ``--output-format json`` payload.
+
+    The CLI emits a single-line JSON object on success with ``total_cost_usd`` at top level and
+    ``usage.{input,output}_tokens`` nested. On any malformation the function logs a metadata-only
+    warning (no raw output) and returns ``(None, None, None)`` so the caller can persist a NULL row
+    without crashing.
+
+    ``total_cost_usd`` is rounded to 6 decimal places (micro-dollar precision) so aggregated SUM
+    queries on ``heartbeat_runs.cost_usd`` (FLOAT column) don't accumulate float-representation
+    drift. Full ``Decimal`` would require a schema migration; rounding is the pragmatic mid-point.
+
+    SECURITY: the raw CLI output may contain PII or model-generated content from the user's prompt
+    (emails, credentials, customer data). The log line on failure intentionally contains ONLY the
+    exception class and the output length — no content preview.
+    """
+    if not output:
+        return None, None, None
+    try:
+        payload = json.loads(output.strip().splitlines()[-1])
+        # `total_cost_usd` is float in the CLI today (2.1.141 verified), but accept stringified
+        # numerics defensively in case the format changes (e.g. wrapped Decimal serializer).
+        # Anything else (None, dict, list) falls back to NULL — caller persists missing data.
+        raw_cost = payload.get("total_cost_usd")
+        if isinstance(raw_cost, (int, float)):
+            cost_value = raw_cost
+        elif isinstance(raw_cost, str):
+            try:
+                cost_value = float(raw_cost)
+            except ValueError:
+                cost_value = None
+        else:
+            cost_value = None
+        cost_usd = round(cost_value, 6) if cost_value is not None else None
+        usage = payload.get("usage") or {}
+        return usage.get("input_tokens"), usage.get("output_tokens"), cost_usd
+    except (json.JSONDecodeError, IndexError, TypeError, AttributeError) as parse_exc:
+        log.warning(
+            "step7_invoke_claude: failed to parse Claude CLI JSON output "
+            "(%s, output_len=%d); cost/tokens will be NULL for this run.",
+            parse_exc.__class__.__name__,
+            len(output),
+        )
+        return None, None, None
+
+
 # ── Step 8: Persist status ────────────────────────────────────────────────────
 
 def step8_persist(run_id: str, heartbeat_id: str, result: dict, trigger_id: str | None, triggered_by: str, prompt_preview: str, conn):