From 49cc40abb7d65af7b4c94e20f5a4e0ab507aa33e Mon Sep 17 00:00:00 2001
From: Curtis Galione
Date: Tue, 28 Apr 2026 20:23:41 -0700
Subject: [PATCH 1/2] feat(openai_agents): pull cached tokens through into metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both _response_log_data (Responses API) and _usage_to_metrics
(chat-completions / Generation spans) only emitted total / prompt /
completion tokens. Cached / reasoning / audio token counts surfaced via
the OpenAI usage `*_tokens_details` sub-objects were dropped, so the
OpenAI Agents SDK integration never logged metrics like
prompt_cached_tokens — even though the OpenAI wrapper already does.

Walk *_tokens_details inside _usage_to_metrics (mapping the input/output
prefix to prompt/completion to stay consistent with Braintrust's
convention) and route _response_log_data through the same helper.
Mirrors the JS fix in #1186.

Tests cover the four cases from the JS PR: cached tokens present on a
Response span, zero is preserved, missing details produces no metric,
and Generation spans extract cached tokens too.
--- .../openai_agents/test_openai_agents.py | 134 ++++++++++++++++++ .../integrations/openai_agents/tracing.py | 27 +++- 2 files changed, 158 insertions(+), 3 deletions(-) diff --git a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py index 79dd13dc..448a14fa 100644 --- a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py +++ b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py @@ -321,3 +321,137 @@ class TestAutoInstrumentOpenAIAgents: def test_auto_instrument_openai_agents(self): verify_autoinstrument_script("test_auto_openai_agents.py") + + +# --------------------------------------------------------------------------- +# Cached-token metric extraction +# --------------------------------------------------------------------------- + + +class _StubResponseUsage: + def __init__(self, usage: dict): + self._usage = usage + + def model_dump(self) -> dict: + return self._usage + + +class _StubResponse: + def __init__(self, usage: dict, output=None, metadata=None): + self.usage = _StubResponseUsage(usage) + self.output = output + self.metadata = metadata + + def model_dump(self, exclude=None): # noqa: ARG002 + return {} + + +class _StubResponseSpanData: + def __init__(self, usage: dict, input_=None, output=None): + self.input = input_ + self.response = _StubResponse(usage, output=output) + + +class _StubGenerationSpanData: + def __init__(self, usage: dict, model: str = "gpt-4o-mini"): + self.usage = usage + self.input = [{"role": "user", "content": "test"}] + self.output = [{"role": "assistant", "content": "response"}] + self.model = model + self.model_config = {} + + +class _StubSpan: + def __init__(self, span_data): + self.span_data = span_data + self.started_at = None + self.ended_at = None + + +def test_response_span_extracts_cached_tokens_from_usage(): + """Mirrors JS test: Response span extracts cached tokens from usage.""" + processor = 
BraintrustTracingProcessor() + span = _StubSpan( + _StubResponseSpanData( + usage={ + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_tokens_details": { + "cached_tokens": 80, # check for this later + }, + }, + input_="test input", + output="test output", + ) + ) + + data = processor._response_log_data(span) + + metrics = data["metrics"] + assert metrics["prompt_tokens"] == 100 + assert metrics["completion_tokens"] == 50 + assert metrics["tokens"] == 150 + assert metrics["prompt_cached_tokens"] == 80 + + +def test_response_span_handles_zero_cached_tokens(): + """Mirrors JS test: Zero cached tokens should be logged, not skipped.""" + processor = BraintrustTracingProcessor() + span = _StubSpan( + _StubResponseSpanData( + usage={ + "input_tokens": 100, + "output_tokens": 50, + "input_tokens_details": { + "cached_tokens": 0, # Zero is a valid value + }, + } + ) + ) + + data = processor._response_log_data(span) + + assert data["metrics"]["prompt_cached_tokens"] == 0 + + +def test_response_span_handles_missing_cached_tokens(): + """Mirrors JS test: Should not add prompt_cached_tokens if not in usage.""" + processor = BraintrustTracingProcessor() + span = _StubSpan( + _StubResponseSpanData( + usage={ + "input_tokens": 100, + "output_tokens": 50, + # No input_tokens_details at all + } + ) + ) + + data = processor._response_log_data(span) + + assert "prompt_cached_tokens" not in data["metrics"] + + +def test_generation_span_extracts_cached_tokens_from_usage(): + """Mirrors JS test: Generation span extracts cached tokens from usage.""" + processor = BraintrustTracingProcessor() + span = _StubSpan( + _StubGenerationSpanData( + usage={ + "input_tokens": 200, + "output_tokens": 75, + "total_tokens": 275, + "input_tokens_details": { + "cached_tokens": 150, + }, + } + ) + ) + + data = processor._generation_log_data(span) + + metrics = data["metrics"] + assert metrics["prompt_tokens"] == 200 + assert metrics["completion_tokens"] == 75 + assert 
metrics["prompt_cached_tokens"] == 150 diff --git a/py/src/braintrust/integrations/openai_agents/tracing.py b/py/src/braintrust/integrations/openai_agents/tracing.py index 6fde395f..e5605441 100644 --- a/py/src/braintrust/integrations/openai_agents/tracing.py +++ b/py/src/braintrust/integrations/openai_agents/tracing.py @@ -69,6 +69,14 @@ def _maybe_timestamp_elapsed(end: str | None, start: str | None) -> float | None return (datetime.datetime.fromisoformat(end) - datetime.datetime.fromisoformat(start)).total_seconds() +# Maps the prefix of an OpenAI usage `*_tokens_details` field to the Braintrust +# metric prefix (e.g. `input_tokens_details.cached_tokens` → `prompt_cached_tokens`). +_TOKEN_PREFIX_MAP = { + "input": "prompt", + "output": "completion", +} + + def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]: """Convert an OpenAI-style usage dict to Braintrust metrics.""" metrics: dict[str, Any] = {} @@ -86,6 +94,19 @@ def _usage_to_metrics(usage: dict[str, Any]) -> dict[str, Any]: metrics["tokens"] = usage["total_tokens"] elif "input_tokens" in usage and "output_tokens" in usage: metrics["tokens"] = usage["input_tokens"] + usage["output_tokens"] + + # Walk *_tokens_details sub-objects so we capture cached / reasoning / audio + # token counts (e.g. input_tokens_details.cached_tokens → prompt_cached_tokens). 
+ for key, value in usage.items(): + if not key.endswith("_tokens_details") or not isinstance(value, dict): + continue + raw_prefix = key[: -len("_tokens_details")] + prefix = _TOKEN_PREFIX_MAP.get(raw_prefix, raw_prefix) + for sub_key, sub_value in value.items(): + if isinstance(sub_value, bool) or not isinstance(sub_value, (int, float)): + continue + metrics[f"{prefix}_{sub_key}"] = sub_value + return metrics @@ -166,9 +187,9 @@ def _response_log_data(self, span: tracing.Span[tracing.ResponseSpanData]) -> di if ttft is not None: data["metrics"]["time_to_first_token"] = ttft if span.span_data.response is not None and span.span_data.response.usage is not None: - data["metrics"]["tokens"] = span.span_data.response.usage.total_tokens - data["metrics"]["prompt_tokens"] = span.span_data.response.usage.input_tokens - data["metrics"]["completion_tokens"] = span.span_data.response.usage.output_tokens + usage = span.span_data.response.usage + usage_dict = usage.model_dump() if hasattr(usage, "model_dump") else dict(usage) + data["metrics"].update(_usage_to_metrics(usage_dict)) return data From 4ea89eeec2bc79a08260297d7643e75ee23546b1 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Wed, 29 Apr 2026 12:39:03 -0400 Subject: [PATCH 2/2] leverage VCR tests --- .../openai_agents/test_openai_agents.py | 140 +----------------- .../integrations/openai_agents/tracing.py | 3 +- 2 files changed, 7 insertions(+), 136 deletions(-) diff --git a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py index 448a14fa..7a6db120 100644 --- a/py/src/braintrust/integrations/openai_agents/test_openai_agents.py +++ b/py/src/braintrust/integrations/openai_agents/test_openai_agents.py @@ -125,6 +125,12 @@ async def test_openai_agents_integration_setup_creates_spans(memory_logger): llm_spans = [span for span in spans if span.get("span_attributes", {}).get("type") == "llm"] assert llm_spans + llm_metrics = 
[span.get("metrics", {}) for span in llm_spans] + assert any(metrics.get("prompt_tokens") is not None for metrics in llm_metrics) + assert any(metrics.get("completion_tokens") is not None for metrics in llm_metrics) + assert any(metrics.get("tokens") is not None for metrics in llm_metrics) + assert any(metrics.get("prompt_cached_tokens") == 0 for metrics in llm_metrics) + assert any(metrics.get("completion_reasoning_tokens") == 0 for metrics in llm_metrics) @pytest.mark.asyncio @@ -321,137 +327,3 @@ class TestAutoInstrumentOpenAIAgents: def test_auto_instrument_openai_agents(self): verify_autoinstrument_script("test_auto_openai_agents.py") - - -# --------------------------------------------------------------------------- -# Cached-token metric extraction -# --------------------------------------------------------------------------- - - -class _StubResponseUsage: - def __init__(self, usage: dict): - self._usage = usage - - def model_dump(self) -> dict: - return self._usage - - -class _StubResponse: - def __init__(self, usage: dict, output=None, metadata=None): - self.usage = _StubResponseUsage(usage) - self.output = output - self.metadata = metadata - - def model_dump(self, exclude=None): # noqa: ARG002 - return {} - - -class _StubResponseSpanData: - def __init__(self, usage: dict, input_=None, output=None): - self.input = input_ - self.response = _StubResponse(usage, output=output) - - -class _StubGenerationSpanData: - def __init__(self, usage: dict, model: str = "gpt-4o-mini"): - self.usage = usage - self.input = [{"role": "user", "content": "test"}] - self.output = [{"role": "assistant", "content": "response"}] - self.model = model - self.model_config = {} - - -class _StubSpan: - def __init__(self, span_data): - self.span_data = span_data - self.started_at = None - self.ended_at = None - - -def test_response_span_extracts_cached_tokens_from_usage(): - """Mirrors JS test: Response span extracts cached tokens from usage.""" - processor = 
BraintrustTracingProcessor() - span = _StubSpan( - _StubResponseSpanData( - usage={ - "input_tokens": 100, - "output_tokens": 50, - "total_tokens": 150, - "input_tokens_details": { - "cached_tokens": 80, # check for this later - }, - }, - input_="test input", - output="test output", - ) - ) - - data = processor._response_log_data(span) - - metrics = data["metrics"] - assert metrics["prompt_tokens"] == 100 - assert metrics["completion_tokens"] == 50 - assert metrics["tokens"] == 150 - assert metrics["prompt_cached_tokens"] == 80 - - -def test_response_span_handles_zero_cached_tokens(): - """Mirrors JS test: Zero cached tokens should be logged, not skipped.""" - processor = BraintrustTracingProcessor() - span = _StubSpan( - _StubResponseSpanData( - usage={ - "input_tokens": 100, - "output_tokens": 50, - "input_tokens_details": { - "cached_tokens": 0, # Zero is a valid value - }, - } - ) - ) - - data = processor._response_log_data(span) - - assert data["metrics"]["prompt_cached_tokens"] == 0 - - -def test_response_span_handles_missing_cached_tokens(): - """Mirrors JS test: Should not add prompt_cached_tokens if not in usage.""" - processor = BraintrustTracingProcessor() - span = _StubSpan( - _StubResponseSpanData( - usage={ - "input_tokens": 100, - "output_tokens": 50, - # No input_tokens_details at all - } - ) - ) - - data = processor._response_log_data(span) - - assert "prompt_cached_tokens" not in data["metrics"] - - -def test_generation_span_extracts_cached_tokens_from_usage(): - """Mirrors JS test: Generation span extracts cached tokens from usage.""" - processor = BraintrustTracingProcessor() - span = _StubSpan( - _StubGenerationSpanData( - usage={ - "input_tokens": 200, - "output_tokens": 75, - "total_tokens": 275, - "input_tokens_details": { - "cached_tokens": 150, - }, - } - ) - ) - - data = processor._generation_log_data(span) - - metrics = data["metrics"] - assert metrics["prompt_tokens"] == 200 - assert metrics["completion_tokens"] == 75 - assert 
metrics["prompt_cached_tokens"] == 150 diff --git a/py/src/braintrust/integrations/openai_agents/tracing.py b/py/src/braintrust/integrations/openai_agents/tracing.py index e5605441..bf1a7274 100644 --- a/py/src/braintrust/integrations/openai_agents/tracing.py +++ b/py/src/braintrust/integrations/openai_agents/tracing.py @@ -187,8 +187,7 @@ def _response_log_data(self, span: tracing.Span[tracing.ResponseSpanData]) -> di if ttft is not None: data["metrics"]["time_to_first_token"] = ttft if span.span_data.response is not None and span.span_data.response.usage is not None: - usage = span.span_data.response.usage - usage_dict = usage.model_dump() if hasattr(usage, "model_dump") else dict(usage) + usage_dict = span.span_data.response.usage.model_dump() data["metrics"].update(_usage_to_metrics(usage_dict)) return data