diff --git a/py/noxfile.py b/py/noxfile.py
index 0f1663f4..5222809c 100644
--- a/py/noxfile.py
+++ b/py/noxfile.py
@@ -206,6 +206,18 @@ def test_btx_openai(session, version):
     _run_tests(session, "braintrust/btx", version=version, env={"BTX_PROVIDER": "openai", "BTX_CLIENT": "openai"})
 
 
+@nox.session()
+@nox.parametrize("version", ANTHROPIC_VERSIONS, ids=ANTHROPIC_VERSIONS)
+def test_btx_anthropic(session, version):
+    """Run the BTX cross-language LLM-span spec tests (Anthropic provider)."""
+    _install_test_deps(session)
+    _install_matrix_dep(session, "anthropic", version)
+    session.install("pyyaml")
+    _run_tests(
+        session, "braintrust/btx", version=version, env={"BTX_PROVIDER": "anthropic", "BTX_CLIENT": "anthropic"}
+    )
+
+
 @nox.session()
 def test_openai_ddtrace(session):
     _install_test_deps(session)
diff --git a/py/src/braintrust/btx/span_validator.py b/py/src/braintrust/btx/span_validator.py
index d8dd78a4..41dfeb54 100644
--- a/py/src/braintrust/btx/span_validator.py
+++ b/py/src/braintrust/btx/span_validator.py
@@ -56,10 +56,21 @@ def _is_reasoning_message(value: Any) -> bool:
     return True
 
 
+def _is_positive_number(value: Any) -> bool:
+    return isinstance(value, (int, float)) and not isinstance(value, bool) and value > 0
+
+
+def _undefined_or_null(value: Any) -> bool:
+    """True if the value is absent (None/null) — used for fields that must not be populated."""
+    return value is None
+
+
 _NAMED_MATCHERS: dict[str, Any] = {
     "is_non_negative_number": _is_non_negative_number,
+    "is_positive_number": _is_positive_number,
     "is_non_empty_string": _is_non_empty_string,
     "is_reasoning_message": _is_reasoning_message,
+    "undefined_or_null": _undefined_or_null,
 }
 
 
@@ -146,6 +157,9 @@ def _validate_value(actual: Any, expected: Any, path: str, errors: list[str]) ->
         return
     for key, exp_val in expected.items():
         if key not in actual:
+            # A missing key satisfies undefined_or_null — treat absence as None.
+            if isinstance(exp_val, FnMatcher) and exp_val.expr == "undefined_or_null":
+                continue
             errors.append(f"{path}.{key}: key not found in actual span")
         else:
             _validate_value(actual[key], exp_val, f"{path}.{key}", errors)
diff --git a/py/src/braintrust/btx/spec-ref.txt b/py/src/braintrust/btx/spec-ref.txt
index 45c7a584..5c314e5a 100644
--- a/py/src/braintrust/btx/spec-ref.txt
+++ b/py/src/braintrust/btx/spec-ref.txt
@@ -1 +1 @@
-v0.0.1
+v0.0.5
diff --git a/py/src/braintrust/btx/spec_executor.py b/py/src/braintrust/btx/spec_executor.py
index ca4b943d..0602d8c7 100644
--- a/py/src/braintrust/btx/spec_executor.py
+++ b/py/src/braintrust/btx/spec_executor.py
@@ -122,7 +122,7 @@ def _dispatch(spec: LlmSpanSpec, client: Any) -> None:
         _execute_responses(spec.requests, client)
     elif provider == "anthropic" and endpoint == "/v1/messages":
-        _execute_anthropic_messages(spec.requests, client)
+        _execute_anthropic_messages(spec.requests, client, extra_headers=spec.headers or {})
     else:
         raise NotImplementedError(f"BTX executor: provider={provider!r} endpoint={endpoint!r} not implemented")
 
 
@@ -191,7 +191,9 @@ def _execute_responses(requests: list[dict[str, Any]], client: Any) -> None:
 # ---------------------------------------------------------------------------
 
 
-def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) -> None:
+def _execute_anthropic_messages(
+    requests: list[dict[str, Any]], client: Any, extra_headers: dict[str, str] | None = None
+) -> None:
     """Execute Anthropic messages requests.
 
     Handles streaming (stream=True) by consuming the stream context manager.
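
The new extra_headers parameter threads spec-level headers into every SDK call. As a point of reference, a minimal sketch of the mechanism it leans on: the anthropic Python SDK accepts a per-request extra_headers kwarg and merges it into the outgoing HTTP request. The model id and beta flag below are illustrative, not taken from any spec:

    import anthropic

    client = anthropic.Anthropic()  # assumes ANTHROPIC_API_KEY is set in the environment
    client.messages.create(
        model="claude-sonnet-4-20250514",  # illustrative model id
        max_tokens=16,
        messages=[{"role": "user", "content": "ping"}],
        # Sent as HTTP headers on this one request only.
        extra_headers={"anthropic-beta": "extended-cache-ttl-2025-04-11"},
    )
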
@@ -209,14 +211,20 @@ def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) ->
         is_streaming = full_req.get("stream", False)
         conversation_history.extend(req.get("messages", []))
 
+        create_kwargs: dict[str, Any] = dict(full_req)
+        if extra_headers:
+            create_kwargs["extra_headers"] = extra_headers
+
         if is_streaming:
-            with client.messages.create(**full_req) as stream:
-                final = stream.get_final_message()
-                if hasattr(final, "content") and final.content:
-                    text_blocks = [b.text for b in final.content if hasattr(b, "text")]
-                    conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})
+            # Iterate the stream to exhaustion — the Braintrust TracedMessageStream
+            # context manager captures metrics and logs the span on __exit__.
+            # We can't call get_final_message() on the traced wrapper, so we
+            # skip history accumulation for streaming (no multi-turn streaming specs).
+            with client.messages.create(**create_kwargs) as stream:
+                for _ in stream:
+                    pass
         else:
-            response = client.messages.create(**full_req)
+            response = client.messages.create(**create_kwargs)
             if hasattr(response, "content") and response.content:
                 text_blocks = [b.text for b in response.content if hasattr(b, "text")]
                 conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})
diff --git a/py/src/braintrust/btx/spec_loader.py b/py/src/braintrust/btx/spec_loader.py
index 329c82db..9f5c3d48 100644
--- a/py/src/braintrust/btx/spec_loader.py
+++ b/py/src/braintrust/btx/spec_loader.py
@@ -1,15 +1,17 @@
 """Load BTX LLM-span spec YAML files.
 
-Handles the three custom YAML tags used in the spec:
+Handles the custom YAML tags used in the spec:
 
     !fn          — named predicate or arbitrary lambda (eval'd in Python)
     !starts_with — string prefix check
     !or [...]    — at-least-one-of validator
+    !gen         — value generated by the test runner (e.g. test_runner_client)
 """
 
 from __future__ import annotations
 
 import dataclasses
 import os
+import uuid
 from pathlib import Path
 from typing import Any
 
@@ -17,19 +19,13 @@
 
 
 # ---------------------------------------------------------------------------
-# Matcher types (parallel to SpecMatcher.java)
+# Matcher / generator types
 # ---------------------------------------------------------------------------
 
 
 @dataclasses.dataclass
 class FnMatcher:
-    """A named or lambda-expression validator.
-
-    For well-known names (is_non_negative_number, etc.) the span_validator
-    module dispatches them to dedicated functions. For arbitrary Python
-    expressions the expression string is stored and eval()'d at validation
-    time.
-    """
+    """A named or lambda-expression validator."""
 
     expr: str  # e.g. "is_non_negative_number" or "lambda value: value > 0"
 
@@ -44,6 +40,18 @@ class OrMatcher:
     alternatives: list[Any]
 
 
+@dataclasses.dataclass
+class GenValue:
+    """A value generated by the test runner at execution time.
+
+    The generator name determines what value is produced:
+        test_runner_client — a string identifying this SDK/client (e.g. "python-openai")
+        vcr_nonce          — a random string that changes every run (busts caches)
+    """
+
+    generator: str  # e.g. "test_runner_client", "vcr_nonce"
+
+
 # ---------------------------------------------------------------------------
 # YAML custom constructors
 # ---------------------------------------------------------------------------
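
To make the round trip concrete, a sketch (the YAML snippet is hypothetical, not a real spec file) of how a !gen scalar flows through the loader: the constructor defined just below parses it into a GenValue, which _resolve_gen later replaces with a concrete string:

    import yaml

    doc = yaml.load("nonce: !gen vcr_nonce", Loader=_make_loader())
    # Dataclass equality makes this assertion hold.
    assert doc == {"nonce": GenValue(generator="vcr_nonce")}
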
"test_runner_client", "vcr_nonce" + + # --------------------------------------------------------------------------- # YAML custom constructors # --------------------------------------------------------------------------- @@ -64,6 +72,11 @@ def _or_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> OrMatcher: return OrMatcher(alternatives=alternatives) +def _gen_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> GenValue: + generator = loader.construct_scalar(node) # type: ignore[arg-type] + return GenValue(generator=generator) + + def _make_loader() -> type: """Return a SafeLoader subclass with BTX custom tags registered.""" @@ -73,9 +86,57 @@ class BtxLoader(yaml.SafeLoader): BtxLoader.add_constructor("!fn", _fn_constructor) BtxLoader.add_constructor("!starts_with", _starts_with_constructor) BtxLoader.add_constructor("!or", _or_constructor) + BtxLoader.add_constructor("!gen", _gen_constructor) return BtxLoader +# --------------------------------------------------------------------------- +# Generator resolution +# --------------------------------------------------------------------------- + +# Stable client identifier for this SDK implementation. +_CLIENT_ID = "python-btx" + +# Per-process nonce — constant within a run so cassette body matching is stable, +# but differs across runs so cache-busting specs actually bust caches. +_VCR_NONCE = str(uuid.uuid4())[:8] + +_GENERATORS: dict[str, str] = { + "test_runner_client": _CLIENT_ID, + "vcr_nonce": _VCR_NONCE, +} + + +def _resolve_gen(value: GenValue) -> str: + if value.generator in _GENERATORS: + return _GENERATORS[value.generator] + raise ValueError(f"Unknown !gen generator: {value.generator!r}") + + +def _resolve_variables(variables: dict[str, Any]) -> dict[str, str]: + """Resolve all !gen values in the variables map to concrete strings.""" + resolved: dict[str, str] = {} + for key, val in variables.items(): + if isinstance(val, GenValue): + resolved[key] = _resolve_gen(val) + else: + resolved[key] = str(val) + return resolved + + +def _substitute_templates(obj: Any, variables: dict[str, str]) -> Any: + """Recursively substitute {{var}} placeholders in strings.""" + if isinstance(obj, str): + for key, value in variables.items(): + obj = obj.replace(f"{{{{{key}}}}}", value) + return obj + if isinstance(obj, dict): + return {k: _substitute_templates(v, variables) for k, v in obj.items()} + if isinstance(obj, list): + return [_substitute_templates(item, variables) for item in obj] + return obj + + # --------------------------------------------------------------------------- # Spec dataclass # --------------------------------------------------------------------------- @@ -89,6 +150,7 @@ class LlmSpanSpec: endpoint: str requests: list[dict[str, Any]] expected_brainstore_spans: list[dict[str, Any]] + headers: dict[str, str] source_path: Path @property @@ -98,13 +160,19 @@ def display_name(self) -> str: @classmethod def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec": + # Resolve variables and substitute templates in requests + raw_variables = data.get("variables", {}) + variables = _resolve_variables(raw_variables) + requests = _substitute_templates(data.get("requests", []), variables) + return cls( name=data["name"], type=data["type"], provider=data["provider"], endpoint=data["endpoint"], - requests=data.get("requests", []), + requests=requests, expected_brainstore_spans=data.get("expected_brainstore_spans", []), + headers=data.get("headers", {}), source_path=source_path, ) @@ -117,13 +185,6 @@ def 
@@ -117,13 +185,6 @@ def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec":
 
 
 def _spec_root(override: str | None = None) -> Path:
-    """Return the llm_span spec root directory.
-
-    Priority:
-    1. ``override`` argument (used by the pytest fixture after fetching specs)
-    2. ``BTX_SPEC_ROOT`` environment variable
-    3. ``/spec/test/llm_span`` (local dev snapshot)
-    """
     if override:
         return Path(override)
     env = os.environ.get("BTX_SPEC_ROOT")
@@ -139,10 +200,8 @@ def load_specs(
     """Load all YAML spec files under *spec_root*.
 
     Args:
-        spec_root: Path to the ``test/llm_span`` directory. Falls back to
-            :func:`_spec_root` resolution if ``None``.
+        spec_root: Path to the ``test/llm_span`` directory.
         providers: Optional allow-list of provider names (e.g. ``["openai"]``).
-            If ``None``, all providers are loaded.
 
     Returns:
         Sorted list of :class:`LlmSpanSpec` instances.
@@ -159,7 +218,6 @@
     specs: list[LlmSpanSpec] = []
 
     for yaml_path in sorted(root.rglob("*.yaml")):
-        # Filter by provider directory if requested
         provider_dir = yaml_path.parent.name
         if providers is not None and provider_dir not in providers:
             continue
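
Putting the loader pieces together, a usage sketch (the path is illustrative) of what the new nox session ultimately exercises:

    specs = load_specs(spec_root="/tmp/btx-spec/test/llm_span", providers=["anthropic"])
    for spec in specs:
        # headers and the template-substituted requests are now first-class fields.
        print(spec.display_name, spec.endpoint, spec.headers)
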
+        cache_creation_total = metrics.get("prompt_cache_creation_tokens", 0)
 
     if metrics:
         total_prompt_tokens = (
-            metrics.get("prompt_tokens", 0)
-            + metrics.get("prompt_cached_tokens", 0)
-            + metrics.get("prompt_cache_creation_tokens", 0)
+            metrics.get("prompt_tokens", 0) + metrics.get("prompt_cached_tokens", 0) + cache_creation_total
         )
         metrics["prompt_tokens"] = total_prompt_tokens
         metrics["tokens"] = total_prompt_tokens + metrics.get("completion_tokens", 0)
diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py
index 7865e85e..30b72641 100644
--- a/py/src/braintrust/integrations/anthropic/test_anthropic.py
+++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py
@@ -337,7 +337,6 @@ def to_dict(self):
         "prompt_tokens": 21.0,
         "completion_tokens": 7.0,
         "prompt_cached_tokens": 3.0,
-        "prompt_cache_creation_tokens": 7.0,
         "prompt_cache_creation_5m_tokens": 2.0,
         "prompt_cache_creation_1h_tokens": 5.0,
         "server_tool_use_web_search_requests": 2.0,
@@ -371,7 +370,7 @@ def test_anthropic_messages_create_prompt_cache_5m_metrics(memory_logger):
     span = find_span_by_name(memory_logger.pop(), "anthropic.messages.create")
 
     assert span["output"]["role"] == response.role
-    assert span["metrics"]["prompt_cache_creation_tokens"] == response.usage.cache_creation_input_tokens
+    assert "prompt_cache_creation_tokens" not in span["metrics"]
     assert (
         span["metrics"]["prompt_cache_creation_5m_tokens"] == response.usage.cache_creation.ephemeral_5m_input_tokens
     )
@@ -403,7 +402,7 @@ def test_anthropic_messages_create_prompt_cache_1h_metrics(memory_logger):
     span = find_span_by_name(memory_logger.pop(), "anthropic.messages.create")
 
     assert span["output"]["role"] == response.role
-    assert span["metrics"]["prompt_cache_creation_tokens"] == response.usage.cache_creation_input_tokens
+    assert "prompt_cache_creation_tokens" not in span["metrics"]
     assert (
         span["metrics"]["prompt_cache_creation_5m_tokens"] == response.usage.cache_creation.ephemeral_5m_input_tokens
     )
@@ -1342,7 +1341,10 @@ def test_setup_creates_spans(memory_logger):
         usage.input_tokens + usage.cache_read_input_tokens + usage.cache_creation_input_tokens
     )
     assert metrics["completion_tokens"] == usage.output_tokens
-    assert metrics["prompt_cache_creation_tokens"] == usage.cache_creation_input_tokens
+    # When breakdown is all zeros, the aggregate is still reported.
+    # When breakdown has non-zero values, the aggregate is omitted.
+    if ephemeral_5m > 0 or ephemeral_1h > 0:
+        assert "prompt_cache_creation_tokens" not in metrics
     assert metrics["prompt_cache_creation_5m_tokens"] == ephemeral_5m
     assert metrics["prompt_cache_creation_1h_tokens"] == ephemeral_1h
     assert "service_tier" not in metrics
@@ -1373,7 +1375,7 @@ def test_extract_anthropic_usage_preserves_nested_numeric_fields():
     assert metrics["prompt_tokens"] == 15
     assert metrics["completion_tokens"] == 12
     assert metrics["tokens"] == 27
-    assert metrics["prompt_cache_creation_tokens"] == 7
+    assert "prompt_cache_creation_tokens" not in metrics
     assert metrics["prompt_cache_creation_5m_tokens"] == 3
     assert metrics["prompt_cache_creation_1h_tokens"] == 4
     assert metrics["server_tool_use_web_search_requests"] == 2
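
A worked example (synthetic numbers, not taken from these tests) of the new accounting in extract_anthropic_usage:

    # Raw usage: input_tokens=10, cache_read=5, cache-creation breakdown 5m=2 and 1h=3,
    # output_tokens=7. The breakdown is non-zero, so the aggregate key is dropped
    # but its sum still feeds the prompt total.
    cache_creation_total = 2 + 3                   # sum(cache_creation_breakdown)
    prompt_tokens = 10 + 5 + cache_creation_total  # -> 20
    tokens = prompt_tokens + 7                     # -> 27
    # metrics keeps prompt_cache_creation_5m_tokens / _1h_tokens, while
    # prompt_cache_creation_tokens itself satisfies the spec's undefined_or_null matcher.
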