braintrustdata
diff --git a/.agents/skills/sdk-integrations/SKILL.md b/.agents/skills/sdk-integrations/SKILL.md
Lines changed: 3 additions & 1 deletion
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 9 additions & 0 deletions
diff --git a/py/noxfile.py b/py/noxfile.py
Lines changed: 17 additions & 1 deletion
diff --git a/py/src/braintrust/integrations/agentscope/tracing.py b/py/src/braintrust/integrations/agentscope/tracing.py
Lines changed: 9 additions & 12 deletions
diff --git a/py/src/braintrust/integrations/agno/tracing.py b/py/src/braintrust/integrations/agno/tracing.py
Lines changed: 1 addition & 22 deletions
diff --git a/py/src/braintrust/integrations/anthropic/_utils.py b/py/src/braintrust/integrations/anthropic/_utils.py
Lines changed: 8 additions & 20 deletions
diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py
Lines changed: 49 additions & 0 deletions
@@ -105,12 +105,14 @@ Do not start by wiring patchers and only later asking what the logged span shoul
 
 Keep provider-local code inside `py/src/braintrust/integrations/<provider>/`.
 
+If tracing or normalization logic is genuinely shared across multiple integrations, prefer adding it to `py/src/braintrust/integrations/utils.py` instead of copying it into each provider package. Avoid duplicating code between integrations unless there is a clear provider-specific reason the behavior must diverge.
+
 Typical file ownership:
 
 - `__init__.py`: export the integration class, `setup_<provider>()`, and public `wrap_*()` helpers
 - `integration.py`: define the `BaseIntegration` subclass and register patchers
 - `patchers.py`: define patchers and manual `wrap_*()` helpers
-- `tracing.py`: keep provider-specific tracing, stream handling, normalization, and metadata extraction
+- `tracing.py`: keep provider-specific tracing, stream handling, normalization, and metadata extraction; move cross-integration helpers to `py/src/braintrust/integrations/utils.py`
 - `test_*.py`: keep provider behavior tests next to the integration
 - `cassettes/`: keep VCR recordings next to the integration tests when the provider uses HTTP
 
 
@@ -75,6 +75,12 @@ Root `Makefile` exists as a convenience wrapper. The authoritative SDK workflow
 
 `py/noxfile.py` is the source of truth for compatibility coverage.
 
+Testing preferences:
+
+- Prefer VCR-backed integration tests with checked-in cassettes whenever practical.
+- Avoid mocks, fakes, and heavily synthetic tests unless there is no reasonable cassette-based alternative or the code under test is truly internal/purely local.
+- When fixing a bug or issue, default to a red/green workflow: first add or update a test that reproduces the problem and fails, then implement the fix, unless the user explicitly asks for a different approach.
+
 Key facts:
 
 - `test_core` runs without optional vendor packages.
@@ -87,6 +93,8 @@ When changing behavior, run the narrowest affected session first, then expand on
 
 ## VCR
 
+VCR/cassette coverage is the default and preferred testing strategy for provider and integration behavior in this repo. Reach for cassette-backed tests before introducing mocks or fakes, and keep new coverage aligned with the existing VCR patterns unless there is a strong reason not to.
+
 VCR cassette directories:
 
 - `py/src/braintrust/cassettes/`
@@ -162,6 +170,7 @@ Avoid editing `py/src/braintrust/version.py` while also running build commands.
 
 - Keep tests near the code they cover.
 - Reuse existing fixtures and cassette patterns.
+- Prefer extending an existing cassette-backed test over adding a new mock-heavy test.
 - If a change affects examples or integrations, update the nearest example or focused test.
 - For CLI/devserver changes, consider whether wheel-mode behavior also needs coverage.
 - Do **not** add `from __future__ import annotations` unless it is absolutely required (e.g., a genuine forward-reference that cannot be resolved any other way). This import changes annotation evaluation semantics at runtime and can silently break `get_type_hints()`, Pydantic models, and other runtime introspection. Prefer quoted string literals (`"MyClass"`) or `TYPE_CHECKING` guards for forward references instead.
@@ -10,6 +10,7 @@
     nox -h                     Get help.
 """
 
+import functools
 import glob
 import os
 import pathlib
@@ -476,14 +477,29 @@ def _get_braintrust_wheel():
     return wheels[0]
 
 
+@functools.cache
+def _integration_subdirs_to_ignore() -> list[str]:
+    """Return integration subdirectories that require dedicated sessions.
+
+    Top-level tests in ``src/braintrust/integrations/`` (e.g. shared utils and
+    versioning tests) should still run in ``test_core``.
+
+    Returns:
+        ``"{INTEGRATION_DIR}/<name>"`` for every directory directly under
+        ``src/<INTEGRATION_DIR>`` except ``__pycache__``, sorted by name.
+        The list is cached and shared between callers, so treat it as
+        read-only.
+    """
+    integrations_root = pathlib.Path("src") / INTEGRATION_DIR
+    # ``Path.iterdir()`` yields entries in arbitrary order; sorting keeps the
+    # generated ignore list stable across runs and filesystems.
+    return sorted(
+        f"{INTEGRATION_DIR}/{child.name}"
+        for child in integrations_root.iterdir()
+        if child.is_dir() and child.name != "__pycache__"
+    )
+
+
 def _run_core_tests(session):
     """Run all tests which don't require optional dependencies."""
     _run_tests(
         session,
         SRC_DIR,
         ignore_paths=[
             WRAPPER_DIR,
-            INTEGRATION_DIR,
+            *_integration_subdirs_to_ignore(),
             CONTRIB_DIR,
             DEVSERVER_DIR,
         ],
 
@@ -6,14 +6,11 @@
 
 from braintrust.logger import start_span
 from braintrust.span_types import SpanTypeAttribute
-
-
-def _clean(mapping: dict[str, Any]) -> dict[str, Any]:
-    return {key: value for key, value in mapping.items() if value is not None}
+from braintrust.util import clean_nones
 
 
 def _args_kwargs_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
-    return _clean(
+    return clean_nones(
         {
             "args": list(args) if args else None,
             "kwargs": kwargs if kwargs else None,
@@ -34,7 +31,7 @@ def _pipeline_metadata(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
     if agents:
         agent_names = [getattr(agent, "name", agent.__class__.__name__) for agent in agents]
 
-    return _clean({"agent_names": agent_names})
+    return clean_nones({"agent_names": agent_names})
 
 
 def _extract_metrics(*candidates: Any) -> dict[str, float] | None:
@@ -69,7 +66,7 @@ def _model_provider_name(instance: Any) -> str:
 
 
 def _model_metadata(instance: Any) -> dict[str, Any]:
-    return _clean(
+    return clean_nones(
         {
             "model": getattr(instance, "model_name", None),
             "provider": _model_provider_name(instance),
@@ -95,7 +92,7 @@ def _model_call_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
     if structured_model is None and len(args) > 3:
         structured_model = args[3]
 
-    return _clean(
+    return clean_nones(
         {
             "messages": messages,
             "tools": tools,
@@ -125,7 +122,7 @@ def _model_call_output(result: Any) -> Any:
     else:
         return result
 
-    normalized = _clean(
+    normalized = clean_nones(
         {
             "role": "assistant" if data.get("content") is not None else None,
             "content": data.get("content"),
@@ -178,7 +175,7 @@ async def _wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any
 
 _agent_call_wrapper = _make_task_wrapper(
     name_fn=lambda instance, _a, _k: f"{_agent_name(instance)}.reply",
-    metadata_fn=lambda instance, _a, _k: _clean({"agent_class": instance.__class__.__name__}),
+    metadata_fn=lambda instance, _a, _k: clean_nones({"agent_class": instance.__class__.__name__}),
 )
 
 _sequential_pipeline_wrapper = _make_task_wrapper(
@@ -224,13 +221,13 @@ async def _toolkit_call_tool_function_wrapper(wrapped: Any, instance: Any, args:
             start_span(
                 name=f"{tool_name}.execute",
                 type=SpanTypeAttribute.TOOL,
-                input=_clean(
+                input=clean_nones(
                     {
                         "tool_name": tool_name,
                         "tool_call": tool_call,
                     }
                 ),
-                metadata=_clean({"toolkit_class": instance.__class__.__name__}),
+                metadata=clean_nones({"toolkit_class": instance.__class__.__name__}),
             )
         )
         try:
 
@@ -2,6 +2,7 @@
 from inspect import isawaitable
 from typing import Any
 
+from braintrust.integrations.utils import _try_to_dict
 from braintrust.logger import start_span
 from braintrust.span_types import SpanTypeAttribute
 from braintrust.util import is_numeric
@@ -24,28 +25,6 @@ def get_args_kwargs(args: list[str], kwargs: dict[str, Any], keys: list[str]):
     return {k: args[i] if args else kwargs.get(k) for i, k in enumerate(keys)}, omit(kwargs, keys)
 
 
-def _try_to_dict(obj: Any) -> Any:
-    """Convert object to dict, handling different object types like OpenAI wrapper."""
-    if isinstance(obj, dict):
-        return obj
-    if hasattr(obj, "model_dump") and callable(obj.model_dump):
-        try:
-            return obj.model_dump()
-        except Exception:
-            pass
-    if hasattr(obj, "dict") and callable(obj.dict):
-        try:
-            return obj.dict()
-        except Exception:
-            pass
-    if hasattr(obj, "__dict__"):
-        try:
-            return obj.__dict__.copy()
-        except Exception:
-            pass
-    return obj
-
-
 def is_sync_iterator(result: Any) -> bool:
     return hasattr(result, "__iter__") and hasattr(result, "__next__")
 
 
@@ -2,6 +2,7 @@
 
 from typing import Any
 
+from braintrust.integrations.utils import _try_to_dict as _shared_try_to_dict
 from braintrust.util import is_numeric
 
 
@@ -36,27 +37,14 @@ def __getattr__(self, name: str) -> Any:
 
 
 def _try_to_dict(obj: Any) -> dict[str, Any] | None:
-    if isinstance(obj, dict):
-        return obj
-
-    if hasattr(obj, "model_dump"):
-        try:
-            candidate = obj.model_dump(mode="python")
-        except TypeError:
-            candidate = obj.model_dump()
-        return candidate if isinstance(candidate, dict) else None
-
-    if hasattr(obj, "to_dict"):
-        candidate = obj.to_dict()
-        return candidate if isinstance(candidate, dict) else None
-
-    if hasattr(obj, "dict"):
-        candidate = obj.dict()
-        return candidate if isinstance(candidate, dict) else None
-
-    if hasattr(obj, "__dict__"):
-        return vars(obj)
+    """Anthropic-flavoured object→dict conversion.
 
+    Delegates to the shared ``_try_to_dict`` first, then returns ``None``
+    (instead of the original object) when conversion fails.
+    """
+    result = _shared_try_to_dict(obj)
+    if isinstance(result, dict):
+        return result
     return None
 
 
 
@@ -166,6 +166,55 @@ def test_extract_anthropic_usage_includes_server_tool_use_metrics_from_objects()
     assert metadata == {}
 
 
+def test_extract_anthropic_usage_supports_to_dict_only_objects():
+    """Exercise ``extract_anthropic_usage`` with usage objects that only offer ``to_dict()``.
+
+    The top-level usage payload and its nested ``cache_creation`` and
+    ``server_tool_use`` values are all wrapped in ``ToDictOnly`` instances.
+    """
+    # ``__slots__`` means instances carry no ``__dict__``; the payload is
+    # exposed through ``to_dict()`` only.
+    class ToDictOnly:
+        __slots__ = ("_payload",)
+
+        def __init__(self, payload):
+            self._payload = payload
+
+        def to_dict(self):
+            return self._payload
+
+    usage = ToDictOnly(
+        {
+            "input_tokens": 11,
+            "output_tokens": 7,
+            "cache_read_input_tokens": 3,
+            "cache_creation": ToDictOnly(
+                {
+                    "ephemeral_5m_input_tokens": 2,
+                    "ephemeral_1h_input_tokens": 5,
+                }
+            ),
+            "server_tool_use": ToDictOnly(
+                {
+                    "web_search_requests": 2,
+                    "web_fetch_requests": 1,
+                }
+            ),
+            "service_tier": "standard",
+        }
+    )
+
+    metrics, metadata = extract_anthropic_usage(usage)
+
+    # NOTE(review): 21.0 equals 11 + 3 + (2 + 5) and 28.0 equals 21 + 7 —
+    # presumably the extractor sums input, cache-read and cache-creation
+    # tokens; confirm against extract_anthropic_usage.
+    assert metrics == {
+        "prompt_tokens": 21.0,
+        "completion_tokens": 7.0,
+        "prompt_cached_tokens": 3.0,
+        "prompt_cache_creation_tokens": 7.0,
+        "server_tool_use_web_search_requests": 2.0,
+        "server_tool_use_web_fetch_requests": 1.0,
+        "tokens": 28.0,
+    }
+    # Non-metric details from the nested payloads are expected in metadata.
+    assert metadata == {
+        "cache_creation_ephemeral_5m_input_tokens": 2,
+        "cache_creation_ephemeral_1h_input_tokens": 5,
+        "usage_service_tier": "standard",
+    }
+
+
 @pytest.mark.vcr(match_on=["method", "scheme", "host", "port", "path"])
 def test_anthropic_messages_create_with_image_attachment_input(memory_logger):
     assert not memory_logger.pop()