Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .agents/skills/sdk-integrations/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,14 @@ Do not start by wiring patchers and only later asking what the logged span shoul

Keep provider-local code inside `py/src/braintrust/integrations/<provider>/`.

If tracing or normalization logic is genuinely shared across multiple integrations, prefer adding it to `py/src/braintrust/integrations/utils.py` instead of copying it into each provider package. Avoid duplicating code between integrations unless there is a clear provider-specific reason the behavior must diverge.

Typical file ownership:

- `__init__.py`: export the integration class, `setup_<provider>()`, and public `wrap_*()` helpers
- `integration.py`: define the `BaseIntegration` subclass and register patchers
- `patchers.py`: define patchers and manual `wrap_*()` helpers
- `tracing.py`: keep provider-specific tracing, stream handling, normalization, and metadata extraction
- `tracing.py`: keep provider-specific tracing, stream handling, normalization, and metadata extraction; move cross-integration helpers to `py/src/braintrust/integrations/utils.py`
- `test_*.py`: keep provider behavior tests next to the integration
- `cassettes/`: keep VCR recordings next to the integration tests when the provider uses HTTP

Expand Down
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ Root `Makefile` exists as a convenience wrapper. The authoritative SDK workflow

`py/noxfile.py` is the source of truth for compatibility coverage.

Testing preferences:

- Prefer VCR-backed integration tests with checked-in cassettes whenever practical.
- Avoid mocks, fakes, and heavily synthetic tests unless there is no reasonable cassette-based alternative or the code under test is truly internal/purely local.
- When fixing a bug or issue, default to a red/green workflow: first add or update a test that reproduces the problem and fails, then implement the fix, unless the user explicitly asks for a different approach.

Key facts:

- `test_core` runs without optional vendor packages.
Expand All @@ -87,6 +93,8 @@ When changing behavior, run the narrowest affected session first, then expand on

## VCR

VCR/cassette coverage is the default and preferred testing strategy for provider and integration behavior in this repo. Reach for cassette-backed tests before introducing mocks or fakes, and keep new coverage aligned with the existing VCR patterns unless there is a strong reason not to.

VCR cassette directories:

- `py/src/braintrust/cassettes/`
Expand Down Expand Up @@ -162,6 +170,7 @@ Avoid editing `py/src/braintrust/version.py` while also running build commands.

- Keep tests near the code they cover.
- Reuse existing fixtures and cassette patterns.
- Prefer extending an existing cassette-backed test over adding a new mock-heavy test.
- If a change affects examples or integrations, update the nearest example or focused test.
- For CLI/devserver changes, consider whether wheel-mode behavior also needs coverage.
- Do **not** add `from __future__ import annotations` unless it is absolutely required (e.g., a genuine forward-reference that cannot be resolved any other way). This import changes annotation evaluation semantics at runtime and can silently break `get_type_hints()`, Pydantic models, and other runtime introspection. Prefer quoted string literals (`"MyClass"`) or `TYPE_CHECKING` guards for forward references instead.
18 changes: 17 additions & 1 deletion py/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
nox -h Get help.
"""

import functools
import glob
import os
import pathlib
Expand Down Expand Up @@ -476,14 +477,29 @@ def _get_braintrust_wheel():
return wheels[0]


@functools.cache
def _integration_subdirs_to_ignore() -> list[str]:
    """List per-provider integration subdirectories to exclude from core runs.

    Each provider package under ``src/braintrust/integrations/`` gets its own
    dedicated nox session, so those directories are skipped here. Files that
    live directly in ``src/braintrust/integrations/`` (shared utils tests,
    versioning tests, etc.) are intentionally NOT listed, so they still run
    as part of ``test_core``.
    """
    integrations_root = pathlib.Path("src") / INTEGRATION_DIR
    subdirs = []
    for entry in integrations_root.iterdir():
        # Only real package directories count; skip files and bytecode caches.
        if not entry.is_dir() or entry.name == "__pycache__":
            continue
        subdirs.append(f"{INTEGRATION_DIR}/{entry.name}")
    return subdirs


def _run_core_tests(session):
"""Run all tests which don't require optional dependencies."""
_run_tests(
session,
SRC_DIR,
ignore_paths=[
WRAPPER_DIR,
INTEGRATION_DIR,
*_integration_subdirs_to_ignore(),
CONTRIB_DIR,
DEVSERVER_DIR,
],
Expand Down
21 changes: 9 additions & 12 deletions py/src/braintrust/integrations/agentscope/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@

from braintrust.logger import start_span
from braintrust.span_types import SpanTypeAttribute


def _clean(mapping: dict[str, Any]) -> dict[str, Any]:
return {key: value for key, value in mapping.items() if value is not None}
from braintrust.util import clean_nones


def _args_kwargs_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
return _clean(
return clean_nones(
{
"args": list(args) if args else None,
"kwargs": kwargs if kwargs else None,
Expand All @@ -34,7 +31,7 @@ def _pipeline_metadata(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
if agents:
agent_names = [getattr(agent, "name", agent.__class__.__name__) for agent in agents]

return _clean({"agent_names": agent_names})
return clean_nones({"agent_names": agent_names})


def _extract_metrics(*candidates: Any) -> dict[str, float] | None:
Expand Down Expand Up @@ -69,7 +66,7 @@ def _model_provider_name(instance: Any) -> str:


def _model_metadata(instance: Any) -> dict[str, Any]:
return _clean(
return clean_nones(
{
"model": getattr(instance, "model_name", None),
"provider": _model_provider_name(instance),
Expand All @@ -95,7 +92,7 @@ def _model_call_input(args: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
if structured_model is None and len(args) > 3:
structured_model = args[3]

return _clean(
return clean_nones(
{
"messages": messages,
"tools": tools,
Expand Down Expand Up @@ -125,7 +122,7 @@ def _model_call_output(result: Any) -> Any:
else:
return result

normalized = _clean(
normalized = clean_nones(
{
"role": "assistant" if data.get("content") is not None else None,
"content": data.get("content"),
Expand Down Expand Up @@ -178,7 +175,7 @@ async def _wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any

_agent_call_wrapper = _make_task_wrapper(
name_fn=lambda instance, _a, _k: f"{_agent_name(instance)}.reply",
metadata_fn=lambda instance, _a, _k: _clean({"agent_class": instance.__class__.__name__}),
metadata_fn=lambda instance, _a, _k: clean_nones({"agent_class": instance.__class__.__name__}),
)

_sequential_pipeline_wrapper = _make_task_wrapper(
Expand Down Expand Up @@ -224,13 +221,13 @@ async def _toolkit_call_tool_function_wrapper(wrapped: Any, instance: Any, args:
start_span(
name=f"{tool_name}.execute",
type=SpanTypeAttribute.TOOL,
input=_clean(
input=clean_nones(
{
"tool_name": tool_name,
"tool_call": tool_call,
}
),
metadata=_clean({"toolkit_class": instance.__class__.__name__}),
metadata=clean_nones({"toolkit_class": instance.__class__.__name__}),
)
)
try:
Expand Down
23 changes: 1 addition & 22 deletions py/src/braintrust/integrations/agno/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from inspect import isawaitable
from typing import Any

from braintrust.integrations.utils import _try_to_dict
from braintrust.logger import start_span
from braintrust.span_types import SpanTypeAttribute
from braintrust.util import is_numeric
Expand All @@ -24,28 +25,6 @@ def get_args_kwargs(args: list[str], kwargs: dict[str, Any], keys: list[str]):
return {k: args[i] if args else kwargs.get(k) for i, k in enumerate(keys)}, omit(kwargs, keys)


def _try_to_dict(obj: Any) -> Any:
"""Convert object to dict, handling different object types like OpenAI wrapper."""
if isinstance(obj, dict):
return obj
if hasattr(obj, "model_dump") and callable(obj.model_dump):
try:
return obj.model_dump()
except Exception:
pass
if hasattr(obj, "dict") and callable(obj.dict):
try:
return obj.dict()
except Exception:
pass
if hasattr(obj, "__dict__"):
try:
return obj.__dict__.copy()
except Exception:
pass
return obj


def is_sync_iterator(result: Any) -> bool:
return hasattr(result, "__iter__") and hasattr(result, "__next__")

Expand Down
28 changes: 8 additions & 20 deletions py/src/braintrust/integrations/anthropic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import Any

from braintrust.integrations.utils import _try_to_dict as _shared_try_to_dict
from braintrust.util import is_numeric


Expand Down Expand Up @@ -36,27 +37,14 @@ def __getattr__(self, name: str) -> Any:


def _try_to_dict(obj: Any) -> dict[str, Any] | None:
if isinstance(obj, dict):
return obj

if hasattr(obj, "model_dump"):
try:
candidate = obj.model_dump(mode="python")
except TypeError:
candidate = obj.model_dump()
return candidate if isinstance(candidate, dict) else None

if hasattr(obj, "to_dict"):
candidate = obj.to_dict()
return candidate if isinstance(candidate, dict) else None

if hasattr(obj, "dict"):
candidate = obj.dict()
return candidate if isinstance(candidate, dict) else None

if hasattr(obj, "__dict__"):
return vars(obj)
"""Anthropic-flavoured object→dict conversion.

Delegates to the shared ``_try_to_dict`` first, then returns ``None``
(instead of the original object) when conversion fails.
"""
result = _shared_try_to_dict(obj)
if isinstance(result, dict):
return result
return None


Expand Down
49 changes: 49 additions & 0 deletions py/src/braintrust/integrations/anthropic/test_anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,55 @@ def test_extract_anthropic_usage_includes_server_tool_use_metrics_from_objects()
assert metadata == {}


def test_extract_anthropic_usage_supports_to_dict_only_objects():
    # Usage objects that expose ONLY ``to_dict`` (no ``model_dump`` and,
    # thanks to ``__slots__``, no ``__dict__``) must still be unpacked into
    # metrics and metadata, including nested to_dict-only sub-objects.
    class ToDictOnly:
        __slots__ = ("_payload",)

        def __init__(self, payload):
            self._payload = payload

        def to_dict(self):
            return self._payload

    cache_creation = ToDictOnly(
        {
            "ephemeral_5m_input_tokens": 2,
            "ephemeral_1h_input_tokens": 5,
        }
    )
    server_tool_use = ToDictOnly(
        {
            "web_search_requests": 2,
            "web_fetch_requests": 1,
        }
    )
    usage = ToDictOnly(
        {
            "input_tokens": 11,
            "output_tokens": 7,
            "cache_read_input_tokens": 3,
            "cache_creation": cache_creation,
            "server_tool_use": server_tool_use,
            "service_tier": "standard",
        }
    )

    metrics, metadata = extract_anthropic_usage(usage)

    expected_metrics = {
        "prompt_tokens": 21.0,
        "completion_tokens": 7.0,
        "prompt_cached_tokens": 3.0,
        "prompt_cache_creation_tokens": 7.0,
        "server_tool_use_web_search_requests": 2.0,
        "server_tool_use_web_fetch_requests": 1.0,
        "tokens": 28.0,
    }
    expected_metadata = {
        "cache_creation_ephemeral_5m_input_tokens": 2,
        "cache_creation_ephemeral_1h_input_tokens": 5,
        "usage_service_tier": "standard",
    }
    assert metrics == expected_metrics
    assert metadata == expected_metadata


@pytest.mark.vcr(match_on=["method", "scheme", "host", "port", "path"])
def test_anthropic_messages_create_with_image_attachment_input(memory_logger):
assert not memory_logger.pop()
Expand Down
Loading
Loading