diff --git a/py/src/braintrust/integrations/anthropic/test_anthropic.py b/py/src/braintrust/integrations/anthropic/test_anthropic.py index bcbee1bd..b363fb1e 100644 --- a/py/src/braintrust/integrations/anthropic/test_anthropic.py +++ b/py/src/braintrust/integrations/anthropic/test_anthropic.py @@ -5,11 +5,13 @@ import time import unittest.mock from pathlib import Path +from types import SimpleNamespace import anthropic import pytest from braintrust import logger from braintrust.integrations.anthropic import AnthropicIntegration, wrap_anthropic +from braintrust.integrations.anthropic.tracing import _log_message_to_span from braintrust.test_helpers import init_test_logger @@ -37,6 +39,43 @@ def memory_logger(): yield bgl +def test_log_message_to_span_includes_stop_reason_and_stop_sequence(): + span = unittest.mock.MagicMock() + message = SimpleNamespace( + role="assistant", + content=[{"type": "text", "text": "done"}], + model=MODEL, + stop_reason="stop_sequence", + stop_sequence="DONE", + usage={ + "input_tokens": 11, + "output_tokens": 7, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + }, + ) + + _log_message_to_span(message, span, time_to_first_token=0.123) + + span.log.assert_called_once_with( + output={ + "role": "assistant", + "content": [{"type": "text", "text": "done"}], + "model": MODEL, + "stop_reason": "stop_sequence", + "stop_sequence": "DONE", + }, + metrics={ + "prompt_tokens": 11.0, + "completion_tokens": 7.0, + "prompt_cached_tokens": 0.0, + "prompt_cache_creation_tokens": 0.0, + "tokens": 18.0, + "time_to_first_token": 0.123, + }, + ) + + @pytest.mark.vcr def test_anthropic_messages_create_stream_true(memory_logger): assert not memory_logger.pop() @@ -351,6 +390,8 @@ def test_anthropic_messages_sync(memory_logger): metrics = log["metrics"] _assert_metrics_are_valid(metrics, start, end) assert log["metadata"]["model"] == MODEL + assert log["output"]["model"] == msg.model + assert log["output"]["stop_reason"] == msg.stop_reason def _assert_metrics_are_valid(metrics, start, end): diff --git a/py/src/braintrust/integrations/anthropic/tracing.py b/py/src/braintrust/integrations/anthropic/tracing.py index 9f5e737e..15974cf4 100644 --- a/py/src/braintrust/integrations/anthropic/tracing.py +++ b/py/src/braintrust/integrations/anthropic/tracing.py @@ -452,8 +452,14 @@ def _log_message_to_span(message, span, time_to_first_token: float | None = None output = { k: v - for k, v in {"role": getattr(message, "role", None), "content": getattr(message, "content", None)}.items() - if v + for k, v in { + "role": getattr(message, "role", None), + "content": getattr(message, "content", None), + "model": getattr(message, "model", None), + "stop_reason": getattr(message, "stop_reason", None), + "stop_sequence": getattr(message, "stop_sequence", None), + }.items() + if v is not None } or None span.log(output=output, metrics=metrics)