feat(integrations): attach speech outputs for litellm and openai

AbhiPrasad · AbhiPrasad · commit f10c4e653dc6 · 2026-04-13T11:30:55.000-04:00
Add LiteLLM speech/aspeech tracing with VCR coverage and materialize generated audio as Braintrust attachments.

Share audio MIME inference and output attachment helpers in integrations.utils, then reuse them for OpenAI speech so both integrations emit the same attachment-backed span shape.
diff --git a/py/src/braintrust/integrations/litellm/__init__.py b/py/src/braintrust/integrations/litellm/__init__.py
@@ -10,9 +10,9 @@ def patch_litellm() -> bool:
     This wraps litellm.completion, litellm.acompletion, litellm.responses,
     litellm.aresponses, litellm.image_generation, litellm.aimage_generation,
     litellm.embedding, litellm.aembedding, litellm.moderation,
-    litellm.transcription, and litellm.atranscription to automatically
-    create Braintrust spans with detailed token metrics,
-    timing, and costs.
+    litellm.speech, litellm.aspeech, litellm.transcription, and
+    litellm.atranscription to automatically create Braintrust spans with
+    detailed token metrics, timing, and costs.
 
     Returns:
         True if LiteLLM was patched (or already patched), False if LiteLLM is not installed.
diff --git a/py/src/braintrust/integrations/litellm/cassettes/test_litellm_aspeech.yaml b/py/src/braintrust/integrations/litellm/cassettes/test_litellm_aspeech.yaml
diff --git a/py/src/braintrust/integrations/litellm/cassettes/test_litellm_speech.yaml b/py/src/braintrust/integrations/litellm/cassettes/test_litellm_speech.yaml
diff --git a/py/src/braintrust/integrations/litellm/patchers.py b/py/src/braintrust/integrations/litellm/patchers.py
@@ -9,12 +9,14 @@
     _aembedding_wrapper_async,
     _aimage_generation_wrapper_async,
     _aresponses_wrapper_async,
+    _aspeech_wrapper_async,
     _atranscription_wrapper_async,
     _completion_wrapper,
     _embedding_wrapper,
     _image_generation_wrapper,
     _moderation_wrapper,
     _responses_wrapper,
+    _speech_wrapper,
     _transcription_wrapper,
 )
 
@@ -78,6 +80,18 @@ class LiteLLMModerationPatcher(FunctionWrapperPatcher):
     wrapper = _moderation_wrapper
 
 
+class LiteLLMSpeechPatcher(FunctionWrapperPatcher):
+    """Patcher entry pairing the ``speech`` target path with ``_speech_wrapper``."""
+
+    name = "litellm.speech"
+    target_path = "speech"
+    wrapper = _speech_wrapper
+
+
+class LiteLLMAspeechPatcher(FunctionWrapperPatcher):
+    """Patcher entry pairing the ``aspeech`` target path with ``_aspeech_wrapper_async``."""
+
+    name = "litellm.aspeech"
+    target_path = "aspeech"
+    wrapper = _aspeech_wrapper_async
+
+
 class LiteLLMTranscriptionPatcher(FunctionWrapperPatcher):
     name = "litellm.transcription"
     target_path = "transcription"
@@ -104,6 +118,8 @@ class LiteLLMATranscriptionPatcher(FunctionWrapperPatcher):
     LiteLLMEmbeddingPatcher,
     LiteLLMAembeddingPatcher,
     LiteLLMModerationPatcher,
+    LiteLLMSpeechPatcher,
+    LiteLLMAspeechPatcher,
     LiteLLMTranscriptionPatcher,
     LiteLLMATranscriptionPatcher,
 )
@@ -122,8 +138,8 @@ def wrap_litellm(litellm: Any) -> Any:
     that exposes the same top-level callables such as ``completion``,
     ``acompletion``, ``responses``, ``aresponses``, ``image_generation``,
     ``aimage_generation``, ``embedding``, ``aembedding``, ``moderation``,
-    ``transcription``, and ``atranscription``). Each patcher is applied
-    idempotently — calling
+    ``speech``, ``aspeech``, ``transcription``, and ``atranscription``).
+    Each patcher is applied idempotently — calling
     ``wrap_litellm`` twice on the same object is safe.
 
     Args:
diff --git a/py/src/braintrust/integrations/litellm/test_litellm.py b/py/src/braintrust/integrations/litellm/test_litellm.py
@@ -18,6 +18,15 @@
 TEST_AUDIO_FILE = os.path.join(os.path.dirname(__file__), "..", "..", "fixtures", "test_audio.wav")
 
 
+def _assert_speech_output_attachment(span) -> None:
+    """Assert that *span* logged its speech output as an audio attachment.
+
+    Checks the ``output`` payload type and byte count, then that the nested
+    ``file.file_data`` entry is an ``Attachment`` whose reference carries an
+    ``audio/*`` content type and a ``generated_speech*`` filename.
+    """
+    output = span["output"]
+    assert output["type"] == "audio"
+    assert output["audio_size_bytes"] > 0
+
+    file_data = output["file"]["file_data"]
+    assert isinstance(file_data, Attachment)
+
+    reference = file_data.reference
+    assert reference["content_type"].startswith("audio/")
+    assert reference["filename"].startswith("generated_speech")
+
+
 @pytest.fixture(autouse=True)
 def _patch():
     patch_litellm()
@@ -402,6 +411,57 @@ async def test_litellm_atranscription(memory_logger):
     assert span["output"] == "you"
 
 
+@pytest.mark.vcr
+def test_litellm_speech(memory_logger):
+    """``litellm.speech`` logs exactly one span carrying an audio attachment."""
+    assert not memory_logger.pop()
+
+    request = dict(
+        model="tts-1",
+        voice="alloy",
+        input="Hello, this is a test.",
+        response_format="mp3",
+    )
+    response = litellm.speech(**request)
+
+    assert response
+    assert response.content
+
+    spans = memory_logger.pop()
+    assert len(spans) == 1
+    span = spans[0]
+
+    metadata = span["metadata"]
+    expected_metadata = {
+        "model": "tts-1",
+        "voice": "alloy",
+        "response_format": "mp3",
+        "provider": "litellm",
+    }
+    for key, value in expected_metadata.items():
+        assert metadata[key] == value
+
+    assert span["input"] == "Hello, this is a test."
+    _assert_speech_output_attachment(span)
+
+
+@pytest.mark.vcr
+@pytest.mark.asyncio
+async def test_litellm_aspeech(memory_logger):
+    """``litellm.aspeech`` logs exactly one span carrying an audio attachment."""
+    assert not memory_logger.pop()
+
+    request = dict(
+        model="tts-1",
+        voice="alloy",
+        input="Hello, this is a test.",
+        response_format="mp3",
+    )
+    response = await litellm.aspeech(**request)
+
+    assert response
+    assert response.content
+
+    spans = memory_logger.pop()
+    assert len(spans) == 1
+    span = spans[0]
+
+    metadata = span["metadata"]
+    expected_metadata = {
+        "model": "tts-1",
+        "voice": "alloy",
+        "response_format": "mp3",
+        "provider": "litellm",
+    }
+    for key, value in expected_metadata.items():
+        assert metadata[key] == value
+
+    assert span["input"] == "Hello, this is a test."
+    _assert_speech_output_attachment(span)
+
+
 @pytest.mark.vcr
 @pytest.mark.asyncio
 async def test_litellm_acompletion_with_system_prompt(memory_logger):
diff --git a/py/src/braintrust/integrations/litellm/tracing.py b/py/src/braintrust/integrations/litellm/tracing.py
@@ -6,6 +6,7 @@
 from typing import Any
 
 from braintrust.integrations.utils import (
+    _extract_audio_output,
     _materialize_attachment,
     _parse_openai_usage_metrics,
     _prettify_response_params,
@@ -441,6 +442,46 @@ def _moderation_wrapper(wrapped, instance, args, kwargs):
         return moderation_response
 
 
+def _speech_wrapper(wrapped, instance, args, kwargs):
+    """wrapt wrapper for litellm.speech.
+
+    Runs the wrapped call inside a span named "Speech" (LLM span type), then
+    logs timing metrics and the audio output extracted from the response.
+    The wrapped call's return value is passed through unchanged.
+    """
+    span_payload = _update_span_payload_from_params(kwargs, input_key="input")
+    span_kwargs = merge_dicts(
+        dict(name="Speech", span_attributes={"type": SpanTypeAttribute.LLM}),
+        span_payload,
+    )
+
+    with start_span(**span_kwargs) as span:
+        started_at = time.time()
+        result = wrapped(*args, **kwargs)
+
+        # Take the end timestamp before touching the audio payload so that
+        # attachment handling is not counted in the timing metrics.
+        timing = _timing_metrics(started_at, time.time())
+        audio_output = _extract_audio_output(
+            result,
+            response_format=kwargs.get("response_format"),
+            prefix="generated_speech",
+        )
+        span.log(metrics=timing, output=audio_output)
+        return result
+
+
+async def _aspeech_wrapper_async(wrapped, instance, args, kwargs):
+    """wrapt wrapper for litellm.aspeech.
+
+    Awaits the wrapped call inside a span named "Speech" (LLM span type), then
+    logs timing metrics and the audio output extracted from the response.
+    The awaited result is passed through unchanged.
+    """
+    span_payload = _update_span_payload_from_params(kwargs, input_key="input")
+    span_kwargs = merge_dicts(
+        dict(name="Speech", span_attributes={"type": SpanTypeAttribute.LLM}),
+        span_payload,
+    )
+
+    with start_span(**span_kwargs) as span:
+        started_at = time.time()
+        result = await wrapped(*args, **kwargs)
+
+        # Take the end timestamp before touching the audio payload so that
+        # attachment handling is not counted in the timing metrics.
+        timing = _timing_metrics(started_at, time.time())
+        audio_output = _extract_audio_output(
+            result,
+            response_format=kwargs.get("response_format"),
+            prefix="generated_speech",
+        )
+        span.log(metrics=timing, output=audio_output)
+        return result
+
+
 def _transcription_wrapper(wrapped, instance, args, kwargs):
     """wrapt wrapper for litellm.transcription."""
     updated_span_payload = _update_audio_span_payload_from_params(kwargs)
diff --git a/py/src/braintrust/integrations/openai/test_openai.py b/py/src/braintrust/integrations/openai/test_openai.py
@@ -1933,6 +1933,15 @@ def _assert_audio_input_attachment(span) -> None:
     assert span["input"]["file"].reference["content_type"].startswith("audio/")
 
 
+def _assert_audio_output_attachment(span) -> None:
+    """Assert that *span* logged its audio output as an attachment.
+
+    Checks the ``output`` payload type and byte count, then that the nested
+    ``file.file_data`` entry is an ``Attachment`` whose reference carries an
+    ``audio/*`` content type and a ``generated_speech*`` filename.
+    """
+    output = span["output"]
+    assert output["type"] == "audio"
+    assert output["audio_size_bytes"] > 0
+
+    file_data = output["file"]["file_data"]
+    assert isinstance(file_data, Attachment)
+
+    reference = file_data.reference
+    assert reference["content_type"].startswith("audio/")
+    assert reference["filename"].startswith("generated_speech")
+
+
 def _write_test_png(path: str, *, width: int = 64, height: int = 64) -> None:
     """Write a simple opaque red RGBA PNG without external dependencies."""
 
@@ -2067,7 +2076,7 @@ def test_openai_audio_speech(memory_logger):
     assert span["metadata"]["voice"] == "alloy"
     assert span["metadata"]["provider"] == "openai"
     assert span["input"] == "Hello, this is a test."
-    assert span["output"] == {"type": "audio"}
+    _assert_audio_output_attachment(span)
 
 
 @pytest.mark.vcr
@@ -2175,7 +2184,7 @@ async def test_openai_audio_speech_async(memory_logger):
         assert span["metadata"]["voice"] == "alloy"
         assert span["metadata"]["provider"] == "openai"
         assert span["input"] == "Hello, this is a test."
-        assert span["output"] == {"type": "audio"}
+        _assert_audio_output_attachment(span)
 
 
 @pytest.mark.asyncio
diff --git a/py/src/braintrust/integrations/openai/tracing.py b/py/src/braintrust/integrations/openai/tracing.py
@@ -8,6 +8,7 @@
 from typing import Any
 
 from braintrust.integrations.utils import (
+    _extract_audio_output,
     _materialize_attachment,
     _parse_openai_usage_metrics,
     _prettify_response_params,
@@ -1276,7 +1277,7 @@ def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..
         super().__init__(create_fn, acreate_fn, "Speech")
 
     def process_output(self, response: Any, span: Span):
-        span.log(output={"type": "audio"})
+        span.log(output=_extract_audio_output(response, prefix="generated_speech"))
 
 
 class _AudioFileWrapper(BaseWrapper):
diff --git a/py/src/braintrust/integrations/test_utils.py b/py/src/braintrust/integrations/test_utils.py
@@ -5,6 +5,8 @@
 from braintrust.integrations.utils import (
     _attachment_filename_for_mime_type,
     _camel_to_snake,
+    _extract_audio_output,
+    _infer_audio_mime_type,
     _is_supported_metric_value,
     _log_and_end_span,
     _log_error_and_end_span,
@@ -301,6 +303,42 @@ def test_materialize_attachment_returns_none_for_non_data_url_strings():
     assert _materialize_attachment("https://example.com/image.png") is None
 
 
+def test_infer_audio_mime_type_prefers_response_headers():
+    """The raw response's content-type header wins over ``response_format``."""
+    headers = {"content-type": "audio/mpeg; charset=binary"}
+    response = unittest.mock.Mock(response=unittest.mock.Mock(headers=headers))
+
+    inferred = _infer_audio_mime_type(response, response_format="wav")
+
+    assert inferred == "audio/mpeg"
+
+
+def test_extract_audio_output_materializes_attachment_from_binary_response():
+    """A response exposing ``content`` bytes yields an attachment-backed output."""
+    payload = b"audio-bytes"
+    response = unittest.mock.Mock(
+        content=payload,
+        response=unittest.mock.Mock(headers={"content-type": "audio/mpeg"}),
+    )
+
+    output = _extract_audio_output(response, prefix="generated_speech")
+
+    assert output["type"] == "audio"
+    assert output["mime_type"] == "audio/mpeg"
+    assert output["audio_size_bytes"] == len(payload)
+
+    attachment = output["file"]["file_data"]
+    assert isinstance(attachment, Attachment)
+
+    reference = attachment.reference
+    assert reference["content_type"] == "audio/mpeg"
+    assert reference["filename"] == "generated_speech.mp3"
+
+
+def test_extract_audio_output_supports_mapping_with_raw_response_only():
+    """A mapping holding only a raw ``response`` still yields an attachment."""
+    payload = b"wave"
+    raw_response = unittest.mock.Mock(
+        headers={"content-type": "audio/wav"},
+        content=payload,
+    )
+
+    output = _extract_audio_output({"response": raw_response}, prefix="generated_speech")
+
+    assert output["type"] == "audio"
+    assert output["mime_type"] == "audio/wav"
+    assert output["audio_size_bytes"] == len(payload)
+
+    attachment = output["file"]["file_data"]
+    assert isinstance(attachment, Attachment)
+
+    reference = attachment.reference
+    assert reference["content_type"] == "audio/wav"
+    assert reference["filename"] == "generated_speech.wav"
+
+
 def test_serialize_response_format_with_pydantic_basemodel_subclass():
     pydantic = pytest.importorskip("pydantic")
 
diff --git a/py/src/braintrust/integrations/utils.py b/py/src/braintrust/integrations/utils.py
@@ -343,6 +343,72 @@ def _materialize_attachment(
     return None
 
 
+# Known ``response_format`` values and the MIME type each one maps to.
+_AUDIO_FORMAT_TO_MIME_TYPE = {
+    "mp3": "audio/mpeg",
+    "wav": "audio/wav",
+    "opus": "audio/opus",
+    "aac": "audio/aac",
+    "flac": "audio/flac",
+    "pcm": "audio/pcm",
+}
+
+
+def _infer_audio_mime_type(response: Any, response_format: Any = None) -> str:
+    """Return the best-guess MIME type for an audio *response*.
+
+    Resolution order:
+
+    1. The ``content-type`` header of the raw response, read from
+       ``response.response`` (or ``response["response"]`` when *response* is
+       a mapping). Parameters after ``;`` are dropped and the media type is
+       lower-cased.
+    2. *response_format*, mapped through ``_AUDIO_FORMAT_TO_MIME_TYPE``. A
+       value that already contains ``/`` is used as-is; any other value
+       becomes ``audio/<format>``.
+    3. ``"application/octet-stream"``.
+
+    Args:
+        response: Provider response object or mapping.
+        response_format: Requested audio format, if any. Non-string or empty
+            values are ignored.
+
+    Returns:
+        A non-empty MIME type string.
+    """
+    raw_response = getattr(response, "response", None)
+    if raw_response is None and isinstance(response, Mapping):
+        raw_response = response.get("response")
+
+    headers = getattr(raw_response, "headers", None)
+    # Only header containers exposing ``get`` are consulted; anything else is
+    # skipped so MIME inference never raises inside tracing code.
+    header_get = getattr(headers, "get", None)
+    if callable(header_get):
+        # Plain dicts are case-sensitive, so also try the canonical spelling.
+        content_type = header_get("content-type") or header_get("Content-Type")
+        if isinstance(content_type, str):
+            media_type = content_type.split(";", 1)[0].strip().lower()
+            # A header such as "; charset=binary" carries no media type; fall
+            # through to ``response_format`` instead of returning "".
+            if media_type:
+                return media_type
+
+    if isinstance(response_format, str) and response_format:
+        normalized = response_format.lower()
+        return _AUDIO_FORMAT_TO_MIME_TYPE.get(
+            normalized,
+            normalized if "/" in normalized else f"audio/{normalized}",
+        )
+
+    return "application/octet-stream"
+
+
+def _extract_audio_output(
+    response: Any,
+    *,
+    response_format: Any = None,
+    prefix: str = "generated_audio",
+) -> dict[str, Any]:
+    """Build the span ``output`` payload for an audio *response*.
+
+    Audio bytes are read from ``response.content``; when that is not
+    ``bytes``/``bytearray`` and *response* is a mapping, they are read from
+    ``response["response"].content`` instead.
+
+    Returns:
+        ``{"type": "audio"}`` when no audio bytes are found. Otherwise a dict
+        with ``type``, ``mime_type`` and ``audio_size_bytes``, extended with
+        the attachment's multimodal part payload when
+        ``_materialize_attachment`` returns an attachment.
+    """
+    payload = getattr(response, "content", None)
+    if isinstance(response, Mapping) and not isinstance(payload, (bytes, bytearray)):
+        payload = getattr(response.get("response"), "content", None)
+
+    if not isinstance(payload, (bytes, bytearray)):
+        return {"type": "audio"}
+
+    inferred_mime_type = _infer_audio_mime_type(response, response_format)
+    attachment = _materialize_attachment(
+        payload,
+        mime_type=inferred_mime_type,
+        prefix=prefix,
+    )
+
+    output: dict[str, Any] = {
+        "type": "audio",
+        # Prefer the MIME type resolved by the attachment when one was built.
+        "mime_type": inferred_mime_type if attachment is None else attachment.mime_type,
+        "audio_size_bytes": len(payload),
+    }
+    if attachment is not None:
+        output.update(attachment.multimodal_part_payload)
+    return output
+
+
 def _is_not_given(value: object) -> bool:
     """Return ``True`` when *value* is a provider ``NOT_GIVEN`` sentinel.