Skip to content

Commit f10c4e6

Browse files
committed
feat(integrations): attach speech outputs for litellm and openai
Add LiteLLM speech/aspeech tracing with VCR coverage and materialize generated audio as Braintrust attachments. Share audio MIME inference and output attachment helpers in integrations.utils, then reuse them for OpenAI speech so both integrations emit the same attachment-backed span shape.
1 parent cf4f32d commit f10c4e6

10 files changed

Lines changed: 1351 additions & 8 deletions

File tree

py/src/braintrust/integrations/litellm/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ def patch_litellm() -> bool:
1010
This wraps litellm.completion, litellm.acompletion, litellm.responses,
1111
litellm.aresponses, litellm.image_generation, litellm.aimage_generation,
1212
litellm.embedding, litellm.aembedding, litellm.moderation,
13-
litellm.transcription, and litellm.atranscription to automatically
14-
create Braintrust spans with detailed token metrics,
15-
timing, and costs.
13+
litellm.speech, litellm.aspeech, litellm.transcription, and
14+
litellm.atranscription to automatically create Braintrust spans with
15+
detailed token metrics, timing, and costs.
1616
1717
Returns:
1818
True if LiteLLM was patched (or already patched), False if LiteLLM is not installed.

py/src/braintrust/integrations/litellm/cassettes/test_litellm_aspeech.yaml

Lines changed: 556 additions & 0 deletions
Large diffs are not rendered by default.

py/src/braintrust/integrations/litellm/cassettes/test_litellm_speech.yaml

Lines changed: 556 additions & 0 deletions
Large diffs are not rendered by default.

py/src/braintrust/integrations/litellm/patchers.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
_aembedding_wrapper_async,
1010
_aimage_generation_wrapper_async,
1111
_aresponses_wrapper_async,
12+
_aspeech_wrapper_async,
1213
_atranscription_wrapper_async,
1314
_completion_wrapper,
1415
_embedding_wrapper,
1516
_image_generation_wrapper,
1617
_moderation_wrapper,
1718
_responses_wrapper,
19+
_speech_wrapper,
1820
_transcription_wrapper,
1921
)
2022

@@ -78,6 +80,18 @@ class LiteLLMModerationPatcher(FunctionWrapperPatcher):
7880
wrapper = _moderation_wrapper
7981

8082

83+
class LiteLLMSpeechPatcher(FunctionWrapperPatcher):
    """Patcher declaration that pairs ``litellm.speech`` with ``_speech_wrapper``.

    NOTE(review): how ``FunctionWrapperPatcher`` consumes these attributes is
    defined outside this hunk; presumably ``target_path`` is resolved on the
    ``litellm`` module object — confirm against the base class.
    """

    # Identifier for this patcher.
    name = "litellm.speech"
    # Name of the callable to wrap.
    target_path = "speech"
    # Wrapper applied around the target callable.
    wrapper = _speech_wrapper
87+
88+
89+
class LiteLLMAspeechPatcher(FunctionWrapperPatcher):
    """Patcher declaration that pairs ``litellm.aspeech`` with ``_aspeech_wrapper_async``.

    NOTE(review): how ``FunctionWrapperPatcher`` consumes these attributes is
    defined outside this hunk; presumably ``target_path`` is resolved on the
    ``litellm`` module object — confirm against the base class.
    """

    # Identifier for this patcher.
    name = "litellm.aspeech"
    # Name of the coroutine function to wrap.
    target_path = "aspeech"
    # Async wrapper applied around the target callable.
    wrapper = _aspeech_wrapper_async
93+
94+
8195
class LiteLLMTranscriptionPatcher(FunctionWrapperPatcher):
8296
name = "litellm.transcription"
8397
target_path = "transcription"
@@ -104,6 +118,8 @@ class LiteLLMATranscriptionPatcher(FunctionWrapperPatcher):
104118
LiteLLMEmbeddingPatcher,
105119
LiteLLMAembeddingPatcher,
106120
LiteLLMModerationPatcher,
121+
LiteLLMSpeechPatcher,
122+
LiteLLMAspeechPatcher,
107123
LiteLLMTranscriptionPatcher,
108124
LiteLLMATranscriptionPatcher,
109125
)
@@ -122,8 +138,8 @@ def wrap_litellm(litellm: Any) -> Any:
122138
that exposes the same top-level callables such as ``completion``,
123139
``acompletion``, ``responses``, ``aresponses``, ``image_generation``,
124140
``aimage_generation``, ``embedding``, ``aembedding``, ``moderation``,
125-
``transcription``, and ``atranscription``). Each patcher is applied
126-
idempotently — calling
141+
``speech``, ``aspeech``, ``transcription``, and ``atranscription``).
142+
Each patcher is applied idempotently — calling
127143
``wrap_litellm`` twice on the same object is safe.
128144
129145
Args:

py/src/braintrust/integrations/litellm/test_litellm.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@
1818
TEST_AUDIO_FILE = os.path.join(os.path.dirname(__file__), "..", "..", "fixtures", "test_audio.wav")
1919

2020

21+
def _assert_speech_output_attachment(span) -> None:
    """Assert that *span* logged generated speech as an attachment-backed output.

    Checks the ``output`` entry of the span: it must be tagged ``"audio"``,
    report a positive byte size, and carry an ``Attachment`` under
    ``output["file"]["file_data"]`` whose reference has an ``audio/*`` content
    type and a filename starting with ``generated_speech``.
    """
    output = span["output"]
    assert output["type"] == "audio"
    assert output["audio_size_bytes"] > 0

    attachment = output["file"]["file_data"]
    assert isinstance(attachment, Attachment)
    assert attachment.reference["content_type"].startswith("audio/")
    assert attachment.reference["filename"].startswith("generated_speech")
28+
29+
2130
@pytest.fixture(autouse=True)
2231
def _patch():
2332
patch_litellm()
@@ -402,6 +411,57 @@ async def test_litellm_atranscription(memory_logger):
402411
assert span["output"] == "you"
403412

404413

414+
@pytest.mark.vcr
def test_litellm_speech(memory_logger):
    """``litellm.speech`` emits exactly one span with an audio attachment output."""
    # The logger must start empty so the span count below is meaningful.
    assert not memory_logger.pop()

    response = litellm.speech(
        model="tts-1",
        voice="alloy",
        input="Hello, this is a test.",
        response_format="mp3",
    )

    # The wrapped call must still hand the caller a usable response.
    assert response
    assert response.content

    spans = memory_logger.pop()
    assert len(spans) == 1
    span = spans[0]
    assert span["metadata"]["model"] == "tts-1"
    assert span["metadata"]["voice"] == "alloy"
    assert span["metadata"]["response_format"] == "mp3"
    assert span["metadata"]["provider"] == "litellm"
    assert span["input"] == "Hello, this is a test."
    _assert_speech_output_attachment(span)
437+
438+
439+
@pytest.mark.vcr
@pytest.mark.asyncio
async def test_litellm_aspeech(memory_logger):
    """``litellm.aspeech`` emits exactly one span with an audio attachment output."""
    # The logger must start empty so the span count below is meaningful.
    assert not memory_logger.pop()

    response = await litellm.aspeech(
        model="tts-1",
        voice="alloy",
        input="Hello, this is a test.",
        response_format="mp3",
    )

    # The wrapped call must still hand the caller a usable response.
    assert response
    assert response.content

    spans = memory_logger.pop()
    assert len(spans) == 1
    span = spans[0]
    assert span["metadata"]["model"] == "tts-1"
    assert span["metadata"]["voice"] == "alloy"
    assert span["metadata"]["response_format"] == "mp3"
    assert span["metadata"]["provider"] == "litellm"
    assert span["input"] == "Hello, this is a test."
    _assert_speech_output_attachment(span)
463+
464+
405465
@pytest.mark.vcr
406466
@pytest.mark.asyncio
407467
async def test_litellm_acompletion_with_system_prompt(memory_logger):

py/src/braintrust/integrations/litellm/tracing.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any
77

88
from braintrust.integrations.utils import (
9+
_extract_audio_output,
910
_materialize_attachment,
1011
_parse_openai_usage_metrics,
1112
_prettify_response_params,
@@ -441,6 +442,46 @@ def _moderation_wrapper(wrapped, instance, args, kwargs):
441442
return moderation_response
442443

443444

445+
def _speech_wrapper(wrapped, instance, args, kwargs):
    """wrapt wrapper for litellm.speech.

    Opens a ``Speech`` span of type ``SpanTypeAttribute.LLM``, merged with the
    payload built from *kwargs* (the ``input`` keyword is used as the span
    input), calls the wrapped function, then logs timing metrics and the audio
    output produced by ``_extract_audio_output``.

    Args:
        wrapped: The original ``litellm.speech`` callable.
        instance: Bound instance supplied by wrapt; unused here.
        args: Positional arguments forwarded unchanged to *wrapped*.
        kwargs: Keyword arguments forwarded unchanged to *wrapped*; also the
            source of the span payload and of ``response_format``.

    Returns:
        The value returned by *wrapped*, unchanged.
    """
    updated_span_payload = _update_span_payload_from_params(kwargs, input_key="input")

    with start_span(
        **merge_dicts(dict(name="Speech", span_attributes={"type": SpanTypeAttribute.LLM}), updated_span_payload)
    ) as span:
        start = time.time()
        speech_response = wrapped(*args, **kwargs)
        span.log(
            # Keyword arguments evaluate left to right, so the end timestamp is
            # taken before the audio payload is extracted.
            metrics=_timing_metrics(start, time.time()),
            output=_extract_audio_output(
                speech_response,
                response_format=kwargs.get("response_format"),
                prefix="generated_speech",
            ),
        )
        return speech_response
463+
464+
465+
async def _aspeech_wrapper_async(wrapped, instance, args, kwargs):
    """wrapt wrapper for litellm.aspeech.

    Async counterpart of the ``litellm.speech`` wrapper: opens a ``Speech``
    span of type ``SpanTypeAttribute.LLM``, merged with the payload built from
    *kwargs* (the ``input`` keyword is used as the span input), awaits the
    wrapped coroutine, then logs timing metrics and the audio output produced
    by ``_extract_audio_output``.

    Args:
        wrapped: The original ``litellm.aspeech`` coroutine function.
        instance: Bound instance supplied by wrapt; unused here.
        args: Positional arguments forwarded unchanged to *wrapped*.
        kwargs: Keyword arguments forwarded unchanged to *wrapped*; also the
            source of the span payload and of ``response_format``.

    Returns:
        The awaited result of *wrapped*, unchanged.
    """
    updated_span_payload = _update_span_payload_from_params(kwargs, input_key="input")

    with start_span(
        **merge_dicts(dict(name="Speech", span_attributes={"type": SpanTypeAttribute.LLM}), updated_span_payload)
    ) as span:
        start = time.time()
        speech_response = await wrapped(*args, **kwargs)
        span.log(
            # Keyword arguments evaluate left to right, so the end timestamp is
            # taken before the audio payload is extracted.
            metrics=_timing_metrics(start, time.time()),
            output=_extract_audio_output(
                speech_response,
                response_format=kwargs.get("response_format"),
                prefix="generated_speech",
            ),
        )
        return speech_response
483+
484+
444485
def _transcription_wrapper(wrapped, instance, args, kwargs):
445486
"""wrapt wrapper for litellm.transcription."""
446487
updated_span_payload = _update_audio_span_payload_from_params(kwargs)

py/src/braintrust/integrations/openai/test_openai.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1933,6 +1933,15 @@ def _assert_audio_input_attachment(span) -> None:
19331933
assert span["input"]["file"].reference["content_type"].startswith("audio/")
19341934

19351935

1936+
def _assert_audio_output_attachment(span) -> None:
    """Assert that *span* logged generated speech as an attachment-backed output.

    Checks the ``output`` entry of the span: it must be tagged ``"audio"``,
    report a positive byte size, and carry an ``Attachment`` under
    ``output["file"]["file_data"]`` whose reference has an ``audio/*`` content
    type and a filename starting with ``generated_speech``.
    """
    output = span["output"]
    assert output["type"] == "audio"
    assert output["audio_size_bytes"] > 0

    attachment = output["file"]["file_data"]
    assert isinstance(attachment, Attachment)
    assert attachment.reference["content_type"].startswith("audio/")
    assert attachment.reference["filename"].startswith("generated_speech")
1943+
1944+
19361945
def _write_test_png(path: str, *, width: int = 64, height: int = 64) -> None:
19371946
"""Write a simple opaque red RGBA PNG without external dependencies."""
19381947

@@ -2067,7 +2076,7 @@ def test_openai_audio_speech(memory_logger):
20672076
assert span["metadata"]["voice"] == "alloy"
20682077
assert span["metadata"]["provider"] == "openai"
20692078
assert span["input"] == "Hello, this is a test."
2070-
assert span["output"] == {"type": "audio"}
2079+
_assert_audio_output_attachment(span)
20712080

20722081

20732082
@pytest.mark.vcr
@@ -2175,7 +2184,7 @@ async def test_openai_audio_speech_async(memory_logger):
21752184
assert span["metadata"]["voice"] == "alloy"
21762185
assert span["metadata"]["provider"] == "openai"
21772186
assert span["input"] == "Hello, this is a test."
2178-
assert span["output"] == {"type": "audio"}
2187+
_assert_audio_output_attachment(span)
21792188

21802189

21812190
@pytest.mark.asyncio

py/src/braintrust/integrations/openai/tracing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Any
99

1010
from braintrust.integrations.utils import (
11+
_extract_audio_output,
1112
_materialize_attachment,
1213
_parse_openai_usage_metrics,
1314
_prettify_response_params,
@@ -1276,7 +1277,7 @@ def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..
12761277
super().__init__(create_fn, acreate_fn, "Speech")
12771278

12781279
def process_output(self, response: Any, span: Span):
    """Log the speech response on *span* as an audio output.

    The output is built by ``_extract_audio_output`` with the
    ``generated_speech`` filename prefix, replacing the earlier bare
    ``{"type": "audio"}`` marker.
    """
    span.log(output=_extract_audio_output(response, prefix="generated_speech"))
12801281

12811282

12821283
class _AudioFileWrapper(BaseWrapper):

py/src/braintrust/integrations/test_utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from braintrust.integrations.utils import (
66
_attachment_filename_for_mime_type,
77
_camel_to_snake,
8+
_extract_audio_output,
9+
_infer_audio_mime_type,
810
_is_supported_metric_value,
911
_log_and_end_span,
1012
_log_error_and_end_span,
@@ -301,6 +303,42 @@ def test_materialize_attachment_returns_none_for_non_data_url_strings():
301303
assert _materialize_attachment("https://example.com/image.png") is None
302304

303305

306+
def test_infer_audio_mime_type_prefers_response_headers():
    """The Content-Type header wins over ``response_format`` and loses its parameters."""
    raw_response = unittest.mock.Mock(headers={"content-type": "audio/mpeg; charset=binary"})
    response = unittest.mock.Mock(response=raw_response)

    # ``response_format`` deliberately disagrees with the header.
    assert _infer_audio_mime_type(response, response_format="wav") == "audio/mpeg"
311+
312+
313+
def test_extract_audio_output_materializes_attachment_from_binary_response():
    """A response exposing ``content`` bytes yields an attachment-backed audio output."""
    raw_response = unittest.mock.Mock(headers={"content-type": "audio/mpeg"})
    response = unittest.mock.Mock(content=b"audio-bytes", response=raw_response)

    output = _extract_audio_output(response, prefix="generated_speech")

    assert output["type"] == "audio"
    assert output["mime_type"] == "audio/mpeg"
    assert output["audio_size_bytes"] == len(b"audio-bytes")
    attachment = output["file"]["file_data"]
    assert isinstance(attachment, Attachment)
    assert attachment.reference["content_type"] == "audio/mpeg"
    assert attachment.reference["filename"] == "generated_speech.mp3"
326+
327+
328+
def test_extract_audio_output_supports_mapping_with_raw_response_only():
    """A mapping whose only payload is a raw ``response`` still yields an attachment."""
    raw_response = unittest.mock.Mock(headers={"content-type": "audio/wav"}, content=b"wave")

    # The mapping has no ``content`` of its own; bytes come from the raw response.
    output = _extract_audio_output({"response": raw_response}, prefix="generated_speech")

    assert output["type"] == "audio"
    assert output["mime_type"] == "audio/wav"
    assert output["audio_size_bytes"] == len(b"wave")
    attachment = output["file"]["file_data"]
    assert isinstance(attachment, Attachment)
    assert attachment.reference["content_type"] == "audio/wav"
    assert attachment.reference["filename"] == "generated_speech.wav"
340+
341+
304342
def test_serialize_response_format_with_pydantic_basemodel_subclass():
305343
pydantic = pytest.importorskip("pydantic")
306344

py/src/braintrust/integrations/utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,72 @@ def _materialize_attachment(
343343
return None
344344

345345

346+
# Short ``response_format`` names mapped to the MIME type reported for the
# generated audio. Formats missing here fall back to ``audio/<format>``.
_AUDIO_FORMAT_TO_MIME_TYPE = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "opus": "audio/opus",
    "aac": "audio/aac",
    "flac": "audio/flac",
    "pcm": "audio/pcm",
}


def _infer_audio_mime_type(response: Any, response_format: Any = None) -> str:
    """Return the best-guess MIME type for a generated-audio response.

    Resolution order:

    1. The ``Content-Type`` header of the raw HTTP response, looked up at
       ``response.response`` (or ``response["response"]`` for mappings).
       Header parameters such as ``; charset=...`` are dropped and the media
       type is lower-cased, since media types are case-insensitive.
    2. *response_format*: a known short name is mapped through
       ``_AUDIO_FORMAT_TO_MIME_TYPE``; a value that already contains ``/`` is
       used as-is; anything else becomes ``audio/<format>``.
    3. ``"application/octet-stream"`` when nothing else is available.

    Args:
        response: Provider response object or mapping.
        response_format: Requested audio format; ignored unless it is a
            non-blank string.

    Returns:
        A non-empty, lower-case MIME type string.
    """
    raw_response = getattr(response, "response", None)
    if raw_response is None and isinstance(response, Mapping):
        raw_response = response.get("response")

    # Tolerate header containers without ``.get`` instead of raising from tracing code.
    header_lookup = getattr(getattr(raw_response, "headers", None), "get", None)
    if callable(header_lookup):
        # Plain dict headers are case-sensitive, so try the canonical spelling too.
        for header_name in ("content-type", "Content-Type"):
            content_type = header_lookup(header_name)
            if not isinstance(content_type, str):
                continue
            media_type = content_type.split(";", 1)[0].strip().lower()
            # A header holding only parameters carries no media type; keep looking.
            if media_type:
                return media_type

    if isinstance(response_format, str):
        normalized = response_format.strip().lower()
        if normalized:
            return _AUDIO_FORMAT_TO_MIME_TYPE.get(
                normalized,
                normalized if "/" in normalized else f"audio/{normalized}",
            )

    return "application/octet-stream"
375+
376+
377+
def _extract_audio_output(
    response: Any,
    *,
    response_format: Any = None,
    prefix: str = "generated_audio",
) -> dict[str, Any]:
    """Build the span ``output`` payload for a generated-audio response.

    Audio bytes are read from ``response.content``. When that is not
    ``bytes``/``bytearray``, the raw response at ``response.response`` (or
    ``response["response"]`` for mappings) is tried, using the same lookup as
    ``_infer_audio_mime_type`` so both helpers agree on where the payload is.

    Args:
        response: Provider response object or mapping.
        response_format: Requested audio format, forwarded to
            ``_infer_audio_mime_type`` as a fallback hint.
        prefix: Prefix forwarded to ``_materialize_attachment``.

    Returns:
        ``{"type": "audio"}`` when no audio bytes are found. Otherwise a dict
        with ``type``, ``mime_type`` and ``audio_size_bytes``, extended with
        the attachment's ``multimodal_part_payload`` when
        ``_materialize_attachment`` returns an attachment.
    """
    audio_bytes = getattr(response, "content", None)
    if not isinstance(audio_bytes, (bytes, bytearray)):
        raw_response = getattr(response, "response", None)
        if raw_response is None and isinstance(response, Mapping):
            raw_response = response.get("response")
        audio_bytes = getattr(raw_response, "content", None)

    if not isinstance(audio_bytes, (bytes, bytearray)):
        # Nothing to attach; keep the minimal marker.
        return {"type": "audio"}

    mime_type = _infer_audio_mime_type(response, response_format)
    resolved_attachment = _materialize_attachment(
        audio_bytes,
        mime_type=mime_type,
        prefix=prefix,
    )
    if resolved_attachment is None:
        # Still report what is known about the audio without an attachment.
        return {
            "type": "audio",
            "mime_type": mime_type,
            "audio_size_bytes": len(audio_bytes),
        }

    return {
        "type": "audio",
        "mime_type": resolved_attachment.mime_type,
        "audio_size_bytes": len(audio_bytes),
        **resolved_attachment.multimodal_part_payload,
    }
410+
411+
346412
def _is_not_given(value: object) -> bool:
347413
"""Return ``True`` when *value* is a provider ``NOT_GIVEN`` sentinel.
348414

0 commit comments

Comments
 (0)