diff --git a/py/src/braintrust/oai.py b/py/src/braintrust/oai.py index df848f46..4f61fa44 100644 --- a/py/src/braintrust/oai.py +++ b/py/src/braintrust/oai.py @@ -1,5 +1,6 @@ import abc import base64 +import inspect import re import time from collections.abc import Callable @@ -62,6 +63,52 @@ def __repr__(self) -> str: return repr(self._response) +class ResponseWrapperProxy: + """Proxy that preserves sync context manager behavior for wrapped responses.""" + + def __init__(self, response: Any): + self._response = response + + def __enter__(self): + if hasattr(self._response, "__enter__"): + return self._response.__enter__() + return self._response + + def __exit__(self, exc_type, exc_val, exc_tb): + if hasattr(self._response, "__exit__"): + return self._response.__exit__(exc_type, exc_val, exc_tb) + return None + + def __iter__(self): + if hasattr(self._response, "__iter__"): + return self._response.__iter__() + raise TypeError("Response object is not iterable") + + def __next__(self): + if hasattr(self._response, "__next__"): + return next(self._response) + raise StopIteration + + def __getattr__(self, name: str) -> Any: + return getattr(self._response, name) + + @property + def __class__(self): # type: ignore + return self._response.__class__ + + def __str__(self) -> str: + return str(self._response) + + def __repr__(self) -> str: + return repr(self._response) + + +async def _maybe_await(value: Any) -> Any: + if inspect.isawaitable(value): + return await value + return value + + def log_headers(response: Any, span: Span): cached_value = response.headers.get(X_CACHED_HEADER) or response.headers.get(X_LEGACY_CACHED_HEADER) @@ -143,13 +190,24 @@ def _process_attachments_in_input(input_data: Any) -> Any: class ChatCompletionWrapper: - def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..., Any] | None): + def __init__( + self, + create_fn: Callable[..., Any] | None, + acreate_fn: Callable[..., Any] | None, + return_raw_response: bool = False, + ): self.create_fn = create_fn self.acreate_fn = acreate_fn + self.return_raw_response = return_raw_response def create(self, *args: Any, **kwargs: Any) -> Any: params = self._parse_params(kwargs) stream = kwargs.get("stream", False) + if stream and self.return_raw_response: + create_response = self.create_fn(*args, **kwargs) + if hasattr(create_response, "parse"): + self._patch_raw_stream_parse(create_response, params) + return create_response span = start_span( **merge_dicts(dict(name="Chat Completion", span_attributes={"type": SpanTypeAttribute.LLM}), params) @@ -195,7 +253,7 @@ def gen(): metrics=metrics, output=log_response["choices"], ) - return raw_response + return create_response if self.return_raw_response else raw_response finally: if should_end: span.end() @@ -203,6 +261,11 @@ def gen(): async def acreate(self, *args: Any, **kwargs: Any) -> Any: params = self._parse_params(kwargs) stream = kwargs.get("stream", False) + if stream and self.return_raw_response: + create_response = await self.acreate_fn(*args, **kwargs) + if hasattr(create_response, "parse"): + self._patch_async_raw_stream_parse(create_response, params) + return create_response span = start_span( **merge_dicts(dict(name="Chat Completion", span_attributes={"type": SpanTypeAttribute.LLM}), params) @@ -214,7 +277,7 @@ async def acreate(self, *args: Any, **kwargs: Any) -> Any: create_response = await self.acreate_fn(*args, **kwargs) if hasattr(create_response, "parse"): - raw_response = create_response.parse() + raw_response = await _maybe_await(create_response.parse()) log_headers(create_response, span) else: raw_response = create_response @@ -251,11 +314,89 @@ async def gen(): metrics=metrics, output=log_response["choices"], ) - return raw_response + return create_response if self.return_raw_response else raw_response finally: if should_end: span.end() + @classmethod + def _patch_raw_stream_parse(cls, create_response: Any, params: dict[str, Any]) -> None: + original_parse = create_response.parse + wrapped_stream = None + + def parse(*args: Any, **kwargs: Any) -> Any: + nonlocal wrapped_stream + if wrapped_stream is not None and not args and not kwargs: + return wrapped_stream + + raw_stream = original_parse(*args, **kwargs) + span = start_span( + **merge_dicts(dict(name="Chat Completion", span_attributes={"type": SpanTypeAttribute.LLM}), params) + ) + start = time.time() + log_headers(create_response, span) + + def gen(): + try: + first = True + all_results = [] + for item in raw_stream: + if first: + span.log(metrics={"time_to_first_token": time.time() - start}) + first = False + all_results.append(_try_to_dict(item)) + yield item + + span.log(**cls._postprocess_streaming_results(all_results)) + finally: + span.end() + + wrapped = ResponseWrapperProxy(gen()) + if not args and not kwargs: + wrapped_stream = wrapped + return wrapped + + create_response.parse = parse + + @classmethod + def _patch_async_raw_stream_parse(cls, create_response: Any, params: dict[str, Any]) -> None: + original_parse = create_response.parse + wrapped_stream = None + + async def parse(*args: Any, **kwargs: Any) -> Any: + nonlocal wrapped_stream + if wrapped_stream is not None and not args and not kwargs: + return wrapped_stream + + raw_stream = await _maybe_await(original_parse(*args, **kwargs)) + span = start_span( + **merge_dicts(dict(name="Chat Completion", span_attributes={"type": SpanTypeAttribute.LLM}), params) + ) + start = time.time() + log_headers(create_response, span) + + async def gen(): + try: + first = True + all_results = [] + async for item in raw_stream: + if first: + span.log(metrics={"time_to_first_token": time.time() - start}) + first = False + all_results.append(_try_to_dict(item)) + yield item + + span.log(**cls._postprocess_streaming_results(all_results)) + finally: + span.end() + + wrapped = AsyncResponseWrapper(gen()) + if not args and not kwargs: + wrapped_stream = wrapped + return wrapped + + create_response.parse = parse + @classmethod def _parse_params(cls, params: dict[str, Any]) -> dict[str, Any]: # First, destructively remove span_info @@ -350,10 +491,17 @@ def _postprocess_streaming_results(cls, all_results: list[dict[str, Any]]) -> di class ResponseWrapper: - def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..., Any] | None, name: str = "openai.responses.create"): + def __init__( + self, + create_fn: Callable[..., Any] | None, + acreate_fn: Callable[..., Any] | None, + name: str = "openai.responses.create", + return_raw_response: bool = False, + ): self.create_fn = create_fn self.acreate_fn = acreate_fn self.name = name + self.return_raw_response = return_raw_response def create(self, *args: Any, **kwargs: Any) -> Any: params = self._parse_params(kwargs) @@ -401,7 +549,7 @@ def gen(): event_data["metrics"] = {} event_data["metrics"]["time_to_first_token"] = time.time() - start span.log(**event_data) - return raw_response + return create_response if self.return_raw_response else raw_response finally: if should_end: span.end() @@ -419,7 +567,7 @@ async def acreate(self, *args: Any, **kwargs: Any) -> Any: start = time.time() create_response = await self.acreate_fn(*args, **kwargs) if hasattr(create_response, "parse"): - raw_response = create_response.parse() + raw_response = await _maybe_await(create_response.parse()) log_headers(create_response, span) else: raw_response = create_response @@ -453,7 +601,7 @@ async def gen(): event_data["metrics"] = {} event_data["metrics"]["time_to_first_token"] = time.time() - start span.log(**event_data) - return raw_response + return create_response if self.return_raw_response else raw_response finally: if should_end: span.end() @@ -582,10 +730,17 @@ def _postprocess_streaming_results(cls, all_results: list[Any]) -> dict[str, Any class BaseWrapper(abc.ABC): - def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..., Any] | None, name: str): + def __init__( + self, + create_fn: Callable[..., Any] | None, + acreate_fn: Callable[..., Any] | None, + name: str, + return_raw_response: bool = False, + ): self._create_fn = create_fn self._acreate_fn = acreate_fn self._name = name + self._return_raw_response = return_raw_response @abc.abstractmethod def process_output(self, response: dict[str, Any], span: Span): @@ -607,7 +762,7 @@ def create(self, *args: Any, **kwargs: Any) -> Any: log_response = _try_to_dict(raw_response) self.process_output(log_response, span) - return raw_response + return create_response if self._return_raw_response else raw_response async def acreate(self, *args: Any, **kwargs: Any) -> Any: params = self._parse_params(kwargs) @@ -617,13 +772,13 @@ async def acreate(self, *args: Any, **kwargs: Any) -> Any: ) as span: create_response = await self._acreate_fn(*args, **kwargs) if hasattr(create_response, "parse"): - raw_response = create_response.parse() + raw_response = await _maybe_await(create_response.parse()) log_headers(create_response, span) else: raw_response = create_response log_response = _try_to_dict(raw_response) self.process_output(log_response, span) - return raw_response + return create_response if self._return_raw_response else raw_response @classmethod def _parse_params(cls, params: dict[str, Any]) -> dict[str, Any]: @@ -646,8 +801,13 @@ def _parse_params(cls, params: dict[str, Any]) -> dict[str, Any]: class EmbeddingWrapper(BaseWrapper): - def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..., Any] | None): - super().__init__(create_fn, acreate_fn, "Embedding") + def __init__( + self, + create_fn: Callable[..., Any] | None, + acreate_fn: Callable[..., Any] | None, + return_raw_response: bool = False, + ): + super().__init__(create_fn, acreate_fn, "Embedding", return_raw_response=return_raw_response) def process_output(self, response: dict[str, Any], span: Span): usage = response.get("usage") @@ -661,8 +821,13 @@ def process_output(self, response: dict[str, Any], span: Span): class ModerationWrapper(BaseWrapper): - def __init__(self, create_fn: Callable[..., Any] | None, acreate_fn: Callable[..., Any] | None): - super().__init__(create_fn, acreate_fn, "Moderation") + def __init__( + self, + create_fn: Callable[..., Any] | None, + acreate_fn: Callable[..., Any] | None, + return_raw_response: bool = False, + ): + super().__init__(create_fn, acreate_fn, "Moderation", return_raw_response=return_raw_response) def process_output(self, response: Any, span: Span): span.log( @@ -719,6 +884,7 @@ class CompletionsV1Wrapper(NamedWrapper): def __init__(self, completions: Any): self.__completions = completions super().__init__(completions) + self.with_raw_response = RawCompletionsV1Wrapper(completions.with_raw_response) def create(self, *args: Any, **kwargs: Any) -> Any: return ChatCompletionWrapper(self.__completions.with_raw_response.create, None).create(*args, **kwargs) @@ -728,6 +894,7 @@ class EmbeddingV1Wrapper(NamedWrapper): def __init__(self, embedding: Any): self.__embedding = embedding super().__init__(embedding) + self.with_raw_response = RawEmbeddingV1Wrapper(embedding.with_raw_response) def create(self, *args: Any, **kwargs: Any) -> Any: return EmbeddingWrapper(self.__embedding.with_raw_response.create, None).create(*args, **kwargs) @@ -737,6 +904,7 @@ class ModerationV1Wrapper(NamedWrapper): def __init__(self, moderation: Any): self.__moderation = moderation super().__init__(moderation) + self.with_raw_response = RawModerationV1Wrapper(moderation.with_raw_response) def create(self, *args: Any, **kwargs: Any) -> Any: return ModerationWrapper(self.__moderation.with_raw_response.create, None).create(*args, **kwargs) @@ -746,6 +914,7 @@ class AsyncCompletionsV1Wrapper(NamedWrapper): def __init__(self, completions: Any): self.__completions = completions super().__init__(completions) + self.with_raw_response = AsyncRawCompletionsV1Wrapper(completions.with_raw_response) async def create(self, *args: Any, **kwargs: Any) -> Any: response = await ChatCompletionWrapper(None, self.__completions.with_raw_response.create).acreate( @@ -758,6 +927,7 @@ class AsyncEmbeddingV1Wrapper(NamedWrapper): def __init__(self, embedding: Any): self.__embedding = embedding super().__init__(embedding) + self.with_raw_response = AsyncRawEmbeddingV1Wrapper(embedding.with_raw_response) async def create(self, *args: Any, **kwargs: Any) -> Any: response = await EmbeddingWrapper(None, self.__embedding.with_raw_response.create).acreate(*args, **kwargs) @@ -768,6 +938,7 @@ class AsyncModerationV1Wrapper(NamedWrapper): def __init__(self, moderation: Any): self.__moderation = moderation super().__init__(moderation) + self.with_raw_response = AsyncRawModerationV1Wrapper(moderation.with_raw_response) async def create(self, *args: Any, **kwargs: Any) -> Any: response = await ModerationWrapper(None, self.__moderation.with_raw_response.create).acreate(*args, **kwargs) @@ -790,6 +961,7 @@ class ResponsesV1Wrapper(NamedWrapper): def __init__(self, responses: Any): self.__responses = responses super().__init__(responses) + self.with_raw_response = RawResponsesV1Wrapper(responses.with_raw_response) def create(self, *args: Any, **kwargs: Any) -> Any: return ResponseWrapper(self.__responses.with_raw_response.create, None).create(*args, **kwargs) @@ -802,6 +974,7 @@ class AsyncResponsesV1Wrapper(NamedWrapper): def __init__(self, responses: Any): self.__responses = responses super().__init__(responses) + self.with_raw_response = AsyncRawResponsesV1Wrapper(responses.with_raw_response) async def create(self, *args: Any, **kwargs: Any) -> Any: response = await ResponseWrapper(None, self.__responses.with_raw_response.create).acreate(*args, **kwargs) @@ -848,6 +1021,96 @@ def __init__(self, beta: Any): self.chat = BetaChatV1Wrapper(beta.chat) +class RawCompletionsV1Wrapper(NamedWrapper): + def __init__(self, completions: Any): + self.__completions = completions + super().__init__(completions) + + def create(self, *args: Any, **kwargs: Any) -> Any: + return ChatCompletionWrapper(self.__completions.create, None, return_raw_response=True).create(*args, **kwargs) + + +class RawEmbeddingV1Wrapper(NamedWrapper): + def __init__(self, embedding: Any): + self.__embedding = embedding + super().__init__(embedding) + + def create(self, *args: Any, **kwargs: Any) -> Any: + return EmbeddingWrapper(self.__embedding.create, None, return_raw_response=True).create(*args, **kwargs) + + +class RawModerationV1Wrapper(NamedWrapper): + def __init__(self, moderation: Any): + self.__moderation = moderation + super().__init__(moderation) + + def create(self, *args: Any, **kwargs: Any) -> Any: + return ModerationWrapper(self.__moderation.create, None, return_raw_response=True).create(*args, **kwargs) + + +class AsyncRawCompletionsV1Wrapper(NamedWrapper): + def __init__(self, completions: Any): + self.__completions = completions + super().__init__(completions) + + async def create(self, *args: Any, **kwargs: Any) -> Any: + return await ChatCompletionWrapper(None, self.__completions.create, return_raw_response=True).acreate( + *args, **kwargs + ) + + +class AsyncRawEmbeddingV1Wrapper(NamedWrapper): + def __init__(self, embedding: Any): + self.__embedding = embedding + super().__init__(embedding) + + async def create(self, *args: Any, **kwargs: Any) -> Any: + return await EmbeddingWrapper(None, self.__embedding.create, return_raw_response=True).acreate(*args, **kwargs) + + +class AsyncRawModerationV1Wrapper(NamedWrapper): + def __init__(self, moderation: Any): + self.__moderation = moderation + super().__init__(moderation) + + async def create(self, *args: Any, **kwargs: Any) -> Any: + return await ModerationWrapper(None, self.__moderation.create, return_raw_response=True).acreate(*args, **kwargs) + + +class RawResponsesV1Wrapper(NamedWrapper): + def __init__(self, responses: Any): + self.__responses = responses + super().__init__(responses) + + def create(self, *args: Any, **kwargs: Any) -> Any: + return ResponseWrapper(self.__responses.create, None, return_raw_response=True).create(*args, **kwargs) + + def parse(self, *args: Any, **kwargs: Any) -> Any: + return ResponseWrapper( + self.__responses.parse, + None, + "openai.responses.parse", + return_raw_response=True, + ).create(*args, **kwargs) + + +class AsyncRawResponsesV1Wrapper(NamedWrapper): + def __init__(self, responses: Any): + self.__responses = responses + super().__init__(responses) + + async def create(self, *args: Any, **kwargs: Any) -> Any: + return await ResponseWrapper(None, self.__responses.create, return_raw_response=True).acreate(*args, **kwargs) + + async def parse(self, *args: Any, **kwargs: Any) -> Any: + return await ResponseWrapper( + None, + self.__responses.parse, + "openai.responses.parse", + return_raw_response=True, + ).acreate(*args, **kwargs) + + # This wraps 1.*.* versions of the openai module, eg https://github.com/openai/openai-python/tree/v1.1.0 class OpenAIV1Wrapper(NamedWrapper): def __init__(self, openai: Any): diff --git a/py/src/braintrust/wrappers/cassettes/test_openai_responses_with_raw_response.yaml b/py/src/braintrust/wrappers/cassettes/test_openai_responses_with_raw_response.yaml new file mode 100644 index 00000000..4a843c06 --- /dev/null +++ b/py/src/braintrust/wrappers/cassettes/test_openai_responses_with_raw_response.yaml @@ -0,0 +1,594 @@ +interactions: +- request: + body: '{"input":"What''s 12 + 12?","model":"gpt-4o-mini","instructions":"Just + the number please"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '89' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.82.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.82.0 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.13.3 + method: POST + uri: https://api.openai.com/v1/responses + response: + body: + string: "{\n \"id\": \"resp_6837cfde1ae481988e431cc55453c7aa0c3bde2282983b10\",\n + \ \"object\": \"response\",\n \"created_at\": 1748488158,\n \"status\": + \"completed\",\n \"background\": false,\n \"error\": null,\n \"incomplete_details\": + null,\n \"instructions\": \"Just the number please\",\n \"max_output_tokens\": + null,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n \"output\": [\n {\n + \ \"id\": \"msg_6837cfde976c8198bd470dd12d208a270c3bde2282983b10\",\n + \ \"type\": \"message\",\n \"status\": \"completed\",\n \"content\": + [\n {\n \"type\": \"output_text\",\n \"annotations\": + [],\n \"text\": \"24\"\n }\n ],\n \"role\": \"assistant\"\n + \ }\n ],\n \"parallel_tool_calls\": true,\n \"previous_response_id\": + null,\n \"reasoning\": {\n \"effort\": null,\n \"summary\": null\n + \ },\n \"service_tier\": \"default\",\n \"store\": true,\n \"temperature\": + 1.0,\n \"text\": {\n \"format\": {\n \"type\": \"text\"\n }\n + \ },\n \"tool_choice\": \"auto\",\n \"tools\": [],\n \"top_p\": 1.0,\n + \ \"truncation\": \"disabled\",\n \"usage\": {\n \"input_tokens\": 22,\n + \ \"input_tokens_details\": {\n \"cached_tokens\": 0\n },\n \"output_tokens\": + 2,\n \"output_tokens_details\": {\n \"reasoning_tokens\": 0\n },\n + \ \"total_tokens\": 24\n },\n \"user\": null,\n \"metadata\": {}\n}" + headers: + CF-RAY: + - 9472cacbad828c83-EWR + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Thu, 29 May 2025 03:09:18 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=mI1hauJmw2.LoIBdsHg_QC7_NHqZdnOMzWVIW8FsTRQ-1748488158-1.0.1.1-PfdMmNepTIKHAoMhhN4llNkTT13MFq_85COUdhxqy3ZRC49jL54D1VIubFkA9P6h7l9xUlSX8At.zsM0DfeQZ2ty5TkOlp7EiC.O7LqtTUI; + path=/; expires=Thu, 29-May-25 03:39:18 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=tPn8I_CbxQE_7n7eR0tfI8z79gYpGXIT.dYXd5DQUl4-1748488158744-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + content-length: + - '1224' + openai-organization: + - braintrust-data + openai-processing-ms: + - '611' + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999959' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_ed1e4b8509568a7c5b182e720020214a + status: + code: 200 + message: OK +- request: + body: '{"input":"What''s 12 + 12?","model":"gpt-4o-mini","instructions":"Just + the number please"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '89' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.82.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.82.0 + x-stainless-raw-response: + - 'true' + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.13.3 + method: POST + uri: https://api.openai.com/v1/responses + response: + body: + string: "{\n \"id\": \"resp_6837cfdf19e881988f887435d986060f01b880250e793a38\",\n + \ \"object\": \"response\",\n \"created_at\": 1748488159,\n \"status\": + \"completed\",\n \"background\": false,\n \"error\": null,\n \"incomplete_details\": + null,\n \"instructions\": \"Just the number please\",\n \"max_output_tokens\": + null,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n \"output\": [\n {\n + \ \"id\": \"msg_6837cfdf90688198bdf4259a7a85b4e101b880250e793a38\",\n + \ \"type\": \"message\",\n \"status\": \"completed\",\n \"content\": + [\n {\n \"type\": \"output_text\",\n \"annotations\": + [],\n \"text\": \"24\"\n }\n ],\n \"role\": \"assistant\"\n + \ }\n ],\n \"parallel_tool_calls\": true,\n \"previous_response_id\": + null,\n \"reasoning\": {\n \"effort\": null,\n \"summary\": null\n + \ },\n \"service_tier\": \"default\",\n \"store\": true,\n \"temperature\": + 1.0,\n \"text\": {\n \"format\": {\n \"type\": \"text\"\n }\n + \ },\n \"tool_choice\": \"auto\",\n \"tools\": [],\n \"top_p\": 1.0,\n + \ \"truncation\": \"disabled\",\n \"usage\": {\n \"input_tokens\": 22,\n + \ \"input_tokens_details\": {\n \"cached_tokens\": 0\n },\n \"output_tokens\": + 2,\n \"output_tokens_details\": {\n \"reasoning_tokens\": 0\n },\n + \ \"total_tokens\": 24\n },\n \"user\": null,\n \"metadata\": {}\n}" + headers: + CF-RAY: + - 9472cad0c942159f-EWR + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Thu, 29 May 2025 03:09:19 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=QIXfljmZxZub9UwwuM6OT1N99na9ASw8TDtcrRd6Z8Q-1748488159-1.0.1.1-zbWSSmIJdFD_aTFzj0JP0WZWpbLM0wEDzYNyALtNsO07NwGG62DBhYk_IRdRHZ9Wf2ooX6MZUcYruGgEgAnGeXE7ply4Hwvl3PEANnp5GJk; + path=/; expires=Thu, 29-May-25 03:39:19 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=F7Q0gtFcXfGctocXEvQSjRs1S7gyxSThlxKpcdlXKG8-1748488159693-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + content-length: + - '1224' + openai-organization: + - braintrust-data + openai-processing-ms: + - '558' + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999959' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_9593682049a6bc80e64872aee04f3bdf + status: + code: 200 + message: OK +- request: + body: '{"input":"What''s 12 + 12?","model":"gpt-4o-mini","text":{"format":{"type":"json_schema","strict":true,"name":"NumberAnswer","schema":{"properties":{"value":{"title":"Value","type":"integer"},"reasoning":{"title":"Reasoning","type":"string"}},"required":["value","reasoning"],"title":"NumberAnswer","type":"object","additionalProperties":false}}}}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '346' + content-type: + - application/json + cookie: + - _cfuvid=tPn8I_CbxQE_7n7eR0tfI8z79gYpGXIT.dYXd5DQUl4-1748488158744-0.0.1.1-604800000 + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.70.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.70.0 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.11.10 + method: POST + uri: https://api.openai.com/v1/responses + response: + body: + string: "{\n \"id\": \"resp_688bbcd514f881999cfcc310629a9e1f03cea7458a5a3aec\",\n + \ \"object\": \"response\",\n \"created_at\": 1753988309,\n \"status\": + \"completed\",\n \"background\": false,\n \"error\": null,\n \"incomplete_details\": + null,\n \"instructions\": null,\n \"max_output_tokens\": null,\n \"max_tool_calls\": + null,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n \"output\": [\n {\n + \ \"id\": \"msg_688bbcd58880819997d3952964cda68f03cea7458a5a3aec\",\n + \ \"type\": \"message\",\n \"status\": \"completed\",\n \"content\": + [\n {\n \"type\": \"output_text\",\n \"annotations\": + [],\n \"logprobs\": [],\n \"text\": \"{\\\"value\\\":24,\\\"reasoning\\\":\\\"Adding + 12 and 12 gives you 24 because you combine the two values together.\\\"}\"\n + \ }\n ],\n \"role\": \"assistant\"\n }\n ],\n \"parallel_tool_calls\": + true,\n \"previous_response_id\": null,\n \"prompt_cache_key\": null,\n + \ \"reasoning\": {\n \"effort\": null,\n \"summary\": null\n },\n \"safety_identifier\": + null,\n \"service_tier\": \"default\",\n \"store\": true,\n \"temperature\": + 1.0,\n \"text\": {\n \"format\": {\n \"type\": \"json_schema\",\n + \ \"description\": null,\n \"name\": \"NumberAnswer\",\n \"schema\": + {\n \"properties\": {\n \"value\": {\n \"title\": + \"Value\",\n \"type\": \"integer\"\n },\n \"reasoning\": + {\n \"title\": \"Reasoning\",\n \"type\": \"string\"\n + \ }\n },\n \"required\": [\n \"value\",\n \"reasoning\"\n + \ ],\n \"title\": \"NumberAnswer\",\n \"type\": \"object\",\n + \ \"additionalProperties\": false\n },\n \"strict\": true\n + \ }\n },\n \"tool_choice\": \"auto\",\n \"tools\": [],\n \"top_logprobs\": + 0,\n \"top_p\": 1.0,\n \"truncation\": \"disabled\",\n \"usage\": {\n \"input_tokens\": + 59,\n \"input_tokens_details\": {\n \"cached_tokens\": 0\n },\n + \ \"output_tokens\": 28,\n \"output_tokens_details\": {\n \"reasoning_tokens\": + 0\n },\n \"total_tokens\": 87\n },\n \"user\": null,\n \"metadata\": + {}\n}" + headers: + CF-RAY: + - 967f53d37fb9ebeb-SJC + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Thu, 31 Jul 2025 18:58:30 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=MfH4U1GWQf0HgvfGCN2oeZz30rG272lA840H0ocITl4-1753988310-1.0.1.1-_2MfZykpsYaiBMLuP6hYF4d3UgjFvRm39iz6QLyl7KiQpHUQQy5kISxKt1Cz8a2u5kNTnEry7BCShyyz2tC1GV4R_AKK6yKQRPkdnSHFZEs; + path=/; expires=Thu, 31-Jul-25 19:28:30 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=wq6YLS4G9pN7FGqxeyEbqKUCae8GWtIXy1WxAM3P2oQ-1753988310124-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + content-length: + - '1953' + openai-organization: + - braintrust-data + openai-processing-ms: + - '1051' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-decorator-operation: + - tasksapi.openai.svc.cluster.local:8081/* + x-envoy-upstream-service-time: + - '1058' + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999922' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_08be9de3dcbf180a2e859fb2097486c5 + status: + code: 200 + message: OK +- request: + body: '{"input":"What''s 12 + 12?","model":"gpt-4o-mini","text":{"format":{"type":"json_schema","strict":true,"name":"NumberAnswer","schema":{"properties":{"value":{"title":"Value","type":"integer"},"reasoning":{"title":"Reasoning","type":"string"}},"required":["value","reasoning"],"title":"NumberAnswer","type":"object","additionalProperties":false}}}}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '346' + content-type: + - application/json + cookie: + - _cfuvid=F7Q0gtFcXfGctocXEvQSjRs1S7gyxSThlxKpcdlXKG8-1748488159693-0.0.1.1-604800000 + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.70.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.70.0 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.11.10 + method: POST + uri: https://api.openai.com/v1/responses + response: + body: + string: "{\n \"id\": \"resp_688bbf3e7b98819abf56493949e6e93601991c68d025e3ab\",\n + \ \"object\": \"response\",\n \"created_at\": 1753988926,\n \"status\": + \"completed\",\n \"background\": false,\n \"error\": null,\n \"incomplete_details\": + null,\n \"instructions\": null,\n \"max_output_tokens\": null,\n \"max_tool_calls\": + null,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n \"output\": [\n {\n + \ \"id\": \"msg_688bbf4025d4819a875a763f1c4d534101991c68d025e3ab\",\n + \ \"type\": \"message\",\n \"status\": \"completed\",\n \"content\": + [\n {\n \"type\": \"output_text\",\n \"annotations\": + [],\n \"logprobs\": [],\n \"text\": \"{\\\"value\\\":24,\\\"reasoning\\\":\\\"12 + plus 12 equals 24 because when you combine the two numbers, you are adding + their values together.\\\"}\"\n }\n ],\n \"role\": \"assistant\"\n + \ }\n ],\n \"parallel_tool_calls\": true,\n \"previous_response_id\": + null,\n \"prompt_cache_key\": null,\n \"reasoning\": {\n \"effort\": + null,\n \"summary\": null\n },\n \"safety_identifier\": null,\n \"service_tier\": + \"default\",\n \"store\": true,\n \"temperature\": 1.0,\n \"text\": {\n + \ \"format\": {\n \"type\": \"json_schema\",\n \"description\": + null,\n \"name\": \"NumberAnswer\",\n \"schema\": {\n \"properties\": + {\n \"value\": {\n \"title\": \"Value\",\n \"type\": + \"integer\"\n },\n \"reasoning\": {\n \"title\": + \"Reasoning\",\n \"type\": \"string\"\n }\n },\n + \ \"required\": [\n \"value\",\n \"reasoning\"\n ],\n + \ \"title\": \"NumberAnswer\",\n \"type\": \"object\",\n \"additionalProperties\": + false\n },\n \"strict\": true\n }\n },\n \"tool_choice\": \"auto\",\n + \ \"tools\": [],\n \"top_logprobs\": 0,\n \"top_p\": 1.0,\n \"truncation\": + \"disabled\",\n \"usage\": {\n \"input_tokens\": 59,\n \"input_tokens_details\": + {\n \"cached_tokens\": 0\n },\n \"output_tokens\": 32,\n \"output_tokens_details\": + {\n \"reasoning_tokens\": 0\n },\n \"total_tokens\": 91\n },\n + \ \"user\": null,\n \"metadata\": {}\n}" + headers: + CF-RAY: + - 967f62e628f6df9a-SJC + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Thu, 31 Jul 2025 19:08:51 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=7K88wLiwUejSYSpUrbbQTTUHXJchrH_l2kI9fgnxcXs-1753988931-1.0.1.1-0GVYMjJh__VuGclT3740O0X6qzhwEkxZ73NnDU0yto26W19LY229WoGh8h.VuBySdJvEqCoVFc72gOID6NOkvrw6Lw.BT.hvHaBAQIcmhQI; + path=/; expires=Thu, 31-Jul-25 19:38:51 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=VbWwIvwEnDviFllW86zIw2EM2IHt6E3y9YGf8UmVJhc-1753988931644-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + content-length: + - '1979' + openai-organization: + - braintrust-data + openai-processing-ms: + - '5122' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-decorator-operation: + - tasksapi.openai.svc.cluster.local:8081/* + x-envoy-upstream-service-time: + - '5181' + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999922' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_e9aad4c6852e5f8a6c7a20d0333cb94f + status: + code: 200 + message: OK +- request: + body: '{"input":"What''s 12 + 12?","instructions":"Just the number please","model":"gpt-4o-mini"}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate, zstd + Connection: + - keep-alive + Content-Length: + - '89' + Content-Type: + - application/json + Host: + - api.openai.com + User-Agent: + - OpenAI/Python 2.24.0 + X-Stainless-Arch: + - arm64 + X-Stainless-Async: + - 'false' + X-Stainless-Lang: + - python + X-Stainless-OS: + - MacOS + X-Stainless-Package-Version: + - 2.24.0 + X-Stainless-Raw-Response: + - 'true' + X-Stainless-Runtime: + - CPython + X-Stainless-Runtime-Version: + - 3.13.3 + x-stainless-read-timeout: + - '600' + x-stainless-retry-count: + - '0' + method: POST + uri: https://api.openai.com/v1/responses + response: + body: + string: "{\n \"id\": \"resp_072781d3a25cbe2c0069ab5142908081949a0756e144d6d8ce\",\n + \ \"object\": \"response\",\n \"created_at\": 1772835138,\n \"status\": + \"completed\",\n \"background\": false,\n \"billing\": {\n \"payer\": + \"developer\"\n },\n \"completed_at\": 1772835139,\n \"error\": null,\n + \ \"frequency_penalty\": 0.0,\n \"incomplete_details\": null,\n \"instructions\": + \"Just the number please\",\n \"max_output_tokens\": null,\n \"max_tool_calls\": + null,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n \"output\": [\n {\n + \ \"id\": \"msg_072781d3a25cbe2c0069ab51439ab481948765d84f6fffa749\",\n + \ \"type\": \"message\",\n \"status\": \"completed\",\n \"content\": + [\n {\n \"type\": \"output_text\",\n \"annotations\": + [],\n \"logprobs\": [],\n \"text\": \"24\"\n }\n + \ ],\n \"role\": \"assistant\"\n }\n ],\n \"parallel_tool_calls\": + true,\n \"presence_penalty\": 0.0,\n \"previous_response_id\": null,\n \"prompt_cache_key\": + null,\n \"prompt_cache_retention\": null,\n \"reasoning\": {\n \"effort\": + null,\n \"summary\": null\n },\n \"safety_identifier\": null,\n \"service_tier\": + \"default\",\n \"store\": true,\n \"temperature\": 1.0,\n \"text\": {\n + \ \"format\": {\n \"type\": \"text\"\n },\n \"verbosity\": \"medium\"\n + \ },\n \"tool_choice\": \"auto\",\n \"tools\": [],\n \"top_logprobs\": + 0,\n \"top_p\": 1.0,\n \"truncation\": \"disabled\",\n \"usage\": {\n \"input_tokens\": + 22,\n \"input_tokens_details\": {\n \"cached_tokens\": 0\n },\n + \ \"output_tokens\": 2,\n \"output_tokens_details\": {\n \"reasoning_tokens\": + 0\n },\n \"total_tokens\": 24\n },\n \"user\": null,\n \"metadata\": + {}\n}" + headers: + CF-RAY: + - 9d84b37cedcfac27-YYZ + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Fri, 06 Mar 2026 22:12:19 GMT + Server: + - cloudflare + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + content-length: + - '1549' + openai-organization: + - braintrust-data + openai-processing-ms: + - '1177' + openai-project: + - proj_vsCSXafhhByzWOThMrJcZiw9 + openai-version: + - '2020-10-01' + set-cookie: + - __cf_bm=PhT3ig72Cs3IPJOJSyZypWFXwQcxYyROSjZ0yHr6BuE-1772835138.0708199-1.0.1.1-ujgBleuh8ySPZNcaDbUg59SqFC6KLLFQFyKabdC7YsgVZp_FAUv9MtIRcBi1AJIAR4ab39f3t3EuQeuz_8ttgYgkWPp0Od67BdtXDE_D_fjeJKdQA9NjtFEh9hasAo0.; + HttpOnly; Secure; Path=/; Domain=api.openai.com; Expires=Fri, 06 Mar 2026 + 22:42:19 GMT + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999957' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_dcc95a0d1dd041a0a880bfc76e6c4b50 + status: + code: 200 + message: OK +version: 1 diff --git a/py/src/braintrust/wrappers/test_openai.py b/py/src/braintrust/wrappers/test_openai.py index d763cf22..44108f46 100644 --- a/py/src/braintrust/wrappers/test_openai.py +++ b/py/src/braintrust/wrappers/test_openai.py @@ -17,6 +17,45 @@ TEST_MODEL = "gpt-4o-mini" # cheapest model for tests TEST_PROMPT = "What's 12 + 12?" TEST_SYSTEM_PROMPT = "You are a helpful assistant that only responds with numbers." +RAW_RESPONSE_TEST_CASES = [ + pytest.param( + "responses", + TEST_MODEL, + TEST_PROMPT, + { + "id": "resp_test_123", + "output": [{"content": [{"text": "24"}]}], + "usage": {"input_tokens": 5, "output_tokens": 2, "total_tokens": 7}, + }, + lambda span: ( + TEST_MODEL in span["metadata"]["model"] + and span["output"][0]["content"][0]["text"] == "24" + and span["metrics"]["tokens"] == 7 + and span["metrics"]["prompt_tokens"] == 5 + and span["metrics"]["completion_tokens"] == 2 + ), + id="responses", + ), + pytest.param( + "embeddings", + "text-embedding-ada-002", + "This is a test", + { + "data": [{"embedding": [0.1, 0.2, 0.3]}], + "usage": {"prompt_tokens": 3, "total_tokens": 3}, + }, + lambda span: span["output"]["embedding_length"] == 3, + id="embeddings", + ), + pytest.param( + "moderations", + "omni-moderation-latest", + "This is a test", + {"results": [{"flagged": False, "categories": {"violence": False}}]}, + lambda span: span["output"][0]["flagged"] is False, + id="moderations", + ), +] @pytest.fixture @@ -26,6 +65,23 @@ def memory_logger(): yield bgl +class FakeRawResponse: + def __init__(self, headers, payload): + self.headers = headers + self.payload = payload + self.parse_calls = 0 + + def parse(self): + self.parse_calls += 1 + return self.payload + + +class AsyncFakeRawResponse(FakeRawResponse): + async def parse(self): + self.parse_calls += 1 + return self.payload + + def test_tracing_processor_sets_current_span(memory_logger): """Ensure that on_trace_start sets the span as current so nested spans work.""" pytest.importorskip("agents", reason="agents package not available") @@ -94,6 +150,77 @@ def test_openai_chat_metrics(memory_logger): assert TEST_PROMPT in str(span["input"]) +def test_openai_chat_with_raw_response_traces_and_returns_raw_response(memory_logger, monkeypatch): + assert not memory_logger.pop() + + client = openai.OpenAI(api_key="test-key") + payload = { + "id": "chatcmpl_test_123", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "24"}, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7}, + } + fake_raw_response = FakeRawResponse({"x-request-id": "req_chat_test_123"}, payload) + monkeypatch.setattr(client.chat.completions.with_raw_response, "create", lambda *args, **kwargs: fake_raw_response) + + wrapped_client = wrap_openai(client) + response = wrapped_client.chat.completions.with_raw_response.create( + model=TEST_MODEL, + messages=[{"role": "user", "content": TEST_PROMPT}], + ) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == "req_chat_test_123" + assert response.parse_calls == 1 + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert TEST_MODEL in span["metadata"]["model"] + assert span["output"][0]["message"]["content"] == "24" + assert span["metrics"]["tokens"] == 7 + assert span["metrics"]["prompt_tokens"] == 5 + assert span["metrics"]["completion_tokens"] == 2 + + +@pytest.mark.asyncio +async def test_openai_chat_with_raw_response_async_traces_and_returns_raw_response(memory_logger, monkeypatch): + assert not memory_logger.pop() + + client = AsyncOpenAI(api_key="test-key") + payload = { + "id": "chatcmpl_test_async_123", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "24"}, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7}, + } + fake_raw_response = AsyncFakeRawResponse({"x-request-id": "req_chat_async_test_123"}, payload) + + async def fake_create(*args, **kwargs): + return fake_raw_response + + monkeypatch.setattr(client.chat.completions.with_raw_response, "create", fake_create) + + wrapped_client = wrap_openai(client) + response = await wrapped_client.chat.completions.with_raw_response.create( + model=TEST_MODEL, + messages=[{"role": "user", "content": TEST_PROMPT}], + ) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == "req_chat_async_test_123" + assert response.parse_calls == 1 + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert TEST_MODEL in span["metadata"]["model"] + assert span["output"][0]["message"]["content"] == "24" + assert span["metrics"]["tokens"] == 7 + assert span["metrics"]["prompt_tokens"] == 5 + assert span["metrics"]["completion_tokens"] == 2 + + @pytest.mark.vcr def test_openai_responses_metrics(memory_logger): assert not memory_logger.pop() @@ -274,6 +401,226 @@ class SimpleAnswer(BaseModel): assert_metrics_are_valid(metrics, start, end) +@pytest.mark.vcr +def test_openai_responses_with_raw_response(memory_logger): + assert not memory_logger.pop() + + client = wrap_openai(openai.OpenAI()) + + start = time.time() + response = client.responses.with_raw_response.create( + model=TEST_MODEL, + input=TEST_PROMPT, + instructions="Just the number please", + ) + end = time.time() + + assert response + assert response.headers["x-request-id"] + + parsed = response.parse() + assert parsed.output + assert len(parsed.output) > 0 + content = parsed.output[0].content[0].text + assert "24" in content or "twenty-four" in content.lower() + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span + metrics = span["metrics"] + assert_metrics_are_valid(metrics, start, end) + assert 0 <= metrics.get("prompt_cached_tokens", 0) + assert 0 <= metrics.get("completion_reasoning_tokens", 0) + assert TEST_MODEL in span["metadata"]["model"] + assert span["metadata"]["provider"] == "openai" + assert TEST_PROMPT in str(span["input"]) + assert len(span["output"]) > 0 + assert span["output"][0]["content"][0]["text"] + + +@pytest.mark.parametrize( + ("resource_name", "model", "input_value", "payload", "assert_span"), + RAW_RESPONSE_TEST_CASES, +) +def test_openai_with_raw_response_traces_and_returns_raw_response( + memory_logger, monkeypatch, resource_name, model, input_value, payload, assert_span +): + """ + Regression test for https://github.com/braintrustdata/braintrust-sdk-python/issues/36. + Traced `with_raw_response` calls must still return the raw response. + """ + assert not memory_logger.pop() + + client = openai.OpenAI(api_key="test-key") + request_id = f"req_test_{resource_name}" + fake_raw_response = FakeRawResponse({"x-request-id": request_id}, payload) + monkeypatch.setattr( + getattr(client, resource_name).with_raw_response, + "create", + lambda *args, **kwargs: fake_raw_response, + ) + + wrapped_client = wrap_openai(client) + response = getattr(wrapped_client, resource_name).with_raw_response.create(model=model, input=input_value) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == request_id + assert response.parse_calls == 1 + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert span["input"] == input_value + assert assert_span(span) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("resource_name", "model", "input_value", "payload", "assert_span"), + RAW_RESPONSE_TEST_CASES, +) +async def test_openai_with_raw_response_async_traces_and_returns_raw_response( + memory_logger, monkeypatch, resource_name, model, input_value, payload, assert_span +): + assert not memory_logger.pop() + + client = AsyncOpenAI(api_key="test-key") + request_id = f"req_test_async_{resource_name}" + fake_raw_response = AsyncFakeRawResponse({"x-request-id": request_id}, payload) + + async def fake_create(*args, **kwargs): + return fake_raw_response + + monkeypatch.setattr(getattr(client, resource_name).with_raw_response, "create", fake_create) + + wrapped_client = wrap_openai(client) + response = await getattr(wrapped_client, resource_name).with_raw_response.create(model=model, input=input_value) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == request_id + assert response.parse_calls == 1 + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert span["input"] == input_value + assert assert_span(span) + + +def test_openai_chat_with_raw_response_streaming_preserves_raw_response(memory_logger, monkeypatch): + assert not memory_logger.pop() + + client = openai.OpenAI(api_key="test-key") + expected_chunks = [ + {"choices": [{"delta": {"role": "assistant", "content": "24"}}]}, + {"choices": [{"delta": {}}], "usage": {"prompt_tokens": 5, "output_tokens": 2, "total_tokens": 7}}, + ] + + class FakeRawResponse: + def __init__(self): + self.headers = {"x-request-id": "req_streaming_test"} + self.parse_calls = 0 + + def parse(self): + self.parse_calls += 1 + for chunk in expected_chunks: + yield chunk + + fake_raw_response = FakeRawResponse() + monkeypatch.setattr(client.chat.completions.with_raw_response, "create", lambda *args, **kwargs: fake_raw_response) + + wrapped_client = wrap_openai(client) + response = wrapped_client.chat.completions.with_raw_response.create( + model=TEST_MODEL, + messages=[{"role": "user", "content": TEST_PROMPT}], + stream=True, + ) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == "req_streaming_test" + assert response.parse_calls == 0 + assert not memory_logger.pop() + + chunks = list(response.parse()) + assert response.parse_calls == 1 + assert chunks == expected_chunks + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert span["metadata"]["stream"] is True + assert span["output"][0]["message"]["content"] == "24" + assert span["metrics"]["tokens"] == 7 + assert span["metrics"]["prompt_tokens"] == 5 + assert span["metrics"]["completion_tokens"] == 2 + + +@pytest.mark.asyncio +async def test_openai_chat_with_raw_response_streaming_async_preserves_raw_response(memory_logger, monkeypatch): + assert not memory_logger.pop() + + client = AsyncOpenAI(api_key="test-key") + expected_chunks = [ + {"choices": [{"delta": {"role": "assistant", "content": "24"}}]}, + {"choices": [{"delta": {}}], "usage": {"prompt_tokens": 5, "output_tokens": 2, "total_tokens": 7}}, + ] + + class FakeRawResponse: + def __init__(self): + self.headers = {"x-request-id": "req_streaming_async_test"} + self.parse_calls = 0 + + async def parse(self): + self.parse_calls += 1 + + async def gen(): + for chunk in expected_chunks: + yield chunk + + return gen() + + fake_raw_response = FakeRawResponse() + + async def fake_create(*args, **kwargs): + return fake_raw_response + + monkeypatch.setattr(client.chat.completions.with_raw_response, "create", fake_create) + + wrapped_client = wrap_openai(client) + response = await wrapped_client.chat.completions.with_raw_response.create( + model=TEST_MODEL, + messages=[{"role": "user", "content": TEST_PROMPT}], + stream=True, + ) + + assert response is fake_raw_response + assert response.headers["x-request-id"] == "req_streaming_async_test" + assert response.parse_calls == 0 + assert not memory_logger.pop() + + stream = await response.parse() + chunks = [] + async for chunk in stream: + chunks.append(chunk) + + assert response.parse_calls == 1 + assert chunks == expected_chunks + + spans = memory_logger.pop() + assert len(spans) == 1 + span = spans[0] + assert span["metadata"]["provider"] == "openai" + assert span["metadata"]["stream"] is True + assert span["output"][0]["message"]["content"] == "24" + assert span["metrics"]["tokens"] == 7 + assert span["metrics"]["prompt_tokens"] == 5 + assert span["metrics"]["completion_tokens"] == 2 + + @pytest.mark.vcr def test_openai_responses_sparse_indices(memory_logger): """Test that streaming responses with sparse/out-of-order indices are handled correctly."""