Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Record `gen_ai.client.operation.time_to_first_chunk` and `gen_ai.client.operation.time_per_output_chunk` metrics for chat completion streams.
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,11 @@ def __init__(
invocation: InferenceInvocation,
capture_content: bool,
) -> None:
super().__init__(stream)
super().__init__(
stream,
start_time_s=invocation.monotonic_start_s,
timing_target=invocation,
)
self._self_invocation = invocation
self._self_choice_buffers = []
self._self_capture_content = capture_content
Expand All @@ -203,7 +207,11 @@ def __init__(
invocation: InferenceInvocation,
capture_content: bool,
) -> None:
super().__init__(stream)
super().__init__(
stream,
start_time_s=invocation.monotonic_start_s,
timing_target=invocation,
)
self._self_invocation = invocation
self._self_choice_buffers = []
self._self_capture_content = capture_content
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,172 @@ async def test_async_chat_completion_metrics(
assert_all_metric_attributes(
output_token_usage, latest_experimental_enabled
)


# TTFC and per-output-chunk histograms have a different attribute shape than
# the duration/token histograms: streaming responses in the recorded cassettes
# do not include system_fingerprint or service_tier, so we assert only the
# core gen_ai.* attributes that should always be populated.
def assert_streaming_metric_attributes(
data_point, latest_experimental_enabled, expected_request_model
):
assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
== GenAIAttributes.GenAiOperationNameValues.CHAT.value
)

provider_name_attr_name = (
"gen_ai.provider.name"
if latest_experimental_enabled
else GenAIAttributes.GEN_AI_SYSTEM
)
assert provider_name_attr_name in data_point.attributes
assert (
data_point.attributes[provider_name_attr_name]
== GenAIAttributes.GenAiSystemValues.OPENAI.value
)

assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes
assert (
data_point.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
== expected_request_model
)
assert GenAIAttributes.GEN_AI_RESPONSE_MODEL in data_point.attributes
assert data_point.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL]

assert (
data_point.attributes[ServerAttributes.SERVER_ADDRESS]
== "api.openai.com"
)


def test_chat_completion_streaming_metrics(
metric_reader, openai_client, instrument_with_content, vcr
):
"""Regression test for the openai_v2 sync chat stream wrapper wiring.

Exercises the actual ChatStreamWrapper path so that removing
timing_target=invocation in chat_wrappers.py would cause this test to
fail, not just the util-layer tests.
"""
if not is_experimental_mode():
pytest.skip("new stream wrapper only")

latest_experimental_enabled = is_experimental_mode()
request_model = "gpt-4"

with vcr.use_cassette("test_chat_completion_streaming.yaml"):
response = openai_client.chat.completions.create(
messages=USER_ONLY_PROMPT,
model=request_model,
stream=True,
stream_options={"include_usage": True},
)
for _ in response:
pass

metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics

ttfc_metric = next(
(
m
for m in metric_data
if m.name == "gen_ai.client.operation.time_to_first_chunk"
),
None,
)
assert ttfc_metric is not None
assert len(ttfc_metric.data.data_points) == 1
ttfc_point = ttfc_metric.data.data_points[0]
assert ttfc_point.count == 1
assert ttfc_point.sum >= 0
assert_streaming_metric_attributes(
ttfc_point, latest_experimental_enabled, request_model
)

chunk_metric = next(
(
m
for m in metric_data
if m.name == "gen_ai.client.operation.time_per_output_chunk"
),
None,
)
assert chunk_metric is not None
assert len(chunk_metric.data.data_points) == 1
chunk_point = chunk_metric.data.data_points[0]
assert chunk_point.count >= 1
assert chunk_point.sum >= 0
assert_streaming_metric_attributes(
chunk_point, latest_experimental_enabled, request_model
)


@pytest.mark.asyncio()
async def test_async_chat_completion_streaming_metrics(
metric_reader, async_openai_client, instrument_with_content, vcr
):
"""Regression test for the openai_v2 async chat stream wrapper wiring.

The async path has separate __init__ wiring from the sync path in
chat_wrappers.py, so it needs its own coverage. Removing
timing_target=invocation in AsyncChatStreamWrapper would still pass
every util-layer test, but would silently break TTFC and per-output-chunk
metrics for async OpenAI streaming.
"""
if not is_experimental_mode():
pytest.skip("new stream wrapper only")

latest_experimental_enabled = is_experimental_mode()
request_model = "gpt-4"

with vcr.use_cassette("test_async_chat_completion_streaming.yaml"):
response = await async_openai_client.chat.completions.create(
messages=USER_ONLY_PROMPT,
model=request_model,
stream=True,
stream_options={"include_usage": True},
)
async for _ in response:
pass

metrics = metric_reader.get_metrics_data().resource_metrics
assert len(metrics) == 1
metric_data = metrics[0].scope_metrics[0].metrics

ttfc_metric = next(
(
m
for m in metric_data
if m.name == "gen_ai.client.operation.time_to_first_chunk"
),
None,
)
assert ttfc_metric is not None
assert len(ttfc_metric.data.data_points) == 1
ttfc_point = ttfc_metric.data.data_points[0]
assert ttfc_point.count == 1
assert ttfc_point.sum >= 0
assert_streaming_metric_attributes(
ttfc_point, latest_experimental_enabled, request_model
)

chunk_metric = next(
(
m
for m in metric_data
if m.name == "gen_ai.client.operation.time_per_output_chunk"
),
None,
)
assert chunk_metric is not None
assert len(chunk_metric.data.data_points) == 1
chunk_point = chunk_metric.data.data_points[0]
assert chunk_point.count >= 1
assert chunk_point.sum >= 0
assert_streaming_metric_attributes(
chunk_point, latest_experimental_enabled, request_model
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
interactions:
- request:
body: |-
{
"contents": [
{
"parts": [
{
"text": "Create a haiku about Open Telemetry."
}
],
"role": "user"
}
]
}
headers:
accept:
- '*/*'
accept-encoding:
- identity
connection:
- keep-alive
content-length:
- '93'
content-type:
- application/json
host:
- us-central1-aiplatform.googleapis.com
user-agent:
- google-genai-sdk/1.32.0 gl-python/3.10.18
x-goog-api-client:
- <REDACTED>
x-goog-user-project:
- <REDACTED>
method: POST
uri: https://test-location-aiplatform.googleapis.com/v1beta1/projects/test-project/locations/test-location/publishers/google/models/gemini-2.5-flash:generateContent
response:
body:
string: |-
{
"candidates": [
{
"content": {
"role": "model",
"parts": [
{
"text": "Open data streams,\nMetrics, logs, and traces flow,\nClearly see inside."
}
]
},
"finishReason": "STOP",
"avgLogprobs": -5.934557172987196
}
],
"usageMetadata": {
"promptTokenCount": 8,
"candidatesTokenCount": 18,
"totalTokenCount": 459,
"trafficType": "ON_DEMAND",
"promptTokensDetails": [
{
"modality": "TEXT",
"tokenCount": 8
}
],
"candidatesTokensDetails": [
{
"modality": "TEXT",
"tokenCount": 18
}
],
"thoughtsTokenCount": 433
},
"modelVersion": "gemini-2.5-flash",
"createTime": "2025-10-10T16:32:45.350496Z",
"responseId": "LTXpaKCyFdPlnvgPuajSiQQ"
}
headers:
Accept-Ranges:
- none
Content-Type:
- application/json; charset=UTF-8
Date:
- Fri, 10 Oct 2025 16:32:59 GMT
Server:
- scaffolding on HTTPServer2
Transfer-Encoding:
- chunked
Vary:
- X-Origin
- Referer
- Origin,Accept-Encoding
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- SAMEORIGIN
X-XSS-Protection:
- '0'
status:
code: 200
message: OK
version: 1
Loading