From 54277f093226063e30d7ff748191265bea95c2e4 Mon Sep 17 00:00:00 2001 From: duanbing Date: Wed, 20 May 2026 13:37:43 +0800 Subject: [PATCH 1/2] novita: fix Wan 2.7 R2V media item types to match upstream enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Wan 2.7 R2V (`/v3/async/wan2.7-r2v`) endpoint requires each item in the `media` array to carry a `type` value from the enum: - `reference_image` - `reference_video` - `first_frame` We were sending `image` and `video`, which Novita rejects with the generic "failed to exec task" 500 — every R2V submission via the playground / legacy `image_urls`+`video_urls` shape was failing silently for that reason. Two changes in `build_body`: 1. Repack each `image_urls[]` URL as `{type: "reference_image", url}` and each `video_urls[]` URL as `{type: "reference_video", url}`. No way to express `first_frame` or per-item `reference_voice` from the legacy flat shape — callers who want those use the new pass-through path below. 2. Pass `media` through the allowed-fields whitelist for the R2V shape so direct API callers / a future media-editor UI can submit the rich shape (`[{type, url, reference_voice?}, ...]`) verbatim. The `!body.contains_key("media")` guard in the repack block ensures the pass-through wins when both shapes are present. Also cap the synthesised `media` array at 5 items to match Novita's documented ceiling (combined images+videos ≤ 5), so users who upload more get a deterministic truncation that keeps the first 5 items (images are added before videos, so trailing entries are dropped) rather than a 422. 
--- tensorzero-core/src/providers/novita.rs | 29 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tensorzero-core/src/providers/novita.rs b/tensorzero-core/src/providers/novita.rs index b072294148..3a59766835 100644 --- a/tensorzero-core/src/providers/novita.rs +++ b/tensorzero-core/src/providers/novita.rs @@ -394,6 +394,7 @@ fn build_body(shape: &NovitaRequestShape, input: &Value) -> Result "shot_type", "watermark", "negative_prompt", + "media", ], // Wan 2.7 video editing. Per `/v3/async/wan2.7-videoedit`: // video_url (remapped from `video_urls[0]`), prompt (auto, @@ -638,24 +639,38 @@ fn build_body(shape: &NovitaRequestShape, input: &Value) -> Result } // Wan 2.7 R2V: body wants `media` — an array of objects with a - // `type` ("image"|"video") + `url`. Playground sends - // `image_urls` and `video_urls` as separate arrays. Build the - // unified list; total capped at 5 by Novita (images ≤5, - // videos ≤3 — we don't enforce here, let upstream return its - // own error since `parameter_schema` already gates the inputs). + // `type` (`reference_image`|`reference_video`|`first_frame`) + + // `url`, plus optional per-item `reference_voice` (voice-clone + // audio, MP3/WAV/FLAC, 3–30s). + // + // Two input shapes are supported: + // (a) Playground/legacy flat shape: `image_urls` + `video_urls` + // arrays. Each entry becomes a `reference_image` / + // `reference_video` media item. No way to express + // `first_frame` or `reference_voice` in this shape. + // (b) New rich shape: caller passes a `media` array of objects + // directly. Used by the SPA's media-editor UI and any + // direct API caller. We pass it through verbatim, just + // capping at Novita's 5-item ceiling. + // + // Novita enforces (total ≤ 5; images 0–5; videos 0–3) on its end, + // so we don't double-validate beyond the 5-cap. 
if matches!(shape, NovitaRequestShape::Wan27ReferenceToVideo) && !body.contains_key("media") { let mut media: Vec = Vec::new(); if let Some(imgs) = input.get("image_urls").and_then(Value::as_array) { for u in imgs.iter().filter_map(Value::as_str) { - media.push(json!({ "type": "image", "url": u })); + media.push(json!({ "type": "reference_image", "url": u })); } } if let Some(vids) = input.get("video_urls").and_then(Value::as_array) { for u in vids.iter().filter_map(Value::as_str) { - media.push(json!({ "type": "video", "url": u })); + media.push(json!({ "type": "reference_video", "url": u })); } } if !media.is_empty() { + if media.len() > 5 { + media.truncate(5); + } body.insert("media".into(), Value::Array(media)); } } From 6498b88269f65d9c3e3f830b335359b9f47a805d Mon Sep 17 00:00:00 2001 From: duanbing Date: Fri, 29 May 2026 07:14:45 +0800 Subject: [PATCH 2/2] usage: preserve OpenAI cached_tokens through Usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TensorZero normalized provider usage to {input_tokens, output_tokens} and dropped OpenAI's prompt_tokens_details.cached_tokens. RouterBase routes chat LLMs (incl. Claude/Gemini via Novita's OpenAI-compat endpoint) through this gateway and needs the prompt-cache read count to bill cache reads at the discounted rate and show users their savings. - Add `cached_tokens: Option` to Usage (ts-bindings + skip-if-none). - Parse prompt_tokens_details.cached_tokens in the OpenAI provider's OpenAIUsage → Usage conversion. - Thread through Usage::zero(), the streaming aggregator (keeps the largest value seen across chunks), and the cross-inference aggregator (sum, treating None as 0; None only if every input is None). Anthropic/Bedrock/Vertex native paths leave it None (out of scope; our Anthropic models use the openai-compat path). cargo check --package tensorzero-core (lib) clean; no new clippy warnings. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tensorzero-core/src/embeddings.rs | 2 + .../src/endpoints/batch_inference.rs | 1 + tensorzero-core/src/endpoints/inference.rs | 9 ++++ .../openai_compatible/types/streaming.rs | 5 ++ .../src/function/function_config.rs | 11 +++++ tensorzero-core/src/inference/types/mod.rs | 14 ++++++ .../src/inference/types/streams.rs | 27 +++++++++++ tensorzero-core/src/inference/types/usage.rs | 47 +++++++++++++++++++ tensorzero-core/src/model.rs | 2 + tensorzero-core/src/providers/anthropic.rs | 1 + tensorzero-core/src/providers/aws_bedrock.rs | 2 + tensorzero-core/src/providers/azure.rs | 1 + tensorzero-core/src/providers/deepseek.rs | 1 + tensorzero-core/src/providers/dummy.rs | 6 +++ .../src/providers/fireworks/mod.rs | 4 ++ .../src/providers/gcp_vertex_anthropic.rs | 1 + .../src/providers/gcp_vertex_gemini/mod.rs | 6 +++ .../src/providers/google_ai_studio_gemini.rs | 4 ++ tensorzero-core/src/providers/groq.rs | 2 + tensorzero-core/src/providers/hyperbolic.rs | 1 + tensorzero-core/src/providers/mistral.rs | 1 + tensorzero-core/src/providers/novita.rs | 6 ++- tensorzero-core/src/providers/openai/mod.rs | 17 +++++++ .../src/providers/openai/responses.rs | 6 +++ tensorzero-core/src/providers/openrouter.rs | 2 + tensorzero-core/src/providers/sglang.rs | 1 + tensorzero-core/src/providers/tgi.rs | 1 + tensorzero-core/src/providers/together.rs | 6 +++ tensorzero-core/src/providers/vllm.rs | 1 + tensorzero-core/src/providers/xai.rs | 1 + tensorzero-core/src/relay.rs | 1 + .../src/variant/best_of_n_sampling.rs | 7 +++ .../src/variant/chat_completion/mod.rs | 7 +++ tensorzero-core/src/variant/mixture_of_n.rs | 9 ++++ tensorzero-core/src/variant/mod.rs | 3 ++ tensorzero-core/tests/e2e/cache.rs | 7 +++ .../db/batch_inference_endpoint_internals.rs | 4 ++ tensorzero-core/tests/e2e/mixture_of_n.rs | 6 +++ 38 files changed, 232 insertions(+), 1 deletion(-) diff --git a/tensorzero-core/src/embeddings.rs b/tensorzero-core/src/embeddings.rs index 
8346936c41..bd85804f6b 100644 --- a/tensorzero-core/src/embeddings.rs +++ b/tensorzero-core/src/embeddings.rs @@ -423,6 +423,7 @@ impl EmbeddingModelResponse { usage: Usage { input_tokens: cache_lookup.input_tokens, output_tokens: cache_lookup.output_tokens, + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: Duration::from_secs(0), @@ -442,6 +443,7 @@ impl EmbeddingModelResponse { Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, } } else { self.usage diff --git a/tensorzero-core/src/endpoints/batch_inference.rs b/tensorzero-core/src/endpoints/batch_inference.rs index 1953ba87d8..e668bc12ae 100644 --- a/tensorzero-core/src/endpoints/batch_inference.rs +++ b/tensorzero-core/src/endpoints/batch_inference.rs @@ -1162,6 +1162,7 @@ fn convert_row_to_inference_response( let usage = Usage { input_tokens: row.input_tokens, output_tokens: row.output_tokens, + cached_tokens: None, }; match function { diff --git a/tensorzero-core/src/endpoints/inference.rs b/tensorzero-core/src/endpoints/inference.rs index 6c467ae3f1..64bcd12b1b 100644 --- a/tensorzero-core/src/endpoints/inference.rs +++ b/tensorzero-core/src/endpoints/inference.rs @@ -1720,6 +1720,7 @@ impl InferenceResponseChunk { inference_result.usage().map(|_| Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, }) } else { inference_result.usage().copied() @@ -2627,6 +2628,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: Some(raw_usage_entries.clone()), raw_response: None, @@ -2678,6 +2680,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: Some(raw_usage_entries), raw_response: None, @@ -2713,6 +2716,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -2745,6 +2749,7 @@ mod tests { usage: Some(Usage { 
input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -2791,6 +2796,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(30), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: Some(raw_usage_entries), raw_response: None, @@ -2878,6 +2884,7 @@ mod tests { usage: Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -2976,6 +2983,7 @@ mod tests { usage: Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -3056,6 +3064,7 @@ mod tests { usage: Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: Duration::from_millis(100), diff --git a/tensorzero-core/src/endpoints/openai_compatible/types/streaming.rs b/tensorzero-core/src/endpoints/openai_compatible/types/streaming.rs index 41a599f677..6f6c8e6952 100644 --- a/tensorzero-core/src/endpoints/openai_compatible/types/streaming.rs +++ b/tensorzero-core/src/endpoints/openai_compatible/types/streaming.rs @@ -304,6 +304,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: None, finish_reason: None, @@ -407,6 +408,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(50), output_tokens: Some(50), + cached_tokens: None, }), raw_usage: Some(vec![raw_usage_entry.clone()]), finish_reason: None, @@ -464,6 +466,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(50), output_tokens: Some(50), + cached_tokens: None, }), raw_usage: Some(vec![raw_usage_entry]), finish_reason: None, @@ -504,6 +507,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }), raw_usage: None, finish_reason: None, @@ -555,6 +559,7 @@ mod tests { usage: 
Some(Usage { input_tokens: Some(100), output_tokens: Some(200), + cached_tokens: None, }), raw_usage: None, finish_reason: None, diff --git a/tensorzero-core/src/function/function_config.rs b/tensorzero-core/src/function/function_config.rs index e3b9aa44ad..2ce0ca2af8 100644 --- a/tensorzero-core/src/function/function_config.rs +++ b/tensorzero-core/src/function/function_config.rs @@ -1898,6 +1898,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -1966,6 +1967,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -2020,6 +2022,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -2077,6 +2080,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let model_response = ModelInferenceResponseWithMetadata { id: Uuid::now_v7(), @@ -2131,6 +2135,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let model_response = ModelInferenceResponseWithMetadata { id: Uuid::now_v7(), @@ -2185,6 +2190,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(0), + cached_tokens: None, }; let model_response = ModelInferenceResponseWithMetadata { id: Uuid::now_v7(), @@ -2257,6 +2263,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -2305,6 +2312,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = 
Latency::NonStreaming { response_time: Duration::from_millis(100), @@ -2361,6 +2369,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let model_response = ModelInferenceResponseWithMetadata { id: Uuid::now_v7(), @@ -2414,6 +2423,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let model_response = ModelInferenceResponseWithMetadata { id: Uuid::now_v7(), @@ -2473,6 +2483,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), diff --git a/tensorzero-core/src/inference/types/mod.rs b/tensorzero-core/src/inference/types/mod.rs index bb32b34333..e0067cdcc5 100644 --- a/tensorzero-core/src/inference/types/mod.rs +++ b/tensorzero-core/src/inference/types/mod.rs @@ -1375,6 +1375,7 @@ impl ModelInferenceResponseWithMetadata { Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, } } else { self.usage @@ -1659,6 +1660,7 @@ impl ModelInferenceResponse { usage: Usage { input_tokens: cache_lookup.input_tokens, output_tokens: cache_lookup.output_tokens, + cached_tokens: None, }, provider_latency: Latency::NonStreaming { response_time: Duration::from_secs(0), @@ -2308,6 +2310,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }; let raw_request = "raw request".to_string(); let model_inference_responses = vec![ModelInferenceResponseWithMetadata { @@ -3123,6 +3126,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, false, ), @@ -3130,6 +3134,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, false, ), @@ -3155,6 +3160,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, false, ), @@ -3162,6 +3168,7 @@ 
mod tests { Usage { input_tokens: None, output_tokens: Some(25), + cached_tokens: None, }, false, ), @@ -3187,6 +3194,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, false, ), @@ -3194,6 +3202,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: None, + cached_tokens: None, }, false, ), @@ -3219,6 +3228,7 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, false, ), @@ -3226,6 +3236,7 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, false, ), @@ -3252,6 +3263,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, true, ), // This will be treated as 0/0 due to cached=true @@ -3259,6 +3271,7 @@ mod tests { Usage { input_tokens: None, output_tokens: Some(25), + cached_tokens: None, }, false, ), @@ -3333,6 +3346,7 @@ mod tests { let usage = Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }; // Create responses with different finish reasons and IDs diff --git a/tensorzero-core/src/inference/types/streams.rs b/tensorzero-core/src/inference/types/streams.rs index 4ee9604cd3..ac3cbd48cf 100644 --- a/tensorzero-core/src/inference/types/streams.rs +++ b/tensorzero-core/src/inference/types/streams.rs @@ -1039,6 +1039,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(2), output_tokens: Some(4), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -1072,6 +1073,7 @@ mod tests { model_inference_usage: Usage { input_tokens: Some(2), output_tokens: Some(4), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -1126,10 +1128,12 @@ mod tests { let usage1 = Usage { input_tokens: Some(10), output_tokens: Some(5), + cached_tokens: None, }; let usage2 = Usage { input_tokens: Some(5), output_tokens: Some(10), + cached_tokens: None, }; let chunks = vec![ InferenceResultChunk::Json(JsonInferenceResultChunk { @@ -1194,6 +1198,7 @@ mod 
tests { model_inference_usage: Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -1203,6 +1208,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, } ); match response { @@ -1235,6 +1241,7 @@ mod tests { let model_inference_usage = Usage { input_tokens: Some(10), output_tokens: Some(5), + cached_tokens: None, }; let chunks = vec![ InferenceResultChunk::Json(JsonInferenceResultChunk { @@ -1328,6 +1335,7 @@ mod tests { let model_inference_usage = Usage { input_tokens: Some(15), output_tokens: Some(10), + cached_tokens: None, }; let chunks = vec![ InferenceResultChunk::Json(JsonInferenceResultChunk { @@ -1464,10 +1472,12 @@ mod tests { let usage1 = Usage { input_tokens: Some(10), output_tokens: Some(5), + cached_tokens: None, }; let usage2 = Usage { input_tokens: Some(5), output_tokens: Some(10), + cached_tokens: None, }; let chunks = vec![ InferenceResultChunk::Json(JsonInferenceResultChunk { @@ -1532,6 +1542,7 @@ mod tests { model_inference_usage: Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -1541,6 +1552,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, } ); match response { @@ -1589,10 +1601,12 @@ mod tests { let usage1 = Usage { input_tokens: Some(10), output_tokens: Some(5), + cached_tokens: None, }; let usage2 = Usage { input_tokens: Some(5), output_tokens: Some(10), + cached_tokens: None, }; let dynamic_output_schema = JSONSchema::compile_background(serde_json::json!({ "type": "object", @@ -1666,6 +1680,7 @@ mod tests { model_inference_usage: Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, }, finish_reason: Some(FinishReason::ToolCall), }; @@ -1675,6 +1690,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(15), + cached_tokens: None, } ); match response { @@ 
-1808,6 +1824,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(2), output_tokens: Some(4), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -1858,6 +1875,7 @@ mod tests { model_inference_usage: Usage { input_tokens: Some(2), output_tokens: Some(4), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -1867,6 +1885,7 @@ mod tests { Usage { input_tokens: Some(2), output_tokens: Some(4), + cached_tokens: None, } ); let chat_result = match result { @@ -1966,6 +1985,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: None, raw_chunk: "chunk2".to_string(), @@ -2058,6 +2078,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }), raw_usage: None, raw_chunk: "chunk2".to_string(), @@ -2141,6 +2162,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(5), output_tokens: Some(10), + cached_tokens: None, }), raw_usage: None, raw_chunk: "chunk2".to_string(), @@ -2228,6 +2250,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(20), output_tokens: Some(15), + cached_tokens: None, }), raw_usage: None, raw_chunk: "chunk2".to_string(), @@ -2299,6 +2322,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(5), output_tokens: Some(5), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -2422,6 +2446,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(20), output_tokens: Some(30), + cached_tokens: None, }), raw_usage: None, raw_response: None, @@ -2504,6 +2529,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: None, raw_response: "raw response".to_string(), @@ -2522,6 +2548,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }) ); assert_eq!(result.finish_reason, Some(FinishReason::ToolCall)); diff --git a/tensorzero-core/src/inference/types/usage.rs 
b/tensorzero-core/src/inference/types/usage.rs index 3427b9b557..18282cefd3 100644 --- a/tensorzero-core/src/inference/types/usage.rs +++ b/tensorzero-core/src/inference/types/usage.rs @@ -58,6 +58,9 @@ pub struct RawResponseEntry { pub struct Usage { pub input_tokens: Option, pub output_tokens: Option, + #[cfg_attr(feature = "ts-bindings", ts(export, optional_fields))] + #[serde(skip_serializing_if = "Option::is_none", default)] + pub cached_tokens: Option, } impl Usage { @@ -65,6 +68,7 @@ impl Usage { Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: Some(0), } } @@ -103,6 +107,7 @@ where let Usage { input_tokens: chunk_input_tokens, output_tokens: chunk_output_tokens, + cached_tokens: chunk_cached_tokens, } = chunk_usage; acc.input_tokens = match (acc.input_tokens, chunk_input_tokens) { @@ -143,6 +148,18 @@ where } }; + acc.cached_tokens = match (acc.cached_tokens, chunk_cached_tokens) { + (_, None) => acc.cached_tokens, + (None, chunk_value) => chunk_value, + (Some(current_value), Some(chunk_value)) => { + if current_value < chunk_value { + Some(chunk_value) + } else { + Some(current_value) + } + } + }; + acc }) } @@ -163,6 +180,7 @@ where let Usage { input_tokens: mi_input_tokens, output_tokens: mi_output_tokens, + cached_tokens: mi_cached_tokens, } = mi_usage; Usage { @@ -174,6 +192,11 @@ where (Some(a), Some(b)) => Some(a + b), _ => None, }, + // Sum `cached_tokens` treating `None` as 0; result is `None` only if both are `None`. 
+ cached_tokens: match (acc.cached_tokens, mi_cached_tokens) { + (None, None) => None, + (a, b) => Some(a.unwrap_or(0) + b.unwrap_or(0)), + }, } }) } @@ -250,6 +273,7 @@ mod tests { let usage = Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }; let result = aggregate_usage_from_single_streaming_model_inference(vec![usage]); assert_eq!( @@ -271,14 +295,17 @@ mod tests { Usage { input_tokens: Some(100), output_tokens: Some(10), + cached_tokens: None, }, Usage { input_tokens: Some(100), output_tokens: Some(25), + cached_tokens: None, }, Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, ]; let result = aggregate_usage_from_single_streaming_model_inference(chunks); @@ -301,14 +328,17 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, Usage { input_tokens: Some(200), output_tokens: Some(100), + cached_tokens: None, }, ]; let result = aggregate_usage_from_single_streaming_model_inference(chunks); @@ -330,10 +360,12 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, ]; let result = aggregate_usage_from_single_streaming_model_inference(chunks); @@ -353,10 +385,12 @@ mod tests { Usage { input_tokens: Some(100), output_tokens: None, + cached_tokens: None, }, Usage { input_tokens: None, output_tokens: Some(50), + cached_tokens: None, }, ]; let result = aggregate_usage_from_single_streaming_model_inference(chunks); @@ -381,10 +415,12 @@ mod tests { Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, Usage { input_tokens: Some(80), // Smaller than previous (unexpected) output_tokens: Some(30), // Smaller than previous (unexpected) + cached_tokens: None, }, ]; // This will panic due to debug_assert! 
when non-cumulative values are detected @@ -413,6 +449,7 @@ mod tests { let usage = Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }; let result = aggregate_usage_across_model_inferences(vec![usage]); assert_eq!( @@ -433,10 +470,12 @@ mod tests { Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, Usage { input_tokens: Some(200), output_tokens: Some(100), + cached_tokens: None, }, ]; let result = aggregate_usage_across_model_inferences(usages); @@ -458,10 +497,12 @@ mod tests { Usage { input_tokens: Some(100), output_tokens: Some(50), + cached_tokens: None, }, Usage { input_tokens: None, // This should propagate None for input_tokens output_tokens: Some(100), + cached_tokens: None, }, ]; let result = aggregate_usage_across_model_inferences(usages); @@ -482,10 +523,12 @@ mod tests { Usage { input_tokens: None, output_tokens: Some(50), + cached_tokens: None, }, Usage { input_tokens: Some(100), output_tokens: None, + cached_tokens: None, }, ]; let result = aggregate_usage_across_model_inferences(usages); @@ -505,10 +548,12 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, }, ]; let result = aggregate_usage_across_model_inferences(usages); @@ -529,11 +574,13 @@ mod tests { Usage { input_tokens: Some(69), output_tokens: Some(1), + cached_tokens: None, }, // message_delta chunk: only output_tokens, no input_tokens Usage { input_tokens: None, output_tokens: Some(100), + cached_tokens: None, }, ]; diff --git a/tensorzero-core/src/model.rs b/tensorzero-core/src/model.rs index ea818adf3e..02e80b513a 100644 --- a/tensorzero-core/src/model.rs +++ b/tensorzero-core/src/model.rs @@ -2926,6 +2926,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); assert_eq!(&*response.model_provider_name, "good_provider"); @@ -3192,6 +3193,7 @@ mod tests { Usage { 
input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); assert_eq!(&*response.model_provider_name, "good_provider"); diff --git a/tensorzero-core/src/providers/anthropic.rs b/tensorzero-core/src/providers/anthropic.rs index fe927424fa..c2bcabd216 100644 --- a/tensorzero-core/src/providers/anthropic.rs +++ b/tensorzero-core/src/providers/anthropic.rs @@ -1255,6 +1255,7 @@ impl From for Usage { Usage { input_tokens: total_input_tokens, output_tokens: value.output_tokens, + cached_tokens: None, } } } diff --git a/tensorzero-core/src/providers/aws_bedrock.rs b/tensorzero-core/src/providers/aws_bedrock.rs index 05da9c35e6..00742a97db 100644 --- a/tensorzero-core/src/providers/aws_bedrock.rs +++ b/tensorzero-core/src/providers/aws_bedrock.rs @@ -879,6 +879,7 @@ fn convert_converse_response( let usage = Usage { input_tokens: Some(total_input_tokens), output_tokens: Some(response.usage.output_tokens as u32), + cached_tokens: None, }; // Extract raw usage from response @@ -1243,6 +1244,7 @@ fn process_stream_event( let usage = Some(Usage { input_tokens: Some(total_input_tokens), output_tokens: Some(event.usage.output_tokens as u32), + cached_tokens: None, }); Ok(Some(ProviderInferenceResponseChunk::new_with_raw_usage( diff --git a/tensorzero-core/src/providers/azure.rs b/tensorzero-core/src/providers/azure.rs index f0510c78f5..01721b36a3 100644 --- a/tensorzero-core/src/providers/azure.rs +++ b/tensorzero-core/src/providers/azure.rs @@ -1124,6 +1124,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { diff --git a/tensorzero-core/src/providers/deepseek.rs b/tensorzero-core/src/providers/deepseek.rs index e5e74f1834..2a0df423d3 100644 --- a/tensorzero-core/src/providers/deepseek.rs +++ b/tensorzero-core/src/providers/deepseek.rs @@ -1064,6 +1064,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: 
Some(20), + prompt_tokens_details: None, }, }; let generic_request = ModelInferenceRequest { diff --git a/tensorzero-core/src/providers/dummy.rs b/tensorzero-core/src/providers/dummy.rs index 1f4e7c58d9..9a74ad3a35 100644 --- a/tensorzero-core/src/providers/dummy.rs +++ b/tensorzero-core/src/providers/dummy.rs @@ -85,22 +85,27 @@ impl DummyProvider { "input_tokens_zero" => Usage { input_tokens: Some(0), output_tokens: Some(output_tokens), + cached_tokens: None, }, "output_tokens_zero" => Usage { input_tokens: Some(10), output_tokens: Some(0), + cached_tokens: None, }, "input_tokens_output_tokens_zero" => Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, }, "input_five_output_six" => Usage { input_tokens: Some(5), output_tokens: Some(6), + cached_tokens: None, }, _ => Usage { input_tokens: Some(10), output_tokens: Some(output_tokens), + cached_tokens: None, }, } } @@ -933,6 +938,7 @@ impl EmbeddingProvider for DummyProvider { let usage = Usage { input_tokens: Some(10), output_tokens: Some(0), + cached_tokens: None, }; let latency = Latency::NonStreaming { response_time: Duration::from_millis(100), diff --git a/tensorzero-core/src/providers/fireworks/mod.rs b/tensorzero-core/src/providers/fireworks/mod.rs index 5368a3a073..43763ac85a 100644 --- a/tensorzero-core/src/providers/fireworks/mod.rs +++ b/tensorzero-core/src/providers/fireworks/mod.rs @@ -1029,6 +1029,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; @@ -1218,6 +1219,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; let generic_request = ModelInferenceRequest { @@ -1357,6 +1359,7 @@ mod tests { let usage = OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }; let chunk = FireworksChatChunk { choices: vec![], @@ -1411,6 +1414,7 @@ mod tests { Some(Usage { input_tokens: Some(10), 
output_tokens: Some(20), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); diff --git a/tensorzero-core/src/providers/gcp_vertex_anthropic.rs b/tensorzero-core/src/providers/gcp_vertex_anthropic.rs index c4daccff0e..4a53560ae7 100644 --- a/tensorzero-core/src/providers/gcp_vertex_anthropic.rs +++ b/tensorzero-core/src/providers/gcp_vertex_anthropic.rs @@ -757,6 +757,7 @@ impl From for Usage { Usage { input_tokens: total_input_tokens, output_tokens: value.output_tokens, + cached_tokens: None, } } } diff --git a/tensorzero-core/src/providers/gcp_vertex_gemini/mod.rs b/tensorzero-core/src/providers/gcp_vertex_gemini/mod.rs index fb88023812..c69be5cb66 100644 --- a/tensorzero-core/src/providers/gcp_vertex_gemini/mod.rs +++ b/tensorzero-core/src/providers/gcp_vertex_gemini/mod.rs @@ -820,6 +820,7 @@ fn make_provider_batch_inference_output( let usage = Usage { input_tokens: usage_metadata.prompt_token_count, output_tokens: usage_metadata.output_tokens(), + cached_tokens: None, }; let (output, finish_reason) = get_response_content( @@ -3018,6 +3019,7 @@ impl<'a> TryFrom> for ProviderInferenceR let usage = Usage { input_tokens: usage_metadata.prompt_token_count, output_tokens: usage_metadata.output_tokens(), + cached_tokens: None, }; let system = generic_request.system.clone(); @@ -3111,6 +3113,7 @@ fn convert_stream_response_with_metadata_to_chunk( Some(Usage { input_tokens: metadata.prompt_token_count, output_tokens: metadata.output_tokens(), + cached_tokens: None, }) } else { None @@ -3786,6 +3789,7 @@ mod tests { Usage { input_tokens: None, output_tokens: None, + cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); @@ -3901,6 +3905,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(20), + cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); @@ -4030,6 +4035,7 @@ mod tests { Usage { input_tokens: Some(25), output_tokens: Some(40), + 
cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); diff --git a/tensorzero-core/src/providers/google_ai_studio_gemini.rs b/tensorzero-core/src/providers/google_ai_studio_gemini.rs index 8ec2479a54..f42e5b0306 100644 --- a/tensorzero-core/src/providers/google_ai_studio_gemini.rs +++ b/tensorzero-core/src/providers/google_ai_studio_gemini.rs @@ -1307,6 +1307,7 @@ impl From for Usage { Usage { input_tokens: usage_metadata.prompt_token_count, output_tokens, + cached_tokens: None, } } } @@ -2154,6 +2155,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(10), + cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); @@ -2269,6 +2271,7 @@ mod tests { Usage { input_tokens: Some(15), output_tokens: Some(20), + cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); @@ -2397,6 +2400,7 @@ mod tests { Usage { input_tokens: Some(25), output_tokens: Some(40), + cached_tokens: None, } ); assert_eq!(model_inference_response.provider_latency, latency); diff --git a/tensorzero-core/src/providers/groq.rs b/tensorzero-core/src/providers/groq.rs index a81e0a0b5f..cacd8d70ce 100644 --- a/tensorzero-core/src/providers/groq.rs +++ b/tensorzero-core/src/providers/groq.rs @@ -1210,6 +1210,7 @@ impl From for Usage { Usage { input_tokens: Some(usage.prompt_tokens), output_tokens: Some(usage.completion_tokens), + cached_tokens: None, } } } @@ -2568,6 +2569,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); diff --git a/tensorzero-core/src/providers/hyperbolic.rs b/tensorzero-core/src/providers/hyperbolic.rs index f49110eb2e..11adf5b98c 100644 --- a/tensorzero-core/src/providers/hyperbolic.rs +++ b/tensorzero-core/src/providers/hyperbolic.rs @@ -621,6 +621,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + 
prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { diff --git a/tensorzero-core/src/providers/mistral.rs b/tensorzero-core/src/providers/mistral.rs index 56f8597f2b..c1f43e7b67 100644 --- a/tensorzero-core/src/providers/mistral.rs +++ b/tensorzero-core/src/providers/mistral.rs @@ -646,6 +646,7 @@ impl From for Usage { Usage { input_tokens: Some(usage.prompt_tokens), output_tokens: Some(usage.completion_tokens), + cached_tokens: None, } } } diff --git a/tensorzero-core/src/providers/novita.rs b/tensorzero-core/src/providers/novita.rs index 3a59766835..1214529f08 100644 --- a/tensorzero-core/src/providers/novita.rs +++ b/tensorzero-core/src/providers/novita.rs @@ -626,7 +626,11 @@ fn build_body(shape: &NovitaRequestShape, input: &Value) -> Result // already set them explicitly. if matches!(shape, NovitaRequestShape::Wan27VideoEdit) { if let Some(imgs) = input.get("image_urls").and_then(Value::as_array) { - let slot_names = ["reference_image_url", "reference_image_url_2", "reference_image_url_3"]; + let slot_names = [ + "reference_image_url", + "reference_image_url_2", + "reference_image_url_3", + ]; for (idx, slot) in slot_names.iter().enumerate() { if body.contains_key(*slot) { continue; diff --git a/tensorzero-core/src/providers/openai/mod.rs b/tensorzero-core/src/providers/openai/mod.rs index dc0553d8c7..d3de6ed5a3 100644 --- a/tensorzero-core/src/providers/openai/mod.rs +++ b/tensorzero-core/src/providers/openai/mod.rs @@ -2555,10 +2555,18 @@ impl<'a> OpenAIBatchRequest<'a> { } } +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub(super) struct OpenAIPromptTokensDetails { + #[serde(default)] + pub cached_tokens: Option, +} + #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub(super) struct OpenAIUsage { pub prompt_tokens: Option, pub completion_tokens: Option, + #[serde(default)] + pub prompt_tokens_details: Option, } impl From for Usage { @@ -2566,6 +2574,7 @@ impl From for Usage { Usage { 
input_tokens: usage.prompt_tokens, output_tokens: usage.completion_tokens, + cached_tokens: usage.prompt_tokens_details.and_then(|d| d.cached_tokens), } } } @@ -2589,6 +2598,7 @@ impl From for Usage { Usage { input_tokens: usage.prompt_tokens, output_tokens: Some(0), // this is always zero for embeddings + cached_tokens: None, } } } @@ -3657,6 +3667,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { @@ -3750,6 +3761,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(15), completion_tokens: Some(25), + prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { @@ -3835,6 +3847,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(5), completion_tokens: Some(0), + prompt_tokens_details: None, }), }; let request_body = OpenAIRequest { @@ -3889,6 +3902,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(10), + prompt_tokens_details: None, }), }; @@ -4249,6 +4263,7 @@ mod tests { let usage = OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }; let chunk = OpenAIChatChunk { choices: vec![], @@ -4300,6 +4315,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); @@ -5773,6 +5789,7 @@ mod tests { let openai_usage = Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }); let usage: Usage = openai_usage.into(); assert_eq!(usage.input_tokens, Some(10), "input_tokens should be 10"); diff --git a/tensorzero-core/src/providers/openai/responses.rs b/tensorzero-core/src/providers/openai/responses.rs index f57aaaeb6e..21f06c5989 100644 --- a/tensorzero-core/src/providers/openai/responses.rs +++ b/tensorzero-core/src/providers/openai/responses.rs @@ -126,6 +126,7 @@ 
impl From for Usage { Usage { input_tokens: usage.input_tokens, output_tokens: usage.output_tokens, + cached_tokens: None, } } } @@ -1449,6 +1450,7 @@ pub(super) fn openai_responses_to_tensorzero_chunk( Usage { input_tokens, output_tokens, + cached_tokens: None, } }); let raw_usage = usage_value.map(|usage| { @@ -1545,6 +1547,7 @@ pub(super) fn openai_responses_to_tensorzero_chunk( Usage { input_tokens, output_tokens, + cached_tokens: None, } }); @@ -2502,6 +2505,7 @@ mod tests { Some(Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); @@ -2597,6 +2601,7 @@ mod tests { Some(Usage { input_tokens: Some(100), output_tokens: Some(200), + cached_tokens: None, }) ); assert_eq!(result.finish_reason, Some(FinishReason::Stop)); @@ -2698,6 +2703,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(100), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); diff --git a/tensorzero-core/src/providers/openrouter.rs b/tensorzero-core/src/providers/openrouter.rs index 8a772c4c7a..89565844d8 100644 --- a/tensorzero-core/src/providers/openrouter.rs +++ b/tensorzero-core/src/providers/openrouter.rs @@ -1567,6 +1567,7 @@ impl From for Usage { Usage { input_tokens: usage.prompt_tokens, output_tokens: usage.completion_tokens, + cached_tokens: None, } } } @@ -3143,6 +3144,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); diff --git a/tensorzero-core/src/providers/sglang.rs b/tensorzero-core/src/providers/sglang.rs index 9adbc10212..105e91e854 100644 --- a/tensorzero-core/src/providers/sglang.rs +++ b/tensorzero-core/src/providers/sglang.rs @@ -981,6 +981,7 @@ mod tests { usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { 
diff --git a/tensorzero-core/src/providers/tgi.rs b/tensorzero-core/src/providers/tgi.rs index 7c7a75e7d8..20a04a44a9 100644 --- a/tensorzero-core/src/providers/tgi.rs +++ b/tensorzero-core/src/providers/tgi.rs @@ -679,6 +679,7 @@ impl From for Usage { Usage { input_tokens: Some(usage.prompt_tokens), output_tokens: Some(usage.completion_tokens), + cached_tokens: None, } } } diff --git a/tensorzero-core/src/providers/together.rs b/tensorzero-core/src/providers/together.rs index 5eeee6b4dd..726d457e52 100644 --- a/tensorzero-core/src/providers/together.rs +++ b/tensorzero-core/src/providers/together.rs @@ -1035,6 +1035,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; let generic_request = ModelInferenceRequest { @@ -1105,6 +1106,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; let together_response_with_metadata = TogetherResponseWithMetadata { @@ -1155,6 +1157,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; let together_response_with_metadata = TogetherResponseWithMetadata { @@ -1211,6 +1214,7 @@ mod tests { usage: OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }, }; @@ -1599,6 +1603,7 @@ mod tests { let usage = OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }; let chunk = TogetherChatChunk { choices: vec![], @@ -1652,6 +1657,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), "expected usage to include provider raw_usage entries" ); diff --git a/tensorzero-core/src/providers/vllm.rs b/tensorzero-core/src/providers/vllm.rs index df5dd6597f..4be3d83843 100644 --- a/tensorzero-core/src/providers/vllm.rs +++ b/tensorzero-core/src/providers/vllm.rs @@ -744,6 +744,7 @@ mod tests 
{ usage: Some(OpenAIUsage { prompt_tokens: Some(10), completion_tokens: Some(20), + prompt_tokens_details: None, }), }; let generic_request = ModelInferenceRequest { diff --git a/tensorzero-core/src/providers/xai.rs b/tensorzero-core/src/providers/xai.rs index 925bbb6f35..59485a3b8c 100644 --- a/tensorzero-core/src/providers/xai.rs +++ b/tensorzero-core/src/providers/xai.rs @@ -71,6 +71,7 @@ impl From for Usage { Usage { input_tokens: usage.prompt_tokens, output_tokens, + cached_tokens: None, } } } diff --git a/tensorzero-core/src/relay.rs b/tensorzero-core/src/relay.rs index a8c0c32144..59d2d3f6de 100644 --- a/tensorzero-core/src/relay.rs +++ b/tensorzero-core/src/relay.rs @@ -196,6 +196,7 @@ impl TensorzeroRelay { (Some(total), Some(prompt)) => Some(total - prompt), _ => None, }, + cached_tokens: None, }) .unwrap_or_default(), model, diff --git a/tensorzero-core/src/variant/best_of_n_sampling.rs b/tensorzero-core/src/variant/best_of_n_sampling.rs index 62e4eb6f74..bfd2c88ccf 100644 --- a/tensorzero-core/src/variant/best_of_n_sampling.rs +++ b/tensorzero-core/src/variant/best_of_n_sampling.rs @@ -1078,6 +1078,7 @@ mod tests { usage: Usage { input_tokens: Some(50), output_tokens: Some(100), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1116,6 +1117,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1173,6 +1175,7 @@ mod tests { usage: Usage { input_tokens: Some(50), output_tokens: Some(100), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1214,6 +1217,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1287,6 +1291,7 @@ mod tests { usage: 
Usage { input_tokens: Some(50), output_tokens: Some(100), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1325,6 +1330,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1439,6 +1445,7 @@ mod tests { let expected_usage = Usage { input_tokens: Some(75), output_tokens: Some(126), + cached_tokens: None, }; let expected_content = vec!["Candidate answer 1".to_string().into()]; assert_eq!(selected.usage_considering_cached(), expected_usage); diff --git a/tensorzero-core/src/variant/chat_completion/mod.rs b/tensorzero-core/src/variant/chat_completion/mod.rs index 4b09acb873..323481d911 100644 --- a/tensorzero-core/src/variant/chat_completion/mod.rs +++ b/tensorzero-core/src/variant/chat_completion/mod.rs @@ -1725,6 +1725,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -1809,6 +1810,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -1906,6 +1908,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -2010,6 +2013,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -2142,6 +2146,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -2267,6 +2272,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match result { @@ -2597,6 +2603,7 @@ mod tests { Some(&Usage { input_tokens: Some(10), output_tokens: Some(16), + cached_tokens: None, }) ); break; diff --git a/tensorzero-core/src/variant/mixture_of_n.rs b/tensorzero-core/src/variant/mixture_of_n.rs index bd3f4e0039..fc28819103 100644 --- 
a/tensorzero-core/src/variant/mixture_of_n.rs +++ b/tensorzero-core/src/variant/mixture_of_n.rs @@ -1175,6 +1175,7 @@ mod tests { usage: Usage { input_tokens: Some(50), output_tokens: Some(100), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1210,6 +1211,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1264,6 +1266,7 @@ mod tests { usage: Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1302,6 +1305,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1381,6 +1385,7 @@ mod tests { usage: Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(500), @@ -1416,6 +1421,7 @@ mod tests { usage: Usage { input_tokens: Some(15), output_tokens: Some(25), + cached_tokens: None, }, latency: Latency::NonStreaming { response_time: std::time::Duration::from_millis(550), @@ -1532,6 +1538,7 @@ mod tests { let expected_usage = Usage { input_tokens: Some(35), output_tokens: Some(46), + cached_tokens: None, }; let expected_content = InternalJsonInferenceOutput { raw: Some("{\"answer\":\"Hello\"}".to_string()), @@ -1797,6 +1804,7 @@ mod tests { Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), None, // raw_usage_entries ) @@ -1847,6 +1855,7 @@ mod tests { usage: Some(Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }), raw_usage: None, raw_response: None, diff --git a/tensorzero-core/src/variant/mod.rs 
b/tensorzero-core/src/variant/mod.rs index 68e62f0344..03f02b184f 100644 --- a/tensorzero-core/src/variant/mod.rs +++ b/tensorzero-core/src/variant/mod.rs @@ -1349,6 +1349,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match inference_result { @@ -1461,6 +1462,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match inference_result { @@ -1679,6 +1681,7 @@ mod tests { Usage { input_tokens: Some(10), output_tokens: Some(1), + cached_tokens: None, } ); match inference_result { diff --git a/tensorzero-core/tests/e2e/cache.rs b/tensorzero-core/tests/e2e/cache.rs index 543d5a37f4..3febedab86 100644 --- a/tensorzero-core/tests/e2e/cache.rs +++ b/tensorzero-core/tests/e2e/cache.rs @@ -149,6 +149,7 @@ async fn test_cache_write_and_read() { Usage { input_tokens: Some(10), output_tokens: Some(16), + cached_tokens: None, } ); assert_eq!(*result.model_provider_name, *"test_provider"); @@ -167,6 +168,7 @@ async fn test_cache_write_and_read() { Usage { input_tokens: Some(10), output_tokens: Some(16), + cached_tokens: None, } ); assert_eq!( @@ -245,6 +247,7 @@ async fn test_cache_stream_write_and_read() { usage: Some(Usage { input_tokens: Some(20), output_tokens: Some(40), + cached_tokens: None, }), raw_usage: None, raw_response: "raw response".to_string(), @@ -259,6 +262,7 @@ async fn test_cache_stream_write_and_read() { usage: Some(Usage { input_tokens: Some(100), output_tokens: Some(200), + cached_tokens: None, }), raw_usage: None, raw_response: "raw response 2".to_string(), @@ -276,6 +280,7 @@ async fn test_cache_stream_write_and_read() { &Usage { input_tokens: Some(1), output_tokens: Some(2), + cached_tokens: None, }, None, ) @@ -310,6 +315,7 @@ async fn test_cache_stream_write_and_read() { &Some(Usage { input_tokens: Some(20), output_tokens: Some(40), + cached_tokens: None, }) ); } else { @@ -318,6 +324,7 @@ async fn test_cache_stream_write_and_read() { &Some(Usage { 
input_tokens: Some(100), output_tokens: Some(200), + cached_tokens: None, }) ); }; diff --git a/tensorzero-core/tests/e2e/db/batch_inference_endpoint_internals.rs b/tensorzero-core/tests/e2e/db/batch_inference_endpoint_internals.rs index bd5fcac76b..2a6f3d4274 100644 --- a/tensorzero-core/tests/e2e/db/batch_inference_endpoint_internals.rs +++ b/tensorzero-core/tests/e2e/db/batch_inference_endpoint_internals.rs @@ -489,6 +489,7 @@ async fn test_write_read_completed_batch_inference_chat(clickhouse: ClickHouseCo usage: Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -500,6 +501,7 @@ async fn test_write_read_completed_batch_inference_chat(clickhouse: ClickHouseCo usage: Usage { input_tokens: Some(20), output_tokens: Some(30), + cached_tokens: None, }, finish_reason: Some(FinishReason::ToolCall), }; @@ -767,6 +769,7 @@ async fn test_write_read_completed_batch_inference_json(clickhouse: ClickHouseCo usage: Usage { input_tokens: Some(10), output_tokens: Some(20), + cached_tokens: None, }, finish_reason: Some(FinishReason::Stop), }; @@ -778,6 +781,7 @@ async fn test_write_read_completed_batch_inference_json(clickhouse: ClickHouseCo usage: Usage { input_tokens: Some(20), output_tokens: Some(30), + cached_tokens: None, }, finish_reason: Some(FinishReason::ToolCall), }; diff --git a/tensorzero-core/tests/e2e/mixture_of_n.rs b/tensorzero-core/tests/e2e/mixture_of_n.rs index 4fca623581..20c6fb31aa 100644 --- a/tensorzero-core/tests/e2e/mixture_of_n.rs +++ b/tensorzero-core/tests/e2e/mixture_of_n.rs @@ -87,6 +87,7 @@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( Usage { input_tokens: Some(input_tokens), output_tokens: Some(output_tokens), + cached_tokens: None, }, ) } else { @@ -106,6 +107,7 @@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( Usage { input_tokens: Some(input_tokens), output_tokens: Some(output_tokens), + cached_tokens: None, }, ) }; @@ -153,6 +155,7 
@@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( let mut usage_sum = Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, }; for result in results { @@ -209,6 +212,7 @@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( Usage { input_tokens: Some(40), output_tokens: Some(8), + cached_tokens: None, } ); } else { @@ -218,6 +222,7 @@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( Usage { input_tokens: Some(40), output_tokens: Some(4), + cached_tokens: None, } ); } @@ -230,6 +235,7 @@ async fn test_mixture_of_n_dummy_candidates_dummy_judge_inner( Usage { input_tokens: Some(0), output_tokens: Some(0), + cached_tokens: None, } ); } else {