diff --git a/tensorzero-core/src/providers/novita.rs b/tensorzero-core/src/providers/novita.rs index cc4b7650ff..b072294148 100644 --- a/tensorzero-core/src/providers/novita.rs +++ b/tensorzero-core/src/providers/novita.rs @@ -76,6 +76,36 @@ pub enum NovitaRequestShape { /// prompt, negative_prompt, model_name (std/pro tier), /// keep_original_sound, character_orientation. KlingV3MotionControl, + /// Wan 2.7 text-to-video. Per `/v3/async/wan2.7-t2v`: prompt + /// (auto), duration (int 2–15), size (e.g. "1920*1080"), seed, + /// audio_url (optional), negative_prompt (≤500), watermark, + /// prompt_extend. + #[serde(rename = "wan_2_7_text_to_video")] + Wan27TextToVideo, + /// Wan 2.7 image-to-video. Per `/v3/async/wan2.7-i2v`: prompt + /// (auto, optional), image_url (remapped from image_urls[0]), + /// duration (int 2–15), resolution (720P|1080P), seed, + /// negative_prompt, watermark, prompt_extend, driving_audio_url, + /// last_frame_url. `first_clip_url` (video continuation) is not + /// surfaced — would need a separate variant if exposed. + #[serde(rename = "wan_2_7_image_to_video")] + Wan27ImageToVideo, + /// Wan 2.7 reference-to-video. Per `/v3/async/wan2.7-r2v`: prompt + /// (auto), media (array of 1–5 image/video reference URLs), + /// duration (int 2–10), size, seed, audio (bool, default true), + /// shot_type (single|multi), watermark, negative_prompt. + /// Media array is built from `image_urls` + `video_urls` below. + #[serde(rename = "wan_2_7_reference_to_video")] + Wan27ReferenceToVideo, + /// Wan 2.7 video editing. Per `/v3/async/wan2.7-videoedit`: + /// video_url (required, remapped from video_urls[0]), prompt + /// (auto, optional), duration (int 0–10, 0 preserves input), + /// ratio (16:9|9:16|1:1|4:3|3:4), resolution (720P|1080P), + /// audio_setting (auto|origin), seed, watermark, prompt_extend, + /// negative_prompt, reference_image_url(_2,_3) (up to 3, + /// remapped from image_urls[0..3]). 
+ #[serde(rename = "wan_2_7_video_edit")] + Wan27VideoEdit, } impl NovitaProvider { @@ -322,6 +352,66 @@ fn build_body(shape: &NovitaRequestShape, input: &Value) -> Result "keep_original_sound", "character_orientation", ], + // Wan 2.7 text-to-video. Per `/v3/async/wan2.7-t2v`: prompt + // (auto), duration (int 2–15), size, seed, audio_url, + // negative_prompt (≤500), watermark, prompt_extend. + NovitaRequestShape::Wan27TextToVideo => &[ + "duration", + "size", + "seed", + "audio_url", + "negative_prompt", + "watermark", + "prompt_extend", + ], + // Wan 2.7 image-to-video. Per `/v3/async/wan2.7-i2v`: prompt + // (auto, optional), image_url (remapped from + // `image_urls[0]`), duration, resolution (720P|1080P), seed, + // negative_prompt, watermark, prompt_extend, + // driving_audio_url, last_frame_url. `first_clip_url` (video + // continuation) is not surfaced; if exposed, give it its own + // variant since it's mutually exclusive with image_url. + NovitaRequestShape::Wan27ImageToVideo => &[ + "duration", + "resolution", + "seed", + "negative_prompt", + "watermark", + "prompt_extend", + "driving_audio_url", + "last_frame_url", + ], + // Wan 2.7 reference-to-video. Per `/v3/async/wan2.7-r2v`: + // prompt (auto), media (array of refs, built from + // image_urls + video_urls below), duration (int 2–10), size, + // seed, audio (bool), shot_type (single|multi), watermark, + // negative_prompt. + NovitaRequestShape::Wan27ReferenceToVideo => &[ + "duration", + "size", + "seed", + "audio", + "shot_type", + "watermark", + "negative_prompt", + ], + // Wan 2.7 video editing. Per `/v3/async/wan2.7-videoedit`: + // video_url (remapped from `video_urls[0]`), prompt (auto, + // optional), duration (int 0–10, 0 = preserve input + // length), ratio (5 enum), resolution (720P|1080P), + // audio_setting (auto|origin), seed, watermark, + // prompt_extend, negative_prompt, reference_image_url(_2,_3) + // (up to 3, remapped from image_urls[0..3] below). 
+ NovitaRequestShape::Wan27VideoEdit => &[ + "duration", + "ratio", + "resolution", + "audio_setting", + "seed", + "watermark", + "prompt_extend", + "negative_prompt", + ], }; if let Some(input_obj) = input.as_object() { @@ -497,6 +587,98 @@ fn build_body(shape: &NovitaRequestShape, input: &Value) -> Result } } + // Wan 2.7 I2V: body wants `image_url` (single string). Playground + // sends `image_urls` array for parity with Veo/Sora/Kling i2v. + if matches!(shape, NovitaRequestShape::Wan27ImageToVideo) && !body.contains_key("image_url") { + if let Some(value) = input.get("image_url").and_then(Value::as_str) { + body.insert("image_url".into(), Value::from(value)); + } else if let Some(first) = input + .get("image_urls") + .and_then(Value::as_array) + .and_then(|arr| arr.first()) + .and_then(Value::as_str) + { + body.insert("image_url".into(), Value::from(first)); + } + } + + // Wan 2.7 Video Editing: body wants `video_url` (single string, + // required). Playground sends `video_urls` array. + if matches!(shape, NovitaRequestShape::Wan27VideoEdit) && !body.contains_key("video_url") { + if let Some(value) = input.get("video_url").and_then(Value::as_str) { + body.insert("video_url".into(), Value::from(value)); + } else if let Some(first) = input + .get("video_urls") + .and_then(Value::as_array) + .and_then(|arr| arr.first()) + .and_then(Value::as_str) + { + body.insert("video_url".into(), Value::from(first)); + } + } + + // Wan 2.7 Video Editing: up to 3 reference images, body wants + // `reference_image_url`, `reference_image_url_2`, + // `reference_image_url_3` (each a single string). Playground + // ships `image_urls` as a flat array — split into the three + // Wan-specific fields. Only forwarded when the caller didn't + // already set them explicitly. 
+    if matches!(shape, NovitaRequestShape::Wan27VideoEdit) {
+        // Positional fallback source; `None` when `image_urls` is absent or
+        // is not an array.
+        let imgs = input.get("image_urls").and_then(Value::as_array);
+        let slot_names = [
+            "reference_image_url",
+            "reference_image_url_2",
+            "reference_image_url_3",
+        ];
+        for (idx, slot) in slot_names.iter().enumerate() {
+            // Never overwrite a slot that an earlier step already filled.
+            if body.contains_key(*slot) {
+                continue;
+            }
+            // The slot names are not in the Wan27VideoEdit `allowed` list,
+            // so a value the caller supplied under the Wan field name must
+            // be read from `input` directly (same precedence as the
+            // `image_url` / `video_url` remaps). Otherwise fall back to the
+            // entry at the same position in `image_urls`.
+            let explicit = input.get(*slot).and_then(Value::as_str);
+            let positional = imgs.and_then(|arr| arr.get(idx)).and_then(Value::as_str);
+            if let Some(url) = explicit.or(positional) {
+                body.insert((*slot).to_string(), Value::from(url));
+            }
+        }
+    }
+
+    // Wan 2.7 R2V: body wants `media` — an array of objects with a
+    // `type` ("image"|"video") + `url`. Playground sends
+    // `image_urls` and `video_urls` as separate arrays. Build the
+    // unified list; total capped at 5 by Novita (images ≤5,
+    // videos ≤3 — we don't enforce here, let upstream return its
+    // own error since `parameter_schema` already gates the inputs).
+    // Images are emitted first, then videos; non-string entries are
+    // skipped, and `media` is omitted entirely when nothing was collected.
+    // NOTE(review): `media` is not in the R2V `allowed` list, so a
+    // caller-supplied `media` array is presumably never copied into `body`
+    // and the guard below is always true — confirm whether explicit `media`
+    // should be forwarded.
+    if matches!(shape, NovitaRequestShape::Wan27ReferenceToVideo) && !body.contains_key("media") {
+        let mut media: Vec<Value> = Vec::new();
+        if let Some(imgs) = input.get("image_urls").and_then(Value::as_array) {
+            for u in imgs.iter().filter_map(Value::as_str) {
+                media.push(json!({ "type": "image", "url": u }));
+            }
+        }
+        if let Some(vids) = input.get("video_urls").and_then(Value::as_array) {
+            for u in vids.iter().filter_map(Value::as_str) {
+                media.push(json!({ "type": "video", "url": u }));
+            }
+        }
+        if !media.is_empty() {
+            body.insert("media".into(), Value::Array(media));
+        }
+    }
+
+    // Wan 2.7 T2V/I2V/R2V/Video Edit all accept `prompt` as a body
+    // field. The shape-specific arms above explicitly *omit* "prompt"
+    // from `allowed` because it's already inserted at the top of
+    // `build_body`. This block is a no-op for them, but
+    // kept as a safety net mirroring the Kling pattern in case the
+    // upstream caller paths change.
+    // Copy a string `prompt` from the input only for the Wan 2.7 shapes,
+    // and only when the body does not already carry one.
+    let is_wan_2_7_shape = matches!(
+        shape,
+        NovitaRequestShape::Wan27TextToVideo
+            | NovitaRequestShape::Wan27ImageToVideo
+            | NovitaRequestShape::Wan27ReferenceToVideo
+            | NovitaRequestShape::Wan27VideoEdit
+    );
+    if is_wan_2_7_shape && !body.contains_key("prompt") {
+        if let Some(prompt) = input.get("prompt").and_then(Value::as_str) {
+            body.insert("prompt".into(), Value::from(prompt));
+        }
+    }
+
     if matches!(
         shape,
         NovitaRequestShape::GeminiImageTextToImage | NovitaRequestShape::GeminiImageEdit