From f24d0b5927eac2383f93c51c4435b87d874bf0b3 Mon Sep 17 00:00:00 2001 From: cpinn <4450689+cpinn@users.noreply.github.com> Date: Tue, 5 May 2026 08:54:34 +0000 Subject: [PATCH 1/4] fix: add Together models Qwen/Qwen3.6-Plus +2 more --- packages/proxy/schema/index.ts | 3 +++ packages/proxy/schema/model_list.json | 36 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/packages/proxy/schema/index.ts b/packages/proxy/schema/index.ts index 95d2d2cd..750dbaf5 100644 --- a/packages/proxy/schema/index.ts +++ b/packages/proxy/schema/index.ts @@ -607,6 +607,9 @@ export const AvailableEndpointTypes: { [name: string]: ModelEndpointType[] } = { "grok-2-1212": ["xAI"], "grok-vision-beta": ["xAI"], "grok-beta": ["xAI"], + "Qwen/Qwen3.6-Plus": ["together"], + "zai-org/GLM-5.1": ["together"], + "MiniMaxAI/MiniMax-M2.7": ["together"], "gemini-3.1-flash-image-preview": ["google", "vertex"], "gemini-2.5-flash-image": ["google", "vertex"], "mistral-medium-2508": ["mistral"], diff --git a/packages/proxy/schema/model_list.json b/packages/proxy/schema/model_list.json index 2f9e6ad2..7a9697e6 100644 --- a/packages/proxy/schema/model_list.json +++ b/packages/proxy/schema/model_list.json @@ -4665,6 +4665,17 @@ "together" ] }, + "Qwen/Qwen3.6-Plus": { + "format": "openai", + "flavor": "chat", + "input_cost_per_mil_tokens": 0.5, + "output_cost_per_mil_tokens": 3, + "displayName": "Qwen3.6 Plus", + "max_input_tokens": 1000000, + "available_providers": [ + "together" + ] + }, "magistral-medium-latest": { "format": "openai", "flavor": "chat", @@ -9578,6 +9589,19 @@ "baseten" ] }, + "zai-org/GLM-5.1": { + "format": "openai", + "flavor": "chat", + "input_cost_per_mil_tokens": 1.4, + "output_cost_per_mil_tokens": 4.4, + "displayName": "GLM 5.1", + "reasoning": true, + "max_input_tokens": 202752, + "max_output_tokens": 128000, + "available_providers": [ + "together" + ] + }, "accounts/fireworks/models/glm-4p5": { "format": "openai", "flavor": "chat", @@ -9747,6 +9771,18 @@ "baseten" ] }, + "MiniMaxAI/MiniMax-M2.7": { + "format": "openai", + "flavor": "chat", + "input_cost_per_mil_tokens": 0.3, + "output_cost_per_mil_tokens": 1.2, + "input_cache_read_cost_per_mil_tokens": 0.06, + "displayName": "MiniMax M2.7", + "max_input_tokens": 202752, + "available_providers": [ + "together" + ] + }, "accounts/fireworks/models/minimax-m2p1": { "format": "openai", "flavor": "chat", From 1bf0e1aece1865764099e5fff3b0e8d7dbde8207 Mon Sep 17 00:00:00 2001 From: Caitlin Pinn Date: Tue, 5 May 2026 14:39:41 -0700 Subject: [PATCH 2/4] fix streaming only --- packages/proxy/schema/model_list.json | 2 + packages/proxy/schema/models.ts | 10 +++ packages/proxy/src/providers/openai.test.ts | 70 +++++++++++++++++++++ packages/proxy/src/proxy.ts | 10 +++ 4 files changed, 92 insertions(+) diff --git a/packages/proxy/schema/model_list.json b/packages/proxy/schema/model_list.json index 7a9697e6..fdabe091 100644 --- a/packages/proxy/schema/model_list.json +++ b/packages/proxy/schema/model_list.json @@ -4672,6 +4672,8 @@ "output_cost_per_mil_tokens": 3, "displayName": "Qwen3.6 Plus", "max_input_tokens": 1000000, + "supports_streaming": true, + "streaming_only": true, "available_providers": [ "together" ] diff --git a/packages/proxy/schema/models.ts b/packages/proxy/schema/models.ts index 8da75b1f..7aac8d33 100644 --- a/packages/proxy/schema/models.ts +++ b/packages/proxy/schema/models.ts @@ -96,6 +96,16 @@ export const ModelSchema = z.object({ .number() .nullish() .describe("The model supports a maximum output token limit."), + supports_streaming: z + .boolean() + .nullish() + .describe("The model supports native streaming responses."), + streaming_only: z + .boolean() + .nullish() + .describe( + "The upstream provider requires requests for this model to be sent with streaming enabled.", + ), available_providers: z.array(z.enum(ModelEndpointType)).nullish(), }); diff --git a/packages/proxy/src/providers/openai.test.ts b/packages/proxy/src/providers/openai.test.ts index 2a9ed639..86c2fa00 100644 --- a/packages/proxy/src/providers/openai.test.ts +++ b/packages/proxy/src/providers/openai.test.ts @@ -310,6 +310,76 @@ it("falls back to provider base URL when metadata.api_base is not a string", asy expect(requests[0].url).toBe("https://api.openai.com/v1/chat/completions"); }); +it("forces streaming-only OpenAI-compatible models onto the stream path", async () => { + const encoder = new TextEncoder(); + const requests: Array<{ body: JsonBodyType | null }> = []; + const fetch = async (_input: RequestInfo | URL, init?: RequestInit) => { + requests.push({ + body: init?.body ? JSON.parse(init.body as string) : null, + }); + + return new Response( + new ReadableStream({ + start(controller) { + controller.enqueue( + encoder.encode( + 'data: {"id":"chatcmpl-stream-only","object":"chat.completion.chunk","created":123,"model":"stream-only-model","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]}\n\n', + ), + ); + controller.enqueue( + encoder.encode( + 'data: {"id":"chatcmpl-stream-only","object":"chat.completion.chunk","created":123,"model":"stream-only-model","choices":[{"index":0,"delta":{"content":" world"},"finish_reason":"stop"}],"usage":{"prompt_tokens":3,"completion_tokens":2,"total_tokens":5}}\n\n', + ), + ); + controller.enqueue(encoder.encode("data: [DONE]\n\n")); + controller.close(); + }, + }), + { + headers: { + "content-type": "text/event-stream; charset=utf-8", + }, + }, + ); + }; + + const { events, headers } = await callProxyV1< + OpenAIChatCompletionCreateParams, + OpenAIChatCompletionChunk + >({ + body: { + model: "Qwen/Qwen3.6-Plus", + messages: [{ role: "user", content: "hello" }], + stream: false, + }, + fetch, + getApiSecrets: async () => [ + { + type: "openai", + name: "together", + secret: "test-secret", + metadata: { + api_base: "https://api.together.xyz/v1", + }, + }, + ], + }); + + expect(requests).toHaveLength(1); + expect(requests[0].body).toMatchObject({ + model: "Qwen/Qwen3.6-Plus", + stream: true, + }); + expect(headers["content-type"]).toBe("text/event-stream; charset=utf-8"); + expect(headers["cache-control"]).toContain("no-transform"); + + const streamedEvents = events(); + expect(streamedEvents).toHaveLength(2); + expect(streamedEvents[0].data.choices[0]?.delta?.content).toBe("Hello"); + expect(streamedEvents[1].data.choices[0]?.delta?.content).toBe(" world"); + expect(streamedEvents[1].data.choices[0]?.finish_reason).toBe("stop"); +}); + it("uses custom api base when routing appropriate models through responses", async () => { const { fetch, requests } = createCapturingFetch({ captureOnly: true }); diff --git a/packages/proxy/src/proxy.ts b/packages/proxy/src/proxy.ts index cd414cff..00dc6d70 100644 --- a/packages/proxy/src/proxy.ts +++ b/packages/proxy/src/proxy.ts @@ -389,6 +389,16 @@ export async function proxyV1({ } } + if ( + model && + isObject(bodyData) && + getAvailableModels()[model]?.streaming_only === true && + bodyData.stream !== true + ) { + bodyData.stream = true; + body = JSON.stringify(bodyData); + } + // Create attributes object that includes model for all metrics // Use undefined instead of null since OpenTelemetry doesn't accept null const baseAttributes: Record = { From f5a1a72a7c3d60126f86c249d846982443b55f12 Mon Sep 17 00:00:00 2001 From: Caitlin Pinn Date: Tue, 5 May 2026 15:36:14 -0700 Subject: [PATCH 3/4] run as streaming only --- .../proxy/scripts/verify_proxy_models.test.ts | 24 +++++++ packages/proxy/scripts/verify_proxy_models.ts | 12 +++- packages/proxy/src/providers/openai.test.ts | 70 ------------------- packages/proxy/src/proxy.ts | 10 --- 4 files changed, 34 insertions(+), 82 deletions(-) diff --git a/packages/proxy/scripts/verify_proxy_models.test.ts b/packages/proxy/scripts/verify_proxy_models.test.ts index c91d265c..af4d5d40 100644 --- a/packages/proxy/scripts/verify_proxy_models.test.ts +++ b/packages/proxy/scripts/verify_proxy_models.test.ts @@ -23,6 +23,30 @@ describe("buildVerificationRequest", () => { endpoint: "chat/completions", }); }); + + it("forces stream mode for streaming-only models", () => { + expect( + buildVerificationRequest("Qwen/Qwen3.6-Plus", { + "Qwen/Qwen3.6-Plus": { + available_providers: ["together"], + format: "openai", + streaming_only: true, + }, + }), + ).toEqual({ + body: { + messages: [ + { + content: "ok", + role: "user", + }, + ], + model: "Qwen/Qwen3.6-Plus", + stream: true, + }, + endpoint: "chat/completions", + }); + }); }); describe("extractErrorMessage", () => { diff --git a/packages/proxy/scripts/verify_proxy_models.ts b/packages/proxy/scripts/verify_proxy_models.ts index 0eb28449..0de0d33a 100644 --- a/packages/proxy/scripts/verify_proxy_models.ts +++ b/packages/proxy/scripts/verify_proxy_models.ts @@ -37,6 +37,7 @@ type VerificationModelSpec = { available_providers?: ModelEndpointType[]; endpoint_types?: ModelEndpointType[]; format?: ModelFormat; + streaming_only?: boolean; }; type ModelCatalog = Record; @@ -187,7 +188,11 @@ export function resolveVercelProtectionBypassSecret( return secret; } -export function buildVerificationRequest(model: string): VerificationRequest { +export function buildVerificationRequest( + model: string, + modelCatalog: ModelCatalog = readModelCatalog(), +): VerificationRequest { + const modelSpec = modelCatalog[model]; return { endpoint: "chat/completions", body: { @@ -198,6 +203,7 @@ export function buildVerificationRequest(model: string): VerificationRequest { }, ], model, + ...(modelSpec?.streaming_only ? { stream: true } : {}), }, }; } @@ -227,11 +233,12 @@ export function extractErrorMessage(responseBody: string): string { async function verifyModel(args: { apiKey: string; model: string; + modelCatalog: ModelCatalog; proxyBaseUrl: string; timeoutMs: number; vercelProtectionBypassSecret: string; }): Promise { - const request = buildVerificationRequest(args.model); + const request = buildVerificationRequest(args.model, args.modelCatalog); const url = new URL(request.endpoint, withTrailingSlash(args.proxyBaseUrl)); const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), args.timeoutMs); @@ -353,6 +360,7 @@ async function main(): Promise { result = await verifyModel({ apiKey, model, + modelCatalog, proxyBaseUrl: argv["proxy-base-url"], timeoutMs: argv["timeout-ms"], vercelProtectionBypassSecret, diff --git a/packages/proxy/src/providers/openai.test.ts b/packages/proxy/src/providers/openai.test.ts index 86c2fa00..2a9ed639 100644 --- a/packages/proxy/src/providers/openai.test.ts +++ b/packages/proxy/src/providers/openai.test.ts @@ -310,76 +310,6 @@ it("falls back to provider base URL when metadata.api_base is not a string", asy expect(requests[0].url).toBe("https://api.openai.com/v1/chat/completions"); }); -it("forces streaming-only OpenAI-compatible models onto the stream path", async () => { - const encoder = new TextEncoder(); - const requests: Array<{ body: JsonBodyType | null }> = []; - const fetch = async (_input: RequestInfo | URL, init?: RequestInit) => { - requests.push({ - body: init?.body ? JSON.parse(init.body as string) : null, - }); - - return new Response( - new ReadableStream({ - start(controller) { - controller.enqueue( - encoder.encode( - 'data: {"id":"chatcmpl-stream-only","object":"chat.completion.chunk","created":123,"model":"stream-only-model","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]}\n\n', - ), - ); - controller.enqueue( - encoder.encode( - 'data: {"id":"chatcmpl-stream-only","object":"chat.completion.chunk","created":123,"model":"stream-only-model","choices":[{"index":0,"delta":{"content":" world"},"finish_reason":"stop"}],"usage":{"prompt_tokens":3,"completion_tokens":2,"total_tokens":5}}\n\n', - ), - ); - controller.enqueue(encoder.encode("data: [DONE]\n\n")); - controller.close(); - }, - }), - { - headers: { - "content-type": "text/event-stream; charset=utf-8", - }, - }, - ); - }; - - const { events, headers } = await callProxyV1< - OpenAIChatCompletionCreateParams, - OpenAIChatCompletionChunk - >({ - body: { - model: "Qwen/Qwen3.6-Plus", - messages: [{ role: "user", content: "hello" }], - stream: false, - }, - fetch, - getApiSecrets: async () => [ - { - type: "openai", - name: "together", - secret: "test-secret", - metadata: { - api_base: "https://api.together.xyz/v1", - }, - }, - ], - }); - - expect(requests).toHaveLength(1); - expect(requests[0].body).toMatchObject({ - model: "Qwen/Qwen3.6-Plus", - stream: true, - }); - expect(headers["content-type"]).toBe("text/event-stream; charset=utf-8"); - expect(headers["cache-control"]).toContain("no-transform"); - - const streamedEvents = events(); - expect(streamedEvents).toHaveLength(2); - expect(streamedEvents[0].data.choices[0]?.delta?.content).toBe("Hello"); - expect(streamedEvents[1].data.choices[0]?.delta?.content).toBe(" world"); - expect(streamedEvents[1].data.choices[0]?.finish_reason).toBe("stop"); -}); - it("uses custom api base when routing appropriate models through responses", async () => { const { fetch, requests } = createCapturingFetch({ captureOnly: true }); diff --git a/packages/proxy/src/proxy.ts b/packages/proxy/src/proxy.ts index 00dc6d70..cd414cff 100644 --- a/packages/proxy/src/proxy.ts +++ b/packages/proxy/src/proxy.ts @@ -389,16 +389,6 @@ export async function proxyV1({ } } - if ( - model && - isObject(bodyData) && - getAvailableModels()[model]?.streaming_only === true && - bodyData.stream !== true - ) { - bodyData.stream = true; - body = JSON.stringify(bodyData); - } - // Create attributes object that includes model for all metrics // Use undefined instead of null since OpenTelemetry doesn't accept null const baseAttributes: Record = { From 774566b04cded884b084c680c7b5a42d95bb08d9 Mon Sep 17 00:00:00 2001 From: Caitlin Pinn Date: Tue, 5 May 2026 18:56:26 -0700 Subject: [PATCH 4/4] fix verification --- .github/workflows/verify-deployed-models.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/verify-deployed-models.yaml b/.github/workflows/verify-deployed-models.yaml index 60d83bde..ed13af65 100644 --- a/.github/workflows/verify-deployed-models.yaml +++ b/.github/workflows/verify-deployed-models.yaml @@ -120,6 +120,7 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + ref: ${{ inputs.head_sha != '' && inputs.head_sha || inputs.head_ref }} - name: Set up Node.js uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0