From 0c547bc79c6e5d8b48df7fb16cf2fec5f337b143 Mon Sep 17 00:00:00 2001 From: tsushanth <78000697+tsushanth@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:26:29 -0700 Subject: [PATCH] fix(openai): drop ?model= on native /realtime STT URL to avoid invalid_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAI's native wss://api.openai.com/.../realtime endpoint now treats a ?model= query param on the WebSocket upgrade URL as selecting a conversation session, and rejects the subsequent transcription-mode session.update with error.invalid_request_error.invalid_model (close 4000). Every transcription model (gpt-4o-mini-transcribe, gpt-4o-transcribe, whisper-1) currently fails this way against the native endpoint, so realtime STT through this plugin produces zero transcripts. The ?model= upgrade-URL convention exists for OpenAI-compatible proxies (LiteLLM, Cloudflare AI Gateway, etc.) that route at the HTTP upgrade without parsing the first JSON frame — see #1467. Drop the param only when the host is api.openai.com; non-OpenAI hosts still receive the model on the URL so proxy routing keeps working. The model is conveyed to OpenAI via session.update → audio.input.transcription.model instead. Updates the corresponding URL builder test and adds a second test for the explicit-OpenAI-baseURL path. 
Closes #1756 --- ...penai-stt-native-realtime-invalid-model.md | 18 ++++++++++++++++++ plugins/openai/src/stt.test.ts | 16 ++++++++++++++-- plugins/openai/src/stt.ts | 19 ++++++++++++------- 3 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 .changeset/fix-openai-stt-native-realtime-invalid-model.md diff --git a/.changeset/fix-openai-stt-native-realtime-invalid-model.md b/.changeset/fix-openai-stt-native-realtime-invalid-model.md new file mode 100644 index 000000000..4b4d8e714 --- /dev/null +++ b/.changeset/fix-openai-stt-native-realtime-invalid-model.md @@ -0,0 +1,18 @@ +--- +'@livekit/agents-plugin-openai': patch +--- + +Fix `openai` realtime STT (transcription session) failing on every model +with `invalid_request_error.invalid_model` when connecting directly to +`wss://api.openai.com/.../realtime`. + +OpenAI's native endpoint now treats a `?model=` query param on the +WebSocket upgrade URL as selecting a conversation session, so the +subsequent transcription-mode `session.update` is rejected — surfacing +as `invalid_model` and a `4000` close. Drop the `?model=` parameter +when the host is `api.openai.com` (the model is conveyed via +`session.update → audio.input.transcription.model` instead). + +OpenAI-compatible proxies (LiteLLM, Cloudflare AI Gateway, etc.) still +receive the model on the upgrade URL so they can route by model before +the first frame, preserving the original intent of #1467. 
diff --git a/plugins/openai/src/stt.test.ts b/plugins/openai/src/stt.test.ts index 107ca4b5d..46edf619e 100644 --- a/plugins/openai/src/stt.test.ts +++ b/plugins/openai/src/stt.test.ts @@ -74,14 +74,26 @@ describe('OpenAI STT options', () => { }); describe('buildRealtimeSttUrl', () => { - it('points at OpenAI realtime with intent and model when no baseURL is set', () => { + it('points at OpenAI realtime with intent but omits model on the native endpoint', () => { + // With `?model=` on the upgrade URL, OpenAI's native /realtime endpoint rejects + // the transcription session.update with invalid_request_error.invalid_model, + // so the model is conveyed via that session.update instead. const url = new URL(buildRealtimeSttUrl(undefined, 'gpt-realtime-whisper')); expect(url.protocol).toBe('wss:'); expect(url.host).toBe('api.openai.com'); expect(url.pathname).toBe('/v1/realtime'); expect(url.searchParams.get('intent')).toBe('transcription'); - expect(url.searchParams.get('model')).toBe('gpt-realtime-whisper'); + expect(url.searchParams.get('model')).toBe(null); + }); + + it('omits the model when an explicit baseURL still points at api.openai.com', () => { + const url = new URL(buildRealtimeSttUrl('https://api.openai.com/v1', 'gpt-4o-mini-transcribe')); + + expect(url.host).toBe('api.openai.com'); + expect(url.pathname).toBe('/v1/realtime'); + expect(url.searchParams.get('intent')).toBe('transcription'); + expect(url.searchParams.get('model')).toBe(null); }); it('upgrades https baseURL to wss and appends /realtime when path is /v1', () => { diff --git a/plugins/openai/src/stt.ts b/plugins/openai/src/stt.ts index 9b2ee417c..1da789516 100644 --- a/plugins/openai/src/stt.ts +++ b/plugins/openai/src/stt.ts @@ -28,12 +28,15 @@ const DEFAULT_REALTIME_MODEL = 'gpt-realtime-whisper'; /** * Build the realtime transcription WebSocket URL. 
* - * Includes the model on the upgrade URL so OpenAI-compatible gateways - * (which can only see the URL at the WebSocket upgrade, not the subsequent - * `session.update` frame) can route by model. Mirrors the existing - * convention in `realtime/realtime_model.ts` for the conversational - * Realtime API. OpenAI's native endpoint accepts and ignores the - * parameter, so this is a no-op for direct connections. + * For OpenAI-compatible gateways (LiteLLM, Cloudflare AI Gateway, etc.) the + * model is included on the upgrade URL so the gateway can route by model + * before the subsequent `session.update` frame arrives. OpenAI's own + * `wss://api.openai.com/.../realtime` endpoint, on the other hand, treats a + * `?model=` query param as selecting a conversation session and rejects the + * subsequent transcription-mode `session.update` with + * `error.invalid_request_error.invalid_model`, so the model is intentionally + * omitted for native OpenAI connections — the model is conveyed via + * `session.update → audio.input.transcription.model` instead. * * The scheme of `baseURL` is respected: `http://` maps to `ws://` * and `https://` maps to `wss://`. @@ -56,7 +59,9 @@ export function buildRealtimeSttUrl(baseURL: string | undefined, model: string): } url.searchParams.set('intent', 'transcription'); - url.searchParams.set('model', model); + if (url.hostname !== 'api.openai.com') { + url.searchParams.set('model', model); + } return url.toString(); }