diff --git a/.changeset/fix-openai-stt-native-realtime-invalid-model.md b/.changeset/fix-openai-stt-native-realtime-invalid-model.md new file mode 100644 index 000000000..4b4d8e714 --- /dev/null +++ b/.changeset/fix-openai-stt-native-realtime-invalid-model.md @@ -0,0 +1,18 @@ +--- +'@livekit/agents-plugin-openai': patch +--- + +Fix `openai` realtime STT (transcription session) failing on every model +with `invalid_request_error.invalid_model` when connecting directly to +`wss://api.openai.com/.../realtime`. + +OpenAI's native endpoint now treats a `?model=` query param on the +WebSocket upgrade URL as selecting a conversation session, so the +subsequent transcription-mode `session.update` is rejected — surfacing +as `invalid_model` and a `4000` close. Drop the `?model=` parameter +when the host is `api.openai.com` (the model is conveyed via +`session.update → audio.input.transcription.model` instead). + +OpenAI-compatible proxies (LiteLLM, Cloudflare AI Gateway, etc.) still +receive the model on the upgrade URL so they can route by model before +the first frame, preserving the original intent of #1467. diff --git a/plugins/openai/src/stt.test.ts b/plugins/openai/src/stt.test.ts index 107ca4b5d..46edf619e 100644 --- a/plugins/openai/src/stt.test.ts +++ b/plugins/openai/src/stt.test.ts @@ -74,14 +74,26 @@ describe('OpenAI STT options', () => { }); describe('buildRealtimeSttUrl', () => { - it('points at OpenAI realtime with intent and model when no baseURL is set', () => { + it('points at OpenAI realtime with intent but omits model on the native endpoint', () => { + // OpenAI's native /realtime endpoint rejects `?model=` with + // invalid_request_error.invalid_model when intent=transcription, so the + // model is conveyed via the subsequent session.update instead. const url = new URL(buildRealtimeSttUrl(undefined, 'gpt-realtime-whisper')); expect(url.protocol).toBe('wss:'); expect(url.host).toBe('api.openai.com'); expect(url.pathname).toBe('/v1/realtime'); expect(url.searchParams.get('intent')).toBe('transcription'); - expect(url.searchParams.get('model')).toBe('gpt-realtime-whisper'); + expect(url.searchParams.get('model')).toBe(null); + }); + + it('omits the model when an explicit baseURL still points at api.openai.com', () => { + const url = new URL(buildRealtimeSttUrl('https://api.openai.com/v1', 'gpt-4o-mini-transcribe')); + + expect(url.host).toBe('api.openai.com'); + expect(url.pathname).toBe('/v1/realtime'); + expect(url.searchParams.get('intent')).toBe('transcription'); + expect(url.searchParams.get('model')).toBe(null); }); it('upgrades https baseURL to wss and appends /realtime when path is /v1', () => { diff --git a/plugins/openai/src/stt.ts b/plugins/openai/src/stt.ts index 9b2ee417c..1da789516 100644 --- a/plugins/openai/src/stt.ts +++ b/plugins/openai/src/stt.ts @@ -28,12 +28,15 @@ const DEFAULT_REALTIME_MODEL = 'gpt-realtime-whisper'; /** * Build the realtime transcription WebSocket URL. * - * Includes the model on the upgrade URL so OpenAI-compatible gateways - * (which can only see the URL at the WebSocket upgrade, not the subsequent - * `session.update` frame) can route by model. Mirrors the existing - * convention in `realtime/realtime_model.ts` for the conversational - * Realtime API. OpenAI's native endpoint accepts and ignores the - * parameter, so this is a no-op for direct connections. + * For OpenAI-compatible gateways (LiteLLM, Cloudflare AI Gateway, etc.) the + * model is included on the upgrade URL so the gateway can route by model + * before the subsequent `session.update` frame arrives. OpenAI's own + * `wss://api.openai.com/.../realtime` endpoint, on the other hand, treats a + * `?model=` query param as selecting a conversation session and rejects the + * subsequent transcription-mode `session.update` with + * `error.invalid_request_error.invalid_model`, so the model is intentionally + * omitted for native OpenAI connections — the model is conveyed via + * `session.update → audio.input.transcription.model` instead. * * The scheme of `baseURL` is respected: `http://` maps to `ws://` * and `https://` maps to `wss://`. @@ -56,7 +59,9 @@ export function buildRealtimeSttUrl(baseURL: string | undefined, model: string): } url.searchParams.set('intent', 'transcription'); - url.searchParams.set('model', model); + if (url.hostname !== 'api.openai.com') { + url.searchParams.set('model', model); + } return url.toString(); }