diff --git a/app/RealtimeTranslation.tsx b/app/RealtimeTranslation.tsx index b21b3f9..4125fdd 100644 --- a/app/RealtimeTranslation.tsx +++ b/app/RealtimeTranslation.tsx @@ -13,6 +13,7 @@ import { LanguagePickerRow } from "../components/realtime/LanguagePickerRow"; import { SessionFooter } from "../components/realtime/SessionFooter"; import { TranscriptView } from "../components/realtime/TranscriptView"; import { log } from "../lib/logger"; +import { getUserFacingRealtimeErrorMessage } from "../lib/realtimeUserError"; import { DEFAULT_BIDIRECTIONAL_ENABLED, DEFAULT_BIDIRECTIONAL_LANGUAGE, @@ -314,8 +315,7 @@ export function RealtimeTranslation({ }, error, ); - const message = - error instanceof Error ? error.message : "Unexpected error"; + const message = getUserFacingRealtimeErrorMessage(error, "translation"); Alert.alert("Translation", message); setIsSessionActive(false); } finally { @@ -348,8 +348,7 @@ export function RealtimeTranslation({ }, error, ); - const message = - error instanceof Error ? 
error.message : "Unexpected error"; + const message = getUserFacingRealtimeErrorMessage(error, "translation"); Alert.alert("Translation", message); } finally { setIsStopping(false); diff --git a/app/VoiceChat.tsx b/app/VoiceChat.tsx index 0d05349..fa37103 100644 --- a/app/VoiceChat.tsx +++ b/app/VoiceChat.tsx @@ -17,6 +17,7 @@ import { VoiceSpeedCustomization } from "../components/VoiceSpeedCustomization"; import { loadShowRealtimeErrorAlerts } from "../lib/developerSettings"; import { log } from "../lib/logger"; import { composeMainPrompt } from "../lib/mainPrompt"; +import { getUserFacingRealtimeErrorMessage } from "../lib/realtimeUserError"; import { TokenUsageTracker } from "../lib/tokenUsageTracker"; import { loadTranscriptionPreference } from "../lib/transcriptionPreference"; import type { VadMode } from "../lib/vadPreference"; @@ -173,10 +174,10 @@ export function VoiceChat({ }, ); const message = - typeof payload?.error?.message === "string" && - payload.error.message.trim().length > 0 - ? payload.error.message.trim() - : "The voice session encountered an unexpected error."; + getUserFacingRealtimeErrorMessage( + payload?.error?.message, + "voice", + ) || "The voice session encountered an unexpected error."; // Only show alert if developer setting is enabled const shouldShowAlert = await loadShowRealtimeErrorAlerts(); @@ -396,8 +397,8 @@ export function VoiceChat({ "Starting OpenAI voice session", {}, { - hasBaseUrl: Boolean(baseConnectionOptions.baseUrl), - hasModel: Boolean(baseConnectionOptions.model), + baseUrl: baseConnectionOptions.baseUrl ?? "default", + model: baseConnectionOptions.model ?? 
/** Fallback copy shown when no more specific user-facing message applies. */
const DEFAULT_MESSAGES = {
  voice: "The voice session could not be started.",
  translation: "The translation session could not be started.",
} as const;

type RealtimeUserErrorContext = keyof typeof DEFAULT_MESSAGES;

/** Longest message (in UTF-16 code units, ellipsis included) handed to the UI. */
const MAX_MESSAGE_LENGTH = 140;

/**
 * Known low-level error fragments and the friendlier text that replaces them.
 * Entries are checked in order and the first entry with a matching needle wins.
 */
const KNOWN_ERROR_MESSAGES: ReadonlyArray<{
  needles: readonly string[];
  message: string;
}> = [
  {
    needles: ["OpenAI Realtime endpoint rejected"],
    message: "OpenAI rejected the realtime connection setup.",
  },
  {
    needles: ["Timed out waiting for the WebRTC connection"],
    message: "Timed out while establishing the realtime connection.",
  },
  {
    needles: ["WebRTC connection failed with state:"],
    message: "The realtime connection failed before the session was ready.",
  },
  {
    needles: ["An OpenAI API key must be set", "Missing OpenAI API key"],
    message: "An OpenAI API key is required to start this session.",
  },
  {
    needles: ["Failed to build the OpenAI Realtime endpoint URL"],
    message: "The configured OpenAI endpoint is invalid.",
  },
  {
    needles: ["The local WebRTC session description is missing"],
    message: "The device could not prepare the realtime connection offer.",
  },
  {
    needles: ["Could not decode the SDP answer returned by OpenAI"],
    message: "OpenAI returned an invalid realtime connection response.",
  },
];

const isNonEmptyString = (value: unknown): value is string =>
  typeof value === "string" && value.trim().length > 0;

const firstLine = (value: string): string =>
  value.split(/\r?\n/, 1)[0]?.trim() ?? "";

/**
 * Shortens `value` to at most {@link MAX_MESSAGE_LENGTH} code units, ending
 * with an ellipsis when anything was cut.
 */
const truncateForDisplay = (value: string): string => {
  if (value.length <= MAX_MESSAGE_LENGTH) {
    return value;
  }
  let end = MAX_MESSAGE_LENGTH - 1;
  const lastKept = value.charCodeAt(end - 1);
  // Never end on a high surrogate: cutting a surrogate pair in half would
  // leave an unpaired code unit that renders as a replacement glyph.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return `${value.slice(0, end).trimEnd()}…`;
};

/**
 * Converts an arbitrary thrown value into a short message suitable for an
 * alert or status line.
 *
 * @param error - An `Error`, a string, or anything else; values that are
 *   neither an `Error` nor a non-blank string yield the context fallback.
 * @param context - Selects which fallback sentence is used.
 * @returns A known friendly translation when the message contains a
 *   recognised fragment; the fallback when the message is blank or looks like
 *   a raw JSON / response-body dump; otherwise the first line of the message,
 *   truncated to 140 code units.
 */
export function getUserFacingRealtimeErrorMessage(
  error: unknown,
  context: RealtimeUserErrorContext = "voice",
): string {
  const fallback = DEFAULT_MESSAGES[context];

  let rawMessage: string = fallback;
  if (error instanceof Error) {
    rawMessage = error.message;
  } else if (isNonEmptyString(error)) {
    rawMessage = error;
  }

  const message = rawMessage.trim();
  if (!message) {
    return fallback;
  }

  const known = KNOWN_ERROR_MESSAGES.find((entry) =>
    entry.needles.some((needle) => message.includes(needle)),
  );
  if (known) {
    return known.message;
  }

  // Raw payload dumps are never useful to end users.
  if (
    message.startsWith("{") ||
    message.startsWith("[") ||
    message.includes("Response body:")
  ) {
    return fallback;
  }

  const summary = firstLine(message);
  if (!summary) {
    return fallback;
  }

  return truncateForDisplay(summary);
}
"", "audioOutput": audioOutput.rawValue, "voice": sessionVoice, ])) - let endpointURL = try buildEndpointURL(baseURL: baseURL, model: model) + let endpointURL = try buildEndpointURL( + baseURL: baseURL, + model: model, + appendModelQuery: false + ) self.logger.log( "[VmWebrtc] Resolved OpenAI endpoint", attributes: logAttributes(for: .debug, metadata: ["endpoint": endpointURL.absoluteString]) ) + let sessionConfiguration = buildInitialRealtimeSessionConfiguration(model: model) + initialSessionConfiguration = sessionConfiguration + self.logger.log( + "[VmWebrtc] Prepared initial realtime session configuration", + attributes: logAttributes( + for: .debug, + metadata: [ + "session": sessionConfiguration, + "sessionJSON": prettyJSONString(from: sessionConfiguration) ?? "", + ])) try configureAudioSession(for: audioOutput) self.logger.log( @@ -282,12 +297,26 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase { attributes: logAttributes(for: .error)) throw OpenAIWebRTCError.missingLocalDescription } + self.logger.log( + "[VmWebrtc] Final local SDP ready for OpenAI exchange", + attributes: logAttributes( + for: .debug, + metadata: [ + "endpoint": endpointURL.absoluteString, + "sdpLength": localSDP.count, + "sdpSummary": analyzeSDP(localSDP), + "sdp": localSDP, + ])) emitModuleEvent( "onVoiceSessionStatus", payload: ["status_update": "Connecting to OpenAI endpoint..."]) - let answerSDP = try await exchangeSDPWithOpenAI( - apiKey: resolvedApiKey, endpointURL: endpointURL, offerSDP: localSDP) + let answerSDP = try await exchangeRealtimeCallWithOpenAI( + apiKey: resolvedApiKey, + endpointURL: endpointURL, + offerSDP: localSDP, + session: sessionConfiguration + ) let remoteDescription = RTCSessionDescription(type: .answer, sdp: answerSDP) try await setRemoteDescription(remoteDescription, for: connection) self.logger.log( @@ -324,22 +353,19 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase { eventHandler.resetAudioStreamingState() eventHandler.resetFunctionCallState() 
/// Returns the model name to use for the session: `model` with surrounding
/// whitespace and newlines removed, or `defaultModel` when `model` is nil or
/// blank after trimming.
private func resolvedModelName(_ model: String?) -> String {
  guard
    let candidate = model?.trimmingCharacters(in: .whitespacesAndNewlines),
    !candidate.isEmpty
  else {
    return defaultModel
  }
  return candidate
}
audioConfig, + "tools": tools, + ] + if let ratio = retentionRatio { session["truncation"] = [ "type": "retention_ratio", @@ -376,25 +420,34 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase { ] } - if transcriptionEnabled { - session["input_audio_transcription"] = ["model": "whisper-1"] + return session + } + + func handleDataChannelOpenAfterInitialSessionSetup() { + guard !hasSentInitialSessionConfig else { return } + + guard let dataChannel, dataChannel.readyState == .open else { + self.logger.log( + "[VmWebrtc] Data channel not ready for initial session configuration", + attributes: logAttributes(for: .warn, metadata: ["hasChannel": dataChannel != nil])) + return } - if let prettyData = try? JSONSerialization.data( - withJSONObject: session, options: [.prettyPrinted]), - let prettyString = String(data: prettyData, encoding: .utf8) - { + if let session = initialSessionConfiguration { self.logger.log( - "📑 [VmWebrtc] Sending session.update payload", - attributes: logAttributes(for: .debug, metadata: ["session": prettyString])) + "📑 [VmWebrtc] Initial session already configured via /v1/realtime/calls", + attributes: logAttributes( + for: .debug, + metadata: [ + "session": session, + "sessionJSON": prettyJSONString(from: session) ?? 
"", + ])) } else { self.logger.log( - "📑 [VmWebrtc] Sending session.update payload (fallback formatting)", - attributes: logAttributes(for: .debug, metadata: ["session": session])) + "[VmWebrtc] Missing cached initial session configuration when data channel opened", + attributes: logAttributes(for: .warn)) } - _ = sendEvent(["type": "session.update", "session": session]) - Task { @MainActor in self.emitModuleEvent( "onVoiceSessionStatus", payload: ["status_update": "Started Voice Session"]) diff --git a/modules/vm-webrtc/ios/OpenAIWebRTCCore.swift b/modules/vm-webrtc/ios/OpenAIWebRTCCore.swift index 8383d79..408ef44 100644 --- a/modules/vm-webrtc/ios/OpenAIWebRTCCore.swift +++ b/modules/vm-webrtc/ios/OpenAIWebRTCCore.swift @@ -8,7 +8,7 @@ enum OpenAIWebRTCError: LocalizedError { case invalidEndpoint case missingLocalDescription case missingAPIKey - case openAIRejected(Int) + case openAIRejected(status: Int, details: String?) case openAIResponseDecoding case connectionTimeout case connectionFailed(String) @@ -23,7 +23,7 @@ enum OpenAIWebRTCError: LocalizedError { return "The local WebRTC session description is missing after ICE gathering." case .missingAPIKey: return "An OpenAI API key must be set before starting a session." - case .openAIRejected(let status): + case .openAIRejected(let status, _): return "OpenAI Realtime endpoint rejected the SDP offer with status code \(status)." case .openAIResponseDecoding: return "Could not decode the SDP answer returned by OpenAI." 
diff --git a/modules/vm-webrtc/ios/OpenAIWebRTCTranslatorClient.swift b/modules/vm-webrtc/ios/OpenAIWebRTCTranslatorClient.swift index 801a51b..92ec5c0 100644 --- a/modules/vm-webrtc/ios/OpenAIWebRTCTranslatorClient.swift +++ b/modules/vm-webrtc/ios/OpenAIWebRTCTranslatorClient.swift @@ -95,6 +95,16 @@ final class OpenAIWebRTCTranslatorClient: OpenAIWebRTCBase { attributes: logAttributes(for: .error)) throw OpenAIWebRTCError.missingLocalDescription } + logger.log( + "[VmWebrtc][Translator] Final local SDP ready for OpenAI exchange", + attributes: logAttributes( + for: .debug, + metadata: [ + "endpoint": endpointURL.absoluteString, + "sdpLength": localSDP.count, + "sdpSummary": analyzeSDP(localSDP), + "sdp": localSDP, + ])) emitModuleEvent( "onVoiceSessionStatus", payload: ["status_update": "Connecting to OpenAI endpoint..."]) diff --git a/modules/vm-webrtc/ios/VmWebrtcModule.swift b/modules/vm-webrtc/ios/VmWebrtcModule.swift index 9edac17..ef6a161 100644 --- a/modules/vm-webrtc/ios/VmWebrtcModule.swift +++ b/modules/vm-webrtc/ios/VmWebrtcModule.swift @@ -185,9 +185,12 @@ public class VmWebrtcModule: Module { "openOpenAIConnectionAsync called", attributes: [ "model": options.model ?? "nil", + "baseUrl": options.baseUrl ?? "nil", "audioOutput": options.audioOutput ?? "nil", "voice": options.voice ?? "nil", "vadMode": options.vadMode ?? "nil", + "toolDefinitionCount": options.toolDefinitions?.count ?? 0, + "transcriptionEnabled": options.transcriptionEnabled ?? false, ]) let outputPreference = AudioOutputPreference(rawValue: options.audioOutput ?? "handset") ?? .handset diff --git a/modules/vm-webrtc/ios/WebRtcClientHelpers.swift b/modules/vm-webrtc/ios/WebRtcClientHelpers.swift index 8f08afa..b339b5f 100644 --- a/modules/vm-webrtc/ios/WebRtcClientHelpers.swift +++ b/modules/vm-webrtc/ios/WebRtcClientHelpers.swift @@ -5,11 +5,21 @@ import WebRTC extension OpenAIWebRTCBase { // MARK: - Helper Methods - func buildEndpointURL(baseURL: String?, model: String?) 
throws -> URL { + func buildEndpointURL(baseURL: String?, model: String?, appendModelQuery: Bool = true) throws + -> URL + { let endpoint = (baseURL?.isEmpty == false ? baseURL! : defaultEndpoint) + let resolvedModel = (model?.isEmpty == false ? model! : defaultModel) self.logger.log( "[VmWebrtc] " + "Building OpenAI endpoint URL", - attributes: logAttributes(for: .debug, metadata: ["base": endpoint])) + attributes: logAttributes( + for: .debug, + metadata: [ + "base": endpoint, + "providedBaseURL": baseURL ?? "", + "resolvedModel": resolvedModel, + "appendModelQuery": appendModelQuery, + ])) guard var components = URLComponents(string: endpoint) else { self.logger.log( "[VmWebrtc] " + "Failed to parse OpenAI endpoint", @@ -18,10 +28,9 @@ extension OpenAIWebRTCBase { } var items = components.queryItems ?? [] - if items.contains(where: { $0.name == "model" }) == false { - items.append( - URLQueryItem( - name: "model", value: (model?.isEmpty == false ? model! : defaultModel))) + let hadModelQueryItem = items.contains(where: { $0.name == "model" }) + if appendModelQuery && items.contains(where: { $0.name == "model" }) == false { + items.append(URLQueryItem(name: "model", value: resolvedModel)) } components.queryItems = items @@ -31,12 +40,131 @@ extension OpenAIWebRTCBase { attributes: logAttributes(for: .error, metadata: ["endpoint": endpoint])) throw OpenAIWebRTCError.invalidEndpoint } + let endpointSummary = describeEndpoint(url) self.logger.log( "[VmWebrtc] " + "OpenAI endpoint URL ready", - attributes: logAttributes(for: .debug, metadata: ["url": url.absoluteString])) + attributes: logAttributes( + for: .debug, + metadata: [ + "url": url.absoluteString, + "queryItems": summarizeQueryItems(items), + "appendedModelQuery": appendModelQuery && hadModelQueryItem == false, + "endpointSummary": endpointSummary, + ])) + if (endpointSummary["mode"] as? 
/// Flattens URL query items into a name → value dictionary for log metadata.
/// Items without a value map to ""; when a name repeats, the last one wins.
func summarizeQueryItems(_ items: [URLQueryItem]) -> [String: String] {
  var result: [String: String] = [:]
  for item in items {
    result[item.name] = item.value ?? ""
  }
  return result
}

/// Builds a loggable description of an endpoint URL: scheme, host, path,
/// query items, whether a `model` query item is present, and a `mode` label
/// derived from an exact match on the URL path ("custom" when none matches).
func describeEndpoint(_ url: URL) -> [String: Any] {
  let components = URLComponents(url: url, resolvingAgainstBaseURL: false)
  let queryItems = components?.queryItems ?? []
  let path = url.path
  let hasModelQuery = queryItems.contains(where: { $0.name == "model" })
  let mode: String

  switch path {
  case "/v1/realtime":
    mode = hasModelQuery ? "legacy_realtime_query_sdp" : "legacy_realtime"
  case "/v1/realtime/calls":
    mode = "realtime_calls"
  case "/v1/realtime/translations/calls":
    mode = "translation_calls"
  default:
    mode = "custom"
  }

  return [
    "scheme": url.scheme ?? "",
    "host": url.host ?? "",
    "path": path,
    "mode": mode,
    "hasModelQuery": hasModelQuery,
    "queryItems": summarizeQueryItems(queryItems),
  ]
}

/// Produces a diagnostic summary of an SDP blob by scanning its trimmed,
/// non-empty lines for media sections, codec maps, direction attributes,
/// ICE credentials/candidates, fingerprint, setup and mid attributes.
func analyzeSDP(_ sdp: String) -> [String: Any] {
  let lines = sdp
    .components(separatedBy: .newlines)
    .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
    .filter { !$0.isEmpty }
  let mediaSections = lines.filter { $0.hasPrefix("m=") }
  let rtpmapLines = lines.filter { $0.hasPrefix("a=rtpmap:") }
  let directionLines = lines.filter {
    $0 == "a=sendrecv" || $0 == "a=sendonly" || $0 == "a=recvonly" || $0 == "a=inactive"
  }
  let candidateLines = lines.filter { $0.hasPrefix("a=candidate:") }

  return [
    "lineCount": lines.count,
    "mediaSections": mediaSections,
    "hasAudioMedia": mediaSections.contains(where: { $0.hasPrefix("m=audio") }),
    "hasVideoMedia": mediaSections.contains(where: { $0.hasPrefix("m=video") }),
    "hasApplicationMedia": mediaSections.contains(where: { $0.hasPrefix("m=application") }),
    "hasOpus48000Stereo": lines.contains(where: { $0.contains("opus/48000/2") }),
    "hasOpus48000Mono": lines.contains(where: { $0.contains("opus/48000/1") }),
    "directionAttributes": directionLines,
    "candidateCount": candidateLines.count,
    "iceUfragPresent": lines.contains(where: { $0.hasPrefix("a=ice-ufrag:") }),
    "icePwdPresent": lines.contains(where: { $0.hasPrefix("a=ice-pwd:") }),
    "fingerprintPresent": lines.contains(where: { $0.hasPrefix("a=fingerprint:") }),
    "setupAttributes": lines.filter { $0.hasPrefix("a=setup:") },
    "midAttributes": lines.filter { $0.hasPrefix("a=mid:") },
    "rtpmapLines": rtpmapLines,
  ]
}

/// Converts HTTP header fields into a string → string dictionary for logging,
/// using `String(describing:)` on both keys and values.
func summarizeHTTPHeaders(_ headers: [AnyHashable: Any]) -> [String: String] {
  var result: [String: String] = [:]
  for (key, value) in headers {
    result[String(describing: key)] = String(describing: value)
  }
  return result
}

/// Returns `jsonObject` rendered as pretty-printed JSON text, or nil when it
/// is not a valid JSON object or cannot be serialized / decoded as UTF-8.
func prettyJSONString(from jsonObject: Any) -> String? {
  guard JSONSerialization.isValidJSONObject(jsonObject),
    let data = try? JSONSerialization.data(withJSONObject: jsonObject, options: [.prettyPrinted]),
    let string = String(data: data, encoding: .utf8)
  else {
    return nil
  }
  return string
}

/// Assembles a `multipart/form-data` body.
///
/// - Parameters:
///   - boundary: Boundary token; it must match the one advertised in the
///     request's `Content-Type` header.
///   - parts: Form fields in the order they should appear. A nil
///     `contentType` omits the part's `Content-Type` header.
/// - Returns: The encoded body, terminated by the closing boundary line.
func buildMultipartFormData(
  boundary: String,
  parts: [(name: String, contentType: String?, value: Data)]
) -> Data {
  // A field name is written inside a quoted header value, so a raw quote or
  // line break would corrupt the part header. Escape those three bytes the
  // way the form-data encoding algorithm prescribes; work on UTF-8 bytes so
  // a CR LF pair is never treated as a single character.
  func escapedFieldName(_ name: String) -> [UInt8] {
    name.utf8.flatMap { byte -> [UInt8] in
      switch byte {
      case 0x0A: return Array("%0A".utf8)
      case 0x0D: return Array("%0D".utf8)
      case 0x22: return Array("%22".utf8)
      default: return [byte]
      }
    }
  }

  var body = Data()

  for part in parts {
    // `Data(string.utf8)` cannot fail, unlike `data(using:)` + force unwrap.
    body.append(Data("--\(boundary)\r\n".utf8))
    body.append(Data("Content-Disposition: form-data; name=\"".utf8))
    body.append(contentsOf: escapedFieldName(part.name))
    body.append(Data("\"\r\n".utf8))
    if let contentType = part.contentType {
      body.append(Data("Content-Type: \(contentType)\r\n".utf8))
    }
    body.append(Data("\r\n".utf8))
    body.append(part.value)
    body.append(Data("\r\n".utf8))
  }

  body.append(Data("--\(boundary)--\r\n".utf8))
  return body
}
/// Posts the local SDP offer together with the initial session configuration
/// to the realtime calls endpoint as `multipart/form-data` and returns the
/// SDP answer from the response body.
///
/// - Parameters:
///   - apiKey: Sent as a `Bearer` token in the `Authorization` header.
///   - endpointURL: Fully resolved endpoint to POST to.
///   - offerSDP: Local SDP offer, sent as the `sdp` form part.
///   - session: Session configuration, sent as the JSON `session` form part.
/// - Returns: The non-empty SDP answer decoded from the response as UTF-8.
/// - Throws: `EncodingError.invalidValue` when `session` is not a valid JSON
///   object; `OpenAIWebRTCError.openAIRejected` for a non-2xx status;
///   `OpenAIWebRTCError.openAIResponseDecoding` when the response is not HTTP
///   or the body is empty / not UTF-8; any error raised by `URLSession`.
func exchangeRealtimeCallWithOpenAI(
  apiKey: String,
  endpointURL: URL,
  offerSDP: String,
  session: [String: Any]
) async throws -> String {
  let endpointSummary = describeEndpoint(endpointURL)
  let sdpSummary = analyzeSDP(offerSDP)

  // JSONSerialization.data(withJSONObject:) raises an Objective-C exception,
  // which Swift `try` cannot catch, when handed an object that is not valid
  // JSON. Validate first so a bad session fails the call instead of the app.
  guard JSONSerialization.isValidJSONObject(session) else {
    self.logger.log(
      "[VmWebrtc] Realtime session configuration is not JSON-serializable",
      attributes: logAttributes(
        for: .error,
        metadata: ["sessionKeys": Array(session.keys).sorted()]))
    throw EncodingError.invalidValue(
      session,
      EncodingError.Context(
        codingPath: [],
        debugDescription: "The realtime session configuration is not a valid JSON object."))
  }

  let sessionData = try JSONSerialization.data(withJSONObject: session, options: [])
  let sessionPretty = prettyJSONString(from: session) ?? ""
  let boundary = "Boundary-\(UUID().uuidString)"
  let requestBody = buildMultipartFormData(
    boundary: boundary,
    parts: [
      (
        name: "sdp",
        contentType: "application/sdp",
        value: Data(offerSDP.utf8)
      ),
      (
        name: "session",
        contentType: "application/json",
        value: sessionData
      ),
    ])

  self.logger.log(
    "[VmWebrtc] Sending realtime call offer to OpenAI",
    attributes: logAttributes(
      for: .debug,
      metadata: [
        "endpoint": endpointURL.absoluteString,
        "endpointSummary": endpointSummary,
        "requestMethod": "POST",
        "requestContentType": "multipart/form-data",
        "multipartBoundary": boundary,
        "apiKeyLength": apiKey.count,
        "sdpLength": offerSDP.count,
        "sdpSummary": sdpSummary,
        "sdp": offerSDP,
        "session": session,
        "sessionJSON": sessionPretty,
      ]))

  var request = URLRequest(url: endpointURL)
  request.httpMethod = "POST"
  request.httpBody = requestBody
  request.setValue(
    "multipart/form-data; boundary=\(boundary)",
    forHTTPHeaderField: "Content-Type"
  )
  request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")

  let (data, response) = try await URLSession.shared.data(for: request)

  guard let httpResponse = response as? HTTPURLResponse else {
    self.logger.log(
      "[VmWebrtc] OpenAI realtime call response missing HTTP status",
      attributes: logAttributes(for: .error))
    throw OpenAIWebRTCError.openAIResponseDecoding
  }

  guard (200..<300).contains(httpResponse.statusCode) else {
    // Keep the raw body for diagnostics; it is logged and attached to the error.
    let responseBody = String(data: data, encoding: .utf8) ?? ""
    self.logger.log(
      "[VmWebrtc] OpenAI rejected realtime call offer",
      attributes: logAttributes(
        for: .error,
        metadata: [
          "status": httpResponse.statusCode,
          "endpoint": endpointURL.absoluteString,
          "endpointSummary": endpointSummary,
          "responseHeaders": summarizeHTTPHeaders(httpResponse.allHeaderFields),
          "responseBody": responseBody,
          "sdpSummary": sdpSummary,
          "session": session,
          "sessionJSON": sessionPretty,
        ]))
    throw OpenAIWebRTCError.openAIRejected(
      status: httpResponse.statusCode,
      details: responseBody
    )
  }

  guard let answer = String(data: data, encoding: .utf8), !answer.isEmpty else {
    self.logger.log(
      "[VmWebrtc] OpenAI realtime call returned an empty SDP answer",
      attributes: logAttributes(for: .error))
    throw OpenAIWebRTCError.openAIResponseDecoding
  }

  self.logger.log(
    "[VmWebrtc] Received realtime call SDP answer from OpenAI",
    attributes: logAttributes(
      for: .debug,
      metadata: [
        "sdpLength": answer.count,
        "responseHeaders": summarizeHTTPHeaders(httpResponse.allHeaderFields),
        "sdpSummary": analyzeSDP(answer),
      ]))
  return answer
}