Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions app/RealtimeTranslation.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { LanguagePickerRow } from "../components/realtime/LanguagePickerRow";
import { SessionFooter } from "../components/realtime/SessionFooter";
import { TranscriptView } from "../components/realtime/TranscriptView";
import { log } from "../lib/logger";
import { getUserFacingRealtimeErrorMessage } from "../lib/realtimeUserError";
import {
DEFAULT_BIDIRECTIONAL_ENABLED,
DEFAULT_BIDIRECTIONAL_LANGUAGE,
Expand Down Expand Up @@ -314,8 +315,7 @@ export function RealtimeTranslation({
},
error,
);
const message =
error instanceof Error ? error.message : "Unexpected error";
const message = getUserFacingRealtimeErrorMessage(error, "translation");
Alert.alert("Translation", message);
setIsSessionActive(false);
} finally {
Expand Down Expand Up @@ -348,8 +348,7 @@ export function RealtimeTranslation({
},
error,
);
const message =
error instanceof Error ? error.message : "Unexpected error";
const message = getUserFacingRealtimeErrorMessage(error, "translation");
Alert.alert("Translation", message);
} finally {
setIsStopping(false);
Expand Down
17 changes: 9 additions & 8 deletions app/VoiceChat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { VoiceSpeedCustomization } from "../components/VoiceSpeedCustomization";
import { loadShowRealtimeErrorAlerts } from "../lib/developerSettings";
import { log } from "../lib/logger";
import { composeMainPrompt } from "../lib/mainPrompt";
import { getUserFacingRealtimeErrorMessage } from "../lib/realtimeUserError";
import { TokenUsageTracker } from "../lib/tokenUsageTracker";
import { loadTranscriptionPreference } from "../lib/transcriptionPreference";
import type { VadMode } from "../lib/vadPreference";
Expand Down Expand Up @@ -173,10 +174,10 @@ export function VoiceChat({
},
);
const message =
typeof payload?.error?.message === "string" &&
payload.error.message.trim().length > 0
? payload.error.message.trim()
: "The voice session encountered an unexpected error.";
getUserFacingRealtimeErrorMessage(
payload?.error?.message,
"voice",
) || "The voice session encountered an unexpected error.";

// Only show alert if developer setting is enabled
const shouldShowAlert = await loadShowRealtimeErrorAlerts();
Expand Down Expand Up @@ -396,15 +397,16 @@ export function VoiceChat({
"Starting OpenAI voice session",
{},
{
hasBaseUrl: Boolean(baseConnectionOptions.baseUrl),
hasModel: Boolean(baseConnectionOptions.model),
baseUrl: baseConnectionOptions.baseUrl ?? "default",
model: baseConnectionOptions.model ?? "default",
audioOutput,
voice: selectedVoice,
hasInstructions: finalPrompt.trim().length > 0,
hasCustomAddition: mainPromptAddition.trim().length > 0,
toolNames: voiceToolNames,
transcriptionEnabled,
selectedLanguage,
hasMicPermission,
},
);
const customConnectionOptions: OpenAIConnectionOptions = {
Expand Down Expand Up @@ -522,8 +524,7 @@ export function VoiceChat({
},
error,
);
const message =
error instanceof Error ? error.message : "Unexpected error";
const message = getUserFacingRealtimeErrorMessage(error, "voice");

// Emit error status
emitVoiceSessionStatus(`Connection failed: ${message}`);
Expand Down
78 changes: 78 additions & 0 deletions lib/realtimeUserError.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/** Generic per-feature messages used when nothing more specific can be shown. */
const DEFAULT_MESSAGES = {
  voice: "The voice session could not be started.",
  translation: "The translation session could not be started.",
} as const;

/** Feature areas that have a default message. */
type RealtimeUserErrorContext = keyof typeof DEFAULT_MESSAGES;

/** Longest summary returned, in UTF-16 code units, including the trailing ellipsis. */
const MAX_SUMMARY_LENGTH = 140;

/**
 * Fragments of internal error messages and the user-facing sentence shown for
 * each. Entries are checked in order and the first match wins, so keep more
 * specific fragments above more general ones.
 */
const KNOWN_ERRORS: ReadonlyArray<{
  readonly fragments: readonly string[];
  readonly userMessage: string;
}> = [
  {
    fragments: ["OpenAI Realtime endpoint rejected"],
    userMessage: "OpenAI rejected the realtime connection setup.",
  },
  {
    fragments: ["Timed out waiting for the WebRTC connection"],
    userMessage: "Timed out while establishing the realtime connection.",
  },
  {
    fragments: ["WebRTC connection failed with state:"],
    userMessage: "The realtime connection failed before the session was ready.",
  },
  {
    fragments: ["An OpenAI API key must be set", "Missing OpenAI API key"],
    userMessage: "An OpenAI API key is required to start this session.",
  },
  {
    fragments: ["Failed to build the OpenAI Realtime endpoint URL"],
    userMessage: "The configured OpenAI endpoint is invalid.",
  },
  {
    fragments: ["The local WebRTC session description is missing"],
    userMessage: "The device could not prepare the realtime connection offer.",
  },
  {
    fragments: ["Could not decode the SDP answer returned by OpenAI"],
    userMessage: "OpenAI returned an invalid realtime connection response.",
  },
];

/** Type guard: true for strings that contain at least one non-whitespace character. */
const isNonEmptyString = (value: unknown): value is string =>
  typeof value === "string" && value.trim().length > 0;

/** Returns the trimmed text before the first LF / CRLF line break. */
const firstLine = (value: string): string =>
  value.split(/\r?\n/, 1)[0]?.trim() ?? "";

/**
 * Shortens `summary` to at most MAX_SUMMARY_LENGTH code units, ending with an
 * ellipsis when anything was removed.
 */
const truncateSummary = (summary: string): string => {
  if (summary.length <= MAX_SUMMARY_LENGTH) {
    return summary;
  }

  let head = summary.slice(0, MAX_SUMMARY_LENGTH - 1);

  // `slice` counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair (e.g. an emoji). Drop a dangling high surrogate
  // rather than emit an unpaired code unit that renders as a broken glyph.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  return `${head.trimEnd()}…`;
};

/**
 * Converts an arbitrary thrown value into a short, single-line message that is
 * safe to show to the user.
 *
 * Resolution order:
 * 1. Values that are neither an `Error` nor a non-blank string, and blank
 *    messages, yield the default message for `context`.
 * 2. Messages containing a known internal fragment yield the matching
 *    friendly sentence.
 * 3. Messages that look like raw payloads (start with `{` or `[`, or contain
 *    `Response body:`) yield the default message.
 * 4. Otherwise the first line of the message is returned, truncated to 140
 *    UTF-16 code units with a trailing ellipsis.
 *
 * @param error - The caught value; typically an `Error` or a message string.
 * @param context - Which feature failed; selects the default message.
 * @returns A non-empty, single-line user-facing message.
 */
export function getUserFacingRealtimeErrorMessage(
  error: unknown,
  context: RealtimeUserErrorContext = "voice",
): string {
  const fallback = DEFAULT_MESSAGES[context];

  let rawMessage: string = fallback;
  if (error instanceof Error) {
    rawMessage = error.message;
  } else if (isNonEmptyString(error)) {
    rawMessage = error;
  }

  const message = rawMessage.trim();
  if (!message) {
    return fallback;
  }

  const known = KNOWN_ERRORS.find(({ fragments }) =>
    fragments.some((fragment) => message.includes(fragment)),
  );
  if (known) {
    return known.userMessage;
  }

  // Raw JSON or HTTP response dumps are not meaningful to end users.
  const looksLikeRawPayload =
    message.startsWith("{") ||
    message.startsWith("[") ||
    message.includes("Response body:");
  if (looksLikeRawPayload) {
    return fallback;
  }

  const summary = firstLine(message);
  if (!summary) {
    return fallback;
  }

  return truncateSummary(summary);
}
125 changes: 89 additions & 36 deletions modules/vm-webrtc/ios/OpenAIWebRTCClient.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {

// MARK: Subclass-provided endpoint constants

override var defaultEndpoint: String { "https://api.openai.com/v1/realtime" }
override var defaultEndpoint: String { "https://api.openai.com/v1/realtime/calls" }
override var defaultModel: String { "gpt-realtime" }

// MARK: Chat-specific stored properties
Expand Down Expand Up @@ -41,6 +41,7 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
let audioMixPlayer = AudioMixPlayer()

var toolDefinitions: [[String: Any]] = []
private var initialSessionConfiguration: [String: Any]?
lazy var eventHandler = WebRTCEventHandler()

// MARK: Init / deinit
Expand Down Expand Up @@ -69,7 +70,7 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
// MARK: Virtual hook overrides

override func dataChannelDidOpen() {
sendInitialSessionConfiguration()
handleDataChannelOpenAfterInitialSessionSetup()
}

override func handleDataChannelMessage(_ event: [String: Any]) {
Expand Down Expand Up @@ -220,17 +221,31 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
attributes: logAttributes(
for: .info,
metadata: [
"hasModel": (model?.isEmpty == false),
"hasBaseURL": (baseURL?.isEmpty == false),
"resolvedModel": resolvedModelName(model),
"baseURL": baseURL ?? "",
"audioOutput": audioOutput.rawValue,
"voice": sessionVoice,
]))

let endpointURL = try buildEndpointURL(baseURL: baseURL, model: model)
let endpointURL = try buildEndpointURL(
baseURL: baseURL,
model: model,
appendModelQuery: false
)
self.logger.log(
"[VmWebrtc] Resolved OpenAI endpoint",
attributes: logAttributes(for: .debug, metadata: ["endpoint": endpointURL.absoluteString])
)
let sessionConfiguration = buildInitialRealtimeSessionConfiguration(model: model)
initialSessionConfiguration = sessionConfiguration
self.logger.log(
"[VmWebrtc] Prepared initial realtime session configuration",
attributes: logAttributes(
for: .debug,
metadata: [
"session": sessionConfiguration,
"sessionJSON": prettyJSONString(from: sessionConfiguration) ?? "<invalid_session_json>",
]))

try configureAudioSession(for: audioOutput)
self.logger.log(
Expand Down Expand Up @@ -282,12 +297,26 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
attributes: logAttributes(for: .error))
throw OpenAIWebRTCError.missingLocalDescription
}
self.logger.log(
"[VmWebrtc] Final local SDP ready for OpenAI exchange",
attributes: logAttributes(
for: .debug,
metadata: [
"endpoint": endpointURL.absoluteString,
"sdpLength": localSDP.count,
"sdpSummary": analyzeSDP(localSDP),
"sdp": localSDP,
]))

emitModuleEvent(
"onVoiceSessionStatus", payload: ["status_update": "Connecting to OpenAI endpoint..."])

let answerSDP = try await exchangeSDPWithOpenAI(
apiKey: resolvedApiKey, endpointURL: endpointURL, offerSDP: localSDP)
let answerSDP = try await exchangeRealtimeCallWithOpenAI(
apiKey: resolvedApiKey,
endpointURL: endpointURL,
offerSDP: localSDP,
session: sessionConfiguration
)
let remoteDescription = RTCSessionDescription(type: .answer, sdp: answerSDP)
try await setRemoteDescription(remoteDescription, for: connection)
self.logger.log(
Expand Down Expand Up @@ -324,22 +353,19 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
eventHandler.resetAudioStreamingState()
eventHandler.resetFunctionCallState()
eventHandler.shadowObserve_reset(reason: "connection_closed")
initialSessionConfiguration = nil

return super.closeConnection()
}

// MARK: Session configuration (sent once data channel opens)

func sendInitialSessionConfiguration() {
guard !hasSentInitialSessionConfig else { return }
// MARK: Session configuration / startup

guard let dataChannel, dataChannel.readyState == .open else {
self.logger.log(
"[VmWebrtc] Data channel not ready for initial session configuration",
attributes: logAttributes(for: .warn, metadata: ["hasChannel": dataChannel != nil]))
return
}
private func resolvedModelName(_ model: String?) -> String {
let trimmedModel = model?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
return trimmedModel.isEmpty ? defaultModel : trimmedModel
}

private func buildInitialRealtimeSessionConfiguration(model: String?) -> [String: Any] {
let tools = buildTools()

if tools.isEmpty && !toolDefinitions.isEmpty {
Expand All @@ -349,52 +375,79 @@ final class OpenAIWebRTCClient: OpenAIWebRTCBase {
for: .warn, metadata: ["definitionCount": toolDefinitions.count]))
}

var session: [String: Any] = [
"instructions": sessionInstructions,
"voice": sessionVoice,
"tools": tools,
]
var inputAudioConfig: [String: Any] = [:]

switch turnDetectionMode {
case .semantic:
session["turn_detection"] = [
inputAudioConfig["turn_detection"] = [
"type": "semantic_vad",
"create_response": true,
"eagerness": "low",
]
case .server:
session["turn_detection"] = [
inputAudioConfig["turn_detection"] = [
"type": "server_vad",
"create_response": true,
]
}

if transcriptionEnabled {
inputAudioConfig["transcription"] = ["model": "whisper-1"]
}

var audioConfig: [String: Any] = [
"output": [
"voice": sessionVoice,
]
]

if !inputAudioConfig.isEmpty {
audioConfig["input"] = inputAudioConfig
}

var session: [String: Any] = [
"type": "realtime",
"model": resolvedModelName(model),
"instructions": sessionInstructions,
"audio": audioConfig,
"tools": tools,
]

if let ratio = retentionRatio {
session["truncation"] = [
"type": "retention_ratio",
"retention_ratio": quantizedRetentionRatio(ratio),
]
}

if transcriptionEnabled {
session["input_audio_transcription"] = ["model": "whisper-1"]
return session
}

func handleDataChannelOpenAfterInitialSessionSetup() {
guard !hasSentInitialSessionConfig else { return }

guard let dataChannel, dataChannel.readyState == .open else {
self.logger.log(
"[VmWebrtc] Data channel not ready for initial session configuration",
attributes: logAttributes(for: .warn, metadata: ["hasChannel": dataChannel != nil]))
return
}

if let prettyData = try? JSONSerialization.data(
withJSONObject: session, options: [.prettyPrinted]),
let prettyString = String(data: prettyData, encoding: .utf8)
{
if let session = initialSessionConfiguration {
self.logger.log(
"📑 [VmWebrtc] Sending session.update payload",
attributes: logAttributes(for: .debug, metadata: ["session": prettyString]))
"📑 [VmWebrtc] Initial session already configured via /v1/realtime/calls",
attributes: logAttributes(
for: .debug,
metadata: [
"session": session,
"sessionJSON": prettyJSONString(from: session) ?? "<invalid_session_json>",
]))
} else {
self.logger.log(
"📑 [VmWebrtc] Sending session.update payload (fallback formatting)",
attributes: logAttributes(for: .debug, metadata: ["session": session]))
"[VmWebrtc] Missing cached initial session configuration when data channel opened",
attributes: logAttributes(for: .warn))
}

_ = sendEvent(["type": "session.update", "session": session])

Task { @MainActor in
self.emitModuleEvent(
"onVoiceSessionStatus", payload: ["status_update": "Started Voice Session"])
Expand Down
4 changes: 2 additions & 2 deletions modules/vm-webrtc/ios/OpenAIWebRTCCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ enum OpenAIWebRTCError: LocalizedError {
case invalidEndpoint
case missingLocalDescription
case missingAPIKey
case openAIRejected(Int)
case openAIRejected(status: Int, details: String?)
case openAIResponseDecoding
case connectionTimeout
case connectionFailed(String)
Expand All @@ -23,7 +23,7 @@ enum OpenAIWebRTCError: LocalizedError {
return "The local WebRTC session description is missing after ICE gathering."
case .missingAPIKey:
return "An OpenAI API key must be set before starting a session."
case .openAIRejected(let status):
case .openAIRejected(let status, _):
return "OpenAI Realtime endpoint rejected the SDP offer with status code \(status)."
case .openAIResponseDecoding:
return "Could not decode the SDP answer returned by OpenAI."
Expand Down
Loading
Loading