diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index ab75b712c..a3b8adc21 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -213,11 +213,27 @@ Guidelines: - The app owns visible, screen-local state: which actions are available, which element should be spotlighted, and how actions are choreographed so users can see control happen. -- Controllers such as MCP bridges, test harnesses, or optional external drivers should +- Controllers such as OpenAI Realtime, MCP bridges, or test harnesses should call the app control surface instead of reaching into app internals. +- OpenAI Realtime is one replaceable control driver, not the owner of the + control architecture. The generic app-control registry and session actions + should remain useful if the voice driver is removed or replaced by tests, + scripts, MCP bridges, or other controllers. - Provider/API secrets and privileged filesystem or server mutations remain server-owned; the app control surface should route those through OpenWork server APIs rather than adding provider-specific behavior to the UI. +- `/remote/session` is the OpenWork server endpoint that brokers short-lived + remote-control Realtime sessions. It keeps provider API keys server-side and + returns only an ephemeral browser client secret. +- Realtime control is a Feature Preview capability and is off by default. When + enabled, users start or stop it from the session status bar instead of a + floating overlay. +- The OpenAI key used for the initial Realtime controller can come from the + server process environment or from the OpenWork local environment store via + Settings -> Feature Preview; the browser never receives the long-lived key. +- Realtime remote control captures microphone audio in the app/browser only + after the user starts the mode. The first implementation sends audio input to + the Realtime session while keeping model output text/tool-call based. 
- Raw screenshot or coordinate-based control is a fallback for uninstrumented surfaces, not the default architecture. diff --git a/apps/app/src/app/lib/desktop.ts b/apps/app/src/app/lib/desktop.ts index dd4a2b954..e06485015 100644 --- a/apps/app/src/app/lib/desktop.ts +++ b/apps/app/src/app/lib/desktop.ts @@ -14,6 +14,9 @@ declare global { openExternal?: (url: string) => Promise; relaunch?: () => Promise; }; + permissions?: { + requestMicrophone?: () => Promise<{ granted: boolean; status: string }>; + }; migration?: { readSnapshot?: () => Promise; ackSnapshot?: () => Promise<{ ok: boolean; moved: boolean }>; diff --git a/apps/app/src/app/lib/openwork-server.ts b/apps/app/src/app/lib/openwork-server.ts index 7b7eb1963..b32f5d512 100644 --- a/apps/app/src/app/lib/openwork-server.ts +++ b/apps/app/src/app/lib/openwork-server.ts @@ -217,6 +217,20 @@ export type OpenworkBlueprintSessionsMaterializeResult = { openSessionId: string | null; }; +export type OpenworkRemoteSession = { + clientSecret: string; + expiresAt: number | null; + model: string; + voice: string; + tools: string[]; +}; + +export type OpenworkRemoteSessionRequest = { + model?: string; + voice?: string; + instructions?: string; +}; + export type OpenworkArtifactItem = { id: string; name?: string; @@ -766,6 +780,24 @@ export function createOpenworkServerClient(options: { baseUrl: string; token?: s requestJson(baseUrl, "/runtime/versions", { token, hostToken, timeoutMs: timeouts.status }), status: () => requestJson(baseUrl, "/status", { token, hostToken, timeoutMs: timeouts.status }), capabilities: () => requestJson(baseUrl, "/capabilities", { token, hostToken, timeoutMs: timeouts.capabilities }), + createRemoteSession: (payload: OpenworkRemoteSessionRequest = {}) => + requestJson(baseUrl, "/remote/session", { + token, + hostToken, + method: "POST", + body: payload, + timeoutMs: timeouts.status, + }).catch((error) => { + if (error instanceof OpenworkServerError && error.status === 404) { + throw new 
OpenworkServerError( + 404, + "remote_session_unavailable", + "Realtime control requires a newer OpenWork server. Restart OpenWork so the updated server binary is used, then try Control again.", + error.details, + ); + } + throw error; + }), listWorkspaces: () => requestJson(baseUrl, "/workspaces", { token, hostToken, timeoutMs: timeouts.listWorkspaces }), createLocalWorkspace: (payload: { folderPath: string; name: string; preset: string }) => requestJson(baseUrl, "/workspaces/local", { diff --git a/apps/app/src/app/types.ts b/apps/app/src/app/types.ts index 9b215e0da..a581064ff 100644 --- a/apps/app/src/app/types.ts +++ b/apps/app/src/app/types.ts @@ -175,6 +175,7 @@ export type SettingsTab = | "skills" | "extensions" | "environment" + | "feature-preview" | "advanced" | "appearance" | "updates" diff --git a/apps/app/src/i18n/locales/en.ts b/apps/app/src/i18n/locales/en.ts index da43073db..0af3baa51 100644 --- a/apps/app/src/i18n/locales/en.ts +++ b/apps/app/src/i18n/locales/en.ts @@ -1717,6 +1717,7 @@ export default { "settings.environment.validation_shape": "Use letters, digits, and underscores; do not start with a digit.", "settings.environment.value_label": "Value", "settings.tab_description_environment": "Save API keys and tokens for local agents, skills, and MCP servers. 
Secrets stay on this device.", + "settings.tab_description_feature_preview": "Try experimental OpenWork capabilities before they graduate into the default product.", "settings.tab_description_messaging": "Configure router identities and inbox behavior from workspace settings.", "settings.tab_description_model": "Tune the default model, runtime behavior, and assistant output settings.", "settings.tab_description_recovery": "Repair migration state, reset workspace defaults, and recover local settings.", @@ -1724,6 +1725,7 @@ export default { "settings.tab_description_updates": "Keep the app current with quiet background checks and install controls.", "settings.tab_environment": "Environment", "settings.tab_extensions": "Extensions", + "settings.tab_feature_preview": "Feature Preview", "settings.tab_general": "Settings", "settings.tab_messaging": "Messaging", "settings.tab_model": "Model", @@ -1734,6 +1736,23 @@ export default { "settings.theme_light": "Light", "settings.theme_system": "System", "settings.theme_system_hint": "System mode follows your OS preference automatically.", + "settings.feature_preview.badge": "Preview", + "settings.feature_preview.checking": "Checking…", + "settings.feature_preview.configured": "Configured", + "settings.feature_preview.connect_server_hint": "Connect to an OpenWork server before saving a key.", + "settings.feature_preview.disabled": "Disabled", + "settings.feature_preview.enabled": "Enabled", + "settings.feature_preview.not_configured": "Not configured", + "settings.feature_preview.openai_key_description": "The key is saved in OpenWork's local environment store and used server-side to mint short-lived Realtime sessions. It is not sent to the browser as a long-lived secret.", + "settings.feature_preview.openai_key_hint": "Required for Realtime control. 
Existing shell OPENAI_API_KEY values still work.", + "settings.feature_preview.openai_key_label": "OpenAI API key", + "settings.feature_preview.openai_key_removed": "OpenAI API key removed.", + "settings.feature_preview.openai_key_required": "Enter an OpenAI API key before saving.", + "settings.feature_preview.openai_key_saved": "OpenAI key saved. Realtime control can use it immediately.", + "settings.feature_preview.openai_key_title": "OpenAI Realtime key", + "settings.feature_preview.realtime_description": "Shows the Realtime control entry in the session status bar. When started, OpenWork captures microphone audio and lets the Realtime model drive registered app actions.", + "settings.feature_preview.realtime_title": "Realtime control mode", + "settings.feature_preview.replace_key": "Replace OpenAI API key", "settings.toolbar_ready_to_install": "Ready to install", "settings.update": "Update", "settings.update_available": "Update available: v", diff --git a/apps/app/src/react-app/domains/session/chat/session-page.tsx b/apps/app/src/react-app/domains/session/chat/session-page.tsx index c9336b0b3..0ffdce094 100644 --- a/apps/app/src/react-app/domains/session/chat/session-page.tsx +++ b/apps/app/src/react-app/domains/session/chat/session-page.tsx @@ -26,6 +26,7 @@ import { WorkspaceSessionList } from "../sidebar/workspace-session-list"; import { SessionSurface, type SessionSurfaceProps } from "../surface/session-surface"; import { ShareWorkspaceModal } from "../../workspace/share-workspace-modal"; import { StatusBar, type StatusBarProps } from "./status-bar"; +import { OpenAIRealtimeActivityPanel } from "../../../shell/control-drivers/openai-realtime/openai-realtime-activity-panel"; import { DEFAULT_WORKSPACE_LEFT_SIDEBAR_WIDTH, useWorkspaceShellLayout, @@ -527,6 +528,8 @@ export function SessionPage(props: SessionPageProps) { showSettingsButton={props.statusBar?.showSettingsButton} /> + + {props.providerAuthModal ? 
: null} diff --git a/apps/app/src/react-app/domains/session/chat/status-bar.tsx b/apps/app/src/react-app/domains/session/chat/status-bar.tsx index f4521e36a..96a42fb04 100644 --- a/apps/app/src/react-app/domains/session/chat/status-bar.tsx +++ b/apps/app/src/react-app/domains/session/chat/status-bar.tsx @@ -5,6 +5,7 @@ import { BookOpen, MessageCircle, Settings } from "lucide-react"; import { t } from "../../../../i18n"; import { usePlatform } from "../../../kernel/platform"; import { useControlAction, type OpenworkControlAction } from "../../../shell/control/control-provider"; +import { OpenAIRealtimeStatusControl } from "../../../shell/control-drivers/openai-realtime/openai-realtime-status-control"; import type { OpenworkServerStatus } from "../../../../app/lib/openwork-server"; const DOCS_URL = "https://openworklabs.com/docs"; @@ -176,6 +177,7 @@ export function StatusBar(props: StatusBarProps) {
+ +
+ +
+
+
+
+ {t("settings.feature_preview.openai_key_title")} +
+
+ {t("settings.feature_preview.openai_key_description")} +
+
+
+ {loading + ? t("settings.feature_preview.checking") + : savedKey + ? t("settings.feature_preview.configured") + : t("settings.feature_preview.not_configured")} +
+
+ + {savedKey ? ( +
+
+
{OPENAI_API_KEY}
+
+ {revealed ? savedKey.value : maskValue(savedKey.value)} +
+
+
+ + +
+
+ ) : null} + +
+ setDraft(event.currentTarget.value)} + disabled={!props.client || saving} + hint={t("settings.feature_preview.openai_key_hint")} + /> +
+ + {!props.client ? ( + + {t("settings.feature_preview.connect_server_hint")} + + ) : null} +
+
+ + {status ?
{status}
: null} + {error ?
{error}
: null} +
+ +
+
+
+
Microphone input
+
+ Choose and test the microphone Realtime control should use before starting voice mode. System default follows your macOS/browser default input device. +
+
+ Current: {micPreference.label || "System default"} +
+
+ +
+ + + +
+
+
+ {micTestBusy + ? "Speak now — testing microphone…" + : micTestStatus ?? "Run a quick test to confirm OpenWork can hear audio from the selected input."} +
+ +
+
+
+
0.5 + ? "bg-green-9/80" + : micLevel > 0 + ? "bg-[rgba(var(--dls-accent-rgb),0.6)]" + : "bg-gray-5/50" + }`} + style={{ width: `${Math.round(micLevel * 100)}%` }} + /> +
+ {micTestBusy ? ( + + {Math.round(micLevel * 100)}% + + ) : null} +
+
+ + {micDevices.length === 0 ? ( +
+ Click “Refresh microphones” to grant device-list access and see available inputs. +
+ ) : null} + {micError ? ( +
+
{micError}
+
+ If macOS shows permission as denied, open Microphone settings, enable OpenWork, then fully quit and reopen the app. macOS will not show the prompt again after a denial. +
+ +
+ ) : null} +
+ +
+
+
Transcript panel
+
+ Show a right-side panel during voice control with spoken input, model output, and tool-call activity. +
+
+ +
+
+ + ); +} diff --git a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx index 61689a0cb..f3511190a 100644 --- a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx +++ b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx @@ -50,6 +50,8 @@ export function getSettingsTabIcon(tab: SettingsTab) { return Puzzle; case "environment": return Terminal; + case "feature-preview": + return Sparkles; case "advanced": return Wrench; case "appearance": @@ -75,6 +77,8 @@ export function getSettingsTabLabel(tab: SettingsTab) { return t("settings.tab_extensions"); case "environment": return t("settings.tab_environment"); + case "feature-preview": + return t("settings.tab_feature_preview"); case "advanced": return t("settings.tab_advanced"); case "appearance": @@ -100,6 +104,8 @@ export function getSettingsTabDescription(tab: SettingsTab) { return t("settings.tab_description_extensions"); case "environment": return t("settings.tab_description_environment"); + case "feature-preview": + return t("settings.tab_description_feature_preview"); case "advanced": return t("settings.tab_description_advanced"); case "appearance": @@ -120,7 +126,7 @@ export function getWorkspaceSettingsTabs(): SettingsTab[] { } export function getGlobalSettingsTabs(developerMode: boolean): SettingsTab[] { - const tabs: SettingsTab[] = ["den", "appearance", "environment", "updates", "recovery"]; + const tabs: SettingsTab[] = ["den", "feature-preview", "appearance", "environment", "updates", "recovery"]; if (developerMode) tabs.push("debug"); return tabs; } diff --git a/apps/app/src/react-app/domains/settings/state/feature-flags-preferences.ts b/apps/app/src/react-app/domains/settings/state/feature-flags-preferences.ts index 591a58954..7cd49acf8 100644 --- a/apps/app/src/react-app/domains/settings/state/feature-flags-preferences.ts +++ 
b/apps/app/src/react-app/domains/settings/state/feature-flags-preferences.ts @@ -7,6 +7,7 @@ export function useFeatureFlagsPreferences() { const microsandboxCreateSandboxEnabled = prefs.featureFlags?.microsandboxCreateSandbox === true; + const realtimeControlEnabled = prefs.featureFlags?.realtimeControl === true; const toggleMicrosandboxCreateSandbox = useCallback(() => { setPrefs((previous) => ({ @@ -18,8 +19,20 @@ export function useFeatureFlagsPreferences() { })); }, [setPrefs]); + const toggleRealtimeControl = useCallback(() => { + setPrefs((previous) => ({ + ...previous, + featureFlags: { + ...previous.featureFlags, + realtimeControl: !previous.featureFlags?.realtimeControl, + }, + })); + }, [setPrefs]); + return { microsandboxCreateSandboxEnabled, toggleMicrosandboxCreateSandbox, + realtimeControlEnabled, + toggleRealtimeControl, }; } diff --git a/apps/app/src/react-app/domains/settings/state/realtime-control-preferences.ts b/apps/app/src/react-app/domains/settings/state/realtime-control-preferences.ts new file mode 100644 index 000000000..4ed60d631 --- /dev/null +++ b/apps/app/src/react-app/domains/settings/state/realtime-control-preferences.ts @@ -0,0 +1,61 @@ +const MIC_DEVICE_ID_KEY = "openwork.feature-preview.realtime-control.mic-device-id"; +const MIC_DEVICE_LABEL_KEY = "openwork.feature-preview.realtime-control.mic-device-label"; +const TRANSCRIPT_PANEL_KEY = "openwork.feature-preview.realtime-control.transcript-panel"; + +export type RealtimeControlMicPreference = { + deviceId: string; + label: string; +}; + +export function readRealtimeControlMicPreference(): RealtimeControlMicPreference { + if (typeof window === "undefined") return { deviceId: "", label: "System default" }; + try { + return { + deviceId: window.localStorage.getItem(MIC_DEVICE_ID_KEY) ?? "", + label: window.localStorage.getItem(MIC_DEVICE_LABEL_KEY) ?? 
"System default", + }; + } catch { + return { deviceId: "", label: "System default" }; + } +} + +export function writeRealtimeControlMicPreference(preference: RealtimeControlMicPreference) { + if (typeof window === "undefined") return; + try { + window.localStorage.setItem(MIC_DEVICE_ID_KEY, preference.deviceId); + window.localStorage.setItem(MIC_DEVICE_LABEL_KEY, preference.label || "System default"); + window.dispatchEvent(new CustomEvent("openwork:realtime-control-preferences-changed")); + } catch { + // ignore local preference persistence failures + } +} + +export function readRealtimeControlTranscriptPanelEnabled(): boolean { + if (typeof window === "undefined") return false; + try { + return window.localStorage.getItem(TRANSCRIPT_PANEL_KEY) === "1"; + } catch { + return false; + } +} + +export function writeRealtimeControlTranscriptPanelEnabled(enabled: boolean) { + if (typeof window === "undefined") return; + try { + window.localStorage.setItem(TRANSCRIPT_PANEL_KEY, enabled ? "1" : "0"); + window.dispatchEvent(new CustomEvent("openwork:realtime-control-preferences-changed")); + } catch { + // ignore local preference persistence failures + } +} + +export function subscribeRealtimeControlPreferencesChanged(listener: () => void): () => void { + if (typeof window === "undefined") return () => {}; + const handler = () => listener(); + window.addEventListener("openwork:realtime-control-preferences-changed", handler); + window.addEventListener("storage", handler); + return () => { + window.removeEventListener("openwork:realtime-control-preferences-changed", handler); + window.removeEventListener("storage", handler); + }; +} diff --git a/apps/app/src/react-app/kernel/local-provider.tsx b/apps/app/src/react-app/kernel/local-provider.tsx index 9ca1ffee3..402a929df 100644 --- a/apps/app/src/react-app/kernel/local-provider.tsx +++ b/apps/app/src/react-app/kernel/local-provider.tsx @@ -32,6 +32,7 @@ export type LocalPreferences = { releaseChannel: ReleaseChannel; 
featureFlags: { microsandboxCreateSandbox: boolean; + realtimeControl: boolean; }; /** * Set to true after the user completes the welcome/onboarding flow @@ -60,7 +61,7 @@ const INITIAL_PREFS: LocalPreferences = { modelVariant: null, defaultModel: null, releaseChannel: "stable", - featureFlags: { microsandboxCreateSandbox: false }, + featureFlags: { microsandboxCreateSandbox: false, realtimeControl: false }, hasCompletedOnboarding: false, }; diff --git a/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-activity-panel.tsx b/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-activity-panel.tsx new file mode 100644 index 000000000..37f1099de --- /dev/null +++ b/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-activity-panel.tsx @@ -0,0 +1,178 @@ +/** @jsxImportSource react */ +import { useEffect, useRef, useState } from "react"; +import { Activity, Mic2, Terminal, X } from "lucide-react"; + +import { useFeatureFlagsPreferences } from "../../../domains/settings/state/feature-flags-preferences"; +import { + readRealtimeControlTranscriptPanelEnabled, + writeRealtimeControlTranscriptPanelEnabled, + subscribeRealtimeControlPreferencesChanged, +} from "../../../domains/settings/state/realtime-control-preferences"; +import { getRealtimeControlController } from "./openai-realtime-controller"; + +function relativeTime(ts: number) { + const delta = Math.round((Date.now() - ts) / 1000); + if (delta < 5) return "now"; + if (delta < 60) return `${delta}s ago`; + if (delta < 3600) return `${Math.floor(delta / 60)}m ago`; + return `${Math.floor(delta / 3600)}h ago`; +} + +export function OpenAIRealtimeActivityPanel() { + const { realtimeControlEnabled } = useFeatureFlagsPreferences(); + const [panelEnabled, setPanelEnabled] = useState(readRealtimeControlTranscriptPanelEnabled); + const [realtimeState, setRealtimeState] = useState(() => getRealtimeControlController().state()); + const scrollRef = 
useRef(null); + + useEffect(() => { + return subscribeRealtimeControlPreferencesChanged(() => { + setPanelEnabled(readRealtimeControlTranscriptPanelEnabled()); + }); + }, []); + + useEffect(() => { + if (!realtimeControlEnabled || !panelEnabled) return undefined; + return getRealtimeControlController().subscribe(setRealtimeState); + }, [panelEnabled, realtimeControlEnabled]); + + useEffect(() => { + if (scrollRef.current) { + scrollRef.current.scrollTop = scrollRef.current.scrollHeight; + } + }, [realtimeState.transcriptLog]); + + const visible = realtimeControlEnabled && (panelEnabled || realtimeState.status === "connected" || realtimeState.status === "connecting"); + if (!visible || realtimeState.status === "idle") return null; + + const entries = realtimeState.transcriptLog ?? []; + const isError = realtimeState.status === "error"; + const isConnected = realtimeState.status === "connected"; + const isListening = isConnected && realtimeState.mic === "on"; + + const statusLine = isListening + ? realtimeState.micLabel ?? "Listening" + : isError + ? realtimeState.lastError || "Connection error" + : realtimeState.status === "connecting" + ? 
"Connecting…" + : realtimeState.status; + + const dismissPanel = () => { + writeRealtimeControlTranscriptPanelEnabled(false); + setPanelEnabled(false); + }; + + return ( + + ); +} diff --git a/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-controller.ts b/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-controller.ts new file mode 100644 index 000000000..9dc8cc4e9 --- /dev/null +++ b/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-controller.ts @@ -0,0 +1,473 @@ +import type { OpenworkRemoteSession } from "../../../../app/lib/openwork-server"; +import type { OpenworkControlResult, OpenworkControlSnapshot } from "../../control/control-provider"; + +type RemoteEvent = { + type?: string; + call_id?: string; + name?: string; + arguments?: string; + delta?: string; + transcript?: string; + response?: unknown; +}; + +export type RealtimeControlState = { + status: "idle" | "connecting" | "connected" | "error"; + mic: "off" | "requesting" | "on" | "error"; + micPermission: string | null; + micTrack: string | null; + micLabel: string | null; + lastError: string | null; + lastTranscript: string; + lastText: string; + lastEventType: string | null; + transcriptLog: RealtimeTranscriptEntry[]; +}; + +export type RealtimeTranscriptEntry = { + id: string; + role: "user" | "assistant" | "tool" | "system"; + text: string; + status?: "pending" | "done" | "error"; + createdAt: number; +}; + +export type RealtimeControlController = { + connect: (input: { createSession: () => Promise; audioInput?: boolean; audioDeviceId?: string; audioDeviceLabel?: string }) => Promise; + disconnect: () => void; + sendText: (text: string) => void; + state: () => RealtimeControlState; + subscribe: (listener: (state: RealtimeControlState) => void) => () => void; +}; + +type RealtimeRoot = typeof window & { + __openworkRealtimeControl?: RealtimeControlController; + __OPENWORK_ELECTRON__?: { + permissions?: { + 
requestMicrophone?: () => Promise<{ granted: boolean; status: string }>; + }; + }; +}; + +const state: RealtimeControlState = { + status: "idle", + mic: "off", + micPermission: null, + micTrack: null, + micLabel: null, + lastError: null, + lastTranscript: "", + lastText: "", + lastEventType: null, + transcriptLog: [], +}; + +let peer: RTCPeerConnection | null = null; +let dataChannel: RTCDataChannel | null = null; +let remoteAudio: HTMLAudioElement | null = null; +let localAudioStream: MediaStream | null = null; +const stateListeners = new Set<(state: RealtimeControlState) => void>(); +let nextLogId = 1; +let activeAssistantLogId: string | null = null; + +function setState(update: Partial) { + Object.assign(state, update); + const next = { ...state }; + stateListeners.forEach((listener) => listener(next)); +} + +function appendTranscriptLog(entry: Omit) { + const item: RealtimeTranscriptEntry = { + ...entry, + id: `rt-${nextLogId++}`, + createdAt: Date.now(), + }; + setState({ transcriptLog: [...state.transcriptLog, item].slice(-40) }); + return item.id; +} + +function updateTranscriptLog(id: string, update: Partial>) { + setState({ + transcriptLog: state.transcriptLog.map((entry) => ( + entry.id === id ? 
{ ...entry, ...update } : entry + )), + }); +} + +function ensureControlSurface() { + const control = window.__openworkControl; + if (!control) throw new Error("OpenWork control surface is not available"); + return control; +} + +function safeJson(value: unknown) { + try { + return JSON.stringify(value); + } catch { + return JSON.stringify({ ok: false, error: "Could not serialize result" }); + } +} + +function parseArguments(raw: string | undefined) { + if (!raw) return {}; + try { + return JSON.parse(raw) as Record; + } catch { + return {}; + } +} + +async function requestHostMicrophonePermission() { + const root = window as RealtimeRoot; + const requestMicrophone = root.__OPENWORK_ELECTRON__?.permissions?.requestMicrophone; + if (!requestMicrophone) { + setState({ micPermission: "browser" }); + return; + } + const result = await requestMicrophone(); + setState({ micPermission: result?.status ?? "unknown" }); + if (result && result.granted === false) { + throw new Error(`Microphone permission is ${result.status || "not granted"}. Enable it in macOS Privacy & Security settings.`); + } +} + +async function getMicrophoneStream(deviceId: string) { + const constraints: MediaStreamConstraints = { + audio: { + ...(deviceId ? 
{ deviceId: { exact: deviceId } } : {}), + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }; + try { + return await navigator.mediaDevices.getUserMedia(constraints); + } catch (error) { + if (!deviceId || !(error instanceof DOMException) || error.name !== "OverconstrainedError") { + throw error; + } + appendTranscriptLog({ role: "system", text: "Selected microphone is unavailable; falling back to system default.", status: "done" }); + return navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }); + } +} + +function errorMessage(error: unknown) { + if (error instanceof DOMException) { + const message = error.message.trim(); + const constraint = "constraint" in error && typeof error.constraint === "string" && error.constraint.trim() + ? ` (${error.constraint})` + : ""; + return message || `${error.name}${constraint}`; + } + if (error instanceof Error) return error.message || error.name || String(error); + return String(error); +} + +function waitForDataChannelOpen(channel: RTCDataChannel) { + if (channel.readyState === "open") return Promise.resolve(); + return new Promise((resolve, reject) => { + const timeout = window.setTimeout(() => { + cleanupListeners(); + reject(new Error("Realtime data channel did not open in time")); + }, 10000); + const cleanupListeners = () => { + window.clearTimeout(timeout); + channel.removeEventListener("open", handleOpen); + channel.removeEventListener("close", handleClose); + channel.removeEventListener("error", handleError); + }; + const handleOpen = () => { + cleanupListeners(); + resolve(); + }; + const handleClose = () => { + cleanupListeners(); + reject(new Error("Realtime data channel closed before it opened")); + }; + const handleError = () => { + cleanupListeners(); + reject(new Error("Realtime data channel failed to open")); + }; + channel.addEventListener("open", handleOpen); + channel.addEventListener("close", 
handleClose); + channel.addEventListener("error", handleError); + }); +}
+
+/**
+ * Runs one Realtime function call against the app control surface and reports
+ * the result back to the model.
+ *
+ * The tool name (`event.name`) is mapped onto the control surface: `snapshot`
+ * and `list_actions` read directly, every other known tool goes through
+ * `control.execute(...)`. Missing required arguments and unknown tool names
+ * produce an `{ ok: false, error }` result instead of a call.
+ *
+ * Any exception raised while resolving the control surface or executing the
+ * action is converted into an `{ ok: false, error }` result as well, so the
+ * transcript entry never stays "pending" and the model still receives a
+ * `function_call_output` for its `call_id`.
+ *
+ * Nothing is sent back when the event has no `call_id` or the data channel is
+ * not open.
+ */
+async function handleToolCall(event: RemoteEvent) {
+  const args = parseArguments(event.arguments);
+  const toolLabel = event.name === "execute_action"
+    ? `execute_action ${typeof args.actionId === "string" ? args.actionId : ""}`.trim()
+    : event.name === "set_input"
+      ? `set_input ${typeof args.actionId === "string" ? args.actionId : ""}`.trim()
+      : event.name === "read_transcript"
+        ? `read_transcript${typeof args.count === "number" ? ` (last ${args.count})` : ""}`.trim()
+        : event.name ?? "unknown_tool";
+  const toolLogId = appendTranscriptLog({ role: "tool", text: `Calling ${toolLabel}…`, status: "pending" });
+  let output: OpenworkControlSnapshot | OpenworkControlSnapshot["actions"] | OpenworkControlResult | { ok: false; error: string };
+
+  try {
+    // Resolved inside the try so a missing control surface is reported to the
+    // model and the log like any other tool failure.
+    const control = ensureControlSurface();
+
+    if (event.name === "snapshot") {
+      output = control.snapshot();
+    } else if (event.name === "list_actions") {
+      output = control.listActions();
+    } else if (event.name === "execute_action") {
+      const actionId = typeof args.actionId === "string" ? args.actionId : "";
+      output = actionId
+        ? await control.execute(actionId, args.args)
+        : { ok: false, error: "execute_action requires actionId" };
+    } else if (event.name === "set_input") {
+      const actionId = typeof args.actionId === "string" ? args.actionId : "";
+      const text = typeof args.text === "string" ? args.text : "";
+      output = actionId
+        ? await control.execute(actionId, { text })
+        : { ok: false, error: "set_input requires actionId" };
+    } else if (event.name === "list_sessions") {
+      output = await control.execute("session.list_sessions");
+    } else if (event.name === "open_session") {
+      const sessionId = typeof args.sessionId === "string" ? args.sessionId : "";
+      output = sessionId
+        ? await control.execute("session.open", { sessionId })
+        : { ok: false, error: "open_session requires sessionId" };
+    } else if (event.name === "rename_session") {
+      const sessionId = typeof args.sessionId === "string" ? args.sessionId : "";
+      const title = typeof args.title === "string" ? args.title : "";
+      output = sessionId && title
+        ? await control.execute("session.rename", { sessionId, title })
+        : { ok: false, error: "rename_session requires sessionId and title" };
+    } else if (event.name === "delete_session") {
+      const sessionId = typeof args.sessionId === "string" ? args.sessionId : "";
+      // Only an explicit boolean `true` counts as confirmation.
+      output = sessionId
+        ? await control.execute("session.delete", { sessionId, confirmed: args.confirmed === true })
+        : { ok: false, error: "delete_session requires sessionId" };
+    } else if (event.name === "scroll_session") {
+      // Anything other than "top" scrolls to the bottom.
+      const position = args.position === "top" ? "top" : "bottom";
+      output = await control.execute(position === "top" ? "session.scroll_top" : "session.scroll_bottom");
+    } else if (event.name === "get_latest_message") {
+      output = await control.execute("session.latest_message");
+    } else if (event.name === "read_transcript") {
+      const count = typeof args.count === "number" ? args.count : 10;
+      output = await control.execute("session.read_transcript", { count });
+    } else {
+      output = { ok: false, error: `Unknown tool: ${event.name ?? "unknown"}` };
+    }
+  } catch (error) {
+    output = { ok: false, error: errorMessage(error) };
+  }
+
+  const failed = typeof output === "object" && output !== null && "ok" in output && output.ok === false;
+  updateTranscriptLog(toolLogId, {
+    text: failed
+      ? `Tool failed: ${toolLabel}${"error" in output ? ` — ${String(output.error)}` : ""}`
+      : `Tool complete: ${toolLabel}`,
+    status: failed ? "error" : "done",
+  });
+
+  if (!event.call_id || !dataChannel || dataChannel.readyState !== "open") return;
+  // First attach the tool output to the conversation, then request a new
+  // text-only response so the model can act on it.
+  dataChannel.send(JSON.stringify({
+    type: "conversation.item.create",
+    item: {
+      type: "function_call_output",
+      call_id: event.call_id,
+      output: safeJson(output),
+    },
+  }));
+  dataChannel.send(JSON.stringify({
+    type: "response.create",
+    response: { output_modalities: ["text"] },
+  }));
+}
+
+/**
+ * Handles one raw message from the Realtime data channel.
+ *
+ * Messages that are not valid JSON are ignored. Every parsed event updates
+ * `lastEventType`; text deltas, `response.done`, input-audio transcription
+ * events, and completed function-call arguments are then routed to the
+ * transcript log or to `handleToolCall`.
+ */
+async function handleRealtimeMessage(raw: string) {
+  let event: RemoteEvent;
+  try {
+    event = JSON.parse(raw) as RemoteEvent;
+  } catch {
+    return;
+  }
+  setState({ lastEventType: event.type ?? null });
+
+  if (event.type === "response.output_text.delta" && typeof event.delta === "string") {
+    if (!activeAssistantLogId) {
+      activeAssistantLogId = appendTranscriptLog({ role: "assistant", text: event.delta, status: "pending" });
+    } else {
+      // Append to the entry's own text. `state.lastText` is only cleared in
+      // connect(), so building the entry from it would prepend every earlier
+      // response to the current one.
+      const activeId = activeAssistantLogId;
+      const shownText = state.transcriptLog.find((entry) => entry.id === activeId)?.text ?? "";
+      updateTranscriptLog(activeId, { text: `${shownText}${event.delta}`, status: "pending" });
+    }
+    // NOTE(review): lastText keeps accumulating across responses exactly as
+    // before; confirm with its consumers whether it should reset per response.
+    setState({ lastText: `${state.lastText}${event.delta}` });
+    return;
+  }
+
+  if (event.type === "response.done" && activeAssistantLogId) {
+    updateTranscriptLog(activeAssistantLogId, { status: "done" });
+    activeAssistantLogId = null;
+    return;
+  }
+
+  if (event.type === "conversation.item.input_audio_transcription.delta" && typeof event.delta === "string") {
+    setState({ lastTranscript: `${state.lastTranscript}${event.delta}` });
+    return;
+  }
+
+  if (event.type === "conversation.item.input_audio_transcription.completed" && typeof event.transcript === "string") {
+    // The completed transcript replaces whatever the deltas accumulated.
+    setState({ lastTranscript: event.transcript });
+    appendTranscriptLog({ role: "user", text: event.transcript, status: "done" });
+    return;
+  }
+
+  if (event.type === "response.function_call_arguments.done") {
+    await handleToolCall(event);
+  }
+}
+
+function cleanup() { + localAudioStream?.getTracks().forEach((track) => track.stop()); + localAudioStream = null; + setState({ micTrack: null }); + activeAssistantLogId = null; + dataChannel?.close(); + 
/**
 * Returns the Realtime control controller, creating it on first use.
 *
 * The controller is cached on `window.__openworkRealtimeControl`, so repeated
 * calls return the same object. It exposes `connect`, `disconnect`, `sendText`,
 * `state`, and `subscribe`; all of them operate on the module-level connection
 * variables (`peer`, `dataChannel`, `localAudioStream`, `remoteAudio`) and on
 * the shared `state` object.
 */
export function getRealtimeControlController(): RealtimeControlController {
  const root = window as RealtimeRoot;
  if (root.__openworkRealtimeControl) return root.__openworkRealtimeControl;

  const controller: RealtimeControlController = {
    /**
     * Tears down any previous connection, then opens a new Realtime session.
     * Never rejects: failures are caught, recorded in state and the transcript
     * log, and the resulting state snapshot is returned.
     */
    async connect(input) {
      cleanup();
      // Reset every per-connection field, including the transcript log.
      setState({ status: "connecting", mic: "off", micPermission: null, micTrack: null, micLabel: input.audioDeviceLabel?.trim() || "System default", lastError: null, lastTranscript: "", lastText: "", lastEventType: null, transcriptLog: [] });
      const startupLogId = appendTranscriptLog({ role: "system", text: "Starting voice control…", status: "pending" });
      try {
        // Audio input is on unless the caller passes `audioInput: false`.
        const audioInput = input.audioInput !== false;
        if (audioInput) {
          if (!navigator.mediaDevices?.getUserMedia) {
            throw new Error("Microphone capture is not available in this browser context");
          }
          setState({ mic: "requesting" });
          await requestHostMicrophonePermission();
          const deviceId = input.audioDeviceId?.trim() ?? "";
          localAudioStream = await getMicrophoneStream(deviceId);
          const audioTrack = localAudioStream.getAudioTracks()[0];
          if (!audioTrack) {
            throw new Error("No microphone audio track was returned");
          }
          setState({ mic: "on", micLabel: audioTrack.label || input.audioDeviceLabel || "System default", micTrack: `${audioTrack.readyState}:${audioTrack.enabled ? "enabled" : "disabled"}` });
          appendTranscriptLog({ role: "system", text: `Microphone live: ${audioTrack.label || input.audioDeviceLabel || "System default"}`, status: "done" });
          // Reflect a track that ends on its own in the visible mic state.
          audioTrack.addEventListener("ended", () => {
            setState({ mic: "off", micTrack: "ended" });
          });
        }

        // The microphone is acquired before the session is requested, so a
        // microphone failure throws before `createSession` is ever called.
        const session = await input.createSession();
        const pc = new RTCPeerConnection();
        peer = pc;

        // Hidden <audio> element that receives any remote track.
        remoteAudio = document.createElement("audio");
        remoteAudio.autoplay = true;
        remoteAudio.dataset.openworkRealtime = "true";
        remoteAudio.style.display = "none";
        document.body.appendChild(remoteAudio);
        if (audioInput) {
          const audioTrack = localAudioStream?.getAudioTracks()[0];
          if (!audioTrack) {
            // Reached when the stream was cleared while `createSession` was
            // pending (for example by `disconnect()`, which runs `cleanup()`).
            throw new Error("No microphone audio track is available");
          }
          pc.addTrack(audioTrack, localAudioStream ?? new MediaStream([audioTrack]));
        } else {
          pc.addTransceiver("audio", { direction: "recvonly" });
        }
        pc.ontrack = (event) => {
          if (remoteAudio) remoteAudio.srcObject = event.streams[0] ?? null;
        };

        // Events from the Realtime session arrive on this data channel.
        const channel = pc.createDataChannel("oai-events");
        dataChannel = channel;
        channel.addEventListener("message", (event) => {
          // Fire-and-forget: the handler's promise is intentionally not awaited.
          void handleRealtimeMessage(String(event.data));
        });

        // SDP offer/answer exchange, authorised with the session's client secret.
        const offer = await pc.createOffer();
        await pc.setLocalDescription(offer);
        const sdpResponse = await fetch("https://api.openai.com/v1/realtime/calls", {
          method: "POST",
          headers: {
            Authorization: `Bearer ${session.clientSecret}`,
            "Content-Type": "application/sdp",
          },
          body: offer.sdp,
        });
        if (!sdpResponse.ok) {
          const detail = await sdpResponse.text().catch(() => "");
          throw new Error(`OpenAI Realtime SDP failed: ${sdpResponse.status}${detail ? ` ${detail}` : ""}`);
        }
        await pc.setRemoteDescription({ type: "answer", sdp: await sdpResponse.text() });
        await waitForDataChannelOpen(channel);
        // Re-read the track: its state may have changed during the handshake.
        const liveTrack = localAudioStream?.getAudioTracks()[0];
        setState({
          status: "connected",
          mic: liveTrack && liveTrack.readyState === "live" ? "on" : state.mic,
          micLabel: liveTrack?.label || state.micLabel,
          micTrack: liveTrack ? `${liveTrack.readyState}:${liveTrack.enabled ? "enabled" : "disabled"}` : state.micTrack,
          lastError: null,
        });
        updateTranscriptLog(startupLogId, { text: "Voice control connected", status: "done" });
        window.__openworkControl?.setEnabled(true);
        return { ...state };
      } catch (error) {
        cleanup();
        const message = errorMessage(error);
        // NOTE(review): the "Starting voice control…" entry (startupLogId) is not
        // updated on this path, so it stays "pending" next to the error entry.
        appendTranscriptLog({ role: "system", text: message, status: "error" });
        // `mic` is set to "error" for every failure, not only microphone ones.
        setState({ status: "error", mic: "error", lastError: message });
        return { ...state };
      }
    },
    /** Releases all connection resources and returns the state to idle. */
    disconnect() {
      cleanup();
      setState({ status: "idle", mic: "off", micTrack: null, lastError: null, lastEventType: null });
    },
    /**
     * Sends a typed user message and requests a text response.
     * Throws when the data channel is missing or not open.
     */
    sendText(text) {
      if (!dataChannel || dataChannel.readyState !== "open") {
        throw new Error("Realtime data channel is not connected");
      }
      // Clear the running assistant text so the next response starts fresh.
      setState({ lastTranscript: text, lastText: "" });
      appendTranscriptLog({ role: "user", text, status: "done" });
      dataChannel.send(JSON.stringify({
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "user",
          content: [{ type: "input_text", text }],
        },
      }));
      dataChannel.send(JSON.stringify({
        type: "response.create",
        response: { output_modalities: ["text"] },
      }));
    },
    /** Returns a shallow copy of the current state. */
    state() {
      return { ...state };
    },
    /**
     * Registers a listener, calls it immediately with a copy of the current
     * state, and returns a function that removes it again.
     */
    subscribe(listener) {
      stateListeners.add(listener);
      listener({ ...state });
      return () => {
        stateListeners.delete(listener);
      };
    },
  };

  root.__openworkRealtimeControl = controller;
  return controller;
}
b/apps/app/src/react-app/shell/control-drivers/openai-realtime/openai-realtime-status-control.tsx @@ -0,0 +1,88 @@ +/** @jsxImportSource react */ +import { useEffect, useState } from "react"; +import { Loader2, Mic2, MicOff } from "lucide-react"; + +import { useFeatureFlagsPreferences } from "../../../domains/settings/state/feature-flags-preferences"; +import { useOpenworkControl } from "../../control/control-provider"; +import { getRealtimeControlController } from "./openai-realtime-controller"; + +export function OpenAIRealtimeStatusControl() { + const { realtimeControlEnabled } = useFeatureFlagsPreferences(); + const control = useOpenworkControl(); + const [realtimeState, setRealtimeState] = useState(() => getRealtimeControlController().state()); + + useEffect(() => { + if (!realtimeControlEnabled) return undefined; + return getRealtimeControlController().subscribe(setRealtimeState); + }, [realtimeControlEnabled]); + + if (!realtimeControlEnabled || !control) return null; + + const connectAction = control.actions.find((action) => action.id === "remote.realtime.connect"); + const disconnectAction = control.actions.find((action) => action.id === "remote.realtime.disconnect"); + const connected = realtimeState.status === "connected"; + const busy = realtimeState.status === "connecting" || realtimeState.mic === "requesting" || connectAction?.busy || disconnectAction?.busy; + const unavailable = connected ? disconnectAction?.disabled === true : connectAction?.disabled !== false; + const isListening = connected && realtimeState.mic === "on"; + + const title = realtimeState.lastError || (connected ? "Disconnect voice control" : "Connect voice control"); + + // Compact status text for the pill — keep short and readable + const stateText = realtimeState.lastError + ? "Error" + : realtimeState.status === "connecting" + ? "Connecting…" + : isListening + ? "Listening" + : connected + ? 
"Connected" + : ""; + + const handleClick = async () => { + if (busy || unavailable) return; + const actionId = connected ? "remote.realtime.disconnect" : "remote.realtime.connect"; + await control.executeAction(actionId); + setRealtimeState(getRealtimeControlController().state()); + }; + + return ( +
+ {stateText ? ( + + {stateText} + + ) : null} + +
/**
 * Registers the OpenAI Realtime control actions (`remote.realtime.connect`,
 * `remote.realtime.send_text`, `remote.realtime.disconnect`) with the control
 * surface.
 *
 * Each action object is memoized on the primitive/reference values it actually
 * reads (`enabled`, `client`) rather than on the `input` wrapper object: callers
 * pass a fresh object literal on every render, so depending on `input` itself
 * would rebuild every action on every render.
 *
 * @param input.enabled - Whether the Realtime control preview is turned on.
 * @param input.client - The connected OpenWork server client, or `null`.
 */
export function useOpenAIRealtimeControlActions(input: {
  enabled: boolean;
  client: OpenworkServerClient | null;
}) {
  const { enabled, client } = input;

  const remoteRealtimeConnectAction = useMemo<OpenworkControlAction>(() => ({
    id: "remote.realtime.connect",
    label: "Connect OpenAI Realtime microphone control",
    description: "Start a browser Realtime session that listens to the microphone and can call OpenWork control actions.",
    sideEffect: "external",
    disabled: !enabled || !client,
    execute: async () => {
      // Re-check at execution time: the action may be invoked while disabled.
      if (!enabled) return { status: "error", lastError: "Realtime control is disabled in Feature Preview settings" };
      if (!client) return { status: "error", lastError: "OpenWork server is not connected" };
      const mic = readRealtimeControlMicPreference();
      return getRealtimeControlController().connect({
        createSession: () => client.createRemoteSession(),
        audioInput: true,
        audioDeviceId: mic.deviceId,
        audioDeviceLabel: mic.label,
      });
    },
  }), [enabled, client]);
  useControlAction(remoteRealtimeConnectAction);

  const remoteRealtimeTextAction = useMemo<OpenworkControlAction>(() => ({
    id: "remote.realtime.send_text",
    label: "Send text to OpenAI Realtime remote control",
    description: "Send a text message through the connected Realtime session.",
    sideEffect: "external",
    requiresArgs: true,
    disabled: !enabled,
    previewArgs: { text: "List the available OpenWork actions." },
    execute: (args) => {
      // Fall back to the preview prompt when no usable `text` string is supplied.
      const text = typeof args === "object" && args && "text" in args && typeof (args as { text?: unknown }).text === "string"
        ? (args as { text: string }).text
        : "List the available OpenWork actions.";
      const controller = getRealtimeControlController();
      controller.sendText(text);
      return controller.state();
    },
  }), [enabled]);
  useControlAction(remoteRealtimeTextAction);

  const remoteRealtimeDisconnectAction = useMemo<OpenworkControlAction>(() => ({
    id: "remote.realtime.disconnect",
    label: "Disconnect OpenAI Realtime remote control",
    description: "Close the browser Realtime session.",
    sideEffect: "external",
    disabled: !enabled,
    execute: () => {
      const controller = getRealtimeControlController();
      controller.disconnect();
      return controller.state();
    },
  }), [enabled]);
  useControlAction(remoteRealtimeDisconnectAction);
}
local.prefs.featureFlags?.realtimeControl === true; + useOpenAIRealtimeControlActions({ enabled: realtimeControlPreviewEnabled, client }); + const paletteSessionOptions = useMemo(() => { const out: PaletteSessionOption[] = []; for (const workspace of workspaces) { diff --git a/apps/app/src/react-app/shell/settings-route.tsx b/apps/app/src/react-app/shell/settings-route.tsx index 122a36aec..c9c462294 100644 --- a/apps/app/src/react-app/shell/settings-route.tsx +++ b/apps/app/src/react-app/shell/settings-route.tsx @@ -29,6 +29,7 @@ import { DebugView } from "../domains/settings/pages/debug-view"; import { DenView } from "../domains/settings/pages/den-view"; import { EnvironmentView } from "../domains/settings/pages/environment-view"; import { ExtensionsView } from "../domains/settings/pages/extensions-view"; +import { FeaturePreviewView } from "../domains/settings/pages/feature-preview-view"; import { McpView } from "../domains/settings/pages/mcp-view"; import { RecoveryView } from "../domains/settings/pages/recovery-view"; import { SkillsView } from "../domains/settings/pages/skills-view"; @@ -253,6 +254,7 @@ function parseSettingsPath(pathname: string): { case "advanced": case "appearance": case "environment": + case "feature-preview": case "updates": case "recovery": case "debug": @@ -1320,6 +1322,22 @@ export function SettingsRoute() { runtimeKey={environmentRuntimeKey} /> ); + case "feature-preview": + return ( + { + local.setPrefs((previous) => ({ + ...previous, + featureFlags: { + ...previous.featureFlags, + realtimeControl: !previous.featureFlags?.realtimeControl, + }, + })); + }} + /> + ); case "debug": return ; default: diff --git a/apps/desktop/build/entitlements.mac.inherit.plist b/apps/desktop/build/entitlements.mac.inherit.plist new file mode 100644 index 000000000..4c2011e5a --- /dev/null +++ b/apps/desktop/build/entitlements.mac.inherit.plist @@ -0,0 +1,12 @@ + + + + + com.apple.security.cs.allow-jit + + 
com.apple.security.cs.disable-library-validation + + com.apple.security.device.audio-input + + + diff --git a/apps/desktop/build/entitlements.mac.plist b/apps/desktop/build/entitlements.mac.plist new file mode 100644 index 000000000..4c2011e5a --- /dev/null +++ b/apps/desktop/build/entitlements.mac.plist @@ -0,0 +1,12 @@ + + + + + com.apple.security.cs.allow-jit + + com.apple.security.cs.disable-library-validation + + com.apple.security.device.audio-input + + + diff --git a/apps/desktop/electron-builder.yml b/apps/desktop/electron-builder.yml index 47212bd9d..12ae327cf 100644 --- a/apps/desktop/electron-builder.yml +++ b/apps/desktop/electron-builder.yml @@ -31,8 +31,12 @@ mac: icon: resources/icons/icon.icns category: public.app-category.developer-tools hardenedRuntime: true + entitlements: build/entitlements.mac.plist + entitlementsInherit: build/entitlements.mac.inherit.plist gatekeeperAssess: false notarize: false + extendInfo: + NSMicrophoneUsageDescription: OpenWork uses the microphone only when you turn on Realtime remote control. 
/**
 * Decides whether a URL belongs to the app itself for the purpose of granting
 * media (microphone) access.
 *
 * Trusted URLs are:
 * - any `file:` URL, and
 * - `http:` / `https:` URLs whose host is a loopback name or address:
 *   `localhost`, `127.0.0.1`, or the IPv6 loopback `[::1]`.
 *
 * Anything else returns `false`, including non-strings, blank strings,
 * unparsable input, and other schemes.
 *
 * @param {unknown} rawUrl - The URL to classify.
 * @returns {boolean} `true` when the URL is considered app-owned.
 */
function isTrustedAppMediaUrl(rawUrl) {
  if (typeof rawUrl !== "string" || rawUrl.trim().length === 0) return false;

  let url;
  try {
    url = new URL(rawUrl);
  } catch {
    // Not a parsable absolute URL.
    return false;
  }

  if (url.protocol === "file:") return true;
  if (url.protocol !== "http:" && url.protocol !== "https:") return false;

  // `URL.hostname` keeps the brackets around IPv6 literals, hence "[::1]".
  return url.hostname === "localhost" || url.hostname === "127.0.0.1" || url.hostname === "[::1]";
}
mediaDetails.mediaTypes : []; + const requestsAudio = mediaTypes.length === 0 || mediaTypes.includes("audio"); + const requestsVideo = mediaTypes.includes("video"); + const trustedUrl = isTrustedAppMediaUrl(mediaDetails.requestingUrl) || isTrustedAppMediaUrl(webContents.getURL()); + callback(Boolean(trustedUrl && requestsAudio && !requestsVideo)); + }); +} + async function installReactDevToolsForDev() { if (app.isPackaged || envFlagDisabled("OPENWORK_REACT_DEVTOOLS")) return; try { @@ -1543,6 +1570,18 @@ ipcMain.handle("openwork:shell:relaunch", async () => { app.exit(0); }); +ipcMain.handle("openwork:permissions:microphone", async () => { + if (process.platform !== "darwin") return { granted: true, status: "granted" }; + const before = systemPreferences.getMediaAccessStatus("microphone"); + if (before === "granted") return { granted: true, status: before }; + if (before === "denied" || before === "restricted") return { granted: false, status: before }; + const granted = await systemPreferences.askForMediaAccess("microphone"); + return { + granted, + status: systemPreferences.getMediaAccessStatus("microphone"), + }; +}); + registerMigrationIpc({ app, ipcMain }); const { ensureAutoUpdater } = registerUpdaterIpc({ app, ipcMain, getMainWindow: () => mainWindow }); @@ -1572,6 +1611,7 @@ if (!app.requestSingleInstanceLock()) { }); app.whenReady().then(async () => { + installMediaPermissionHandler(); await installReactDevToolsForDev(); await runtimeManager.prepareFreshRuntime().catch(() => undefined); diff --git a/apps/desktop/electron/preload.mjs b/apps/desktop/electron/preload.mjs index 8591b2f39..f73b3b5f7 100644 --- a/apps/desktop/electron/preload.mjs +++ b/apps/desktop/electron/preload.mjs @@ -20,6 +20,11 @@ contextBridge.exposeInMainWorld("__OPENWORK_ELECTRON__", { return ipcRenderer.invoke("openwork:shell:relaunch"); }, }, + permissions: { + requestMicrophone() { + return ipcRenderer.invoke("openwork:permissions:microphone"); + }, + }, migration: { readSnapshot() 
{ return ipcRenderer.invoke("openwork:migration:read"); diff --git a/apps/desktop/scripts/electron-dev.mjs b/apps/desktop/scripts/electron-dev.mjs index 363f146df..ccbf3cd4e 100644 --- a/apps/desktop/scripts/electron-dev.mjs +++ b/apps/desktop/scripts/electron-dev.mjs @@ -1,9 +1,11 @@ import { spawn, spawnSync } from "node:child_process"; +import { createRequire } from "node:module"; import net from "node:net"; import { dirname, resolve } from "node:path"; import { fileURLToPath } from "node:url"; const __dirname = dirname(fileURLToPath(import.meta.url)); +const require = createRequire(import.meta.url); const desktopRoot = resolve(__dirname, ".."); const repoRoot = resolve(desktopRoot, "../.."); const electronSidecarDir = resolve(desktopRoot, "resources", "sidecars"); @@ -19,6 +21,7 @@ const portValue = Number.parseInt(process.env.PORT ?? "", 10); const devPort = Number.isFinite(portValue) && portValue > 0 ? portValue : 5173; const explicitStartUrl = process.env.OPENWORK_ELECTRON_START_URL?.trim() || ""; const startUrl = explicitStartUrl || `http://localhost:${devPort}`; +const launchWithMacOpen = process.platform === "darwin" && process.env.OPENWORK_ELECTRON_LAUNCH_WITH_OPEN === "1"; const viteProbeUrls = explicitStartUrl ? [explicitStartUrl] : [ @@ -233,17 +236,36 @@ const defaultCdpPort = "9823"; const cdpPortRaw = process.env.OPENWORK_ELECTRON_REMOTE_DEBUG_PORT?.trim() ?? defaultCdpPort; const cdpPort = cdpPortRaw === "" || cdpPortRaw === "0" ? "" : cdpPortRaw; -electronChild = run(pnpmCmd, ["exec", "electron", "./electron/main.mjs"], { - cwd: desktopRoot, - detached: process.platform !== "win32", - env: { - ...process.env, - OPENWORK_DEV_MODE: process.env.OPENWORK_DEV_MODE ?? "1", - OPENWORK_DATA_DIR: process.env.OPENWORK_DATA_DIR ?? defaultDevDataDir, - OPENWORK_ELECTRON_START_URL: resolvedStartUrl, - ...(cdpPort ? 
{ OPENWORK_ELECTRON_REMOTE_DEBUG_PORT: cdpPort } : {}), - }, -}); +if (launchWithMacOpen) { + const electronExecutable = require("electron"); + const electronAppPath = resolve(electronExecutable, "../../.."); + electronChild = run("open", [ + "-n", + "-W", + electronAppPath, + "--env", `OPENWORK_DEV_MODE=${process.env.OPENWORK_DEV_MODE ?? "1"}`, + "--env", `OPENWORK_DATA_DIR=${process.env.OPENWORK_DATA_DIR ?? defaultDevDataDir}`, + "--env", `OPENWORK_ELECTRON_START_URL=${resolvedStartUrl}`, + ...(cdpPort ? ["--env", `OPENWORK_ELECTRON_REMOTE_DEBUG_PORT=${cdpPort}`] : []), + "--args", + resolve(desktopRoot, "electron/main.mjs"), + ], { + cwd: desktopRoot, + detached: false, + }); +} else { + electronChild = run(pnpmCmd, ["exec", "electron", "./electron/main.mjs"], { + cwd: desktopRoot, + detached: process.platform !== "win32", + env: { + ...process.env, + OPENWORK_DEV_MODE: process.env.OPENWORK_DEV_MODE ?? "1", + OPENWORK_DATA_DIR: process.env.OPENWORK_DATA_DIR ?? defaultDevDataDir, + OPENWORK_ELECTRON_START_URL: resolvedStartUrl, + ...(cdpPort ? { OPENWORK_ELECTRON_REMOTE_DEBUG_PORT: cdpPort } : {}), + }, + }); +} if (cdpPort) { console.log(`[openwork] Electron CDP exposed at http://127.0.0.1:${cdpPort}`); diff --git a/apps/server/src/remote-control/openai-realtime.ts b/apps/server/src/remote-control/openai-realtime.ts new file mode 100644 index 000000000..56aabf6bf --- /dev/null +++ b/apps/server/src/remote-control/openai-realtime.ts @@ -0,0 +1,288 @@ +import { ApiError } from "../errors.js"; +import { EnvService } from "../env-file.js"; + +export const REMOTE_CONTROL_DEFAULT_MODEL = "gpt-realtime-1.5"; +export const REMOTE_CONTROL_DEFAULT_VOICE = "marin"; +export const REMOTE_CONTROL_DEFAULT_INSTRUCTIONS = [ + "You are controlling the OpenWork app through a provider-neutral control surface.", + "You CAN see the current session. Use read_transcript to read messages in the active session. 
Use get_latest_message for just the newest message.", + "Use snapshot or list_actions before choosing an action unless the user named an obvious action.", + "Narrate briefly before and after actions. Keep answers concise.", + "Prefer set_input for typing text and execute_action for navigation or buttons.", + "Ask for explicit confirmation before destructive actions like deleting sessions.", + "Do not invent action IDs. Only use IDs returned by list_actions or snapshot.", + "When the user asks about session content, always call read_transcript or get_latest_message first — do not say you cannot see the session.", + "", + "REPLY INTENT: The user is looking at an active OpenWork session where an AI assistant is chatting.", + "When the user dictates something that sounds like a reply to that on-screen conversation — e.g. \"tell them I'll be there at 3\", \"reply that looks good\", \"say yes we can do that\", \"answer with the budget numbers\" — they want you to TYPE that reply into the session composer and SEND it, not respond to them yourself.", + "Steps: 1) call read_transcript to understand the conversation context, 2) compose the reply text from what the user said, 3) call set_input with actionId \"composer.set_text\" and the reply text, 4) call execute_action with actionId \"composer.send\".", + "If the user explicitly addresses YOU (e.g. 
\"what can you do?\", \"list my sessions\", \"open settings\"), answer them directly instead.", + "When ambiguous, prefer treating spoken input as a reply to the on-screen session — that is the most common intent when the user is looking at a conversation.", +].join(" "); + +function openAIRealtimeTools() { + return [ + { + type: "function", + name: "snapshot", + description: "Read the current OpenWork route, control status, narration, and available actions.", + parameters: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + type: "function", + name: "list_actions", + description: "List currently available OpenWork app actions with IDs, labels, descriptions, and disabled state.", + parameters: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + type: "function", + name: "execute_action", + description: "Execute an available OpenWork action by ID. Use args only when the action requires them.", + parameters: { + type: "object", + properties: { + actionId: { + type: "string", + description: "An action ID returned by snapshot or list_actions.", + }, + args: { + type: "object", + description: "Optional arguments for the action.", + additionalProperties: true, + }, + }, + required: ["actionId"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "set_input", + description: "Type text into a text-entry action such as the session composer.", + parameters: { + type: "object", + properties: { + actionId: { + type: "string", + description: "The text-entry action ID, usually composer.set_text.", + }, + text: { + type: "string", + description: "The exact text to type visibly into the app.", + }, + }, + required: ["actionId", "text"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "list_sessions", + description: "List available sessions across workspaces with their IDs and titles so you can navigate to one by name.", + parameters: { + type: "object", + properties: 
{}, + additionalProperties: false, + }, + }, + { + type: "function", + name: "open_session", + description: "Navigate to a specific session by its ID. Use list_sessions first to find the right ID.", + parameters: { + type: "object", + properties: { + sessionId: { + type: "string", + description: "The session ID returned by list_sessions.", + }, + }, + required: ["sessionId"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "rename_session", + description: "Rename a session by ID. Use list_sessions first to identify the exact session the user means.", + parameters: { + type: "object", + properties: { + sessionId: { + type: "string", + description: "The session ID returned by list_sessions.", + }, + title: { + type: "string", + description: "The new session title.", + }, + }, + required: ["sessionId", "title"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "delete_session", + description: "Delete a session by ID. Destructive: only set confirmed true after the user explicitly confirms deletion.", + parameters: { + type: "object", + properties: { + sessionId: { + type: "string", + description: "The session ID returned by list_sessions.", + }, + confirmed: { + type: "boolean", + description: "Must be true only after explicit user confirmation.", + }, + }, + required: ["sessionId", "confirmed"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "scroll_session", + description: "Scroll the current session transcript to the top or bottom.", + parameters: { + type: "object", + properties: { + position: { + type: "string", + enum: ["top", "bottom"], + description: "Where to scroll the current session transcript.", + }, + }, + required: ["position"], + additionalProperties: false, + }, + }, + { + type: "function", + name: "get_latest_message", + description: "Read the latest visible message in the current session transcript.", + parameters: { + type: "object", + properties: {}, + additionalProperties: 
/**
 * Creates a short-lived OpenAI Realtime client secret for remote control.
 *
 * The long-lived API key is resolved server-side via `resolveOpenAIKey` and is
 * only used for this request; the returned object carries the ephemeral client
 * secret instead.
 *
 * @param input - Model, voice and instructions for the session. `voice` is
 *   echoed back in the result; it is not part of the request body because the
 *   session is configured for text output only.
 * @param env - Environment store used to look up a saved OpenAI API key.
 * @returns The client secret, its expiry (or `null` when absent), the model and
 *   voice that were requested, and the names of the tools exposed to the model.
 * @throws ApiError 400 `openai_api_key_missing` when no API key is configured.
 * @throws ApiError `openai_realtime_session_failed` when OpenAI cannot be
 *   reached (502) or rejects the request (upstream status, except that
 *   401/403/404 are reported as 502).
 * @throws ApiError 502 `openai_realtime_session_invalid` when the response has
 *   no usable client secret.
 */
export async function createRemoteControlSession(input: { model: string; voice: string; instructions: string }, env: EnvService) {
  const apiKey = await resolveOpenAIKey(env);
  if (!apiKey) {
    throw new ApiError(400, "openai_api_key_missing", "Add an OpenAI API key in Settings → Feature Preview before starting Realtime control");
  }

  // Build the tool list once; it feeds both the request and the result.
  const tools = openAIRealtimeTools();

  let response: Awaited<ReturnType<typeof fetch>>;
  let text: string;
  try {
    response = await fetch("https://api.openai.com/v1/realtime/client_secrets", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        session: {
          type: "realtime",
          model: input.model,
          output_modalities: ["text"],
          audio: {
            input: {
              transcription: {
                model: "gpt-4o-mini-transcribe",
              },
              turn_detection: {
                type: "server_vad",
                threshold: 0.5,
                silence_duration_ms: 200,
                prefix_padding_ms: 300,
                create_response: true,
                interrupt_response: false,
              },
            },
          },
          instructions: input.instructions,
          tool_choice: "auto",
          tools,
        },
      }),
    });
    text = await response.text();
  } catch (error) {
    // DNS/TLS/connection failures surface as a gateway error with a stable
    // code instead of escaping as an untyped exception.
    const detail = error instanceof Error ? error.message : String(error);
    throw new ApiError(502, "openai_realtime_session_failed", `Could not reach OpenAI to create a remote control session: ${detail}`);
  }

  const asRecord = (value: unknown): Record<string, unknown> | null =>
    typeof value === "object" && value !== null ? (value as Record<string, unknown>) : null;

  let payload: Record<string, unknown> | null = null;
  try {
    payload = text ? asRecord(JSON.parse(text)) : null;
  } catch {
    // A non-JSON body is treated the same as an empty one.
    payload = null;
  }

  if (!response.ok) {
    const upstreamMessage = asRecord(payload?.error)?.message;
    const message = typeof upstreamMessage === "string" ? upstreamMessage : response.statusText;
    // 401/403/404 from OpenAI describe the provider key or model, not the
    // caller's OpenWork token or this route. Relaying them verbatim would make
    // the app treat them as its own auth/route failures, so report them as a
    // bad-gateway condition and keep the upstream message.
    const upstreamStatus = response.status;
    const status = upstreamStatus === 401 || upstreamStatus === 403 || upstreamStatus === 404 ? 502 : upstreamStatus;
    throw new ApiError(status, "openai_realtime_session_failed", message || "Failed to create remote control session");
  }

  // The secret may be nested (`client_secret.value`), top-level (`value`), or a
  // bare string (`client_secret`); the first string found in that order wins.
  const nestedSecret = asRecord(payload?.client_secret);
  const clientSecret =
    [nestedSecret?.value, payload?.value, payload?.client_secret].find(
      (candidate): candidate is string => typeof candidate === "string",
    ) ?? "";
  if (!clientSecret) {
    throw new ApiError(502, "openai_realtime_session_invalid", "OpenAI did not return a usable realtime client secret");
  }

  const expiresAt =
    [nestedSecret?.expires_at, payload?.expires_at].find(
      (candidate): candidate is number => typeof candidate === "number",
    ) ?? null;

  return {
    clientSecret,
    expiresAt,
    model: input.model,
    voice: input.voice,
    tools: tools.map((tool) => tool.name),
  };
}
/**
 * Normalizes an optional string field from untyped input.
 *
 * @param input - Any value, typically a field read from a parsed JSON body.
 * @returns The trimmed string when `input` is a string with non-whitespace
 *   content; otherwise `null`.
 */
function normalizeOptionalString(input: unknown): string | null {
  if (typeof input !== "string") return null;
  const trimmed = input.trim();
  return trimmed.length > 0 ? trimmed : null;
}