diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs index 3b62ae2654..a4ba8f1509 100644 --- a/app/src-tauri/src/lib.rs +++ b/app/src-tauri/src/lib.rs @@ -2989,6 +2989,10 @@ pub fn run() { request_id: request_id.clone(), meet_url: meet_url.clone(), display_name: "OpenHuman Dev".to_string(), + // Dev-auto launch has no real user identity — the + // wake gate will fail-closed (no wakes fire) which + // is the safe posture for an automated harness. + owner_display_name: String::new(), }; match meet_call::meet_call_open_window(app_handle.clone(), state, args) .await diff --git a/app/src-tauri/src/meet_audio/audio_bridge.js b/app/src-tauri/src/meet_audio/audio_bridge.js index 9d9cfdedd9..6dbd8e9eb8 100644 --- a/app/src-tauri/src/meet_audio/audio_bridge.js +++ b/app/src-tauri/src/meet_audio/audio_bridge.js @@ -97,6 +97,29 @@ return out; } + // Track every scheduled AudioBufferSource so __openhumanFlushAudio + // can stop them on barge-in (user re-asks during a long bot reply). + // Without this list, only the queue tail past `nextStartTime` would + // be cancellable; anything already start()-ed plays to completion. + var activeSources = []; + + // Stop in-flight playback and reset the schedule cursor. Called by + // the Rust shell when the brain cancels outbound (new wake fires + // mid-reply). Returns the number of sources that were stopped, so + // the shell can log how much speech got cut. + window.__openhumanFlushAudio = function () { + var stopped = 0; + while (activeSources.length) { + var s = activeSources.pop(); + try { s.stop(); stopped++; } catch (_) {} + try { s.disconnect(); } catch (_) {} + } + if (ctx) { + nextStartTime = ctx.currentTime; + } + return stopped; + }; + // Public push API. Returns the duration in seconds the chunk added // to the queue, mostly for diagnostics; the shell ignores it. 
window.__openhumanFeedPcm = function (b64) { @@ -110,6 +133,16 @@ var src = ctx.createBufferSource(); src.buffer = buffer; src.connect(dest); + // Also pipe to the page's default audio output so the bot is + // audible on the host machine (the openhuman app's speakers). + // Without this, bot audio only flows up Meet's gUM intercept + // and the user has to be receiving the meeting on a separate + // endpoint (other browser tab / phone) to hear it. Playing + // locally too costs nothing audio-quality-wise and removes the + // "captions appear but no sound" foot-gun. Follow-up #20 + // (mute bot CEF at OS level) will re-introduce a clean off + // switch once we have a config toggle. + src.connect(ctx.destination); // Schedule strictly after the previous chunk so successive // 100 ms feeds line up gaplessly. If the queue has emptied // (caller fell behind), restart at currentTime so we don't try @@ -118,6 +151,11 @@ nextStartTime = ctx.currentTime; } src.start(nextStartTime); + activeSources.push(src); + src.onended = function () { + var idx = activeSources.indexOf(src); + if (idx !== -1) activeSources.splice(idx, 1); + }; nextStartTime += buffer.duration; // High-frequency log gated by a counter so we don't drown the // console at 10 Hz; emit ~1 in 50 frames (~5 s cadence at the diff --git a/app/src-tauri/src/meet_audio/captions_bridge.js b/app/src-tauri/src/meet_audio/captions_bridge.js index cf79bd45e3..14b52a178e 100644 --- a/app/src-tauri/src/meet_audio/captions_bridge.js +++ b/app/src-tauri/src/meet_audio/captions_bridge.js @@ -139,18 +139,35 @@ } // Auto-enable captions: walk every button on the page and click any - // that has an aria-label starting with "Turn on captions". Caps the - // attempts so we don't fight a user who deliberately disables CC. - var ENABLE_ATTEMPT_BUDGET = 30; // ~30 * 2s = 60s + // that has an aria-label matching the "turn on captions" intent. 
+ // Substring match (not prefix) — Meet rolls out variant labels + // ("Turn on captions (c)", "Turn on live captions", "Subtitles", + // "Captions") that the strict prefix-only matcher missed, forcing + // the user to click the toggle by hand. Caps attempts so a user who + // deliberately disables CC isn't fought over forever. + var ENABLE_ATTEMPT_BUDGET = 60; // ~60 * 2s = 120s — covers slow admit var enableAttempts = 0; function tryEnableCaptions() { if (enableAttempts >= ENABLE_ATTEMPT_BUDGET) return; enableAttempts++; var buttons = document.querySelectorAll("button[aria-label]"); + var ON_PATTERNS = [ + "turn on captions", + "turn on live captions", + "turn on subtitles", + "turn on closed captions", + "captions on", + "captions (c)", + "show captions", + "enable captions", + ]; + // Negative guard: never click anything that is already-on (Meet + // shows "Turn off captions" when CC is active). + var OFF_PATTERNS = ["turn off captions", "captions off", "disable captions"]; for (var i = 0; i < buttons.length; i++) { var lbl = (buttons[i].getAttribute("aria-label") || "").toLowerCase(); - // Match "Turn on captions" but NOT "Turn off captions". - if (lbl.indexOf("turn on captions") === 0 || /^turn on captions/.test(lbl)) { + if (OFF_PATTERNS.some(function (p) { return lbl.indexOf(p) >= 0; })) continue; + if (ON_PATTERNS.some(function (p) { return lbl.indexOf(p) >= 0; })) { try { buttons[i].click(); enableAttempts = ENABLE_ATTEMPT_BUDGET; // success — stop trying. diff --git a/app/src-tauri/src/meet_audio/inject.rs b/app/src-tauri/src/meet_audio/inject.rs index 312f5297b1..1891c4dcd6 100644 --- a/app/src-tauri/src/meet_audio/inject.rs +++ b/app/src-tauri/src/meet_audio/inject.rs @@ -274,3 +274,31 @@ pub async fn feed_pcm_chunk(cdp: &mut CdpConn, session: &str, pcm_b64: &str) -> } Ok(()) } + +/// Stop any in-flight audio playback inside the page bridge and reset +/// its schedule cursor. 
Called when the brain cancels outbound (user +/// re-asks during a long reply) so the previous reply's tail doesn't +/// keep playing while the new turn is dispatched. Returns the count +/// of sources that were stopped, useful for diagnostic logging. +pub async fn flush_audio_bridge(cdp: &mut CdpConn, session: &str) -> Result { + let res = cdp + .call( + "Runtime.evaluate", + json!({ + "expression": "(typeof window.__openhumanFlushAudio === 'function') ? window.__openhumanFlushAudio() : -1", + "returnByValue": true, + }), + Some(session), + ) + .await + .map_err(|e| format!("Runtime.evaluate flush: {e}"))?; + if let Some(exception) = res.get("exceptionDetails") { + return Err(format!("page exception: {exception}")); + } + let stopped = res + .get("result") + .and_then(|r| r.get("value")) + .and_then(|v| v.as_i64()) + .unwrap_or(0); + Ok(stopped) +} diff --git a/app/src-tauri/src/meet_audio/mod.rs b/app/src-tauri/src/meet_audio/mod.rs index 789dcce401..d89314159d 100644 --- a/app/src-tauri/src/meet_audio/mod.rs +++ b/app/src-tauri/src/meet_audio/mod.rs @@ -87,10 +87,15 @@ pub async fn start( app: AppHandle, request_id: String, meet_url: String, + owner_display_name: String, + bot_display_name: String, ) -> Result<(), String> { log::info!( - "[meet-audio] start request_id={request_id} url_prefix={}", - truncate_for_log(&meet_url, 64) + "[meet-audio] start request_id={request_id} url_prefix={} \ + owner_chars={} bot_chars={}", + truncate_for_log(&meet_url, 64), + owner_display_name.chars().count(), + bot_display_name.chars().count() ); if let Some(state) = app.try_state::() { @@ -104,12 +109,22 @@ pub async fn start( } // Tell core to open its session first so the very first PCM push - // doesn't race the start RPC. + // doesn't race the start RPC. Hand the call owner + bot display + // names through with the request so the core wake-word gate + // (privacy lock: only the owner can trigger tool calls) is + // active before the first caption can arrive. 
rpc_call( "openhuman.meet_agent_start_session", serde_json::json!({ "request_id": request_id, "sample_rate_hz": 16_000, + "owner_display_name": owner_display_name, + "bot_display_name": bot_display_name, + // Persisted into the recent-calls JSONL by stop_session + // so the Skills "Meeting Bots" card can show "joined + // " in the history list. The URL the shell built + // the CEF window with is the canonical value. + "meet_url": meet_url, }), ) .await?; @@ -170,7 +185,7 @@ pub async fn start( caption_listener_disabled(request_id.clone()) } }; - let speak = speak_pump::start(request_id.clone(), cdp, session); + let speak = speak_pump::start(app.clone(), request_id.clone(), cdp, session); (speak, captions) } Err(err) => { diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs index 1fab93f869..4b83a15ce1 100644 --- a/app/src-tauri/src/meet_audio/speak_pump.rs +++ b/app/src-tauri/src/meet_audio/speak_pump.rs @@ -7,9 +7,10 @@ //! the call so each tick is a single `Runtime.evaluate` round-trip //! rather than fresh attach + detach. -use std::time::Duration; +use std::time::{Duration, Instant}; use base64::{engine::general_purpose::STANDARD as B64, Engine as _}; +use tauri::{AppHandle, Emitter, Runtime}; use tokio::sync::oneshot; use tokio::time::interval; @@ -29,6 +30,21 @@ const POLL_INTERVAL: Duration = Duration::from_millis(100); /// either way. const MAX_CONSECUTIVE_FEED_ERRORS: u32 = 30; +/// How long the speaking-state event keeps reporting `speaking=true` +/// after the last non-empty PCM tick. Brain enqueues outbound in +/// chunks of ~50–200 ms and there's a gap of one or two pump ticks +/// (100 ms each) between chunks while the next batch is being +/// synthesised. Without a hangover, the mascot's mouth would flicker +/// shut every gap. 400 ms covers the typical inter-chunk silence +/// without bridging across legitimate end-of-utterance pauses. 
+const SPEAKING_HANGOVER: Duration = Duration::from_millis(400); + +/// Tauri event channel for "the bot is/isn't speaking right now". +/// Consumed by `MascotFrameProducer` (frontend) to flip the mascot +/// SVG between idle and a mouth-open / talking pose so the Meet +/// participant sees a visual cue that matches the audio they hear. +const SPEAKING_STATE_EVENT: &str = "meet-video:speaking-state"; + /// RAII handle. Drop to stop the pump task. The shutdown channel /// causes the spawned loop to exit on the next select tick. pub struct SpeakPump { @@ -45,8 +61,15 @@ impl Drop for SpeakPump { /// Spawn the speak pump for a session that already has the audio /// bridge installed. `cdp` and `session_id` come from /// [`inject::install_audio_bridge`] and are owned by the pump task -/// from this point on. -pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump { +/// from this point on. `app` is held so the pump can fire +/// `meet-video:speaking-state` events when the bot starts / stops +/// producing PCM (drives the in-Meet mascot's mouth animation). +pub fn start( + app: AppHandle, + request_id: String, + cdp: CdpConn, + session_id: String, +) -> SpeakPump { let (shutdown_tx, mut shutdown_rx) = oneshot::channel::<()>(); let request_id_for_task = request_id.clone(); tauri::async_runtime::spawn(async move { @@ -56,17 +79,30 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump tick.tick().await; let mut cdp = cdp; let mut feed_errors: u32 = 0; + // Edge-detect state for the speaking-state event. We emit on + // every flip and never on every tick — the frontend renderer + // would otherwise see a flood of redundant state updates and + // burn worker time on no-op rerenders. + let mut speaking_state = SpeakingTracker::new(); loop { tokio::select! 
{ _ = &mut shutdown_rx => { log::info!( "[meet-audio] speak pump shutdown request_id={request_id_for_task}" ); + // Make sure the mascot stops talking when the + // session ends — without this the last "speaking" + // edge would leave the mouth open for the next + // call's first frame. + speaking_state.force_off(&app, &request_id_for_task); break; } _ = tick.tick() => { - match poll_and_feed(&request_id_for_task, &mut cdp, &session_id).await { - Ok(_) => feed_errors = 0, + let had_pcm = match poll_and_feed(&request_id_for_task, &mut cdp, &session_id).await { + Ok(had) => { + feed_errors = 0; + had + } Err(err) => { feed_errors += 1; log::debug!( @@ -76,10 +112,17 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump log::warn!( "[meet-audio] speak pump giving up after {feed_errors} consecutive errors request_id={request_id_for_task}" ); + speaking_state.force_off(&app, &request_id_for_task); break; } + // A failed tick is *not* evidence the bot + // stopped speaking — leave the hangover to + // expire naturally so transient CDP errors + // don't flicker the mascot's mouth shut. + false } - } + }; + speaking_state.tick(had_pcm, &app, &request_id_for_task); } } } @@ -91,6 +134,86 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump } } +/// Edge-detector + hangover for the speaking-state event stream. +/// State machine has two reachable values (`speaking` / `idle`) and +/// flips between them only when the underlying signal sustains long +/// enough to clear the hangover, so the frontend never sees a flap +/// during the natural gap between two PCM chunks. +struct SpeakingTracker { + /// Currently-reported state. Defaults to `false` so the mascot + /// boots into the idle pose; the first `speaking=true` tick is a + /// real edge. + reported: bool, + /// Wall-clock the hangover expires. 
Set to `now + SPEAKING_HANGOVER` + /// every tick that carries PCM; the state flips back to `false` + /// only once `now > hangover_until` AND a tick with no PCM lands. + hangover_until: Option, +} + +impl SpeakingTracker { + fn new() -> Self { + Self { + reported: false, + hangover_until: None, + } + } + + /// Drive the state machine from a single pump tick. `had_pcm` + /// is whether `poll_and_feed` saw a non-empty `pcm_base64` for + /// this tick. Emits the Tauri event only when the reported + /// state actually flips. + fn tick(&mut self, had_pcm: bool, app: &AppHandle, request_id: &str) { + if had_pcm { + // Extend the hangover. If we were idle, flip up to + // speaking — the user hears audio starting now. + self.hangover_until = Some(Instant::now() + SPEAKING_HANGOVER); + self.set_reported(true, app, request_id); + return; + } + // No PCM this tick. If the hangover hasn't expired, stay in + // whatever state we were already in (typically `speaking=true` + // during the gap between two consecutive chunks). + if let Some(until) = self.hangover_until { + if Instant::now() < until { + return; + } + // Hangover elapsed; clear so we don't re-evaluate on + // every future idle tick. + self.hangover_until = None; + } + // Hangover expired or never armed → bot is genuinely idle. + self.set_reported(false, app, request_id); + } + + /// Force the reported state to `false` and emit an event if that's + /// a flip. Used on shutdown / fatal error paths so the mascot + /// can't get stuck mid-talk. 
+ fn force_off(&mut self, app: &AppHandle, request_id: &str) { + self.hangover_until = None; + self.set_reported(false, app, request_id); + } + + fn set_reported(&mut self, next: bool, app: &AppHandle, request_id: &str) { + if self.reported == next { + return; + } + self.reported = next; + let payload = serde_json::json!({ + "requestId": request_id, + "speaking": next, + }); + if let Err(err) = app.emit(SPEAKING_STATE_EVENT, payload) { + // Best-effort: a missing renderer (closed window mid-tick) + // is the common case and not worth raising the log level. + log::debug!( + "[meet-audio] speaking-state emit failed request_id={request_id} speaking={next} err={err}" + ); + } else { + log::debug!("[meet-audio] speaking-state -> {next} request_id={request_id}"); + } + } +} + /// No-op pump used when bridge install failed at session start. Keeps /// the rest of the session lifecycle uniform — `MeetAudioSession` can /// still hold a `SpeakPump` regardless of speak-path readiness. @@ -101,11 +224,14 @@ pub fn start_disabled(request_id: String) -> SpeakPump { } } +/// Run a single pump tick. Returns `true` when the tick actually +/// carried synthesized PCM (used by the caller to drive the +/// speaking-state edge detector). async fn poll_and_feed( request_id: &str, cdp: &mut CdpConn, session_id: &str, -) -> Result<(), String> { +) -> Result { let v = super::rpc_call( "openhuman.meet_agent_poll_speech", serde_json::json!({ "request_id": request_id }), @@ -119,6 +245,27 @@ async fn poll_and_feed( .get("utterance_done") .and_then(|x| x.as_bool()) .unwrap_or(false); + let flush_pending = v + .get("flush_pending") + .and_then(|x| x.as_bool()) + .unwrap_or(false); + + // Barge-in: brain set flush_pending when it cancelled the previous + // outbound. Stop in-flight playback inside the JS bridge BEFORE we + // feed the next chunk so the user hears the new reply instead of + // the tail of the old one. 
Best-effort — if the page is gone the + // flush errors and we drop through to the feed, which will fail + // the same way and trigger the same recovery path. + if flush_pending { + match inject::flush_audio_bridge(cdp, session_id).await { + Ok(stopped) => log::info!( + "[meet-audio] barge-in flush request_id={request_id} sources_stopped={stopped}" + ), + Err(e) => { + log::warn!("[meet-audio] barge-in flush failed request_id={request_id} err={e}") + } + } + } if !pcm_b64.is_empty() { // Validate decode locally before pushing — saves a round-trip @@ -131,8 +278,10 @@ async fn poll_and_feed( bytes.len() ); inject::feed_pcm_chunk(cdp, session_id, pcm_b64).await?; - } else if utterance_done { + return Ok(true); + } + if utterance_done { log::info!("[meet-audio] speak pump utterance complete request_id={request_id}"); } - Ok(()) + Ok(false) } diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs index b075405b2c..01b72f5ea4 100644 --- a/app/src-tauri/src/meet_call/mod.rs +++ b/app/src-tauri/src/meet_call/mod.rs @@ -81,7 +81,18 @@ impl Default for MeetCallState { pub struct OpenWindowArgs { pub request_id: String, pub meet_url: String, + /// Bot's Meet participant tile name — what the bot types into + /// Meet's "Your name" input. Also passed to the core wake gate + /// so the bot's own captioned TTS is filtered out as self-echo. pub display_name: String, + /// Call owner's Meet participant name — the human who launched + /// the bot. The core wake-word gate (privacy lock: only the + /// owner can trigger tool calls) compares speaker captions + /// against this value. Defaulted to empty so callers staged + /// during the rollout window keep parsing; an empty owner + /// fails closed in core (no wakes fire). + #[serde(default)] + pub owner_display_name: String, } /// Open a dedicated top-level CEF webview window pointed at the Meet URL. 
@@ -113,6 +124,25 @@ pub async fn meet_call_open_window( return Ok(label); } + // Only one meet-call window can be live at a time — concurrent bot + // sessions race the CEF audio handler registration (`listen_capture`) + // and confuse the user with multiple "Meet — OpenHuman" windows in + // their Dock. Close any stragglers from a prior Join before opening + // a fresh one. The CloseRequested handler will tear down their + // scanner + audio session via the per-window event listeners below. + let stale_labels: Vec = app + .webview_windows() + .keys() + .filter(|l| l.starts_with("meet-call-")) + .cloned() + .collect(); + for stale in stale_labels { + if let Some(window) = app.get_webview_window(&stale) { + log::info!("[meet-call] closing stale window label={stale} before new join"); + let _ = window.close(); + } + } + let data_dir = data_directory_for(&app, &request_id)?; if let Err(err) = std::fs::create_dir_all(&data_dir) { log::warn!( @@ -162,6 +192,29 @@ pub async fn meet_call_open_window( .build() .map_err(|e| format!("[meet-call] WebviewWindowBuilder.build failed: {e}"))?; + // Push the window off-screen post-build. macOS Cocoa clamps NSWindow + // frame origins to the union of all attached monitors' bounds, so + // (-30000, -30000) lands at (0, 0) on a single-display setup or on + // a secondary monitor's edge on multi-display setups. Not perfect, + // but the post-join hide() in `meet_scanner::run` is the primary + // hiding mechanism — this just keeps the brief pre-join window + // out of the user's main display where possible. + // + // We can't hide() here: a window built hidden never gives its + // renderer a backing surface, and `meet_scanner` drives the join + // via CDP `Input.dispatchMouseEvent` which requires laid-out DOM. + // Hide post-join instead. 
+ if let Err(err) = window.set_position(tauri::PhysicalPosition::new(-30000i32, -30000i32)) { + log::warn!("[meet-call] post-build set_position failed: {err}"); + } + if let Ok(pos) = window.outer_position() { + log::info!( + "[meet-call] post-build outer_position={{x:{},y:{}}} (target=-30000,-30000)", + pos.x, + pos.y + ); + } + state .inner .lock() @@ -195,10 +248,17 @@ pub async fn meet_call_open_window( let app_for_audio = app.clone(); let request_id_for_audio = request_id.clone(); let url_for_audio = parsed.to_string(); + let bot_for_audio = args.display_name.clone(); + let owner_for_audio = args.owner_display_name.clone(); tauri::async_runtime::spawn(async move { - if let Err(err) = - crate::meet_audio::start(app_for_audio, request_id_for_audio.clone(), url_for_audio) - .await + if let Err(err) = crate::meet_audio::start( + app_for_audio, + request_id_for_audio.clone(), + url_for_audio, + owner_for_audio, + bot_for_audio, + ) + .await { log::warn!( "[meet-call] meet_audio start failed request_id={request_id_for_audio} err={err}" @@ -364,7 +424,7 @@ pub async fn meet_call_close_window( Ok(false) } -fn window_label_for(request_id: &str) -> String { +pub fn window_label_for(request_id: &str) -> String { format!("meet-call-{request_id}") } diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs index 67e56f57f2..90fa49167a 100644 --- a/app/src-tauri/src/meet_scanner/mod.rs +++ b/app/src-tauri/src/meet_scanner/mod.rs @@ -35,7 +35,7 @@ use std::time::Duration; use serde_json::{json, Value}; -use tauri::{AppHandle, Runtime}; +use tauri::{AppHandle, Manager, Runtime}; use crate::cdp::{self, CdpConn}; @@ -70,7 +70,7 @@ const POLL_INTERVAL: Duration = Duration::from_millis(500); /// the scanner uses it as a target-URL prefix so two concurrent calls /// each attach to their own CEF target instead of cross-controlling. 
pub fn spawn( - _app: AppHandle, + app: AppHandle, request_id: String, meet_url: String, display_name: String, @@ -79,7 +79,21 @@ pub fn spawn( // JoinHandle whose abort_handle() we can return to the caller. let handle = tokio::spawn(async move { match run(&request_id, &meet_url, &display_name).await { - Ok(()) => log::info!("[meet-scanner] join sequence completed request_id={request_id}"), + Ok(()) => { + log::info!("[meet-scanner] join sequence completed request_id={request_id}"); + // Diagnostic build: keep the window VISIBLE post-join so + // we can verify whether the previous `window.hide()` was + // suspending the renderer enough to break the audio + + // caption bridges. Smoke shows audio_context_state stuck + // at "not-created" and no push_caption RPCs ever fire + // after hide() — both consistent with the renderer + // pausing its event loop when orderOut: lands. If the + // pipeline works with the window visible we'll restore + // hide() via a different mechanism (e.g. drag off-screen + // via Tauri set_position rather than orderOut:). + let _ = app; + let _ = request_id; + } Err(err) => { log::warn!("[meet-scanner] join sequence aborted request_id={request_id} err={err}") } @@ -99,6 +113,53 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(), let _ = cdp.call("Page.enable", json!({}), Some(&session)).await; let _ = cdp.call("Runtime.enable", json!({}), Some(&session)).await; + // Phase 0 — strip any leaked Google session cookies/cache before + // we touch the page. The vendored tauri-cef runtime does not yet + // honour our per-request_id `data_directory` as a fresh CEF + // RequestContext — webviews end up sharing the parent process's + // cookie + cache store. Without this clear, Meet recognises the + // signed-in Google account on the user's main openhuman session + // ("nikhil@tinyhumans.ai" / "Verify it's you" screen) and the bot + // never reaches the anonymous "Your name" pre-join input we drive + // in Phase 2. 
+ // + // `Network.clearBrowserCookies` + `Network.clearBrowserCache` are + // CDP-wide for the attached browser instance, so they wipe the + // session for THIS Meet target without touching the user's main + // openhuman webviews (those run in separate browser instances). + // Best-effort: if Network domain isn't enabled or CDP returns an + // error, we log and continue — the bot may still land on the + // verify screen but won't get worse than the pre-clear state. + let _ = cdp.call("Network.enable", json!({}), Some(&session)).await; + if let Err(err) = cdp + .call("Network.clearBrowserCookies", json!({}), Some(&session)) + .await + { + log::warn!("[meet-scanner] clearBrowserCookies failed: {err}"); + } else { + log::info!("[meet-scanner] cleared browser cookies for fresh anonymous session"); + } + if let Err(err) = cdp + .call("Network.clearBrowserCache", json!({}), Some(&session)) + .await + { + log::info!("[meet-scanner] clearBrowserCache skipped: {err}"); + } + // Reload the page once so Meet re-fetches from scratch without the + // user's Google session cookies. Without the reload, Meet's React + // state still holds the post-auth view; we'd be clicking buttons + // on a stale page. + if let Err(err) = cdp + .call("Page.reload", json!({"ignoreCache": true}), Some(&session)) + .await + { + log::warn!("[meet-scanner] post-cookie-clear reload failed: {err}"); + } + // Give the reloaded page a moment to settle before scanner phases + // start poking the DOM. 1.5s is comfortably above Meet's typical + // first-paint on CEF + leaves headroom for slow CI runners. + tokio::time::sleep(Duration::from_millis(1500)).await; + // Phase 1 — dismiss the device-check screen. // // Meet's exact copy varies by region/A-B test; we try the canonical @@ -122,6 +183,108 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(), // Phase 2 — type the display name. 
type_into_named_input(&mut cdp, &session, "Your name", display_name).await?; + // Phase 2.5 — ensure camera + mic are ON before Ask-to-join. + // + // Meet pre-join shows the toggle button with aria-label that + // describes the *action it performs*: "Turn on camera" when the + // camera is currently OFF, "Turn off camera" when currently ON. + // We want both ON, so we MUST only match the "Turn on …" variants. + // Matching "Turn off …" would booby-trap us: it would click an + // already-on toggle, turning it OFF — which is the bug we just + // tripped on (mic ended up muted because "Turn off microphone" + // matched and the click flipped it off). + // + // If no "Turn on …" match is found, the device is already on (or + // the page hasn't rendered the toggle yet) — log + skip silently. + // On miss, dump the current aria-labels so we can verify state and + // extend the matcher with newly observed Meet variants. + if let Err(err) = click_by_aria_label( + &mut cdp, + &session, + &["turn on camera", "turn camera on", "camera is off"], + Duration::from_secs(8), + ) + .await + { + log::info!( + "[meet-scanner] camera toggle ON not clicked (already on or label drift): {err}" + ); + dump_aria_labels(&mut cdp, &session, "camera|video").await; + } + if let Err(err) = click_by_aria_label( + &mut cdp, + &session, + &[ + "turn on microphone", + "turn microphone on", + "turn on mic", + "turn mic on", + "microphone is off", + "mic is off", + ], + Duration::from_secs(8), + ) + .await + { + log::info!("[meet-scanner] mic toggle ON not clicked (already on or label drift): {err}"); + dump_aria_labels(&mut cdp, &session, "mic|microphone|audio").await; + } + + // Phase 2.6 — force a fresh getUserMedia call by cycling mic off-on + // BEFORE Ask-to-join. + // + // Why before, not after: if Ask-to-join times out (Meet UI variant + // drift or already-joined-elsewhere) the scanner returns Err and + // any later phases never run. 
Cycling here means the gUM intercept + // gets its chance regardless of what happens at the join button — + // and pre-join is also when Meet's React happily re-acquires media + // on toggle, so this is the more reliable site anyway. + // + // Meet caches the camera + mic MediaStreams from initial page load + // (before meet_audio::inject reloaded with our bridges). Our gUM + // intercept in audio_bridge.js only fires on NEW gUM calls, so the + // cached streams keep flowing — the bot's mic stays the real OS + // microphone, the bot's camera stays the static fake-camera Y4M + // frame, and our speak_pump pushes synthesized PCM into a + // MediaStreamDestination that's never attached to any outbound + // track. Host hears the user (echo loop) instead of the bot. + // + // Click "Turn off microphone" → ~700 ms pause for React to settle → + // click whatever aria-label appears in its place ("Turn on + // microphone" or a variant). The second click triggers Meet to + // re-request via getUserMedia, which our bridge then intercepts. + if let Err(err) = click_by_aria_label( + &mut cdp, + &session, + &["turn off microphone", "turn microphone off", "turn off mic"], + Duration::from_secs(4), + ) + .await + { + log::info!("[meet-scanner] mic off-cycle skipped: {err}"); + } else { + log::info!("[meet-scanner] mic cycled off; pausing 700ms before re-arm"); + tokio::time::sleep(Duration::from_millis(700)).await; + if let Err(err) = click_by_aria_label( + &mut cdp, + &session, + &[ + "turn on microphone", + "turn microphone on", + "turn on mic", + "turn mic on", + ], + Duration::from_secs(6), + ) + .await + { + log::warn!("[meet-scanner] mic on-cycle missed (left muted!): {err}"); + dump_aria_labels(&mut cdp, &session, "mic|microphone").await; + } else { + log::info!("[meet-scanner] mic re-armed (gUM intercept should now fire)"); + } + } + // Phase 3 — request to join. 
wait_and_click_text( &mut cdp, @@ -131,9 +294,212 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(), ) .await?; + // Phase 4 — once the bot is admitted, force-enable captions. + // + // captions_bridge.js already polls every 2 s for a button whose + // aria-label STARTS with "turn on captions" (`indexOf(...) === 0`). + // That's brittle: Meet ships "Turn on captions (c)" in some regions + // (the parenthesised shortcut breaks the `=== 0` prefix-match), and + // the polling cap (30 attempts * 2 s = 60 s) can expire before a + // slow host admits the bot. Belt-and-suspenders: from the scanner + // side, wait for admission (the "Leave call" affordance) then click + // the captions toggle ourselves via the looser substring matcher. + // + // Best-effort: if any step times out, log + continue. The brain + // will simply not see captions for this session, which is no worse + // than the pre-fix state. + if let Err(err) = wait_for_admission(&mut cdp, &session).await { + log::info!("[meet-scanner] admission wait skipped: {err}"); + } else { + log::info!("[meet-scanner] bot admitted into meeting"); + if let Err(err) = click_by_aria_label( + &mut cdp, + &session, + &[ + "turn on captions", + "turn on live captions", + "turn on subtitles", + "turn on closed captions", + "captions on", + "captions (c)", + "show captions", + "enable captions", + ], + Duration::from_secs(8), + ) + .await + { + log::info!("[meet-scanner] captions toggle ON not clicked: {err}"); + dump_aria_labels(&mut cdp, &session, "caption|subtitle").await; + } + } + Ok(()) } +/// Wait until the meeting page renders the in-call control bar — the +/// signal that the host has admitted the bot from the waiting room. +/// The "Leave call" / "End call" button is the simplest stable anchor; +/// the captions and "more options" buttons exist in pre-join too. 
+async fn wait_for_admission(cdp: &mut CdpConn, session: &str) -> Result<(), String> { + const ADMISSION_BUDGET: Duration = Duration::from_secs(120); + let expression = r#" + (() => { + const all = document.querySelectorAll('button[aria-label]'); + for (const el of all) { + const a = (el.getAttribute('aria-label') || '').toLowerCase(); + if (a.includes('leave call') || a.includes('end call')) { + const rect = el.getBoundingClientRect(); + if (rect.width > 0 && rect.height > 0) return true; + } + } + return false; + })() + "#; + let deadline = tokio::time::Instant::now() + ADMISSION_BUDGET; + while tokio::time::Instant::now() < deadline { + let res = cdp + .call( + "Runtime.evaluate", + json!({ "expression": expression, "returnByValue": true }), + Some(session), + ) + .await?; + let admitted = res + .get("result") + .and_then(|r| r.get("value")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + if admitted { + return Ok(()); + } + tokio::time::sleep(Duration::from_secs(1)).await; + } + Err(format!( + "timeout ({}s) waiting for Leave-call affordance", + ADMISSION_BUDGET.as_secs() + )) +} + +/// Dump the page's aria-labels that match a JS regex pattern so we can +/// inspect what Meet actually exposes after a failed +/// [`click_by_aria_label`]. Best-effort, swallows all CDP errors. 
+///
+/// At most 24 matches are reported; each entry carries the label, tag
+/// name, `role`, `data-tooltip` and whether the element has a non-zero
+/// bounding box. `pattern` is compiled case-insensitively in the page.
+async fn dump_aria_labels(cdp: &mut CdpConn, session: &str, pattern: &str) {
+    // JSON-encode the pattern so it lands in the script as a safely
+    // quoted JS string literal.
+    let pattern_js =
+        serde_json::to_string(pattern).unwrap_or_else(|_| String::from("\"camera\""));
+    let expression = format!(
+        r#"
+        (() => {{
+            const re = new RegExp({pattern_js}, "i");
+            const nodes = document.querySelectorAll('[aria-label]');
+            const hits = [];
+            for (const el of nodes) {{
+                const aria = el.getAttribute('aria-label') || '';
+                if (!re.test(aria)) continue;
+                const tag = el.tagName.toLowerCase();
+                const role = el.getAttribute('role') || '';
+                const dataTip = el.getAttribute('data-tooltip') || '';
+                const rect = el.getBoundingClientRect();
+                const visible = rect.width > 0 && rect.height > 0;
+                hits.push({{ aria, tag, role, dataTip, visible }});
+                if (hits.length >= 24) break;
+            }}
+            return hits;
+        }})()
+        "#
+    );
+    let outcome = cdp
+        .call(
+            "Runtime.evaluate",
+            json!({ "expression": expression, "returnByValue": true }),
+            Some(session),
+        )
+        .await;
+    match outcome {
+        // Diagnostics only: a failed dump is reported and otherwise ignored.
+        Err(err) => log::info!("[meet-scanner] aria-label dump failed: {err}"),
+        Ok(reply) => {
+            if let Some(hits) = reply.get("result").and_then(|obj| obj.get("value")) {
+                log::warn!(
+                    "[meet-scanner] aria-label dump pattern={} hits={}",
+                    pattern,
+                    hits
+                );
+            }
+        }
+    }
+}
+
+/// Click a button whose `aria-label` matches one of `labels`
+/// (case-insensitive substring). Meet's camera + mic toggles have no
+/// visible text — they're icon buttons with `aria-label="Turn on
+/// camera"` etc. The existing `wait_and_click_text` matches innerText
+/// only, so we need a sibling matcher anchored on aria-label.
+///
+/// Disabled and zero-size elements are skipped; the first remaining
+/// match is scrolled into view and clicked. The page is re-probed every
+/// `POLL_INTERVAL` until `budget` elapses.
+///
+/// # Errors
+///
+/// Fails if `labels` cannot be serialised, if a CDP call fails, or if
+/// nothing was clicked within `budget`. The timeout message carries the
+/// last probe outcome, including any exception the page script raised.
+async fn click_by_aria_label(
+    cdp: &mut CdpConn,
+    session: &str,
+    labels: &[&str],
+    budget: Duration,
+) -> Result<(), String> {
+    let labels_js = serde_json::to_string(labels).map_err(|e| format!("labels json: {e}"))?;
+    // In-page probe: returns the (lower-cased) aria-label of the element
+    // it clicked, or null when nothing matched on this pass.
+    let expression = format!(
+        r#"
+        (() => {{
+            const labels = {labels_js};
+            const want = labels.map(l => l.toLowerCase());
+            const candidates = document.querySelectorAll(
+                'button, [role="button"], [aria-label]'
+            );
+            for (const el of candidates) {{
+                if (el.disabled || el.getAttribute('aria-disabled') === 'true') continue;
+                const aria = (el.getAttribute('aria-label') || '').toLowerCase();
+                if (!aria) continue;
+                if (!want.some(w => aria.includes(w))) continue;
+                const rect = el.getBoundingClientRect();
+                if (rect.width === 0 || rect.height === 0) continue;
+                el.scrollIntoView({{ block: 'center', inline: 'center' }});
+                el.click();
+                return aria;
+            }}
+            return null;
+        }})()
+        "#
+    );
+
+    let deadline = tokio::time::Instant::now() + budget;
+    let mut last_value = Value::Null;
+    while tokio::time::Instant::now() < deadline {
+        let res = cdp
+            .call(
+                "Runtime.evaluate",
+                json!({
+                    "expression": expression,
+                    "returnByValue": true,
+                    "awaitPromise": false,
+                }),
+                Some(session),
+            )
+            .await?;
+        let value = res
+            .get("result")
+            .and_then(|r| r.get("value"))
+            .cloned()
+            .unwrap_or(Value::Null);
+        // A string result means the probe clicked something.
+        if let Some(aria) = value.as_str() {
+            log::info!("[meet-scanner] clicked aria-label matching {labels:?} aria={aria}");
+            return Ok(());
+        }
+        // The probe only ever yields a string or null, so `value` alone
+        // says nothing useful on failure. When the page script threw,
+        // Runtime.evaluate reports it under `exceptionDetails` (sibling of
+        // `result`); keep that instead so the timeout error explains why
+        // no click happened rather than always printing `last=null`.
+        last_value = match res.get("exceptionDetails") {
+            Some(details) => details
+                .get("exception")
+                .and_then(|exc| exc.get("description"))
+                .or_else(|| details.get("text"))
+                .unwrap_or(details)
+                .clone(),
+            None => value,
+        };
+        tokio::time::sleep(POLL_INTERVAL).await;
+    }
+    Err(format!(
+        "timeout waiting for aria-label matching {labels:?} (last={last_value})"
+    ))
+}
+
 /// Poll CEF's target list until a page whose URL starts with `meet_url`
 /// shows up, then attach a CDP session to it.
Filtering by the full /// per-call URL prefix (rather than just the host) keeps two concurrent diff --git a/app/src/components/intelligence/IntelligenceCallsTab.tsx b/app/src/components/intelligence/IntelligenceCallsTab.tsx index cc3801ce27..5bc344a4ce 100644 --- a/app/src/components/intelligence/IntelligenceCallsTab.tsx +++ b/app/src/components/intelligence/IntelligenceCallsTab.tsx @@ -64,7 +64,13 @@ export default function IntelligenceCallsTab({ onToast }: Props) { setError(null); setSubmitting(true); try { - const result = await joinMeetCall({ meetUrl, displayName }); + // ownerDisplayName left empty here because this tab's UI is hidden + // behind a "Coming Soon" gate (see render branch below) — the call + // is dead-code-reachable only. When the tab is revived it must + // collect an owner-name input the same way `MeetingBotsCard` does + // (privacy lock for the in-call wake gate). Empty fails closed in + // core, so we're safe in the meantime. + const result = await joinMeetCall({ meetUrl, displayName, ownerDisplayName: '' }); setActiveCalls(prev => [ ...prev.filter(call => call.requestId !== result.requestId), { requestId: result.requestId, meetUrl: result.meetUrl, displayName: result.displayName }, diff --git a/app/src/components/skills/MeetingBotsCard.tsx b/app/src/components/skills/MeetingBotsCard.tsx index c93dd1f26a..3ba6e7e0df 100644 --- a/app/src/components/skills/MeetingBotsCard.tsx +++ b/app/src/components/skills/MeetingBotsCard.tsx @@ -1,20 +1,20 @@ // Meeting bots entry point on the Skills "Integrations" section. // -// Surfaces as a compact, fun banner: clicking opens a modal that wraps -// the backend mascot bot (PR tinyhumansai/backend#773). Joining a -// Google Meet kicks off the Camoufox-driven mascot in the backend, -// which streams the mascot's WebRTC video into the call as an -// anonymous guest. Zoom and Teams are shown as "coming soon" — the -// backend already routes them but returns 400 "not yet supported". 
+// Surfaces as a compact, fun banner: clicking opens a modal that opens +// a dedicated CEF webview pointed at the Meet URL. The bot's outbound +// camera is the mascot canvas (`meet_video::camera_bridge`) and its +// outbound audio is the synthesized speech pump (`meet_audio`). Zoom +// and Teams are shown as "coming soon" — only Google Meet has the CEF +// bridge pipeline today. -import { useEffect, useState } from 'react'; +import { useCallback, useEffect, useState } from 'react'; import { useT } from '../../lib/i18n/I18nContext'; import { - joinMeetingViaMascotBot, - SERVER_OVERLOADED_MESSAGE, - type MascotJoinMeetingError, + joinMeetCall, + listMeetCalls, type MascotMeetPlatform, + type MeetCallRecord, } from '../../services/meetCallService'; type Toast = { type: 'success' | 'error' | 'info'; title: string; message?: string }; @@ -41,10 +41,6 @@ const PLATFORMS: PlatformDef[] = [ }, ]; -function isMascotJoinMeetingError(err: unknown): err is MascotJoinMeetingError { - return !!err && typeof err === 'object' && 'isCapacityGated' in err && 'message' in err; -} - export default function MeetingBotsCard({ onToast }: Props) { const [open, setOpen] = useState(false); @@ -115,14 +111,45 @@ interface ModalProps { onToast?: (toast: Toast) => void; } -function MeetingBotsModal({ onClose, onToast }: ModalProps) { +export function MeetingBotsModal({ onClose, onToast }: ModalProps) { const { t } = useT(); const [platform, setPlatform] = useState('gmeet'); const [meetUrl, setMeetUrl] = useState(''); const [displayName, setDisplayName] = useState('OpenHuman'); + // Privacy lock: the bot will only react to the wake word when this + // exact name is the speaker in Meet's captions. Anyone else who + // says "hey openhuman …" is silently ignored — preventing a + // remote participant from issuing tool calls in the owner's + // name. Empty fails closed; the submit handler will surface an + // explicit error before opening the CEF window. 
+ const [ownerDisplayName, setOwnerDisplayName] = useState(''); const [submitting, setSubmitting] = useState(false); - const [capacityGated, setCapacityGated] = useState(false); const [error, setError] = useState(null); + // Recent-calls history loaded from core when the modal opens. + // `null` means "not yet fetched"; `[]` means "fetched, no rows". + // Separating the two lets the UI render a "Loading…" hint on + // first open without flashing a misleading empty state. + const [recentCalls, setRecentCalls] = useState(null); + const [recentError, setRecentError] = useState(null); + + const refreshRecentCalls = useCallback(async () => { + setRecentError(null); + try { + const rows = await listMeetCalls(20); + setRecentCalls(rows); + } catch (err) { + const message = err instanceof Error ? err.message : 'Failed to load recent calls.'; + console.warn('[meeting-bots] listMeetCalls failed:', err); + setRecentError(message); + setRecentCalls([]); + } + }, []); + + useEffect(() => { + // Fire-and-forget on mount; the modal is short-lived (closes on + // submit or Cancel) so a slow RPC here can't pile up. + void refreshRecentCalls(); + }, [refreshRecentCalls]); const selected = PLATFORMS.find(p => p.platform === platform) ?? PLATFORMS[0]; const isComingSoon = !!selected.comingSoon; @@ -139,14 +166,24 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { const handleSubmit = async (event: React.FormEvent) => { event.preventDefault(); setError(null); - setCapacityGated(false); if (isComingSoon) { setError(`${selected.label} support is coming soon.`); return; } setSubmitting(true); try { - await joinMeetingViaMascotBot({ platform, meetUrl, displayName }); + // Flow A: local CEF webview with mascot canvas + synthesized audio. + // joinMeetCall opens an off-screen CEF window per request_id, + // installs the audio/video bridges via CDP, then meet_scanner + // drives the join automatically. 
Returns once the window has + // been created — meet_audio + meet_scanner take it from there. + // + // ownerDisplayName is the privacy lock: the wake-word gate in + // the core only accepts captions whose speaker matches this + // value (case-insensitive, "(host)" / "(you)" suffix stripped). + // Anyone else in the room saying the wake phrase is dropped + // without dispatching a tool turn. + await joinMeetCall({ meetUrl, displayName, ownerDisplayName }); onToast?.({ type: 'success', title: t('skills.meetingBots.joiningTitle'), @@ -155,20 +192,9 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { setMeetUrl(''); onClose(); } catch (err) { - if (isMascotJoinMeetingError(err)) { - setCapacityGated(err.isCapacityGated); - const message = err.isCapacityGated ? SERVER_OVERLOADED_MESSAGE : err.message; - setError(message); - onToast?.({ - type: 'error', - title: err.isCapacityGated ? t('skills.meetingBots.busyTitle') : t('skills.meetingBots.couldNotStartTitle'), - message, - }); - } else { - const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart'); - setError(message); - onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message }); - } + const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart'); + setError(message); + onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message }); } finally { setSubmitting(false); } @@ -258,14 +284,33 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { /> + + {error && (
+ className="rounded-xl border border-coral-200 dark:border-coral-500/30 bg-coral-50 dark:bg-coral-500/10 px-3 py-2 text-xs text-coral-700 dark:text-coral-300"> {error}
)} @@ -279,7 +324,9 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { + + ); } + +/** + * Recent calls list rendered below the join form inside the same + * modal — same surface where the user launches a call, so they see + * their history without navigating away. Three states: + * - `rows === null` → still loading (small spinner-y hint). + * - `rows === []` → no calls yet (gentle empty state). + * - `rows.length > 0` → render a compact list, newest first. + * + * `error` is shown inline above the list when the fetch failed but + * doesn't block the form — the join path is independent. + */ +function RecentCallsSection({ + rows, + error, +}: { + rows: MeetCallRecord[] | null; + error: string | null; +}) { + return ( +
+
+

+ Recent calls + {rows && rows.length > 0 && ( + + ({rows.length}) + + )} +

+
+ + {error && ( + // Plain status text rather than role="alert" — the join form + // already owns the alert role for the modal's primary error + // surface. A failure to fetch history is informational, not + // actionable, and shouldn't collide with the form's a11y + // announcement. +

{error}

+ )} + + {rows === null ? ( +

Loading…

+ ) : rows.length === 0 ? ( +

+ No previous calls yet — your meeting history will appear here. +

+ ) : ( +
    + {rows.map(call => ( + + ))} +
+ )} +
+ ); +} + +function RecentCallRow({ call }: { call: MeetCallRecord }) { + // Show the trailing meeting code (`abc-defg-hij`) rather than the + // full URL — the URL prefix is always `https://meet.google.com/` + // and would just waste row width. + const meetingCode = (() => { + try { + const parsed = new URL(call.meet_url); + const tail = parsed.pathname.replace(/^\/+/, ''); + return tail || call.meet_url; + } catch { + return call.meet_url || '(unknown URL)'; + } + })(); + const duration = Math.max(0, Math.round(call.spoken_seconds + call.listened_seconds)); + return ( +
  • +
    + {meetingCode} + + {formatRelativeTime(call.started_at_ms)} + +
    +
    + {call.turn_count} turn{call.turn_count === 1 ? '' : 's'} + {duration}s on call +
    +
  • + ); +} + +/** + * Compact "12 min ago" / "yesterday" / "May 14" style stamp. Browser + * `Intl.RelativeTimeFormat` would be nicer but pulls a much larger + * locale data path; the targets here are short labels in a single + * surface, not a full i18n investment. + */ +function formatRelativeTime(ms: number): string { + if (!ms) return '—'; + const diff = Date.now() - ms; + if (diff < 0) return 'just now'; + const seconds = Math.floor(diff / 1000); + if (seconds < 60) return 'just now'; + const minutes = Math.floor(seconds / 60); + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + if (days === 1) return 'yesterday'; + if (days < 7) return `${days}d ago`; + try { + return new Date(ms).toLocaleDateString(undefined, { month: 'short', day: 'numeric' }); + } catch { + return '—'; + } +} diff --git a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx index 1d99f8d739..1c7c3c0b4f 100644 --- a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx +++ b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx @@ -11,7 +11,11 @@ vi.mock('../../../services/meetCallService', async () => { ); return { ...actual, - joinMeetingViaMascotBot: (...args: unknown[]) => joinMock(...args), + // Flow A: the modal submit calls joinMeetCall (CEF webview), not the + // Flow B backend joinMeetingViaMascotBot. Switched in the + // mascot-meet-flowA revival commits — kept the mock variable name + // `joinMock` to keep the diff focused on the call site swap. 
+ joinMeetCall: (...args: unknown[]) => joinMock(...args), }; }); @@ -45,8 +49,8 @@ describe('MeetingBotsCard', () => { expect(screen.queryByRole('dialog')).not.toBeInTheDocument(); }); - it('submits to joinMeetingViaMascotBot and fires a success toast', async () => { - joinMock.mockResolvedValueOnce({ success: true }); + it('submits to joinMeetCall and fires a success toast', async () => { + joinMock.mockResolvedValueOnce({ requestId: 'req-1' }); const onToast = vi.fn(); render(); @@ -54,14 +58,25 @@ describe('MeetingBotsCard', () => { fireEvent.change(screen.getByLabelText(/meeting link/i), { target: { value: 'https://meet.google.com/abc-defg-hij' }, }); + // Owner display name is now required — the wake-word gate refuses + // every caption when this is empty (privacy lock), so the submit + // button stays disabled and the test would hang on form submit + // without typing a value here. + fireEvent.change(screen.getByLabelText(/your name in the call/i), { + target: { value: 'Alice' }, + }); const form = screen.getByRole('dialog').querySelector('form')!; fireEvent.submit(form); + // Flow A's joinMeetCall takes { meetUrl, displayName, ownerDisplayName }. + // Assert on the owner name (the new privacy-lock contract) and meetUrl; + // the bot displayName is a UI-supplied default and not contract-load- + // bearing for this assertion. await vi.waitFor(() => { expect(joinMock).toHaveBeenCalledWith( expect.objectContaining({ - platform: 'gmeet', meetUrl: 'https://meet.google.com/abc-defg-hij', + ownerDisplayName: 'Alice', }) ); }); @@ -76,11 +91,11 @@ describe('MeetingBotsCard', () => { }); }); - it('surfaces a capacity-gated error inline + as an amber toast', async () => { - joinMock.mockRejectedValueOnce({ - isCapacityGated: true, - message: 'busy', - }); + // Flow A's joinMeetCall has no capacity-gated concept — any throw maps + // to the single "could not start" toast + inline alert with the error + // message. Two error cases collapsed into one in the Flow A model. 
+ it('surfaces a join error inline + as an error toast', async () => { + joinMock.mockRejectedValueOnce(new Error('Bad URL')); const onToast = vi.fn(); render(); @@ -88,25 +103,8 @@ describe('MeetingBotsCard', () => { fireEvent.change(screen.getByLabelText(/meeting link/i), { target: { value: 'https://meet.google.com/x' }, }); - fireEvent.submit(screen.getByRole('dialog').querySelector('form')!); - - await vi.waitFor(() => { - expect(onToast).toHaveBeenCalledWith( - expect.objectContaining({ type: 'error', title: expect.stringMatching(/busy/i) }) - ); - }); - // Modal stays open so the user can retry; inline alert visible. - expect(screen.getByRole('alert')).toBeInTheDocument(); - }); - - it('surfaces a non-capacity error', async () => { - joinMock.mockRejectedValueOnce({ isCapacityGated: false, message: 'Bad URL' }); - const onToast = vi.fn(); - render(); - - fireEvent.click(screen.getByTestId('meeting-bots-banner')); - fireEvent.change(screen.getByLabelText(/meeting link/i), { - target: { value: 'https://meet.google.com/x' }, + fireEvent.change(screen.getByLabelText(/your name in the call/i), { + target: { value: 'Alice' }, }); fireEvent.submit(screen.getByRole('dialog').querySelector('form')!); diff --git a/app/src/features/human/HumanPage.tsx b/app/src/features/human/HumanPage.tsx index 8def765428..d257b7b2da 100644 --- a/app/src/features/human/HumanPage.tsx +++ b/app/src/features/human/HumanPage.tsx @@ -1,5 +1,6 @@ import { useEffect, useState } from 'react'; +import { MeetingBotsModal } from '../../components/skills/MeetingBotsCard'; import { useT } from '../../lib/i18n/I18nContext'; import Conversations from '../../pages/Conversations'; import type { ToolTimelineEntry } from '../../store/chatRuntimeSlice'; @@ -21,6 +22,7 @@ const HumanPage = () => { const raw = window.localStorage.getItem(SPEAK_REPLIES_KEY); return raw === null ? 
true : raw === '1'; }); + const [joinMeetingOpen, setJoinMeetingOpen] = useState(false); useEffect(() => { window.localStorage.setItem(SPEAK_REPLIES_KEY, speakReplies ? '1' : '0'); @@ -65,6 +67,21 @@ const HumanPage = () => { {t('voice.pushToTalk')} + {/* "Send OpenHuman to a meeting" — opens the Flow A modal which spawns + an off-screen CEF webview pointed at the Meet URL with the mascot + canvas as the outbound camera and synthesized speech as the + outbound mic. The user's OS mic is never wired to the meeting. */} + + + {joinMeetingOpen && setJoinMeetingOpen(false)} />} + {/* Chat sidebar — vertically centered above the BottomTabBar (~80px). */}