From 92ad9dc3b962f4c939cf1e1170711b9520c4555c Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Fri, 22 May 2026 12:04:57 +0530 Subject: [PATCH 01/64] refactor(meeting-bots): point modal submit at Flow A (CEF webview) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The modal used to POST /mascots/join-meeting to the backend Camoufox bot (Flow B). Two production blockers there: - Firefox / Camoufox bypasses our JS getUserMedia override at the C++ native layer, so the mascot Y4M never replaces the bot's camera and the tile is a static placeholder. - Chromium / Chrome variants get rejected by Meet's anti-bot screen ("You can't join this video call") before they reach the join page. Flow A (PR #1350 + #1359) sidesteps both: it opens a dedicated, profile- isolated CEF webview on the user's machine, installs the audio + video bridges via CDP at document-start, and lets meet_scanner drive the join. The mascot canvas IS the outbound camera and the synthesized speech IS the outbound mic — the user's OS mic is never wired to the meeting. --- app/src/components/skills/MeetingBotsCard.tsx | 54 +++++++------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/app/src/components/skills/MeetingBotsCard.tsx b/app/src/components/skills/MeetingBotsCard.tsx index c93dd1f26a..f0f9301617 100644 --- a/app/src/components/skills/MeetingBotsCard.tsx +++ b/app/src/components/skills/MeetingBotsCard.tsx @@ -1,19 +1,17 @@ // Meeting bots entry point on the Skills "Integrations" section. // -// Surfaces as a compact, fun banner: clicking opens a modal that wraps -// the backend mascot bot (PR tinyhumansai/backend#773). Joining a -// Google Meet kicks off the Camoufox-driven mascot in the backend, -// which streams the mascot's WebRTC video into the call as an -// anonymous guest. Zoom and Teams are shown as "coming soon" — the -// backend already routes them but returns 400 "not yet supported". 
+// Surfaces as a compact, fun banner: clicking opens a modal that opens +// a dedicated CEF webview pointed at the Meet URL. The bot's outbound +// camera is the mascot canvas (`meet_video::camera_bridge`) and its +// outbound audio is the synthesized speech pump (`meet_audio`). Zoom +// and Teams are shown as "coming soon" — only Google Meet has the CEF +// bridge pipeline today. import { useEffect, useState } from 'react'; import { useT } from '../../lib/i18n/I18nContext'; import { - joinMeetingViaMascotBot, - SERVER_OVERLOADED_MESSAGE, - type MascotJoinMeetingError, + joinMeetCall, type MascotMeetPlatform, } from '../../services/meetCallService'; @@ -41,10 +39,6 @@ const PLATFORMS: PlatformDef[] = [ }, ]; -function isMascotJoinMeetingError(err: unknown): err is MascotJoinMeetingError { - return !!err && typeof err === 'object' && 'isCapacityGated' in err && 'message' in err; -} - export default function MeetingBotsCard({ onToast }: Props) { const [open, setOpen] = useState(false); @@ -115,13 +109,12 @@ interface ModalProps { onToast?: (toast: Toast) => void; } -function MeetingBotsModal({ onClose, onToast }: ModalProps) { +export function MeetingBotsModal({ onClose, onToast }: ModalProps) { const { t } = useT(); const [platform, setPlatform] = useState('gmeet'); const [meetUrl, setMeetUrl] = useState(''); const [displayName, setDisplayName] = useState('OpenHuman'); const [submitting, setSubmitting] = useState(false); - const [capacityGated, setCapacityGated] = useState(false); const [error, setError] = useState(null); const selected = PLATFORMS.find(p => p.platform === platform) ?? 
PLATFORMS[0]; @@ -139,14 +132,18 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { const handleSubmit = async (event: React.FormEvent) => { event.preventDefault(); setError(null); - setCapacityGated(false); if (isComingSoon) { setError(`${selected.label} support is coming soon.`); return; } setSubmitting(true); try { - await joinMeetingViaMascotBot({ platform, meetUrl, displayName }); + // Flow A: local CEF webview with mascot canvas + synthesized audio. + // joinMeetCall opens an off-screen CEF window per request_id, + // installs the audio/video bridges via CDP, then meet_scanner + // drives the join automatically. Returns once the window has + // been created — meet_audio + meet_scanner take it from there. + await joinMeetCall({ meetUrl, displayName }); onToast?.({ type: 'success', title: t('skills.meetingBots.joiningTitle'), @@ -155,20 +152,9 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { setMeetUrl(''); onClose(); } catch (err) { - if (isMascotJoinMeetingError(err)) { - setCapacityGated(err.isCapacityGated); - const message = err.isCapacityGated ? SERVER_OVERLOADED_MESSAGE : err.message; - setError(message); - onToast?.({ - type: 'error', - title: err.isCapacityGated ? t('skills.meetingBots.busyTitle') : t('skills.meetingBots.couldNotStartTitle'), - message, - }); - } else { - const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart'); - setError(message); - onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message }); - } + const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart'); + setError(message); + onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message }); } finally { setSubmitting(false); } @@ -261,11 +247,7 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) { {error && (
+ className="rounded-xl border border-coral-200 dark:border-coral-500/30 bg-coral-50 dark:bg-coral-500/10 px-3 py-2 text-xs text-coral-700 dark:text-coral-300"> {error}
)} From 8e9b7b226f30cdd1cf5a4500b2478c0d14015725 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Fri, 22 May 2026 12:05:10 +0530 Subject: [PATCH 02/64] feat(human): add join-meeting pill that opens Flow A modal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces the meeting-bots entry next to the speak-replies toggle on /human so users can dispatch the mascot directly from the chat surface without flipping to the Skills tab. Same modal, same Flow A backing — just an additional surface. --- app/src/features/human/HumanPage.tsx | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/app/src/features/human/HumanPage.tsx b/app/src/features/human/HumanPage.tsx index 8def765428..d257b7b2da 100644 --- a/app/src/features/human/HumanPage.tsx +++ b/app/src/features/human/HumanPage.tsx @@ -1,5 +1,6 @@ import { useEffect, useState } from 'react'; +import { MeetingBotsModal } from '../../components/skills/MeetingBotsCard'; import { useT } from '../../lib/i18n/I18nContext'; import Conversations from '../../pages/Conversations'; import type { ToolTimelineEntry } from '../../store/chatRuntimeSlice'; @@ -21,6 +22,7 @@ const HumanPage = () => { const raw = window.localStorage.getItem(SPEAK_REPLIES_KEY); return raw === null ? true : raw === '1'; }); + const [joinMeetingOpen, setJoinMeetingOpen] = useState(false); useEffect(() => { window.localStorage.setItem(SPEAK_REPLIES_KEY, speakReplies ? '1' : '0'); @@ -65,6 +67,21 @@ const HumanPage = () => { {t('voice.pushToTalk')} + {/* "Send OpenHuman to a meeting" — opens the Flow A modal which spawns + an off-screen CEF webview pointed at the Meet URL with the mascot + canvas as the outbound camera and synthesized speech as the + outbound mic. The user's OS mic is never wired to the meeting. */} + + + {joinMeetingOpen && setJoinMeetingOpen(false)} />} + {/* Chat sidebar — vertically centered above the BottomTabBar (~80px). */}
); } + +/** + * Recent calls list rendered below the join form inside the same + * modal — same surface where the user launches a call, so they see + * their history without navigating away. Three states: + * - `rows === null` → still loading (small spinner-y hint). + * - `rows === []` → no calls yet (gentle empty state). + * - `rows.length > 0` → render a compact list, newest first. + * + * `error` is shown inline above the list when the fetch failed but + * doesn't block the form — the join path is independent. + */ +function RecentCallsSection({ + rows, + error, +}: { + rows: MeetCallRecord[] | null; + error: string | null; +}) { + return ( +
+
+

+ Recent calls + {rows && rows.length > 0 && ( + + ({rows.length}) + + )} +

+
+ + {error && ( + // Plain status text rather than role="alert" — the join form + // already owns the alert role for the modal's primary error + // surface. A failure to fetch history is informational, not + // actionable, and shouldn't collide with the form's a11y + // announcement. +

{error}

+ )} + + {rows === null ? ( +

Loading…

+ ) : rows.length === 0 ? ( +

+ No previous calls yet — your meeting history will appear here. +

+ ) : ( +
    + {rows.map(call => ( + + ))} +
+ )} +
+ ); +} + +function RecentCallRow({ call }: { call: MeetCallRecord }) { + // Show the trailing meeting code (`abc-defg-hij`) rather than the + // full URL — the URL prefix is always `https://meet.google.com/` + // and would just waste row width. + const meetingCode = (() => { + try { + const parsed = new URL(call.meet_url); + const tail = parsed.pathname.replace(/^\/+/, ''); + return tail || call.meet_url; + } catch { + return call.meet_url || '(unknown URL)'; + } + })(); + const duration = Math.max(0, Math.round(call.spoken_seconds + call.listened_seconds)); + return ( +
  • +
    + {meetingCode} + + {formatRelativeTime(call.started_at_ms)} + +
    +
    + {call.turn_count} turn{call.turn_count === 1 ? '' : 's'} + {duration}s on call +
    +
  • + ); +} + +/** + * Compact "12 min ago" / "yesterday" / "May 14" style stamp. Browser + * `Intl.RelativeTimeFormat` would be nicer but pulls a much larger + * locale data path; the targets here are short labels in a single + * surface, not a full i18n investment. + */ +function formatRelativeTime(ms: number): string { + if (!ms) return '—'; + const diff = Date.now() - ms; + if (diff < 0) return 'just now'; + const seconds = Math.floor(diff / 1000); + if (seconds < 60) return 'just now'; + const minutes = Math.floor(seconds / 60); + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + if (days === 1) return 'yesterday'; + if (days < 7) return `${days}d ago`; + try { + return new Date(ms).toLocaleDateString(undefined, { month: 'short', day: 'numeric' }); + } catch { + return '—'; + } +} From 0e5e2ecf8b415edffdbc7adda9a4f96171af5f52 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:01:27 +0530 Subject: [PATCH 52/64] chore: apply auto-fixes --- app/src-tauri/src/meet_audio/speak_pump.rs | 4 +--- src/openhuman/meet_agent/session.rs | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs index 549a12ece4..4b83a15ce1 100644 --- a/app/src-tauri/src/meet_audio/speak_pump.rs +++ b/app/src-tauri/src/meet_audio/speak_pump.rs @@ -209,9 +209,7 @@ impl SpeakingTracker { "[meet-audio] speaking-state emit failed request_id={request_id} speaking={next} err={err}" ); } else { - log::debug!( - "[meet-audio] speaking-state -> {next} request_id={request_id}" - ); + log::debug!("[meet-audio] speaking-state -> {next} request_id={request_id}"); } } } diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs index a6164da732..8698c15e4f 100644 --- a/src/openhuman/meet_agent/session.rs +++ b/src/openhuman/meet_agent/session.rs 
@@ -871,11 +871,7 @@ mod tests { // The bot must never wake on its own voice — regardless of // the text content, including text that happens to repeat the // wake phrase. - let fired = s.note_caption( - "OpenHuman", - "hey openhuman would you like to know more", - 1, - ); + let fired = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1); assert!(!fired, "bot-self caption must be filtered"); } From 18e70f3c7e5c13452f58969e729e786ca71f758f Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:17:13 +0530 Subject: [PATCH 53/64] feat(meet-agent): CaptionOutcome enum + soft-deny + owner-grant state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `note_caption` now returns a `CaptionOutcome` enum (Ignored / WakeFired / UnauthorizedWake) so callers can branch between the silent-drop, normal-turn, and polite-refusal paths without re-doing the gate logic out-of-band. The unauthorised path only fires when the non-owner caption actually contains a wake phrase — random chatter still goes through the existing `Ignored` branch. Session gains: - `pending_unauthorized_speaker` + timestamp (2 min window) - `allowlist: HashSet<String>` of normalised speaker names - `allow_speaker(name)` adds to allowlist - `take_pending_unauthorized()` consumes the slot if fresh Wake gate now accepts owner OR any allowlisted speaker. Bot-self filter still returns Ignored (an UnauthorizedWake here would loop on the bot's own refusal caption). Tests cover non-owner soft-deny outcome, non-owner chatter still ignored, allowlist promotes a refused speaker, pending take consumes once. --- src/openhuman/meet_agent/session.rs | 249 ++++++++++++++++++++++++---- 1 file changed, 216 insertions(+), 33 deletions(-) diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs index 8698c15e4f..52119f7b99 100644 --- a/src/openhuman/meet_agent/session.rs +++ b/src/openhuman/meet_agent/session.rs @@ -10,7 +10,7 @@ //! 
live in a process-wide `OnceLock>>`. The locking //! pattern matches `meet_call::MeetCallState` on the shell side. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::{Mutex, OnceLock}; use std::time::{Instant, SystemTime, UNIX_EPOCH}; @@ -19,6 +19,33 @@ use base64::{engine::general_purpose::STANDARD as B64, Engine as _}; use super::ops::{self, Vad, VadEvent}; use super::types::{SessionEvent, SessionEventKind}; +/// What `note_caption` decided to do with a caption. Replaces the +/// prior boolean return so the RPC layer can branch between the +/// "fire a normal LLM turn", "speak a polite refusal", and "do +/// nothing" paths without re-doing the gate logic out-of-band. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CaptionOutcome { + /// Caption was dropped: not a wake, dedupped, cooled down, or + /// during a turn-in-flight. No audible response. + Ignored, + /// Wake fired and the caller should kick `brain::run_caption_turn`. + WakeFired, + /// Wake phrase was detected from someone who is not the call + /// owner (or on a session that hasn't had identities configured). + /// The caller should speak a polite refusal via + /// `brain::run_soft_deny_turn` rather than silently dropping — + /// silence makes the bot look broken; a single explicit "sorry, + /// only can ask" line clears the air and tells the owner + /// how to grant access if they'd like to. + UnauthorizedWake { speaker: String }, +} + +/// How long after a denied wake the owner has to say "allow" before +/// the grant request expires. 2 minutes is enough for a back-and-forth +/// exchange ("hey openhuman" — refusal — owner: "go ahead, let them +/// ask") without leaving the gate softened indefinitely. +const PENDING_GRANT_WINDOW_MS: u64 = 120_000; + /// Cap on the inbound buffer so a runaway shell push (e.g. shell never /// stops, brain never drains) can't grow memory unboundedly. 
30s @ 16kHz /// mono = 960 KB per session — generous for any reasonable utterance. @@ -129,6 +156,22 @@ pub struct MeetAgentSession { /// math, but the JSONL persistence layer needs an absolute /// timestamp that can be sorted across process restarts. started_at_ms: u64, + /// Normalised name of the most recent non-owner speaker that + /// tripped the wake word. Recorded so the owner can grant them + /// access by saying "allow" / "let them" / "go ahead" within + /// `PENDING_GRANT_WINDOW_MS` of the refusal. Cleared once a + /// grant lands or the window elapses. + pending_unauthorized_speaker: Option, + /// Wall-clock ms when `pending_unauthorized_speaker` was set. + /// The owner has `PENDING_GRANT_WINDOW_MS` from this point to + /// approve the asker. + pending_unauthorized_at_ms: u64, + /// Speakers (normalised display names) the owner has explicitly + /// allowed to wake the bot during this call. Wake gate accepts + /// captions whose speaker matches the owner OR appears here. + /// Resets on `stop_session` (the registry drops the whole + /// session). Empty by default — grants are opt-in per call. + allowlist: HashSet, } impl MeetAgentSession { @@ -160,7 +203,45 @@ impl MeetAgentSession { .duration_since(UNIX_EPOCH) .map(|d| d.as_millis() as u64) .unwrap_or(0), + pending_unauthorized_speaker: None, + pending_unauthorized_at_ms: 0, + allowlist: HashSet::new(), + } + } + + /// Add a speaker to the per-call allowlist. The wake gate + /// thereafter accepts captions from this speaker just like it + /// would from the owner — single source of truth so the + /// granted user can ask follow-up questions without saying + /// "allow" each time. Stored using the normalised name so + /// Meet's punctuation/case jitter doesn't reset the grant. 
+ pub fn allow_speaker(&mut self, speaker_display_name: &str) { + let norm = normalise_participant_name(speaker_display_name); + if !norm.is_empty() { + self.allowlist.insert(norm); + } + } + + /// Consume the pending unauthorized speaker if still inside the + /// grant window. Returns the display name (in its normalised + /// form) so the brain layer can both grant them access and name + /// them in the spoken confirmation ("Okay, can ask me"). + /// Returns `None` when no pending grant exists or the window + /// has already elapsed. + pub fn take_pending_unauthorized(&mut self) -> Option { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + let candidate = self.pending_unauthorized_speaker.take()?; + if now_ms.saturating_sub(self.pending_unauthorized_at_ms) > PENDING_GRANT_WINDOW_MS { + // Stale grant — drop without surfacing. The owner would + // need to re-trigger the refusal flow to re-arm. + self.pending_unauthorized_at_ms = 0; + return None; } + self.pending_unauthorized_at_ms = 0; + Some(candidate) } /// Record the Meet URL the call joined. Stored alongside the @@ -238,9 +319,9 @@ impl MeetAgentSession { /// the match in the same caption is treated as the start of the /// prompt; subsequent captions append until `take_pending_prompt` /// drains. - pub fn note_caption(&mut self, speaker: &str, text: &str, ts_ms: u64) -> bool { + pub fn note_caption(&mut self, speaker: &str, text: &str, ts_ms: u64) -> CaptionOutcome { if text.trim().is_empty() { - return false; + return CaptionOutcome::Ignored; } // Drop noise captions from Meet's local-user / UI affordances. // `speaker=="You"` is Meet's label for the local participant @@ -252,7 +333,7 @@ impl MeetAgentSession { // eating the prompt budget and producing endless speech. 
let speaker_lower = speaker.trim().to_lowercase(); if speaker_lower == "you" || speaker_lower.is_empty() { - return false; + return CaptionOutcome::Ignored; } // Privacy gate — owner-only wake. // @@ -280,7 +361,7 @@ impl MeetAgentSession { // owner check so a (very contrived) bot_display_name == // owner_display_name still doesn't let the bot wake itself. if !bot_norm.is_empty() && speaker_norm == bot_norm { - return false; + return CaptionOutcome::Ignored; } // Fail-closed when no owner has been configured. A live // session without a known owner is by definition unsafe — @@ -293,9 +374,44 @@ impl MeetAgentSession { self.request_id, speaker ); - return false; + return CaptionOutcome::Ignored; } - if speaker_norm != owner_norm { + // Treat owner + previously-granted allowlist members as + // authorised speakers for wake purposes. The allowlist is + // populated when the owner says "allow them" / "go ahead" + // / "let them ask" after a non-owner wake refusal — see + // `brain::run_caption_turn`'s grant-intent branch. + let speaker_is_authorised = + speaker_norm == owner_norm || self.allowlist.contains(&speaker_norm); + if !speaker_is_authorised { + // Walk the caption to see if it actually carries a wake + // phrase. Random conversation from a non-owner shouldn't + // trigger the polite refusal — only an attempt to wake + // the bot does. Mirrors the matcher used in the owner + // path below; intentionally duplicated rather than + // refactored to a shared helper so the (currently small) + // unauthorised-path stays self-contained. 
+ let normalized_for_match = normalize_for_wake(text); + const WAKE_PHRASES: &[&str] = &[ + "hey open human", + "hi open human", + "hello open human", + "hey openhuman", + "hi openhuman", + "hello openhuman", + "open human", + "openhuman", + ]; + let mut hit = false; + for phrase in WAKE_PHRASES { + if normalized_for_match.contains(phrase) { + hit = true; + break; + } + } + if !hit { + return CaptionOutcome::Ignored; + } // Audit-style log so dev:app stdout makes the rejection // visible without leaking the caption body verbatim // (preview capped, matches the wake-preview style used @@ -309,7 +425,19 @@ impl MeetAgentSession { self.owner_display_name, preview ); - return false; + // Record the pending grant request. The owner has + // PENDING_GRANT_WINDOW_MS to approve them via the + // "allow" / "let them" / "go ahead" pattern; after that + // the slot expires and the unauthorised speaker has to + // re-trigger the refusal to re-arm. + self.pending_unauthorized_speaker = Some(speaker.trim().to_string()); + self.pending_unauthorized_at_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + return CaptionOutcome::UnauthorizedWake { + speaker: speaker.trim().to_string(), + }; } // Per-speaker dedup. 
Meet's CC region re-renders the same line // every 250 ms poll tick and emits BOTH speaker rows on each @@ -328,7 +456,7 @@ impl MeetAgentSession { let normalised = normalise_for_dedup(text); if let Some(prev) = self.last_caption_by_speaker.get(&key) { if prev == &normalised { - return false; + return CaptionOutcome::Ignored; } } self.last_caption_by_speaker.insert(key, normalised); @@ -345,7 +473,7 @@ impl MeetAgentSession { SessionEventKind::Heard, format!("{speaker}: {text} (suppressed: turn_in_progress)"), ); - return false; + return CaptionOutcome::Ignored; } self.last_caption_ts_ms = ts_ms; // Already collecting after a previous wake word: just append @@ -358,7 +486,7 @@ impl MeetAgentSession { self.pending_prompt.push(' '); } self.pending_prompt.push_str(text.trim()); - return false; + return CaptionOutcome::Ignored; } // Min-turn-gap backstop. Even if the page-side caption // cooldown window expires, refuse to start a new turn @@ -381,7 +509,7 @@ impl MeetAgentSession { MIN_TURN_GAP_MS ), ); - return false; + return CaptionOutcome::Ignored; } // In cooldown after a recent turn — Meet keeps the same // utterance visible for several seconds, so without this @@ -397,7 +525,7 @@ impl MeetAgentSession { format!("{speaker}: {text}") }, ); - return false; + return CaptionOutcome::Ignored; } // Normalize before matching: Meet's STT punctuates the wake // phrase ("hey, openhuman"), capitalizes mid-sentence, and @@ -440,7 +568,7 @@ impl MeetAgentSession { SessionEventKind::Note, format!("wake word from speaker={speaker}"), ); - return true; + return CaptionOutcome::WakeFired; } // Outside a wake context, just record the line for the // transcript log. 
Useful for debugging "why didn't the agent @@ -454,7 +582,7 @@ impl MeetAgentSession { format!("{speaker}: {text}") }, ); - false + CaptionOutcome::Ignored } /// Drain the assembled wake-word prompt and clear the active @@ -806,8 +934,8 @@ mod tests { fn note_caption_handles_punctuated_wake() { let mut s = session_with_owner_alice(); // Meet often inserts a comma after "hey". - let fired = s.note_caption("Alice", "Hey, OpenHuman remember the launch", 1); - assert!(fired, "punctuated wake phrase should still fire"); + let outcome = s.note_caption("Alice", "Hey, OpenHuman remember the launch", 1); + assert_eq!(outcome, CaptionOutcome::WakeFired); let prompt = s.take_pending_prompt().expect("prompt drained"); assert_eq!(prompt, "remember the launch"); } @@ -815,8 +943,8 @@ mod tests { #[test] fn note_caption_handles_split_brand() { let mut s = session_with_owner_alice(); - let fired = s.note_caption("Alice", "hey open-human, send the report", 1); - assert!(fired); + let outcome = s.note_caption("Alice", "hey open-human, send the report", 1); + assert_eq!(outcome, CaptionOutcome::WakeFired); let prompt = s.take_pending_prompt().expect("prompt drained"); assert_eq!(prompt, "send the report"); } @@ -825,9 +953,13 @@ mod tests { fn note_caption_does_not_double_fire_on_growing_caption() { let mut s = session_with_owner_alice(); let first = s.note_caption("Alice", "hey openhuman take notes", 1); - assert!(first); + assert_eq!(first, CaptionOutcome::WakeFired); let second = s.note_caption("Alice", "hey openhuman take notes about the launch", 2); - assert!(!second, "second caption while wake_active must not refire"); + assert_eq!( + second, + CaptionOutcome::Ignored, + "second caption while wake_active must not refire" + ); let prompt = s.take_pending_prompt().expect("prompt drained"); // First wake stripped "hey openhuman"; the continuation // appended the WHOLE growing caption (still containing "hey @@ -858,21 +990,41 @@ mod tests { fn 
note_caption_rejects_non_owner_speaker() { let mut s = session_with_owner_alice(); // Bob is in the room but not the owner; even with a perfect - // wake phrase the gate must refuse. - let fired = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1); - assert!(!fired, "non-owner must not wake the bot"); + // wake phrase the gate must refuse with a soft-deny outcome + // (so the bot can speak a polite refusal) rather than + // silently ignoring. + let outcome = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1); + assert_eq!( + outcome, + CaptionOutcome::UnauthorizedWake { speaker: "Bob".into() }, + "non-owner wake must produce an UnauthorizedWake outcome" + ); + // Soft-deny path doesn't drain the wake prompt — the brain + // only synthesises a canned refusal line. assert!(s.take_pending_prompt().is_none()); } + #[test] + fn note_caption_non_owner_without_wake_phrase_is_ignored() { + // Random chatter from a non-owner shouldn't trigger the + // refusal — only an actual attempt to wake the bot does. + let mut s = session_with_owner_alice(); + let outcome = s.note_caption("Bob", "hey did you watch the game last night", 1); + assert_eq!(outcome, CaptionOutcome::Ignored); + } + #[test] fn note_caption_rejects_bot_self_caption() { let mut s = session_with_owner_alice(); // Meet often re-captions the bot's own TTS in the same region. // The bot must never wake on its own voice — regardless of // the text content, including text that happens to repeat the - // wake phrase. - let fired = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1); - assert!(!fired, "bot-self caption must be filtered"); + // wake phrase. Bot-self caption is `Ignored` (no audible + // response at all) rather than `UnauthorizedWake` — surfacing + // a soft-deny here would create an infinite loop where the + // refusal triggers its own bot-self caption. 
+ let outcome = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1); + assert_eq!(outcome, CaptionOutcome::Ignored); } #[test] @@ -881,8 +1033,8 @@ mod tests { // speaker. Mirrors the misconfigured-launch posture: better // silent failure than an open mic for the user's tool surface. let mut s = MeetAgentSession::new("p".into(), 16_000); - let fired = s.note_caption("Alice", "hey openhuman do the thing", 1); - assert!(!fired, "empty owner must fail-closed"); + let outcome = s.note_caption("Alice", "hey openhuman do the thing", 1); + assert_eq!(outcome, CaptionOutcome::Ignored); } #[test] @@ -892,8 +1044,8 @@ mod tests { // gate still recognises Alice when Meet renders her as // "Alice (host)". let mut s = session_with_owner_alice(); - let fired = s.note_caption("Alice (host)", "hey openhuman take a note", 1); - assert!(fired, "owner with parenthetical decorator must match"); + let outcome = s.note_caption("Alice (host)", "hey openhuman take a note", 1); + assert_eq!(outcome, CaptionOutcome::WakeFired); } #[test] @@ -902,8 +1054,39 @@ mod tests { // entered in lowercase, or vice versa. The comparison must // be case-insensitive. let mut s = session_with_owner_alice(); - let fired = s.note_caption("ALICE", "hey openhuman summarise", 1); - assert!(fired, "owner match must be case-insensitive"); + let outcome = s.note_caption("ALICE", "hey openhuman summarise", 1); + assert_eq!(outcome, CaptionOutcome::WakeFired); + } + + #[test] + fn allowlist_grants_subsequent_wakes() { + // After the owner grants Bob via `allow_speaker`, Bob's + // next wake-phrase caption should fire just like the + // owner's — no soft-deny, no Ignored. + let mut s = session_with_owner_alice(); + // First attempt without a grant is soft-deny: + let denied = s.note_caption("Bob", "hey openhuman read slack", 1); + assert!(matches!(denied, CaptionOutcome::UnauthorizedWake { .. })); + // Owner grants Bob: + s.allow_speaker("Bob"); + // Bob now wakes successfully. 
Use a different text so the + // per-speaker dedup doesn't reject it. + let granted = s.note_caption("Bob", "hey openhuman what's the weather", 2); + assert_eq!(granted, CaptionOutcome::WakeFired); + } + + #[test] + fn take_pending_unauthorized_returns_within_window() { + // The soft-deny path records the speaker so the owner can + // grant them shortly after. Inside the window we get the + // name back; we'd need to fast-forward time to test the + // expiry path, so just assert the in-window happy path here. + let mut s = session_with_owner_alice(); + let _ = s.note_caption("Bob", "hey openhuman list my emails", 1); + let pending = s.take_pending_unauthorized(); + assert_eq!(pending.as_deref(), Some("Bob")); + // Consumed — second take returns None. + assert!(s.take_pending_unauthorized().is_none()); } #[test] From 1c29cff89e33ad56371f2cd624a6a34168e7ac83 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:17:25 +0530 Subject: [PATCH 54/64] feat(meet-agent): run_soft_deny_turn + run_grant_turn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new short brain paths that bypass the orchestrator agent: `run_soft_deny_turn` synthesises a canned refusal line ("Sorry <asker>, only <owner> can ask me things here. <owner>, say 'allow' to let them in.") and enqueues it as a normal TTS reply. Cancels any prior outbound first so the refusal doesn't queue behind a half-drained turn. Stamps turn-done so the min-turn-gap backstop also covers refusals — a chatty non-owner can't spam the gate every few seconds. `run_grant_turn` adds the previously-refused speaker to the session's per-call allowlist, speaks a short confirmation ("Okay, Bob can ask me now."), and clears the wake_active / turn_in_progress flags so the grantee's next caption can fire a fresh turn rather than coalescing into this one. `run_caption_turn` checks `looks_like_grant_intent` at the top of the prompt. 
If a pending unauthorised speaker exists within the 2-min grant window, the turn branches into `run_grant_turn` instead of the orchestrator. No pending request → fall through to the normal LLM path, so the model can still answer if the owner uses the same vocabulary in an unrelated query. Tests cover the canned message templates, the grant-intent matcher (accepts canonical phrases including "yes go ahead", "let them in"; rejects mid-prompt false positives like "did i allow that meeting"). --- src/openhuman/meet_agent/brain.rs | 228 ++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs index ae7ca1d4ab..dbe0ed9cc2 100644 --- a/src/openhuman/meet_agent/brain.rs +++ b/src/openhuman/meet_agent/brain.rs @@ -118,6 +118,150 @@ const MIN_TURN_SAMPLES: usize = 4_000; /// the ops boundary check rejects anything else outright. const SAMPLE_RATE_HZ: u32 = super::ops::REQUIRED_SAMPLE_RATE; +/// Spoken refusal when a non-owner trips the wake word. Built per +/// call from the configured owner display name so the audible +/// response names the actual person who has the keys, and tells +/// the owner the magic word ("allow") to grant access. Kept short +/// so it doesn't drown the conversation. +fn soft_deny_message(asker: &str, owner: &str) -> String { + let asker = asker.trim(); + let owner = owner.trim(); + match (asker.is_empty(), owner.is_empty()) { + (true, true) => "Sorry, I only respond to my owner.".to_string(), + (true, false) => format!( + "Sorry, only {owner} can ask me things in this call. {owner}, say 'allow' if you'd like me to answer." + ), + (false, true) => format!("Sorry {asker}, I only respond to my owner."), + (false, false) => format!( + "Sorry {asker}, only {owner} can ask me things here. {owner}, say 'allow' to let them in." + ), + } +} + +/// Recognise an "open the gate" intent from the owner's first words +/// after the wake phrase. 
Conservative: only fires when the prompt +/// begins with one of the canonical permit verbs so an unrelated +/// owner query that happens to contain "allow" or "yes" deeper in +/// the sentence isn't hijacked. +/// +/// Returns `true` when the owner is explicitly granting access to +/// the most-recently-refused asker. The caller still gates on +/// session-level state (`take_pending_unauthorized`) — without a +/// pending request the intent is meaningless and the prompt should +/// just run as a normal LLM turn. +fn looks_like_grant_intent(prompt: &str) -> bool { + let p = prompt.trim().to_ascii_lowercase(); + if p.is_empty() { + return false; + } + // Whole-prompt matches first so short approvals ("allow", "yes") + // don't collide with longer prompts that happen to start with + // the same word. + matches!(p.as_str(), "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit") + || p.starts_with("allow ") + || p.starts_with("let them") + || p.starts_with("let him") + || p.starts_with("let her") + || p.starts_with("go ahead") + || p.starts_with("yes go ahead") + || p.starts_with("yes let") + || p.starts_with("permit ") + || p.starts_with("you can answer") + || p.starts_with("you can tell") +} + +/// Owner-grant path: the owner said "allow them" / "go ahead" / +/// "let them in" after a non-owner's wake refusal. Add the +/// previously-refused speaker to the per-call allowlist (so their +/// next wake fires through to the orchestrator), and speak a +/// short confirmation so they know they're in. 
+pub async fn run_grant_turn(request_id: &str, grantee: &str) -> Result { + let grantee = grantee.trim(); + let message = if grantee.is_empty() { + "Okay, you can ask me now.".to_string() + } else { + format!("Okay, {grantee} can ask me now.") + }; + log::info!("[meet-agent] grant request_id={request_id} grantee=\"{grantee}\""); + // Apply the grant on the session BEFORE speaking — if TTS races + // and the grantee re-asks during synthesis, we want their next + // wake to fire through. Also cancel any prior outbound so the + // confirmation doesn't queue behind a half-drained refusal. + let _ = registry().with_session(request_id, |s| { + s.allow_speaker(grantee); + s.cancel_outbound(); + }); + let samples = match tts(&message).await { + Ok(samples) => samples, + Err(err) => { + log::warn!("[meet-agent] grant TTS failed request_id={request_id} err={err}"); + stub_tts(&message).await + } + }; + registry().with_session(request_id, |s| { + s.record_event( + SessionEventKind::Note, + format!("owner granted wake access to {grantee}"), + ); + s.record_event(SessionEventKind::Spoke, message.clone()); + if !samples.is_empty() { + s.enqueue_outbound_pcm(&samples, true); + } + // Clear the wake_active + turn_in_progress flags so the + // next caption (likely the grantee's actual question) can + // fire a new turn. Without this, the wake state from the + // owner's "allow them" prompt would coalesce the grantee's + // first real caption into a continuation of this grant turn. + s.wake_active = false; + s.turn_in_progress = false; + s.mark_turn_done(); + })?; + Ok(true) +} + +/// Soft-deny path: kick a polite refusal TTS reply when the wake +/// word fires from a non-owner. Does NOT touch the orchestrator +/// agent (no tool calls, no memory writes) — it's a single canned +/// line, so the failure modes are limited to TTS errors. 
+/// +/// The session has already recorded the pending grant request +/// inside `note_caption`, so all this routine has to do is +/// synthesize + enqueue the line + log a transcript event. +pub async fn run_soft_deny_turn(request_id: &str, asker: &str) -> Result { + let owner = registry() + .with_session(request_id, |s| s.owner_display_name().to_string()) + .unwrap_or_default(); + let message = soft_deny_message(asker, &owner); + log::info!( + "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\"" + ); + // Cancel any prior outbound so the refusal doesn't queue behind a + // half-drained reply from a previous turn. + let _ = registry().with_session(request_id, |s| s.cancel_outbound()); + let samples = match tts(&message).await { + Ok(samples) => samples, + Err(err) => { + log::warn!("[meet-agent] soft-deny TTS failed request_id={request_id} err={err}"); + stub_tts(&message).await + } + }; + registry().with_session(request_id, |s| { + s.record_event( + SessionEventKind::Note, + format!("soft-deny: {asker} attempted wake without owner approval"), + ); + s.record_event(SessionEventKind::Spoke, message.clone()); + if !samples.is_empty() { + s.enqueue_outbound_pcm(&samples, true); + } + // Stamp turn-done so the min-turn-gap backstop covers the + // refusal the same way it covers a real reply. Without this, + // a chatty non-owner could re-trip the gate every few seconds. + s.mark_turn_done(); + })?; + Ok(true) +} + /// Caption-driven turn. Drains the session's pending wake-word prompt /// (assembled by `session::note_caption`) and runs LLM → TTS → enqueue /// outbound. Skips STT entirely — the captions are already text. @@ -174,6 +318,29 @@ pub async fn run_caption_turn(request_id: &str) -> Result { was_bare_wake, ); + // Grant-intent fast path. 
When the owner says "hey openhuman, + // allow them" / "let them in" / "go ahead" after a non-owner + // wake refusal, treat the turn as a single-shot session-level + // grant rather than handing the prompt to the orchestrator. + // The pending grantee was captured by `note_caption` at refusal + // time and lives on the session for `PENDING_GRANT_WINDOW_MS`. + if !was_bare_wake && looks_like_grant_intent(&prompt) { + let pending = registry() + .with_session(request_id, |s| s.take_pending_unauthorized()) + .ok() + .flatten(); + if let Some(grantee) = pending { + return run_grant_turn(request_id, &grantee).await; + } + // No pending request to grant — fall through to the normal + // LLM path. The model can interpret "allow" however it + // wants from there; without a pending grantee we have no + // session-level meaning to attach to it. + log::info!( + "[meet-agent] grant-intent prompt detected but no pending request — falling through request_id={request_id}" + ); + } + // Pre-roll filler. The orchestrator + integration tools take // 30–60s on slow paths (Slack / Gmail / Calendar). Without an // immediate acoustic cue, the user assumes the bot is broken and @@ -1094,4 +1261,65 @@ mod tests { assert_eq!(strip_for_speech(""), ""); assert_eq!(strip_for_speech(" \n "), ""); } + + #[test] + fn soft_deny_message_names_both_owner_and_asker() { + let line = soft_deny_message("Bob", "Alice"); + assert!(line.contains("Bob"), "must address the asker: {line}"); + assert!(line.contains("Alice"), "must name the owner: {line}"); + assert!(line.to_lowercase().contains("allow"), "must hint the magic word: {line}"); + } + + #[test] + fn soft_deny_message_handles_missing_names_gracefully() { + // No asker, no owner — should still be a polite English sentence, + // not a templated stub with empty placeholders. 
+ let line = soft_deny_message("", ""); + assert!(!line.is_empty()); + assert!(!line.contains("{"), "must not leak format placeholders: {line}"); + } + + #[test] + fn looks_like_grant_intent_accepts_canonical_phrases() { + // Whole-prompt approvals. + for phrase in ["allow", "yes", "ok", "okay", "go ahead", "permit"] { + assert!( + looks_like_grant_intent(phrase), + "must accept bare approval phrase: {phrase}" + ); + } + // Common longer forms. + for phrase in [ + "allow them", + "allow Bob to ask", + "let them in", + "let them ask", + "let her ask", + "go ahead and answer them", + "yes go ahead", + "permit Bob", + "you can tell Bob", + ] { + assert!(looks_like_grant_intent(phrase), "should accept: {phrase}"); + } + } + + #[test] + fn looks_like_grant_intent_rejects_unrelated_prompts() { + // Words that happen to contain "allow" / "yes" mid-prompt + // shouldn't hijack a normal question — the matcher only + // honors prompts that BEGIN with a permit verb. + for phrase in [ + "what's on my calendar today", + "did i allow that meeting earlier", + "yesterday's notes please", + "remind me to ok the budget", + "permittivity of free space", + ] { + assert!( + !looks_like_grant_intent(phrase), + "must not match unrelated prompt: {phrase}" + ); + } + } } From 4e52b0ac18a811eaf0617c63bf859bf26d6a5dec Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:17:33 +0530 Subject: [PATCH 55/64] feat(meet-agent): dispatch CaptionOutcome to soft-deny / wake / ignore `handle_push_caption` now switches on the `CaptionOutcome` enum returned by `session::note_caption`. `WakeFired` spawns the existing `run_caption_turn`; `UnauthorizedWake` spawns the new `run_soft_deny_turn` (passing the asker's display name so the spoken refusal can address them by name); `Ignored` is a no-op. `turn_started` in the response stays true only for `WakeFired` so the existing shell-side UI hints don't see a refusal as an authorised turn. 
--- src/openhuman/meet_agent/rpc.rs | 62 ++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs index e89a9aaec7..5cb7e9d93d 100644 --- a/src/openhuman/meet_agent/rpc.rs +++ b/src/openhuman/meet_agent/rpc.rs @@ -18,7 +18,7 @@ use crate::rpc::RpcOutcome; use super::brain; use super::ops::VadEvent; -use super::session::registry; +use super::session::{registry, CaptionOutcome}; use super::store::{self, MeetCallRecord}; use super::types::{ ListCallsRequest, ListCallsResponse, PollSpeechRequest, PushCaptionRequest, @@ -110,35 +110,63 @@ pub async fn handle_push_caption(params: Map) -> Result can ask" and the owner is told + // how to grant them access + // - Ignored → no audible response + let turn_started = matches!(outcome, CaptionOutcome::WakeFired); + match outcome { + CaptionOutcome::WakeFired => { + log::info!( + "{LOG_PREFIX} wake word fired request_id={} speaker={}", + req.request_id, + req.speaker + ); + let request_id = req.request_id.clone(); + tokio::spawn(async move { + if let Err(err) = brain::run_caption_turn(&request_id).await { + log::warn!( + "{LOG_PREFIX} caption-turn failed request_id={request_id} err={err}" + ); + } + }); + } + CaptionOutcome::UnauthorizedWake { speaker } => { + log::info!( + "{LOG_PREFIX} unauthorized wake — soft-deny turn request_id={} speaker={}", + req.request_id, + speaker + ); + let request_id = req.request_id.clone(); + tokio::spawn(async move { + if let Err(err) = brain::run_soft_deny_turn(&request_id, &speaker).await { + log::warn!( + "{LOG_PREFIX} soft-deny turn failed request_id={request_id} err={err}" + ); + } + }); + } + CaptionOutcome::Ignored => {} } RpcOutcome::new( json!({ "ok": true, - "turn_started": wake_fired, + "turn_started": turn_started, }), vec![], ) From 7969b4d9ebd013eae0df7ed8bfa8d53fa256c4e9 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:18:40 +0530 Subject: [PATCH 56/64] chore: 
apply auto-fixes --- src/openhuman/meet_agent/brain.rs | 16 ++++++++++++---- src/openhuman/meet_agent/session.rs | 4 +++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs index dbe0ed9cc2..30099aef75 100644 --- a/src/openhuman/meet_agent/brain.rs +++ b/src/openhuman/meet_agent/brain.rs @@ -157,8 +157,10 @@ fn looks_like_grant_intent(prompt: &str) -> bool { // Whole-prompt matches first so short approvals ("allow", "yes") // don't collide with longer prompts that happen to start with // the same word. - matches!(p.as_str(), "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit") - || p.starts_with("allow ") + matches!( + p.as_str(), + "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit" + ) || p.starts_with("allow ") || p.starts_with("let them") || p.starts_with("let him") || p.starts_with("let her") @@ -1267,7 +1269,10 @@ mod tests { let line = soft_deny_message("Bob", "Alice"); assert!(line.contains("Bob"), "must address the asker: {line}"); assert!(line.contains("Alice"), "must name the owner: {line}"); - assert!(line.to_lowercase().contains("allow"), "must hint the magic word: {line}"); + assert!( + line.to_lowercase().contains("allow"), + "must hint the magic word: {line}" + ); } #[test] @@ -1276,7 +1281,10 @@ mod tests { // not a templated stub with empty placeholders. 
let line = soft_deny_message("", ""); assert!(!line.is_empty()); - assert!(!line.contains("{"), "must not leak format placeholders: {line}"); + assert!( + !line.contains("{"), + "must not leak format placeholders: {line}" + ); } #[test] diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs index 52119f7b99..acb92fe59b 100644 --- a/src/openhuman/meet_agent/session.rs +++ b/src/openhuman/meet_agent/session.rs @@ -996,7 +996,9 @@ mod tests { let outcome = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1); assert_eq!( outcome, - CaptionOutcome::UnauthorizedWake { speaker: "Bob".into() }, + CaptionOutcome::UnauthorizedWake { + speaker: "Bob".into() + }, "non-owner wake must produce an UnauthorizedWake outcome" ); // Soft-deny path doesn't drain the wake prompt — the brain From 62a70b48c7a64e7c5a2f6dc5e7e2d73422871196 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:23:21 +0530 Subject: [PATCH 57/64] fix(skills): drop {label} placeholder from sendTo/comingSoon i18n strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Meeting Bots modal's submit button renders the platform label by string-concatenating the translation with `selected.label` (\`\${t('sendTo')} \${selected.label}\` / \`\${selected.label} \${t('comingSoon')}\`). The base `t()` does not interpolate, so en/ko translations that embedded `{label}` showed up verbatim — "Send to {label} Google Meet" and "{label} coming soon" — instead of the intended interpolation. All other locale chunks already use bare "Send to" / "Coming soon" strings to match the concat pattern. Bring en + ko in line so the button reads correctly in those locales too. 
--- app/src/lib/i18n/en.ts | 4 ++-- app/src/lib/i18n/ko.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app/src/lib/i18n/en.ts b/app/src/lib/i18n/en.ts index 9ac18e4cf8..59ad212498 100644 --- a/app/src/lib/i18n/en.ts +++ b/app/src/lib/i18n/en.ts @@ -2109,7 +2109,7 @@ const en: TranslationMap = { 'OpenHuman joins calls silently and handles notes and follow-ups automatically.', 'skills.meetingBots.bannerTitle': 'Send OpenHuman to your next meeting', 'skills.meetingBots.busyTitle': 'OpenHuman is busy', - 'skills.meetingBots.comingSoon': '{label} coming soon', + 'skills.meetingBots.comingSoon': 'coming soon', 'skills.meetingBots.couldNotStartTitle': 'Could not start OpenHuman', 'skills.meetingBots.displayName': 'Display name', 'skills.meetingBots.failedToStart': 'Failed to start OpenHuman.', @@ -2121,7 +2121,7 @@ const en: TranslationMap = { 'OpenHuman will join as a silent participant and help with notes and follow-ups.', 'skills.meetingBots.modalTitle': 'Send OpenHuman to a meeting', 'skills.meetingBots.newBadge': 'New', - 'skills.meetingBots.sendTo': 'Send to {label}', + 'skills.meetingBots.sendTo': 'Send to', 'skills.meetingBots.starting': 'Starting…', 'skills.resource.preview.closeAriaLabel': 'Close preview', 'skills.resource.preview.failed': 'Preview failed', diff --git a/app/src/lib/i18n/ko.ts b/app/src/lib/i18n/ko.ts index 5e85ff7b8b..1e7387d1f6 100644 --- a/app/src/lib/i18n/ko.ts +++ b/app/src/lib/i18n/ko.ts @@ -1957,7 +1957,7 @@ const ko: TranslationMap = { 'OpenHuman이 통화에 조용히 참여하여 메모와 후속 조치를 자동으로 처리합니다.', 'skills.meetingBots.bannerTitle': '다음 회의에 OpenHuman 보내기', 'skills.meetingBots.busyTitle': 'OpenHuman이 바쁩니다', - 'skills.meetingBots.comingSoon': '{label} 곧 제공 예정', + 'skills.meetingBots.comingSoon': '곧 제공 예정', 'skills.meetingBots.couldNotStartTitle': 'OpenHuman을 시작할 수 없습니다', 'skills.meetingBots.displayName': '표시 이름', 'skills.meetingBots.failedToStart': 'OpenHuman 시작에 실패했습니다.', @@ -1969,7 +1969,7 @@ const ko: TranslationMap = { 
'OpenHuman이 조용한 참가자로 참여하여 메모와 후속 조치를 도와줍니다.', 'skills.meetingBots.modalTitle': 'OpenHuman을 회의에 보내기', 'skills.meetingBots.newBadge': '새 항목', - 'skills.meetingBots.sendTo': '{label}로 보내기', + 'skills.meetingBots.sendTo': '보내기', 'skills.meetingBots.starting': '시작 중…', 'skills.resource.preview.closeAriaLabel': '미리보기 닫기', 'skills.resource.preview.failed': '미리보기 실패', From 60916bbfb8ab4d9fbe70a037c24b1252ede3c780 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:29:13 +0530 Subject: [PATCH 58/64] fix(meet-agent): declare owner_display_name / bot_display_name / meet_url in start_session schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The controller schema validator rejected the new fields as unknown params: meet_audio start failed err=rpc error: {"code":-32000, ..., "message":"unknown param 'bot_display_name' for meet_agent.start_session"} Plan C added the fields to `StartSessionRequest` (with serde default fallbacks) and Plan A added `meet_url`, but the schema declaration in `schemas.rs` was never updated. Add all three as optional fields so the dispatch layer admits them and the gate / persistence paths actually run. Knock-on effect of the rejection: `meet_audio::start` bailed before installing the audio bridge or starting the frame bus, so the gUM intercept never installed → Meet exposed the host's real camera instead of the mascot canvas. Fixing the schema restores the full pipeline. --- src/openhuman/meet_agent/schemas.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/openhuman/meet_agent/schemas.rs b/src/openhuman/meet_agent/schemas.rs index c59b4779d7..54ca599e2e 100644 --- a/src/openhuman/meet_agent/schemas.rs +++ b/src/openhuman/meet_agent/schemas.rs @@ -87,6 +87,31 @@ fn schema_start_session() -> ControllerSchema { comment: "Sample rate of inbound/outbound PCM. 
Default 16000.", required: false, }, + FieldSchema { + name: "owner_display_name", + ty: TypeSchema::String, + comment: + "Display name of the call owner (the user who launched the bot). \ + Used by the wake-word gate as the only speaker authorised to trigger \ + tool calls. Empty fails closed.", + required: false, + }, + FieldSchema { + name: "bot_display_name", + ty: TypeSchema::String, + comment: + "Display name the bot uses as its Meet participant tile. Used to drop \ + the bot's own captions (self-echo filter).", + required: false, + }, + FieldSchema { + name: "meet_url", + ty: TypeSchema::String, + comment: + "Normalised Meet URL the call joined. Persisted into the recent-calls \ + log on stop_session.", + required: false, + }, ], outputs: vec![ FieldSchema { From 6d1e36592049909c004538ede810b52fb20a72df Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:30:24 +0530 Subject: [PATCH 59/64] chore: apply auto-fixes --- src/openhuman/meet_agent/schemas.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/openhuman/meet_agent/schemas.rs b/src/openhuman/meet_agent/schemas.rs index 54ca599e2e..1f143c223b 100644 --- a/src/openhuman/meet_agent/schemas.rs +++ b/src/openhuman/meet_agent/schemas.rs @@ -90,8 +90,7 @@ fn schema_start_session() -> ControllerSchema { FieldSchema { name: "owner_display_name", ty: TypeSchema::String, - comment: - "Display name of the call owner (the user who launched the bot). \ + comment: "Display name of the call owner (the user who launched the bot). \ Used by the wake-word gate as the only speaker authorised to trigger \ tool calls. Empty fails closed.", required: false, @@ -99,16 +98,14 @@ fn schema_start_session() -> ControllerSchema { FieldSchema { name: "bot_display_name", ty: TypeSchema::String, - comment: - "Display name the bot uses as its Meet participant tile. Used to drop \ + comment: "Display name the bot uses as its Meet participant tile. 
Used to drop \ the bot's own captions (self-echo filter).", required: false, }, FieldSchema { name: "meet_url", ty: TypeSchema::String, - comment: - "Normalised Meet URL the call joined. Persisted into the recent-calls \ + comment: "Normalised Meet URL the call joined. Persisted into the recent-calls \ log on stop_session.", required: false, }, From 62732c7b0e8b39d4f29b21a79309b56857f6cab3 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 13:45:09 +0530 Subject: [PATCH 60/64] fix(meet-agent): dedup unauthorized wakes before soft-deny dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan D landed the unauthorised-wake branch ABOVE the per-speaker dedup + min-turn-gap + cooldown + turn-in-progress gates. Meet's caption observer re-emits the same caption row every 250 ms while the speaker is still visible in the CC region, so each tick fired a fresh UnauthorizedWake → soft-deny TTS — producing the "sorry sorry sorry" loop seen in dev:app on 2026-05-25 (also producing 429s from the TTS endpoint as the loop hit rate-limits). Restructure: compute `speaker_is_authorised` early, run all rate-limit gates uniformly for both authorised and unauthorised speakers, then branch on authorised at the wake-phrase match point. Restrict the wake_active prompt-continuation append to authorised speakers too so a non-owner can't smuggle text into the in-flight owner prompt. Regression test `note_caption_unauthorized_wake_does_not_loop_on_identical_caption` asserts the first emission produces `UnauthorizedWake` and subsequent emissions of the same (or punctuation-jittered) text are deduped to `Ignored`. 
--- src/openhuman/meet_agent/session.rs | 131 +++++++++++++++------------- 1 file changed, 69 insertions(+), 62 deletions(-) diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs index acb92fe59b..9cd42a657b 100644 --- a/src/openhuman/meet_agent/session.rs +++ b/src/openhuman/meet_agent/session.rs @@ -381,64 +381,18 @@ impl MeetAgentSession { // populated when the owner says "allow them" / "go ahead" // / "let them ask" after a non-owner wake refusal — see // `brain::run_caption_turn`'s grant-intent branch. + // + // The actual authorised/unauthorised branch happens AFTER + // all the rate-limit gates (dedup, turn-in-progress, min- + // turn-gap, cooldown) below, so the same caption repeating + // every 250 ms — which Meet does aggressively while a + // participant is still visible in the CC region — cannot + // spam the refusal path either. Without that ordering the + // soft-deny TTS triggers a fresh refusal on every Meet + // re-emit of the identical caption text. Smoke-tested as + // the "sorry sorry sorry" loop on 2026-05-25. let speaker_is_authorised = speaker_norm == owner_norm || self.allowlist.contains(&speaker_norm); - if !speaker_is_authorised { - // Walk the caption to see if it actually carries a wake - // phrase. Random conversation from a non-owner shouldn't - // trigger the polite refusal — only an attempt to wake - // the bot does. Mirrors the matcher used in the owner - // path below; intentionally duplicated rather than - // refactored to a shared helper so the (currently small) - // unauthorised-path stays self-contained. 
- let normalized_for_match = normalize_for_wake(text); - const WAKE_PHRASES: &[&str] = &[ - "hey open human", - "hi open human", - "hello open human", - "hey openhuman", - "hi openhuman", - "hello openhuman", - "open human", - "openhuman", - ]; - let mut hit = false; - for phrase in WAKE_PHRASES { - if normalized_for_match.contains(phrase) { - hit = true; - break; - } - } - if !hit { - return CaptionOutcome::Ignored; - } - // Audit-style log so dev:app stdout makes the rejection - // visible without leaking the caption body verbatim - // (preview capped, matches the wake-preview style used - // upstream in handle_push_caption). - let preview: String = text.chars().take(40).collect(); - log::info!( - "[meet-agent] unauthorized_wake_attempt request_id={} \ - speaker=\"{}\" owner=\"{}\" preview=\"{}\"", - self.request_id, - speaker, - self.owner_display_name, - preview - ); - // Record the pending grant request. The owner has - // PENDING_GRANT_WINDOW_MS to approve them via the - // "allow" / "let them" / "go ahead" pattern; after that - // the slot expires and the unauthorised speaker has to - // re-trigger the refusal to re-arm. - self.pending_unauthorized_speaker = Some(speaker.trim().to_string()); - self.pending_unauthorized_at_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_millis() as u64) - .unwrap_or(0); - return CaptionOutcome::UnauthorizedWake { - speaker: speaker.trim().to_string(), - }; - } // Per-speaker dedup. Meet's CC region re-renders the same line // every 250 ms poll tick and emits BOTH speaker rows on each // walk (the user AND the bot TTS as speaker="You"). A single- @@ -476,12 +430,17 @@ impl MeetAgentSession { return CaptionOutcome::Ignored; } self.last_caption_ts_ms = ts_ms; - // Already collecting after a previous wake word: just append - // the new caption. No second fire — the brain is already - // scheduled and will drain the prompt in ~1.5 s. 
Without this - // gate, a slowly-growing caption fires the wake word on - // every dedupe-then-grow cycle. - if self.wake_active { + // Already collecting after a previous (authorised) wake word: + // append the continuation. No second fire — the brain is + // already scheduled and will drain the prompt in ~1.5 s. + // Without this gate, a slowly-growing caption fires the wake + // word on every dedupe-then-grow cycle. + // + // Restricted to authorised speakers so a non-owner can't + // smuggle text into the in-flight owner prompt (e.g. owner + // says "hey openhuman, what's on my calendar"; non-owner + // mid-prompt: "and read alice's slack"). + if self.wake_active && speaker_is_authorised { if !self.pending_prompt.is_empty() { self.pending_prompt.push(' '); } @@ -560,6 +519,34 @@ impl MeetAgentSession { } } if let Some((idx, phrase)) = wake_hit { + // Wake phrase detected — branch on whether the speaker is + // allowed to actually drive the bot. Non-owner + not + // allowlisted → polite refusal turn; owner + allowlist → + // normal LLM turn. + if !speaker_is_authorised { + let preview: String = text.chars().take(40).collect(); + log::info!( + "[meet-agent] unauthorized_wake_attempt request_id={} \ + speaker=\"{}\" owner=\"{}\" preview=\"{}\"", + self.request_id, + speaker, + self.owner_display_name, + preview + ); + // Record the pending grant request. The owner has + // PENDING_GRANT_WINDOW_MS to approve them via the + // "allow" / "let them" / "go ahead" pattern; after + // that the slot expires and the unauthorised speaker + // has to re-trigger the refusal to re-arm. 
+ self.pending_unauthorized_speaker = Some(speaker.trim().to_string()); + self.pending_unauthorized_at_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + return CaptionOutcome::UnauthorizedWake { + speaker: speaker.trim().to_string(), + }; + } let after = idx + phrase.len(); let tail = normalized.get(after..).unwrap_or("").trim().to_string(); self.pending_prompt = tail; @@ -1077,6 +1064,26 @@ mod tests { assert_eq!(granted, CaptionOutcome::WakeFired); } + #[test] + fn note_caption_unauthorized_wake_does_not_loop_on_identical_caption() { + // Regression: Meet's caption observer re-emits the same row + // every 250 ms while it's still visible. The first emission + // produces an UnauthorizedWake; subsequent identical + // emissions must be deduped to `Ignored` so the soft-deny + // TTS doesn't fire on every tick ("sorry, sorry, sorry…" + // loop seen in dev:app on 2026-05-25). + let mut s = session_with_owner_alice(); + let first = s.note_caption("Bob", "hey openhuman read my dms", 1); + assert!(matches!(first, CaptionOutcome::UnauthorizedWake { .. })); + // Same text from same speaker — must dedup to Ignored. + let second = s.note_caption("Bob", "hey openhuman read my dms", 2); + assert_eq!(second, CaptionOutcome::Ignored); + // Punctuation/case jitter on the same utterance still dedups + // because the normaliser strips it before compare. 
+ let third = s.note_caption("Bob", "Hey, openhuman read my DMs.", 3); + assert_eq!(third, CaptionOutcome::Ignored); + } + #[test] fn take_pending_unauthorized_returns_within_window() { // The soft-deny path records the speaker so the owner can From 52a22d35cef2d8fc85944301c23215c3500df860 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 18:30:04 +0530 Subject: [PATCH 61/64] fix(meet-agent): session-wide soft-deny cooldown + greeting/refusal split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up bugs from the first soft-deny smoke: 1) Meet's STT re-transcribes the same utterance with text jitter ("Openhuman. I open." → "Openhuman. High openhum." → "Openhuman. High Openhuman.") so the per-text dedup misses the variants. Each fired a fresh soft-deny TTS, producing the "sorry sorry sorry" loop and 429 rate-limits from the TTS backend. Fix: session-wide UNAUTHORIZED_COOLDOWN_MS (60s, 1 dispatch per window). Tracked on a new `last_unauthorized_dispatch_at_ms` field on the session. Independent of the owner's `last_turn_done_at_ms` so the owner can still wake (e.g. say "allow them") within seconds of a refusal. 2) Greetings from non-owners were getting refused instead of answered. New `classify_unauthorized_intent` looks at the post-wake tail — bare wake or greeting-only words ("hi", "hello", "good morning", "there", "everyone", ...) maps to `Greeting`; substantive task asks map to `TaskAsk`. `run_soft_deny_turn` branches on intent: Greeting → "Hi {asker}! Nice to meet you." (no privacy gate noise on a hello) TaskAsk → the existing refusal + "say 'allow' to let them in" hint `CaptionOutcome::UnauthorizedWake` now carries the full caption text so the brain layer can classify; rpc.rs forwards it into the spawned turn.
Tests: - session: cooldown blocks text-variants + cross-speaker - brain: greeting / filler / task classification --- src/openhuman/meet_agent/brain.rs | 183 +++++++++++++++++++++++++--- src/openhuman/meet_agent/rpc.rs | 4 +- src/openhuman/meet_agent/session.rs | 85 +++++++++++-- 3 files changed, 244 insertions(+), 28 deletions(-) diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs index 30099aef75..04ff2d1b0d 100644 --- a/src/openhuman/meet_agent/brain.rs +++ b/src/openhuman/meet_agent/brain.rs @@ -118,6 +118,94 @@ const MIN_TURN_SAMPLES: usize = 4_000; /// the ops boundary check rejects anything else outright. const SAMPLE_RATE_HZ: u32 = super::ops::REQUIRED_SAMPLE_RATE; +/// Classify a non-owner caption that tripped the wake word. The +/// gate has already decided the speaker isn't authorised; this +/// picks between a friendly hi-back (greeting / pleasantry) and +/// a polite refusal (real task ask). Matching is conservative: +/// when the post-wake tail is empty OR only contains greeting +/// words, treat it as a greeting. Anything else is assumed to be +/// a task ask. +fn classify_unauthorized_intent(caption_text: &str) -> UnauthorizedIntent { + // Lift the bit of text that comes after the matched wake + // phrase so we don't get fooled by the wake itself ("hey + // openhuman" obviously contains "hey"). + let lower = caption_text.to_ascii_lowercase(); + let wake_phrases = [ + "hey open human", + "hi open human", + "hello open human", + "hey openhuman", + "hi openhuman", + "hello openhuman", + "open human", + "openhuman", + ]; + let tail = wake_phrases + .iter() + .filter_map(|p| lower.find(p).map(|i| &lower[i + p.len()..])) + .next() + .unwrap_or(&lower); + // Strip punctuation / common filler so "hi there!" reduces to + // ["hi", "there"]. Keeping the word list cheap and English-only + // for v1; the locale-aware story lands with multilingual TTS. 
+ let words: Vec<&str> = tail + .split(|c: char| !c.is_ascii_alphanumeric()) + .filter(|w| !w.is_empty()) + .collect(); + if words.is_empty() { + return UnauthorizedIntent::Greeting; + } + const GREETING_WORDS: &[&str] = &[ + "hi", + "hello", + "hey", + "yo", + "sup", + "howdy", + "greetings", + "hola", + "good", + "morning", + "afternoon", + "evening", + "night", + "there", + "everyone", + "all", + "folks", + "team", + "guys", + "yall", + ]; + if words.iter().all(|w| GREETING_WORDS.contains(w)) { + UnauthorizedIntent::Greeting + } else { + UnauthorizedIntent::TaskAsk + } +} + +/// Output of `classify_unauthorized_intent`. Drives whether the +/// soft-deny turn speaks a friendly hi-back or a polite refusal. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum UnauthorizedIntent { + /// Just a greeting — bot says hi back without offering tools. + Greeting, + /// Real task ask — refuse + tell the owner how to grant. + TaskAsk, +} + +/// Friendly hi-back canned line when a non-owner just greets the +/// bot. Kept short and warm; doesn't mention the owner / privacy +/// gate at all — that's noise on a "hello". +fn friendly_greeting_message(asker: &str) -> String { + let asker = asker.trim(); + if asker.is_empty() { + "Hi there! Nice to meet you.".to_string() + } else { + format!("Hi {asker}! Nice to meet you.") + } +} + /// Spoken refusal when a non-owner trips the wake word. Built per /// call from the configured owner display name so the audible /// response names the actual person who has the keys, and tells @@ -221,21 +309,31 @@ pub async fn run_grant_turn(request_id: &str, grantee: &str) -> Result Result { +/// `caption_text` is the full caption from `note_caption` so we can +/// classify intent here; the session has already recorded the +/// pending grant request and dispatch timestamp. 
+pub async fn run_soft_deny_turn( + request_id: &str, + asker: &str, + caption_text: &str, +) -> Result { let owner = registry() .with_session(request_id, |s| s.owner_display_name().to_string()) .unwrap_or_default(); - let message = soft_deny_message(asker, &owner); + let intent = classify_unauthorized_intent(caption_text); + let message = match intent { + UnauthorizedIntent::Greeting => friendly_greeting_message(asker), + UnauthorizedIntent::TaskAsk => soft_deny_message(asker, &owner), + }; log::info!( - "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\"" + "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\" intent={intent:?}" ); // Cancel any prior outbound so the refusal doesn't queue behind a // half-drained reply from a previous turn. @@ -248,18 +346,24 @@ pub async fn run_soft_deny_turn(request_id: &str, asker: &str) -> Result "greeting", + UnauthorizedIntent::TaskAsk => "refusal", + }; s.record_event( SessionEventKind::Note, - format!("soft-deny: {asker} attempted wake without owner approval"), + format!("soft-deny ({kind}): {asker} unauthorised wake"), ); s.record_event(SessionEventKind::Spoke, message.clone()); if !samples.is_empty() { s.enqueue_outbound_pcm(&samples, true); } - // Stamp turn-done so the min-turn-gap backstop covers the - // refusal the same way it covers a real reply. Without this, - // a chatty non-owner could re-trip the gate every few seconds. - s.mark_turn_done(); + // NB: do NOT call `mark_turn_done` here — that's the + // owner-min-turn-gap stamp, and we want the owner to be + // able to wake (e.g. say "allow them") within seconds of a + // refusal. The session's own `UNAUTHORIZED_COOLDOWN_MS` is + // what guards against a soft-deny loop from the same + // non-owner speaker. 
})?; Ok(true) } @@ -1312,6 +1416,55 @@ mod tests { } } + #[test] + fn classify_unauthorized_intent_treats_bare_wake_as_greeting() { + // Empty tail after the wake phrase — the non-owner just + // said "hey openhuman" with nothing else. Friendly hi-back + // is the right call, not a refusal. + assert_eq!( + classify_unauthorized_intent("hey openhuman"), + UnauthorizedIntent::Greeting + ); + assert_eq!( + classify_unauthorized_intent("Hi openhuman."), + UnauthorizedIntent::Greeting + ); + } + + #[test] + fn classify_unauthorized_intent_treats_filler_as_greeting() { + // Common pleasantries that contain greeting words only. + for text in [ + "hello openhuman there", + "hi openhuman everyone", + "hey openhuman hi", + "hey openhuman good morning", + ] { + assert_eq!( + classify_unauthorized_intent(text), + UnauthorizedIntent::Greeting, + "should be greeting: {text}" + ); + } + } + + #[test] + fn classify_unauthorized_intent_flags_task_asks() { + // Substantive task asks — refuse + tell owner how to grant. 
+ for text in [ + "hey openhuman read my slack", + "hi openhuman what's on alice's calendar", + "openhuman send the report", + "hello openhuman remember the launch", + ] { + assert_eq!( + classify_unauthorized_intent(text), + UnauthorizedIntent::TaskAsk, + "should be task: {text}" + ); + } + } + #[test] fn looks_like_grant_intent_rejects_unrelated_prompts() { // Words that happen to contain "allow" / "yes" mid-prompt diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs index 5cb7e9d93d..5354108803 100644 --- a/src/openhuman/meet_agent/rpc.rs +++ b/src/openhuman/meet_agent/rpc.rs @@ -145,7 +145,7 @@ pub async fn handle_push_caption(params: Map) -> Result { + CaptionOutcome::UnauthorizedWake { speaker, text } => { log::info!( "{LOG_PREFIX} unauthorized wake — soft-deny turn request_id={} speaker={}", req.request_id, @@ -153,7 +153,7 @@ pub async fn handle_push_caption(params: Map) -> Result can ask" line clears the air and tells the owner - /// how to grant access if they'd like to. - UnauthorizedWake { speaker: String }, + /// The caller should speak a polite refusal — or a friendly hi + /// when the tail is a greeting — via `brain::run_soft_deny_turn` + /// rather than silently dropping. Carries the full caption text + /// so the brain layer can classify intent (greeting vs task) + /// and pick the appropriate canned reply. + UnauthorizedWake { speaker: String, text: String }, } /// How long after a denied wake the owner has to say "allow" before @@ -46,6 +46,16 @@ pub enum CaptionOutcome { /// ask") without leaving the gate softened indefinitely. const PENDING_GRANT_WINDOW_MS: u64 = 120_000; +/// Minimum gap between consecutive soft-deny dispatches. Meet's STT +/// re-transcribes the same utterance with slight wording jitter +/// ("Openhuman. I open." → "Openhuman. High openhum." → +/// "Openhuman. High Openhuman.") so per-text dedup misses the +/// duplicates and fires a fresh refusal on each variant. 
This +/// session-wide cooldown caps the soft-deny TTS to one dispatch +/// per minute regardless of caption variation. 2026-05-25 smoke +/// hit the loop repeatedly without this. +const UNAUTHORIZED_COOLDOWN_MS: u64 = 60_000; + /// Cap on the inbound buffer so a runaway shell push (e.g. shell never /// stops, brain never drains) can't grow memory unboundedly. 30s @ 16kHz /// mono = 960 KB per session — generous for any reasonable utterance. @@ -156,6 +166,12 @@ pub struct MeetAgentSession { /// math, but the JSONL persistence layer needs an absolute /// timestamp that can be sorted across process restarts. started_at_ms: u64, + /// Wall-clock ms of the most recent soft-deny dispatch. Used + /// to enforce `UNAUTHORIZED_COOLDOWN_MS` so a non-owner whose + /// caption Meet re-transcribes with text variations doesn't + /// trigger a fresh soft-deny TTS on every variant. 0 = no + /// soft-deny has dispatched yet this call. + last_unauthorized_dispatch_at_ms: u64, /// Normalised name of the most recent non-owner speaker that /// tripped the wake word. Recorded so the owner can grant them /// access by saying "allow" / "let them" / "go ahead" within @@ -196,6 +212,7 @@ impl MeetAgentSession { turn_in_progress: false, flush_pending: false, last_turn_done_at_ms: 0, + last_unauthorized_dispatch_at_ms: 0, owner_display_name: String::new(), bot_display_name: String::new(), meet_url: String::new(), @@ -525,6 +542,29 @@ impl MeetAgentSession { // normal LLM turn. if !speaker_is_authorised { let preview: String = text.chars().take(40).collect(); + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + // Session-wide soft-deny cooldown. Meet's STT + // re-transcribes the same utterance with wording + // jitter, slipping past the per-text dedup. Cap the + // refusal TTS to one dispatch per minute so the loop + // can't compound itself (and so rate-limits from the + // TTS backend don't fire either). 
+ if self.last_unauthorized_dispatch_at_ms != 0 + && now_ms.saturating_sub(self.last_unauthorized_dispatch_at_ms) + < UNAUTHORIZED_COOLDOWN_MS + { + log::debug!( + "[meet-agent] unauthorized_wake suppressed (cooldown) \ + request_id={} speaker=\"{}\" preview=\"{}\"", + self.request_id, + speaker, + preview + ); + return CaptionOutcome::Ignored; + } log::info!( "[meet-agent] unauthorized_wake_attempt request_id={} \ speaker=\"{}\" owner=\"{}\" preview=\"{}\"", @@ -533,18 +573,17 @@ impl MeetAgentSession { self.owner_display_name, preview ); + self.last_unauthorized_dispatch_at_ms = now_ms; // Record the pending grant request. The owner has // PENDING_GRANT_WINDOW_MS to approve them via the // "allow" / "let them" / "go ahead" pattern; after // that the slot expires and the unauthorised speaker // has to re-trigger the refusal to re-arm. self.pending_unauthorized_speaker = Some(speaker.trim().to_string()); - self.pending_unauthorized_at_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_millis() as u64) - .unwrap_or(0); + self.pending_unauthorized_at_ms = now_ms; return CaptionOutcome::UnauthorizedWake { speaker: speaker.trim().to_string(), + text: text.to_string(), }; } let after = idx + phrase.len(); @@ -984,7 +1023,8 @@ mod tests { assert_eq!( outcome, CaptionOutcome::UnauthorizedWake { - speaker: "Bob".into() + speaker: "Bob".into(), + text: "hey openhuman read alice's slack DMs".into(), }, "non-owner wake must produce an UnauthorizedWake outcome" ); @@ -1064,6 +1104,29 @@ mod tests { assert_eq!(granted, CaptionOutcome::WakeFired); } + #[test] + fn note_caption_unauthorized_wake_cooldown_blocks_text_variants() { + // Meet's STT re-transcribes the same utterance with text + // jitter ("Openhuman. I open." → "Openhuman. High openhum.") + // — the per-text dedup doesn't catch these because the + // strings differ. 
The session-wide soft-deny cooldown must + // gate subsequent variants from the same speaker so only + // one refusal TTS dispatches per minute regardless of + // STT churn. + let mut s = session_with_owner_alice(); + let first = s.note_caption("Bob", "Openhuman. I open.", 1); + assert!(matches!(first, CaptionOutcome::UnauthorizedWake { .. })); + // Different text but same speaker → still cooled down. + let second = s.note_caption("Bob", "Openhuman. High openhum.", 2); + assert_eq!(second, CaptionOutcome::Ignored); + let third = s.note_caption("Bob", "Openhuman. High Openhuman.", 3); + assert_eq!(third, CaptionOutcome::Ignored); + // Different speaker also gated — soft-deny TTS slot is + // session-wide, not per-speaker. + let charlie = s.note_caption("Charlie", "openhuman hello", 4); + assert_eq!(charlie, CaptionOutcome::Ignored); + } + #[test] fn note_caption_unauthorized_wake_does_not_loop_on_identical_caption() { // Regression: Meet's caption observer re-emits the same row From 17dd6c9e5985512e00132d28789e31c9bc383622 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 18:51:56 +0530 Subject: [PATCH 62/64] fix(meet-audio): route bot PCM to local speakers in addition to Meet uplink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The audio bridge connected each fed `AudioBufferSource` only to the `MediaStreamAudioDestinationNode` that backs Meet's getUserMedia intercept. Bot voice therefore reached Meet (and other participants via the WebRTC wire), but was silent on the host machine — the user running openhuman could only hear the bot if they were receiving the call on a *separate* endpoint (other browser tab, phone, ...). Smoke today surfaced as "captions appear from OpenHuman but no sound" while the user was watching the bot+meet on the same mac. Add a second `src.connect(ctx.destination)` so the same buffer also plays through the default output device. No quality impact; the MediaStream path is unchanged. 
Follow-up #20 (vendored CEF `set_audio_muted` for the bot window) will re-introduce a clean off switch behind a config toggle once we have one — right now defaulting to audible-locally is the less confusing posture. --- app/src-tauri/src/meet_audio/audio_bridge.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/app/src-tauri/src/meet_audio/audio_bridge.js b/app/src-tauri/src/meet_audio/audio_bridge.js index 1714b340e3..6dbd8e9eb8 100644 --- a/app/src-tauri/src/meet_audio/audio_bridge.js +++ b/app/src-tauri/src/meet_audio/audio_bridge.js @@ -133,6 +133,16 @@ var src = ctx.createBufferSource(); src.buffer = buffer; src.connect(dest); + // Also pipe to the page's default audio output so the bot is + // audible on the host machine (the openhuman app's speakers). + // Without this, bot audio only flows up Meet's gUM intercept + // and the user has to be receiving the meeting on a separate + // endpoint (other browser tab / phone) to hear it. Playing + // locally too costs nothing audio-quality-wise and removes the + // "captions appear but no sound" foot-gun. Follow-up #20 + // (mute bot CEF at OS level) will re-introduce a clean off + // switch once we have a config toggle. + src.connect(ctx.destination); // Schedule strictly after the previous chunk so successive // 100 ms feeds line up gaplessly. If the queue has emptied // (caller fell behind), restart at currentTime so we don't try From 9a3c2dbb91973872f1743c6c9707e1051225036e Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 18:57:34 +0530 Subject: [PATCH 63/64] feat(meet-agent): toolless LLM for non-owner non-greeting wakes Loosen the non-owner branch: instead of a canned refusal, route substantive asks through a toolless chat-v1 LLM with an explicit no-personal-data system prompt. The LLM: - Answers general knowledge / casual chat / definitions / jokes from training data ("what's the capital of France" -> "Paris"). 
- Refuses anything that would need the owner's tools (Slack, Gmail, Calendar, memory, integrations) with a one-line pointer at the magic word: ", say 'allow' if you'd like me to help." - Has zero tools wired, so it physically can't fire a Composio call even if it tried. - Has empty history (no rolling context from owner turns) so private replies from earlier in the call can't bleed into a non-owner reply. `run_soft_deny_turn` still gates on `classify_unauthorized_intent`: greeting -> canned hi (cheap, no network); task ask -> the new `llm_general_no_tools`. LLM errors / empty replies fall through to the explicit canned refusal so the speaker hears *something*. Changes: - brain::llm_meeting_basic gains a `system_prompt` param so the same plumbing serves both owner-fallback and non-owner paths. - new `non_owner_system_prompt(owner)` builder. - new `llm_general_no_tools(prompt, owner)` wrapper. - cooldown lowered 60s -> 20s so non-owners can engage in actual back-and-forth instead of the bot going deaf for a minute after the first refusal. --- src/openhuman/meet_agent/brain.rs | 92 +++++++++++++++++++++++++++-- src/openhuman/meet_agent/session.rs | 17 ++++-- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs index 04ff2d1b0d..47c857c026 100644 --- a/src/openhuman/meet_agent/brain.rs +++ b/src/openhuman/meet_agent/brain.rs @@ -185,15 +185,68 @@ fn classify_unauthorized_intent(caption_text: &str) -> UnauthorizedIntent { } /// Output of `classify_unauthorized_intent`. Drives whether the -/// soft-deny turn speaks a friendly hi-back or a polite refusal. +/// non-owner turn speaks a canned hi-back or routes the prompt +/// through a toolless LLM (general-knowledge + safe deflection). #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum UnauthorizedIntent { /// Just a greeting — bot says hi back without offering tools. Greeting, - /// Real task ask — refuse + tell the owner how to grant. 
+ /// Substantive question. Route to a toolless LLM with a strict + /// system prompt — answer general knowledge / casual chat, + /// refuse anything that would require the owner's personal + /// tools or data, and point the owner at the magic word + /// ("allow") if access is needed. TaskAsk, } +/// System prompt for the non-owner branch. The LLM has no tool +/// surface attached and is told to refuse any request that would +/// need the owner's personal data. Kept short and explicit so the +/// model doesn't ad-lib a different boundary. +fn non_owner_system_prompt(owner: &str) -> String { + let owner_label = if owner.trim().is_empty() { + "the meeting host" + } else { + owner.trim() + }; + format!("\ +You are openhuman, an AI participant in a live Google Meet call. The speaker is NOT the call \ +owner — the owner is {owner_label}.\n\ +\n\ +WHAT YOU MAY DO:\n\ +- Answer general knowledge questions (history, science, math, definitions, weather concepts).\n\ +- Casual conversation, jokes, small talk, greetings.\n\ +- Explain what you are and what you can do at a high level.\n\ +\n\ +WHAT YOU MUST REFUSE (no exceptions):\n\ +- Anything that would require {owner_label}'s personal data: their Slack, Gmail, Calendar, \ +contacts, memory notes, files, schedule, integrations, or chat history.\n\ +- Sending messages, scheduling, reminding, creating, modifying or deleting any data on their \ +behalf.\n\ +- Revealing what {owner_label} has previously told you or stored with you.\n\ +\n\ +WHEN REFUSING: respond with exactly one short sentence pointing at the magic word, e.g. \ +\"That needs {owner_label}'s permission — {owner_label}, say 'allow' if you'd like me to help.\"\n\ +\n\ +OUTPUT FORMAT (strict):\n\ +- ONE short spoken sentence, max 25 words.\n\ +- Plain English. No markdown, bullets, code fences, or URLs.\n\ +- No meta-narration (\"I should…\", \"Let me…\", \"As an AI…\"). 
Just answer.\n\ +- Respond in ENGLISH ONLY regardless of the speaker's language — TTS is English-only.\n\ +") +} + +/// Route a non-owner caption through the toolless chat-v1 LLM. +/// Returns the spoken text — the caller TTS's it and enqueues. +async fn llm_general_no_tools(prompt: &str, owner: &str) -> Result { + let system_prompt = non_owner_system_prompt(owner); + // No rolling history for the non-owner path — each ask is a + // fresh conversation. Sharing history between owner turns and + // non-owner turns risks leaking the owner's tool-call results + // into a stranger-facing reply. + llm_meeting_basic(prompt, &[], &system_prompt).await +} + /// Friendly hi-back canned line when a non-owner just greets the /// bot. Kept short and warm; doesn't mention the owner / privacy /// gate at all — that's noise on a "hello". @@ -328,9 +381,34 @@ pub async fn run_soft_deny_turn( .with_session(request_id, |s| s.owner_display_name().to_string()) .unwrap_or_default(); let intent = classify_unauthorized_intent(caption_text); + // Greeting → canned hi (no network round-trip needed). + // TaskAsk → toolless LLM. The LLM has no tools attached, has + // an explicit "refuse personal-data asks" system + // prompt, and is asked to point the owner at the + // magic word when refusing. So a Q like "what's + // the capital of France" lands as a normal answer + // ("Paris"), while "read Nikhil's Slack" lands as + // the refusal. The LLM picks; we don't classify. let message = match intent { UnauthorizedIntent::Greeting => friendly_greeting_message(asker), - UnauthorizedIntent::TaskAsk => soft_deny_message(asker, &owner), + UnauthorizedIntent::TaskAsk => match llm_general_no_tools(caption_text, &owner).await { + Ok(reply) if !reply.trim().is_empty() => reply, + Ok(_) => { + // Empty reply = LLM declined silently. Fall back to + // the explicit canned refusal so the speaker hears + // *something* and knows the bot didn't crash. 
+ log::info!( + "[meet-agent] non-owner LLM returned empty — using canned refusal request_id={request_id}" + ); + soft_deny_message(asker, &owner) + } + Err(err) => { + log::warn!( + "[meet-agent] non-owner LLM failed request_id={request_id} err={err}" + ); + soft_deny_message(asker, &owner) + } + }, }; log::info!( "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\" intent={intent:?}" @@ -919,7 +997,11 @@ async fn get_or_build_agent_for_meet(request_id: &str) -> Result Result { +async fn llm_meeting_basic( + prompt: &str, + history: &[ConversationTurn], + system_prompt: &str, +) -> Result { use crate::api::config::effective_backend_api_url; use crate::api::jwt::get_session_token; use crate::api::BackendOAuthClient; @@ -935,7 +1017,7 @@ async fn llm_meeting_basic(prompt: &str, history: &[ConversationTurn]) -> Result let client = BackendOAuthClient::new(&api_url).map_err(|e| e.to_string())?; let mut messages: Vec = Vec::with_capacity(history.len() + 2); - messages.push(json!({ "role": "system", "content": MEETING_SYSTEM_PROMPT })); + messages.push(json!({ "role": "system", "content": system_prompt })); for turn in history { messages.push(json!({ "role": turn.role, "content": turn.content })); } diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs index 185374272d..04a22f6b21 100644 --- a/src/openhuman/meet_agent/session.rs +++ b/src/openhuman/meet_agent/session.rs @@ -46,15 +46,20 @@ pub enum CaptionOutcome { /// ask") without leaving the gate softened indefinitely. const PENDING_GRANT_WINDOW_MS: u64 = 120_000; -/// Minimum gap between consecutive soft-deny dispatches. Meet's STT +/// Minimum gap between consecutive non-owner dispatches. Meet's STT /// re-transcribes the same utterance with slight wording jitter /// ("Openhuman. I open." → "Openhuman. High openhum." → /// "Openhuman. High Openhuman.") so per-text dedup misses the -/// duplicates and fires a fresh refusal on each variant. 
This -/// session-wide cooldown caps the soft-deny TTS to one dispatch -/// per minute regardless of caption variation. 2026-05-25 smoke -/// hit the loop repeatedly without this. -const UNAUTHORIZED_COOLDOWN_MS: u64 = 60_000; +/// duplicates. Without a session-wide rate limit each variant +/// would fire a fresh LLM + TTS round-trip. +/// +/// Set at 20s (vs the prior 60s) so a non-owner can actually +/// engage in back-and-forth conversation — the toolless LLM +/// answers general questions now, so a 1-minute gate would feel +/// like the bot has gone deaf between asks. 20s is long enough +/// to cover Meet's STT replay window while letting real new +/// utterances through. 2026-05-25 smoke matrix. +const UNAUTHORIZED_COOLDOWN_MS: u64 = 20_000; /// Cap on the inbound buffer so a runaway shell push (e.g. shell never /// stops, brain never drains) can't grow memory unboundedly. 30s @ 16kHz From 15df969e0cdb0e0c8e657fa34245f6327ab06a14 Mon Sep 17 00:00:00 2001 From: oxoxDev Date: Mon, 25 May 2026 18:58:36 +0530 Subject: [PATCH 64/64] chore: apply auto-fixes --- src/openhuman/meet_agent/brain.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs index 47c857c026..8be8fdbd29 100644 --- a/src/openhuman/meet_agent/brain.rs +++ b/src/openhuman/meet_agent/brain.rs @@ -209,7 +209,8 @@ fn non_owner_system_prompt(owner: &str) -> String { } else { owner.trim() }; - format!("\ + format!( + "\ You are openhuman, an AI participant in a live Google Meet call. The speaker is NOT the call \ owner — the owner is {owner_label}.\n\ \n\ @@ -233,7 +234,8 @@ OUTPUT FORMAT (strict):\n\ - Plain English. No markdown, bullets, code fences, or URLs.\n\ - No meta-narration (\"I should…\", \"Let me…\", \"As an AI…\"). Just answer.\n\ - Respond in ENGLISH ONLY regardless of the speaker's language — TTS is English-only.\n\ -") +" + ) } /// Route a non-owner caption through the toolless chat-v1 LLM. 
@@ -403,9 +405,7 @@ pub async fn run_soft_deny_turn( soft_deny_message(asker, &owner) } Err(err) => { - log::warn!( - "[meet-agent] non-owner LLM failed request_id={request_id} err={err}" - ); + log::warn!("[meet-agent] non-owner LLM failed request_id={request_id} err={err}"); soft_deny_message(asker, &owner) } },