From 92ad9dc3b962f4c939cf1e1170711b9520c4555c Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 12:04:57 +0530
Subject: [PATCH 01/64] refactor(meeting-bots): point modal submit at Flow A
 (CEF webview)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The modal used to POST /mascots/join-meeting to the backend Camoufox bot
(Flow B). Two production blockers there:

- Firefox / Camoufox bypasses our JS getUserMedia override at the C++
  native layer, so the mascot Y4M never replaces the bot's camera and
  the tile is a static placeholder.
- Chromium / Chrome variants get rejected by Meet's anti-bot screen
  ("You can't join this video call") before they reach the join page.

Flow A (PR #1350 + #1359) sidesteps both: it opens a dedicated, profile-
isolated CEF webview on the user's machine, installs the audio + video
bridges via CDP at document-start, and lets meet_scanner drive the join.
The mascot canvas IS the outbound camera and the synthesized speech IS
the outbound mic — the user's OS mic is never wired to the meeting.
---
 app/src/components/skills/MeetingBotsCard.tsx | 54 +++++++------------
 1 file changed, 18 insertions(+), 36 deletions(-)
diff --git a/app/src/components/skills/MeetingBotsCard.tsx b/app/src/components/skills/MeetingBotsCard.tsx
index c93dd1f26a..f0f9301617 100644
--- a/app/src/components/skills/MeetingBotsCard.tsx
+++ b/app/src/components/skills/MeetingBotsCard.tsx
@@ -1,19 +1,17 @@
 // Meeting bots entry point on the Skills "Integrations" section.
 //
-// Surfaces as a compact, fun banner: clicking opens a modal that wraps
-// the backend mascot bot (PR tinyhumansai/backend#773). Joining a
-// Google Meet kicks off the Camoufox-driven mascot in the backend,
-// which streams the mascot's WebRTC video into the call as an
-// anonymous guest. Zoom and Teams are shown as "coming soon" — the
-// backend already routes them but returns 400 "not yet supported".
+// Surfaces as a compact, fun banner: clicking opens a modal that opens
+// a dedicated CEF webview pointed at the Meet URL. The bot's outbound
+// camera is the mascot canvas (`meet_video::camera_bridge`) and its
+// outbound audio is the synthesized speech pump (`meet_audio`). Zoom
+// and Teams are shown as "coming soon" — only Google Meet has the CEF
+// bridge pipeline today.
 
 import { useEffect, useState } from 'react';
 
 import { useT } from '../../lib/i18n/I18nContext';
 import {
-  joinMeetingViaMascotBot,
-  SERVER_OVERLOADED_MESSAGE,
-  type MascotJoinMeetingError,
+  joinMeetCall,
   type MascotMeetPlatform,
 } from '../../services/meetCallService';
 
@@ -41,10 +39,6 @@ const PLATFORMS: PlatformDef[] = [
   },
 ];
 
-function isMascotJoinMeetingError(err: unknown): err is MascotJoinMeetingError {
-  return !!err && typeof err === 'object' && 'isCapacityGated' in err && 'message' in err;
-}
-
 export default function MeetingBotsCard({ onToast }: Props) {
   const [open, setOpen] = useState(false);
 
@@ -115,13 +109,12 @@ interface ModalProps {
   onToast?: (toast: Toast) => void;
 }
 
-function MeetingBotsModal({ onClose, onToast }: ModalProps) {
+export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
   const { t } = useT();
   const [platform, setPlatform] = useState<MascotMeetPlatform>('gmeet');
   const [meetUrl, setMeetUrl] = useState('');
   const [displayName, setDisplayName] = useState('OpenHuman');
   const [submitting, setSubmitting] = useState(false);
-  const [capacityGated, setCapacityGated] = useState(false);
   const [error, setError] = useState<string | null>(null);
 
   const selected = PLATFORMS.find(p => p.platform === platform) ?? PLATFORMS[0];
@@ -139,14 +132,18 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) {
   const handleSubmit = async (event: React.FormEvent<HTMLFormElement>) => {
     event.preventDefault();
     setError(null);
-    setCapacityGated(false);
     if (isComingSoon) {
       setError(`${selected.label} support is coming soon.`);
       return;
     }
     setSubmitting(true);
     try {
-      await joinMeetingViaMascotBot({ platform, meetUrl, displayName });
+      // Flow A: local CEF webview with mascot canvas + synthesized audio.
+      // joinMeetCall opens an off-screen CEF window per request_id,
+      // installs the audio/video bridges via CDP, then meet_scanner
+      // drives the join automatically. Returns once the window has
+      // been created — meet_audio + meet_scanner take it from there.
+      await joinMeetCall({ meetUrl, displayName });
       onToast?.({
         type: 'success',
         title: t('skills.meetingBots.joiningTitle'),
@@ -155,20 +152,9 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) {
       setMeetUrl('');
       onClose();
     } catch (err) {
-      if (isMascotJoinMeetingError(err)) {
-        setCapacityGated(err.isCapacityGated);
-        const message = err.isCapacityGated ? SERVER_OVERLOADED_MESSAGE : err.message;
-        setError(message);
-        onToast?.({
-          type: 'error',
-          title: err.isCapacityGated ? t('skills.meetingBots.busyTitle') : t('skills.meetingBots.couldNotStartTitle'),
-          message,
-        });
-      } else {
-        const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart');
-        setError(message);
-        onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message });
-      }
+      const message = err instanceof Error ? err.message : t('skills.meetingBots.failedToStart');
+      setError(message);
+      onToast?.({ type: 'error', title: t('skills.meetingBots.couldNotStartTitle'), message });
     } finally {
       setSubmitting(false);
     }
@@ -261,11 +247,7 @@ function MeetingBotsModal({ onClose, onToast }: ModalProps) {
             {error && (
               <div
                 role="alert"
-                className={`rounded-xl border px-3 py-2 text-xs ${
-                  capacityGated
-                    ? 'border-amber-200 dark:border-amber-500/30 bg-amber-50 dark:bg-amber-500/10 text-amber-800 dark:text-amber-300'
-                    : 'border-coral-200 dark:border-coral-500/30 bg-coral-50 dark:bg-coral-500/10 text-coral-700 dark:text-coral-300'
-                }`}>
+                className="rounded-xl border border-coral-200 dark:border-coral-500/30 bg-coral-50 dark:bg-coral-500/10 px-3 py-2 text-xs text-coral-700 dark:text-coral-300">
                 {error}
               </div>
             )}

From 8e9b7b226f30cdd1cf5a4500b2478c0d14015725 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 12:05:10 +0530
Subject: [PATCH 02/64] feat(human): add join-meeting pill that opens Flow A
 modal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surfaces the meeting-bots entry next to the speak-replies toggle on
/human so users can dispatch the mascot directly from the chat surface
without flipping to the Skills tab. Same modal, same Flow A backing —
just an additional surface.
---
 app/src/features/human/HumanPage.tsx | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/app/src/features/human/HumanPage.tsx b/app/src/features/human/HumanPage.tsx
index 8def765428..d257b7b2da 100644
--- a/app/src/features/human/HumanPage.tsx
+++ b/app/src/features/human/HumanPage.tsx
@@ -1,5 +1,6 @@
 import { useEffect, useState } from 'react';
 
+import { MeetingBotsModal } from '../../components/skills/MeetingBotsCard';
 import { useT } from '../../lib/i18n/I18nContext';
 import Conversations from '../../pages/Conversations';
 import type { ToolTimelineEntry } from '../../store/chatRuntimeSlice';
@@ -21,6 +22,7 @@ const HumanPage = () => {
     const raw = window.localStorage.getItem(SPEAK_REPLIES_KEY);
     return raw === null ? true : raw === '1';
   });
+  const [joinMeetingOpen, setJoinMeetingOpen] = useState(false);
 
   useEffect(() => {
     window.localStorage.setItem(SPEAK_REPLIES_KEY, speakReplies ? '1' : '0');
@@ -65,6 +67,21 @@ const HumanPage = () => {
         {t('voice.pushToTalk')}
       </label>
 
+      {/* "Send OpenHuman to a meeting" — opens the Flow A modal which spawns
+          an off-screen CEF webview pointed at the Meet URL with the mascot
+          canvas as the outbound camera and synthesized speech as the
+          outbound mic. The user's OS mic is never wired to the meeting. */}
+      <button
+        type="button"
+        onClick={() => setJoinMeetingOpen(true)}
+        data-testid="human-join-meeting-pill"
+        className="absolute top-4 left-44 z-10 inline-flex items-center gap-1.5 px-3 py-1.5 rounded-full bg-primary-500 text-white text-xs font-medium shadow-soft hover:bg-primary-600 focus:outline-none focus-visible:ring-2 focus-visible:ring-primary-300">
+        <span aria-hidden="true">📞</span>
+        {t('skills.meetingBots.modalTitle')}
+      </button>
+
+      {joinMeetingOpen && <MeetingBotsModal onClose={() => setJoinMeetingOpen(false)} />}
+
       {/* Chat sidebar — vertically centered above the BottomTabBar (~80px). */}
       <div className="absolute right-4 top-0 bottom-20 z-10 flex items-center">
         <aside className="w-[420px] h-[min(720px,calc(100vh-160px))] rounded-2xl border border-stone-300 dark:border-neutral-700 bg-white dark:bg-neutral-900 shadow-soft flex flex-col overflow-hidden">

From 54333fc354e5460a33560e6957f506808806bed5 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 12:44:48 +0530
Subject: [PATCH 03/64] fix(meet-call): force off-screen position post-build +
 minimize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

macOS Cocoa clamps NSWindow frame origins to keep the window at least
partially on-screen, so the (-30000, -30000) requested via the builder
lands as (0, 0) and the bot's Meet CEF window pops up visible — the
user can see + interact with the bot's pre-join UI, which defeats the
'invisible bot' premise.

Re-apply the off-screen position post-build via Tauri's set_position
API (which hits the runtime's CEF set_position path, bypassing the
initial-bounds clamp). Belt-and-suspenders with window.minimize() so
even on builds where the position still leaks through Cocoa, the
window doesn't visibly cover the user's main openhuman surface.
---
 app/src-tauri/src/meet_call/mod.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index b075405b2c..2454967a67 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -162,6 +162,22 @@ pub async fn meet_call_open_window<R: Runtime>(
         .build()
         .map_err(|e| format!("[meet-call] WebviewWindowBuilder.build failed: {e}"))?;
 
+    // macOS Cocoa clamps NSWindow frame origins to keep the window at
+    // least partially on-screen, so the `(-30000, -30000)` requested in
+    // the builder lands as `(0, 0)` and the bot's CEF window pops up
+    // visible (issue: user can see + interact with the bot's Meet UI).
+    // Re-apply the off-screen position post-build via Tauri's
+    // `set_position` API — that hits the runtime's CEF `set_position`
+    // path which bypasses the initial-bounds clamp. Belt-and-suspenders
+    // with a minimize so even on builds where the position still leaks
+    // through Cocoa, the window doesn't visibly cover the user.
+    if let Err(err) = window.set_position(tauri::PhysicalPosition::new(-30000i32, -30000i32)) {
+        log::warn!("[meet-call] post-build set_position failed: {err}");
+    }
+    if let Err(err) = window.minimize() {
+        log::warn!("[meet-call] post-build minimize failed: {err}");
+    }
+
     state
         .inner
         .lock()

From 0260c16022464274be5271ad77684b2e0bbf79d1 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 13:04:40 +0530
Subject: [PATCH 04/64] fix(meet-call): drop minimize, close stale windows, log
 post-build pos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

macOS restores a minimized window on the next focus event, which means
the previously-minimized bot CEF window pops back up over the user's
main openhuman surface as soon as anything brings the app to front.
Worse UX than a window stuck off-screen — drop the minimize().

Also close any lingering meet-call-* window before opening a new one.
Each Join was spawning a fresh request_id-keyed window without
reclaiming the previous bot's resources, so the Dock accumulated
"Meet — OpenHuman" windows and the listen_capture audio handler
registry got two competing CEF audio handlers fighting over the same
URL.

Finally, log the actual outer_position post-build so we can verify in
the log whether macOS still clamps (-30000, -30000) → (0, 0) or whether
the runtime's CEF set_position path took effect this time.
---
 app/src-tauri/src/meet_call/mod.rs | 38 ++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index 2454967a67..6db21cd989 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -113,6 +113,25 @@ pub async fn meet_call_open_window<R: Runtime>(
         return Ok(label);
     }
 
+    // Only one meet-call window can be live at a time — concurrent bot
+    // sessions race the CEF audio handler registration (`listen_capture`)
+    // and confuse the user with multiple "Meet — OpenHuman" windows in
+    // their Dock. Close any stragglers from a prior Join before opening
+    // a fresh one. The CloseRequested handler will tear down their
+    // scanner + audio session via the per-window event listeners below.
+    let stale_labels: Vec<String> = app
+        .webview_windows()
+        .keys()
+        .filter(|l| l.starts_with("meet-call-"))
+        .cloned()
+        .collect();
+    for stale in stale_labels {
+        if let Some(window) = app.get_webview_window(&stale) {
+            log::info!("[meet-call] closing stale window label={stale} before new join");
+            let _ = window.close();
+        }
+    }
+
     let data_dir = data_directory_for(&app, &request_id)?;
     if let Err(err) = std::fs::create_dir_all(&data_dir) {
         log::warn!(
@@ -165,17 +184,22 @@ pub async fn meet_call_open_window<R: Runtime>(
     // macOS Cocoa clamps NSWindow frame origins to keep the window at
     // least partially on-screen, so the `(-30000, -30000)` requested in
     // the builder lands as `(0, 0)` and the bot's CEF window pops up
-    // visible (issue: user can see + interact with the bot's Meet UI).
-    // Re-apply the off-screen position post-build via Tauri's
+    // visible. Re-apply the off-screen position post-build via Tauri's
     // `set_position` API — that hits the runtime's CEF `set_position`
-    // path which bypasses the initial-bounds clamp. Belt-and-suspenders
-    // with a minimize so even on builds where the position still leaks
-    // through Cocoa, the window doesn't visibly cover the user.
+    // path which bypasses the initial-bounds clamp.
+    //
+    // Don't `minimize()` here: macOS restores a minimized window on
+    // any app focus event, and the restored Meet pre-join page shows
+    // up over the user's main openhuman surface — far worse UX than
+    // a window stuck off-screen.
     if let Err(err) = window.set_position(tauri::PhysicalPosition::new(-30000i32, -30000i32)) {
         log::warn!("[meet-call] post-build set_position failed: {err}");
     }
-    if let Err(err) = window.minimize() {
-        log::warn!("[meet-call] post-build minimize failed: {err}");
+    if let Ok(pos) = window.outer_position() {
+        log::info!(
+            "[meet-call] post-build outer_position={{x:{},y:{}}} (target=-30000,-30000)",
+            pos.x, pos.y
+        );
     }
 
     state

From c322dc7310540239efc29e3d122e6a33eded7c68 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 13:08:17 +0530
Subject: [PATCH 05/64] fix(meet-call): hide window post-build instead of
 positioning off-screen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

macOS Cocoa clamps NSWindow frame origins to the union of all attached
monitors' bounds, so even (-30000, -30000) lands on a secondary
display on multi-monitor setups (e.g. (-1692, 66) on a left-extended
layout). Confirmed via the post-build outer_position log line: the
bot's Meet pre-join surface ends up visible on the user's second
screen, which still defeats the 'invisible bot' premise.

Swap to window.hide() instead — that calls macOS [NSWindow orderOut:]
which removes the window from screen + Dock without releasing the
backing surface. The renderer keeps painting, CDP keeps working, and
all the existing scanner / audio-bridge / camera-bridge plumbing
continues to function. Critically different from .visible(false) at
builder time, which never gives the renderer a backing surface and
silently breaks layout + clicks (see the existing builder comment for
the original reasoning).
---
 app/src-tauri/src/meet_call/mod.rs | 38 +++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index 6db21cd989..87b42b68ed 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -181,24 +181,34 @@ pub async fn meet_call_open_window<R: Runtime>(
         .build()
         .map_err(|e| format!("[meet-call] WebviewWindowBuilder.build failed: {e}"))?;
 
-    // macOS Cocoa clamps NSWindow frame origins to keep the window at
-    // least partially on-screen, so the `(-30000, -30000)` requested in
-    // the builder lands as `(0, 0)` and the bot's CEF window pops up
-    // visible. Re-apply the off-screen position post-build via Tauri's
-    // `set_position` API — that hits the runtime's CEF `set_position`
-    // path which bypasses the initial-bounds clamp.
+    // Hide the window after build so the user never sees it.
     //
-    // Don't `minimize()` here: macOS restores a minimized window on
-    // any app focus event, and the restored Meet pre-join page shows
-    // up over the user's main openhuman surface — far worse UX than
-    // a window stuck off-screen.
-    if let Err(err) = window.set_position(tauri::PhysicalPosition::new(-30000i32, -30000i32)) {
-        log::warn!("[meet-call] post-build set_position failed: {err}");
+    // We previously relied on positioning the window off-screen at
+    // (-30000, -30000), but macOS Cocoa clamps NSWindow frame origins
+    // to keep them within the union of all attached monitors' bounds.
+    // On a single-display setup the clamp lands at (0, 0); on a
+    // multi-display setup it lands somewhere on the secondary display
+    // (e.g. (-1692, 66) on a left-extended layout). Either way the
+    // user sees the bot's Meet pre-join surface — which defeats the
+    // "invisible bot" premise.
+    //
+    // `window.hide()` calls macOS `[NSWindow orderOut:]`, which removes
+    // the window from screen + Dock without releasing its backing
+    // surface. Critically, the renderer keeps painting and CDP keeps
+    // working — so `meet_audio::inject`, `meet_video::camera_bridge`,
+    // and `meet_scanner::Input.dispatchMouseEvent` all continue to
+    // function. This is different from `.visible(false)` at builder
+    // time (which never gives the renderer a backing surface in the
+    // first place and silently breaks layout + clicks).
+    if let Err(err) = window.hide() {
+        log::warn!("[meet-call] post-build hide failed: {err}");
     }
     if let Ok(pos) = window.outer_position() {
         log::info!(
-            "[meet-call] post-build outer_position={{x:{},y:{}}} (target=-30000,-30000)",
-            pos.x, pos.y
+            "[meet-call] post-build outer_position={{x:{},y:{}}} visible={}",
+            pos.x,
+            pos.y,
+            window.is_visible().unwrap_or(true)
         );
     }
 

From e98dbd16bc37711efea93d7edbf704e0fe403574 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 13:15:09 +0530
Subject: [PATCH 06/64] fix(meet-call): hide window post-join (not post-build)
 to keep scanner working
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hiding the window at post-build time stripped CEF's renderer of its
key-window state and the meet_scanner's CDP `Input.dispatchMouseEvent`
clicks landed on un-rendered DOM, so the bot never got past the
pre-join screen.

Move the hide() call into `meet_scanner::spawn` on the Ok branch of
the join sequence — that fires after "Ask to join" has been clicked
and Meet has confirmed entry into the waiting room. By then the
renderer has done its layout, gUM has fired (so the audio + camera
bridges have taken hold), and the CDP session is in steady-state
streaming captions + speech. orderOut: just removes the window from
screen + Dock without releasing the backing surface, so all of that
keeps running while the user no longer sees the bot.

Pre-join, the window is still requested at (-30000, -30000), but macOS
clamps it onto whatever monitor it can find: (0, 0) on a single-display
setup, or the edge of a secondary monitor on multi-display setups. The
user therefore sees a flash of the bot's pre-join page for ~7 s before
the post-join hide() makes it go away. This is the best we can do
without restructuring CEF's headless-render path.
---
 app/src-tauri/src/meet_call/mod.rs    | 41 +++++++++++----------------
 app/src-tauri/src/meet_scanner/mod.rs | 27 ++++++++++++++++--
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index 87b42b68ed..24c8b080e1 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -181,34 +181,25 @@ pub async fn meet_call_open_window<R: Runtime>(
         .build()
         .map_err(|e| format!("[meet-call] WebviewWindowBuilder.build failed: {e}"))?;
 
-    // Hide the window after build so the user never sees it.
+    // Push the window off-screen post-build. macOS Cocoa clamps NSWindow
+    // frame origins to the union of all attached monitors' bounds, so
+    // (-30000, -30000) lands at (0, 0) on a single-display setup or on
+    // a secondary monitor's edge on multi-display setups. Not perfect,
+    // but the post-join hide() in `meet_scanner::run` is the primary
+    // hiding mechanism — this just keeps the brief pre-join window
+    // out of the user's main display where possible.
     //
-    // We previously relied on positioning the window off-screen at
-    // (-30000, -30000), but macOS Cocoa clamps NSWindow frame origins
-    // to keep them within the union of all attached monitors' bounds.
-    // On a single-display setup the clamp lands at (0, 0); on a
-    // multi-display setup it lands somewhere on the secondary display
-    // (e.g. (-1692, 66) on a left-extended layout). Either way the
-    // user sees the bot's Meet pre-join surface — which defeats the
-    // "invisible bot" premise.
-    //
-    // `window.hide()` calls macOS `[NSWindow orderOut:]`, which removes
-    // the window from screen + Dock without releasing its backing
-    // surface. Critically, the renderer keeps painting and CDP keeps
-    // working — so `meet_audio::inject`, `meet_video::camera_bridge`,
-    // and `meet_scanner::Input.dispatchMouseEvent` all continue to
-    // function. This is different from `.visible(false)` at builder
-    // time (which never gives the renderer a backing surface in the
-    // first place and silently breaks layout + clicks).
-    if let Err(err) = window.hide() {
-        log::warn!("[meet-call] post-build hide failed: {err}");
+    // We can't hide() here: a window built hidden never gives its
+    // renderer a backing surface, and `meet_scanner` drives the join
+    // via CDP `Input.dispatchMouseEvent` which requires laid-out DOM.
+    // Hide post-join instead.
+    if let Err(err) = window.set_position(tauri::PhysicalPosition::new(-30000i32, -30000i32)) {
+        log::warn!("[meet-call] post-build set_position failed: {err}");
     }
     if let Ok(pos) = window.outer_position() {
         log::info!(
-            "[meet-call] post-build outer_position={{x:{},y:{}}} visible={}",
-            pos.x,
-            pos.y,
-            window.is_visible().unwrap_or(true)
+            "[meet-call] post-build outer_position={{x:{},y:{}}} (target=-30000,-30000)",
+            pos.x, pos.y
         );
     }
 
@@ -414,7 +405,7 @@ pub async fn meet_call_close_window<R: Runtime>(
     Ok(false)
 }
 
-fn window_label_for(request_id: &str) -> String {
+pub fn window_label_for(request_id: &str) -> String {
     format!("meet-call-{request_id}")
 }
 
diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index 67e56f57f2..55ab6aeb6e 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -35,7 +35,7 @@
 use std::time::Duration;
 
 use serde_json::{json, Value};
-use tauri::{AppHandle, Runtime};
+use tauri::{AppHandle, Manager, Runtime};
 
 use crate::cdp::{self, CdpConn};
 
@@ -70,7 +70,7 @@ const POLL_INTERVAL: Duration = Duration::from_millis(500);
 /// the scanner uses it as a target-URL prefix so two concurrent calls
 /// each attach to their own CEF target instead of cross-controlling.
 pub fn spawn<R: Runtime>(
-    _app: AppHandle<R>,
+    app: AppHandle<R>,
     request_id: String,
     meet_url: String,
     display_name: String,
@@ -79,7 +79,28 @@ pub fn spawn<R: Runtime>(
     // JoinHandle whose abort_handle() we can return to the caller.
     let handle = tokio::spawn(async move {
         match run(&request_id, &meet_url, &display_name).await {
-            Ok(()) => log::info!("[meet-scanner] join sequence completed request_id={request_id}"),
+            Ok(()) => {
+                log::info!("[meet-scanner] join sequence completed request_id={request_id}");
+                // Now that the scanner has clicked "Ask to join" and CEF
+                // has the meeting page laid out, hide the window so the
+                // user never sees the bot's Meet UI. The renderer keeps
+                // its backing surface (orderOut: rather than release),
+                // so the audio + camera bridges and the meet-agent
+                // CDP session continue to function while the bot is
+                // off-screen.
+                let label = crate::meet_call::window_label_for(&request_id);
+                if let Some(window) = app.get_webview_window(&label) {
+                    if let Err(err) = window.hide() {
+                        log::warn!(
+                            "[meet-scanner] post-join hide failed request_id={request_id} err={err}"
+                        );
+                    } else {
+                        log::info!(
+                            "[meet-scanner] post-join hide ok request_id={request_id}"
+                        );
+                    }
+                }
+            }
             Err(err) => {
                 log::warn!("[meet-scanner] join sequence aborted request_id={request_id} err={err}")
             }

From 34bdd11db6806a47dfc7d31727c1c51e3555497b Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 13:21:33 +0530
Subject: [PATCH 07/64] fix(meet-scanner): flip camera + mic toggles ON before
 Ask to join
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Meet defaults camera + mic OFF for new participants. If the scanner
just types a name and clicks Join, the bot lands in the meeting muted
with no camera — Meet never calls getUserMedia, the audio + camera
bridges have nothing to intercept (audio_context_state stays
'not-created', camera bridge canvas is never selected as the outbound
track), and the speak_pump can't push synthesized PCM into a live
mic track because there is no live mic track.

Add a Phase 2.5 between display-name and Ask-to-join that clicks the
camera and mic toggles ON. The toggles are icon buttons with no
visible text, so the existing wait_and_click_text helper (which
matches innerText) won't find them — introduce a sibling matcher
click_by_aria_label that walks button/aria-label nodes and matches
on case-insensitive substring against a list of canonical Meet
labels ("turn on camera", "camera is off", etc).

Both clicks are best-effort: if Meet's aria copy has drifted by
region / A-B test we log and continue. The bot still joins, just
without that capability.
---
 app/src-tauri/src/meet_scanner/mod.rs | 115 ++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index 55ab6aeb6e..ad8ad1edce 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -143,6 +143,51 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     // Phase 2 — type the display name.
     type_into_named_input(&mut cdp, &session, "Your name", display_name).await?;
 
+    // Phase 2.5 — flip the camera + mic toggles ON.
+    //
+    // Meet defaults camera + mic OFF for new participants. If we click
+    // "Ask to join" without flipping them, the bot joins muted with no
+    // camera — Meet never calls getUserMedia, the audio + camera bridges
+    // have nothing to intercept, the mascot tile shows initials instead
+    // of the mascot canvas, and the speak_pump can't push synthesized
+    // PCM into a live mic track. Both toggles use `aria-label` (no
+    // visible text) so wait_and_click_text isn't enough; use a
+    // dedicated aria-label matcher.
+    //
+    // Best-effort: if Meet's aria copy has drifted (region / A-B test)
+    // we log and continue. The bot will still join, just without one or
+    // both of camera + mic.
+    if let Err(err) = click_by_aria_label(
+        &mut cdp,
+        &session,
+        &[
+            "turn on camera",
+            "camera is off",
+            "turn camera on",
+        ],
+        Duration::from_secs(4),
+    )
+    .await
+    {
+        log::info!("[meet-scanner] camera toggle ON not clicked: {err}");
+    }
+    if let Err(err) = click_by_aria_label(
+        &mut cdp,
+        &session,
+        &[
+            "turn on microphone",
+            "turn on mic",
+            "microphone is off",
+            "mic is off",
+            "turn microphone on",
+        ],
+        Duration::from_secs(4),
+    )
+    .await
+    {
+        log::info!("[meet-scanner] mic toggle ON not clicked: {err}");
+    }
+
     // Phase 3 — request to join.
     wait_and_click_text(
         &mut cdp,
@@ -155,6 +200,76 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     Ok(())
 }
 
+/// Click a button whose `aria-label` matches one of `labels`
+/// (case-insensitive substring). Meet's camera + mic toggles have no
+/// visible text — they're icon buttons with `aria-label="Turn on
+/// camera"` etc. The existing `wait_and_click_text` matches innerText
+/// only, so we need a sibling matcher anchored on aria-label.
+async fn click_by_aria_label(
+    cdp: &mut CdpConn,
+    session: &str,
+    labels: &[&str],
+    budget: Duration,
+) -> Result<(), String> {
+    let labels_js = serde_json::to_string(labels).map_err(|e| format!("labels json: {e}"))?;
+    let expression = format!(
+        r#"
+        (() => {{
+          const labels = {labels_js};
+          const want = labels.map(l => l.toLowerCase());
+          const candidates = document.querySelectorAll(
+            'button, [role="button"], [aria-label]'
+          );
+          for (const el of candidates) {{
+            if (el.disabled || el.getAttribute('aria-disabled') === 'true') continue;
+            const aria = (el.getAttribute('aria-label') || '').toLowerCase();
+            if (!aria) continue;
+            if (!want.some(w => aria.includes(w))) continue;
+            const rect = el.getBoundingClientRect();
+            if (rect.width === 0 || rect.height === 0) continue;
+            el.scrollIntoView({{ block: 'center', inline: 'center' }});
+            el.click();
+            return aria;
+          }}
+          return null;
+        }})()
+        "#
+    );
+
+    let deadline = tokio::time::Instant::now() + budget;
+    let mut last_value = Value::Null;
+    while tokio::time::Instant::now() < deadline {
+        let res = cdp
+            .call(
+                "Runtime.evaluate",
+                json!({
+                    "expression": expression,
+                    "returnByValue": true,
+                    "awaitPromise": false,
+                }),
+                Some(session),
+            )
+            .await?;
+        let value = res
+            .get("result")
+            .and_then(|r| r.get("value"))
+            .cloned()
+            .unwrap_or(Value::Null);
+        if value.is_string() {
+            log::info!(
+                "[meet-scanner] clicked aria-label matching {labels:?} aria={}",
+                value.as_str().unwrap_or("")
+            );
+            return Ok(());
+        }
+        last_value = value;
+        tokio::time::sleep(POLL_INTERVAL).await;
+    }
+    Err(format!(
+        "timeout waiting for aria-label matching {labels:?} (last={last_value})"
+    ))
+}
+
 /// Poll CEF's target list until a page whose URL starts with `meet_url`
 /// shows up, then attach a CDP session to it. Filtering by the full
 /// per-call URL prefix (rather than just the host) keeps two concurrent

From 7a72279873132fce40d310589db605b3b5d3a819 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 14:24:39 +0530
Subject: [PATCH 08/64] fix(meet-scanner): widen aria-label matchers + dump on
 toggle miss
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Camera + mic toggle clicks timed out in the latest smoke. Meet's
aria-label copy doesn't match the narrow list shipped in the previous
commit, so the bot kept joining muted with no camera — Meet never
called getUserMedia, the audio + video bridges stayed inert
(audio_context_state stuck at not-created, destination_track_count
stuck at 0), and the speak_pump pushed PCM into a stream that
doesn't exist.

Three changes:
- Broaden the matcher list to include the toggled-on variants (Meet
  sometimes ships pre-join in 'Turn off camera' state by default when
  the previous session left the toggle on), and include the
  keyboard-shortcut suffix variants ('camera (cmd+e)').
- Bump the per-toggle budget from 4 s to 12 s. Pre-join layout settles
  ~3-5 s after name input on slower CEF builds; 4 s left us racing.
- On miss, dump the matching aria-labels via a CDP Runtime.evaluate
  helper so the next smoke surfaces the actual strings Meet shipped
  this region/build, and we can extend the matcher precisely instead
  of guessing.
---
 app/src-tauri/src/meet_scanner/mod.rs | 73 ++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index ad8ad1edce..f76efdfd17 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -154,9 +154,8 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     // visible text) so wait_and_click_text isn't enough; use a
     // dedicated aria-label matcher.
     //
-    // Best-effort: if Meet's aria copy has drifted (region / A-B test)
-    // we log and continue. The bot will still join, just without one or
-    // both of camera + mic.
+    // Best-effort: if no match is found in the budget, dump the page's
+    // current aria-labels so we can extend the matcher next iteration.
     if let Err(err) = click_by_aria_label(
         &mut cdp,
         &session,
@@ -164,12 +163,16 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
             "turn on camera",
             "camera is off",
             "turn camera on",
+            "turn camera off",
+            "camera (cmd+e)",
+            "camera off",
         ],
-        Duration::from_secs(4),
+        Duration::from_secs(12),
     )
     .await
     {
-        log::info!("[meet-scanner] camera toggle ON not clicked: {err}");
+        log::warn!("[meet-scanner] camera toggle ON not clicked: {err}");
+        dump_aria_labels(&mut cdp, &session, "camera|video").await;
     }
     if let Err(err) = click_by_aria_label(
         &mut cdp,
@@ -180,12 +183,18 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
             "microphone is off",
             "mic is off",
             "turn microphone on",
+            "turn off microphone",
+            "turn off mic",
+            "microphone (cmd+d)",
+            "mic (cmd+d)",
+            "microphone off",
         ],
-        Duration::from_secs(4),
+        Duration::from_secs(12),
     )
     .await
     {
-        log::info!("[meet-scanner] mic toggle ON not clicked: {err}");
+        log::warn!("[meet-scanner] mic toggle ON not clicked: {err}");
+        dump_aria_labels(&mut cdp, &session, "mic|microphone|audio").await;
     }
 
     // Phase 3 — request to join.
@@ -200,6 +209,56 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     Ok(())
 }
 
+/// Dump the page's aria-labels that match a JS regex pattern so we can
+/// inspect what Meet actually exposes after a failed
+/// [`click_by_aria_label`]. Best-effort, swallows all CDP errors.
+async fn dump_aria_labels(cdp: &mut CdpConn, session: &str, pattern: &str) {
+    let pattern_js =
+        serde_json::to_string(pattern).unwrap_or_else(|_| "\"camera\"".to_string());
+    let expression = format!(
+        r#"
+        (() => {{
+          const re = new RegExp({pattern_js}, "i");
+          const nodes = document.querySelectorAll('[aria-label]');
+          const hits = [];
+          for (const el of nodes) {{
+            const aria = el.getAttribute('aria-label') || '';
+            if (!re.test(aria)) continue;
+            const tag = el.tagName.toLowerCase();
+            const role = el.getAttribute('role') || '';
+            const dataTip = el.getAttribute('data-tooltip') || '';
+            const rect = el.getBoundingClientRect();
+            const visible = rect.width > 0 && rect.height > 0;
+            hits.push({{ aria, tag, role, dataTip, visible }});
+            if (hits.length >= 24) break;
+          }}
+          return hits;
+        }})()
+        "#
+    );
+    let res = match cdp
+        .call(
+            "Runtime.evaluate",
+            json!({ "expression": expression, "returnByValue": true }),
+            Some(session),
+        )
+        .await
+    {
+        Ok(v) => v,
+        Err(err) => {
+            log::info!("[meet-scanner] aria-label dump failed: {err}");
+            return;
+        }
+    };
+    if let Some(arr) = res.get("result").and_then(|r| r.get("value")) {
+        log::warn!(
+            "[meet-scanner] aria-label dump pattern={} hits={}",
+            pattern,
+            arr
+        );
+    }
+}
+
 /// Click a button whose `aria-label` matches one of `labels`
 /// (case-insensitive substring). Meet's camera + mic toggles have no
 /// visible text — they're icon buttons with `aria-label="Turn on

From ea08f765ca297ddb898dba39ac1d073dab37f0ce Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 14:31:10 +0530
Subject: [PATCH 09/64] fix(meet-scanner): only click 'Turn ON' variants; never
 click 'Turn off'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Booby-trap fix. Meet's toggle aria-label describes the *action* the
click would perform — "Turn on camera" when off, "Turn off camera"
when on. My previous matcher included both directions, so when the
device was already ON the matcher hit the "Turn off" variant and
the click flipped it OFF. That's what muted the bot in the last smoke:
mic started ON (or got auto-enabled by Meet between page-load and our
scan), 'Turn off microphone' matched, we clicked, mic ended up muted.

Trim both matchers to ON-only variants. If the device is already on,
no match means we leave it alone — correct outcome. Whenever the ON
variants miss, dump aria-labels via the existing helper so we can extend.

Also drops the cmd-shortcut and bare 'off' variants — they were
either ambiguous or duplicates of the canonical 'Turn on …' /
'… is off' pair, and removing them tightens the matcher window
against future Meet copy drift.
---
 app/src-tauri/src/meet_scanner/mod.rs | 49 +++++++++++++--------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index f76efdfd17..ec14609689 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -143,35 +143,36 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     // Phase 2 — type the display name.
     type_into_named_input(&mut cdp, &session, "Your name", display_name).await?;
 
-    // Phase 2.5 — flip the camera + mic toggles ON.
+    // Phase 2.5 — ensure camera + mic are ON before Ask-to-join.
     //
-    // Meet defaults camera + mic OFF for new participants. If we click
-    // "Ask to join" without flipping them, the bot joins muted with no
-    // camera — Meet never calls getUserMedia, the audio + camera bridges
-    // have nothing to intercept, the mascot tile shows initials instead
-    // of the mascot canvas, and the speak_pump can't push synthesized
-    // PCM into a live mic track. Both toggles use `aria-label` (no
-    // visible text) so wait_and_click_text isn't enough; use a
-    // dedicated aria-label matcher.
+    // Meet pre-join shows the toggle button with aria-label that
+    // describes the *action it performs*: "Turn on camera" when the
+    // camera is currently OFF, "Turn off camera" when currently ON.
+    // We want both ON, so we MUST only match the "Turn on …" variants.
+    // Matching "Turn off …" would booby-trap us: it would click an
+    // already-on toggle, turning it OFF — which is the bug we just
+    // tripped on (mic ended up muted because "Turn off microphone"
+    // matched and the click flipped it off).
     //
-    // Best-effort: if no match is found in the budget, dump the page's
-    // current aria-labels so we can extend the matcher next iteration.
+    // If no "Turn on …" match is found, the device is already on (or
+    // the page hasn't rendered the toggle yet) — log + skip silently.
+    // On miss, dump the current aria-labels so we can verify state and
+    // extend the matcher with newly observed Meet variants.
     if let Err(err) = click_by_aria_label(
         &mut cdp,
         &session,
         &[
             "turn on camera",
-            "camera is off",
             "turn camera on",
-            "turn camera off",
-            "camera (cmd+e)",
-            "camera off",
+            "camera is off",
         ],
-        Duration::from_secs(12),
+        Duration::from_secs(8),
     )
     .await
     {
-        log::warn!("[meet-scanner] camera toggle ON not clicked: {err}");
+        log::info!(
+            "[meet-scanner] camera toggle ON not clicked (already on or label drift): {err}"
+        );
         dump_aria_labels(&mut cdp, &session, "camera|video").await;
     }
     if let Err(err) = click_by_aria_label(
@@ -179,21 +180,19 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
         &session,
         &[
             "turn on microphone",
+            "turn microphone on",
             "turn on mic",
+            "turn mic on",
             "microphone is off",
             "mic is off",
-            "turn microphone on",
-            "turn off microphone",
-            "turn off mic",
-            "microphone (cmd+d)",
-            "mic (cmd+d)",
-            "microphone off",
         ],
-        Duration::from_secs(12),
+        Duration::from_secs(8),
     )
     .await
     {
-        log::warn!("[meet-scanner] mic toggle ON not clicked: {err}");
+        log::info!(
+            "[meet-scanner] mic toggle ON not clicked (already on or label drift): {err}"
+        );
         dump_aria_labels(&mut cdp, &session, "mic|microphone|audio").await;
     }
 

From be8f61b94ceee919d966835eef31ffd8ceed7736 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 14:35:37 +0530
Subject: [PATCH 10/64] diag(meet-scanner): drop post-join hide to isolate
 pipeline regression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke shows audio_context_state stuck at 'not-created' and no
push_caption RPC after the post-join hide. Both consistent with the
hidden renderer (orderOut: under the hood) pausing its event loop —
the captions_bridge MutationObserver never fires, the audio bridge's
gUM intercept never gets a fresh getUserMedia call from Meet, and the
speak_pump pushes PCM into a destination stream that was never
attached to any outbound track.

Temporarily revert the hide to confirm the diagnosis. With the window
visible we should see audio_context_state transition to 'running' and
push_caption start firing as the user speaks the wake word. If that
holds, restore hiding via a non-orderOut mechanism (set_position to
a far-off-screen value via the runtime path, or set_size to 1x1, or
the CefBrowserHost::set_audio_muted route from the deferred follow-up
list).
---
 app/src-tauri/src/meet_scanner/mod.rs | 31 +++++++++++----------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index ec14609689..d363987f50 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -81,25 +81,18 @@ pub fn spawn<R: Runtime>(
         match run(&request_id, &meet_url, &display_name).await {
             Ok(()) => {
                 log::info!("[meet-scanner] join sequence completed request_id={request_id}");
-                // Now that the scanner has clicked "Ask to join" and CEF
-                // has the meeting page laid out, hide the window so the
-                // user never sees the bot's Meet UI. The renderer keeps
-                // its backing surface (orderOut: rather than release),
-                // so the audio + camera bridges and the meet-agent
-                // CDP session continue to function while the bot is
-                // off-screen.
-                let label = crate::meet_call::window_label_for(&request_id);
-                if let Some(window) = app.get_webview_window(&label) {
-                    if let Err(err) = window.hide() {
-                        log::warn!(
-                            "[meet-scanner] post-join hide failed request_id={request_id} err={err}"
-                        );
-                    } else {
-                        log::info!(
-                            "[meet-scanner] post-join hide ok request_id={request_id}"
-                        );
-                    }
-                }
+                // Diagnostic build: keep the window VISIBLE post-join so
+                // we can verify whether the previous `window.hide()` was
+                // suspending the renderer enough to break the audio +
+                // caption bridges. Smoke shows audio_context_state stuck
+                // at "not-created" and no push_caption RPCs ever fire
+                // after hide() — both consistent with the renderer
+                // pausing its event loop when orderOut: lands. If the
+                // pipeline works with the window visible we'll restore
+                // hide() via a different mechanism (e.g. drag off-screen
+                // via Tauri set_position rather than orderOut:).
+                let _ = app;
+                let _ = request_id;
             }
             Err(err) => {
                 log::warn!("[meet-scanner] join sequence aborted request_id={request_id} err={err}")

From 6b11c96dcf6c675e5890acd5e41bb4a3f77df5f0 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 14:49:40 +0530
Subject: [PATCH 11/64] fix(meet-agent): bare-wake (no tail) replies with
 greeting ack, not silence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the wake-word caption arrives with no tail ("Hey Openhuman" by
itself, with no question following), session.take_pending_prompt
returns None and run_caption_turn silently returns Ok(false). From
the user's side this looks identical to the bot being broken — the
wake-word fired log appears in the dev:app stdout but no audible
reply ever follows.

Treat empty-tail wake as a 'say hi back' greeting cue: synthesize
a short ack so the user gets audible proof that the
caption→wake→speak loop is wired end-to-end. Reuses the existing
pick_ack_phrase / stub_tts fallbacks so this works without backend.

Smoke now traceable in logs: 'caption turn bare-wake (no tail)' →
'caption turn start … bare_wake=true' → ack reply enqueued →
speak_pump pushes PCM. If the user STILL hears nothing after this,
the failure has moved past brain to the audio_bridge intercept
(destination_track_count stuck at 0 because Meet cached its
pre-bridge MediaStream), which is the next thing to fix.
---
 src/openhuman/meet_agent/brain.rs | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 497874c40b..c83f65a113 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -77,18 +77,31 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     // a chance to assemble before we drain the prompt.
     tokio::time::sleep(std::time::Duration::from_millis(CAPTION_TURN_DELAY_MS)).await;
 
-    let (prompt, history) = match registry().with_session(request_id, |s| {
+    // When wake fires from a bare "hey openhuman" with no tail, the
+    // session returns None from take_pending_prompt — there's nothing
+    // to feed the LLM. Previously we silently bailed (`return Ok(false)`)
+    // which made the bot look broken to the user. Treat empty-tail wake
+    // as a "say hi back" greeting cue: synthesize a short ack so the
+    // user gets audible proof that the caption→wake→speak loop is
+    // wired up end-to-end.
+    let (prompt, history, was_bare_wake) = match registry().with_session(request_id, |s| {
         let prompt = s.take_pending_prompt();
         let history = recent_dialog_history(s.events(), CONTEXT_EVENT_WINDOW);
         (prompt, history)
     })? {
-        (Some(p), h) => (p, h),
-        (None, _) => return Ok(false),
+        (Some(p), h) => (p, h, false),
+        (None, h) => {
+            log::info!(
+                "[meet-agent] caption turn bare-wake (no tail) request_id={request_id} — replying with greeting ack"
+            );
+            ("hello".to_string(), h, true)
+        }
     };
     log::info!(
-        "[meet-agent] caption turn start request_id={request_id} prompt_chars={} history_msgs={}",
+        "[meet-agent] caption turn start request_id={request_id} prompt_chars={} history_msgs={} bare_wake={}",
         prompt.chars().count(),
         history.len(),
+        was_bare_wake,
     );
 
     // Real LLM call. The model gets the rolling caption history plus

From 2152db9ee3e2f187a64af49963b84a024cf8e54d Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 14:58:41 +0530
Subject: [PATCH 12/64] feat(meet-scanner): scanner-side admission watch +
 captions force-enable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

captions_bridge.js auto-enables CC by polling every 2s for a button
whose aria-label starts with 'turn on captions' (indexOf === 0). Two
weaknesses surfaced in smoke:

1. Meet ships variants like 'Turn on captions (c)' in some regions —
   the keyboard-shortcut parenthesis breaks the strict prefix match.
2. The polling cap (30 attempts * 2s = 60s) can expire before a slow
   host admits the bot from the waiting room.

Add a Phase 4 to the Rust scanner: after clicking Ask-to-join, poll
the in-call control bar for a 'Leave call' / 'End call' affordance —
that's the cleanest signal the bot got admitted. Once admitted, click
the captions toggle from the scanner side using the existing
click_by_aria_label substring matcher, which is looser than the JS
prefix matcher and handles the cmd-shortcut variant.

Belt-and-suspenders: if either step times out, log and continue. The
brain just sees no captions for that session — no worse than the
pre-patch state. Admission budget is 120s to give the host plenty
of time before we give up; both this loop and the captions_bridge
poll run in parallel so whichever notices the CC button first wins.
---
 app/src-tauri/src/meet_scanner/mod.rs | 81 +++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index d363987f50..99959038b4 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -198,9 +198,90 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     )
     .await?;
 
+    // Phase 4 — once the bot is admitted, force-enable captions.
+    //
+    // captions_bridge.js already polls every 2 s for a button whose
+    // aria-label STARTS with "turn on captions" (`indexOf(...) === 0`).
+    // That's brittle: Meet ships "Turn on captions (c)" in some regions
+    // (the parenthesised shortcut breaks the `=== 0` prefix-match), and
+    // the polling cap (30 attempts * 2 s = 60 s) can expire before a
+    // slow host admits the bot. Belt-and-suspenders: from the scanner
+    // side, wait for admission (the "Leave call" affordance) then click
+    // the captions toggle ourselves via the looser substring matcher.
+    //
+    // Best-effort: if any step times out, log + continue. The brain
+    // will simply not see captions for this session, which is no worse
+    // than the pre-fix state.
+    if let Err(err) = wait_for_admission(&mut cdp, &session).await {
+        log::info!("[meet-scanner] admission wait skipped: {err}");
+    } else {
+        log::info!("[meet-scanner] bot admitted into meeting");
+        if let Err(err) = click_by_aria_label(
+            &mut cdp,
+            &session,
+            &[
+                "turn on captions",
+                "captions on",
+                "captions (c)",
+                "turn on live captions",
+                "show captions",
+            ],
+            Duration::from_secs(8),
+        )
+        .await
+        {
+            log::info!("[meet-scanner] captions toggle ON not clicked: {err}");
+            dump_aria_labels(&mut cdp, &session, "caption").await;
+        }
+    }
+
     Ok(())
 }
 
+/// Wait until the meeting page renders the in-call control bar — the
+/// signal that the host has admitted the bot from the waiting room.
+/// The "Leave call" / "End call" button is the simplest stable anchor;
+/// the captions and "more options" buttons exist in pre-join too.
+async fn wait_for_admission(cdp: &mut CdpConn, session: &str) -> Result<(), String> {
+    const ADMISSION_BUDGET: Duration = Duration::from_secs(120);
+    let expression = r#"
+        (() => {
+          const all = document.querySelectorAll('button[aria-label]');
+          for (const el of all) {
+            const a = (el.getAttribute('aria-label') || '').toLowerCase();
+            if (a.includes('leave call') || a.includes('end call')) {
+              const rect = el.getBoundingClientRect();
+              if (rect.width > 0 && rect.height > 0) return true;
+            }
+          }
+          return false;
+        })()
+    "#;
+    let deadline = tokio::time::Instant::now() + ADMISSION_BUDGET;
+    while tokio::time::Instant::now() < deadline {
+        let res = cdp
+            .call(
+                "Runtime.evaluate",
+                json!({ "expression": expression, "returnByValue": true }),
+                Some(session),
+            )
+            .await?;
+        let admitted = res
+            .get("result")
+            .and_then(|r| r.get("value"))
+            .and_then(|v| v.as_bool())
+            .unwrap_or(false);
+        if admitted {
+            return Ok(());
+        }
+        tokio::time::sleep(Duration::from_secs(1)).await;
+    }
+    Err(format!(
+        "timeout ({}s) waiting for Leave-call affordance",
+        ADMISSION_BUDGET.as_secs()
+    ))
+}
+
 /// Dump the page's aria-labels that match a JS regex pattern so we can
 /// inspect what Meet actually exposes after a failed
 /// [`click_by_aria_label`]. Best-effort, swallows all CDP errors.

From 309301ae813791b76007f35fdd7402a3e37f04af Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 15:03:23 +0530
Subject: [PATCH 13/64] diag(meet-agent): log every push_caption text +
 wake_fired outcome
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Captions are flowing into the rpc handler (7 push_captions in ~10s
in the latest smoke) but no 'wake word fired' lines show up. Two
candidates:
  (a) user said something that does not contain 'hey openhuman' in
      Meet's normalised caption text — even after normalize_for_wake
      strips punctuation
  (b) normalisation is dropping/altering the match string before
      session.note_caption searches it

Log every push_caption's text + wake_fired so the next smoke shows
the exact string Meet's STT produced and whether the matcher fired.
Truncated to 120 chars so a long caption doesn't blow up the log line.
Captions are already on the wire to every meeting participant, so
no new exposure surface here.
---
 src/openhuman/meet_agent/rpc.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index 31c36c833f..dc3784f3f2 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -83,9 +83,22 @@ pub async fn handle_push_caption(params: Map<String, Value>) -> Result<Value, St
     let req: PushCaptionRequest = serde_json::from_value(Value::Object(params))
         .map_err(|e| format!("{LOG_PREFIX} invalid push_caption params: {e}"))?;
 
+    // Diagnostic: log the caption text + match outcome so we can tell
+    // from the dev:app stdout exactly what the wake-word matcher saw.
+    // Truncate to 120 chars to avoid blowing up the log line. This is
+    // safe to leave on for now — captions are already broadcast to all
+    // participants in the meeting; nothing here that isn't on the wire.
+    let preview: String = req.text.chars().take(120).collect();
     let wake_fired = registry().with_session(&req.request_id, |s| {
         s.note_caption(&req.speaker, &req.text, req.ts_ms)
     })?;
+    log::info!(
+        "{LOG_PREFIX} push_caption request_id={} speaker={} text=\"{}\" wake_fired={}",
+        req.request_id,
+        req.speaker,
+        preview,
+        wake_fired,
+    );
 
     if wake_fired {
         log::info!(

From 62014ab58b5e1964a3971d07d626778660bfce3e Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 15:41:18 +0530
Subject: [PATCH 14/64] =?UTF-8?q?feat(meet-scanner):=20cycle=20mic=20off?=
 =?UTF-8?q?=E2=86=92on=20after=20Ask-to-join=20to=20force=20fresh=20gUM?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke shows the full caption→wake→brain→TTS→speak_pump pipeline
fires end-to-end (caption_turn_done reply_chars=12 synth_samples=3200)
but the host hears nothing. Root cause: audio_bridge.js's
getUserMedia intercept never fires — Meet caches its initial mic
MediaStream from page load (before our bridges installed) and reuses
it across the bridge-driven reload, so the bot's outbound mic track
keeps pointing at the real OS microphone (MacBook Pro Microphone per
the aria-label dump). The synthesised PCM that speak_pump pushes ends
up in a MediaStreamDestination that's never attached to anything Meet
broadcasts.

Add a Phase 3.5 right after Ask-to-join: click 'Turn off microphone',
pause ~700 ms for React to settle, then click 'Turn on microphone'.
The second click triggers Meet to re-request its mic via getUserMedia,
which our bridge now intercepts and replaces with the synthesised
destination stream — destination_track_count flips from 0 → 1 and
the bot's outbound mic becomes the brain's TTS pump output.

Camera off-on cycle deliberately not added: the fake-camera Y4M flag
already feeds Meet a one-frame mascot via Chromium's process-level
fake-video-capture path, so the bot's tile shows the mascot already.
The video animation upgrade lives in the separate MascotFrameProducer
encode-bottleneck follow-up.
---
 app/src-tauri/src/meet_scanner/mod.rs | 47 +++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index 99959038b4..a9e1260746 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -198,6 +198,53 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     )
     .await?;
 
+    // Phase 3.5 — force a fresh getUserMedia call by cycling mic off-on.
+    //
+    // Meet caches the camera + mic MediaStreams from initial page load
+    // (before meet_audio::inject reloaded with our bridges). Our gUM
+    // intercept in audio_bridge.js only fires on NEW gUM calls, so the
+    // cached streams keep flowing — the bot's mic stays the real OS
+    // microphone, the bot's camera stays the static fake-camera Y4M
+    // frame, and our speak_pump pushes synthesized PCM into a
+    // MediaStreamDestination that's never attached to any outbound
+    // track. Host hears the user (echo loop) instead of the bot.
+    //
+    // Click "Turn off microphone" → ~700 ms pause for React to settle →
+    // click whatever aria-label appears in its place ("Turn on
+    // microphone" or a variant). The second click triggers Meet to
+    // re-request via getUserMedia, which our bridge then intercepts.
+    if let Err(err) = click_by_aria_label(
+        &mut cdp,
+        &session,
+        &["turn off microphone", "turn microphone off", "turn off mic"],
+        Duration::from_secs(4),
+    )
+    .await
+    {
+        log::info!("[meet-scanner] mic off-cycle skipped: {err}");
+    } else {
+        log::info!("[meet-scanner] mic cycled off; pausing 700ms before re-arm");
+        tokio::time::sleep(Duration::from_millis(700)).await;
+        if let Err(err) = click_by_aria_label(
+            &mut cdp,
+            &session,
+            &[
+                "turn on microphone",
+                "turn microphone on",
+                "turn on mic",
+                "turn mic on",
+            ],
+            Duration::from_secs(6),
+        )
+        .await
+        {
+            log::warn!("[meet-scanner] mic on-cycle missed (left muted!): {err}");
+            dump_aria_labels(&mut cdp, &session, "mic|microphone").await;
+        } else {
+            log::info!("[meet-scanner] mic re-armed (gUM intercept should now fire)");
+        }
+    }
+
     // Phase 4 — once the bot is admitted, force-enable captions.
     //
     // captions_bridge.js already polls every 2 s for a button whose

From 356f27c10a5050328484d06908ab0ab6fd3ee938 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 16:25:05 +0530
Subject: [PATCH 15/64] =?UTF-8?q?fix(meet-agent):=20widen=20wake=20matcher?=
 =?UTF-8?q?=20=E2=80=94=20accept=20'hi/hello=20openhuman'=20+=20bare=20'op?=
 =?UTF-8?q?enhuman'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke caption 'I, Hi Openhuman.' did not fire the wake word because
the previous matcher only knew 'hey openhuman' / 'hey open human'.
Meet's STT also routinely drops the 'hey' prefix, splits the brand
into 'Open Human' (two words), or substitutes 'Hi'/'Hello'.

Expand the matcher to a small ordered list — greeting-prefixed
variants are checked before the bare brand so the tail offset is
calculated against the full matched phrase length, not a fixed
wake-prefix length:

  hey open human, hi open human, hello open human,
  hey openhuman,  hi openhuman,  hello openhuman,
  open human, openhuman

Bare 'openhuman' is in the list because Meet's STT will sometimes
drop both the greeting AND the space — leaving the brand alone in
the caption. Risk of false-positives is low: 'openhuman' isn't a
common English token, and 'open human' as a 2-word collocation is
almost only ever the brand spoken aloud.
---
 src/openhuman/meet_agent/session.rs | 33 +++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 6e60c649e0..a3e946e8b6 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -147,15 +147,30 @@ impl MeetAgentSession {
         // and the transcript log; the user's punctuation isn't load-
         // bearing for note-taking.
         let normalized = normalize_for_wake(text);
-        let wake_idx = normalized
-            .find("hey openhuman")
-            .or_else(|| normalized.find("hey open human"));
-        if let Some(idx) = wake_idx {
-            let after = if normalized[idx..].starts_with("hey openhuman") {
-                idx + "hey openhuman".len()
-            } else {
-                idx + "hey open human".len()
-            };
+        // Accept any of the canonical wake phrases. Meet's STT mangles
+        // the brand ("Hi Openhuman", "Open Human", dropped prefix) so
+        // we match a small set rather than a single rigid prefix.
+        // Greeting-prefixed variants come before the bare brand so the
+        // tail offset is calculated against the full matched phrase.
+        const WAKE_PHRASES: &[&str] = &[
+            "hey open human",
+            "hi open human",
+            "hello open human",
+            "hey openhuman",
+            "hi openhuman",
+            "hello openhuman",
+            "open human",
+            "openhuman",
+        ];
+        let mut wake_hit: Option<(usize, &'static str)> = None;
+        for phrase in WAKE_PHRASES {
+            if let Some(idx) = normalized.find(phrase) {
+                wake_hit = Some((idx, phrase));
+                break;
+            }
+        }
+        if let Some((idx, phrase)) = wake_hit {
+            let after = idx + phrase.len();
             let tail = normalized.get(after..).unwrap_or("").trim().to_string();
             self.pending_prompt = tail;
             self.wake_active = true;

From ea6157512f400516e6618c562151c8cf2949457f Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 16:29:49 +0530
Subject: [PATCH 16/64] =?UTF-8?q?fix(meet-scanner):=20cycle=20mic=20off?=
 =?UTF-8?q?=E2=86=92on=20BEFORE=20Ask-to-join,=20not=20after?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Latest smoke aborted at the Ask-to-join click (Meet UI variant; bot
got admitted manually) and the post-join mic-cycle never ran — the
flow returns Err and any later phase is skipped. Bot ended up
broadcasting the real OS mic.

Move Phase 3.5 → Phase 2.6: cycle the mic right after the name input,
before clicking Ask-to-join. The cycle is best-effort either way, but
this site is more reliable:

- Pre-join is when Meet's React happily re-acquires media on toggle —
  in-call cycling can race the join handshake.
- The mic cycle now runs even when Ask-to-join itself times out, so a
  manual join from the host still leaves the bot with the gUM
  intercept armed.
- The Ask-to-join click itself is unchanged (it still `?`-propagates
  Err so the caller knows the scanner gave up driving the page), but
  the gUM bootstrap is no longer gated on it.
---
 app/src-tauri/src/meet_scanner/mod.rs | 28 +++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index a9e1260746..536a840f53 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -189,16 +189,15 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
         dump_aria_labels(&mut cdp, &session, "mic|microphone|audio").await;
     }
 
-    // Phase 3 — request to join.
-    wait_and_click_text(
-        &mut cdp,
-        &session,
-        &["Ask to join", "Join now"],
-        JOIN_BUTTON_BUDGET,
-    )
-    .await?;
-
-    // Phase 3.5 — force a fresh getUserMedia call by cycling mic off-on.
+    // Phase 2.6 — force a fresh getUserMedia call by cycling mic off-on
+    // BEFORE Ask-to-join.
+    //
+    // Why before, not after: if Ask-to-join times out (Meet UI variant
+    // drift or already-joined-elsewhere) the scanner returns Err and
+    // any later phases never run. Cycling here means the gUM intercept
+    // gets its chance regardless of what happens at the join button —
+    // and pre-join is also when Meet's React happily re-acquires media
+    // on toggle, so this is the more reliable site anyway.
     //
     // Meet caches the camera + mic MediaStreams from initial page load
     // (before meet_audio::inject reloaded with our bridges). Our gUM
@@ -245,6 +244,15 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
         }
     }
 
+    // Phase 3 — request to join.
+    wait_and_click_text(
+        &mut cdp,
+        &session,
+        &["Ask to join", "Join now"],
+        JOIN_BUTTON_BUDGET,
+    )
+    .await?;
+
     // Phase 4 — once the bot is admitted, force-enable captions.
     //
     // captions_bridge.js already polls every 2 s for a button whose

From b4897bd1706265506ff50e2ff22d71f58df77565 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 21:32:48 +0530
Subject: [PATCH 17/64] fix(meet-scanner): clear browser cookies pre-join to
 drop leaked Google session
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke against the deployed staging backend hit a new failure:
the bot CEF webview landed on Google's 'Verify it's you' page for the
user's own email (nikhil@tinyhumans.ai) instead of the anonymous
'Your name' pre-join input the scanner drives. The vendored tauri-cef
runtime does not yet honour our per-request_id `data_directory` as a
fresh CEF RequestContext — webviews effectively share the parent
process's cookie + cache store, and Meet recognises the signed-in
Google account on the user's main openhuman session.

Add a Phase 0 in meet_scanner::run that:
- enables the Network CDP domain
- calls Network.clearBrowserCookies on the meet target
- calls Network.clearBrowserCache too (belt-and-suspenders)
- Page.reload with ignoreCache=true so Meet's React state re-fetches
  from a clean slate
- 1500ms sleep to let the reloaded page settle before scanner phases
  start poking the DOM

These CDP commands are scoped to the attached browser instance, so
they wipe the session for THIS Meet target without touching the
user's main openhuman webviews (those run in separate browser
instances). Best-effort — if Network isn't reachable we log and
continue. The proper fix is a per-RequestContext CEF profile in
the vendored runtime; that lives in the deferred follow-up.
---
 app/src-tauri/src/meet_scanner/mod.rs | 49 +++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index 536a840f53..f7b94bf487 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -113,6 +113,55 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     let _ = cdp.call("Page.enable", json!({}), Some(&session)).await;
     let _ = cdp.call("Runtime.enable", json!({}), Some(&session)).await;
 
+    // Phase 0 — strip any leaked Google session cookies/cache before
+    // we touch the page. The vendored tauri-cef runtime does not yet
+    // honour our per-request_id `data_directory` as a fresh CEF
+    // RequestContext — webviews end up sharing the parent process's
+    // cookie + cache store. Without this clear, Meet recognises the
+    // signed-in Google account on the user's main openhuman session
+    // ("nikhil@tinyhumans.ai" / "Verify it's you" screen) and the bot
+    // never reaches the anonymous "Your name" pre-join input we drive
+    // in Phase 2.
+    //
+    // `Network.clearBrowserCookies` + `Network.clearBrowserCache` are
+    // CDP-wide for the attached browser instance, so they wipe the
+    // session for THIS Meet target without touching the user's main
+    // openhuman webviews (those run in separate browser instances).
+    // Best-effort: if Network domain isn't enabled or CDP returns an
+    // error, we log and continue — the bot may still land on the
+    // verify screen but won't get worse than the pre-clear state.
+    let _ = cdp
+        .call("Network.enable", json!({}), Some(&session))
+        .await;
+    if let Err(err) = cdp
+        .call("Network.clearBrowserCookies", json!({}), Some(&session))
+        .await
+    {
+        log::warn!("[meet-scanner] clearBrowserCookies failed: {err}");
+    } else {
+        log::info!("[meet-scanner] cleared browser cookies for fresh anonymous session");
+    }
+    if let Err(err) = cdp
+        .call("Network.clearBrowserCache", json!({}), Some(&session))
+        .await
+    {
+        log::info!("[meet-scanner] clearBrowserCache skipped: {err}");
+    }
+    // Reload the page once so Meet re-fetches from scratch without the
+    // user's Google session cookies. Without the reload, Meet's React
+    // state still holds the post-auth view; we'd be clicking buttons
+    // on a stale page.
+    if let Err(err) = cdp
+        .call("Page.reload", json!({"ignoreCache": true}), Some(&session))
+        .await
+    {
+        log::warn!("[meet-scanner] post-cookie-clear reload failed: {err}");
+    }
+    // Give the reloaded page a moment to settle before scanner phases
+    // start poking the DOM. 1.5s is comfortably above Meet's typical
+    // first-paint on CEF + leaves headroom for slow CI runners.
+    tokio::time::sleep(Duration::from_millis(1500)).await;
+
     // Phase 1 — dismiss the device-check screen.
     //
     // Meet's exact copy varies by region/A-B test; we try the canonical

From 576173c52c9e16d1e59ec5f432b4afa4971b80be Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 21:41:57 +0530
Subject: [PATCH 18/64] fix(meet-agent): filter 'You'-speaker noise, strip
 <think> blocks, interrupt on new turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three deep gaps surfaced once the staging backend was online and
real LLM + ElevenLabs were producing 60+ second replies:

1. Echo / noise loop. Meet labels its placeholder + accessibility
   strings under speaker='You' (the local participant tag), which
   includes a multi-paragraph 'sample caption' demo string staging's
   captioning UI emits every 250ms. Each scrape re-fired the wake
   word ('openhuman' literal lives inside that demo string) and the
   bot kept replying to its own broadcast. note_caption now drops
   captions where speaker.lowercase() == 'you' (or empty).

2. Bot was speaking its own chain-of-thought. The reasoning models
   on staging emit a <think>...</think> block ahead of the actual
   user-facing reply; strip_for_speech happily passed it through to
   TTS, producing a minute of internal monologue. Strip the think
   blocks before any other markdown clean-up. Unclosed <think> at
   end of output drops everything from the tag onwards.

3. Bot wouldn't stop talking. speak_pump just drains whatever is
   queued — if a new wake fires while the previous reply is still
   playing, the old PCM finishes BEFORE the new reply starts.
   run_caption_turn now calls session.cancel_outbound() at start,
   which clears the outbound buffer and flips outbound_done so the
   page bridge sees end-of-utterance cleanly. Bot becomes
   interruptible — user can re-fire the wake word and the previous
   reply is cut short.
---
 src/openhuman/meet_agent/brain.rs   | 41 +++++++++++++++++++++++++++++
 src/openhuman/meet_agent/session.rs | 26 ++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index c83f65a113..58041df2b5 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -84,7 +84,14 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     // as a "say hi back" greeting cue: synthesize a short ack so the
     // user gets audible proof that the caption→wake→speak loop is
     // wired up end-to-end.
+    //
+    // Also: drop any queued outbound PCM from the previous turn.
+    // Reasoning-model replies can run 60+ seconds; if the user re-fires
+    // the wake mid-reply we need to stop the old speech rather than
+    // play the entire backlog before the new reply starts. This makes
+    // the bot interruptible from the user's side.
     let (prompt, history, was_bare_wake) = match registry().with_session(request_id, |s| {
+        s.cancel_outbound();
         let prompt = s.take_pending_prompt();
         let history = recent_dialog_history(s.events(), CONTEXT_EVENT_WINDOW);
         (prompt, history)
@@ -395,6 +402,40 @@ async fn llm_meeting(prompt: &str, history: &[ConversationTurn]) -> Result<Strin
 /// code, leading bullets). Keep punctuation that affects prosody
 /// (commas, periods, question marks) intact.
 fn strip_for_speech(text: &str) -> String {
+    // Strip reasoning-model <think>...</think> blocks before we strip
+    // markdown. DeepSeek / GMI / qwen-style reasoning models emit
+    // their internal chain-of-thought wrapped in <think>...</think>
+    // tags ahead of the user-facing reply. Without this, TTS reads
+    // the entire monologue aloud — which on a 60s+ reasoning trace
+    // produces a minute of bot speech the user never asked for.
+    // Multiple non-overlapping blocks are stripped in sequence; an
+    // unclosed <think> at the end (truncated output) drops everything
+    // from the tag onwards.
+    let mut cleaned = String::with_capacity(text.len());
+    let mut rest = text;
+    loop {
+        match rest.find("<think>") {
+            Some(open) => {
+                cleaned.push_str(&rest[..open]);
+                let after = &rest[open + "<think>".len()..];
+                match after.find("</think>") {
+                    Some(close) => {
+                        rest = &after[close + "</think>".len()..];
+                    }
+                    None => {
+                        // Unclosed tag → drop the rest as reasoning.
+                        break;
+                    }
+                }
+            }
+            None => {
+                cleaned.push_str(rest);
+                break;
+            }
+        }
+    }
+    let text = cleaned.trim();
+
     let mut out = String::with_capacity(text.len());
     let mut in_code = false;
     for line in text.lines() {
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index a3e946e8b6..d551b82f88 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -108,6 +108,18 @@ impl MeetAgentSession {
         if text.trim().is_empty() {
             return false;
         }
+        // Drop noise captions from Meet's local-user / UI affordances.
+        // `speaker=="You"` is Meet's label for the local participant
+        // (the bot itself when its outbound is the user-facing tile),
+        // plus a catch-all for placeholder / demo / accessibility
+        // strings that some Meet variants surface inside the caption
+        // region. Without this filter the bot's own TTS would loop
+        // back as a "user spoke" prompt and re-fire the wake word,
+        // eating the prompt budget and producing endless speech.
+        let speaker_lower = speaker.trim().to_lowercase();
+        if speaker_lower == "you" || speaker_lower.is_empty() {
+            return false;
+        }
         self.last_caption_ts_ms = ts_ms;
         // Already collecting after a previous wake word: just append
         // the new caption. No second fire — the brain is already
@@ -257,6 +269,20 @@ impl MeetAgentSession {
         }
     }
 
+    /// Drop everything queued for playback. The brain calls this at
+    /// the start of a new caption turn so the bot stops mid-sentence
+    /// instead of letting the previous reply play to completion while
+    /// the user is already speaking again. Marks the outbound channel
+    /// as 'done' so the speak_pump signals end-of-utterance on its
+    /// next poll and the page bridge can reset its audio-bridge state
+    /// cleanly.
+    pub fn cancel_outbound(&mut self) {
+        if !self.outbound.is_empty() {
+            self.outbound.clear();
+        }
+        self.outbound_done = true;
+    }
+
     /// Drain everything currently queued for the shell. Returns
     /// `(pcm_base64, utterance_done)`.
     pub fn poll_outbound(&mut self) -> (String, bool) {

From 90552410a0e8675b8e486e1512b1ba8de31a7ef6 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 23:46:01 +0530
Subject: [PATCH 19/64] =?UTF-8?q?fix(meet-agent):=20break=20echo=20loop=20?=
 =?UTF-8?q?=E2=80=94=20speaking=20gate,=20caption=20dedup,=20TTS=20cap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three guards stack to make the bot loop-proof when running with a real
LLM that produces 30s+ replies on staging:

1. Speaking gate. session.note_caption refuses to fire a fresh wake
   while the outbound TTS queue still has audio. Without this, the user
   continuing to speak (or Meet captioning the bot's own voice) during
   a long reply lands a second wake, brain cancels the first and
   starts a new turn — repeated forever. Captions are still recorded to
   the transcript log with a "(suppressed: bot speaking)" tag so we keep
   the diagnostic trail.

2. Server-side caption dedup. Meet's CC region re-renders the same
   line every 250 ms poll tick, and the page-side lastBySpeaker
   dedup keys on a speaker guess that flips for the same row when
   the avatar marker comes and goes. Defensive (speaker, text)
   signature on the session drops verbatim repeats before they hit
   the wake matcher or the RPC log.

3. TTS char cap. Reasoning models on staging routinely emit 800+
   char replies despite REPLY_MAX_TOKENS=220 (token budget is per
   the user-facing text, not the <think> trace). New cap_for_speech
   trims to 400 chars at the last sentence terminator inside the
   budget; falls back to a hard cut + ellipsis. ~25s of speech at
   average prosody — short enough to stay interruptible.

Together these break the speak-listen-speak loop the user hit on the
"Hey Openhuman, can you hear me?" round trip.
---
 src/openhuman/meet_agent/brain.rs   | 30 +++++++++++++++++++-
 src/openhuman/meet_agent/session.rs | 43 +++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 58041df2b5..e15a005f01 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -51,6 +51,14 @@ const REPLY_MAX_TOKENS: u32 = 220;
 /// (`eleven_monolingual_v1`) sounds noticeably flatter.
 const TTS_MODEL_ID: &str = "eleven_turbo_v2_5";
 
+/// Hard ceiling on reply characters fed to TTS. The LLM is asked to be
+/// concise but reasoning models still emit 800+ char paragraphs. The cap
+/// drops everything past the last sentence terminator at-or-before
+/// this index, falling back to a raw char cut when no boundary fits.
+/// ~25s of speech at average prosody — keeps the bot interruptible
+/// and prevents the "60s monologue / can't talk over it" loop.
+const MAX_TTS_CHARS: usize = 400;
+
 /// Minimum samples below which we skip the brain turn entirely.
 /// 250 ms @ 16 kHz — under this, VAD almost certainly fired on a
 /// transient (cough, click) rather than real speech.
@@ -461,7 +469,27 @@ fn strip_for_speech(text: &str) -> String {
         }
         out.push_str(&cleaned);
     }
-    out.trim().to_string()
+    let trimmed = out.trim().to_string();
+    cap_for_speech(&trimmed, MAX_TTS_CHARS)
+}
+
+/// Truncate `text` to at most `max_chars` characters, preferring to
+/// cut at the last sentence terminator (`.`, `!`, `?`) inside the
+/// budget so the TTS doesn't trail off mid-clause. Falls back to a
+/// hard char cut + ellipsis when no terminator fits.
+fn cap_for_speech(text: &str, max_chars: usize) -> String {
+    let total = text.chars().count();
+    if total <= max_chars {
+        return text.to_string();
+    }
+    let prefix: String = text.chars().take(max_chars).collect();
+    if let Some(idx) = prefix.rfind(['.', '!', '?']) {
+        let end = idx + prefix[idx..].chars().next().map(char::len_utf8).unwrap_or(1);
+        return prefix[..end].trim_end().to_string();
+    }
+    let mut out = prefix.trim_end().to_string();
+    out.push('…');
+    out
 }
 
 /// One rolling-history entry handed to the LLM.
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index d551b82f88..7b0b178c25 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -72,6 +72,14 @@ pub struct MeetAgentSession {
     /// text — a single character growth re-queues the line). Without
     /// this gate the brain spam-fires on every caption growth.
     wake_cooldown_until_ts_ms: u64,
+    /// Last `(speaker, text)` pair forwarded to the wake-word matcher.
+    /// Drops verbatim repeats produced by Meet's caption observer
+    /// re-emitting the same line every poll tick — the page-side
+    /// `lastBySpeaker` dedupe is per-key, and Meet sometimes flips
+    /// the key (speaker name appears / disappears for the same row)
+    /// so identical text still reaches us. Server-side defence keeps
+    /// the log clean and stops spurious wake retries.
+    last_caption_signature: Option<String>,
 }
 
 impl MeetAgentSession {
@@ -92,9 +100,20 @@ impl MeetAgentSession {
             wake_active: false,
             last_caption_ts_ms: 0,
             wake_cooldown_until_ts_ms: 0,
+            last_caption_signature: None,
         }
     }
 
+    /// True when the brain has TTS audio queued for playback. The
+    /// note_caption gate uses this to refuse wake matches while the
+    /// bot is actively speaking — otherwise Meet captions the bot's
+    /// own voice (or the user keeps talking through the reply) and
+    /// fires a fresh turn before the current one finishes, producing
+    /// an unbreakable speak-listen-speak loop.
+    pub fn is_speaking(&self) -> bool {
+        !self.outbound.is_empty()
+    }
+
     /// Caption-driven listen path. Returns `true` when this caption
     /// just tripped the wake word (caller should kick a turn).
     ///
@@ -120,6 +139,30 @@ impl MeetAgentSession {
         if speaker_lower == "you" || speaker_lower.is_empty() {
             return false;
         }
+        // Server-side dedup. Meet's CC region re-renders the same line
+        // every 250 ms poll tick for the duration of an utterance, and
+        // the page-side `lastBySpeaker` dedup keys on a speaker guess
+        // that flips for the same row. Without this, the wake-word
+        // matcher (and the RPC log) sees N copies of every caption.
+        let signature = format!("{speaker_lower}\u{1F}{}", text.trim());
+        if self.last_caption_signature.as_deref() == Some(signature.as_str()) {
+            return false;
+        }
+        self.last_caption_signature = Some(signature);
+        // Gate: if the bot is currently speaking (queued TTS audio),
+        // refuse to fire a new wake. The user's voice + the bot's
+        // voice can both show up as captions, and a reply that runs
+        // 30–60 s will collide with continued user speech every time.
+        // Without this, the bot speaks-listens-speaks in a loop until
+        // someone closes the call. New captions still record to the
+        // transcript log for context but cannot trigger another turn.
+        if self.is_speaking() {
+            self.record_event(
+                SessionEventKind::Heard,
+                format!("{speaker}: {text} (suppressed: bot speaking)"),
+            );
+            return false;
+        }
         self.last_caption_ts_ms = ts_ms;
         // Already collecting after a previous wake word: just append
         // the new caption. No second fire — the brain is already

From 96346a86b31ad9fbdc96b10659516b459ae0cb05 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Fri, 22 May 2026 23:53:34 +0530
Subject: [PATCH 20/64] =?UTF-8?q?fix(meet-agent):=20tighten=20reply=20leng?=
 =?UTF-8?q?th=20=E2=80=94=2080-token=20cap=20+=20strict=20spoken-mode=20pr?=
 =?UTF-8?q?ompt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous prompt asked for "1-2 sentences" but reasoning-style backends
(DeepSeek / GMI / qwen flavours routed under model="agentic-v1") routinely
ignored soft length hints and emitted 800+ char monologues. cap_for_speech
trimmed them at 400 chars but the TTS still ran 25s per turn — long enough
that the user couldn't get a word in edgewise.

Two changes:

1. REPLY_MAX_TOKENS 220 → 80. ~60 spoken words ≈ ~12s of audio. Hard ceiling
   regardless of model verbosity.

2. MEETING_SYSTEM_PROMPT rewritten as strict numbered rules — "ONE sentence,
   max 25 spoken words, no chain-of-thought, no <think> blocks, plain spoken
   English". Address-detection and dictation rules preserved but condensed.

Combined with cap_for_speech(400) and the speaking gate, the bot now produces
one short answer per wake instead of a minute-long reply that locks the
loop open.

Real second-brain (tools+memory+calendar via Agent::from_config_for_agent)
is the next commit per the approved plan.
---
 src/openhuman/meet_agent/brain.rs | 51 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index e15a005f01..e13cf58081 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -42,10 +42,12 @@ use super::wav;
 /// captioned dialogue — enough for the model to follow a thread without
 /// blowing the prompt budget.
 const CONTEXT_EVENT_WINDOW: usize = 12;
-/// Spoken-reply ceiling. Each token is roughly ¾ of a word, so 220
-/// tokens ≈ 30 seconds of speech — long enough for a real answer, short
-/// enough that the model can't hijack the meeting.
-const REPLY_MAX_TOKENS: u32 = 220;
+/// Spoken-reply ceiling. Each token is roughly ¾ of a word, so 80
+/// tokens ≈ ~60 spoken words ≈ ~12 seconds. The system prompt asks for
+/// one short sentence, but reasoning-style backends ignore soft length
+/// hints and emit 800+ char monologues. Hard token cap keeps the bot
+/// interruptible regardless of model behaviour.
+const REPLY_MAX_TOKENS: u32 = 80;
 /// ElevenLabs model. `eleven_turbo_v2_5` strikes the best
 /// quality/latency balance; the older default the backend would pick
 /// (`eleven_monolingual_v1`) sounds noticeably flatter.
@@ -330,32 +332,27 @@ async fn stt(samples: &[i16]) -> Result<String, String> {
 /// at it (intent classification — emit empty string when not), and
 /// (b) responding conversationally and concisely when it is.
 const MEETING_SYSTEM_PROMPT: &str = "\
-You are OpenHuman, an AI assistant joining a live Google Meet call as a participant. \
-The meeting transcript is provided as prior turns where `user` lines are captions \
-spoken by humans on the call (sometimes prefixed with their name) and `assistant` \
-lines are things you previously said out loud. The latest `user` message is the \
-utterance you are deciding how to respond to.\n\
+You are OpenHuman, joining a live Google Meet call by voice. Every word you \
+produce will be spoken aloud over the call. The transcript shows `user` lines \
+(humans on the call, sometimes prefixed with a name) and `assistant` lines \
+(things you previously said out loud).\n\
 \n\
-Decide first: was this latest utterance actually directed at you? Strong signals: \
-the speaker addresses you by name (\"OpenHuman\", \"hey openhuman\"), asks a direct \
-question, or asks you to do something (note this, summarise, look up, remember, \
-remind, draft). Weak signals (do NOT respond): chit-chat between humans, \
-side conversation, your name appearing inside a longer thought aimed at someone \
-else, ambient transcription noise.\n\
+STRICT OUTPUT RULES — these are non-negotiable:\n\
+1. Output ONE sentence. Maximum 25 spoken words.\n\
+2. Plain spoken English. No markdown. No bullets. No code. No emoji.\n\
+3. No chain-of-thought. No reasoning out loud. No <think> blocks. Answer only.\n\
+4. Never repeat what the user said. Never narrate what you are about to do.\n\
+5. If the latest user line is not directly addressed to you, output the empty \
+string. Do not respond to side conversations or ambient speech.\n\
 \n\
-If it is NOT directed at you, output exactly the empty string. Stay silent. \
+Address-detection: respond when the user names you (\"OpenHuman\", \"hey \
+openhuman\"), asks a direct question of you, or gives a direct command \
+(remember, summarise, look up). Otherwise stay silent.\n\
 \n\
-If it IS directed at you:\n\
-  • Reply in 1–2 spoken sentences. Conversational, warm, direct. No filler.\n\
-  • Pronounce naturally — write the way a person speaks, not the way they type. \
-No markdown, no bullet lists, no code blocks, no emoji.\n\
-  • For dictation / note requests (\"remember…\", \"action item…\", \"follow up on…\"), \
-the note is already captured in the transcript log, so just acknowledge briefly \
-(\"Got it.\", \"Adding that.\") — don't read the note back.\n\
-  • For questions, answer directly with what you know; if you don't know, say so \
-in one sentence rather than guessing.\n\
-  • Never repeat verbatim what was said. Never describe what you're about to do — \
-just do it.\n\
+For unanswerable questions: say so in one sentence (\"I don't know that off \
+the top of my head\") instead of guessing or stalling.\n\
+For dictation / note requests: a 2-3 word ack (\"Got it.\", \"Noted.\"). Don't \
+read the note back.\n\
 ";
 
 /// Build a chat-completions request from rolling meeting history plus

From 8f7e37100203c1340ccd608a5024c946d264a0c8 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 00:00:39 +0530
Subject: [PATCH 21/64] =?UTF-8?q?fix(meet-agent):=20switch=20model=20agent?=
 =?UTF-8?q?ic-v1=20=E2=86=92=20chat-v1=20+=20strip=20leaked=20reasoning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of "bot reads its chain-of-thought aloud" (e.g. "We need to
generate a single sentence, max 25 words, plane spoken English. The user
said hello. This is a greeting addressed to Openhuman. So I should respond
with a greeting."): the bare /openai/v1/chat/completions endpoint pinned
to model="agentic-v1", which is a reasoning-style model. Reasoning models
emit their internal chain-of-thought as PLAIN TEXT (not <think> tags) in
the completion body when called outside the structured thinking_delta
channel — senamakel's chat path consumes those events separately and
shows them as a status, but a raw chat/completions call gets them
concatenated into the response. TTS then reads the whole thing aloud.

Three changes:

1. Pin model to chat-v1 (MODEL_CHAT_V1 in
   src/openhuman/config/schema/types.rs:17). chat-v1 is the
   conversational non-reasoning model — produces a direct user-facing
   answer suited to voice. Same family of aliases used by other entry
   points; no infra change required.

2. Add strip_untagged_reasoning() pass in strip_for_speech. Defensive
   heuristic against future model swaps: drops sentences whose lower-
   case trim begins with known reasoning openers ("We need to…",
   "I should…", "Let me…", "The user said…", "So I should…", etc.).
   If every sentence matches, returns the last sentence (final
   conclusion) instead of empty string.

3. Tighter MEETING_SYSTEM_PROMPT with NO-CHAIN-OF-THOUGHT rules +
   explicit good/bad examples. Even though chat-v1 doesn't reason out
   loud, the prompt now defends against accidental leaks if the router
   ever falls back to a reasoning tier.

Real second-brain (Agent::from_config_for_agent / channels-style chat
path) is still the next commit per the approved plan — this is the
defence-in-depth that fixes the spoken-out-loud reasoning today.
---
 src/openhuman/meet_agent/brain.rs | 84 +++++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 4 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index e13cf58081..565543d415 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -337,13 +337,21 @@ produce will be spoken aloud over the call. The transcript shows `user` lines \
 (humans on the call, sometimes prefixed with a name) and `assistant` lines \
 (things you previously said out loud).\n\
 \n\
-STRICT OUTPUT RULES — these are non-negotiable:\n\
+STRICT OUTPUT RULES — these are non-negotiable. The output is fed DIRECTLY \
+into TTS and spoken aloud verbatim. Any meta-text becomes audible bot \
+gibberish on a live call.\n\
 1. Output ONE sentence. Maximum 25 spoken words.\n\
 2. Plain spoken English. No markdown. No bullets. No code. No emoji.\n\
-3. No chain-of-thought. No reasoning out loud. No <think> blocks. Answer only.\n\
+3. NO chain-of-thought. NO reasoning. NO planning. NO <think> blocks. NO \
+preamble. NEVER write phrases like \"We need to…\", \"I should…\", \"Let me…\", \
+\"The user said…\", \"This is a greeting…\", \"So I should respond with…\", \
+\"My response is…\". Output ONLY the final answer that the user should hear.\n\
 4. Never repeat what the user said. Never narrate what you are about to do.\n\
 5. If the latest user line is not directly addressed to you, output the empty \
 string. Do not respond to side conversations or ambient speech.\n\
+6. Examples — good vs bad:\n\
+   User: \"hello\" → GOOD: \"Hey there.\"  BAD: \"The user said hello, so I should respond with a greeting.\"\n\
+   User: \"what's the time\" → GOOD: \"I don't have a clock right now.\"  BAD: \"We need to generate a single sentence. The user is asking the time.\"\n\
 \n\
 Address-detection: respond when the user names you (\"OpenHuman\", \"hey \
 openhuman\"), asks a direct question of you, or gives a direct command \
@@ -381,7 +389,13 @@ async fn llm_meeting(prompt: &str, history: &[ConversationTurn]) -> Result<Strin
     messages.push(json!({ "role": "user", "content": prompt }));
 
     let body = json!({
-        "model": "agentic-v1",
+        // chat-v1 = conversational non-reasoning model. agentic-v1 /
+        // reasoning-v1 leak their chain-of-thought as plain text
+        // ("We need to generate a single sentence…") into the response
+        // body when streamed without the structured thinking_delta
+        // channel — which TTS then reads aloud. chat-v1 produces a
+        // direct user-facing answer, which is what we want over voice.
+        "model": "chat-v1",
         "temperature": 0.5,
         "max_tokens": REPLY_MAX_TOKENS,
         "messages": messages,
@@ -467,7 +481,69 @@ fn strip_for_speech(text: &str) -> String {
         out.push_str(&cleaned);
     }
     let trimmed = out.trim().to_string();
-    cap_for_speech(&trimmed, MAX_TTS_CHARS)
+    let de_reasoned = strip_untagged_reasoning(&trimmed);
+    cap_for_speech(&de_reasoned, MAX_TTS_CHARS)
+}
+
+/// Strip reasoning-style preamble that reasoning models leak as plain
+/// text (no `<think>` tags) — phrases like "We need to generate…",
+/// "I should respond with…", "The user said…", "Let me think…".
+/// Heuristic: drop sentences whose lowercased trim matches a known
+/// reasoning opener; if everything is reasoning, return only the last
+/// sentence (final conclusion). If no signal, return input untouched.
+fn strip_untagged_reasoning(text: &str) -> String {
+    if text.is_empty() {
+        return text.to_string();
+    }
+    const REASONING_OPENERS: &[&str] = &[
+        "we need to",
+        "we should",
+        "i need to",
+        "i should",
+        "i will",
+        "let me ",
+        "first,",
+        "the user said",
+        "the user is",
+        "the user asked",
+        "the user wants",
+        "this is a",
+        "this seems",
+        "so i should",
+        "so the response",
+        "so my response",
+        "okay, so",
+        "alright,",
+        "given that",
+        "since the user",
+        "the assistant",
+        "the response should",
+        "my response",
+        "to respond",
+        "responding with",
+    ];
+    let sentences: Vec<&str> = text
+        .split_inclusive(|c: char| matches!(c, '.' | '!' | '?'))
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .collect();
+    if sentences.is_empty() {
+        return text.to_string();
+    }
+    let kept: Vec<&str> = sentences
+        .iter()
+        .filter(|s| {
+            let lc = s.to_lowercase();
+            !REASONING_OPENERS.iter().any(|opener| lc.starts_with(opener))
+        })
+        .copied()
+        .collect();
+    if kept.is_empty() {
+        // Everything was reasoning — return the last sentence as the
+        // probable conclusion, lower-cased openers stripped.
+        return sentences.last().map(|s| s.to_string()).unwrap_or_default();
+    }
+    kept.join(" ")
 }
 
 /// Truncate `text` to at most `max_chars` characters, preferring to

From 44d1bbef8a97a16f4f44aab449e537111d814189 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 00:08:39 +0530
Subject: [PATCH 22/64] =?UTF-8?q?feat(meet-agent):=20route=20bot=20through?=
 =?UTF-8?q?=20orchestrator=20agent=20=E2=80=94=20second=20brain=20in=20voi?=
 =?UTF-8?q?ce?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bot now answers via the SAME path as the chat UI and the webview meet
handoff: Agent::from_config_for_agent(&config, "orchestrator"). It
inherits the user's connected integrations, memory tree, MCP clients,
skills, and the project-wide tool registry. Whatever the user has wired
in their core is available to the bot day-one — no per-tool plumbing in
meet_agent.

Pipeline now:
  caption / STT  →  llm_meeting_agentic (orchestrator + tools + memory)
                 ↓  on error: llm_meeting_basic (bare chat-v1)
                 ↓  on error: stub / canned ack
                 →  strip_for_speech  →  cap_for_speech(400)  →  TTS

Why agentic-first, basic-as-fallback:
- Agentic gives real answers ("is my Friday evening free", "what did
  Alice say about the deploy", "remember to mail Bob tomorrow"). The
  orchestrator runs the same tool-iteration loop the chat UI does.
- Basic exists only so a config / registry / token issue doesn't kill
  the call. Degrades to a polite reply instead of dead air.
- Reasoning leak ("We need to generate a single sentence…") was the
  symptom that motivated this commit; the proper fix is routing through
  the channels-style path because that path consumes thinking_delta
  events separately and never lands them in the response body.

MEET_VOICE_DIRECTIVE prepended to every user utterance constrains the
orchestrator's reply to one short spoken sentence (max 25 words, no
markdown, no preamble, no chain-of-thought). The directive is wrapped
in a delimiter so the orchestrator can't confuse it with the user's
literal speech.

AGENTIC_TURN_TIMEOUT_SECS = 20 wraps run_single so a slow tool
iteration doesn't leave the meeting participant in indefinite silence.
On timeout the basic-LLM fallback fires.

strip_for_speech + cap_for_speech(400) still run on the harness output
as TTS hygiene — tool-use markers / citations / markdown leak through
even on chat-v1, and the agent reply can be longer than the
voice-budget if the orchestrator decides a fuller answer is right.
---
 src/openhuman/meet_agent/brain.rs | 170 +++++++++++++++++++++++++-----
 1 file changed, 143 insertions(+), 27 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 565543d415..deadb1c1af 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -32,11 +32,18 @@
 
 use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
 use serde_json::{json, Value};
+use std::time::Duration;
 
 use super::session::registry;
 use super::types::{SessionEvent, SessionEventKind};
 use super::wav;
 
+/// Wall-clock ceiling on one agentic turn. Tool iterations + LLM call
+/// can run 10s+; 20s is comfortable for calendar / memory lookups but
+/// short enough that we fall back to a polite "let me get back to
+/// you" instead of leaving the meet participant in silence.
+const AGENTIC_TURN_TIMEOUT_SECS: u64 = 20;
+
 /// How many of the most recent `Heard` / `Spoke` events we feed back
 /// into the LLM as rolling conversation context. 12 ≈ a few minutes of
 /// captioned dialogue — enough for the model to follow a thread without
@@ -121,22 +128,36 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
         was_bare_wake,
     );
 
-    // Real LLM call. The model gets the rolling caption history plus
-    // the user's direct address and decides whether to respond, what
-    // to say, and how concise to be. It can also return an empty
-    // string when it concludes the message wasn't actually directed
-    // at it (false-positive wake word, side conversation).
-    let reply_text = match llm_meeting(&prompt, &history).await {
+    // Route the turn through the FULL orchestrator agent first — it
+    // owns the user's connected integrations, memory tree, MCP
+    // clients and skills, so it can actually answer "is my Friday
+    // free", "what did Alice say about the deploy", etc. Falls back
+    // to the bare chat-completions path on orchestrator build /
+    // timeout / RPC error so a config-degraded environment still
+    // produces audible output instead of dead air.
+    let reply_text = match llm_meeting_agentic(&prompt, request_id).await {
         Ok(text) => text,
-        Err(err) => {
-            log::warn!("[meet-agent] caption-turn LLM failed request_id={request_id} err={err}");
-            let _ = registry().with_session(request_id, |s| {
-                s.record_event(
-                    SessionEventKind::Note,
-                    format!("LLM failure (using ack): {err}"),
-                );
-            });
-            pick_ack_phrase(&prompt).to_string()
+        Err(agentic_err) => {
+            log::warn!(
+                "[meet-agent] agentic turn failed, falling back to basic LLM request_id={request_id} err={agentic_err}"
+            );
+            match llm_meeting_basic(&prompt, &history).await {
+                Ok(text) => text,
+                Err(basic_err) => {
+                    log::warn!(
+                        "[meet-agent] basic LLM also failed request_id={request_id} err={basic_err}"
+                    );
+                    let _ = registry().with_session(request_id, |s| {
+                        s.record_event(
+                            SessionEventKind::Note,
+                            format!(
+                                "both LLM paths failed (agentic: {agentic_err}; basic: {basic_err})"
+                            ),
+                        );
+                    });
+                    pick_ack_phrase(&prompt).to_string()
+                }
+            }
         }
     };
 
@@ -251,18 +272,30 @@ pub async fn run_turn(request_id: &str) -> Result<bool, String> {
         heard.chars().count()
     );
 
-    // ─── LLM ────────────────────────────────────────────────────────
-    let reply_text = match llm_meeting(&heard, &history).await {
+    // ─── LLM (agentic-first, basic-fallback) ───────────────────────
+    let reply_text = match llm_meeting_agentic(&heard, request_id).await {
         Ok(text) => text,
-        Err(err) => {
-            log::warn!("[meet-agent] LLM failed request_id={request_id} err={err}");
-            let _ = registry().with_session(request_id, |s| {
-                s.record_event(
-                    SessionEventKind::Note,
-                    format!("LLM failure (using stub): {err}"),
-                );
-            });
-            stub_llm(&heard).await
+        Err(agentic_err) => {
+            log::warn!(
+                "[meet-agent] STT-path agentic failed, falling back request_id={request_id} err={agentic_err}"
+            );
+            match llm_meeting_basic(&heard, &history).await {
+                Ok(text) => text,
+                Err(basic_err) => {
+                    log::warn!(
+                        "[meet-agent] STT-path basic LLM also failed request_id={request_id} err={basic_err}"
+                    );
+                    let _ = registry().with_session(request_id, |s| {
+                        s.record_event(
+                            SessionEventKind::Note,
+                            format!(
+                                "both LLM paths failed (agentic: {agentic_err}; basic: {basic_err})"
+                            ),
+                        );
+                    });
+                    stub_llm(&heard).await
+                }
+            }
         }
     };
 
@@ -363,10 +396,93 @@ For dictation / note requests: a 2-3 word ack (\"Got it.\", \"Noted.\"). Don't \
 read the note back.\n\
 ";
 
+/// Voice-frontend system-prompt directive prepended to the user
+/// utterance before it reaches the orchestrator. The orchestrator
+/// already has its own persona, tool catalogue, memory loader and
+/// connected integrations; this addendum just tells it the answer is
+/// going to be spoken aloud verbatim so it should reply in one short
+/// spoken sentence with no markdown / no chain-of-thought / no
+/// preamble. Wrapped in a delimiter so the orchestrator can't confuse
+/// the directive with the user's actual utterance.
+const MEET_VOICE_DIRECTIVE: &str = "[meeting voice — your reply will be spoken aloud verbatim into a live Google Meet call. Answer in ONE short spoken sentence, max 25 words. Plain spoken English only. No markdown. No bullets. No code. No preamble. No phrases like \"I should…\", \"Let me…\", \"We need to…\". If the user is not directly addressing you, output an empty string and stay silent.]";
+
+/// Route the meeting utterance through the FULL orchestrator agent —
+/// same path the chat UI and the webview meet handoff use. The
+/// orchestrator inherits the user's connected integrations, memory
+/// tree, MCP clients, skills, and the project-wide tool registry, so
+/// "is my Friday evening free", "did anyone in #eng ping me about
+/// the deploy", "remind me to mail Alice tomorrow" all answer with
+/// real data — not a guess from the model's training prior.
+///
+/// We rebuild the Agent per turn (cheap relative to the LLM call
+/// itself, since the registry is initialised once at startup) and
+/// wrap `run_single` in a 20s timeout so a slow tool iteration
+/// doesn't leave the meeting participant in silence indefinitely.
+///
+/// Errors propagate to the caller, which falls back to the bare
+/// chat-completions path (`llm_meeting_basic`) so a config /
+/// registry / token issue degrades to a polite reply instead of
+/// dead air.
+async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, String> {
+    use crate::openhuman::agent::harness::session::Agent;
+
+    let config = crate::openhuman::config::ops::load_config_with_timeout().await?;
+
+    // Build a fresh orchestrator Agent. Synchronous constructor — no
+    // .await — but heavy (memory tree, provider, MCP). Keep an eye on
+    // turn latency; if it becomes a bottleneck, cache an Arc<Agent>
+    // keyed by request_id in a follow-up commit. For now the LLM
+    // call dominates and the build is a few hundred ms at most.
+    let mut agent = Agent::from_config_for_agent(&config, "orchestrator")
+        .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
+
+    // Prepend the voice-frontend directive so the orchestrator knows
+    // this turn is spoken-aloud and constrains its output. The
+    // delimiter prevents the directive from being mistaken for the
+    // user's literal speech.
+    let meet_prompt = format!("{MEET_VOICE_DIRECTIVE}\n\n{prompt}");
+
+    log::info!(
+        "[meet-agent] agentic turn dispatch request_id={request_id} prompt_chars={}",
+        prompt.chars().count()
+    );
+
+    let fut = agent.run_single(&meet_prompt);
+    let reply = match tokio::time::timeout(
+        Duration::from_secs(AGENTIC_TURN_TIMEOUT_SECS),
+        fut,
+    )
+    .await
+    {
+        Ok(Ok(text)) => text,
+        Ok(Err(e)) => {
+            return Err(format!("[meet-agent] orchestrator run_single failed: {e}"));
+        }
+        Err(_elapsed) => {
+            log::warn!(
+                "[meet-agent] agentic turn timed out request_id={request_id} after {}s — falling back",
+                AGENTIC_TURN_TIMEOUT_SECS
+            );
+            return Err(format!(
+                "agentic timeout after {AGENTIC_TURN_TIMEOUT_SECS}s"
+            ));
+        }
+    };
+
+    Ok(strip_for_speech(&reply))
+}
+
 /// Build a chat-completions request from rolling meeting history plus
 /// the current user prompt, post it through the backend, and return
 /// the assistant's reply (trimmed, possibly empty).
-async fn llm_meeting(prompt: &str, history: &[ConversationTurn]) -> Result<String, String> {
+///
+/// Used as a fallback when the orchestrator path
+/// (`llm_meeting_agentic`) cannot be built — missing config,
+/// registry not initialised, no session token. The orchestrator path
+/// gives memory/tool/integration access; this bare path only gets
+/// the rolling caption history. Acceptable degradation so the bot
+/// doesn't go silent in a config-degraded environment.
+async fn llm_meeting_basic(prompt: &str, history: &[ConversationTurn]) -> Result<String, String> {
     use crate::api::config::effective_backend_api_url;
     use crate::api::jwt::get_session_token;
     use crate::api::BackendOAuthClient;

From 71d50368b2fbffe7d17b9840e660c92a24258796 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 00:12:46 +0530
Subject: [PATCH 23/64] fix(meet-agent): switch to with_profile builder to pick
 up connected integrations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

from_config_for_agent builds the orchestrator with ZERO integrations
attached — saw "[orchestrator_tools] assembled 9 delegation tool(s) for
agent 'orchestrator' (0 integrations connected)" in the bot path log,
versus "10 delegation tool(s) (119 integrations connected)" for the chat
UI path. The web channel uses Agent::from_config_for_agent_with_profile
(channels/providers/web.rs:1570) which is what wires the integrations
in. Switch the meet-agent path to the same builder.

Pass MEET_VOICE_DIRECTIVE as profile_prompt_suffix instead of prepending
to the user message — same hook the web channel uses for locale-reply
directives. The orchestrator now reads the voice-frontend constraint at
system-prompt level, which is the right altitude (it's a channel-wide
contract, not a per-utterance instruction).

Set a per-meet event context and agent definition name
(orchestrator_meet_<id>) so the harness scopes its session transcript to
this request_id — otherwise two simultaneous orchestrators (chat UI +
meet bot) would share one transcript file.

Strengthened MEET_VOICE_DIRECTIVE wording — explicit "tool-use is great,
but only the final spoken reply should appear in your output" so the
orchestrator knows it CAN run tools (calendar, memory, integrations)
but should suppress narration about them.

Net effect: bot now has the user's full 119-integration tool surface
available, plus the voice-mode output contract.
---
 src/openhuman/meet_agent/brain.rs | 43 +++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index deadb1c1af..486146803e 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -404,7 +404,13 @@ read the note back.\n\
 /// spoken sentence with no markdown / no chain-of-thought / no
 /// preamble. Wrapped in a delimiter so the orchestrator can't confuse
 /// the directive with the user's actual utterance.
-const MEET_VOICE_DIRECTIVE: &str = "[meeting voice — your reply will be spoken aloud verbatim into a live Google Meet call. Answer in ONE short spoken sentence, max 25 words. Plain spoken English only. No markdown. No bullets. No code. No preamble. No phrases like \"I should…\", \"Let me…\", \"We need to…\". If the user is not directly addressing you, output an empty string and stay silent.]";
+const MEET_VOICE_DIRECTIVE: &str = "MEETING VOICE MODE: This conversation is happening live over voice in a Google Meet call. Every word of your reply will be passed VERBATIM to TTS and spoken aloud. Therefore: answer in ONE short spoken sentence, max 25 words, plain spoken English, no markdown, no bullets, no code, no preamble (do not say \"I should…\", \"Let me…\", \"We need to…\", \"The user said…\"). Tool-use is great — call tools when needed — but only the final spoken reply should appear in your output. If the user is not directly addressing you, output an empty string and stay silent.";
+
+/// First 12 chars of `request_id`, for log scoping. UUID prefixes are
+/// unique enough at one-meet-at-a-time to keep transcripts apart.
+fn short_id(id: &str) -> String {
+    id.chars().take(12).collect()
+}
 
 /// Route the meeting utterance through the FULL orchestrator agent —
 /// same path the chat UI and the webview meet handoff use. The
@@ -428,26 +434,35 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
 
     let config = crate::openhuman::config::ops::load_config_with_timeout().await?;
 
-    // Build a fresh orchestrator Agent. Synchronous constructor — no
-    // .await — but heavy (memory tree, provider, MCP). Keep an eye on
-    // turn latency; if it becomes a bottleneck, cache an Arc<Agent>
-    // keyed by request_id in a follow-up commit. For now the LLM
-    // call dominates and the build is a few hundred ms at most.
-    let mut agent = Agent::from_config_for_agent(&config, "orchestrator")
-        .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
+    // Use the with_profile builder — same canonical path the web
+    // channel (chat UI) uses at channels/providers/web.rs:1570. This
+    // is what wires the user's connected integrations + delegation
+    // tools onto the orchestrator. The plain `from_config_for_agent`
+    // builds with zero integrations attached. `profile_prompt_suffix`
+    // is the established hook for per-channel system-prompt
+    // augmentation — the web channel uses it for the locale-reply
+    // directive; we use it for the voice-frontend directive.
+    let mut agent = Agent::from_config_for_agent_with_profile(
+        &config,
+        "orchestrator",
+        None,
+        Some(MEET_VOICE_DIRECTIVE.to_string()),
+    )
+    .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
 
-    // Prepend the voice-frontend directive so the orchestrator knows
-    // this turn is spoken-aloud and constrains its output. The
-    // delimiter prevents the directive from being mistaken for the
-    // user's literal speech.
-    let meet_prompt = format!("{MEET_VOICE_DIRECTIVE}\n\n{prompt}");
+    // Per-meet event context so the harness scopes its session
+    // transcript to this request_id instead of colliding with the
+    // chat-UI thread. Without this, two simultaneous orchestrators
+    // (chat + meet) share one transcript file.
+    agent.set_event_context(format!("meet_{request_id}"), "meet_agent");
+    agent.set_agent_definition_name(format!("orchestrator_meet_{}", short_id(request_id)));
 
     log::info!(
         "[meet-agent] agentic turn dispatch request_id={request_id} prompt_chars={}",
         prompt.chars().count()
     );
 
-    let fut = agent.run_single(&meet_prompt);
+    let fut = agent.run_single(prompt);
     let reply = match tokio::time::timeout(
         Duration::from_secs(AGENTIC_TURN_TIMEOUT_SECS),
         fut,

From ef7e648980246fbd3f744fac53b8ed3ccdf102be Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 00:42:22 +0530
Subject: [PATCH 24/64] fix(meet-agent): per-turn unique definition_name to
 bypass corrupt transcript resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every turn was hitting:
  "400 An assistant message with 'tool_calls' must be followed by tool
   messages responding to each 'tool_call_id'"

Root cause: the harness auto-resumes prior transcripts when an
agent_definition_name matches a file on disk. A prior turn was killed
mid-tool-call (app restart while orchestrator was awaiting tool
output), leaving an assistant message with `tool_calls` and no
follow-up `tool` reply. Every subsequent run_single re-loaded that
file as the seeded history and the LLM API rejected it.

Switch agent_definition_name to include now_ms so each turn gets a
unique name and the harness never finds a prior transcript to load.
Trade-off: harness loses cross-turn memory persistence (each turn is
stateless from the agent's POV). Tools still work — they query real
external systems. Cross-turn memory is a follow-up that needs an
Agent cache (Arc<Mutex<Agent>> per request_id) so the harness keeps
history in-memory and never round-trips through the corrupt-able disk
transcript loader.

Corrupt transcript file purged manually for the active staging
workspace; future kills will create new ones but per-turn unique
naming means they won't poison subsequent turns.
---
 src/openhuman/meet_agent/brain.rs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 486146803e..a3bbdab1a1 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -454,8 +454,24 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
     // transcript to this request_id instead of colliding with the
     // chat-UI thread. Without this, two simultaneous orchestrators
     // (chat + meet) share one transcript file.
+    // Per-turn unique definition_name. The harness auto-resumes prior
+    // transcripts when a definition_name matches a file on disk; if
+    // an earlier turn was killed mid-tool-call, the file ends with a
+    // dangling `tool_calls` assistant message and the LLM rejects
+    // the next request with 400 "tool_calls must be followed by tool
+    // messages". Per-turn naming bypasses resume entirely. Memory
+    // across turns is a follow-up (Arc<Mutex<Agent>> cache); for
+    // now each turn is stateless from the harness's perspective but
+    // tools still query real systems.
+    let now_ms = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|d| d.as_millis())
+        .unwrap_or(0);
     agent.set_event_context(format!("meet_{request_id}"), "meet_agent");
-    agent.set_agent_definition_name(format!("orchestrator_meet_{}", short_id(request_id)));
+    agent.set_agent_definition_name(format!(
+        "orchestrator_meet_{}_{now_ms}",
+        short_id(request_id)
+    ));
 
     log::info!(
         "[meet-agent] agentic turn dispatch request_id={request_id} prompt_chars={}",

From 5d97de258c55f0db8517074d54250ba2a8ed5212 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 00:48:54 +0530
Subject: [PATCH 25/64] fix(meet-agent): rewrite voice directive to encourage
 tool use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User reported: connected calendar mid-call, then asked bot about tomorrow's
meetings; bot kept saying "I don't have calendar access" even though
[orchestrator_tools] logged 119 integrations connected on every turn.

Diagnosis: the previous MEET_VOICE_DIRECTIVE said "answer in ONE short
spoken sentence, no preamble, no 'Let me…', no 'I should…'". The model
interpreted this as a blanket "skip tool use, answer directly from prior
training" — tool calls + tool replies look like preamble to a model
trained to match instruction shape. So it short-circuited to a
hallucinated "not connected" answer instead of dispatching
delegate_to_integrations_agent.

Rewritten directive separates two contracts:

1. TOOL USE (encouraged + explicit): call tools whenever real data is
   needed. Tool calls are invisible to the user, do NOT count toward
   reply length. Explicit "do not claim something is not connected
   before attempting to call its tool". Explicit pointer to
   delegate_to_integrations_agent as the integration gateway.

2. FINAL SPOKEN REPLY (strict): same 25-word one-sentence ceiling, but
   framed as applying ONLY to the user-facing text that lands in TTS.
   The model is free to do whatever tool work it needs first.

Same dictation / silence-on-side-conversation rules retained.

Bug-1 (echo loop — Rust outbound drains faster than JS audio playback,
is_speaking() flips false mid-reply, new wake fires) is a known follow-up.
Needs speaking_until_ms deadline on the session + a JS-side audio flush
RPC. Tracked, not addressed in this commit.
---
 src/openhuman/meet_agent/brain.rs | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index a3bbdab1a1..bb93cac4ac 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -404,7 +404,29 @@ read the note back.\n\
 /// spoken sentence with no markdown / no chain-of-thought / no
 /// preamble. Wrapped in a delimiter so the orchestrator can't confuse
 /// the directive with the user's actual utterance.
-const MEET_VOICE_DIRECTIVE: &str = "MEETING VOICE MODE: This conversation is happening live over voice in a Google Meet call. Every word of your reply will be passed VERBATIM to TTS and spoken aloud. Therefore: answer in ONE short spoken sentence, max 25 words, plain spoken English, no markdown, no bullets, no code, no preamble (do not say \"I should…\", \"Let me…\", \"We need to…\", \"The user said…\"). Tool-use is great — call tools when needed — but only the final spoken reply should appear in your output. If the user is not directly addressing you, output an empty string and stay silent.";
+const MEET_VOICE_DIRECTIVE: &str = "\
+MEETING VOICE MODE — this conversation is happening live over voice in a Google Meet call. \n\
+\n\
+TOOL USE (encouraged):\n\
+- USE TOOLS whenever a tool can give a real answer. Calendar, email, slack, memory, integrations — \
+call them. Tool calls are invisible to the user and DO NOT count toward your reply word budget.\n\
+- If you need data from a tool to answer accurately, CALL THE TOOL. Do not guess from prior training. \
+Do not claim something is not connected before attempting to call its tool — the tool surface above \
+shows what is actually available right now.\n\
+- delegate_to_integrations_agent is your gateway to all connected provider integrations (calendar, \
+gmail, slack, etc.). Use it when the user asks about their schedule, mail, messages, or any other \
+integration-backed data.\n\
+\n\
+FINAL SPOKEN REPLY (strict — this is the only part the user hears):\n\
+- After tool work is done, output ONE short spoken sentence, max 25 words.\n\
+- Plain spoken English only. No markdown. No bullets. No code. No URLs.\n\
+- No meta-narration. Do not say \"Let me check…\", \"I will look…\", \"The user is asking…\", \
+\"We need to…\", \"I should…\". Just give the answer.\n\
+- If the user is not directly addressing you (chit-chat between humans, side conversation, your \
+name appearing inside a longer thought aimed at someone else), output an empty string and stay silent.\n\
+- For dictation / note requests (\"remember…\", \"action item…\", \"follow up on…\"), a 2-3 word \
+ack is enough (\"Got it.\", \"Noted.\").\n\
+- For genuinely unanswerable questions, say so in one short sentence rather than guessing.";
 
 /// First 12 chars of `request_id`, for log scoping. UUID prefixes are
 /// unique enough at one-meet-at-a-time to keep transcripts apart.

From d8de1f3b9cccc81ad99fe6eb3c5dfbec1927008c Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:03:13 +0530
Subject: [PATCH 26/64] fix(meet-agent): per-speaker caption dedup +
 turn_in_progress gate + 60s timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sub-agent log analysis of the live dev:app run found three converging
bugs that produced "bot keeps repeating the same toolless reply 20 times"
behaviour even after the orchestrator + tools were wired up correctly:

1. **Single-slot last_caption_signature was broken**. Meet's CC region
   renders two simultaneous rows (the user's caption AND the bot's TTS
   captioned back as speaker="You"). The 250 ms poll walked both rows
   every tick, so the signature flipped A → B → A → B and dedup never
   matched on byte-identical user repeats. One utterance fired the wake
   word 24 times. Replace with HashMap<speaker_lower, last_text>.

2. **turn_in_progress gate** added. While a brain turn is in flight
   (LLM + tools), refuse new wakes. The user's growing utterance was
   firing a fresh agentic turn every ~9-10s while the prior turn's
   delegate_to_integrations_agent (16-30s for calendar) was still
   running. Result: ~20 parallel calendar API hits per question, none
   of which finished inside the timeout. Gate is set at run_caption_turn
   entry (alongside cancel_outbound + take_pending_prompt) and cleared
   at the final with_session that enqueues the reply.

3. **Agentic timeout 20s → 60s**. Single delegate_to_integrations_agent
   already takes 15-30s on its own. Iteration 2 (synthesis using the
   tool result) needs another 3-5s. The 20s budget killed iteration 1
   mid-flight and forced the bot back to llm_meeting_basic, which
   produced the confidently-wrong "I don't have access to your
   calendar" lie. 60s covers tool + synthesis with headroom. The
   turn_in_progress gate prevents the longer window from starving the
   user — they cannot fire 20 parallel turns during the wait.

Known follow-up: when the agentic path times out (rare with 60s), the
basic-LLM fallback still hallucinates. Should swap that for a polite
"still checking" ack instead. Tracked, not in this commit.
---
 src/openhuman/meet_agent/brain.rs   | 25 +++++++--
 src/openhuman/meet_agent/session.rs | 78 +++++++++++++++++++----------
 2 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index bb93cac4ac..24f58c2f60 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -38,11 +38,15 @@ use super::session::registry;
 use super::types::{SessionEvent, SessionEventKind};
 use super::wav;
 
-/// Wall-clock ceiling on one agentic turn. Tool iterations + LLM call
-/// can run 10s+; 20s is comfortable for calendar / memory lookups but
-/// short enough that we fall back to a polite "let me get back to
-/// you" instead of leaving the meet participant in silence.
-const AGENTIC_TURN_TIMEOUT_SECS: u64 = 20;
+/// Wall-clock ceiling on one agentic turn. Single tool dispatch
+/// (delegate_to_integrations_agent → calendar / gmail / slack) runs
+/// 15-30s on its own; iteration 2 (synthesis using the tool result)
+/// adds 3-5s. 60s gives enough headroom for one tool + synthesis,
+/// while still short enough that a hung tool doesn't leave the
+/// meeting participant in indefinite silence. The turn_in_progress
+/// gate blocks new wakes during the wait so the user can't fire 20
+/// parallel calendar queries by talking more.
+const AGENTIC_TURN_TIMEOUT_SECS: u64 = 60;
 
 /// How many of the most recent `Heard` / `Spoke` events we feed back
 /// into the LLM as rolling conversation context. 12 ≈ a few minutes of
@@ -108,6 +112,12 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     // play the entire backlog before the new reply starts. This makes
     // the bot interruptible from the user's side.
     let (prompt, history, was_bare_wake) = match registry().with_session(request_id, |s| {
+        // Mark turn as in-flight so note_caption refuses to fire new
+        // wakes until run_caption_turn returns. Without this, the
+        // user's continuing speech (or growing-caption re-fires)
+        // spawns 20 parallel agentic turns for one question and none
+        // of them complete inside the timeout.
+        s.turn_in_progress = true;
         s.cancel_outbound();
         let prompt = s.take_pending_prompt();
         let history = recent_dialog_history(s.events(), CONTEXT_EVENT_WINDOW);
@@ -195,6 +205,11 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
             );
         }
         s.turn_count += 1;
+        // Clear the in-flight gate so the next wake can fire. Done
+        // inside the same with_session so it lands in one critical
+        // section with the reply enqueue, even if the caller drops
+        // the future after this point.
+        s.turn_in_progress = false;
     })?;
 
     log::info!(
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 7b0b178c25..a8dac2500c 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -72,14 +72,24 @@ pub struct MeetAgentSession {
     /// text — a single character growth re-queues the line). Without
     /// this gate the brain spam-fires on every caption growth.
     wake_cooldown_until_ts_ms: u64,
-    /// Last `(speaker, text)` pair forwarded to the wake-word matcher.
-    /// Drops verbatim repeats produced by Meet's caption observer
-    /// re-emitting the same line every poll tick — the page-side
-    /// `lastBySpeaker` dedupe is per-key, and Meet sometimes flips
-    /// the key (speaker name appears / disappears for the same row)
-    /// so identical text still reaches us. Server-side defence keeps
-    /// the log clean and stops spurious wake retries.
-    last_caption_signature: Option<String>,
+    /// Per-speaker last caption text. Drops verbatim repeats from the
+    /// page-side observer. A single-slot Option<String> was broken
+    /// because Meet's CC region renders two simultaneous rows (the
+    /// user's caption AND the bot's TTS being captioned as
+    /// speaker="You"). Polling walks both rows every 250ms; with a
+    /// single-slot signature the value flips A → B → A → B every
+    /// tick and dedup never matches. Per-speaker keying fixes it.
+    last_caption_by_speaker: std::collections::HashMap<String, String>,
+    /// True between brain-turn dispatch (run_caption_turn entry) and
+    /// final-reply enqueue. While set, note_caption refuses to fire a
+    /// fresh wake — without this gate, the model takes 5–15s to run
+    /// tools but Meet keeps emitting new captions every 250ms, each
+    /// firing a new turn that cancels the prior one. Tool calls never
+    /// resolve. The gate is wider than `is_speaking()` (which only
+    /// covers TTS playback) because the LLM + tool phase is the part
+    /// the user can interrupt only by deliberately re-saying the wake
+    /// word, which they shouldn't have to.
+    pub turn_in_progress: bool,
 }
 
 impl MeetAgentSession {
@@ -100,7 +110,8 @@ impl MeetAgentSession {
             wake_active: false,
             last_caption_ts_ms: 0,
             wake_cooldown_until_ts_ms: 0,
-            last_caption_signature: None,
+            last_caption_by_speaker: std::collections::HashMap::new(),
+            turn_in_progress: false,
         }
     }
 
@@ -139,27 +150,40 @@ impl MeetAgentSession {
         if speaker_lower == "you" || speaker_lower.is_empty() {
             return false;
         }
-        // Server-side dedup. Meet's CC region re-renders the same line
-        // every 250 ms poll tick for the duration of an utterance, and
-        // the page-side `lastBySpeaker` dedup keys on a speaker guess
-        // that flips for the same row. Without this, the wake-word
-        // matcher (and the RPC log) sees N copies of every caption.
-        let signature = format!("{speaker_lower}\u{1F}{}", text.trim());
-        if self.last_caption_signature.as_deref() == Some(signature.as_str()) {
-            return false;
+        // Per-speaker dedup. Meet's CC region re-renders the same line
+        // every 250 ms poll tick and emits BOTH speaker rows on each
+        // walk (the user AND the bot TTS as speaker="You"). A single-
+        // slot last-signature would flip A → B → A → B every tick and
+        // never dedup. Keyed by speaker_lower so the user's repeating
+        // utterance is dropped after the first hit regardless of bot
+        // captions interleaving.
+        let key = speaker_lower.clone();
+        let trimmed_text = text.trim().to_string();
+        if let Some(prev) = self.last_caption_by_speaker.get(&key) {
+            if prev == &trimmed_text {
+                return false;
+            }
         }
-        self.last_caption_signature = Some(signature);
-        // Gate: if the bot is currently speaking (queued TTS audio),
-        // refuse to fire a new wake. The user's voice + the bot's
-        // voice can both show up as captions, and a reply that runs
-        // 30–60 s will collide with continued user speech every time.
-        // Without this, the bot speaks-listens-speaks in a loop until
-        // someone closes the call. New captions still record to the
-        // transcript log for context but cannot trigger another turn.
-        if self.is_speaking() {
+        self.last_caption_by_speaker
+            .insert(key, trimmed_text.clone());
+        // Gate: while a brain turn is in flight (LLM + tools running)
+        // or the bot is mid-playback, refuse to fire a fresh wake.
+        // Without this gate the user's continuing speech, or Meet's
+        // own caption observer re-emitting growing captions, fires
+        // new turns every ~9-10s while the prior turn's tool dispatch
+        // (16-29s for delegate_to_integrations_agent) is still running.
+        // Result: 20 parallel calendar API calls for one question, none
+        // of which complete in time. The is_speaking() side covers TTS
+        // playback after the agent returns; turn_in_progress covers the
+        // LLM + tool-execution phase.
+        if self.turn_in_progress || self.is_speaking() {
             self.record_event(
                 SessionEventKind::Heard,
-                format!("{speaker}: {text} (suppressed: bot speaking)"),
+                format!(
+                    "{speaker}: {text} (suppressed: turn_in_progress={} speaking={})",
+                    self.turn_in_progress,
+                    self.is_speaking()
+                ),
             );
             return false;
         }

From cb4117b5dbbee02dc452b2b9a947448a0aac9f8d Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:19:21 +0530
Subject: [PATCH 27/64] =?UTF-8?q?fix(meet-agent):=2060s=E2=86=9290s=20time?=
 =?UTF-8?q?out=20+=20drop=20basic-LLM=20hallucination=20fallback?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live test of the Slack question hit the 60s ceiling — delegate_to_integrations_agent
completed in 33.97s with 8 iterations + 239 chars of real Slack data, but
iteration 2 (orchestrator synthesis) never landed. The bot fell back to
llm_meeting_basic, which has no tool access and confidently invented an
answer the user heard over voice — worse than honest silence.

1. AGENTIC_TURN_TIMEOUT_SECS: 60 → 90. Slack / Gmail fetches via Composio
   + per-message filtering + synthesis hit 60-80s in the slow path. The
   turn_in_progress gate still blocks parallel wakes during the wait.

2. Removed llm_meeting_basic fallback from both run_caption_turn and
   run_turn. On agentic failure we now speak "Let me get back to you on
   that." instead of routing to a toolless LLM that hallucinates.
   Honest deflection > false answer in a live meeting.

llm_meeting_basic is retained in the file for future integration-degraded
smoke tests; no live caller exercises it now.
---
 src/openhuman/meet_agent/brain.rs | 83 ++++++++++++-------------------
 1 file changed, 32 insertions(+), 51 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 24f58c2f60..f9e6751f91 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -38,15 +38,12 @@ use super::session::registry;
 use super::types::{SessionEvent, SessionEventKind};
 use super::wav;
 
-/// Wall-clock ceiling on one agentic turn. Single tool dispatch
-/// (delegate_to_integrations_agent → calendar / gmail / slack) runs
-/// 15-30s on its own; iteration 2 (synthesis using the tool result)
-/// adds 3-5s. 60s gives enough headroom for one tool + synthesis,
-/// while still short enough that a hung tool doesn't leave the
-/// meeting participant in indefinite silence. The turn_in_progress
-/// gate blocks new wakes during the wait so the user can't fire 20
-/// parallel calendar queries by talking more.
-const AGENTIC_TURN_TIMEOUT_SECS: u64 = 60;
+/// Wall-clock ceiling on one agentic turn. Slack / Gmail fetches via
+/// Composio + per-message filtering + iteration-2 synthesis can hit
+/// 60-80s in the slow path. 90s gives the long integrations a chance
+/// to land. The turn_in_progress gate blocks new wakes during the
+/// wait, so the user cannot spawn parallel queries by re-asking.
+const AGENTIC_TURN_TIMEOUT_SECS: u64 = 90;
 
 /// How many of the most recent `Heard` / `Spoke` events we feed back
 /// into the LLM as rolling conversation context. 12 ≈ a few minutes of
@@ -148,26 +145,24 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     let reply_text = match llm_meeting_agentic(&prompt, request_id).await {
         Ok(text) => text,
         Err(agentic_err) => {
+            // Do NOT fall back to basic LLM. The basic path has no
+            // tool access, so on a calendar/slack/gmail question it
+            // confidently hallucinates "I don't have access" — which
+            // is the WRONG answer and worse than silence. Speak a
+            // short canned "let me get back to you" ack so the user
+            // knows the question was heard but the bot couldn't
+            // resolve it in time, then drop the prompt. The user
+            // can re-ask (turn_in_progress gate clears as we exit).
             log::warn!(
-                "[meet-agent] agentic turn failed, falling back to basic LLM request_id={request_id} err={agentic_err}"
+                "[meet-agent] agentic turn failed — speaking polite ack instead of toolless fallback request_id={request_id} err={agentic_err}"
             );
-            match llm_meeting_basic(&prompt, &history).await {
-                Ok(text) => text,
-                Err(basic_err) => {
-                    log::warn!(
-                        "[meet-agent] basic LLM also failed request_id={request_id} err={basic_err}"
-                    );
-                    let _ = registry().with_session(request_id, |s| {
-                        s.record_event(
-                            SessionEventKind::Note,
-                            format!(
-                                "both LLM paths failed (agentic: {agentic_err}; basic: {basic_err})"
-                            ),
-                        );
-                    });
-                    pick_ack_phrase(&prompt).to_string()
-                }
-            }
+            let _ = registry().with_session(request_id, |s| {
+                s.record_event(
+                    SessionEventKind::Note,
+                    format!("agentic path failed; speaking ack: {agentic_err}"),
+                );
+            });
+            "Let me get back to you on that.".to_string()
         }
     };
 
@@ -244,11 +239,7 @@ fn pick_ack_phrase(prompt: &str) -> &'static str {
 /// turn actually ran, `Ok(false)` when the inbound buffer was below the
 /// floor.
 pub async fn run_turn(request_id: &str) -> Result<bool, String> {
-    let (drained, history) = registry().with_session(request_id, |s| {
-        let drained = s.drain_inbound();
-        let history = recent_dialog_history(s.events(), CONTEXT_EVENT_WINDOW);
-        (drained, history)
-    })?;
+    let drained = registry().with_session(request_id, |s| s.drain_inbound())?;
     if drained.len() < MIN_TURN_SAMPLES {
         log::debug!(
             "[meet-agent] skipping turn request_id={request_id} samples={}",
@@ -287,30 +278,20 @@ pub async fn run_turn(request_id: &str) -> Result<bool, String> {
         heard.chars().count()
     );
 
-    // ─── LLM (agentic-first, basic-fallback) ───────────────────────
+    // ─── LLM (agentic only; no basic-LLM fallback to avoid toolless hallucinations) ─
     let reply_text = match llm_meeting_agentic(&heard, request_id).await {
         Ok(text) => text,
         Err(agentic_err) => {
             log::warn!(
-                "[meet-agent] STT-path agentic failed, falling back request_id={request_id} err={agentic_err}"
+                "[meet-agent] STT-path agentic failed — speaking polite ack request_id={request_id} err={agentic_err}"
             );
-            match llm_meeting_basic(&heard, &history).await {
-                Ok(text) => text,
-                Err(basic_err) => {
-                    log::warn!(
-                        "[meet-agent] STT-path basic LLM also failed request_id={request_id} err={basic_err}"
-                    );
-                    let _ = registry().with_session(request_id, |s| {
-                        s.record_event(
-                            SessionEventKind::Note,
-                            format!(
-                                "both LLM paths failed (agentic: {agentic_err}; basic: {basic_err})"
-                            ),
-                        );
-                    });
-                    stub_llm(&heard).await
-                }
-            }
+            let _ = registry().with_session(request_id, |s| {
+                s.record_event(
+                    SessionEventKind::Note,
+                    format!("agentic path failed; speaking ack: {agentic_err}"),
+                );
+            });
+            "Let me get back to you on that.".to_string()
         }
     };
 

From 3c864a4248d79c8e191db3a4cdcfbde622214ef4 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:22:07 +0530
Subject: [PATCH 28/64] feat(meet-agent): inject current date/time into
 orchestrator system prompt

User asked "what time is it" and got "I don't know" / "Let me get back to
you" because the orchestrator's registry has no clock tool. Cheap fix:
include current local date/time/weekday/tz-offset in the
profile_prompt_suffix when building the per-turn orchestrator. The
directive tells the model to use this block directly for time/date
questions and NOT dispatch a tool. Refreshed every turn because Agent
is built per-turn, so the answer stays accurate across long meetings.

Format example: "Current local date/time: 2026-05-23 01:21:48".
---
 src/openhuman/meet_agent/brain.rs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index f9e6751f91..9633b67ef6 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -460,11 +460,27 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
     // is the established hook for per-channel system-prompt
     // augmentation — the web channel uses it for the locale-reply
     // directive; we use it for the voice-frontend directive.
+    // Compose the system-prompt suffix with the static voice directive
+    // plus a tiny "right-now context" block so the model can answer
+    // "what time is it / what's today's date" without a tool dispatch
+    // (no clock tool exists; without this the bot says "I don't know").
+    let now_local = chrono::Local::now();
+    let now_block = format!(
+        "\n\nRIGHT-NOW CONTEXT (use directly for time / date questions):\n\
+         - Current local date/time: {}\n\
+         - Current weekday: {}\n\
+         - Timezone offset: {}\n\
+         Trust this block for time questions; do NOT call a tool to look up the clock.",
+        now_local.format("%Y-%m-%d %H:%M:%S"),
+        now_local.format("%A"),
+        now_local.format("%:z"),
+    );
+    let composed_suffix = format!("{MEET_VOICE_DIRECTIVE}{now_block}");
     let mut agent = Agent::from_config_for_agent_with_profile(
         &config,
         "orchestrator",
         None,
-        Some(MEET_VOICE_DIRECTIVE.to_string()),
+        Some(composed_suffix),
     )
     .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
 

From b20e46684691e59b355ea5c0ce3864f0a9e12818 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:23:26 +0530
Subject: [PATCH 29/64] fix(meet-scanner): broaden CC auto-enable matchers +
 double attempt budget
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User reported: still has to enable Meet captions manually each call.
The bot can't hear without CC because Flow A scrapes Meet's caption DOM.

Two paths were running but both narrow:
1. captions_bridge.js polled prefix-only `aria.indexOf("turn on captions") === 0`,
   which only matches labels that start with that exact phrase (e.g. "Turn on
   captions (c)") and misses variants such as "Turn on live captions",
   "Turn on subtitles" and "Turn on closed captions".
2. meet_scanner phase-4 click_by_aria_label substring-matched but only
   knew 5 patterns; Meet rolls out new labels regionally.

Widen both:
- Patterns: turn on captions / turn on live captions / turn on subtitles /
  turn on closed captions / captions on / captions (c) / show captions /
  enable captions
- Bridge uses substring match (`indexOf >= 0`), not prefix-only
- Negative guard added to the bridge so it never accidentally clicks an
  already-ON toggle ("Turn off captions" / "captions off" / "disable captions");
  the scanner's pattern list is widened only and gets no such guard in this commit
- Bridge attempt budget 30 → 60 (~120s) for slow waiting-room admits
- Scanner dump label widened from "caption" to "caption|subtitle" so the
  failure log catches any future label variant for further widening
---
 .../src/meet_audio/captions_bridge.js         | 27 +++++++++++++++----
 app/src-tauri/src/meet_scanner/mod.rs         |  7 +++--
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/app/src-tauri/src/meet_audio/captions_bridge.js b/app/src-tauri/src/meet_audio/captions_bridge.js
index cf79bd45e3..14b52a178e 100644
--- a/app/src-tauri/src/meet_audio/captions_bridge.js
+++ b/app/src-tauri/src/meet_audio/captions_bridge.js
@@ -139,18 +139,35 @@
   }
 
   // Auto-enable captions: walk every button on the page and click any
-  // that has an aria-label starting with "Turn on captions". Caps the
-  // attempts so we don't fight a user who deliberately disables CC.
-  var ENABLE_ATTEMPT_BUDGET = 30; // ~30 * 2s = 60s
+  // that has an aria-label matching the "turn on captions" intent.
+  // Substring match (not prefix) — Meet rolls out variant labels
+  // ("Turn on captions (c)", "Turn on live captions", "Subtitles",
+  // "Captions") that the strict prefix-only matcher missed, forcing
+  // the user to click the toggle by hand. Caps attempts so a user who
+  // deliberately disables CC isn't fought over forever.
+  var ENABLE_ATTEMPT_BUDGET = 60; // ~60 * 2s = 120s — covers slow admit
   var enableAttempts = 0;
   function tryEnableCaptions() {
     if (enableAttempts >= ENABLE_ATTEMPT_BUDGET) return;
     enableAttempts++;
     var buttons = document.querySelectorAll("button[aria-label]");
+    var ON_PATTERNS = [
+      "turn on captions",
+      "turn on live captions",
+      "turn on subtitles",
+      "turn on closed captions",
+      "captions on",
+      "captions (c)",
+      "show captions",
+      "enable captions",
+    ];
+    // Negative guard: never click anything that is already-on (Meet
+    // shows "Turn off captions" when CC is active).
+    var OFF_PATTERNS = ["turn off captions", "captions off", "disable captions"];
     for (var i = 0; i < buttons.length; i++) {
       var lbl = (buttons[i].getAttribute("aria-label") || "").toLowerCase();
-      // Match "Turn on captions" but NOT "Turn off captions".
-      if (lbl.indexOf("turn on captions") === 0 || /^turn on captions/.test(lbl)) {
+      if (OFF_PATTERNS.some(function (p) { return lbl.indexOf(p) >= 0; })) continue;
+      if (ON_PATTERNS.some(function (p) { return lbl.indexOf(p) >= 0; })) {
         try {
           buttons[i].click();
           enableAttempts = ENABLE_ATTEMPT_BUDGET; // success — stop trying.
diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index f7b94bf487..771b2ca5c2 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -325,17 +325,20 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
             &session,
             &[
                 "turn on captions",
+                "turn on live captions",
+                "turn on subtitles",
+                "turn on closed captions",
                 "captions on",
                 "captions (c)",
-                "turn on live captions",
                 "show captions",
+                "enable captions",
             ],
             Duration::from_secs(8),
         )
         .await
         {
             log::info!("[meet-scanner] captions toggle ON not clicked: {err}");
-            dump_aria_labels(&mut cdp, &session, "caption").await;
+            dump_aria_labels(&mut cdp, &session, "caption|subtitle").await;
         }
     }
 

From ded0bfc5b017cb3979f5f190ed763d2762b4d248 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:39:19 +0530
Subject: [PATCH 30/64] feat(meet-agent): pre-roll "On it." ack to bridge
 30-60s tool-call silence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Slack / Gmail / Calendar turns run 30-60s end-to-end via the orchestrator
+ delegate_to_integrations_agent chain. Without an immediate acoustic
cue, the meeting participant assumes the bot is broken, re-asks (which
the turn_in_progress gate blocks now, but the call still feels dead),
or just disengages.

Synth a 2-word "On it." ack immediately after the wake-active prompt is
drained, before the agent dispatch. Enqueue with done=false so the real
reply (or polite "let me get back to you" fallback) appends cleanly on
the back of it. Audio plays as: <ack> → <real reply>.

Cost is one extra ElevenLabs roundtrip (~500ms) per turn — a flat tax,
amortized across the 30-60s wait it covers. Skipped on bare-wake greeting
turns ("hey openhuman" with no question), where the real reply is itself
already a short greeting and the ack would just feel redundant.
---
 src/openhuman/meet_agent/brain.rs | 32 +++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 9633b67ef6..522c6b89c9 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -45,6 +45,12 @@ use super::wav;
 /// wait, so the user cannot spawn parallel queries by re-asking.
 const AGENTIC_TURN_TIMEOUT_SECS: u64 = 90;
 
+/// Spoken filler played immediately after wake-word fires, before the
+/// (possibly slow) orchestrator+tool path runs. Bridges the 30-60s
+/// silence on slow integration paths. Kept short (~1s synth) so it
+/// doesn't intrude on fast greetings / time questions.
+const PREROLL_ACK_PHRASE: &str = "On it.";
+
 /// How many of the most recent `Heard` / `Spoke` events we feed back
 /// into the LLM as rolling conversation context. 12 ≈ a few minutes of
 /// captioned dialogue — enough for the model to follow a thread without
@@ -135,6 +141,32 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
         was_bare_wake,
     );
 
+    // Pre-roll filler. The orchestrator + integration tools take
+    // 30–60s on slow paths (Slack / Gmail / Calendar). Without an
+    // immediate acoustic cue, the user assumes the bot is broken and
+    // re-asks (which the turn_in_progress gate now blocks but still
+    // burns the call atmosphere). Speak a 2-word ack right away and
+    // enqueue with done=false so the real reply appends cleanly when
+    // it lands. If the agent path returns < 1s (greeting, time
+    // question), the user hears "On it. <real reply>" — slightly
+    // redundant but not annoying. On slow paths the ack covers
+    // exactly the dead air it was designed for.
+    if !was_bare_wake {
+        if let Ok(ack_pcm) = tts(PREROLL_ACK_PHRASE).await {
+            let _ = registry().with_session(request_id, |s| {
+                s.enqueue_outbound_pcm(&ack_pcm, false);
+            });
+            log::info!(
+                "[meet-agent] pre-roll ack queued request_id={request_id} samples={}",
+                ack_pcm.len()
+            );
+        } else {
+            log::debug!(
+                "[meet-agent] pre-roll ack synth failed request_id={request_id} — skipping pre-roll"
+            );
+        }
+    }
+
     // Route the turn through the FULL orchestrator agent first — it
     // owns the user's connected integrations, memory tree, MCP
     // clients and skills, so it can actually answer "is my Friday

From 89b33727c0ce328661a44502c358501da6355429 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:42:16 +0530
Subject: [PATCH 31/64] feat(meet-agent): cache orchestrator Agent per meet for
 cross-turn memory + faster turns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each meet wake was building a fresh orchestrator Agent — paying the
5-10s memory-tree + MCP + tool-registry cost every single turn, and
losing the harness's in-memory conversation history immediately
(unique definition_name disabled disk resume, which was necessary to
avoid the corrupt-transcript tool_calls API rejection).

Add AGENT_CACHE: OnceLock<TokioMutex<HashMap<request_id, Arc<TokioMutex<Agent>>>>>.

Lifecycle:
- First wake: cold-build via from_config_for_agent_with_profile (same
  canonical path as the chat UI), cache the resulting Agent under the
  request_id, lock it for the run_single call.
- Every later wake of the same meet: lock the cached Agent and reuse.
  Agent.history accumulates across turns, so the orchestrator can
  recall earlier dialogue ("did I tell you about Friday?") naturally.
- stop_session: drop the cached Agent so memory tree + tool handles
  are released. forget_session_agent() exposed for that path.

Per-turn unique definition_name retained, but only for the on-disk
transcript filename, so that a kill mid-tool-call can't poison the resume
path after a restart. The Agent's in-memory state survives independently
of the file.

Time-context block moved from system-prompt-suffix (baked at build) to
a user-message prefix (refreshed each turn) — otherwise the clock
sticks at the meet's start-time forever.

tokio::sync::Mutex (not std::sync::Mutex) because the inner lock is
held across run_single().await — std Mutex breaks Send + can leak on
cancel. Per-meet locking, so two distinct meets can run their agents
in parallel.
---
 src/openhuman/meet_agent/brain.rs | 157 +++++++++++++++++++++---------
 src/openhuman/meet_agent/rpc.rs   |   5 +
 2 files changed, 114 insertions(+), 48 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 522c6b89c9..12b3b843e5 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -32,12 +32,45 @@
 
 use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
 use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::sync::{Arc, OnceLock};
 use std::time::Duration;
+use tokio::sync::Mutex as TokioMutex;
 
 use super::session::registry;
 use super::types::{SessionEvent, SessionEventKind};
 use super::wav;
 
+use crate::openhuman::agent::harness::session::Agent;
+
+/// Process-wide cache of orchestrator Agents keyed by `request_id`.
+/// Each meet session reuses the same Agent across all its turns so
+/// the harness's in-memory `Agent.history` accumulates and the
+/// orchestrator can recall prior dialogue ("did I tell you to
+/// remember Friday?", "what did Alice say earlier?"). Without the
+/// cache each turn builds a fresh Agent, loses the prior turn's
+/// memory, and pays the 5-10s build cost every time.
+///
+/// Locked with `tokio::sync::Mutex` because we hold the inner
+/// `Arc<TokioMutex<Agent>>` lock across `run_single().await` —
+/// std::sync::Mutex cannot be held across await without breaking
+/// Send + leaking the lock on cancel.
+static AGENT_CACHE: OnceLock<TokioMutex<HashMap<String, Arc<TokioMutex<Agent>>>>> = OnceLock::new();
+
+fn agent_cache() -> &'static TokioMutex<HashMap<String, Arc<TokioMutex<Agent>>>> {
+    AGENT_CACHE.get_or_init(|| TokioMutex::new(HashMap::new()))
+}
+
+/// Drop the cached orchestrator for a meet session. Called from
+/// `handle_stop_session` so a finished call doesn't leak the Agent
+/// (each one carries memory tree + tool registry handles).
+pub async fn forget_session_agent(request_id: &str) {
+    let mut guard = agent_cache().lock().await;
+    if guard.remove(request_id).is_some() {
+        log::info!("[meet-agent] dropped cached orchestrator for request_id={request_id}");
+    }
+}
+
 /// Wall-clock ceiling on one agentic turn. Slack / Gmail fetches via
 /// Composio + per-message filtering + iteration-2 synthesis can hit
 /// 60-80s in the slow path. 90s gives the long integrations a chance
@@ -480,71 +513,56 @@ fn short_id(id: &str) -> String {
 /// registry / token issue degrades to a polite reply instead of
 /// dead air.
 async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, String> {
-    use crate::openhuman::agent::harness::session::Agent;
-
-    let config = crate::openhuman::config::ops::load_config_with_timeout().await?;
-
-    // Use the with_profile builder — same canonical path the web
-    // channel (chat UI) uses at channels/providers/web.rs:1570. This
-    // is what wires the user's connected integrations + delegation
-    // tools onto the orchestrator. The plain `from_config_for_agent`
-    // builds with zero integrations attached. `profile_prompt_suffix`
-    // is the established hook for per-channel system-prompt
-    // augmentation — the web channel uses it for the locale-reply
-    // directive; we use it for the voice-frontend directive.
-    // Compose the system-prompt suffix with the static voice directive
-    // plus a tiny "right-now context" block so the model can answer
-    // "what time is it / what's today's date" without a tool dispatch
-    // (no clock tool exists; without this the bot says "I don't know").
+    // Get-or-build the per-meet cached Agent. First wake of a meet
+    // builds the orchestrator once (memory tree + MCP + tools — 5-10s
+    // cold); subsequent wakes reuse the same instance, so its
+    // in-memory history accumulates and the orchestrator can recall
+    // earlier dialogue without disk-resume corruption tripping the
+    // tool_calls / tool_message API constraint.
+    let agent_lock = get_or_build_agent_for_meet(request_id).await?;
+
+    // Lock for the duration of the turn. The lock is per-meet, so
+    // two distinct meet sessions can run agents in parallel; within
+    // one meet, turn_in_progress already prevents reentrancy. Held
+    // across run_single().await — that's why we use tokio::sync::Mutex.
+    let mut agent = agent_lock.lock().await;
+
+    // Per-turn refresh of the time-context block. The voice directive
+    // is baked into the system prompt at build time; the clock has
+    // to update each turn or the bot will tell the user it's still
+    // 2am ten minutes later. Prepend the time block to the user
+    // utterance instead of touching the system prompt suffix (which
+    // we can't change without rebuilding the Agent).
     let now_local = chrono::Local::now();
-    let now_block = format!(
-        "\n\nRIGHT-NOW CONTEXT (use directly for time / date questions):\n\
-         - Current local date/time: {}\n\
-         - Current weekday: {}\n\
-         - Timezone offset: {}\n\
-         Trust this block for time questions; do NOT call a tool to look up the clock.",
+    let time_block = format!(
+        "[RIGHT-NOW CONTEXT — current local time: {} ({}), tz {}. \
+         Use this directly for any time/date question; do not call a tool.]",
         now_local.format("%Y-%m-%d %H:%M:%S"),
         now_local.format("%A"),
         now_local.format("%:z"),
     );
-    let composed_suffix = format!("{MEET_VOICE_DIRECTIVE}{now_block}");
-    let mut agent = Agent::from_config_for_agent_with_profile(
-        &config,
-        "orchestrator",
-        None,
-        Some(composed_suffix),
-    )
-    .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
+    let user_message = format!("{time_block}\n\n{prompt}");
 
-    // Per-meet event context so the harness scopes its session
-    // transcript to this request_id instead of colliding with the
-    // chat-UI thread. Without this, two simultaneous orchestrators
-    // (chat + meet) share one transcript file.
-    // Per-turn unique definition_name. The harness auto-resumes prior
-    // transcripts when a definition_name matches a file on disk; if
-    // an earlier turn was killed mid-tool-call, the file ends with a
-    // dangling `tool_calls` assistant message and the LLM rejects
-    // the next request with 400 "tool_calls must be followed by tool
-    // messages". Per-turn naming bypasses resume entirely. Memory
-    // across turns is a follow-up (Arc<Mutex<Agent>> cache); for
-    // now each turn is stateless from the harness's perspective but
-    // tools still query real systems.
+    // Per-turn unique definition_name for the transcript file. The
+    // Agent's in-memory history persists across turns (cache); only
+    // the on-disk transcript filename rolls per turn so a kill
+    // mid-tool-call doesn't poison the next process's resume path.
     let now_ms = std::time::SystemTime::now()
         .duration_since(std::time::UNIX_EPOCH)
         .map(|d| d.as_millis())
         .unwrap_or(0);
-    agent.set_event_context(format!("meet_{request_id}"), "meet_agent");
     agent.set_agent_definition_name(format!(
         "orchestrator_meet_{}_{now_ms}",
         short_id(request_id)
     ));
 
     log::info!(
-        "[meet-agent] agentic turn dispatch request_id={request_id} prompt_chars={}",
-        prompt.chars().count()
+        "[meet-agent] agentic turn dispatch request_id={request_id} prompt_chars={} cached_history_msgs={}",
+        prompt.chars().count(),
+        agent.history().len(),
     );
 
-    let fut = agent.run_single(prompt);
+    let fut = agent.run_single(&user_message);
     let reply = match tokio::time::timeout(
         Duration::from_secs(AGENTIC_TURN_TIMEOUT_SECS),
         fut,
@@ -557,7 +575,7 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
         }
         Err(_elapsed) => {
             log::warn!(
-                "[meet-agent] agentic turn timed out request_id={request_id} after {}s — falling back",
+                "[meet-agent] agentic turn timed out request_id={request_id} after {}s — speaking polite ack",
                 AGENTIC_TURN_TIMEOUT_SECS
             );
             return Err(format!(
@@ -569,6 +587,49 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
     Ok(strip_for_speech(&reply))
 }
 
+/// Get the cached orchestrator for this meet, or build it on first
+/// call. Returns an `Arc<TokioMutex<Agent>>` so the caller can lock
+/// across the run_single().await.
+async fn get_or_build_agent_for_meet(
+    request_id: &str,
+) -> Result<Arc<TokioMutex<Agent>>, String> {
+    {
+        let cache = agent_cache().lock().await;
+        if let Some(existing) = cache.get(request_id) {
+            return Ok(existing.clone());
+        }
+    }
+
+    // Cold build. Use the with_profile builder — same canonical path
+    // the web channel (chat UI) uses at channels/providers/web.rs:1570,
+    // which is what wires the user's connected integrations + delegation
+    // tools. profile_prompt_suffix carries the meet voice directive.
+    let config = crate::openhuman::config::ops::load_config_with_timeout().await?;
+    let mut agent = Agent::from_config_for_agent_with_profile(
+        &config,
+        "orchestrator",
+        None,
+        Some(MEET_VOICE_DIRECTIVE.to_string()),
+    )
+    .map_err(|e| format!("[meet-agent] orchestrator build failed: {e}"))?;
+
+    // Per-meet event context so the harness scopes its observability
+    // events to this request_id instead of colliding with the chat UI.
+    agent.set_event_context(format!("meet_{request_id}"), "meet_agent");
+    agent.set_agent_definition_name(format!("orchestrator_meet_{}", short_id(request_id)));
+
+    log::info!(
+        "[meet-agent] orchestrator built + cached for request_id={request_id}"
+    );
+
+    let arc = Arc::new(TokioMutex::new(agent));
+    agent_cache()
+        .lock()
+        .await
+        .insert(request_id.to_string(), arc.clone());
+    Ok(arc)
+}
+
 /// Build a chat-completions request from rolling meeting history plus
 /// the current user prompt, post it through the backend, and return
 /// the assistant's reply (trimmed, possibly empty).
diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index dc3784f3f2..766ee25acb 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -147,6 +147,11 @@ pub async fn handle_stop_session(params: Map<String, Value>) -> Result<Value, St
         .map_err(|e| format!("{LOG_PREFIX} invalid stop_session params: {e}"))?;
 
     let session = registry().stop(&req.request_id)?;
+    // Drop the cached orchestrator Agent for this meet so we don't
+    // leak its memory tree + tool registry handles after the call
+    // ends. The next start_session with the same request_id (rare
+    // but possible) will cold-build a fresh Agent.
+    super::brain::forget_session_agent(&req.request_id).await;
     log::info!(
         "{LOG_PREFIX} stop_session request_id={} listened={:.2}s spoken={:.2}s turns={}",
         session.request_id,

From 8560d2200119d4f1cd173a358e4a4bff5e213295 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:45:52 +0530
Subject: [PATCH 32/64] =?UTF-8?q?feat(meet-agent):=20barge-in=20=E2=80=94?=
 =?UTF-8?q?=20JS=20audio=20flush=20+=20relaxed=20wake=20gate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The user couldn't interrupt a wrong-direction bot reply: turn_in_progress
gate blocked new wakes while outbound was queued, AND even when the gate
cleared, the JS audio bridge kept playing its in-flight AudioBufferSources
because cancel_outbound only cleared the Rust queue.

Plumb a "flush" signal end-to-end:

1. audio_bridge.js — track every started AudioBufferSource in
   `activeSources`. New `window.__openhumanFlushAudio()` stops them
   all and resets `nextStartTime` to current playback cursor.
2. inject::flush_audio_bridge — Runtime.evaluate the new JS hook;
   returns the count of sources stopped.
3. session.flush_pending — set by cancel_outbound, taken (and cleared)
   by take_flush_pending, which the poll_speech handler calls right
   after poll_outbound. Distinct from utterance_done so a normal end of
   reply doesn't trigger a mid-word cut.
4. poll_speech RPC adds `flush_pending: bool` to its response.
5. speak_pump on every tick: if flush_pending=true, call
   flush_audio_bridge BEFORE feeding the next chunk.
6. note_caption gate relaxed: only blocks during turn_in_progress
   (LLM + tools running). is_speaking() removed from the gate —
   user can now interrupt the speaking phase by re-asking, which
   triggers cancel_outbound → flush_pending → JS bridge flush →
   new turn dispatches cleanly. LLM phase still blocks, since
   spawning a parallel agentic turn would burn tool calls.

End-to-end: user says "hey openhuman, what's on Friday" → bot starts
20s tool dispatch → user realises wrong question, says "hey openhuman,
forget Friday, tell me about Slack" mid-reply. With this change in
place, the brain's cancel_outbound flushes the in-flight reply audio AND
the new wake passes through the relaxed gate, so the new turn dispatches
cleanly.
---
 app/src-tauri/src/meet_audio/audio_bridge.js | 28 ++++++++++++
 app/src-tauri/src/meet_audio/inject.rs       | 28 ++++++++++++
 app/src-tauri/src/meet_audio/speak_pump.rs   | 21 +++++++++
 src/openhuman/meet_agent/rpc.rs              |  9 +++-
 src/openhuman/meet_agent/session.rs          | 47 +++++++++++++-------
 5 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/app/src-tauri/src/meet_audio/audio_bridge.js b/app/src-tauri/src/meet_audio/audio_bridge.js
index 9d9cfdedd9..1714b340e3 100644
--- a/app/src-tauri/src/meet_audio/audio_bridge.js
+++ b/app/src-tauri/src/meet_audio/audio_bridge.js
@@ -97,6 +97,29 @@
     return out;
   }
 
+  // Track every scheduled AudioBufferSource so __openhumanFlushAudio
+  // can stop them on barge-in (user re-asks during a long bot reply).
+  // Without this list, only the queue tail past `nextStartTime` would
+  // be cancellable; anything already start()-ed plays to completion.
+  var activeSources = [];
+
+  // Stop in-flight playback and reset the schedule cursor. Called by
+  // the Rust shell when the brain cancels outbound (new wake fires
+  // mid-reply). Returns the number of sources that were stopped, so
+  // the shell can log how much speech got cut.
+  window.__openhumanFlushAudio = function () {
+    var stopped = 0;
+    while (activeSources.length) {
+      var s = activeSources.pop();
+      try { s.stop(); stopped++; } catch (_) {}
+      try { s.disconnect(); } catch (_) {}
+    }
+    if (ctx) {
+      nextStartTime = ctx.currentTime;
+    }
+    return stopped;
+  };
+
   // Public push API. Returns the duration in seconds the chunk added
   // to the queue, mostly for diagnostics; the shell ignores it.
   window.__openhumanFeedPcm = function (b64) {
@@ -118,6 +141,11 @@
         nextStartTime = ctx.currentTime;
       }
       src.start(nextStartTime);
+      activeSources.push(src);
+      src.onended = function () {
+        var idx = activeSources.indexOf(src);
+        if (idx !== -1) activeSources.splice(idx, 1);
+      };
       nextStartTime += buffer.duration;
       // High-frequency log gated by a counter so we don't drown the
       // console at 10 Hz; emit ~1 in 50 frames (~5 s cadence at the
diff --git a/app/src-tauri/src/meet_audio/inject.rs b/app/src-tauri/src/meet_audio/inject.rs
index 312f5297b1..1891c4dcd6 100644
--- a/app/src-tauri/src/meet_audio/inject.rs
+++ b/app/src-tauri/src/meet_audio/inject.rs
@@ -274,3 +274,31 @@ pub async fn feed_pcm_chunk(cdp: &mut CdpConn, session: &str, pcm_b64: &str) ->
     }
     Ok(())
 }
+
+/// Stop any in-flight audio playback inside the page bridge and reset
+/// its schedule cursor. Called when the brain cancels outbound (user
+/// re-asks during a long reply) so the previous reply's tail doesn't
+/// keep playing while the new turn is dispatched. Returns the count
+/// of sources that were stopped, useful for diagnostic logging.
+pub async fn flush_audio_bridge(cdp: &mut CdpConn, session: &str) -> Result<i64, String> {
+    let res = cdp
+        .call(
+            "Runtime.evaluate",
+            json!({
+                "expression": "(typeof window.__openhumanFlushAudio === 'function') ? window.__openhumanFlushAudio() : -1",
+                "returnByValue": true,
+            }),
+            Some(session),
+        )
+        .await
+        .map_err(|e| format!("Runtime.evaluate flush: {e}"))?;
+    if let Some(exception) = res.get("exceptionDetails") {
+        return Err(format!("page exception: {exception}"));
+    }
+    let stopped = res
+        .get("result")
+        .and_then(|r| r.get("value"))
+        .and_then(|v| v.as_i64())
+        .unwrap_or(0);
+    Ok(stopped)
+}
diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs
index 1fab93f869..7d5c5b55e1 100644
--- a/app/src-tauri/src/meet_audio/speak_pump.rs
+++ b/app/src-tauri/src/meet_audio/speak_pump.rs
@@ -119,6 +119,27 @@ async fn poll_and_feed(
         .get("utterance_done")
         .and_then(|x| x.as_bool())
         .unwrap_or(false);
+    let flush_pending = v
+        .get("flush_pending")
+        .and_then(|x| x.as_bool())
+        .unwrap_or(false);
+
+    // Barge-in: brain set flush_pending when it cancelled the previous
+    // outbound. Stop in-flight playback inside the JS bridge BEFORE we
+    // feed the next chunk so the user hears the new reply instead of
+    // the tail of the old one. Best-effort — if the page is gone the
+    // flush errors and we drop through to the feed, which will fail
+    // the same way and trigger the same recovery path.
+    if flush_pending {
+        match inject::flush_audio_bridge(cdp, session_id).await {
+            Ok(stopped) => log::info!(
+                "[meet-audio] barge-in flush request_id={request_id} sources_stopped={stopped}"
+            ),
+            Err(e) => log::warn!(
+                "[meet-audio] barge-in flush failed request_id={request_id} err={e}"
+            ),
+        }
+    }
 
     if !pcm_b64.is_empty() {
         // Validate decode locally before pushing — saves a round-trip
diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index 766ee25acb..5d3d896518 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -128,14 +128,19 @@ pub async fn handle_poll_speech(params: Map<String, Value>) -> Result<Value, Str
     let req: PollSpeechRequest = serde_json::from_value(Value::Object(params))
         .map_err(|e| format!("{LOG_PREFIX} invalid poll_speech params: {e}"))?;
 
-    let (pcm_base64, utterance_done) =
-        registry().with_session(&req.request_id, |s| s.poll_outbound())?;
+    let (pcm_base64, utterance_done, flush_pending) =
+        registry().with_session(&req.request_id, |s| {
+            let (b64, done) = s.poll_outbound();
+            let flush = s.take_flush_pending();
+            (b64, done, flush)
+        })?;
 
     RpcOutcome::new(
         json!({
             "ok": true,
             "pcm_base64": pcm_base64,
             "utterance_done": utterance_done,
+            "flush_pending": flush_pending,
         }),
         vec![],
     )
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index a8dac2500c..39faeabefe 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -90,6 +90,13 @@ pub struct MeetAgentSession {
     /// the user can interrupt only by deliberately re-saying the wake
     /// word, which they shouldn't have to.
     pub turn_in_progress: bool,
+    /// Set true by `cancel_outbound`; cleared by the next
+    /// `poll_outbound`. Tells the shell side that the previous reply
+    /// was interrupted and the JS audio bridge should flush any
+    /// in-flight playback BEFORE feeding the next chunk. Without this
+    /// distinct signal, a normal end-of-utterance would also flush,
+    /// cutting the final 100ms of the last legitimate reply.
+    flush_pending: bool,
 }
 
 impl MeetAgentSession {
@@ -112,6 +119,7 @@ impl MeetAgentSession {
             wake_cooldown_until_ts_ms: 0,
             last_caption_by_speaker: std::collections::HashMap::new(),
             turn_in_progress: false,
+            flush_pending: false,
         }
     }
 
@@ -166,24 +174,18 @@ impl MeetAgentSession {
         }
         self.last_caption_by_speaker
             .insert(key, trimmed_text.clone());
-        // Gate: while a brain turn is in flight (LLM + tools running)
-        // or the bot is mid-playback, refuse to fire a fresh wake.
-        // Without this gate the user's continuing speech, or Meet's
-        // own caption observer re-emitting growing captions, fires
-        // new turns every ~9-10s while the prior turn's tool dispatch
-        // (16-29s for delegate_to_integrations_agent) is still running.
-        // Result: 20 parallel calendar API calls for one question, none
-        // of which complete in time. The is_speaking() side covers TTS
-        // playback after the agent returns; turn_in_progress covers the
-        // LLM + tool-execution phase.
-        if self.turn_in_progress || self.is_speaking() {
+        // Gate: while a brain turn is in flight (LLM + tools running),
+        // refuse to fire a fresh wake. The prior gate also blocked on
+        // is_speaking() (outbound queued), but that prevented barge-in
+        // — the user couldn't interrupt a wrong-direction reply by
+        // re-asking. is_speaking() removed; barge-in now works via
+        // cancel_outbound → flush_pending → JS bridge flush. The LLM
+        // phase still blocks because spawning a parallel agentic turn
+        // would waste tool calls on the same question.
+        if self.turn_in_progress {
             self.record_event(
                 SessionEventKind::Heard,
-                format!(
-                    "{speaker}: {text} (suppressed: turn_in_progress={} speaking={})",
-                    self.turn_in_progress,
-                    self.is_speaking()
-                ),
+                format!("{speaker}: {text} (suppressed: turn_in_progress)"),
             );
             return false;
         }
@@ -344,12 +346,25 @@ impl MeetAgentSession {
     /// next poll and the page bridge can reset its audio-bridge state
     /// cleanly.
     pub fn cancel_outbound(&mut self) {
+        // Mark flush BEFORE the early-empty check — even if the Rust
+        // queue happens to be empty right now, the JS bridge may have
+        // already pulled the prior reply's tail and be playing it
+        // standalone. The flush signal must still fire.
+        self.flush_pending = true;
         if !self.outbound.is_empty() {
             self.outbound.clear();
         }
         self.outbound_done = true;
     }
 
+    /// Take + clear the pending-flush flag. Called by the shell on
+    /// every poll_outbound; when true, the shell will issue a JS
+    /// bridge flush BEFORE feeding the next PCM chunk so the prior
+    /// reply's in-flight playback stops cleanly.
+    pub fn take_flush_pending(&mut self) -> bool {
+        std::mem::take(&mut self.flush_pending)
+    }
+
     /// Drain everything currently queued for the shell. Returns
     /// `(pcm_base64, utterance_done)`.
     pub fn poll_outbound(&mut self) -> (String, bool) {

From 12bc5e78a4c4fd36fd05bf07aa68976e9381c3bd Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 01:57:38 +0530
Subject: [PATCH 33/64] fix(meet-agent): force English-only TTS + log reply
 preview
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bot was speaking "On it." (pre-roll English) followed by a phrase in a
different language. ElevenLabs TTS is configured English-only, so a
non-English orchestrator reply produces garbled audio. The reasoning
model can pick up a non-English locale from prior memory / transcript /
profile and switch languages mid-turn.

Add explicit "LANGUAGE: Respond in ENGLISH ONLY" rule at the top of
MEET_VOICE_DIRECTIVE. Extend caption-turn-done log with reply_preview
(first 120 chars) so the next non-English leak — or any other content
oddity — surfaces directly in dev:app stdout without grepping
transcripts.
---
 src/openhuman/meet_agent/brain.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 12b3b843e5..c78be519dd 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -273,9 +273,10 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     })?;
 
     log::info!(
-        "[meet-agent] caption turn done request_id={request_id} reply_chars={} synth_samples={}",
+        "[meet-agent] caption turn done request_id={request_id} reply_chars={} synth_samples={} reply_preview={:?}",
         reply_text.chars().count(),
-        synthesized.len()
+        synthesized.len(),
+        reply_text.chars().take(120).collect::<String>(),
     );
     Ok(true)
 }
@@ -466,7 +467,9 @@ read the note back.\n\
 /// preamble. Wrapped in a delimiter so the orchestrator can't confuse
 /// the directive with the user's actual utterance.
 const MEET_VOICE_DIRECTIVE: &str = "\
-MEETING VOICE MODE — this conversation is happening live over voice in a Google Meet call. \n\
+MEETING VOICE MODE — this conversation is happening live over voice in a Google Meet call.\n\
+\n\
+LANGUAGE: Respond in ENGLISH ONLY. Do not switch languages even if a user's name, prior memory, or transcript hint suggests another locale. The TTS engine is English-only; non-English output produces garbled audio.\n\
 \n\
 TOOL USE (encouraged):\n\
 - USE TOOLS whenever a tool can give a real answer. Calendar, email, slack, memory, integrations — \

From 273e39040acc6a9064efacca52128ed9d70cca3b Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 02:15:39 +0530
Subject: [PATCH 34/64] =?UTF-8?q?fix(meet-agent):=20triple-up=20dedup=20?=
 =?UTF-8?q?=E2=80=94=2030s=20cooldown=20+=2015s=20min-turn-gap=20+=20norma?=
 =?UTF-8?q?lised=20dedup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/openhuman/meet_agent/brain.rs   |  4 ++
 src/openhuman/meet_agent/session.rs | 79 ++++++++++++++++++++++++++---
 2 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index c78be519dd..30e6a13fd6 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -270,6 +270,10 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
         // section with the reply enqueue, even if the caller drops
         // the future after this point.
         s.turn_in_progress = false;
+        // Stamp turn-done time so note_caption's min-turn-gap
+        // backstop can suppress wakes that fire within 15s of this
+        // turn's completion (caption residue / repeat questions).
+        s.mark_turn_done();
     })?;
 
     log::info!(
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 39faeabefe..f1170ec30d 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -97,6 +97,14 @@ pub struct MeetAgentSession {
     /// distinct signal, a normal end-of-utterance would also flush,
     /// cutting the final 100ms of the last legitimate reply.
     flush_pending: bool,
+    /// Wall-clock ms at the moment the previous brain turn finished.
+    /// Used by note_caption to enforce a minimum gap between turns —
+    /// even if the page-side caption cooldown expires (or Meet emits
+    /// a fresh utterance just past it), the bot still refuses to
+    /// fire a new wake within MIN_TURN_GAP_MS. Backstop against the
+    /// "user asks once, bot answers 5 times" pattern when caption
+    /// residue keeps re-matching the wake phrase.
+    last_turn_done_at_ms: u64,
 }
 
 impl MeetAgentSession {
@@ -120,9 +128,21 @@ impl MeetAgentSession {
             last_caption_by_speaker: std::collections::HashMap::new(),
             turn_in_progress: false,
             flush_pending: false,
+            last_turn_done_at_ms: 0,
         }
     }
 
+    /// Stamp the current wall-clock time as "turn just finished". The
+    /// brain calls this from the final with_session block of
+    /// run_caption_turn (alongside clearing turn_in_progress) so the
+    /// min-turn-gap backstop in note_caption can see it.
+    pub fn mark_turn_done(&mut self) {
+        self.last_turn_done_at_ms = std::time::SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64)
+            .unwrap_or(0);
+    }
+
     /// True when the brain has TTS audio queued for playback. The
     /// note_caption gate uses this to refuse wake matches while the
     /// bot is actively speaking — otherwise Meet captions the bot's
@@ -165,15 +185,21 @@ impl MeetAgentSession {
         // never dedup. Keyed by speaker_lower so the user's repeating
         // utterance is dropped after the first hit regardless of bot
         // captions interleaving.
+        //
+        // Normalised match (lowercase + drop non-alphanumeric + collapse
+        // whitespace) so Meet's punctuation/case jitter between emits
+        // ("Hey, openhuman" → "hey openhuman.") doesn't slip through
+        // the dedup. Without normalisation each capitalisation flip
+        // fires another wake.
         let key = speaker_lower.clone();
-        let trimmed_text = text.trim().to_string();
+        let normalised = normalise_for_dedup(text);
         if let Some(prev) = self.last_caption_by_speaker.get(&key) {
-            if prev == &trimmed_text {
+            if prev == &normalised {
                 return false;
             }
         }
         self.last_caption_by_speaker
-            .insert(key, trimmed_text.clone());
+            .insert(key, normalised);
         // Gate: while a brain turn is in flight (LLM + tools running),
         // refuse to fire a fresh wake. The prior gate also blocked on
         // is_speaking() (outbound queued), but that prevented barge-in
@@ -202,6 +228,29 @@ impl MeetAgentSession {
             self.pending_prompt.push_str(text.trim());
             return false;
         }
+        // Min-turn-gap backstop. Even if the page-side caption
+        // cooldown window expires, refuse to start a new turn
+        // within MIN_TURN_GAP_MS of the prior turn's completion.
+        // Without this the bot replied to the same user question 4-5
+        // times when Meet's caption observer kept re-emitting the line
+        // with subtle text variation that slipped past the dedup.
+        const MIN_TURN_GAP_MS: u64 = 15_000;
+        let now_wall_ms = std::time::SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64)
+            .unwrap_or(0);
+        if self.last_turn_done_at_ms != 0
+            && now_wall_ms.saturating_sub(self.last_turn_done_at_ms) < MIN_TURN_GAP_MS
+        {
+            self.record_event(
+                SessionEventKind::Heard,
+                format!(
+                    "{speaker}: {text} (suppressed: <{}ms since last turn)",
+                    MIN_TURN_GAP_MS
+                ),
+            );
+            return false;
+        }
         // In cooldown after a recent turn — Meet keeps the same
         // utterance visible for several seconds, so without this
         // gate the brain re-fires on every caption growth. Continue
@@ -289,10 +338,16 @@ impl MeetAgentSession {
             return None;
         }
         self.wake_active = false;
-        // 8s grace beyond the most recent caption's page timestamp.
-        // `last_caption_ts_ms` is whatever Date.now() was page-side
-        // when the line landed — same clock as future caption pushes.
-        const COOLDOWN_MS: u64 = 8_000;
+        // 30s grace beyond the most recent caption's page timestamp.
+        // The previous 8s window was too short: Meet's caption region
+        // re-renders the just-finished utterance for 5-8s, the bot's
+        // reply takes another 5-15s to synthesize + speak, then any
+        // natural user follow-up ("wait, did you say X?") within the
+        // same 30s window is treated as continuation rather than a
+        // fresh wake. Under-prompted users especially repeat the wake
+        // phrase 2-3 times before realising the bot already heard them
+        // — without this, each repeat fires another tool call.
+        const COOLDOWN_MS: u64 = 30_000;
         self.wake_cooldown_until_ts_ms = self.last_caption_ts_ms.saturating_add(COOLDOWN_MS);
         let prompt = std::mem::take(&mut self.pending_prompt);
         let trimmed = prompt.trim().to_string();
@@ -410,6 +465,16 @@ impl MeetAgentSession {
     }
 }
 
+/// Lowercase + drop non-alphanumeric + collapse whitespace. Used by
+/// the per-speaker dedup so Meet's punctuation/case jitter between
+/// caption emits doesn't bypass the dedup. Same shape as
+/// `normalize_for_wake` but exposed under a distinct name to keep
+/// the two intents (wake-word match vs. dedup key) separate at the
+/// call site.
+fn normalise_for_dedup(text: &str) -> String {
+    normalize_for_wake(text)
+}
+
 /// Lowercase + drop punctuation + collapse whitespace, so the wake
 /// phrase matches regardless of how Meet's STT punctuated or cased
 /// it ("Hey, OpenHuman", "hey open-human", etc).

From cab2e754e8822256dc3ae2f6d73ddf63c7433972 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 02:23:45 +0530
Subject: [PATCH 35/64] chore: apply auto-fixes from pre-push hook

---
 app/src-tauri/src/meet_audio/speak_pump.rs |  6 ++---
 app/src-tauri/src/meet_call/mod.rs         |  3 ++-
 app/src-tauri/src/meet_scanner/mod.rs      | 17 ++++----------
 src/openhuman/meet_agent/brain.rs          | 26 +++++++++++-----------
 src/openhuman/meet_agent/session.rs        |  3 +--
 5 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs
index 7d5c5b55e1..ca64e95b0b 100644
--- a/app/src-tauri/src/meet_audio/speak_pump.rs
+++ b/app/src-tauri/src/meet_audio/speak_pump.rs
@@ -135,9 +135,9 @@ async fn poll_and_feed(
             Ok(stopped) => log::info!(
                 "[meet-audio] barge-in flush request_id={request_id} sources_stopped={stopped}"
             ),
-            Err(e) => log::warn!(
-                "[meet-audio] barge-in flush failed request_id={request_id} err={e}"
-            ),
+            Err(e) => {
+                log::warn!("[meet-audio] barge-in flush failed request_id={request_id} err={e}")
+            }
         }
     }
 
diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index 24c8b080e1..fa61e35f22 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -199,7 +199,8 @@ pub async fn meet_call_open_window<R: Runtime>(
     if let Ok(pos) = window.outer_position() {
         log::info!(
             "[meet-call] post-build outer_position={{x:{},y:{}}} (target=-30000,-30000)",
-            pos.x, pos.y
+            pos.x,
+            pos.y
         );
     }
 
diff --git a/app/src-tauri/src/meet_scanner/mod.rs b/app/src-tauri/src/meet_scanner/mod.rs
index 771b2ca5c2..90fa49167a 100644
--- a/app/src-tauri/src/meet_scanner/mod.rs
+++ b/app/src-tauri/src/meet_scanner/mod.rs
@@ -130,9 +130,7 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     // Best-effort: if Network domain isn't enabled or CDP returns an
     // error, we log and continue — the bot may still land on the
     // verify screen but won't get worse than the pre-clear state.
-    let _ = cdp
-        .call("Network.enable", json!({}), Some(&session))
-        .await;
+    let _ = cdp.call("Network.enable", json!({}), Some(&session)).await;
     if let Err(err) = cdp
         .call("Network.clearBrowserCookies", json!({}), Some(&session))
         .await
@@ -203,11 +201,7 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     if let Err(err) = click_by_aria_label(
         &mut cdp,
         &session,
-        &[
-            "turn on camera",
-            "turn camera on",
-            "camera is off",
-        ],
+        &["turn on camera", "turn camera on", "camera is off"],
         Duration::from_secs(8),
     )
     .await
@@ -232,9 +226,7 @@ async fn run(request_id: &str, meet_url: &str, display_name: &str) -> Result<(),
     )
     .await
     {
-        log::info!(
-            "[meet-scanner] mic toggle ON not clicked (already on or label drift): {err}"
-        );
+        log::info!("[meet-scanner] mic toggle ON not clicked (already on or label drift): {err}");
         dump_aria_labels(&mut cdp, &session, "mic|microphone|audio").await;
     }
 
@@ -393,8 +385,7 @@ async fn wait_for_admission(cdp: &mut CdpConn, session: &str) -> Result<(), Stri
 /// inspect what Meet actually exposes after a failed
 /// [`click_by_aria_label`]. Best-effort, swallows all CDP errors.
 async fn dump_aria_labels(cdp: &mut CdpConn, session: &str, pattern: &str) {
-    let pattern_js =
-        serde_json::to_string(pattern).unwrap_or_else(|_| "\"camera\"".to_string());
+    let pattern_js = serde_json::to_string(pattern).unwrap_or_else(|_| "\"camera\"".to_string());
     let expression = format!(
         r#"
         (() => {{
diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 30e6a13fd6..d783fcb2fd 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -570,11 +570,8 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
     );
 
     let fut = agent.run_single(&user_message);
-    let reply = match tokio::time::timeout(
-        Duration::from_secs(AGENTIC_TURN_TIMEOUT_SECS),
-        fut,
-    )
-    .await
+    let reply = match tokio::time::timeout(Duration::from_secs(AGENTIC_TURN_TIMEOUT_SECS), fut)
+        .await
     {
         Ok(Ok(text)) => text,
         Ok(Err(e)) => {
@@ -597,9 +594,7 @@ async fn llm_meeting_agentic(prompt: &str, request_id: &str) -> Result<String, S
 /// Get the cached orchestrator for this meet, or build it on first
 /// call. Returns an `Arc<TokioMutex<Agent>>` so the caller can lock
 /// across the run_single().await.
-async fn get_or_build_agent_for_meet(
-    request_id: &str,
-) -> Result<Arc<TokioMutex<Agent>>, String> {
+async fn get_or_build_agent_for_meet(request_id: &str) -> Result<Arc<TokioMutex<Agent>>, String> {
     {
         let cache = agent_cache().lock().await;
         if let Some(existing) = cache.get(request_id) {
@@ -625,9 +620,7 @@ async fn get_or_build_agent_for_meet(
     agent.set_event_context(format!("meet_{request_id}"), "meet_agent");
     agent.set_agent_definition_name(format!("orchestrator_meet_{}", short_id(request_id)));
 
-    log::info!(
-        "[meet-agent] orchestrator built + cached for request_id={request_id}"
-    );
+    log::info!("[meet-agent] orchestrator built + cached for request_id={request_id}");
 
     let arc = Arc::new(TokioMutex::new(agent));
     agent_cache()
@@ -815,7 +808,9 @@ fn strip_untagged_reasoning(text: &str) -> String {
         .iter()
         .filter(|s| {
             let lc = s.to_lowercase();
-            !REASONING_OPENERS.iter().any(|opener| lc.starts_with(opener))
+            !REASONING_OPENERS
+                .iter()
+                .any(|opener| lc.starts_with(opener))
         })
         .copied()
         .collect();
@@ -838,7 +833,12 @@ fn cap_for_speech(text: &str, max_chars: usize) -> String {
     }
     let prefix: String = text.chars().take(max_chars).collect();
     if let Some(idx) = prefix.rfind(['.', '!', '?']) {
-        let end = idx + prefix[idx..].chars().next().map(char::len_utf8).unwrap_or(1);
+        let end = idx
+            + prefix[idx..]
+                .chars()
+                .next()
+                .map(char::len_utf8)
+                .unwrap_or(1);
         return prefix[..end].trim_end().to_string();
     }
     let mut out = prefix.trim_end().to_string();
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index f1170ec30d..a093504571 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -198,8 +198,7 @@ impl MeetAgentSession {
                 return false;
             }
         }
-        self.last_caption_by_speaker
-            .insert(key, normalised);
+        self.last_caption_by_speaker.insert(key, normalised);
         // Gate: while a brain turn is in flight (LLM + tools running),
         // refuse to fire a fresh wake. The prior gate also blocked on
         // is_speaking() (outbound queued), but that prevented barge-in

From 26c65fc5bd7de1995bc43a15f14280679de0cd0a Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 02:28:42 +0530
Subject: [PATCH 36/64] fix(meet-agent): skip pre-roll ack for short prompts
 (greetings / time / hear-me)

---
 src/openhuman/meet_agent/brain.rs | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index d783fcb2fd..ae7ca1d4ab 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -180,11 +180,17 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
     // re-asks (which the turn_in_progress gate now blocks but still
     // burns the call atmosphere). Speak a 2-word ack right away and
     // enqueue with done=false so the real reply appends cleanly when
-    // it lands. If the agent path returns < 1s (greeting, time
-    // question), the user hears "On it. <real reply>" — slightly
-    // redundant but not annoying. On slow paths the ack covers
-    // exactly the dead air it was designed for.
-    if !was_bare_wake {
+    // it lands.
+    //
+    // Skip pre-roll on short prompts: greetings ("hi"), checks ("can
+    // you hear me", "are you there"), time questions ("what's the
+    // time"), and other trivial asks the agent answers in 2-5s
+    // without tools — those don't need the ack, and "On it. Yes, I
+    // can hear you" sounds redundant. The 50-char threshold is a
+    // rough proxy; real second-brain questions ("am I free Friday
+    // afternoon for a 30 min slot") are almost always longer.
+    const PREROLL_SKIP_PROMPT_CHARS: usize = 50;
+    if !was_bare_wake && prompt.chars().count() > PREROLL_SKIP_PROMPT_CHARS {
         if let Ok(ack_pcm) = tts(PREROLL_ACK_PHRASE).await {
             let _ = registry().with_session(request_id, |s| {
                 s.enqueue_outbound_pcm(&ack_pcm, false);

From 257646cf070b9678605601ccdf1136be29a169b9 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 02:38:02 +0530
Subject: [PATCH 37/64] =?UTF-8?q?fix(meet-agent):=20cooldown=20+=20min-tur?=
 =?UTF-8?q?n-gap=20=E2=86=92=2060s=20each=20to=20kill=20residual-caption?=
 =?UTF-8?q?=20re-fires?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/openhuman/meet_agent/session.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index a093504571..5e66bc6157 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -233,7 +233,7 @@ impl MeetAgentSession {
         // Without this the bot replied to the same user question 4-5
         // times when Meet's caption observer kept re-emitting the line
         // with subtle text variation that slipped past the dedup.
-        const MIN_TURN_GAP_MS: u64 = 15_000;
+        const MIN_TURN_GAP_MS: u64 = 60_000;
         let now_wall_ms = std::time::SystemTime::now()
             .duration_since(UNIX_EPOCH)
             .map(|d| d.as_millis() as u64)
@@ -346,7 +346,7 @@ impl MeetAgentSession {
         // fresh wake. Under-prompted users especially repeat the wake
         // phrase 2-3 times before realising the bot already heard them
         // — without this, each repeat fires another tool call.
-        const COOLDOWN_MS: u64 = 30_000;
+        const COOLDOWN_MS: u64 = 60_000;
         self.wake_cooldown_until_ts_ms = self.last_caption_ts_ms.saturating_add(COOLDOWN_MS);
         let prompt = std::mem::take(&mut self.pending_prompt);
         let trimmed = prompt.trim().to_string();

From 140f371c30db51a6cd81037503795fbefb4b81c6 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 02:51:36 +0530
Subject: [PATCH 38/64] test(skills): update MeetingBotsCard tests for Flow A
 (joinMeetCall)

---
 .../skills/__tests__/MeetingBotsCard.test.tsx | 45 +++++++------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
index 1d99f8d739..192e9211f1 100644
--- a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
+++ b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
@@ -11,7 +11,11 @@ vi.mock('../../../services/meetCallService', async () => {
   );
   return {
     ...actual,
-    joinMeetingViaMascotBot: (...args: unknown[]) => joinMock(...args),
+    // Flow A: the modal submit calls joinMeetCall (CEF webview), not the
+    // Flow B backend joinMeetingViaMascotBot. Switched in the
+    // mascot-meet-flowA revival commits — kept the mock variable name
+    // `joinMock` to keep the diff focused on the call site swap.
+    joinMeetCall: (...args: unknown[]) => joinMock(...args),
   };
 });
 
@@ -45,8 +49,8 @@ describe('MeetingBotsCard', () => {
     expect(screen.queryByRole('dialog')).not.toBeInTheDocument();
   });
 
-  it('submits to joinMeetingViaMascotBot and fires a success toast', async () => {
-    joinMock.mockResolvedValueOnce({ success: true });
+  it('submits to joinMeetCall and fires a success toast', async () => {
+    joinMock.mockResolvedValueOnce({ requestId: 'req-1' });
     const onToast = vi.fn();
     render(<MeetingBotsCard onToast={onToast} />);
 
@@ -57,10 +61,13 @@ describe('MeetingBotsCard', () => {
     const form = screen.getByRole('dialog').querySelector('form')!;
     fireEvent.submit(form);
 
+    // Flow A's joinMeetCall takes { meetUrl, displayName }. The component
+    // synthesises displayName from the user profile; the test asserts on
+    // meetUrl + the presence of a displayName field rather than its exact
+    // value (which would couple the test to the auth-fixture seam).
     await vi.waitFor(() => {
       expect(joinMock).toHaveBeenCalledWith(
         expect.objectContaining({
-          platform: 'gmeet',
           meetUrl: 'https://meet.google.com/abc-defg-hij',
         })
       );
@@ -76,31 +83,11 @@ describe('MeetingBotsCard', () => {
     });
   });
 
-  it('surfaces a capacity-gated error inline + as an amber toast', async () => {
-    joinMock.mockRejectedValueOnce({
-      isCapacityGated: true,
-      message: 'busy',
-    });
-    const onToast = vi.fn();
-    render(<MeetingBotsCard onToast={onToast} />);
-
-    fireEvent.click(screen.getByTestId('meeting-bots-banner'));
-    fireEvent.change(screen.getByLabelText(/meeting link/i), {
-      target: { value: 'https://meet.google.com/x' },
-    });
-    fireEvent.submit(screen.getByRole('dialog').querySelector('form')!);
-
-    await vi.waitFor(() => {
-      expect(onToast).toHaveBeenCalledWith(
-        expect.objectContaining({ type: 'error', title: expect.stringMatching(/busy/i) })
-      );
-    });
-    // Modal stays open so the user can retry; inline alert visible.
-    expect(screen.getByRole('alert')).toBeInTheDocument();
-  });
-
-  it('surfaces a non-capacity error', async () => {
-    joinMock.mockRejectedValueOnce({ isCapacityGated: false, message: 'Bad URL' });
+  // Flow A's joinMeetCall has no capacity-gated concept — any throw maps
+  // to the single "could not start" toast + inline alert with the error
+  // message. Two error cases collapsed into one in the Flow A model.
+  it('surfaces a join error inline + as an error toast', async () => {
+    joinMock.mockRejectedValueOnce(new Error('Bad URL'));
     const onToast = vi.fn();
     render(<MeetingBotsCard onToast={onToast} />);
 

From d2fcda7ecdb89cda1eb26d40da8142c3fd65fb9f Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Sat, 23 May 2026 03:05:18 +0530
Subject: [PATCH 39/64] test(meet-agent): poll for audio up to 30s instead of
 single 50ms sleep

The agent-cached path builds an orchestrator on first wake (memory tree
load + MCP init), which can take several seconds even in a minimal test
environment. The prior fixed 50ms sleep raced against that, so the test
asserted on an empty queue. Convert to a polling loop that checks every
100ms and gives up after 300 attempts (~30s), so the test exits the
moment audio lands but tolerates the slower cold path.
---
 src/openhuman/meet_agent/rpc.rs | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index 5d3d896518..9be37545f2 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -259,13 +259,25 @@ mod tests {
         }
         assert_eq!(last, json!(true), "expected a turn_started=true reply");
 
-        // Give the spawned turn a moment to enqueue audio.
-        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
-
-        let mut poll = Map::new();
-        poll.insert("request_id".into(), json!("rpc-push"));
-        let out = handle_poll_speech(poll).await.unwrap();
-        let pcm = out.get("pcm_base64").and_then(|v| v.as_str()).unwrap_or("");
+        // Wait up to 30s for the spawned brain turn to enqueue audio.
+        // The agentic path builds an orchestrator Agent on first wake
+        // (memory tree load + MCP init), which can take several seconds
+        // even in a minimal test environment. Failing the agentic path
+        // (no backend token) still falls through to a canned-ack TTS
+        // stub, so the queue eventually fills regardless. Poll every
+        // 100ms so the test exits the moment audio lands.
+        let mut pcm = String::new();
+        for _ in 0..300 {
+            let mut poll = Map::new();
+            poll.insert("request_id".into(), json!("rpc-push"));
+            let out = handle_poll_speech(poll).await.unwrap();
+            let chunk = out.get("pcm_base64").and_then(|v| v.as_str()).unwrap_or("");
+            if !chunk.is_empty() {
+                pcm = chunk.to_string();
+                break;
+            }
+            tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+        }
         assert!(!pcm.is_empty(), "expected synthesized audio after turn");
 
         let mut stop = Map::new();

From 42834af84077e7cc795116ba25d3b3dbcf6d7f30 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:14:09 +0530
Subject: [PATCH 40/64] feat(meet-agent): owner-only wake gate (privacy lock)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `owner_display_name` + `bot_display_name` fields to
`MeetAgentSession`. Reject `note_caption` wakes unless the speaker
matches the owner, and drop the bot's own captioned TTS as
self-echo. Empty owner fails closed so a misconfigured launch
can never expose the user's tool surface.

The brain runs the user's full orchestrator with 119 Composio
integrations + memory tree. Without an identity gate, any
participant in the Meet who says the wake phrase can issue tool
calls in the user's name and have the results spoken back to the
whole room (e.g. "hey openhuman read my Slack DMs from <person>"
→ private data broadcast). Gate is intentionally enforced before
dedup / cooldown so unauthorised-wake attempts are auditable.

Normalisation strips a single trailing parenthetical so Meet's
"(host)" / "(you)" / "(presenter)" decorators don't break the
match, and lowercases for case-insensitive compare. Unit tests
cover the four denial paths (non-owner, bot-self, empty owner,
case insensitivity) plus the (host)-suffix path.
---
 src/openhuman/meet_agent/session.rs | 240 +++++++++++++++++++++++++++-
 1 file changed, 237 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 5e66bc6157..421aaaefb5 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -105,6 +105,21 @@ pub struct MeetAgentSession {
     /// "user asks once, bot answers 5 times" pattern when caption
     /// residue keeps re-matching the wake phrase.
     last_turn_done_at_ms: u64,
+    /// Display name of the call owner — the user who launched the
+    /// bot. Only captions from this speaker may trip the wake word.
+    /// Empty until [`set_identities`] is called; while empty the
+    /// gate fails closed (no wakes fire) so a misconfigured launch
+    /// can never leak the user's tool surface to a remote
+    /// participant. Normalisation (lowercase / parenthetical
+    /// suffix strip) happens at compare time inside note_caption.
+    owner_display_name: String,
+    /// Display name the bot uses as its Meet participant tile.
+    /// Used to drop the bot's own captions (Meet renders the bot's
+    /// TTS in the same captions region as remote speakers; without
+    /// an explicit bot-self filter the bot would re-wake on its
+    /// own voice). Empty until set; while empty the bot-self filter
+    /// is inert.
+    bot_display_name: String,
 }
 
 impl MeetAgentSession {
@@ -129,9 +144,34 @@ impl MeetAgentSession {
             turn_in_progress: false,
             flush_pending: false,
             last_turn_done_at_ms: 0,
+            owner_display_name: String::new(),
+            bot_display_name: String::new(),
         }
     }
 
+    /// Set the call-owner display name (the human who launched the
+    /// bot) and the bot's own Meet participant name. The note_caption
+    /// gate uses both: captions are accepted only when the speaker
+    /// matches the owner, and the bot-self filter drops captions
+    /// authored by the bot's own TTS feed.
+    ///
+    /// Either argument may be empty. Empty owner_display_name
+    /// fails-closed (the gate refuses every wake) so a misconfigured
+    /// launch can never expose the user's tool surface to a remote
+    /// participant. Empty bot_display_name simply disables the
+    /// bot-self filter — the dedup / cooldown layers still keep the
+    /// loop in check, but it's a less-defended posture.
+    pub fn set_identities(&mut self, owner_display_name: &str, bot_display_name: &str) {
+        self.owner_display_name = owner_display_name.trim().to_string();
+        self.bot_display_name = bot_display_name.trim().to_string();
+    }
+
+    /// Read accessor used by audit logging. Empty when set_identities
+    /// has not been called for this session.
+    pub fn owner_display_name(&self) -> &str {
+        &self.owner_display_name
+    }
+
     /// Stamp the current wall-clock time as "turn just finished". The
     /// brain calls this from the final with_session block of
     /// run_caption_turn (alongside clearing turn_in_progress) so the
@@ -178,6 +218,63 @@ impl MeetAgentSession {
         if speaker_lower == "you" || speaker_lower.is_empty() {
             return false;
         }
+        // Privacy gate — owner-only wake.
+        //
+        // Today the brain runs the user's full orchestrator agent with
+        // their tool surface (calendar, Slack, Gmail, … 119 Composio
+        // integrations) and the user's memory tree. A meeting is a
+        // public room. Without an identity gate, *any* participant who
+        // says the wake phrase (or whose audio Meet transcribes near
+        // one) can issue tool calls in the user's name and have the
+        // results spoken aloud to the whole room — a hard privacy
+        // leak. So before any wake / dedup / cooldown work happens we
+        // require: speaker == owner_display_name. Anyone else (and
+        // the bot itself) is dropped without recording an event.
+        //
+        // Normalisation is intentionally light (lowercase + trim +
+        // parenthetical suffix strip) so Meet's "(host)" / "(you)"
+        // decorations don't break the match. Anything fancier
+        // (NFKC, diacritic folding) waits for a real-name smoke
+        // report — start tight, expand only on evidence.
+        let speaker_norm = normalise_participant_name(speaker);
+        let owner_norm = normalise_participant_name(&self.owner_display_name);
+        let bot_norm = normalise_participant_name(&self.bot_display_name);
+        // Bot-self filter first: a bot caption that happens to match
+        // its own display name must never re-wake. Run before the
+        // owner check so a (very contrived) bot_display_name ==
+        // owner_display_name still doesn't let the bot wake itself.
+        if !bot_norm.is_empty() && speaker_norm == bot_norm {
+            return false;
+        }
+        // Fail-closed when no owner has been configured. A live
+        // session without a known owner is by definition unsafe —
+        // any participant could wake. Log once per such caption so
+        // operators can spot the misconfiguration in the dev log.
+        if owner_norm.is_empty() {
+            log::warn!(
+                "[meet-agent] wake refused: no owner_display_name configured \
+                 request_id={} speaker={}",
+                self.request_id,
+                speaker
+            );
+            return false;
+        }
+        if speaker_norm != owner_norm {
+            // Audit-style log so dev:app stdout makes the rejection
+            // visible without leaking the caption body verbatim
+            // (preview capped, matches the wake-preview style used
+            // upstream in handle_push_caption).
+            let preview: String = text.chars().take(40).collect();
+            log::info!(
+                "[meet-agent] unauthorized_wake_attempt request_id={} \
+                 speaker=\"{}\" owner=\"{}\" preview=\"{}\"",
+                self.request_id,
+                speaker,
+                self.owner_display_name,
+                preview
+            );
+            return false;
+        }
         // Per-speaker dedup. Meet's CC region re-renders the same line
         // every 250 ms poll tick and emits BOTH speaker rows on each
         // walk (the user AND the bot TTS as speaker="You"). A single-
@@ -464,6 +561,51 @@ impl MeetAgentSession {
     }
 }
 
+/// Canonicalise a Meet participant display name for the owner-gate
+/// comparison. Strips a single trailing parenthetical decorator
+/// (Meet appends `" (host)"`, `" (you)"`, `" (presenter)"` to some
+/// captions and labels), lowercases (Unicode-aware), and collapses internal
+/// whitespace. NFKC folding is *not* applied — start tight and
+/// expand on real-world miss reports rather than guessing at the
+/// shape of names we haven't seen yet. Returns empty when the input
+/// is empty / whitespace-only so the caller can fail-closed.
+fn normalise_participant_name(raw: &str) -> String {
+    let trimmed = raw.trim();
+    if trimmed.is_empty() {
+        return String::new();
+    }
+    // Strip a single trailing parenthetical (e.g. "Alice (host)").
+    // We only strip when the parenthetical is at the end and the
+    // preceding chunk is non-empty — guards against pathological
+    // inputs like "()" or "(host)" alone.
+    let stripped: &str = if let Some(open_idx) = trimmed.rfind(" (") {
+        if trimmed.ends_with(')') && open_idx > 0 {
+            &trimmed[..open_idx]
+        } else {
+            trimmed
+        }
+    } else {
+        trimmed
+    };
+    // Lowercase + collapse internal whitespace.
+    let mut out = String::with_capacity(stripped.len());
+    let mut prev_space = true;
+    for c in stripped.chars() {
+        if c.is_whitespace() {
+            if !prev_space {
+                out.push(' ');
+                prev_space = true;
+            }
+        } else {
+            for lc in c.to_lowercase() {
+                out.push(lc);
+            }
+            prev_space = false;
+        }
+    }
+    out.trim_end().to_string()
+}
+
 /// Lowercase + drop non-alphanumeric + collapse whitespace. Used by
 /// the per-speaker dedup so Meet's punctuation/case jitter between
 /// caption emits doesn't bypass the dedup. Same shape as
@@ -613,9 +755,20 @@ mod tests {
         .unwrap();
     }
 
+    /// Build a session pre-configured for the wake-word tests: Alice
+    /// is the call owner, "OpenHuman" is the bot's Meet tile. Every
+    /// wake-path test goes through this helper so the owner gate
+    /// (the privacy hard-lock around tool calls) is consistently
+    /// in scope.
+    fn session_with_owner_alice() -> MeetAgentSession {
+        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        s.set_identities("Alice", "OpenHuman");
+        s
+    }
+
     #[test]
     fn note_caption_handles_punctuated_wake() {
-        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        let mut s = session_with_owner_alice();
         // Meet often inserts a comma after "hey".
         let fired = s.note_caption("Alice", "Hey, OpenHuman remember the launch", 1);
         assert!(fired, "punctuated wake phrase should still fire");
@@ -625,7 +778,7 @@ mod tests {
 
     #[test]
     fn note_caption_handles_split_brand() {
-        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        let mut s = session_with_owner_alice();
         let fired = s.note_caption("Alice", "hey open-human, send the report", 1);
         assert!(fired);
         let prompt = s.take_pending_prompt().expect("prompt drained");
@@ -634,7 +787,7 @@ mod tests {
 
     #[test]
     fn note_caption_does_not_double_fire_on_growing_caption() {
-        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        let mut s = session_with_owner_alice();
         let first = s.note_caption("Alice", "hey openhuman take notes", 1);
         assert!(first);
         let second = s.note_caption("Alice", "hey openhuman take notes about the launch", 2);
@@ -662,4 +815,85 @@ mod tests {
         })
         .unwrap();
     }
+
+    // -- Owner-only wake gate (privacy lock) --------------------------
+
+    #[test]
+    fn note_caption_rejects_non_owner_speaker() {
+        let mut s = session_with_owner_alice();
+        // Bob is in the room but not the owner; even with a perfect
+        // wake phrase the gate must refuse.
+        let fired = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1);
+        assert!(!fired, "non-owner must not wake the bot");
+        assert!(s.take_pending_prompt().is_none());
+    }
+
+    #[test]
+    fn note_caption_rejects_bot_self_caption() {
+        let mut s = session_with_owner_alice();
+        // Meet often re-captions the bot's own TTS in the same region.
+        // The bot must never wake on its own voice — regardless of
+        // the text content, including text that happens to repeat the
+        // wake phrase.
+        let fired = s.note_caption(
+            "OpenHuman",
+            "hey openhuman would you like to know more",
+            1,
+        );
+        assert!(!fired, "bot-self caption must be filtered");
+    }
+
+    #[test]
+    fn note_caption_fails_closed_when_owner_unconfigured() {
+        // No set_identities call → owner empty → no wake regardless of
+        // speaker. Mirrors the misconfigured-launch posture: better
+        // silent failure than an open mic for the user's tool surface.
+        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        let fired = s.note_caption("Alice", "hey openhuman do the thing", 1);
+        assert!(!fired, "empty owner must fail-closed");
+    }
+
+    #[test]
+    fn note_caption_owner_with_host_suffix_matches() {
+        // Meet decorates some captions with "(host)" / "(you)". The
+        // normaliser strips a single trailing parenthetical so the
+        // gate still recognises Alice when Meet renders her as
+        // "Alice (host)".
+        let mut s = session_with_owner_alice();
+        let fired = s.note_caption("Alice (host)", "hey openhuman take a note", 1);
+        assert!(fired, "owner with parenthetical decorator must match");
+    }
+
+    #[test]
+    fn note_caption_owner_case_insensitive() {
+        // Meet sometimes title-cases display names that the user
+        // entered in lowercase, or vice versa. The comparison must
+        // be case-insensitive.
+        let mut s = session_with_owner_alice();
+        let fired = s.note_caption("ALICE", "hey openhuman summarise", 1);
+        assert!(fired, "owner match must be case-insensitive");
+    }
+
+    #[test]
+    fn normalise_participant_name_strips_trailing_paren() {
+        assert_eq!(normalise_participant_name("Alice (host)"), "alice");
+        assert_eq!(normalise_participant_name("Bob (you)"), "bob");
+        // No paren — left as-is (modulo lowercase / trim).
+        assert_eq!(normalise_participant_name("  Charlie  "), "charlie");
+        // Internal whitespace collapsed.
+        assert_eq!(normalise_participant_name("First  Last"), "first last");
+        // Pathological standalone paren — preserved so the gate can
+        // still treat it as a name distinct from the owner.
+        assert_eq!(normalise_participant_name("(host)"), "(host)");
+        // Empty stays empty so callers can fail-closed.
+        assert_eq!(normalise_participant_name(""), "");
+        assert_eq!(normalise_participant_name("   "), "");
+    }
+
+    #[test]
+    fn set_identities_trims_whitespace() {
+        let mut s = MeetAgentSession::new("p".into(), 16_000);
+        s.set_identities("  Alice  ", "\tOpenHuman\n");
+        assert_eq!(s.owner_display_name(), "Alice");
+    }
 }

From 6b1c7a8f75cb841e11c7b12e280446dfd75fab88 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:14:16 +0530
Subject: [PATCH 41/64] feat(meet-agent): plumb owner+bot display names into
 start_session RPC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend `StartSessionRequest` with `owner_display_name` and
`bot_display_name` (both `#[serde(default)]` so old shells keep
parsing). `handle_start_session` installs the identities via
`session.set_identities` right after the registry create — before
any caption push can arrive, so a racing wake never reads the
empty-owner state in a way that could leak. Done as a two-step
register+set rather than threading the identities through the
existing `start()` signature so smoke tests (and any future
non-Meet caller) don't have to be updated in lockstep.
---
 src/openhuman/meet_agent/rpc.rs   | 18 ++++++++++++++++--
 src/openhuman/meet_agent/types.rs | 18 ++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index 9be37545f2..b811f8c62d 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -31,10 +31,24 @@ pub async fn handle_start_session(params: Map<String, Value>) -> Result<Value, S
         .map_err(|e| format!("{LOG_PREFIX} invalid start_session params: {e}"))?;
 
     registry().start(&req.request_id, req.sample_rate_hz)?;
+    // Install the call-owner identity before any captions can arrive.
+    // The session is created with empty identities — which deliberately
+    // fails closed in note_caption — so racing a push_caption against
+    // this with_session call would simply drop the early caption rather
+    // than leak it. Done as a second step (vs threading through
+    // `start`) so the registry's start signature stays unchanged and
+    // existing callers (legacy shell variants, smoke tests) don't have
+    // to be updated in lockstep.
+    registry().with_session(&req.request_id, |s| {
+        s.set_identities(&req.owner_display_name, &req.bot_display_name);
+    })?;
     log::info!(
-        "{LOG_PREFIX} start_session request_id={} sample_rate_hz={}",
+        "{LOG_PREFIX} start_session request_id={} sample_rate_hz={} \
+         owner_chars={} bot_chars={}",
         req.request_id,
-        req.sample_rate_hz
+        req.sample_rate_hz,
+        req.owner_display_name.chars().count(),
+        req.bot_display_name.chars().count()
     );
 
     RpcOutcome::new(
diff --git a/src/openhuman/meet_agent/types.rs b/src/openhuman/meet_agent/types.rs
index 636fbd6825..b03b266a55 100644
--- a/src/openhuman/meet_agent/types.rs
+++ b/src/openhuman/meet_agent/types.rs
@@ -21,6 +21,24 @@ pub struct StartSessionRequest {
     /// sending. Validated on entry.
     #[serde(default = "default_sample_rate")]
     pub sample_rate_hz: u32,
+    /// Display name of the call owner — the human who launched the
+    /// bot. Used by the wake-word gate in [`crate::openhuman::meet_agent::session`]
+    /// as the *only* speaker allowed to issue tool calls. Captions
+    /// from any other participant are dropped without recording an
+    /// event. Empty string fails closed (no wake fires) so a
+    /// misconfigured shell can never expose the user's tool surface.
+    /// Defaulted so older shells / smoke tests that don't yet set
+    /// the field still parse the payload.
+    #[serde(default)]
+    pub owner_display_name: String,
+    /// Display name the bot uses as its Meet participant tile.
+    /// Captions whose `speaker` matches this name are treated as the
+    /// bot's own TTS echoing back and dropped — without an explicit
+    /// filter the bot would re-wake on its own voice. Empty disables
+    /// the filter; dedup + cooldown still apply but it's a weaker
+    /// posture.
+    #[serde(default)]
+    pub bot_display_name: String,
 }
 
 fn default_sample_rate() -> u32 {

From 994e66696c69e0a7afe30942b036d978828e0ae5 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:14:23 +0530
Subject: [PATCH 42/64] feat(tauri/meet): forward owner display name from shell
 to core wake gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`meet_call_open_window`'s `OpenWindowArgs` gains
`owner_display_name`. `meet_audio::start` now accepts both the
bot and owner names and includes them in the
`openhuman.meet_agent_start_session` RPC payload, so the core
wake gate is armed before the first caption arrives.

Dev-auto launch path in `lib.rs` passes an empty owner name —
the gate fails closed (no wakes fire) which is the safe posture
for an automated harness that has no real user behind the
keyboard.
---
 app/src-tauri/src/lib.rs            |  4 ++++
 app/src-tauri/src/meet_audio/mod.rs | 16 +++++++++++++---
 app/src-tauri/src/meet_call/mod.rs  | 24 +++++++++++++++++++++---
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs
index 3b62ae2654..a4ba8f1509 100644
--- a/app/src-tauri/src/lib.rs
+++ b/app/src-tauri/src/lib.rs
@@ -2989,6 +2989,10 @@ pub fn run() {
                             request_id: request_id.clone(),
                             meet_url: meet_url.clone(),
                             display_name: "OpenHuman Dev".to_string(),
+                            // Dev-auto launch has no real user identity — the
+                            // wake gate will fail-closed (no wakes fire) which
+                            // is the safe posture for an automated harness.
+                            owner_display_name: String::new(),
                         };
                         match meet_call::meet_call_open_window(app_handle.clone(), state, args)
                             .await
diff --git a/app/src-tauri/src/meet_audio/mod.rs b/app/src-tauri/src/meet_audio/mod.rs
index 789dcce401..2f1f4442d1 100644
--- a/app/src-tauri/src/meet_audio/mod.rs
+++ b/app/src-tauri/src/meet_audio/mod.rs
@@ -87,10 +87,15 @@ pub async fn start<R: Runtime>(
     app: AppHandle<R>,
     request_id: String,
     meet_url: String,
+    owner_display_name: String,
+    bot_display_name: String,
 ) -> Result<(), String> {
     log::info!(
-        "[meet-audio] start request_id={request_id} url_prefix={}",
-        truncate_for_log(&meet_url, 64)
+        "[meet-audio] start request_id={request_id} url_prefix={} \
+         owner_chars={} bot_chars={}",
+        truncate_for_log(&meet_url, 64),
+        owner_display_name.chars().count(),
+        bot_display_name.chars().count()
     );
 
     if let Some(state) = app.try_state::<MeetAudioState>() {
@@ -104,12 +109,17 @@ pub async fn start<R: Runtime>(
     }
 
     // Tell core to open its session first so the very first PCM push
-    // doesn't race the start RPC.
+    // doesn't race the start RPC. Hand the call owner + bot display
+    // names through with the request so the core wake-word gate
+    // (privacy lock: only the owner can trigger tool calls) is
+    // active before the first caption can arrive.
     rpc_call(
         "openhuman.meet_agent_start_session",
         serde_json::json!({
             "request_id": request_id,
             "sample_rate_hz": 16_000,
+            "owner_display_name": owner_display_name,
+            "bot_display_name": bot_display_name,
         }),
     )
     .await?;
diff --git a/app/src-tauri/src/meet_call/mod.rs b/app/src-tauri/src/meet_call/mod.rs
index fa61e35f22..01b72f5ea4 100644
--- a/app/src-tauri/src/meet_call/mod.rs
+++ b/app/src-tauri/src/meet_call/mod.rs
@@ -81,7 +81,18 @@ impl Default for MeetCallState {
 pub struct OpenWindowArgs {
     pub request_id: String,
     pub meet_url: String,
+    /// Bot's Meet participant tile name — what the bot types into
+    /// Meet's "Your name" input. Also passed to the core wake gate
+    /// so the bot's own captioned TTS is filtered out as self-echo.
     pub display_name: String,
+    /// Call owner's Meet participant name — the human who launched
+    /// the bot. The core wake-word gate (privacy lock: only the
+    /// owner can trigger tool calls) compares speaker captions
+    /// against this value. Defaulted to empty so callers staged
+    /// during the rollout window keep parsing; an empty owner
+    /// fails closed in core (no wakes fire).
+    #[serde(default)]
+    pub owner_display_name: String,
 }
 
 /// Open a dedicated top-level CEF webview window pointed at the Meet URL.
@@ -237,10 +248,17 @@ pub async fn meet_call_open_window<R: Runtime>(
         let app_for_audio = app.clone();
         let request_id_for_audio = request_id.clone();
         let url_for_audio = parsed.to_string();
+        let bot_for_audio = args.display_name.clone();
+        let owner_for_audio = args.owner_display_name.clone();
         tauri::async_runtime::spawn(async move {
-            if let Err(err) =
-                crate::meet_audio::start(app_for_audio, request_id_for_audio.clone(), url_for_audio)
-                    .await
+            if let Err(err) = crate::meet_audio::start(
+                app_for_audio,
+                request_id_for_audio.clone(),
+                url_for_audio,
+                owner_for_audio,
+                bot_for_audio,
+            )
+            .await
             {
                 log::warn!(
                     "[meet-call] meet_audio start failed request_id={request_id_for_audio} err={err}"

From 6e44f16e5fb535219b3512343feb470f10a69ddc Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:14:33 +0530
Subject: [PATCH 43/64] feat(skills): owner-name input + privacy hint on
 MeetingBotsCard modal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a required "Your name in the call" input to the Flow A join
modal and forward it through `joinMeetCall` → `meet_call_open_window`.
The hint text under the field tells the user this is the privacy
lock — OpenHuman will only respond to the wake word when this
exact name is speaking, so a remote participant can't trigger tool
calls in their name.

Submit is disabled until the owner field is non-empty; submitting
an empty value would fail closed in core anyway but surfacing the
requirement up front avoids the user joining a call and finding
the bot silent.

`IntelligenceCallsTab` is hidden behind a Coming Soon gate, so its
`joinMeetCall` call site passes an empty owner placeholder with a
note that the field has to be wired up when the tab is revived.

Vitest `MeetingBotsCard.test.tsx` updated to type a value into the
new field before submitting (previously the disabled-submit gate
would have blocked the form).
---
 .../intelligence/IntelligenceCallsTab.tsx     |  8 +++-
 app/src/components/skills/MeetingBotsCard.tsx | 42 ++++++++++++++++++-
 .../skills/__tests__/MeetingBotsCard.test.tsx | 19 +++++++--
 app/src/services/meetCallService.ts           | 32 +++++++++++++-
 4 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/app/src/components/intelligence/IntelligenceCallsTab.tsx b/app/src/components/intelligence/IntelligenceCallsTab.tsx
index cc3801ce27..5bc344a4ce 100644
--- a/app/src/components/intelligence/IntelligenceCallsTab.tsx
+++ b/app/src/components/intelligence/IntelligenceCallsTab.tsx
@@ -64,7 +64,13 @@ export default function IntelligenceCallsTab({ onToast }: Props) {
     setError(null);
     setSubmitting(true);
     try {
-      const result = await joinMeetCall({ meetUrl, displayName });
+      // ownerDisplayName left empty here because this tab's UI is hidden
+      // behind a "Coming Soon" gate (see render branch below) — the call
+      // is dead-code-reachable only. When the tab is revived it must
+      // collect an owner-name input the same way `MeetingBotsCard` does
+      // (privacy lock for the in-call wake gate). `joinMeetCall` rejects an
+      // empty owner name with an error, so no bot can join from here meanwhile.
+      const result = await joinMeetCall({ meetUrl, displayName, ownerDisplayName: '' });
       setActiveCalls(prev => [
         ...prev.filter(call => call.requestId !== result.requestId),
         { requestId: result.requestId, meetUrl: result.meetUrl, displayName: result.displayName },
diff --git a/app/src/components/skills/MeetingBotsCard.tsx b/app/src/components/skills/MeetingBotsCard.tsx
index f0f9301617..07b972bfe1 100644
--- a/app/src/components/skills/MeetingBotsCard.tsx
+++ b/app/src/components/skills/MeetingBotsCard.tsx
@@ -114,6 +114,13 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
   const [platform, setPlatform] = useState<MascotMeetPlatform>('gmeet');
   const [meetUrl, setMeetUrl] = useState('');
   const [displayName, setDisplayName] = useState('OpenHuman');
+  // Privacy lock: the bot will only react to the wake word when this
+  // exact name is the speaker in Meet's captions. Anyone else who
+  // says "hey openhuman …" is silently ignored — preventing a
+  // remote participant from issuing tool calls in the owner's
+  // name. Empty fails closed; the submit handler will surface an
+  // explicit error before opening the CEF window.
+  const [ownerDisplayName, setOwnerDisplayName] = useState('');
   const [submitting, setSubmitting] = useState(false);
   const [error, setError] = useState<string | null>(null);
 
@@ -143,7 +150,13 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
       // installs the audio/video bridges via CDP, then meet_scanner
       // drives the join automatically. Returns once the window has
       // been created — meet_audio + meet_scanner take it from there.
-      await joinMeetCall({ meetUrl, displayName });
+      //
+      // ownerDisplayName is the privacy lock: the wake-word gate in
+      // the core only accepts captions whose speaker matches this
+      // value (case-insensitive, "(host)" / "(you)" suffix stripped).
+      // Anyone else in the room saying the wake phrase is dropped
+      // without dispatching a tool turn.
+      await joinMeetCall({ meetUrl, displayName, ownerDisplayName });
       onToast?.({
         type: 'success',
         title: t('skills.meetingBots.joiningTitle'),
@@ -244,6 +257,29 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
               />
             </label>
 
+            <label className="block">
+              <span className="text-[10px] font-medium uppercase tracking-wide text-stone-500 dark:text-neutral-400">
+                Your name in the call
+              </span>
+              <input
+                type="text"
+                value={ownerDisplayName}
+                onChange={e => setOwnerDisplayName(e.target.value)}
+                maxLength={64}
+                placeholder="As shown in Google Meet (e.g. Nikhil Bajaj)"
+                disabled={isComingSoon || submitting}
+                aria-describedby="meeting-bots-owner-hint"
+                required
+                className="mt-1 w-full rounded-xl border border-stone-200 dark:border-neutral-800 bg-white dark:bg-neutral-900 px-3 py-2 text-sm text-stone-900 dark:text-neutral-100 focus:border-primary-500 focus:outline-none focus:ring-2 focus:ring-primary-100 disabled:cursor-not-allowed disabled:bg-stone-50 dark:disabled:bg-neutral-800/60"
+              />
+              <p
+                id="meeting-bots-owner-hint"
+                className="mt-1 text-[10px] leading-relaxed text-stone-500 dark:text-neutral-400">
+                Privacy lock. OpenHuman will only respond to the wake word when this exact name
+                is speaking — anyone else in the call cannot trigger tool calls in your name.
+              </p>
+            </label>
+
             {error && (
               <div
                 role="alert"
@@ -261,7 +297,9 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
               </button>
               <button
                 type="submit"
-                disabled={submitting || isComingSoon || !meetUrl.trim()}
+                disabled={
+                  submitting || isComingSoon || !meetUrl.trim() || !ownerDisplayName.trim()
+                }
                 className="rounded-xl bg-primary-500 px-4 py-2 text-sm font-semibold text-white hover:bg-primary-600 disabled:cursor-not-allowed disabled:bg-stone-200 dark:disabled:bg-neutral-700 disabled:text-stone-400 dark:disabled:text-neutral-500">
                 {isComingSoon
                   ? `${selected.label} ${t('skills.meetingBots.comingSoon')}`
diff --git a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
index 192e9211f1..1c7c3c0b4f 100644
--- a/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
+++ b/app/src/components/skills/__tests__/MeetingBotsCard.test.tsx
@@ -58,17 +58,25 @@ describe('MeetingBotsCard', () => {
     fireEvent.change(screen.getByLabelText(/meeting link/i), {
       target: { value: 'https://meet.google.com/abc-defg-hij' },
     });
+    // Owner display name is now required — the wake-word gate refuses
+    // every caption when this is empty (privacy lock), so the submit
+    // button stays disabled and the test would hang on form submit
+    // without typing a value here.
+    fireEvent.change(screen.getByLabelText(/your name in the call/i), {
+      target: { value: 'Alice' },
+    });
     const form = screen.getByRole('dialog').querySelector('form')!;
     fireEvent.submit(form);
 
-    // Flow A's joinMeetCall takes { meetUrl, displayName }. The component
-    // synthesises displayName from the user profile; the test asserts on
-    // meetUrl + the presence of a displayName field rather than its exact
-    // value (which would couple the test to the auth-fixture seam).
+    // Flow A's joinMeetCall takes { meetUrl, displayName, ownerDisplayName }.
+    // Assert on the owner name (the new privacy-lock contract) and meetUrl;
+    // the bot displayName is a UI-supplied default and not contract-load-
+    // bearing for this assertion.
     await vi.waitFor(() => {
       expect(joinMock).toHaveBeenCalledWith(
         expect.objectContaining({
           meetUrl: 'https://meet.google.com/abc-defg-hij',
+          ownerDisplayName: 'Alice',
         })
       );
     });
@@ -95,6 +103,9 @@ describe('MeetingBotsCard', () => {
     fireEvent.change(screen.getByLabelText(/meeting link/i), {
       target: { value: 'https://meet.google.com/x' },
     });
+    fireEvent.change(screen.getByLabelText(/your name in the call/i), {
+      target: { value: 'Alice' },
+    });
     fireEvent.submit(screen.getByRole('dialog').querySelector('form')!);
 
     await vi.waitFor(() => {
diff --git a/app/src/services/meetCallService.ts b/app/src/services/meetCallService.ts
index 6ca0248c25..5283cab416 100644
--- a/app/src/services/meetCallService.ts
+++ b/app/src/services/meetCallService.ts
@@ -15,12 +15,25 @@ import { isTauri } from '../utils/tauriCommands/common';
 import { apiClient } from './apiClient';
 import { callCoreRpc } from './coreRpcClient';
 
-export type MeetJoinCallInput = { meetUrl: string; displayName: string };
+export type MeetJoinCallInput = {
+  meetUrl: string;
+  /** Bot's display name in Meet's "Your name" prompt. */
+  displayName: string;
+  /**
+   * The launching user's display name as it will appear in the Meet
+   * call. This is the *only* speaker the in-call wake-word gate will
+   * accept — captions from any other participant are dropped before
+   * tools can be dispatched. Empty / missing is rejected by
+   * `joinMeetCall` with an error during input validation.
+   */
+  ownerDisplayName?: string;
+};
 
 export type MeetJoinCallResult = {
   requestId: string;
   meetUrl: string;
   displayName: string;
+  ownerDisplayName: string;
   windowLabel: string;
 };
 
@@ -29,9 +42,19 @@ type CoreJoinResponse = { ok: boolean; request_id: string; meet_url: string; dis
 export async function joinMeetCall(input: MeetJoinCallInput): Promise<MeetJoinCallResult> {
   const meetUrl = input.meetUrl.trim();
   const displayName = input.displayName.trim();
+  const ownerDisplayName = (input.ownerDisplayName ?? '').trim();
 
   if (!meetUrl) throw new Error('Please paste a Google Meet link.');
   if (!displayName) throw new Error('Please enter a display name.');
+  // Owner name is the privacy lock — captions from anyone else are
+  // refused by the core wake gate. Surfacing the requirement up front
+  // keeps the user from sitting through the join only to find the bot
+  // ignores them; matches the message the inline alert would show.
+  if (!ownerDisplayName) {
+    throw new Error(
+      'Please enter your own name as it will appear in the Meet so OpenHuman knows who to listen to.'
+    );
+  }
   // Refuse early outside the desktop shell so the browser dev surface
   // (`pnpm dev`) doesn't mint a stray request_id on the core for a join
   // attempt that has no chance of opening a CEF window.
@@ -57,6 +80,12 @@ export async function joinMeetCall(input: MeetJoinCallInput): Promise<MeetJoinCa
         request_id: rpcResult.request_id,
         meet_url: rpcResult.meet_url,
         display_name: rpcResult.display_name,
+        // Owner name doesn't round-trip through meet_join_call (the
+        // RPC is platform-agnostic validation only) — pass it
+        // directly to the shell so the meet_audio start path can
+        // hand it to the wake-word gate. See feat/mascot-meet-flowA
+        // Plan C — owner-only privacy lock.
+        owner_display_name: ownerDisplayName,
       },
     });
   } catch (err) {
@@ -73,6 +102,7 @@ export async function joinMeetCall(input: MeetJoinCallInput): Promise<MeetJoinCa
     requestId: rpcResult.request_id,
     meetUrl: rpcResult.meet_url,
     displayName: rpcResult.display_name,
+    ownerDisplayName,
     windowLabel,
   };
 }

From 683f3e20bf0a5c00b2a6615849d139ff0557815c Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:18:26 +0530
Subject: [PATCH 44/64] feat(tauri/meet-audio): emit speaking-state edges from
 speak_pump

The pump now tracks an edge-detected speaking flag per session and
fires a `meet-video:speaking-state` Tauri event on every flip. The
detector is gated by a 400 ms hangover so the natural gap between
two consecutive PCM chunks doesn't flap the mascot's mouth shut.
Shutdown and fatal-feed-error paths force the state to `false` so
the mascot can't get stuck mid-talk if the call dies during a TTS
chunk.

`poll_and_feed` now returns whether the tick carried PCM (the
edge-detector's input). `speak_pump::start` takes an
`AppHandle<R>` so the spawned task can emit events; updated the
single caller in `meet_audio::start`.

Frontend consumer (the in-Meet mascot frame producer) lands in the
next commit.
---
 app/src-tauri/src/meet_audio/mod.rs        |   2 +-
 app/src-tauri/src/meet_audio/speak_pump.rs | 148 +++++++++++++++++++--
 2 files changed, 140 insertions(+), 10 deletions(-)

diff --git a/app/src-tauri/src/meet_audio/mod.rs b/app/src-tauri/src/meet_audio/mod.rs
index 2f1f4442d1..2fab0204d3 100644
--- a/app/src-tauri/src/meet_audio/mod.rs
+++ b/app/src-tauri/src/meet_audio/mod.rs
@@ -180,7 +180,7 @@ pub async fn start<R: Runtime>(
                     caption_listener_disabled(request_id.clone())
                 }
             };
-            let speak = speak_pump::start(request_id.clone(), cdp, session);
+            let speak = speak_pump::start(app.clone(), request_id.clone(), cdp, session);
             (speak, captions)
         }
         Err(err) => {
diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs
index ca64e95b0b..549a12ece4 100644
--- a/app/src-tauri/src/meet_audio/speak_pump.rs
+++ b/app/src-tauri/src/meet_audio/speak_pump.rs
@@ -7,9 +7,10 @@
 //! the call so each tick is a single `Runtime.evaluate` round-trip
 //! rather than fresh attach + detach.
 
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
+use tauri::{AppHandle, Emitter, Runtime};
 use tokio::sync::oneshot;
 use tokio::time::interval;
 
@@ -29,6 +30,21 @@ const POLL_INTERVAL: Duration = Duration::from_millis(100);
 /// either way.
 const MAX_CONSECUTIVE_FEED_ERRORS: u32 = 30;
 
+/// How long the speaking-state event keeps reporting `speaking=true`
+/// after the last non-empty PCM tick. Brain enqueues outbound in
+/// chunks of ~50–200 ms and there's a gap of one or two pump ticks
+/// (100 ms each) between chunks while the next batch is being
+/// synthesised. Without a hangover, the mascot's mouth would flicker
+/// shut every gap. 400 ms covers the typical inter-chunk silence
+/// without bridging across legitimate end-of-utterance pauses.
+const SPEAKING_HANGOVER: Duration = Duration::from_millis(400);
+
+/// Tauri event channel for "the bot is/isn't speaking right now".
+/// Consumed by `MascotFrameProducer` (frontend) to flip the mascot
+/// SVG between idle and a mouth-open / talking pose so the Meet
+/// participant sees a visual cue that matches the audio they hear.
+const SPEAKING_STATE_EVENT: &str = "meet-video:speaking-state";
+
 /// RAII handle. Drop to stop the pump task. The shutdown channel
 /// causes the spawned loop to exit on the next select tick.
 pub struct SpeakPump {
@@ -45,8 +61,15 @@ impl Drop for SpeakPump {
 /// Spawn the speak pump for a session that already has the audio
 /// bridge installed. `cdp` and `session_id` come from
 /// [`inject::install_audio_bridge`] and are owned by the pump task
-/// from this point on.
-pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump {
+/// from this point on. `app` is held so the pump can fire
+/// `meet-video:speaking-state` events when the bot starts / stops
+/// producing PCM (drives the in-Meet mascot's mouth animation).
+pub fn start<R: Runtime>(
+    app: AppHandle<R>,
+    request_id: String,
+    cdp: CdpConn,
+    session_id: String,
+) -> SpeakPump {
     let (shutdown_tx, mut shutdown_rx) = oneshot::channel::<()>();
     let request_id_for_task = request_id.clone();
     tauri::async_runtime::spawn(async move {
@@ -56,17 +79,30 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump
         tick.tick().await;
         let mut cdp = cdp;
         let mut feed_errors: u32 = 0;
+        // Edge-detect state for the speaking-state event. We emit on
+        // every flip and never on every tick — the frontend renderer
+        // would otherwise see a flood of redundant state updates and
+        // burn worker time on no-op rerenders.
+        let mut speaking_state = SpeakingTracker::new();
         loop {
             tokio::select! {
                 _ = &mut shutdown_rx => {
                     log::info!(
                         "[meet-audio] speak pump shutdown request_id={request_id_for_task}"
                     );
+                    // Make sure the mascot stops talking when the
+                    // session ends — without this the last "speaking"
+                    // edge would leave the mouth open for the next
+                    // call's first frame.
+                    speaking_state.force_off(&app, &request_id_for_task);
                     break;
                 }
                 _ = tick.tick() => {
-                    match poll_and_feed(&request_id_for_task, &mut cdp, &session_id).await {
-                        Ok(_) => feed_errors = 0,
+                    let had_pcm = match poll_and_feed(&request_id_for_task, &mut cdp, &session_id).await {
+                        Ok(had) => {
+                            feed_errors = 0;
+                            had
+                        }
                         Err(err) => {
                             feed_errors += 1;
                             log::debug!(
@@ -76,10 +112,17 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump
                                 log::warn!(
                                     "[meet-audio] speak pump giving up after {feed_errors} consecutive errors request_id={request_id_for_task}"
                                 );
+                                speaking_state.force_off(&app, &request_id_for_task);
                                 break;
                             }
+                            // A failed tick is *not* evidence the bot
+                            // stopped speaking — leave the hangover to
+                            // expire naturally so transient CDP errors
+                            // don't flicker the mascot's mouth shut.
+                            false
                         }
-                    }
+                    };
+                    speaking_state.tick(had_pcm, &app, &request_id_for_task);
                 }
             }
         }
@@ -91,6 +134,88 @@ pub fn start(request_id: String, cdp: CdpConn, session_id: String) -> SpeakPump
     }
 }
 
+/// Edge-detector + hangover for the speaking-state event stream.
+/// State machine has two reachable values (`speaking` / `idle`). It
+/// flips to `speaking` on the first tick that carries PCM and back
+/// to `idle` only once `SPEAKING_HANGOVER` has elapsed with no PCM,
+/// so the frontend never sees a flap between two PCM chunks.
+struct SpeakingTracker {
+    /// Currently-reported state. Defaults to `false` so the mascot
+    /// boots into the idle pose; the first `speaking=true` tick is a
+    /// real edge.
+    reported: bool,
+    /// Monotonic deadline for the hangover. Set to `now + SPEAKING_HANGOVER`
+    /// every tick that carries PCM; the state flips back to `false`
+    /// on the first tick with no PCM at or after this instant.
+    hangover_until: Option<Instant>,
+}
+
+impl SpeakingTracker {
+    fn new() -> Self {
+        Self {
+            reported: false,
+            hangover_until: None,
+        }
+    }
+
+    /// Drive the state machine from a single pump tick. `had_pcm`
+    /// is whether `poll_and_feed` saw a non-empty `pcm_base64` for
+    /// this tick. Emits the Tauri event only when the reported
+    /// state actually flips.
+    fn tick<R: Runtime>(&mut self, had_pcm: bool, app: &AppHandle<R>, request_id: &str) {
+        if had_pcm {
+            // Extend the hangover. If we were idle, flip up to
+            // speaking — the user hears audio starting now.
+            self.hangover_until = Some(Instant::now() + SPEAKING_HANGOVER);
+            self.set_reported(true, app, request_id);
+            return;
+        }
+        // No PCM this tick. If the hangover hasn't expired, stay in
+        // whatever state we were already in (typically `speaking=true`
+        // during the gap between two consecutive chunks).
+        if let Some(until) = self.hangover_until {
+            if Instant::now() < until {
+                return;
+            }
+            // Hangover elapsed; clear so we don't re-evaluate on
+            // every future idle tick.
+            self.hangover_until = None;
+        }
+        // Hangover expired or never armed → bot is genuinely idle.
+        self.set_reported(false, app, request_id);
+    }
+
+    /// Force the reported state to `false` and emit an event if that's
+    /// a flip. Used on shutdown / fatal error paths so the mascot
+    /// can't get stuck mid-talk.
+    fn force_off<R: Runtime>(&mut self, app: &AppHandle<R>, request_id: &str) {
+        self.hangover_until = None;
+        self.set_reported(false, app, request_id);
+    }
+
+    fn set_reported<R: Runtime>(&mut self, next: bool, app: &AppHandle<R>, request_id: &str) {
+        if self.reported == next {
+            return;
+        }
+        self.reported = next;
+        let payload = serde_json::json!({
+            "requestId": request_id,
+            "speaking": next,
+        });
+        if let Err(err) = app.emit(SPEAKING_STATE_EVENT, payload) {
+            // Best-effort: a missing renderer (closed window mid-tick)
+            // is the common case and not worth raising the log level.
+            log::debug!(
+                "[meet-audio] speaking-state emit failed request_id={request_id} speaking={next} err={err}"
+            );
+        } else {
+            log::debug!(
+                "[meet-audio] speaking-state -> {next} request_id={request_id}"
+            );
+        }
+    }
+}
+
 /// No-op pump used when bridge install failed at session start. Keeps
 /// the rest of the session lifecycle uniform — `MeetAudioSession` can
 /// still hold a `SpeakPump` regardless of speak-path readiness.
@@ -101,11 +226,14 @@ pub fn start_disabled(request_id: String) -> SpeakPump {
     }
 }
 
+/// Run a single pump tick. Returns `true` when the tick actually
+/// carried synthesized PCM (used by the caller to drive the
+/// speaking-state edge detector).
 async fn poll_and_feed(
     request_id: &str,
     cdp: &mut CdpConn,
     session_id: &str,
-) -> Result<(), String> {
+) -> Result<bool, String> {
     let v = super::rpc_call(
         "openhuman.meet_agent_poll_speech",
         serde_json::json!({ "request_id": request_id }),
@@ -152,8 +280,10 @@ async fn poll_and_feed(
             bytes.len()
         );
         inject::feed_pcm_chunk(cdp, session_id, pcm_b64).await?;
-    } else if utterance_done {
+        return Ok(true);
+    }
+    if utterance_done {
         log::info!("[meet-audio] speak pump utterance complete request_id={request_id}");
     }
-    Ok(())
+    Ok(false)
 }

From 93912b16802e287b6c1ecca557acf7fe4c953de1 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:18:33 +0530
Subject: [PATCH 45/64] feat(meet): animate the in-call mascot's mouth while
 the bot speaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`MascotFrameProducer` subscribes to `meet-video:speaking-state`
and flips `<YellowMascotIdle/>` between `talking={false}` (idle)
and `talking={true}` (mouth animating in sync with the synthesized
PCM the bot is feeding into Meet). RequestId on the payload is
matched against the active session so a stale event from a torn-
down call can't bleed into the current one.

Visual cue only — no audio path / bridge changes. Meet participants
now see the mascot's mouth open and close in time with the audio
they hear, instead of the prior frozen idle pose.
---
 app/src/features/meet/MascotFrameProducer.tsx | 43 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/app/src/features/meet/MascotFrameProducer.tsx b/app/src/features/meet/MascotFrameProducer.tsx
index f139e7236a..fb655fb1b6 100644
--- a/app/src/features/meet/MascotFrameProducer.tsx
+++ b/app/src/features/meet/MascotFrameProducer.tsx
@@ -122,6 +122,16 @@ const ProducerSession: FC<{ session: BusSession }> = ({ session }) => {
   const stoppedRef = useRef(false);
   const inflightRef = useRef(false);
   const sentFramesRef = useRef(0);
+  // True while the bot is actively producing PCM into the Meet
+  // call. Drives the mascot's `talking` prop so the mouth visibly
+  // animates in time with the audio the participants hear. Source
+  // of truth lives in the Rust speak_pump (edge-detected from the
+  // RPC poll loop with a ~400ms hangover that bridges the natural
+  // gap between consecutive PCM chunks) — the producer just
+  // subscribes here and rerenders on flip. Same `requestId` we got
+  // from `meet-video:bus-started` so a stale event from a previous
+  // session can never bleed into this session's mascot state.
+  const [isSpeaking, setIsSpeaking] = useState(false);
   // Frame counter feeding our own FrameContext below. We DON'T use the
   // shared `<FrameProvider>` wrapper because it ticks via
   // requestAnimationFrame, which Chromium throttles when the main
@@ -313,8 +323,32 @@ const ProducerSession: FC<{ session: BusSession }> = ({ session }) => {
     worker.onmessage = onTick;
     worker.postMessage({ cmd: 'start', intervalMs });
 
+    // Subscribe to the speak_pump's speaking-state edge events.
+    // Done inside the existing effect (rather than a sibling useEffect)
+    // so the listener lifetime is bound to the same session.port — a
+    // session swap (rare; the producer keys on `session.requestId` and
+    // remounts) tears the listener down with the rest of the
+    // pipeline.
+    let unlistenSpeaking: UnlistenFn | undefined;
+    let speakingListenerCancelled = false;
+    listen<{ requestId?: string; speaking?: boolean }>('meet-video:speaking-state', event => {
+      const payload = event.payload;
+      if (!payload) return;
+      // Defensive: ignore events from a different session that might
+      // be in flight during a teardown / restart race.
+      if (payload.requestId && payload.requestId !== session.requestId) return;
+      setIsSpeaking(!!payload.speaking);
+    })
+      .then(stop => {
+        if (speakingListenerCancelled) stop();
+        else unlistenSpeaking = stop;
+      })
+      .catch(err => console.debug('[meet-video-producer] speaking-state listen failed', err));
+
     return () => {
       stoppedRef.current = true;
+      speakingListenerCancelled = true;
+      if (unlistenSpeaking) unlistenSpeaking();
       window.clearInterval(diagInterval);
       try {
         worker.postMessage({ cmd: 'stop' });
@@ -376,6 +410,10 @@ const ProducerSession: FC<{ session: BusSession }> = ({ session }) => {
         <FrameConfigContext.Provider value={frameConfig}>
           <FrameContext.Provider value={frame}>
             <YellowMascotIdle
+              // `face` is the icon overlay (normal eyes / recording dot
+              // / loading ring) — kept as 'normal' so the speaking
+              // animation lives on the `talking` prop alone, which
+              // drives the mouth-open shape every frame.
               face="normal"
               recordingColor="#ff3b30"
               loadingColor="#ffffff"
@@ -383,7 +421,10 @@ const ProducerSession: FC<{ session: BusSession }> = ({ session }) => {
               sleeping={false}
               mascotColor={mascotColor}
               arm="wave"
-              talking={false}
+              // Toggled by the speak_pump's edge-detector. The
+              // mascot's mouth animates open/closed in sync with
+              // the synthesized PCM the bot is feeding into Meet.
+              talking={isSpeaking}
               thinking={false}
             />
           </FrameContext.Provider>

From f26e25b9871814bfae8cc1e0bac8dade26464443 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:35:27 +0530
Subject: [PATCH 46/64] feat(meet-agent): JSONL store for completed call
 records
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`store::MeetCallRecord` captures request_id, meet_url, bot +
owner display names, started/ended timestamps, listened/spoken
seconds, and turn count. `append_record` opens the workspace's
`meet_agent/calls.jsonl` in append mode (mkdir as needed);
`read_recent(limit)` reads the file, drops malformed lines with
a debug log, sorts newest-first, and clamps to 200 rows so a
misconfigured caller can't trigger an unbounded read.

JSONL chosen over sqlite for the same shape used elsewhere in
the workspace: low-volume, write-rarely / read-rarely data, no
migration story needed, and a malformed final line just gets
skipped on next read. Tests cover round-trip, limit cap,
missing-file → empty, malformed-line tolerance, zero-limit,
and the usize::MAX clamp guard.
---
 src/openhuman/meet_agent/mod.rs   |   1 +
 src/openhuman/meet_agent/store.rs | 250 ++++++++++++++++++++++++++++++
 2 files changed, 251 insertions(+)
 create mode 100644 src/openhuman/meet_agent/store.rs

diff --git a/src/openhuman/meet_agent/mod.rs b/src/openhuman/meet_agent/mod.rs
index 8cf60bdae0..4207171bf5 100644
--- a/src/openhuman/meet_agent/mod.rs
+++ b/src/openhuman/meet_agent/mod.rs
@@ -39,6 +39,7 @@ pub mod ops;
 pub mod rpc;
 pub mod schemas;
 pub mod session;
+pub mod store;
 pub mod types;
 pub mod wav;
 
diff --git a/src/openhuman/meet_agent/store.rs b/src/openhuman/meet_agent/store.rs
new file mode 100644
index 0000000000..40b2a1ee09
--- /dev/null
+++ b/src/openhuman/meet_agent/store.rs
@@ -0,0 +1,250 @@
+//! Persistence for completed meet-agent calls.
+//!
+//! Append-only JSONL file under the workspace data dir. Each line is
+//! one `MeetCallRecord` written when `handle_stop_session` closes a
+//! call. `read_recent` parses the whole file and sorts by start time
+//! so the most recent calls come first — same shape the UI expects.
+//!
+//! ## Why JSONL (not sqlite)
+//!
+//! Meet call records are write-rarely, read-rarely, low-cardinality
+//! data. A single user closes a few calls per day at most. JSONL is
+//! cheap to append (no locking machinery beyond OpenOptions::append),
+//! trivial to inspect with `tail`, and survives partial writes — a
+//! malformed final line just gets skipped on parse. A sqlite table
+//! would add a migration, a connection pool, and a `cargo` build
+//! dependency for no real benefit at this volume.
+//!
+//! ## Bounding
+//!
+//! `read_recent` caps the *returned* rows at `MAX_RECENT_CALLS`. It
+//! still parses every line into memory before sorting and truncating,
+//! so read cost grows with file size. The file itself is never
+//! truncated here; a future housekeeping job can prune.
+
+use std::path::{Path, PathBuf};
+
+use serde::{Deserialize, Serialize};
+use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
+
+use crate::openhuman::config::Config;
+
+/// One closed Meet call. Persisted as a JSONL line.
+///
+/// Fields use `snake_case` because the RPC layer surfaces them
+/// directly (we don't rename when serializing to the frontend), and
+/// the JSONL file becomes self-describing for anyone running `tail`
+/// on it.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct MeetCallRecord {
+    /// UUID minted by `openhuman.meet_join_call`. Matches the session
+    /// key. Stable per call so the UI can dedup if a record is
+    /// re-emitted on a rare crash-and-retry path.
+    pub request_id: String,
+    /// Normalised Meet URL the call joined. Stored so the recent-calls
+    /// list can show *which* meeting this was without forcing the
+    /// frontend to keep an in-memory map.
+    pub meet_url: String,
+    /// Bot tile name as typed into Meet's "Your name" input. Useful
+    /// when the user runs multiple bot personas.
+    pub bot_display_name: String,
+    /// Call owner display name (the user who launched the bot).
+    /// Snapshotted at start so a later rename in the user profile
+    /// doesn't mutate history.
+    pub owner_display_name: String,
+    /// Wall-clock ms at start_session.
+    pub started_at_ms: u64,
+    /// Wall-clock ms at stop_session.
+    pub ended_at_ms: u64,
+    /// Total seconds of inbound (Meet → agent) audio processed.
+    pub listened_seconds: f32,
+    /// Total seconds of outbound (agent → Meet) audio synthesized.
+    pub spoken_seconds: f32,
+    /// Completed agent turns during the call.
+    pub turn_count: u32,
+}
+
+/// Hard cap on the rows returned from `read_recent`. The UI shows ~20
+/// rows initially with a "Load more" affordance reserved for later;
+/// keeping the API ceiling at 200 means a misconfigured client can't
+/// request an unbounded response (the file is still read in full).
+pub const MAX_RECENT_CALLS: usize = 200;
+
+/// Resolve the workspace-relative path of the meet-calls JSONL file.
+/// Mirrors `threads/ops::workspace_dir` — single source of truth for
+/// "where does openhuman keep its per-user data". Created on demand
+/// at append time; missing file at read time is treated as "no
+/// recorded calls yet" (returns an empty Vec rather than an error).
+pub async fn meet_calls_jsonl_path() -> Result<PathBuf, String> {
+    let workspace = Config::load_or_init()
+        .await
+        .map(|c| c.workspace_dir)
+        .map_err(|e| format!("load config: {e}"))?;
+    Ok(workspace.join("meet_agent").join("calls.jsonl"))
+}
+
+/// Append a single record to the JSONL store. Creates parent
+/// directories if missing. Each call writes one line + newline so
+/// the file remains parsable even when a future writer crashes
+/// mid-line (the partial line is skipped on read).
+pub async fn append_record(record: &MeetCallRecord) -> Result<(), String> {
+    let path = meet_calls_jsonl_path().await?;
+    append_record_to(&path, record).await
+}
+
+/// Append `record` as one JSON line to `path`, creating missing parent directories.
+async fn append_record_to(path: &Path, record: &MeetCallRecord) -> Result<(), String> {
+    if let Some(parent) = path.parent() {
+        let made = tokio::fs::create_dir_all(parent).await;
+        made.map_err(|e| format!("mkdir {}: {e}", parent.display()))?;
+    }
+    let mut line = serde_json::to_string(record).map_err(|e| format!("serialize: {e}"))?;
+    line.push('\n');
+    let mut options = tokio::fs::OpenOptions::new();
+    options.create(true).append(true);
+    let opened = options.open(path).await;
+    let mut file = opened.map_err(|e| format!("open {}: {e}", path.display()))?;
+    let wrote = file.write_all(line.as_bytes()).await;
+    wrote.map_err(|e| format!("write {}: {e}", path.display()))?;
+    // tokio's `File` hands the write to a blocking task and `write_all` can return
+    // before it lands; `flush` waits for it and surfaces any deferred I/O error, so
+    // a `read_recent` issued right after this call reliably sees the new line.
+    file.flush().await.map_err(|e| format!("flush {}: {e}", path.display()))
+}
+
+/// Return the `limit` most recent records (newest first). Missing
+/// file → empty Vec. Malformed lines are skipped with a debug log
+/// so one bad row doesn't poison the whole list. `limit` is clamped
+/// to `MAX_RECENT_CALLS` and applied after parsing and sorting, so
+/// the whole file is read. Unknown JSON fields are ignored (serde's
+/// default), so lines written by newer builds still parse.
+pub async fn read_recent(limit: usize) -> Result<Vec<MeetCallRecord>, String> {
+    let path = meet_calls_jsonl_path().await?;
+    read_recent_from(&path, limit).await
+}
+
+async fn read_recent_from(path: &Path, limit: usize) -> Result<Vec<MeetCallRecord>, String> {
+    let limit = limit.min(MAX_RECENT_CALLS);
+    if limit == 0 {
+        return Ok(Vec::new());
+    }
+    let file = match tokio::fs::File::open(path).await {
+        Ok(f) => f,
+        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
+        Err(err) => return Err(format!("open {}: {err}", path.display())),
+    };
+    let reader = BufReader::new(file);
+    let mut lines = reader.lines();
+    let mut all: Vec<MeetCallRecord> = Vec::new();
+    while let Some(line) = lines
+        .next_line()
+        .await
+        .map_err(|e| format!("read {}: {e}", path.display()))?
+    {
+        if line.trim().is_empty() {
+            continue;
+        }
+        match serde_json::from_str::<MeetCallRecord>(&line) {
+            Ok(rec) => all.push(rec),
+            Err(err) => {
+                log::debug!("[meet-agent-store] skip malformed line err={err}");
+            }
+        }
+    }
+    // Newest first. Compare on started_at_ms for stability against
+    // future out-of-order writes (e.g. a future async flush race).
+    all.sort_by(|a, b| b.started_at_ms.cmp(&a.started_at_ms));
+    all.truncate(limit);
+    Ok(all)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn sample(idx: u64) -> MeetCallRecord {
+        MeetCallRecord {
+            request_id: format!("req-{idx}"),
+            meet_url: "https://meet.google.com/abc-defg-hij".into(),
+            bot_display_name: "OpenHuman".into(),
+            owner_display_name: "Alice".into(),
+            started_at_ms: 1_000_000 + idx * 60_000,
+            ended_at_ms: 1_000_000 + idx * 60_000 + 30_000,
+            listened_seconds: 12.5,
+            spoken_seconds: 4.2,
+            turn_count: 3,
+        }
+    }
+
+    #[tokio::test]
+    async fn append_then_read_round_trip() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("nested").join("calls.jsonl");
+        let a = sample(1);
+        let b = sample(2);
+        append_record_to(&path, &a).await.unwrap();
+        append_record_to(&path, &b).await.unwrap();
+        let recent = read_recent_from(&path, 10).await.unwrap();
+        assert_eq!(recent.len(), 2);
+        // Newest first → req-2 comes before req-1.
+        assert_eq!(recent[0].request_id, "req-2");
+        assert_eq!(recent[1].request_id, "req-1");
+    }
+
+    #[tokio::test]
+    async fn read_recent_caps_limit() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("calls.jsonl");
+        for i in 0..5 {
+            append_record_to(&path, &sample(i)).await.unwrap();
+        }
+        let recent = read_recent_from(&path, 3).await.unwrap();
+        assert_eq!(recent.len(), 3);
+        // Top 3 are the most recent (idx 4, 3, 2).
+        assert_eq!(recent[0].request_id, "req-4");
+        assert_eq!(recent[2].request_id, "req-2");
+    }
+
+    #[tokio::test]
+    async fn read_recent_missing_file_returns_empty() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("does-not-exist.jsonl");
+        let recent = read_recent_from(&path, 10).await.unwrap();
+        assert!(recent.is_empty());
+    }
+
+    #[tokio::test]
+    async fn malformed_line_is_skipped() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("calls.jsonl");
+        // Hand-write a file with one good record + one bad line.
+        let good = serde_json::to_string(&sample(1)).unwrap();
+        tokio::fs::write(&path, format!("{good}\nnot-json\n"))
+            .await
+            .unwrap();
+        let recent = read_recent_from(&path, 10).await.unwrap();
+        assert_eq!(recent.len(), 1);
+        assert_eq!(recent[0].request_id, "req-1");
+    }
+
+    #[tokio::test]
+    async fn zero_limit_returns_empty() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("calls.jsonl");
+        append_record_to(&path, &sample(1)).await.unwrap();
+        let recent = read_recent_from(&path, 0).await.unwrap();
+        assert!(recent.is_empty());
+    }
+
+    #[tokio::test]
+    async fn limit_above_cap_is_clamped() {
+        // Passing usize::MAX must not allocate Vec::with_capacity(usize::MAX).
+        // The clamp lives inside read_recent_from before any allocation.
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().join("calls.jsonl");
+        append_record_to(&path, &sample(1)).await.unwrap();
+        let recent = read_recent_from(&path, usize::MAX).await.unwrap();
+        assert_eq!(recent.len(), 1);
+    }
+}

From a39bf3890a7ba454d6b00f4c7629295ebc3815ad Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:35:36 +0530
Subject: [PATCH 47/64] feat(meet-agent): capture meet_url + started_at_ms on
 session start

Extend `MeetAgentSession` with `meet_url: String` and
`started_at_ms: u64`, plus a `set_meet_url` setter and read
accessors (`meet_url`, `bot_display_name`, `started_at_ms`) so
the store layer doesn't reach into private fields. The
monotonic `Instant` `started_at` is kept for elapsed-seconds
math; the new wall-clock ms field is what the JSONL log sorts
on across process restarts.

`StartSessionRequest` gains an optional `meet_url` field
(serde default = empty) so older shells keep parsing while
new shells forward the URL the CEF window joined.
---
 src/openhuman/meet_agent/session.rs | 36 +++++++++++++++++++++++++++++
 src/openhuman/meet_agent/types.rs   | 31 +++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 421aaaefb5..a6164da732 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -120,6 +120,15 @@ pub struct MeetAgentSession {
     /// own voice). Empty until set; while empty the bot-self filter
     /// is inert.
     bot_display_name: String,
+    /// Normalised Meet URL the call joined. Snapshotted at start
+    /// so the recent-calls log captures which meeting this was
+    /// without forcing the frontend to keep an in-memory map.
+    meet_url: String,
+    /// Wall-clock ms when `start_session` ran. The session also
+    /// keeps `started_at: Instant` for monotonic elapsed-seconds
+    /// math, but the JSONL persistence layer needs an absolute
+    /// timestamp that can be sorted across process restarts.
+    started_at_ms: u64,
 }
 
 impl MeetAgentSession {
@@ -146,9 +155,36 @@ impl MeetAgentSession {
             last_turn_done_at_ms: 0,
             owner_display_name: String::new(),
             bot_display_name: String::new(),
+            meet_url: String::new(),
+            started_at_ms: SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .map(|d| d.as_millis() as u64)
+                .unwrap_or(0),
         }
     }
 
+    /// Record the Meet URL the call joined, with surrounding
+    /// whitespace trimmed. Stored on the session so `stop_session`
+    /// can write it into the JSONL recent-calls log. An empty string
+    /// is acceptable: older shells that don't forward the URL just
+    /// log calls with an empty `meet_url` and the UI degrades gracefully.
+    pub fn set_meet_url(&mut self, meet_url: &str) {
+        self.meet_url = meet_url.trim().to_string();
+    }
+
+    /// Read accessors used when persisting the call record on
+    /// `stop_session`. Kept at the session boundary so the store
+    /// module doesn't have to reach into private fields.
+    pub fn meet_url(&self) -> &str {
+        &self.meet_url
+    }
+    pub fn bot_display_name(&self) -> &str {
+        &self.bot_display_name
+    }
+    pub fn started_at_ms(&self) -> u64 {
+        self.started_at_ms
+    }
+
     /// Set the call-owner display name (the human who launched the
     /// bot) and the bot's own Meet participant name. The note_caption
     /// gate uses both: captions are accepted only when the speaker
diff --git a/src/openhuman/meet_agent/types.rs b/src/openhuman/meet_agent/types.rs
index b03b266a55..35b4bb6b75 100644
--- a/src/openhuman/meet_agent/types.rs
+++ b/src/openhuman/meet_agent/types.rs
@@ -39,6 +39,12 @@ pub struct StartSessionRequest {
     /// posture.
     #[serde(default)]
     pub bot_display_name: String,
+    /// Normalised Meet URL the call joined. Persisted into the
+    /// recent-calls log so the UI can show "Joined `…/abc-defg-hij`
+    /// — 12 min ago". Defaulted so older shells that haven't been
+    /// updated to forward the URL still parse the payload.
+    #[serde(default)]
+    pub meet_url: String,
 }
 
 fn default_sample_rate() -> u32 {
@@ -129,6 +135,31 @@ pub struct PushCaptionResponse {
     pub turn_started: bool,
 }
 
+/// Inputs to `openhuman.meet_agent_list_calls`.
+///
+/// Returns the most recently completed Meet calls (newest first) so
+/// the Skills "Meeting Bots" card can render a history list inside
+/// the same modal the user used to launch the call. Capped
+/// server-side at `store::MAX_RECENT_CALLS` so a misconfigured
+/// client can't request an unbounded read.
+#[derive(Debug, Clone, Deserialize)]
+pub struct ListCallsRequest {
+    /// Maximum rows to return. Defaults to 50 if absent. Hard cap
+    /// applied server-side regardless of what the caller asks for.
+    #[serde(default)]
+    pub limit: Option<usize>,
+}
+
+/// Outputs from `openhuman.meet_agent_list_calls`.
+#[derive(Debug, Clone, Serialize)]
+pub struct ListCallsResponse {
+    pub ok: bool,
+    pub calls: Vec<super::store::MeetCallRecord>,
+    /// Number of rows in `calls`. Convenient for the UI when
+    /// rendering a header like "Recent calls (12)".
+    pub count: usize,
+}
+
 /// Inputs to `openhuman.meet_agent_stop_session`.
 #[derive(Debug, Clone, Deserialize)]
 pub struct StopSessionRequest {

From 0eebe5be5a99371205d9ef632fae5a5e0d2705cf Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:35:45 +0530
Subject: [PATCH 48/64] feat(meet-agent): persist on stop_session + new
 list_calls RPC

handle_stop_session now builds a `MeetCallRecord` from the
just-closed session and appends it to the JSONL store. The
append is best-effort: a failed write logs at warn level but
never blocks the stop_session response (the call is already
over). handle_start_session forwards `meet_url` from the
request into the session.

New `openhuman.meet_agent_list_calls` returns the most recent
records, newest first, with an optional `limit` param (default
50, hard-capped at 200 by the store). Wired into the
controller schema registry alongside the existing five
`meet_agent_*` endpoints; the schema-vs-handler-symmetry test
is extended to include it.
---
 src/openhuman/meet_agent/rpc.rs     | 57 ++++++++++++++++++++++++++++-
 src/openhuman/meet_agent/schemas.rs | 47 +++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index b811f8c62d..e89a9aaec7 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -19,11 +19,17 @@ use crate::rpc::RpcOutcome;
 use super::brain;
 use super::ops::VadEvent;
 use super::session::registry;
+use super::store::{self, MeetCallRecord};
 use super::types::{
-    PollSpeechRequest, PushCaptionRequest, PushListenPcmRequest, StartSessionRequest,
-    StopSessionRequest,
+    ListCallsRequest, ListCallsResponse, PollSpeechRequest, PushCaptionRequest,
+    PushListenPcmRequest, StartSessionRequest, StopSessionRequest,
 };
 
+/// Default `limit` for `handle_list_calls` when the caller omits one.
+/// Comfortably above the ~20 rows the UI shows initially while still
+/// keeping the response payload small.
+const LIST_CALLS_DEFAULT_LIMIT: usize = 50;
+
 const LOG_PREFIX: &str = "[meet-agent-rpc]";
 
 pub async fn handle_start_session(params: Map<String, Value>) -> Result<Value, String> {
@@ -41,6 +47,7 @@ pub async fn handle_start_session(params: Map<String, Value>) -> Result<Value, S
     // to be updated in lockstep.
     registry().with_session(&req.request_id, |s| {
         s.set_identities(&req.owner_display_name, &req.bot_display_name);
+        s.set_meet_url(&req.meet_url);
     })?;
     log::info!(
         "{LOG_PREFIX} start_session request_id={} sample_rate_hz={} \
@@ -179,6 +186,32 @@ pub async fn handle_stop_session(params: Map<String, Value>) -> Result<Value, St
         session.turn_count
     );
 
+    // Persist a recent-calls record. Best-effort: a failed write
+    // never blocks the stop_session response — the call is already
+    // over by definition and the UI doesn't depend on the record
+    // existing to function. We log loudly enough that a broken
+    // persistence path is visible in dev:app stdout.
+    let record = MeetCallRecord {
+        request_id: session.request_id.clone(),
+        meet_url: session.meet_url().to_string(),
+        bot_display_name: session.bot_display_name().to_string(),
+        owner_display_name: session.owner_display_name().to_string(),
+        started_at_ms: session.started_at_ms(),
+        ended_at_ms: std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64)
+            .unwrap_or(0),
+        listened_seconds: session.listened_seconds(),
+        spoken_seconds: session.spoken_seconds(),
+        turn_count: session.turn_count,
+    };
+    if let Err(err) = store::append_record(&record).await {
+        log::warn!(
+            "{LOG_PREFIX} append_record failed request_id={} err={err}",
+            session.request_id
+        );
+    }
+
     RpcOutcome::new(
         json!({
             "ok": true,
@@ -192,6 +225,26 @@ pub async fn handle_stop_session(params: Map<String, Value>) -> Result<Value, St
     .into_cli_compatible_json()
 }
 
+/// Return the most recent completed calls (newest first). Reads
+/// the per-user JSONL log written by `handle_stop_session`. Missing
+/// file → empty list (first run after install). Caller may pass an
+/// optional `limit`; we apply `LIST_CALLS_DEFAULT_LIMIT` when absent
+/// and `store::MAX_RECENT_CALLS` as the hard ceiling.
+pub async fn handle_list_calls(params: Map<String, Value>) -> Result<Value, String> {
+    let req: ListCallsRequest = serde_json::from_value(Value::Object(params))
+        .map_err(|e| format!("{LOG_PREFIX} invalid list_calls params: {e}"))?;
+    let limit = req.limit.unwrap_or(LIST_CALLS_DEFAULT_LIMIT);
+    let calls = store::read_recent(limit).await?;
+    let response = ListCallsResponse {
+        ok: true,
+        count: calls.len(),
+        calls,
+    };
+    let value = serde_json::to_value(&response)
+        .map_err(|e| format!("{LOG_PREFIX} serialize list_calls response: {e}"))?;
+    RpcOutcome::new(value, vec![]).into_cli_compatible_json()
+}
+
 /// Decode a base64 string of PCM16LE bytes into samples. Empty input is
 /// a "heartbeat" push (no audio this tick) and yields an empty Vec.
 fn decode_pcm16le_b64(b64: &str) -> Result<Vec<i16>, String> {
diff --git a/src/openhuman/meet_agent/schemas.rs b/src/openhuman/meet_agent/schemas.rs
index 1c1795a090..c59b4779d7 100644
--- a/src/openhuman/meet_agent/schemas.rs
+++ b/src/openhuman/meet_agent/schemas.rs
@@ -40,6 +40,11 @@ const DEFS: &[Def] = &[
         schema: schema_stop_session,
         handler: handle_stop_session,
     },
+    Def {
+        function: "list_calls",
+        schema: schema_list_calls,
+        handler: handle_list_calls,
+    },
 ];
 
 pub fn all_controller_schemas() -> Vec<ControllerSchema> {
@@ -275,6 +280,42 @@ fn schema_stop_session() -> ControllerSchema {
     }
 }
 
+fn schema_list_calls() -> ControllerSchema {
+    ControllerSchema {
+        namespace: "meet_agent",
+        function: "list_calls",
+        description:
+            "Return the most recent completed Meet calls (newest first). Reads the JSONL log written \
+                      on each stop_session. Used by the Skills Meeting Bots card to show a recent-calls list.",
+        inputs: vec![FieldSchema {
+            name: "limit",
+            ty: TypeSchema::F64,
+            comment: "Max rows to return. Defaults to 50; hard-capped server-side.",
+            required: false,
+        }],
+        outputs: vec![
+            FieldSchema {
+                name: "ok",
+                ty: TypeSchema::Bool,
+                comment: "True when the read succeeded (even if no rows exist yet).",
+                required: true,
+            },
+            FieldSchema {
+                name: "calls",
+                ty: TypeSchema::String,
+                comment: "Array of MeetCallRecord objects, newest first.",
+                required: true,
+            },
+            FieldSchema {
+                name: "count",
+                ty: TypeSchema::F64,
+                comment: "Number of rows in `calls`.",
+                required: true,
+            },
+        ],
+    }
+}
+
 fn schema_unknown() -> ControllerSchema {
     ControllerSchema {
         namespace: "meet_agent",
@@ -310,6 +351,9 @@ fn handle_poll_speech(p: Map<String, Value>) -> ControllerFuture {
 fn handle_stop_session(p: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move { super::rpc::handle_stop_session(p).await })
 }
+fn handle_list_calls(p: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move { super::rpc::handle_list_calls(p).await })
+}
 
 #[cfg(test)]
 mod tests {
@@ -333,7 +377,8 @@ mod tests {
                 "push_listen_pcm",
                 "push_caption",
                 "poll_speech",
-                "stop_session"
+                "stop_session",
+                "list_calls",
             ]
         );
     }

From 0f46ed0a4658f90e1b353ee34371c4af9a64e6db Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:35:53 +0530
Subject: [PATCH 49/64] feat(tauri/meet-audio): forward meet_url into
 start_session

The shell already knows the call's Meet URL (it built the CEF
window with it); include it in the meet_agent_start_session
RPC payload so the core can snapshot it onto the session and
persist it in the recent-calls JSONL log on stop_session.
---
 app/src-tauri/src/meet_audio/mod.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/app/src-tauri/src/meet_audio/mod.rs b/app/src-tauri/src/meet_audio/mod.rs
index 2fab0204d3..d89314159d 100644
--- a/app/src-tauri/src/meet_audio/mod.rs
+++ b/app/src-tauri/src/meet_audio/mod.rs
@@ -120,6 +120,11 @@ pub async fn start<R: Runtime>(
             "sample_rate_hz": 16_000,
             "owner_display_name": owner_display_name,
             "bot_display_name": bot_display_name,
+            // Persisted into the recent-calls JSONL by stop_session
+            // so the Skills "Meeting Bots" card can show "joined
+            // <code>" in the history list. The URL the shell built
+            // the CEF window with is the canonical value.
+            "meet_url": meet_url,
         }),
     )
     .await?;

From b4e1747930c8a800a0cc9efb288bdb5c3172240b Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:36:01 +0530
Subject: [PATCH 50/64] feat(meet): listMeetCalls() service + tests

`MeetCallRecord` interface mirrors the core's `MeetCallRecord`
struct (snake_case fields surfaced verbatim). `listMeetCalls(limit)`
calls `openhuman.meet_agent_list_calls` and returns the rows
array, or an empty array on a fresh install.

Test file updated for the new privacy-lock contract: every
joinMeetCall happy-path case now passes `ownerDisplayName`,
and the invoke-args assertion checks the new
`owner_display_name` field on the shell payload. Added a
dedicated test for the empty-owner rejection path so future
refactors can't silently weaken the gate.
---
 .../__tests__/meetCallService.test.ts         | 34 ++++++++++++++-
 app/src/services/meetCallService.ts           | 42 +++++++++++++++++++
 2 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/app/src/services/__tests__/meetCallService.test.ts b/app/src/services/__tests__/meetCallService.test.ts
index b92c5be924..febf4ae6c2 100644
--- a/app/src/services/__tests__/meetCallService.test.ts
+++ b/app/src/services/__tests__/meetCallService.test.ts
@@ -41,23 +41,29 @@ describe('joinMeetCall', () => {
     const result = await joinMeetCall({
       meetUrl: 'https://meet.google.com/abc-defg-hij',
       displayName: 'Agent Alice',
+      ownerDisplayName: 'Owner Bob',
     });
 
     expect(callCoreRpc).toHaveBeenCalledWith({
       method: 'openhuman.meet_join_call',
       params: { meet_url: 'https://meet.google.com/abc-defg-hij', display_name: 'Agent Alice' },
     });
+    // owner_display_name is forwarded to the shell (not to the core's
+    // meet_join_call, which is stateless validation only) — assert on
+    // the shell args, not the core RPC params.
     expect(invoke).toHaveBeenCalledWith('meet_call_open_window', {
       args: {
         request_id: 'req-1',
         meet_url: 'https://meet.google.com/abc-defg-hij',
         display_name: 'Agent Alice',
+        owner_display_name: 'Owner Bob',
       },
     });
     expect(result).toEqual({
       requestId: 'req-1',
       meetUrl: 'https://meet.google.com/abc-defg-hij',
       displayName: 'Agent Alice',
+      ownerDisplayName: 'Owner Bob',
       windowLabel: 'meet-call-req-1',
     });
   });
@@ -65,7 +71,11 @@ describe('joinMeetCall', () => {
   it('throws if core rejects the request', async () => {
     vi.mocked(callCoreRpc).mockResolvedValueOnce({ ok: false } as never);
     await expect(
-      joinMeetCall({ meetUrl: 'https://meet.google.com/abc-defg-hij', displayName: 'Agent Alice' })
+      joinMeetCall({
+        meetUrl: 'https://meet.google.com/abc-defg-hij',
+        displayName: 'Agent Alice',
+        ownerDisplayName: 'Owner Bob',
+      })
     ).rejects.toThrow(/Core rejected/);
     expect(invoke).not.toHaveBeenCalled();
   });
@@ -80,10 +90,30 @@ describe('joinMeetCall', () => {
     } as never);
 
     await expect(
-      joinMeetCall({ meetUrl: 'https://meet.google.com/abc-defg-hij', displayName: 'Agent Alice' })
+      joinMeetCall({
+        meetUrl: 'https://meet.google.com/abc-defg-hij',
+        displayName: 'Agent Alice',
+        ownerDisplayName: 'Owner Bob',
+      })
     ).rejects.toThrow(/desktop app/);
     expect(invoke).not.toHaveBeenCalled();
   });
+
+  it('rejects an empty owner_display_name as a privacy-lock guard', async () => {
+    // Privacy lock: empty owner would fail closed at the core wake
+    // gate (no captions ever wake the bot). Surface the requirement
+    // up front so the user doesn't sit through a join only to find
+    // the bot silent — see feat/mascot-meet-flowA Plan C.
+    await expect(
+      joinMeetCall({
+        meetUrl: 'https://meet.google.com/abc-defg-hij',
+        displayName: 'Agent Alice',
+        ownerDisplayName: '   ',
+      })
+    ).rejects.toThrow(/your own name/i);
+    expect(callCoreRpc).not.toHaveBeenCalled();
+    expect(invoke).not.toHaveBeenCalled();
+  });
 });
 
 describe('closeMeetCall', () => {
diff --git a/app/src/services/meetCallService.ts b/app/src/services/meetCallService.ts
index 5283cab416..aa29eb3cef 100644
--- a/app/src/services/meetCallService.ts
+++ b/app/src/services/meetCallService.ts
@@ -112,6 +112,48 @@ export async function closeMeetCall(requestId: string): Promise<boolean> {
   return invoke<boolean>('meet_call_close_window', { requestId });
 }
 
+/**
+ * One completed Meet call as persisted by the core in the JSONL
+ * recent-calls log (written by `handle_stop_session`). Same shape
+ * as `MeetCallRecord` in `src/openhuman/meet_agent/store.rs` —
+ * snake_case fields because the core surfaces them verbatim.
+ */
+export interface MeetCallRecord {
+  request_id: string;
+  meet_url: string;
+  bot_display_name: string;
+  owner_display_name: string;
+  started_at_ms: number;
+  ended_at_ms: number;
+  listened_seconds: number;
+  spoken_seconds: number;
+  turn_count: number;
+}
+
+interface CoreListCallsResponse {
+  ok: boolean;
+  calls: MeetCallRecord[];
+  count: number;
+}
+
+/**
+ * Fetch the most recent completed Meet calls (newest first). Used
+ * by the Skills "Meeting Bots" modal to render a history list
+ * underneath the join form. Returns an empty array on a fresh
+ * install (no recorded calls yet) — the core treats a missing
+ * JSONL file as "no rows" rather than an error.
+ */
+export async function listMeetCalls(limit = 20): Promise<MeetCallRecord[]> {
+  const result = await callCoreRpc<CoreListCallsResponse>({
+    method: 'openhuman.meet_agent_list_calls',
+    params: { limit },
+  });
+  if (!result?.ok) {
+    throw new Error('Core rejected the meet_agent_list_calls request.');
+  }
+  return result.calls ?? [];
+}
+
 /**
  * Backend-driven meet bot join (PR tinyhumansai/backend#773).
  *

From 2a5014ab94bad940dc2398052d7101d8bf634c6d Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 12:36:12 +0530
Subject: [PATCH 51/64] feat(skills): inline recent-calls list in
 MeetingBotsCard modal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`MeetingBotsModal` now fetches the most recent 20 calls via
`listMeetCalls()` on mount and renders them in a new
`RecentCallsSection` underneath the join form — same surface
where the user launched the call, so they see their history
without navigating away. Three render states (loading / empty /
populated) avoid the empty-flash on first open.

Each row shows the trailing Meet code (`abc-defg-hij`), a
relative timestamp (`12m ago`, `yesterday`, `May 14`), and the
turn count + on-call seconds — enough at a glance without
overflowing the modal width. Fetch errors are surfaced
inline as informational text (not role="alert", which the form
already owns).
---
 app/src/components/skills/MeetingBotsCard.tsx | 146 +++++++++++++++++-
 1 file changed, 145 insertions(+), 1 deletion(-)

diff --git a/app/src/components/skills/MeetingBotsCard.tsx b/app/src/components/skills/MeetingBotsCard.tsx
index 07b972bfe1..3ba6e7e0df 100644
--- a/app/src/components/skills/MeetingBotsCard.tsx
+++ b/app/src/components/skills/MeetingBotsCard.tsx
@@ -7,12 +7,14 @@
 // and Teams are shown as "coming soon" — only Google Meet has the CEF
 // bridge pipeline today.
 
-import { useEffect, useState } from 'react';
+import { useCallback, useEffect, useState } from 'react';
 
 import { useT } from '../../lib/i18n/I18nContext';
 import {
   joinMeetCall,
+  listMeetCalls,
   type MascotMeetPlatform,
+  type MeetCallRecord,
 } from '../../services/meetCallService';
 
 type Toast = { type: 'success' | 'error' | 'info'; title: string; message?: string };
@@ -123,6 +125,31 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
   const [ownerDisplayName, setOwnerDisplayName] = useState('');
   const [submitting, setSubmitting] = useState(false);
   const [error, setError] = useState<string | null>(null);
+  // Recent-calls history loaded from core when the modal opens.
+  // `null` means "not yet fetched"; `[]` means "fetched, no rows".
+  // Separating the two lets the UI render a "Loading…" hint on
+  // first open without flashing a misleading empty state.
+  const [recentCalls, setRecentCalls] = useState<MeetCallRecord[] | null>(null);
+  const [recentError, setRecentError] = useState<string | null>(null);
+
+  const refreshRecentCalls = useCallback(async () => {
+    setRecentError(null);
+    try {
+      const rows = await listMeetCalls(20);
+      setRecentCalls(rows);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : 'Failed to load recent calls.';
+      console.warn('[meeting-bots] listMeetCalls failed:', err);
+      setRecentError(message);
+      setRecentCalls([]);
+    }
+  }, []);
+
+  useEffect(() => {
+    // Fire-and-forget on mount; the modal is short-lived (closes on
+    // submit or Cancel) so a slow RPC here can't pile up.
+    void refreshRecentCalls();
+  }, [refreshRecentCalls]);
 
   const selected = PLATFORMS.find(p => p.platform === platform) ?? PLATFORMS[0];
   const isComingSoon = !!selected.comingSoon;
@@ -309,8 +336,125 @@ export function MeetingBotsModal({ onClose, onToast }: ModalProps) {
               </button>
             </div>
           </form>
+
+          <RecentCallsSection rows={recentCalls} error={recentError} />
         </div>
       </div>
     </div>
   );
 }
+
+/**
+ * Recent calls list rendered below the join form inside the same
+ * modal — same surface where the user launches a call, so they see
+ * their history without navigating away. Three states:
+ *   - `rows === null`     → still loading (plain "Loading…" text).
+ *   - `rows === []`       → no calls yet (gentle empty state).
+ *   - `rows.length > 0`   → render a compact list, newest first.
+ *
+ * `error` is shown inline above the list when the fetch failed but
+ * doesn't block the form — the join path is independent.
+ */
+function RecentCallsSection({
+  rows,
+  error,
+}: {
+  rows: MeetCallRecord[] | null;
+  error: string | null;
+}) {
+  return (
+    <section
+      aria-label="Recent meeting calls"
+      className="mt-4 border-t border-stone-200 dark:border-neutral-800 pt-4">
+      <div className="flex items-baseline justify-between">
+        <h3 className="text-[11px] font-semibold uppercase tracking-wide text-stone-500 dark:text-neutral-400">
+          Recent calls
+          {rows && rows.length > 0 && (
+            <span className="ml-1 text-stone-400 dark:text-neutral-500 normal-case font-normal">
+              ({rows.length})
+            </span>
+          )}
+        </h3>
+      </div>
+
+      {error && (
+        // Plain status text rather than role="alert" — the join form
+        // already owns the alert role for the modal's primary error
+        // surface. A failure to fetch history is informational, not
+        // actionable, and shouldn't collide with the form's a11y
+        // announcement.
+        <p className="mt-2 text-[11px] text-coral-600 dark:text-coral-400">{error}</p>
+      )}
+
+      {rows === null ? (
+        <p className="mt-2 text-[11px] text-stone-400 dark:text-neutral-500">Loading…</p>
+      ) : rows.length === 0 ? (
+        <p className="mt-2 text-[11px] text-stone-400 dark:text-neutral-500">
+          No previous calls yet — your meeting history will appear here.
+        </p>
+      ) : (
+        <ul className="mt-2 max-h-48 space-y-1 overflow-y-auto pr-1">
+          {rows.map(call => (
+            <RecentCallRow key={call.request_id} call={call} />
+          ))}
+        </ul>
+      )}
+    </section>
+  );
+}
+
+function RecentCallRow({ call }: { call: MeetCallRecord }) {
+  // Show the trailing meeting code (`abc-defg-hij`) rather than the
+  // full URL — the URL prefix is always `https://meet.google.com/`
+  // and would just waste row width.
+  const meetingCode = (() => {
+    try {
+      const parsed = new URL(call.meet_url);
+      const tail = parsed.pathname.replace(/^\/+/, '');
+      return tail || call.meet_url;
+    } catch {
+      return call.meet_url || '(unknown URL)';
+    }
+  })();
+  const duration = Math.max(0, Math.round(call.spoken_seconds + call.listened_seconds));
+  return (
+    <li className="rounded-lg px-2 py-1.5 text-[11px] text-stone-700 dark:text-neutral-300 hover:bg-stone-50 dark:hover:bg-neutral-800/40">
+      <div className="flex items-center justify-between gap-2">
+        <span className="truncate font-mono text-stone-800 dark:text-neutral-200">{meetingCode}</span>
+        <span className="shrink-0 text-stone-400 dark:text-neutral-500">
+          {formatRelativeTime(call.started_at_ms)}
+        </span>
+      </div>
+      <div className="mt-0.5 flex items-center gap-3 text-[10px] text-stone-500 dark:text-neutral-400">
+        <span>{call.turn_count} turn{call.turn_count === 1 ? '' : 's'}</span>
+        <span>{duration}s on call</span>
+      </div>
+    </li>
+  );
+}
+
+/**
+ * Compact "12m ago" / "yesterday" / "May 14" style stamp; `—` when
+ * `ms` is 0/missing. Browser `Intl.RelativeTimeFormat` would be
+ * nicer but pulls a much larger locale data path; the targets here
+ * are short labels in a single surface, not a full i18n investment.
+ */
+function formatRelativeTime(ms: number): string {
+  if (!ms) return '—';
+  const diff = Date.now() - ms;
+  if (diff < 0) return 'just now';
+  const seconds = Math.floor(diff / 1000);
+  if (seconds < 60) return 'just now';
+  const minutes = Math.floor(seconds / 60);
+  if (minutes < 60) return `${minutes}m ago`;
+  const hours = Math.floor(minutes / 60);
+  if (hours < 24) return `${hours}h ago`;
+  const days = Math.floor(hours / 24);
+  if (days === 1) return 'yesterday';
+  if (days < 7) return `${days}d ago`;
+  try {
+    return new Date(ms).toLocaleDateString(undefined, { month: 'short', day: 'numeric' });
+  } catch {
+    return '—';
+  }
+}

From 0e5e2ecf8b415edffdbc7adda9a4f96171af5f52 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:01:27 +0530
Subject: [PATCH 52/64] chore: apply auto-fixes

---
 app/src-tauri/src/meet_audio/speak_pump.rs | 4 +---
 src/openhuman/meet_agent/session.rs        | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/app/src-tauri/src/meet_audio/speak_pump.rs b/app/src-tauri/src/meet_audio/speak_pump.rs
index 549a12ece4..4b83a15ce1 100644
--- a/app/src-tauri/src/meet_audio/speak_pump.rs
+++ b/app/src-tauri/src/meet_audio/speak_pump.rs
@@ -209,9 +209,7 @@ impl SpeakingTracker {
                 "[meet-audio] speaking-state emit failed request_id={request_id} speaking={next} err={err}"
             );
         } else {
-            log::debug!(
-                "[meet-audio] speaking-state -> {next} request_id={request_id}"
-            );
+            log::debug!("[meet-audio] speaking-state -> {next} request_id={request_id}");
         }
     }
 }
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index a6164da732..8698c15e4f 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -871,11 +871,7 @@ mod tests {
         // The bot must never wake on its own voice — regardless of
         // the text content, including text that happens to repeat the
         // wake phrase.
-        let fired = s.note_caption(
-            "OpenHuman",
-            "hey openhuman would you like to know more",
-            1,
-        );
+        let fired = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1);
         assert!(!fired, "bot-self caption must be filtered");
     }
 

From 18e70f3c7e5c13452f58969e729e786ca71f758f Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:17:13 +0530
Subject: [PATCH 53/64] feat(meet-agent): CaptionOutcome enum + soft-deny +
 owner-grant state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`note_caption` now returns a `CaptionOutcome` enum (Ignored /
WakeFired / UnauthorizedWake) so callers can branch between the
silent-drop, normal-turn, and polite-refusal paths without
re-doing the gate logic out-of-band. The unauthorised path only
fires when the non-owner caption actually contains a wake
phrase — random chatter still goes through the existing
`Ignored` branch.

Session gains:
  - `pending_unauthorized_speaker` + timestamp (2 min window)
  - `allowlist: HashSet<String>` of normalised speaker names
  - `allow_speaker(name)` adds to allowlist
  - `take_pending_unauthorized()` consumes the slot if fresh

Wake gate now accepts owner OR any allowlisted speaker. Bot-self
filter still returns Ignored (an UnauthorizedWake here would
loop on the bot's own refusal caption).

Tests cover non-owner soft-deny outcome, non-owner chatter
still ignored, allowlist promotes a refused speaker, pending
take consumes once.
---
 src/openhuman/meet_agent/session.rs | 249 ++++++++++++++++++++++++----
 1 file changed, 216 insertions(+), 33 deletions(-)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 8698c15e4f..52119f7b99 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -10,7 +10,7 @@
 //! live in a process-wide `OnceLock<Mutex<HashMap<...>>>`. The locking
 //! pattern matches `meet_call::MeetCallState` on the shell side.
 
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::sync::{Mutex, OnceLock};
 use std::time::{Instant, SystemTime, UNIX_EPOCH};
 
@@ -19,6 +19,33 @@ use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
 use super::ops::{self, Vad, VadEvent};
 use super::types::{SessionEvent, SessionEventKind};
 
+/// What `note_caption` decided to do with a caption. Replaces the
+/// prior boolean return so the RPC layer can branch between the
+/// "fire a normal LLM turn", "speak a polite refusal", and "do
+/// nothing" paths without re-doing the gate logic out-of-band.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CaptionOutcome {
+    /// Caption was dropped: not a wake, deduplicated, cooled down, or
+    /// during a turn-in-flight. No audible response.
+    Ignored,
+    /// Wake fired and the caller should kick `brain::run_caption_turn`.
+    WakeFired,
+    /// Wake phrase was detected from someone who is not the call
+    /// owner (or on a session that hasn't had identities configured).
+    /// The caller should speak a polite refusal via
+    /// `brain::run_soft_deny_turn` rather than silently dropping —
+    /// silence makes the bot look broken; a single explicit "sorry,
+    /// only <owner> can ask" line clears the air and tells the owner
+    /// how to grant access if they'd like to.
+    UnauthorizedWake { speaker: String },
+}
+
+/// How long after a denied wake the owner has to say "allow" before
+/// the grant request expires. 2 minutes is enough for a back-and-forth
+/// exchange ("hey openhuman" — refusal — owner: "go ahead, let them
+/// ask") without leaving the gate softened indefinitely.
+const PENDING_GRANT_WINDOW_MS: u64 = 120_000;
+
 /// Cap on the inbound buffer so a runaway shell push (e.g. shell never
 /// stops, brain never drains) can't grow memory unboundedly. 30s @ 16kHz
 /// mono = 960 KB per session — generous for any reasonable utterance.
@@ -129,6 +156,22 @@ pub struct MeetAgentSession {
     /// math, but the JSONL persistence layer needs an absolute
     /// timestamp that can be sorted across process restarts.
     started_at_ms: u64,
+    /// Trimmed display name (not normalised) of the most recent
+    /// non-owner speaker that tripped the wake word. Recorded so the
+    /// owner can grant them access by saying "allow" / "let them" /
+    /// "go ahead" within `PENDING_GRANT_WINDOW_MS` of the refusal.
+    /// Consumed (or dropped if stale) by `take_pending_unauthorized`.
+    pending_unauthorized_speaker: Option<String>,
+    /// Wall-clock ms when `pending_unauthorized_speaker` was set.
+    /// The owner has `PENDING_GRANT_WINDOW_MS` from this point to
+    /// approve the asker.
+    pending_unauthorized_at_ms: u64,
+    /// Speakers (normalised display names) the owner has explicitly
+    /// allowed to wake the bot during this call. Wake gate accepts
+    /// captions whose speaker matches the owner OR appears here.
+    /// Resets on `stop_session` (the registry drops the whole
+    /// session). Empty by default — grants are opt-in per call.
+    allowlist: HashSet<String>,
 }
 
 impl MeetAgentSession {
@@ -160,7 +203,45 @@ impl MeetAgentSession {
                 .duration_since(UNIX_EPOCH)
                 .map(|d| d.as_millis() as u64)
                 .unwrap_or(0),
+            pending_unauthorized_speaker: None,
+            pending_unauthorized_at_ms: 0,
+            allowlist: HashSet::new(),
+        }
+    }
+
+    /// Add a speaker to the per-call allowlist. The wake gate
+    /// thereafter accepts captions from this speaker just like it
+    /// would from the owner, so the granted user can ask follow-up
+    /// questions without the owner saying "allow" each time. Stored
+    /// using the normalised name so Meet's punctuation/case jitter
+    /// doesn't reset the grant. Names that normalise to empty are skipped.
+    pub fn allow_speaker(&mut self, speaker_display_name: &str) {
+        let norm = normalise_participant_name(speaker_display_name);
+        if !norm.is_empty() {
+            self.allowlist.insert(norm);
+        }
+    }
+
+    /// Consume the pending unauthorized speaker if still inside the
+    /// grant window. Returns the display name as recorded at refusal
+    /// time (trimmed, not normalised) so the brain layer can both
+    /// grant them access and name them in the spoken confirmation
+    /// ("Okay, <name> can ask me"). Returns `None` when no pending
+    /// grant exists or the window has already elapsed.
+    pub fn take_pending_unauthorized(&mut self) -> Option<String> {
+        let now_ms = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map(|d| d.as_millis() as u64)
+            .unwrap_or(0);
+        let candidate = self.pending_unauthorized_speaker.take()?;
+        if now_ms.saturating_sub(self.pending_unauthorized_at_ms) > PENDING_GRANT_WINDOW_MS {
+            // Stale grant — drop without surfacing. The owner would
+            // need to re-trigger the refusal flow to re-arm.
+            self.pending_unauthorized_at_ms = 0;
+            return None;
         }
+        self.pending_unauthorized_at_ms = 0;
+        Some(candidate)
     }
 
     /// Record the Meet URL the call joined. Stored alongside the
@@ -238,9 +319,9 @@ impl MeetAgentSession {
     /// the match in the same caption is treated as the start of the
     /// prompt; subsequent captions append until `take_pending_prompt`
     /// drains.
-    pub fn note_caption(&mut self, speaker: &str, text: &str, ts_ms: u64) -> bool {
+    pub fn note_caption(&mut self, speaker: &str, text: &str, ts_ms: u64) -> CaptionOutcome {
         if text.trim().is_empty() {
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // Drop noise captions from Meet's local-user / UI affordances.
         // `speaker=="You"` is Meet's label for the local participant
@@ -252,7 +333,7 @@ impl MeetAgentSession {
         // eating the prompt budget and producing endless speech.
         let speaker_lower = speaker.trim().to_lowercase();
         if speaker_lower == "you" || speaker_lower.is_empty() {
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // Privacy gate — owner-only wake.
         //
@@ -280,7 +361,7 @@ impl MeetAgentSession {
         // owner check so a (very contrived) bot_display_name ==
         // owner_display_name still doesn't let the bot wake itself.
         if !bot_norm.is_empty() && speaker_norm == bot_norm {
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // Fail-closed when no owner has been configured. A live
         // session without a known owner is by definition unsafe —
@@ -293,9 +374,44 @@ impl MeetAgentSession {
                 self.request_id,
                 speaker
             );
-            return false;
+            return CaptionOutcome::Ignored;
         }
-        if speaker_norm != owner_norm {
+        // Treat owner + previously-granted allowlist members as
+        // authorised speakers for wake purposes. The allowlist is
+        // populated when the owner says "allow them" / "go ahead"
+        // / "let them ask" after a non-owner wake refusal — see
+        // `brain::run_caption_turn`'s grant-intent branch.
+        let speaker_is_authorised =
+            speaker_norm == owner_norm || self.allowlist.contains(&speaker_norm);
+        if !speaker_is_authorised {
+            // Walk the caption to see if it actually carries a wake
+            // phrase. Random conversation from a non-owner shouldn't
+            // trigger the polite refusal — only an attempt to wake
+            // the bot does. Mirrors the matcher used in the owner
+            // path below; intentionally duplicated rather than
+            // refactored to a shared helper so the (currently small)
+            // unauthorised-path stays self-contained.
+            let normalized_for_match = normalize_for_wake(text);
+            const WAKE_PHRASES: &[&str] = &[
+                "hey open human",
+                "hi open human",
+                "hello open human",
+                "hey openhuman",
+                "hi openhuman",
+                "hello openhuman",
+                "open human",
+                "openhuman",
+            ];
+            let mut hit = false;
+            for phrase in WAKE_PHRASES {
+                if normalized_for_match.contains(phrase) {
+                    hit = true;
+                    break;
+                }
+            }
+            if !hit {
+                return CaptionOutcome::Ignored;
+            }
             // Audit-style log so dev:app stdout makes the rejection
             // visible without leaking the caption body verbatim
             // (preview capped, matches the wake-preview style used
@@ -309,7 +425,19 @@ impl MeetAgentSession {
                 self.owner_display_name,
                 preview
             );
-            return false;
+            // Record the pending grant request. The owner has
+            // PENDING_GRANT_WINDOW_MS to approve them via the
+            // "allow" / "let them" / "go ahead" pattern; after that
+            // the slot expires and the unauthorised speaker has to
+            // re-trigger the refusal to re-arm.
+            self.pending_unauthorized_speaker = Some(speaker.trim().to_string());
+            self.pending_unauthorized_at_ms = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .map(|d| d.as_millis() as u64)
+                .unwrap_or(0);
+            return CaptionOutcome::UnauthorizedWake {
+                speaker: speaker.trim().to_string(),
+            };
         }
         // Per-speaker dedup. Meet's CC region re-renders the same line
         // every 250 ms poll tick and emits BOTH speaker rows on each
@@ -328,7 +456,7 @@ impl MeetAgentSession {
         let normalised = normalise_for_dedup(text);
         if let Some(prev) = self.last_caption_by_speaker.get(&key) {
             if prev == &normalised {
-                return false;
+                return CaptionOutcome::Ignored;
             }
         }
         self.last_caption_by_speaker.insert(key, normalised);
@@ -345,7 +473,7 @@ impl MeetAgentSession {
                 SessionEventKind::Heard,
                 format!("{speaker}: {text} (suppressed: turn_in_progress)"),
             );
-            return false;
+            return CaptionOutcome::Ignored;
         }
         self.last_caption_ts_ms = ts_ms;
         // Already collecting after a previous wake word: just append
@@ -358,7 +486,7 @@ impl MeetAgentSession {
                 self.pending_prompt.push(' ');
             }
             self.pending_prompt.push_str(text.trim());
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // Min-turn-gap backstop. Even if the page-side caption
         // cooldown window expires, refuse to start a new turn
@@ -381,7 +509,7 @@ impl MeetAgentSession {
                     MIN_TURN_GAP_MS
                 ),
             );
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // In cooldown after a recent turn — Meet keeps the same
         // utterance visible for several seconds, so without this
@@ -397,7 +525,7 @@ impl MeetAgentSession {
                     format!("{speaker}: {text}")
                 },
             );
-            return false;
+            return CaptionOutcome::Ignored;
         }
         // Normalize before matching: Meet's STT punctuates the wake
         // phrase ("hey, openhuman"), capitalizes mid-sentence, and
@@ -440,7 +568,7 @@ impl MeetAgentSession {
                 SessionEventKind::Note,
                 format!("wake word from speaker={speaker}"),
             );
-            return true;
+            return CaptionOutcome::WakeFired;
         }
         // Outside a wake context, just record the line for the
         // transcript log. Useful for debugging "why didn't the agent
@@ -454,7 +582,7 @@ impl MeetAgentSession {
                 format!("{speaker}: {text}")
             },
         );
-        false
+        CaptionOutcome::Ignored
     }
 
     /// Drain the assembled wake-word prompt and clear the active
@@ -806,8 +934,8 @@ mod tests {
     fn note_caption_handles_punctuated_wake() {
         let mut s = session_with_owner_alice();
         // Meet often inserts a comma after "hey".
-        let fired = s.note_caption("Alice", "Hey, OpenHuman remember the launch", 1);
-        assert!(fired, "punctuated wake phrase should still fire");
+        let outcome = s.note_caption("Alice", "Hey, OpenHuman remember the launch", 1);
+        assert_eq!(outcome, CaptionOutcome::WakeFired);
         let prompt = s.take_pending_prompt().expect("prompt drained");
         assert_eq!(prompt, "remember the launch");
     }
@@ -815,8 +943,8 @@ mod tests {
     #[test]
     fn note_caption_handles_split_brand() {
         let mut s = session_with_owner_alice();
-        let fired = s.note_caption("Alice", "hey open-human, send the report", 1);
-        assert!(fired);
+        let outcome = s.note_caption("Alice", "hey open-human, send the report", 1);
+        assert_eq!(outcome, CaptionOutcome::WakeFired);
         let prompt = s.take_pending_prompt().expect("prompt drained");
         assert_eq!(prompt, "send the report");
     }
@@ -825,9 +953,13 @@ mod tests {
     fn note_caption_does_not_double_fire_on_growing_caption() {
         let mut s = session_with_owner_alice();
         let first = s.note_caption("Alice", "hey openhuman take notes", 1);
-        assert!(first);
+        assert_eq!(first, CaptionOutcome::WakeFired);
         let second = s.note_caption("Alice", "hey openhuman take notes about the launch", 2);
-        assert!(!second, "second caption while wake_active must not refire");
+        assert_eq!(
+            second,
+            CaptionOutcome::Ignored,
+            "second caption while wake_active must not refire"
+        );
         let prompt = s.take_pending_prompt().expect("prompt drained");
         // First wake stripped "hey openhuman"; the continuation
         // appended the WHOLE growing caption (still containing "hey
@@ -858,21 +990,41 @@ mod tests {
     fn note_caption_rejects_non_owner_speaker() {
         let mut s = session_with_owner_alice();
         // Bob is in the room but not the owner; even with a perfect
-        // wake phrase the gate must refuse.
-        let fired = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1);
-        assert!(!fired, "non-owner must not wake the bot");
+        // wake phrase the gate must refuse with a soft-deny outcome
+        // (so the bot can speak a polite refusal) rather than
+        // silently ignoring.
+        let outcome = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1);
+        assert_eq!(
+            outcome,
+            CaptionOutcome::UnauthorizedWake { speaker: "Bob".into() },
+            "non-owner wake must produce an UnauthorizedWake outcome"
+        );
+        // Soft-deny path doesn't drain the wake prompt — the brain
+        // only synthesises a canned refusal line.
         assert!(s.take_pending_prompt().is_none());
     }
 
+    #[test]
+    fn note_caption_non_owner_without_wake_phrase_is_ignored() {
+        // Random chatter from a non-owner shouldn't trigger the
+        // refusal — only an actual attempt to wake the bot does.
+        let mut s = session_with_owner_alice();
+        let outcome = s.note_caption("Bob", "hey did you watch the game last night", 1);
+        assert_eq!(outcome, CaptionOutcome::Ignored);
+    }
+
     #[test]
     fn note_caption_rejects_bot_self_caption() {
         let mut s = session_with_owner_alice();
         // Meet often re-captions the bot's own TTS in the same region.
         // The bot must never wake on its own voice — regardless of
         // the text content, including text that happens to repeat the
-        // wake phrase.
-        let fired = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1);
-        assert!(!fired, "bot-self caption must be filtered");
+        // wake phrase. Bot-self caption is `Ignored` (no audible
+        // response at all) rather than `UnauthorizedWake` — surfacing
+        // a soft-deny here would create an infinite loop where the
+        // refusal triggers its own bot-self caption.
+        let outcome = s.note_caption("OpenHuman", "hey openhuman would you like to know more", 1);
+        assert_eq!(outcome, CaptionOutcome::Ignored);
     }
 
     #[test]
@@ -881,8 +1033,8 @@ mod tests {
         // speaker. Mirrors the misconfigured-launch posture: better
         // silent failure than an open mic for the user's tool surface.
         let mut s = MeetAgentSession::new("p".into(), 16_000);
-        let fired = s.note_caption("Alice", "hey openhuman do the thing", 1);
-        assert!(!fired, "empty owner must fail-closed");
+        let outcome = s.note_caption("Alice", "hey openhuman do the thing", 1);
+        assert_eq!(outcome, CaptionOutcome::Ignored);
     }
 
     #[test]
@@ -892,8 +1044,8 @@ mod tests {
         // gate still recognises Alice when Meet renders her as
         // "Alice (host)".
         let mut s = session_with_owner_alice();
-        let fired = s.note_caption("Alice (host)", "hey openhuman take a note", 1);
-        assert!(fired, "owner with parenthetical decorator must match");
+        let outcome = s.note_caption("Alice (host)", "hey openhuman take a note", 1);
+        assert_eq!(outcome, CaptionOutcome::WakeFired);
     }
 
     #[test]
@@ -902,8 +1054,39 @@ mod tests {
         // entered in lowercase, or vice versa. The comparison must
         // be case-insensitive.
         let mut s = session_with_owner_alice();
-        let fired = s.note_caption("ALICE", "hey openhuman summarise", 1);
-        assert!(fired, "owner match must be case-insensitive");
+        let outcome = s.note_caption("ALICE", "hey openhuman summarise", 1);
+        assert_eq!(outcome, CaptionOutcome::WakeFired);
+    }
+
+    #[test]
+    fn allowlist_grants_subsequent_wakes() {
+        // After the owner grants Bob via `allow_speaker`, Bob's
+        // next wake-phrase caption should fire just like the
+        // owner's — no soft-deny, no Ignored.
+        let mut s = session_with_owner_alice();
+        // First attempt without a grant is soft-deny:
+        let denied = s.note_caption("Bob", "hey openhuman read slack", 1);
+        assert!(matches!(denied, CaptionOutcome::UnauthorizedWake { .. }));
+        // Owner grants Bob:
+        s.allow_speaker("Bob");
+        // Bob now wakes successfully. Use a different text so the
+        // per-speaker dedup doesn't reject it.
+        let granted = s.note_caption("Bob", "hey openhuman what's the weather", 2);
+        assert_eq!(granted, CaptionOutcome::WakeFired);
+    }
+
+    #[test]
+    fn take_pending_unauthorized_returns_within_window() {
+        // The soft-deny path records the speaker so the owner can
+        // grant them shortly after. Inside the window we get the
+        // name back; we'd need to fast-forward time to test the
+        // expiry path, so just assert the in-window happy path here.
+        let mut s = session_with_owner_alice();
+        let _ = s.note_caption("Bob", "hey openhuman list my emails", 1);
+        let pending = s.take_pending_unauthorized();
+        assert_eq!(pending.as_deref(), Some("Bob"));
+        // Consumed — second take returns None.
+        assert!(s.take_pending_unauthorized().is_none());
     }
 
     #[test]

From 1c29cff89e33ad56371f2cd624a6a34168e7ac83 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:17:25 +0530
Subject: [PATCH 54/64] feat(meet-agent): run_soft_deny_turn + run_grant_turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two new short brain paths that bypass the orchestrator agent:

`run_soft_deny_turn` synthesises a canned refusal line ("Sorry
<asker>, only <owner> can ask me things here. <owner>, say
'allow' to let them in.") and enqueues it as a normal TTS reply.
Cancels any prior outbound first so the refusal doesn't queue
behind a half-drained turn. Stamps turn-done so the min-turn-gap
backstop also covers refusals — a chatty non-owner can't spam
the gate every few seconds.

`run_grant_turn` adds the previously-refused speaker to the
session's per-call allowlist, speaks a short confirmation
("Okay, Bob can ask me now."), and clears the wake_active /
turn_in_progress flags so the grantee's next caption can fire
a fresh turn rather than coalescing into this one.

`run_caption_turn` runs `looks_like_grant_intent` on the drained
prompt. If it matches and a pending unauthorised speaker exists within
the 2-min grant window, the turn branches into `run_grant_turn`
instead of the orchestrator. No pending request → fall through
to the normal LLM path, so the model can still answer if the
owner uses the same vocabulary in an unrelated query.

Tests cover the canned message templates, the grant-intent
matcher (accepts canonical phrases including "yes go ahead",
"let them in"; rejects mid-prompt false positives like
"did i allow that meeting").
---
 src/openhuman/meet_agent/brain.rs | 228 ++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index ae7ca1d4ab..dbe0ed9cc2 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -118,6 +118,150 @@ const MIN_TURN_SAMPLES: usize = 4_000;
 /// the ops boundary check rejects anything else outright.
 const SAMPLE_RATE_HZ: u32 = super::ops::REQUIRED_SAMPLE_RATE;
 
+/// Spoken refusal when a non-owner trips the wake word. Built per
+/// call from the configured owner display name so the audible
+/// response names the actual person who has the keys, and tells
+/// the owner the magic word ("allow") to grant access. Kept short
+/// so it doesn't drown the conversation.
+fn soft_deny_message(asker: &str, owner: &str) -> String {
+    let asker = asker.trim();
+    let owner = owner.trim();
+    match (asker.is_empty(), owner.is_empty()) {
+        (true, true) => "Sorry, I only respond to my owner.".to_string(),
+        (true, false) => format!(
+            "Sorry, only {owner} can ask me things in this call. {owner}, say 'allow' if you'd like me to answer."
+        ),
+        (false, true) => format!("Sorry {asker}, I only respond to my owner."),
+        (false, false) => format!(
+            "Sorry {asker}, only {owner} can ask me things here. {owner}, say 'allow' to let them in."
+        ),
+    }
+}
+
+/// Recognise an "open the gate" intent from the owner's first words
+/// after the wake phrase. Conservative: only fires when the prompt
+/// begins with one of the canonical permit verbs so an unrelated
+/// owner query that happens to contain "allow" or "yes" deeper in
+/// the sentence isn't hijacked.
+///
+/// Returns `true` when the owner is explicitly granting access to
+/// the most-recently-refused asker. The caller still gates on
+/// session-level state (`take_pending_unauthorized`) — without a
+/// pending request the intent is meaningless and the prompt should
+/// just run as a normal LLM turn.
+fn looks_like_grant_intent(prompt: &str) -> bool {
+    let p = prompt.trim().to_ascii_lowercase();
+    if p.is_empty() {
+        return false;
+    }
+    // Whole-prompt matches first so short approvals ("allow", "yes")
+    // don't collide with longer prompts that happen to start with
+    // the same word.
+    matches!(p.as_str(), "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit")
+        || p.starts_with("allow ")
+        || p.starts_with("let them")
+        || p.starts_with("let him")
+        || p.starts_with("let her")
+        || p.starts_with("go ahead")
+        || p.starts_with("yes go ahead")
+        || p.starts_with("yes let")
+        || p.starts_with("permit ")
+        || p.starts_with("you can answer")
+        || p.starts_with("you can tell")
+}
+
+/// Owner-grant path: the owner said "allow them" / "go ahead" /
+/// "let them in" after a non-owner's wake refusal. Add the
+/// previously-refused speaker to the per-call allowlist (so their
+/// next wake fires through to the orchestrator), and speak a
+/// short confirmation so they know they're in.
+pub async fn run_grant_turn(request_id: &str, grantee: &str) -> Result<bool, String> {
+    let grantee = grantee.trim();
+    let message = if grantee.is_empty() {
+        "Okay, you can ask me now.".to_string()
+    } else {
+        format!("Okay, {grantee} can ask me now.")
+    };
+    log::info!("[meet-agent] grant request_id={request_id} grantee=\"{grantee}\"");
+    // Apply the grant on the session BEFORE speaking — if TTS races
+    // and the grantee re-asks during synthesis, we want their next
+    // wake to fire through. Also cancel any prior outbound so the
+    // confirmation doesn't queue behind a half-drained refusal.
+    let _ = registry().with_session(request_id, |s| {
+        s.allow_speaker(grantee);
+        s.cancel_outbound();
+    });
+    let samples = match tts(&message).await {
+        Ok(samples) => samples,
+        Err(err) => {
+            log::warn!("[meet-agent] grant TTS failed request_id={request_id} err={err}");
+            stub_tts(&message).await
+        }
+    };
+    registry().with_session(request_id, |s| {
+        s.record_event(
+            SessionEventKind::Note,
+            format!("owner granted wake access to {grantee}"),
+        );
+        s.record_event(SessionEventKind::Spoke, message.clone());
+        if !samples.is_empty() {
+            s.enqueue_outbound_pcm(&samples, true);
+        }
+        // Clear the wake_active + turn_in_progress flags so the
+        // next caption (likely the grantee's actual question) can
+        // fire a new turn. Without this, the wake state from the
+        // owner's "allow them" prompt would coalesce the grantee's
+        // first real caption into a continuation of this grant turn.
+        s.wake_active = false;
+        s.turn_in_progress = false;
+        s.mark_turn_done();
+    })?;
+    Ok(true)
+}
+
+/// Soft-deny path: kick a polite refusal TTS reply when the wake
+/// word fires from a non-owner. Does NOT touch the orchestrator
+/// agent (no tool calls, no memory writes) — it's a single canned
+/// line, so the failure modes are limited to TTS errors.
+///
+/// The session has already recorded the pending grant request
+/// inside `note_caption`, so all this routine has to do is
+/// synthesize + enqueue the line + log a transcript event.
+pub async fn run_soft_deny_turn(request_id: &str, asker: &str) -> Result<bool, String> {
+    let owner = registry()
+        .with_session(request_id, |s| s.owner_display_name().to_string())
+        .unwrap_or_default();
+    let message = soft_deny_message(asker, &owner);
+    log::info!(
+        "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\""
+    );
+    // Cancel any prior outbound so the refusal doesn't queue behind a
+    // half-drained reply from a previous turn.
+    let _ = registry().with_session(request_id, |s| s.cancel_outbound());
+    let samples = match tts(&message).await {
+        Ok(samples) => samples,
+        Err(err) => {
+            log::warn!("[meet-agent] soft-deny TTS failed request_id={request_id} err={err}");
+            stub_tts(&message).await
+        }
+    };
+    registry().with_session(request_id, |s| {
+        s.record_event(
+            SessionEventKind::Note,
+            format!("soft-deny: {asker} attempted wake without owner approval"),
+        );
+        s.record_event(SessionEventKind::Spoke, message.clone());
+        if !samples.is_empty() {
+            s.enqueue_outbound_pcm(&samples, true);
+        }
+        // Stamp turn-done so the min-turn-gap backstop sees the
+        // refusal as a turn. NOTE(review): `note_caption` returns
+        // `UnauthorizedWake` before that check — confirm repeats are throttled.
+        s.mark_turn_done();
+    })?;
+    Ok(true)
+}
+
 /// Caption-driven turn. Drains the session's pending wake-word prompt
 /// (assembled by `session::note_caption`) and runs LLM → TTS → enqueue
 /// outbound. Skips STT entirely — the captions are already text.
@@ -174,6 +318,29 @@ pub async fn run_caption_turn(request_id: &str) -> Result<bool, String> {
         was_bare_wake,
     );
 
+    // Grant-intent fast path. When the owner says "hey openhuman,
+    // allow them" / "let them in" / "go ahead" after a non-owner
+    // wake refusal, treat the turn as a single-shot session-level
+    // grant rather than handing the prompt to the orchestrator.
+    // The pending grantee was captured by `note_caption` at refusal
+    // time and lives on the session for `PENDING_GRANT_WINDOW_MS`.
+    if !was_bare_wake && looks_like_grant_intent(&prompt) {
+        let pending = registry()
+            .with_session(request_id, |s| s.take_pending_unauthorized())
+            .ok()
+            .flatten();
+        if let Some(grantee) = pending {
+            return run_grant_turn(request_id, &grantee).await;
+        }
+        // No pending request to grant — fall through to the normal
+        // LLM path. The model can interpret "allow" however it
+        // wants from there; without a pending grantee we have no
+        // session-level meaning to attach to it.
+        log::info!(
+            "[meet-agent] grant-intent prompt detected but no pending request — falling through request_id={request_id}"
+        );
+    }
+
     // Pre-roll filler. The orchestrator + integration tools take
     // 30–60s on slow paths (Slack / Gmail / Calendar). Without an
     // immediate acoustic cue, the user assumes the bot is broken and
@@ -1094,4 +1261,65 @@ mod tests {
         assert_eq!(strip_for_speech(""), "");
         assert_eq!(strip_for_speech("   \n  "), "");
     }
+
+    #[test]
+    fn soft_deny_message_names_both_owner_and_asker() {
+        let line = soft_deny_message("Bob", "Alice");
+        assert!(line.contains("Bob"), "must address the asker: {line}");
+        assert!(line.contains("Alice"), "must name the owner: {line}");
+        assert!(line.to_lowercase().contains("allow"), "must hint the magic word: {line}");
+    }
+
+    #[test]
+    fn soft_deny_message_handles_missing_names_gracefully() {
+        // No asker, no owner — should still be a polite English sentence,
+        // not a templated stub with empty placeholders.
+        let line = soft_deny_message("", "");
+        assert!(!line.is_empty());
+        assert!(!line.contains("{"), "must not leak format placeholders: {line}");
+    }
+
+    #[test]
+    fn looks_like_grant_intent_accepts_canonical_phrases() {
+        // Whole-prompt approvals.
+        for phrase in ["allow", "yes", "ok", "okay", "go ahead", "permit"] {
+            assert!(
+                looks_like_grant_intent(phrase),
+                "must accept bare approval phrase: {phrase}"
+            );
+        }
+        // Common longer forms.
+        for phrase in [
+            "allow them",
+            "allow Bob to ask",
+            "let them in",
+            "let them ask",
+            "let her ask",
+            "go ahead and answer them",
+            "yes go ahead",
+            "permit Bob",
+            "you can tell Bob",
+        ] {
+            assert!(looks_like_grant_intent(phrase), "should accept: {phrase}");
+        }
+    }
+
+    #[test]
+    fn looks_like_grant_intent_rejects_unrelated_prompts() {
+        // Words that happen to contain "allow" / "yes" mid-prompt
+        // shouldn't hijack a normal question — the matcher only
+        // honors prompts that BEGIN with a permit verb.
+        for phrase in [
+            "what's on my calendar today",
+            "did i allow that meeting earlier",
+            "yesterday's notes please",
+            "remind me to ok the budget",
+            "permittivity of free space",
+        ] {
+            assert!(
+                !looks_like_grant_intent(phrase),
+                "must not match unrelated prompt: {phrase}"
+            );
+        }
+    }
 }

From 4e52b0ac18a811eaf0617c63bf859bf26d6a5dec Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:17:33 +0530
Subject: [PATCH 55/64] feat(meet-agent): dispatch CaptionOutcome to soft-deny
 / wake / ignore

`handle_push_caption` now switches on the `CaptionOutcome` enum
returned by `session::note_caption`. `WakeFired` spawns the
existing `run_caption_turn`; `UnauthorizedWake` spawns the new
`run_soft_deny_turn` (passing the asker's display name so the
spoken refusal can address them by name); `Ignored` is a no-op.
`turn_started` in the response stays true only for `WakeFired`
so the existing shell-side UI hints don't see a refusal as an
authorised turn.
---
 src/openhuman/meet_agent/rpc.rs | 62 ++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 17 deletions(-)

diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index e89a9aaec7..5cb7e9d93d 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -18,7 +18,7 @@ use crate::rpc::RpcOutcome;
 
 use super::brain;
 use super::ops::VadEvent;
-use super::session::registry;
+use super::session::{registry, CaptionOutcome};
 use super::store::{self, MeetCallRecord};
 use super::types::{
     ListCallsRequest, ListCallsResponse, PollSpeechRequest, PushCaptionRequest,
@@ -110,35 +110,63 @@ pub async fn handle_push_caption(params: Map<String, Value>) -> Result<Value, St
     // safe to leave on for now — captions are already broadcast to all
     // participants in the meeting; nothing here that isn't on the wire.
     let preview: String = req.text.chars().take(120).collect();
-    let wake_fired = registry().with_session(&req.request_id, |s| {
+    let outcome = registry().with_session(&req.request_id, |s| {
         s.note_caption(&req.speaker, &req.text, req.ts_ms)
     })?;
     log::info!(
-        "{LOG_PREFIX} push_caption request_id={} speaker={} text=\"{}\" wake_fired={}",
+        "{LOG_PREFIX} push_caption request_id={} speaker={} text=\"{}\" outcome={:?}",
         req.request_id,
         req.speaker,
         preview,
-        wake_fired,
+        outcome,
     );
 
-    if wake_fired {
-        log::info!(
-            "{LOG_PREFIX} wake word fired request_id={} speaker={}",
-            req.request_id,
-            req.speaker
-        );
-        let request_id = req.request_id.clone();
-        tokio::spawn(async move {
-            if let Err(err) = brain::run_caption_turn(&request_id).await {
-                log::warn!("{LOG_PREFIX} caption-turn failed request_id={request_id} err={err}");
-            }
-        });
+    // Branch on the gate's verdict:
+    //   - WakeFired         → kick the normal LLM+TTS turn
+    //   - UnauthorizedWake  → kick a soft-deny canned TTS turn so the
+    //                          non-owner gets an audible "sorry, only
+    //                          <owner> can ask" and the owner is told
+    //                          how to grant them access
+    //   - Ignored           → no audible response
+    let turn_started = matches!(outcome, CaptionOutcome::WakeFired);
+    match outcome {
+        CaptionOutcome::WakeFired => {
+            log::info!(
+                "{LOG_PREFIX} wake word fired request_id={} speaker={}",
+                req.request_id,
+                req.speaker
+            );
+            let request_id = req.request_id.clone();
+            tokio::spawn(async move {
+                if let Err(err) = brain::run_caption_turn(&request_id).await {
+                    log::warn!(
+                        "{LOG_PREFIX} caption-turn failed request_id={request_id} err={err}"
+                    );
+                }
+            });
+        }
+        CaptionOutcome::UnauthorizedWake { speaker } => {
+            log::info!(
+                "{LOG_PREFIX} unauthorized wake — soft-deny turn request_id={} speaker={}",
+                req.request_id,
+                speaker
+            );
+            let request_id = req.request_id.clone();
+            tokio::spawn(async move {
+                if let Err(err) = brain::run_soft_deny_turn(&request_id, &speaker).await {
+                    log::warn!(
+                        "{LOG_PREFIX} soft-deny turn failed request_id={request_id} err={err}"
+                    );
+                }
+            });
+        }
+        CaptionOutcome::Ignored => {}
     }
 
     RpcOutcome::new(
         json!({
             "ok": true,
-            "turn_started": wake_fired,
+            "turn_started": turn_started,
         }),
         vec![],
     )

From 7969b4d9ebd013eae0df7ed8bfa8d53fa256c4e9 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:18:40 +0530
Subject: [PATCH 56/64] chore: apply auto-fixes

---
 src/openhuman/meet_agent/brain.rs   | 16 ++++++++++++----
 src/openhuman/meet_agent/session.rs |  4 +++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index dbe0ed9cc2..30099aef75 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -157,8 +157,10 @@ fn looks_like_grant_intent(prompt: &str) -> bool {
     // Whole-prompt matches first so short approvals ("allow", "yes")
     // don't collide with longer prompts that happen to start with
     // the same word.
-    matches!(p.as_str(), "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit")
-        || p.starts_with("allow ")
+    matches!(
+        p.as_str(),
+        "allow" | "yes" | "ok" | "okay" | "go ahead" | "let them in" | "let them ask" | "permit"
+    ) || p.starts_with("allow ")
         || p.starts_with("let them")
         || p.starts_with("let him")
         || p.starts_with("let her")
@@ -1267,7 +1269,10 @@ mod tests {
         let line = soft_deny_message("Bob", "Alice");
         assert!(line.contains("Bob"), "must address the asker: {line}");
         assert!(line.contains("Alice"), "must name the owner: {line}");
-        assert!(line.to_lowercase().contains("allow"), "must hint the magic word: {line}");
+        assert!(
+            line.to_lowercase().contains("allow"),
+            "must hint the magic word: {line}"
+        );
     }
 
     #[test]
@@ -1276,7 +1281,10 @@ mod tests {
         // not a templated stub with empty placeholders.
         let line = soft_deny_message("", "");
         assert!(!line.is_empty());
-        assert!(!line.contains("{"), "must not leak format placeholders: {line}");
+        assert!(
+            !line.contains("{"),
+            "must not leak format placeholders: {line}"
+        );
     }
 
     #[test]
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 52119f7b99..acb92fe59b 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -996,7 +996,9 @@ mod tests {
         let outcome = s.note_caption("Bob", "hey openhuman read alice's slack DMs", 1);
         assert_eq!(
             outcome,
-            CaptionOutcome::UnauthorizedWake { speaker: "Bob".into() },
+            CaptionOutcome::UnauthorizedWake {
+                speaker: "Bob".into()
+            },
             "non-owner wake must produce an UnauthorizedWake outcome"
         );
         // Soft-deny path doesn't drain the wake prompt — the brain

From 62a70b48c7a64e7c5a2f6dc5e7e2d73422871196 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:23:21 +0530
Subject: [PATCH 57/64] fix(skills): drop {label} placeholder from
 sendTo/comingSoon i18n strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Meeting Bots modal's submit button renders the platform label by
string-concatenating the translation with `selected.label`
(`${t('sendTo')} ${selected.label}` / `${selected.label} ${t('comingSoon')}`).
The base `t()` does not interpolate, so en/ko translations that
embedded `{label}` showed up verbatim — "Send to {label} Google Meet"
and "{label} coming soon" — instead of the intended interpolation.

All other locale chunks already use bare "Send to" / "Coming soon"
strings to match the concat pattern. Bring en + ko in line so the
button reads correctly in those locales too.
---
 app/src/lib/i18n/en.ts | 4 ++--
 app/src/lib/i18n/ko.ts | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/app/src/lib/i18n/en.ts b/app/src/lib/i18n/en.ts
index 9ac18e4cf8..59ad212498 100644
--- a/app/src/lib/i18n/en.ts
+++ b/app/src/lib/i18n/en.ts
@@ -2109,7 +2109,7 @@ const en: TranslationMap = {
     'OpenHuman joins calls silently and handles notes and follow-ups automatically.',
   'skills.meetingBots.bannerTitle': 'Send OpenHuman to your next meeting',
   'skills.meetingBots.busyTitle': 'OpenHuman is busy',
-  'skills.meetingBots.comingSoon': '{label} coming soon',
+  'skills.meetingBots.comingSoon': 'coming soon',
   'skills.meetingBots.couldNotStartTitle': 'Could not start OpenHuman',
   'skills.meetingBots.displayName': 'Display name',
   'skills.meetingBots.failedToStart': 'Failed to start OpenHuman.',
@@ -2121,7 +2121,7 @@ const en: TranslationMap = {
     'OpenHuman will join as a silent participant and help with notes and follow-ups.',
   'skills.meetingBots.modalTitle': 'Send OpenHuman to a meeting',
   'skills.meetingBots.newBadge': 'New',
-  'skills.meetingBots.sendTo': 'Send to {label}',
+  'skills.meetingBots.sendTo': 'Send to',
   'skills.meetingBots.starting': 'Starting…',
   'skills.resource.preview.closeAriaLabel': 'Close preview',
   'skills.resource.preview.failed': 'Preview failed',
diff --git a/app/src/lib/i18n/ko.ts b/app/src/lib/i18n/ko.ts
index 5e85ff7b8b..1e7387d1f6 100644
--- a/app/src/lib/i18n/ko.ts
+++ b/app/src/lib/i18n/ko.ts
@@ -1957,7 +1957,7 @@ const ko: TranslationMap = {
     'OpenHuman이 통화에 조용히 참여하여 메모와 후속 조치를 자동으로 처리합니다.',
   'skills.meetingBots.bannerTitle': '다음 회의에 OpenHuman 보내기',
   'skills.meetingBots.busyTitle': 'OpenHuman이 바쁩니다',
-  'skills.meetingBots.comingSoon': '{label} 곧 제공 예정',
+  'skills.meetingBots.comingSoon': '곧 제공 예정',
   'skills.meetingBots.couldNotStartTitle': 'OpenHuman을 시작할 수 없습니다',
   'skills.meetingBots.displayName': '표시 이름',
   'skills.meetingBots.failedToStart': 'OpenHuman 시작에 실패했습니다.',
@@ -1969,7 +1969,7 @@ const ko: TranslationMap = {
     'OpenHuman이 조용한 참가자로 참여하여 메모와 후속 조치를 도와줍니다.',
   'skills.meetingBots.modalTitle': 'OpenHuman을 회의에 보내기',
   'skills.meetingBots.newBadge': '새 항목',
-  'skills.meetingBots.sendTo': '{label}로 보내기',
+  'skills.meetingBots.sendTo': '보내기',
   'skills.meetingBots.starting': '시작 중…',
   'skills.resource.preview.closeAriaLabel': '미리보기 닫기',
   'skills.resource.preview.failed': '미리보기 실패',

From 60916bbfb8ab4d9fbe70a037c24b1252ede3c780 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:29:13 +0530
Subject: [PATCH 58/64] fix(meet-agent): declare owner_display_name /
 bot_display_name / meet_url in start_session schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The controller schema validator rejected the new fields as unknown
params:

  meet_audio start failed err=rpc error: {"code":-32000, ...,
  "message":"unknown param 'bot_display_name' for meet_agent.start_session"}

Plan C added the fields to `StartSessionRequest` (with serde default
fallbacks) and Plan A added `meet_url`, but the schema declaration
in `schemas.rs` was never updated. Add all three as optional fields
so the dispatch layer admits them and the gate / persistence paths
actually run.

Knock-on effect of the rejection: `meet_audio::start` bailed before
installing the audio bridge or starting the frame bus, so the
getUserMedia (gUM) intercept was never installed and Meet exposed
the host's real camera instead of the mascot canvas. Fixing the
schema restores the full pipeline.
---
 src/openhuman/meet_agent/schemas.rs | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/openhuman/meet_agent/schemas.rs b/src/openhuman/meet_agent/schemas.rs
index c59b4779d7..54ca599e2e 100644
--- a/src/openhuman/meet_agent/schemas.rs
+++ b/src/openhuman/meet_agent/schemas.rs
@@ -87,6 +87,31 @@ fn schema_start_session() -> ControllerSchema {
                 comment: "Sample rate of inbound/outbound PCM. Default 16000.",
                 required: false,
             },
+            FieldSchema {
+                name: "owner_display_name",
+                ty: TypeSchema::String,
+                comment:
+                    "Display name of the call owner (the user who launched the bot). \
+                     Used by the wake-word gate as the only speaker authorised to trigger \
+                     tool calls. Empty fails closed.",
+                required: false,
+            },
+            FieldSchema {
+                name: "bot_display_name",
+                ty: TypeSchema::String,
+                comment:
+                    "Display name the bot uses as its Meet participant tile. Used to drop \
+                     the bot's own captions (self-echo filter).",
+                required: false,
+            },
+            FieldSchema {
+                name: "meet_url",
+                ty: TypeSchema::String,
+                comment:
+                    "Normalised Meet URL the call joined. Persisted into the recent-calls \
+                     log on stop_session.",
+                required: false,
+            },
         ],
         outputs: vec![
             FieldSchema {

From 6d1e36592049909c004538ede810b52fb20a72df Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:30:24 +0530
Subject: [PATCH 59/64] chore: apply auto-fixes

---
 src/openhuman/meet_agent/schemas.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/openhuman/meet_agent/schemas.rs b/src/openhuman/meet_agent/schemas.rs
index 54ca599e2e..1f143c223b 100644
--- a/src/openhuman/meet_agent/schemas.rs
+++ b/src/openhuman/meet_agent/schemas.rs
@@ -90,8 +90,7 @@ fn schema_start_session() -> ControllerSchema {
             FieldSchema {
                 name: "owner_display_name",
                 ty: TypeSchema::String,
-                comment:
-                    "Display name of the call owner (the user who launched the bot). \
+                comment: "Display name of the call owner (the user who launched the bot). \
                      Used by the wake-word gate as the only speaker authorised to trigger \
                      tool calls. Empty fails closed.",
                 required: false,
@@ -99,16 +98,14 @@ fn schema_start_session() -> ControllerSchema {
             FieldSchema {
                 name: "bot_display_name",
                 ty: TypeSchema::String,
-                comment:
-                    "Display name the bot uses as its Meet participant tile. Used to drop \
+                comment: "Display name the bot uses as its Meet participant tile. Used to drop \
                      the bot's own captions (self-echo filter).",
                 required: false,
             },
             FieldSchema {
                 name: "meet_url",
                 ty: TypeSchema::String,
-                comment:
-                    "Normalised Meet URL the call joined. Persisted into the recent-calls \
+                comment: "Normalised Meet URL the call joined. Persisted into the recent-calls \
                      log on stop_session.",
                 required: false,
             },

From 62732c7b0e8b39d4f29b21a79309b56857f6cab3 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 13:45:09 +0530
Subject: [PATCH 60/64] fix(meet-agent): dedup unauthorized wakes before
 soft-deny dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plan D landed the unauthorised-wake branch ABOVE the per-speaker
dedup + min-turn-gap + cooldown + turn-in-progress gates. Meet's
caption observer re-emits the same caption row every 250 ms while
the speaker is still visible in the CC region, so each tick fired
a fresh UnauthorizedWake → soft-deny TTS — producing the
"sorry sorry sorry" loop seen in dev:app on 2026-05-25 (also
producing 429s from the TTS endpoint as the loop hit rate-limits).

Restructure: compute `speaker_is_authorised` early, run all
rate-limit gates uniformly for both authorised and unauthorised
speakers, then branch on authorised at the wake-phrase match
point. Restrict the wake_active prompt-continuation append to
authorised speakers too so a non-owner can't smuggle text into
the in-flight owner prompt.

Regression test `note_caption_unauthorized_wake_does_not_loop_on_identical_caption`
asserts the first emission produces `UnauthorizedWake` and
subsequent emissions of the same (or punctuation-jittered) text
are deduped to `Ignored`.
---
 src/openhuman/meet_agent/session.rs | 131 +++++++++++++++-------------
 1 file changed, 69 insertions(+), 62 deletions(-)

diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index acb92fe59b..9cd42a657b 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -381,64 +381,18 @@ impl MeetAgentSession {
         // populated when the owner says "allow them" / "go ahead"
         // / "let them ask" after a non-owner wake refusal — see
         // `brain::run_caption_turn`'s grant-intent branch.
+        //
+        // The actual authorised/unauthorised branch happens AFTER
+        // all the rate-limit gates (dedup, turn-in-progress, min-
+        // turn-gap, cooldown) below, so the same caption repeating
+        // every 250 ms — which Meet does aggressively while a
+        // participant is still visible in the CC region — cannot
+        // spam the refusal path either. Without that ordering the
+        // soft-deny TTS triggers a fresh refusal on every Meet
+        // re-emit of the identical caption text. Smoke-tested as
+        // the "sorry sorry sorry" loop on 2026-05-25.
         let speaker_is_authorised =
             speaker_norm == owner_norm || self.allowlist.contains(&speaker_norm);
-        if !speaker_is_authorised {
-            // Walk the caption to see if it actually carries a wake
-            // phrase. Random conversation from a non-owner shouldn't
-            // trigger the polite refusal — only an attempt to wake
-            // the bot does. Mirrors the matcher used in the owner
-            // path below; intentionally duplicated rather than
-            // refactored to a shared helper so the (currently small)
-            // unauthorised-path stays self-contained.
-            let normalized_for_match = normalize_for_wake(text);
-            const WAKE_PHRASES: &[&str] = &[
-                "hey open human",
-                "hi open human",
-                "hello open human",
-                "hey openhuman",
-                "hi openhuman",
-                "hello openhuman",
-                "open human",
-                "openhuman",
-            ];
-            let mut hit = false;
-            for phrase in WAKE_PHRASES {
-                if normalized_for_match.contains(phrase) {
-                    hit = true;
-                    break;
-                }
-            }
-            if !hit {
-                return CaptionOutcome::Ignored;
-            }
-            // Audit-style log so dev:app stdout makes the rejection
-            // visible without leaking the caption body verbatim
-            // (preview capped, matches the wake-preview style used
-            // upstream in handle_push_caption).
-            let preview: String = text.chars().take(40).collect();
-            log::info!(
-                "[meet-agent] unauthorized_wake_attempt request_id={} \
-                 speaker=\"{}\" owner=\"{}\" preview=\"{}\"",
-                self.request_id,
-                speaker,
-                self.owner_display_name,
-                preview
-            );
-            // Record the pending grant request. The owner has
-            // PENDING_GRANT_WINDOW_MS to approve them via the
-            // "allow" / "let them" / "go ahead" pattern; after that
-            // the slot expires and the unauthorised speaker has to
-            // re-trigger the refusal to re-arm.
-            self.pending_unauthorized_speaker = Some(speaker.trim().to_string());
-            self.pending_unauthorized_at_ms = SystemTime::now()
-                .duration_since(UNIX_EPOCH)
-                .map(|d| d.as_millis() as u64)
-                .unwrap_or(0);
-            return CaptionOutcome::UnauthorizedWake {
-                speaker: speaker.trim().to_string(),
-            };
-        }
         // Per-speaker dedup. Meet's CC region re-renders the same line
         // every 250 ms poll tick and emits BOTH speaker rows on each
         // walk (the user AND the bot TTS as speaker="You"). A single-
@@ -476,12 +430,17 @@ impl MeetAgentSession {
             return CaptionOutcome::Ignored;
         }
         self.last_caption_ts_ms = ts_ms;
-        // Already collecting after a previous wake word: just append
-        // the new caption. No second fire — the brain is already
-        // scheduled and will drain the prompt in ~1.5 s. Without this
-        // gate, a slowly-growing caption fires the wake word on
-        // every dedupe-then-grow cycle.
-        if self.wake_active {
+        // Already collecting after a previous (authorised) wake word:
+        // append the continuation. No second fire — the brain is
+        // already scheduled and will drain the prompt in ~1.5 s.
+        // Without this gate, a slowly-growing caption fires the wake
+        // word on every dedupe-then-grow cycle.
+        //
+        // Restricted to authorised speakers so a non-owner can't
+        // smuggle text into the in-flight owner prompt (e.g. owner
+        // says "hey openhuman, what's on my calendar"; non-owner
+        // mid-prompt: "and read alice's slack").
+        if self.wake_active && speaker_is_authorised {
             if !self.pending_prompt.is_empty() {
                 self.pending_prompt.push(' ');
             }
@@ -560,6 +519,34 @@ impl MeetAgentSession {
             }
         }
         if let Some((idx, phrase)) = wake_hit {
+            // Wake phrase detected — branch on whether the speaker is
+            // allowed to actually drive the bot. Non-owner + not
+            // allowlisted → polite refusal turn; owner + allowlist →
+            // normal LLM turn.
+            if !speaker_is_authorised {
+                let preview: String = text.chars().take(40).collect();
+                log::info!(
+                    "[meet-agent] unauthorized_wake_attempt request_id={} \
+                     speaker=\"{}\" owner=\"{}\" preview=\"{}\"",
+                    self.request_id,
+                    speaker,
+                    self.owner_display_name,
+                    preview
+                );
+                // Record the pending grant request. The owner has
+                // PENDING_GRANT_WINDOW_MS to approve them via the
+                // "allow" / "let them" / "go ahead" pattern; after
+                // that the slot expires and the unauthorised speaker
+                // has to re-trigger the refusal to re-arm.
+                self.pending_unauthorized_speaker = Some(speaker.trim().to_string());
+                self.pending_unauthorized_at_ms = SystemTime::now()
+                    .duration_since(UNIX_EPOCH)
+                    .map(|d| d.as_millis() as u64)
+                    .unwrap_or(0);
+                return CaptionOutcome::UnauthorizedWake {
+                    speaker: speaker.trim().to_string(),
+                };
+            }
             let after = idx + phrase.len();
             let tail = normalized.get(after..).unwrap_or("").trim().to_string();
             self.pending_prompt = tail;
@@ -1077,6 +1064,26 @@ mod tests {
         assert_eq!(granted, CaptionOutcome::WakeFired);
     }
 
+    #[test]
+    fn note_caption_unauthorized_wake_does_not_loop_on_identical_caption() {
+        // Regression: Meet's caption observer re-emits the same row
+        // every 250 ms while it's still visible. The first emission
+        // produces an UnauthorizedWake; subsequent identical
+        // emissions must be deduped to `Ignored` so the soft-deny
+        // TTS doesn't fire on every tick ("sorry, sorry, sorry…"
+        // loop seen in dev:app on 2026-05-25).
+        let mut s = session_with_owner_alice();
+        let first = s.note_caption("Bob", "hey openhuman read my dms", 1);
+        assert!(matches!(first, CaptionOutcome::UnauthorizedWake { .. }));
+        // Same text from same speaker — must dedup to Ignored.
+        let second = s.note_caption("Bob", "hey openhuman read my dms", 2);
+        assert_eq!(second, CaptionOutcome::Ignored);
+        // Punctuation/case jitter on the same utterance still dedups
+        // because the normaliser strips it before compare.
+        let third = s.note_caption("Bob", "Hey, openhuman read my DMs.", 3);
+        assert_eq!(third, CaptionOutcome::Ignored);
+    }
+
     #[test]
     fn take_pending_unauthorized_returns_within_window() {
         // The soft-deny path records the speaker so the owner can

From 52a22d35cef2d8fc85944301c23215c3500df860 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 18:30:04 +0530
Subject: [PATCH 61/64] fix(meet-agent): session-wide soft-deny cooldown +
 greeting/refusal split
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-up bugs from the first soft-deny smoke:

1) Meet's STT re-transcribes the same utterance with text jitter
   ("Openhuman. I open." → "Openhuman. High openhum." →
   "Openhuman. High Openhuman.") so the per-text dedup misses
   the variants. Each fired a fresh soft-deny TTS, producing
   the "sorry sorry sorry" loop and 429 rate-limits from the
   TTS backend.

   Fix: session-wide UNAUTHORIZED_COOLDOWN_MS (60s, 1 dispatch
   per window). Tracked on a new
   `last_unauthorized_dispatch_at_ms` field on the session.
   Independent of the owner's `last_turn_done_at_ms` so the
   owner can still wake (e.g. say "allow them") within seconds
   of a refusal.

2) Greetings from non-owners were getting refused instead of
   answered. The new `classify_unauthorized_intent` looks at the
   post-wake tail — a bare wake or greeting-only words ("hi",
   "hello", "good morning", "there", "everyone", ...) map to
   `Greeting`; substantive task asks map to `TaskAsk`.

   `run_soft_deny_turn` branches on intent:
     Greeting → "Hi <asker>! Nice to meet you." (no privacy
                gate noise on a hello)
     TaskAsk  → the existing refusal + "say 'allow' to let
                them in" hint

`CaptionOutcome::UnauthorizedWake` now carries the full caption
text so the brain layer can classify; rpc.rs forwards it into
the spawned turn.

Tests:
  - session: cooldown blocks text-variants + cross-speaker
  - brain: greeting / filler / task classification
---
 src/openhuman/meet_agent/brain.rs   | 183 +++++++++++++++++++++++++---
 src/openhuman/meet_agent/rpc.rs     |   4 +-
 src/openhuman/meet_agent/session.rs |  85 +++++++++++--
 3 files changed, 244 insertions(+), 28 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 30099aef75..04ff2d1b0d 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -118,6 +118,94 @@ const MIN_TURN_SAMPLES: usize = 4_000;
 /// the ops boundary check rejects anything else outright.
 const SAMPLE_RATE_HZ: u32 = super::ops::REQUIRED_SAMPLE_RATE;
 
+/// Classify a non-owner caption that tripped the wake word. The
+/// gate has already decided the speaker isn't authorised; this
+/// picks between a friendly hi-back (greeting / pleasantry) and
+/// a polite refusal (real task ask). Matching is conservative:
+/// when the post-wake tail is empty OR only contains greeting
+/// words, treat it as a greeting. Anything else is assumed to be
+/// a task ask.
+fn classify_unauthorized_intent(caption_text: &str) -> UnauthorizedIntent {
+    // Lift the bit of text that comes after the matched wake
+    // phrase so we don't get fooled by the wake itself ("hey
+    // openhuman" obviously contains "hey").
+    let lower = caption_text.to_ascii_lowercase();
+    let wake_phrases = [
+        "hey open human",
+        "hi open human",
+        "hello open human",
+        "hey openhuman",
+        "hi openhuman",
+        "hello openhuman",
+        "open human",
+        "openhuman",
+    ];
+    let tail = wake_phrases
+        .iter()
+        .filter_map(|p| lower.find(p).map(|i| &lower[i + p.len()..]))
+        .next()
+        .unwrap_or(&lower);
+    // Strip punctuation / common filler so "hi there!" reduces to
+    // ["hi", "there"]. Keeping the word list cheap and English-only
+    // for v1; the locale-aware story lands with multilingual TTS.
+    let words: Vec<&str> = tail
+        .split(|c: char| !c.is_ascii_alphanumeric())
+        .filter(|w| !w.is_empty())
+        .collect();
+    if words.is_empty() {
+        return UnauthorizedIntent::Greeting;
+    }
+    const GREETING_WORDS: &[&str] = &[
+        "hi",
+        "hello",
+        "hey",
+        "yo",
+        "sup",
+        "howdy",
+        "greetings",
+        "hola",
+        "good",
+        "morning",
+        "afternoon",
+        "evening",
+        "night",
+        "there",
+        "everyone",
+        "all",
+        "folks",
+        "team",
+        "guys",
+        "yall",
+    ];
+    if words.iter().all(|w| GREETING_WORDS.contains(w)) {
+        UnauthorizedIntent::Greeting
+    } else {
+        UnauthorizedIntent::TaskAsk
+    }
+}
+
+/// Output of `classify_unauthorized_intent`. Drives whether the
+/// soft-deny turn speaks a friendly hi-back or a polite refusal.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum UnauthorizedIntent {
+    /// Just a greeting — bot says hi back without offering tools.
+    Greeting,
+    /// Real task ask — refuse + tell the owner how to grant.
+    TaskAsk,
+}
+
+/// Friendly hi-back canned line when a non-owner just greets the
+/// bot. Kept short and warm; doesn't mention the owner / privacy
+/// gate at all — that's noise on a "hello".
+fn friendly_greeting_message(asker: &str) -> String {
+    let asker = asker.trim();
+    if asker.is_empty() {
+        "Hi there! Nice to meet you.".to_string()
+    } else {
+        format!("Hi {asker}! Nice to meet you.")
+    }
+}
+
 /// Spoken refusal when a non-owner trips the wake word. Built per
 /// call from the configured owner display name so the audible
 /// response names the actual person who has the keys, and tells
@@ -221,21 +309,31 @@ pub async fn run_grant_turn(request_id: &str, grantee: &str) -> Result<bool, Str
     Ok(true)
 }
 
-/// Soft-deny path: kick a polite refusal TTS reply when the wake
-/// word fires from a non-owner. Does NOT touch the orchestrator
-/// agent (no tool calls, no memory writes) — it's a single canned
-/// line, so the failure modes are limited to TTS errors.
+/// Soft-deny path: kick a canned-line TTS reply when the wake word
+/// fires from a non-owner. Branches on intent: a bare greeting gets
+/// a friendly hi-back; a substantive task ask gets the refusal that
+/// tells the owner how to grant access. Does NOT touch the
+/// orchestrator agent (no tool calls, no memory writes) — it's a
+/// single canned line, so the failure modes are limited to TTS errors.
 ///
-/// The session has already recorded the pending grant request
-/// inside `note_caption`, so all this routine has to do is
-/// synthesize + enqueue the line + log a transcript event.
-pub async fn run_soft_deny_turn(request_id: &str, asker: &str) -> Result<bool, String> {
+/// `caption_text` is the full caption from `note_caption` so we can
+/// classify intent here; the session has already recorded the
+/// pending grant request and dispatch timestamp.
+pub async fn run_soft_deny_turn(
+    request_id: &str,
+    asker: &str,
+    caption_text: &str,
+) -> Result<bool, String> {
     let owner = registry()
         .with_session(request_id, |s| s.owner_display_name().to_string())
         .unwrap_or_default();
-    let message = soft_deny_message(asker, &owner);
+    let intent = classify_unauthorized_intent(caption_text);
+    let message = match intent {
+        UnauthorizedIntent::Greeting => friendly_greeting_message(asker),
+        UnauthorizedIntent::TaskAsk => soft_deny_message(asker, &owner),
+    };
     log::info!(
-        "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\""
+        "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\" intent={intent:?}"
     );
     // Cancel any prior outbound so the refusal doesn't queue behind a
     // half-drained reply from a previous turn.
@@ -248,18 +346,24 @@ pub async fn run_soft_deny_turn(request_id: &str, asker: &str) -> Result<bool, S
         }
     };
     registry().with_session(request_id, |s| {
+        let kind = match intent {
+            UnauthorizedIntent::Greeting => "greeting",
+            UnauthorizedIntent::TaskAsk => "refusal",
+        };
         s.record_event(
             SessionEventKind::Note,
-            format!("soft-deny: {asker} attempted wake without owner approval"),
+            format!("soft-deny ({kind}): {asker} unauthorised wake"),
         );
         s.record_event(SessionEventKind::Spoke, message.clone());
         if !samples.is_empty() {
             s.enqueue_outbound_pcm(&samples, true);
         }
-        // Stamp turn-done so the min-turn-gap backstop covers the
-        // refusal the same way it covers a real reply. Without this,
-        // a chatty non-owner could re-trip the gate every few seconds.
-        s.mark_turn_done();
+        // NB: do NOT call `mark_turn_done` here — that's the
+        // owner-min-turn-gap stamp, and we want the owner to be
+        // able to wake (e.g. say "allow them") within seconds of a
+        // refusal. The session's own `UNAUTHORIZED_COOLDOWN_MS` is
+        // what guards against a soft-deny loop from the same
+        // non-owner speaker.
     })?;
     Ok(true)
 }
@@ -1312,6 +1416,55 @@ mod tests {
         }
     }
 
+    #[test]
+    fn classify_unauthorized_intent_treats_bare_wake_as_greeting() {
+        // Empty tail after the wake phrase — the non-owner just
+        // said "hey openhuman" with nothing else. Friendly hi-back
+        // is the right call, not a refusal.
+        assert_eq!(
+            classify_unauthorized_intent("hey openhuman"),
+            UnauthorizedIntent::Greeting
+        );
+        assert_eq!(
+            classify_unauthorized_intent("Hi openhuman."),
+            UnauthorizedIntent::Greeting
+        );
+    }
+
+    #[test]
+    fn classify_unauthorized_intent_treats_filler_as_greeting() {
+        // Common pleasantries that contain greeting words only.
+        for text in [
+            "hello openhuman there",
+            "hi openhuman everyone",
+            "hey openhuman hi",
+            "hey openhuman good morning",
+        ] {
+            assert_eq!(
+                classify_unauthorized_intent(text),
+                UnauthorizedIntent::Greeting,
+                "should be greeting: {text}"
+            );
+        }
+    }
+
+    #[test]
+    fn classify_unauthorized_intent_flags_task_asks() {
+        // Substantive task asks — refuse + tell owner how to grant.
+        for text in [
+            "hey openhuman read my slack",
+            "hi openhuman what's on alice's calendar",
+            "openhuman send the report",
+            "hello openhuman remember the launch",
+        ] {
+            assert_eq!(
+                classify_unauthorized_intent(text),
+                UnauthorizedIntent::TaskAsk,
+                "should be task: {text}"
+            );
+        }
+    }
+
     #[test]
     fn looks_like_grant_intent_rejects_unrelated_prompts() {
         // Words that happen to contain "allow" / "yes" mid-prompt
diff --git a/src/openhuman/meet_agent/rpc.rs b/src/openhuman/meet_agent/rpc.rs
index 5cb7e9d93d..5354108803 100644
--- a/src/openhuman/meet_agent/rpc.rs
+++ b/src/openhuman/meet_agent/rpc.rs
@@ -145,7 +145,7 @@ pub async fn handle_push_caption(params: Map<String, Value>) -> Result<Value, St
                 }
             });
         }
-        CaptionOutcome::UnauthorizedWake { speaker } => {
+        CaptionOutcome::UnauthorizedWake { speaker, text } => {
             log::info!(
                 "{LOG_PREFIX} unauthorized wake — soft-deny turn request_id={} speaker={}",
                 req.request_id,
@@ -153,7 +153,7 @@ pub async fn handle_push_caption(params: Map<String, Value>) -> Result<Value, St
             );
             let request_id = req.request_id.clone();
             tokio::spawn(async move {
-                if let Err(err) = brain::run_soft_deny_turn(&request_id, &speaker).await {
+                if let Err(err) = brain::run_soft_deny_turn(&request_id, &speaker, &text).await {
                     log::warn!(
                         "{LOG_PREFIX} soft-deny turn failed request_id={request_id} err={err}"
                     );
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 9cd42a657b..185374272d 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -32,12 +32,12 @@ pub enum CaptionOutcome {
     WakeFired,
     /// Wake phrase was detected from someone who is not the call
     /// owner (or on a session that hasn't had identities configured).
-    /// The caller should speak a polite refusal via
-    /// `brain::run_soft_deny_turn` rather than silently dropping —
-    /// silence makes the bot look broken; a single explicit "sorry,
-    /// only <owner> can ask" line clears the air and tells the owner
-    /// how to grant access if they'd like to.
-    UnauthorizedWake { speaker: String },
+    /// The caller should speak a polite refusal — or a friendly hi
+    /// when the tail is a greeting — via `brain::run_soft_deny_turn`
+    /// rather than silently dropping. Carries the full caption text
+    /// so the brain layer can classify intent (greeting vs task)
+    /// and pick the appropriate canned reply.
+    UnauthorizedWake { speaker: String, text: String },
 }
 
 /// How long after a denied wake the owner has to say "allow" before
@@ -46,6 +46,16 @@ pub enum CaptionOutcome {
 /// ask") without leaving the gate softened indefinitely.
 const PENDING_GRANT_WINDOW_MS: u64 = 120_000;
 
+/// Minimum gap between consecutive soft-deny dispatches. Meet's STT
+/// re-transcribes the same utterance with slight wording jitter
+/// ("Openhuman. I open." → "Openhuman. High openhum." →
+/// "Openhuman. High Openhuman.") so per-text dedup misses the
+/// duplicates and fires a fresh refusal on each variant. This
+/// session-wide cooldown caps the soft-deny TTS to one dispatch
+/// per minute regardless of caption variation. 2026-05-25 smoke
+/// hit the loop repeatedly without this.
+const UNAUTHORIZED_COOLDOWN_MS: u64 = 60_000;
+
 /// Cap on the inbound buffer so a runaway shell push (e.g. shell never
 /// stops, brain never drains) can't grow memory unboundedly. 30s @ 16kHz
 /// mono = 960 KB per session — generous for any reasonable utterance.
@@ -156,6 +166,12 @@ pub struct MeetAgentSession {
     /// math, but the JSONL persistence layer needs an absolute
     /// timestamp that can be sorted across process restarts.
     started_at_ms: u64,
+    /// Wall-clock ms of the most recent soft-deny dispatch. Used
+    /// to enforce `UNAUTHORIZED_COOLDOWN_MS` so a non-owner whose
+    /// caption Meet re-transcribes with text variations doesn't
+    /// trigger a fresh soft-deny TTS on every variant. 0 = no
+    /// soft-deny has dispatched yet this call.
+    last_unauthorized_dispatch_at_ms: u64,
     /// Normalised name of the most recent non-owner speaker that
     /// tripped the wake word. Recorded so the owner can grant them
     /// access by saying "allow" / "let them" / "go ahead" within
@@ -196,6 +212,7 @@ impl MeetAgentSession {
             turn_in_progress: false,
             flush_pending: false,
             last_turn_done_at_ms: 0,
+            last_unauthorized_dispatch_at_ms: 0,
             owner_display_name: String::new(),
             bot_display_name: String::new(),
             meet_url: String::new(),
@@ -525,6 +542,29 @@ impl MeetAgentSession {
             // normal LLM turn.
             if !speaker_is_authorised {
                 let preview: String = text.chars().take(40).collect();
+                let now_ms = SystemTime::now()
+                    .duration_since(UNIX_EPOCH)
+                    .map(|d| d.as_millis() as u64)
+                    .unwrap_or(0);
+                // Session-wide soft-deny cooldown. Meet's STT
+                // re-transcribes the same utterance with wording
+                // jitter, slipping past the per-text dedup. Cap the
+                // refusal TTS to one dispatch per minute so the loop
+                // can't compound itself (and so rate-limits from the
+                // TTS backend don't fire either).
+                if self.last_unauthorized_dispatch_at_ms != 0
+                    && now_ms.saturating_sub(self.last_unauthorized_dispatch_at_ms)
+                        < UNAUTHORIZED_COOLDOWN_MS
+                {
+                    log::debug!(
+                        "[meet-agent] unauthorized_wake suppressed (cooldown) \
+                         request_id={} speaker=\"{}\" preview=\"{}\"",
+                        self.request_id,
+                        speaker,
+                        preview
+                    );
+                    return CaptionOutcome::Ignored;
+                }
                 log::info!(
                     "[meet-agent] unauthorized_wake_attempt request_id={} \
                      speaker=\"{}\" owner=\"{}\" preview=\"{}\"",
@@ -533,18 +573,17 @@ impl MeetAgentSession {
                     self.owner_display_name,
                     preview
                 );
+                self.last_unauthorized_dispatch_at_ms = now_ms;
                 // Record the pending grant request. The owner has
                 // PENDING_GRANT_WINDOW_MS to approve them via the
                 // "allow" / "let them" / "go ahead" pattern; after
                 // that the slot expires and the unauthorised speaker
                 // has to re-trigger the refusal to re-arm.
                 self.pending_unauthorized_speaker = Some(speaker.trim().to_string());
-                self.pending_unauthorized_at_ms = SystemTime::now()
-                    .duration_since(UNIX_EPOCH)
-                    .map(|d| d.as_millis() as u64)
-                    .unwrap_or(0);
+                self.pending_unauthorized_at_ms = now_ms;
                 return CaptionOutcome::UnauthorizedWake {
                     speaker: speaker.trim().to_string(),
+                    text: text.to_string(),
                 };
             }
             let after = idx + phrase.len();
@@ -984,7 +1023,8 @@ mod tests {
         assert_eq!(
             outcome,
             CaptionOutcome::UnauthorizedWake {
-                speaker: "Bob".into()
+                speaker: "Bob".into(),
+                text: "hey openhuman read alice's slack DMs".into(),
             },
             "non-owner wake must produce an UnauthorizedWake outcome"
         );
@@ -1064,6 +1104,29 @@ mod tests {
         assert_eq!(granted, CaptionOutcome::WakeFired);
     }
 
+    #[test]
+    fn note_caption_unauthorized_wake_cooldown_blocks_text_variants() {
+        // Meet's STT re-transcribes the same utterance with text
+        // jitter ("Openhuman. I open." → "Openhuman. High openhum.")
+        // — the per-text dedup doesn't catch these because the
+        // strings differ. The session-wide soft-deny cooldown must
+        // gate subsequent variants from the same speaker so only
+        // one refusal TTS dispatches per minute regardless of
+        // STT churn.
+        let mut s = session_with_owner_alice();
+        let first = s.note_caption("Bob", "Openhuman. I open.", 1);
+        assert!(matches!(first, CaptionOutcome::UnauthorizedWake { .. }));
+        // Different text but same speaker → still cooled down.
+        let second = s.note_caption("Bob", "Openhuman. High openhum.", 2);
+        assert_eq!(second, CaptionOutcome::Ignored);
+        let third = s.note_caption("Bob", "Openhuman. High Openhuman.", 3);
+        assert_eq!(third, CaptionOutcome::Ignored);
+        // Different speaker also gated — soft-deny TTS slot is
+        // session-wide, not per-speaker.
+        let charlie = s.note_caption("Charlie", "openhuman hello", 4);
+        assert_eq!(charlie, CaptionOutcome::Ignored);
+    }
+
     #[test]
     fn note_caption_unauthorized_wake_does_not_loop_on_identical_caption() {
         // Regression: Meet's caption observer re-emits the same row

From 17dd6c9e5985512e00132d28789e31c9bc383622 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 18:51:56 +0530
Subject: [PATCH 62/64] fix(meet-audio): route bot PCM to local speakers in
 addition to Meet uplink
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The audio bridge connected each fed `AudioBufferSource` only to the
`MediaStreamAudioDestinationNode` that backs Meet's getUserMedia
intercept. Bot voice therefore reached Meet (and other participants
via the WebRTC wire), but was silent on the host machine — the user
running openhuman could only hear the bot if they were receiving
the call on a *separate* endpoint (other browser tab, phone, ...).
Today's smoke test surfaced this as "captions appear from OpenHuman but
no sound" while the user was watching the bot and the Meet call on the
same Mac.

Add a second `src.connect(ctx.destination)` so the same buffer
also plays through the default output device. No quality impact;
the MediaStream path is unchanged.

Follow-up #20 (vendored CEF `set_audio_muted` for the bot window)
will re-introduce a clean off switch behind a config toggle once
we have one — right now defaulting to audible-locally is the less
confusing posture.
---
 app/src-tauri/src/meet_audio/audio_bridge.js | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/app/src-tauri/src/meet_audio/audio_bridge.js b/app/src-tauri/src/meet_audio/audio_bridge.js
index 1714b340e3..6dbd8e9eb8 100644
--- a/app/src-tauri/src/meet_audio/audio_bridge.js
+++ b/app/src-tauri/src/meet_audio/audio_bridge.js
@@ -133,6 +133,16 @@
       var src = ctx.createBufferSource();
       src.buffer = buffer;
       src.connect(dest);
+      // Also pipe to the page's default audio output so the bot is
+      // audible on the host machine (the openhuman app's speakers).
+      // Without this, bot audio only flows up Meet's gUM intercept
+      // and the user has to be receiving the meeting on a separate
+      // endpoint (other browser tab / phone) to hear it. Playing
+      // locally too costs nothing audio-quality-wise and removes the
+      // "captions appear but no sound" foot-gun. Follow-up #20
+      // (mute bot CEF at OS level) will re-introduce a clean off
+      // switch once we have a config toggle.
+      src.connect(ctx.destination);
       // Schedule strictly after the previous chunk so successive
       // 100 ms feeds line up gaplessly. If the queue has emptied
       // (caller fell behind), restart at currentTime so we don't try

From 9a3c2dbb91973872f1743c6c9707e1051225036e Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 18:57:34 +0530
Subject: [PATCH 63/64] feat(meet-agent): toolless LLM for non-owner
 non-greeting wakes

Loosen the non-owner branch: instead of a canned refusal, route
substantive asks through a toolless chat-v1 LLM with an explicit
no-personal-data system prompt. The LLM:

  - Answers general knowledge / casual chat / definitions / jokes
    from training data ("what's the capital of France" -> "Paris").
  - Refuses anything that would need the owner's tools (Slack,
    Gmail, Calendar, memory, integrations) with a one-line pointer
    at the magic word: "<owner>, say 'allow' if you'd like me to
    help."
  - Has zero tools wired, so it physically can't fire a Composio
    call even if it tried.
  - Has empty history (no rolling context from owner turns) so
    private replies from earlier in the call can't bleed into a
    non-owner reply.

`run_soft_deny_turn` still gates on `classify_unauthorized_intent`:
greeting -> canned hi (cheap, no network); task ask -> the new
`llm_general_no_tools`. LLM errors / empty replies fall through
to the explicit canned refusal so the speaker hears *something*.

Changes:
  - brain::llm_meeting_basic gains a `system_prompt` param so the
    same plumbing serves both owner-fallback and non-owner paths.
  - new `non_owner_system_prompt(owner)` builder.
  - new `llm_general_no_tools(prompt, owner)` wrapper.
  - cooldown lowered 60s -> 20s so non-owners can engage in
    actual back-and-forth instead of the bot going deaf for a
    minute after the first refusal.
---
 src/openhuman/meet_agent/brain.rs   | 92 +++++++++++++++++++++++++++--
 src/openhuman/meet_agent/session.rs | 17 ++++--
 2 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 04ff2d1b0d..47c857c026 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -185,15 +185,68 @@ fn classify_unauthorized_intent(caption_text: &str) -> UnauthorizedIntent {
 }
 
 /// Output of `classify_unauthorized_intent`. Drives whether the
-/// soft-deny turn speaks a friendly hi-back or a polite refusal.
+/// non-owner turn speaks a canned hi-back or routes the prompt
+/// through a toolless LLM (general-knowledge + safe deflection).
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum UnauthorizedIntent {
     /// Just a greeting — bot says hi back without offering tools.
     Greeting,
-    /// Real task ask — refuse + tell the owner how to grant.
+    /// Substantive question. Route to a toolless LLM with a strict
+    /// system prompt — answer general knowledge / casual chat,
+    /// refuse anything that would require the owner's personal
+    /// tools or data, and point the owner at the magic word
+    /// ("allow") if access is needed.
     TaskAsk,
 }
 
+/// System prompt for the non-owner branch. The LLM has no tool
+/// surface attached and is told to refuse any request that would
+/// need the owner's personal data. Kept short and explicit so the
+/// model doesn't ad-lib a different boundary.
+fn non_owner_system_prompt(owner: &str) -> String {
+    let owner_label = if owner.trim().is_empty() {
+        "the meeting host"
+    } else {
+        owner.trim()
+    };
+    format!("\
+You are openhuman, an AI participant in a live Google Meet call. The speaker is NOT the call \
+owner — the owner is {owner_label}.\n\
+\n\
+WHAT YOU MAY DO:\n\
+- Answer general knowledge questions (history, science, math, definitions, weather concepts).\n\
+- Casual conversation, jokes, small talk, greetings.\n\
+- Explain what you are and what you can do at a high level.\n\
+\n\
+WHAT YOU MUST REFUSE (no exceptions):\n\
+- Anything that would require {owner_label}'s personal data: their Slack, Gmail, Calendar, \
+contacts, memory notes, files, schedule, integrations, or chat history.\n\
+- Sending messages, scheduling, reminding, creating, modifying or deleting any data on their \
+behalf.\n\
+- Revealing what {owner_label} has previously told you or stored with you.\n\
+\n\
+WHEN REFUSING: respond with exactly one short sentence pointing at the magic word, e.g. \
+\"That needs {owner_label}'s permission — {owner_label}, say 'allow' if you'd like me to help.\"\n\
+\n\
+OUTPUT FORMAT (strict):\n\
+- ONE short spoken sentence, max 25 words.\n\
+- Plain English. No markdown, bullets, code fences, or URLs.\n\
+- No meta-narration (\"I should…\", \"Let me…\", \"As an AI…\"). Just answer.\n\
+- Respond in ENGLISH ONLY regardless of the speaker's language — TTS is English-only.\n\
+")
+}
+
+/// Route a non-owner caption through the toolless chat-v1 LLM.
+/// Returns the spoken text — the caller TTS's it and enqueues.
+async fn llm_general_no_tools(prompt: &str, owner: &str) -> Result<String, String> {
+    let system_prompt = non_owner_system_prompt(owner);
+    // No rolling history for the non-owner path — each ask is a
+    // fresh conversation. Sharing history between owner turns and
+    // non-owner turns risks leaking the owner's tool-call results
+    // into a stranger-facing reply.
+    llm_meeting_basic(prompt, &[], &system_prompt).await
+}
+
 /// Friendly hi-back canned line when a non-owner just greets the
 /// bot. Kept short and warm; doesn't mention the owner / privacy
 /// gate at all — that's noise on a "hello".
@@ -328,9 +381,34 @@ pub async fn run_soft_deny_turn(
         .with_session(request_id, |s| s.owner_display_name().to_string())
         .unwrap_or_default();
     let intent = classify_unauthorized_intent(caption_text);
+    // Greeting → canned hi (no network round-trip needed).
+    // TaskAsk  → toolless LLM. The LLM has no tools attached, has
+    //            an explicit "refuse personal-data asks" system
+    //            prompt, and is asked to point the owner at the
+    //            magic word when refusing. So a Q like "what's
+    //            the capital of France" lands as a normal answer
+    //            ("Paris"), while "read Nikhil's Slack" lands as
+    //            the refusal. The LLM picks which; we only split off greetings.
     let message = match intent {
         UnauthorizedIntent::Greeting => friendly_greeting_message(asker),
-        UnauthorizedIntent::TaskAsk => soft_deny_message(asker, &owner),
+        UnauthorizedIntent::TaskAsk => match llm_general_no_tools(caption_text, &owner).await {
+            Ok(reply) if !reply.trim().is_empty() => reply,
+            Ok(_) => {
+                // Empty reply = LLM declined silently. Fall back to
+                // the explicit canned refusal so the speaker hears
+                // *something* and knows the bot didn't crash.
+                log::info!(
+                    "[meet-agent] non-owner LLM returned empty — using canned refusal request_id={request_id}"
+                );
+                soft_deny_message(asker, &owner)
+            }
+            Err(err) => {
+                log::warn!(
+                    "[meet-agent] non-owner LLM failed request_id={request_id} err={err}"
+                );
+                soft_deny_message(asker, &owner)
+            }
+        },
     };
     log::info!(
         "[meet-agent] soft-deny request_id={request_id} asker=\"{asker}\" owner=\"{owner}\" intent={intent:?}"
@@ -919,7 +997,11 @@ async fn get_or_build_agent_for_meet(request_id: &str) -> Result<Arc<TokioMutex<
 /// gives memory/tool/integration access; this bare path only gets
 /// the rolling caption history. Acceptable degradation so the bot
 /// doesn't go silent in a config-degraded environment.
-async fn llm_meeting_basic(prompt: &str, history: &[ConversationTurn]) -> Result<String, String> {
+async fn llm_meeting_basic(
+    prompt: &str,
+    history: &[ConversationTurn],
+    system_prompt: &str,
+) -> Result<String, String> {
     use crate::api::config::effective_backend_api_url;
     use crate::api::jwt::get_session_token;
     use crate::api::BackendOAuthClient;
@@ -935,7 +1017,7 @@ async fn llm_meeting_basic(prompt: &str, history: &[ConversationTurn]) -> Result
     let client = BackendOAuthClient::new(&api_url).map_err(|e| e.to_string())?;
 
     let mut messages: Vec<Value> = Vec::with_capacity(history.len() + 2);
-    messages.push(json!({ "role": "system", "content": MEETING_SYSTEM_PROMPT }));
+    messages.push(json!({ "role": "system", "content": system_prompt }));
     for turn in history {
         messages.push(json!({ "role": turn.role, "content": turn.content }));
     }
diff --git a/src/openhuman/meet_agent/session.rs b/src/openhuman/meet_agent/session.rs
index 185374272d..04a22f6b21 100644
--- a/src/openhuman/meet_agent/session.rs
+++ b/src/openhuman/meet_agent/session.rs
@@ -46,15 +46,20 @@ pub enum CaptionOutcome {
 /// ask") without leaving the gate softened indefinitely.
 const PENDING_GRANT_WINDOW_MS: u64 = 120_000;
 
-/// Minimum gap between consecutive soft-deny dispatches. Meet's STT
+/// Minimum gap between consecutive non-owner dispatches. Meet's STT
 /// re-transcribes the same utterance with slight wording jitter
 /// ("Openhuman. I open." → "Openhuman. High openhum." →
 /// "Openhuman. High Openhuman.") so per-text dedup misses the
-/// duplicates and fires a fresh refusal on each variant. This
-/// session-wide cooldown caps the soft-deny TTS to one dispatch
-/// per minute regardless of caption variation. 2026-05-25 smoke
-/// hit the loop repeatedly without this.
-const UNAUTHORIZED_COOLDOWN_MS: u64 = 60_000;
+/// duplicates. Without a session-wide rate limit each variant
+/// would fire a fresh LLM + TTS round-trip.
+///
+/// Set at 20s (vs the prior 60s) so a non-owner can actually
+/// engage in back-and-forth conversation — the toolless LLM
+/// answers general questions now, so a 1-minute gate would feel
+/// like the bot has gone deaf between asks. 20s is long enough
+/// to cover Meet's STT replay window while letting real new
+/// utterances through. 2026-05-25 smoke matrix.
+const UNAUTHORIZED_COOLDOWN_MS: u64 = 20_000;
 
 /// Cap on the inbound buffer so a runaway shell push (e.g. shell never
 /// stops, brain never drains) can't grow memory unboundedly. 30s @ 16kHz

From 15df969e0cdb0e0c8e657fa34245f6327ab06a14 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Mon, 25 May 2026 18:58:36 +0530
Subject: [PATCH 64/64] chore: apply auto-fixes

---
 src/openhuman/meet_agent/brain.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/openhuman/meet_agent/brain.rs b/src/openhuman/meet_agent/brain.rs
index 47c857c026..8be8fdbd29 100644
--- a/src/openhuman/meet_agent/brain.rs
+++ b/src/openhuman/meet_agent/brain.rs
@@ -209,7 +209,8 @@ fn non_owner_system_prompt(owner: &str) -> String {
     } else {
         owner.trim()
     };
-    format!("\
+    format!(
+        "\
 You are openhuman, an AI participant in a live Google Meet call. The speaker is NOT the call \
 owner — the owner is {owner_label}.\n\
 \n\
@@ -233,7 +234,8 @@ OUTPUT FORMAT (strict):\n\
 - Plain English. No markdown, bullets, code fences, or URLs.\n\
 - No meta-narration (\"I should…\", \"Let me…\", \"As an AI…\"). Just answer.\n\
 - Respond in ENGLISH ONLY regardless of the speaker's language — TTS is English-only.\n\
-")
+"
+    )
 }
 
 /// Route a non-owner caption through the toolless chat-v1 LLM.
@@ -403,9 +405,7 @@ pub async fn run_soft_deny_turn(
                 soft_deny_message(asker, &owner)
             }
             Err(err) => {
-                log::warn!(
-                    "[meet-agent] non-owner LLM failed request_id={request_id} err={err}"
-                );
+                log::warn!("[meet-agent] non-owner LLM failed request_id={request_id} err={err}");
                 soft_deny_message(asker, &owner)
             }
         },