From d715765941ea317edeebb38ffd7b0974ab38b23f Mon Sep 17 00:00:00 2001
From: tsushanth <78000697+tsushanth@users.noreply.github.com>
Date: Thu, 11 Jun 2026 11:37:50 -0700
Subject: [PATCH] fix(voice): VAD-mode minEndpointingDelay collapses to ~0
 (#1741)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In VAD-based turn detection, bounceEOUTask runs at VAD END_OF_SPEECH,
which Silero emits `minSilenceDuration` (~550 ms) after the user
actually stops. lastSpeakingTime is stamped earlier — at VAD
INFERENCE_DONE. The post-EOS delay was computed as

    extraSleep = endpointingDelay + (lastSpeakingTime - Date.now())

so it collapsed to `endpointingDelay - elapsedSilence` ≈ −50 ms with
the defaults (minDelay=500, minSilenceDuration=550). The turn committed
the instant END_OF_SPEECH fired and any natural mid-sentence pause —
or even any silence shorter than the configured min delay — split into
two segments. With realtime models using manual activity detection,
the second segment's userTurnCompleted never fires and the agent never
responds.

Skip the elapsed-since-speech adjustment in VAD mode so `minDelay`
actually provides a real post-EOS grouping window that an upcoming
START_OF_SPEECH can cancel. STT mode keeps the adjustment — there it
correctly compensates for transcription latency between
INFERENCE_DONE and END_OF_SPEECH on the STT side. Adds two regression
tests in audio_recognition_endpointing_delay.test.ts: a #1741 repro
that fails on main (~2 ms vs the required ≥250 ms), and a guard for
the STT path so the fix can't regress that branch.

Closes #1741
---
 .../fix-vad-endpointing-delay-collapse.md     |  23 +++
 agents/src/voice/audio_recognition.ts         |  16 +-
 ...udio_recognition_endpointing_delay.test.ts | 142 ++++++++++++++++++
 3 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 .changeset/fix-vad-endpointing-delay-collapse.md
 create mode 100644 agents/src/voice/audio_recognition_endpointing_delay.test.ts

diff --git a/.changeset/fix-vad-endpointing-delay-collapse.md b/.changeset/fix-vad-endpointing-delay-collapse.md
new file mode 100644
index 000000000..7f1b6e9e6
--- /dev/null
+++ b/.changeset/fix-vad-endpointing-delay-collapse.md
@@ -0,0 +1,23 @@
+---
+'@livekit/agents': patch
+---
+
+Fix `minEndpointingDelay` being silently ignored in VAD-based turn
+detection. `bounceEOUTask` runs at VAD `END_OF_SPEECH`, which Silero
+emits `minSilenceDuration` (~550 ms) after the user stops, but the
+post-EOS delay was computed as
+`extraSleep = endpointingDelay + (lastSpeakingTime - Date.now())`,
+collapsing the post-EOS wait to `minDelay - minSilenceDuration` (a total window
+of `~max(minSilenceDuration, minDelay)` since the user stopped) and committing
+the turn the instant `END_OF_SPEECH` fired. With the default `minDelay = 500` and
+`minSilenceDuration = 550`, that wait was `~−50 ms` — so a natural mid-sentence pause (and
+even silences shorter than the configured min delay) split into two
+segments. With realtime models using manual activity detection, the
+second segment's `userTurnCompleted` never fires and the agent never
+responds (#1741).
+
+Skip the elapsed-since-speech adjustment in VAD mode so `minDelay`
+provides a real post-EOS grouping window that an upcoming
+`START_OF_SPEECH` can cancel. STT mode keeps the adjustment — there the
+adjustment correctly compensates for transcription latency, and a new
+regression test guards that path.
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts
index c85d926bd..9eae6f201 100644
--- a/agents/src/voice/audio_recognition.ts
+++ b/agents/src/voice/audio_recognition.ts
@@ -1153,7 +1153,21 @@ export class AudioRecognition {
         }
 
         let extraSleep = endpointingDelay;
-        if (lastSpeakingTime !== undefined) {
+        // In STT-based turn detection, lastSpeakingTime is roughly when the
+        // user stopped, but bounceEOUTask runs from STT's INFERENCE_DONE
+        // event — subtracting elapsed time compensates for transcription
+        // latency so the post-speech window stays ~endpointingDelay long.
+        //
+        // In VAD-based turn detection, lastSpeakingTime is stamped at
+        // VAD INFERENCE_DONE and bounceEOUTask runs at VAD END_OF_SPEECH,
+        // which Silero emits `minSilenceDuration` later. Subtracting that
+        // elapsed silence collapses the grouping window to ~0 (or negative)
+        // and commits the turn the instant END_OF_SPEECH fires — so a
+        // mid-sentence pause splits into two segments and the post-EOS
+        // start-of-speech can no longer cancel the pending commit. Skip
+        // the adjustment in VAD mode so `minEndpointingDelay` actually
+        // provides a real post-EOS grouping window.
+        if (lastSpeakingTime !== undefined && !this.vadBaseTurnDetection) {
           extraSleep += lastSpeakingTime - Date.now();
         }
 
diff --git a/agents/src/voice/audio_recognition_endpointing_delay.test.ts b/agents/src/voice/audio_recognition_endpointing_delay.test.ts
new file mode 100644
index 000000000..fdc9c9b45
--- /dev/null
+++ b/agents/src/voice/audio_recognition_endpointing_delay.test.ts
@@ -0,0 +1,142 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ParticipantKind } from '@livekit/rtc-node';
+import { describe, expect, it, vi } from 'vitest';
+import { ChatContext } from '../llm/chat_context.js';
+import { initializeLogger } from '../log.js';
+import type { VAD } from '../vad.js';
+import {
+  AudioRecognition,
+  type AudioRecognitionOptions,
+  type RecognitionHooks,
+  type TurnDetectionMode,
+} from './audio_recognition.js';
+
+/** Test-only view of the AudioRecognition privates used to seed state and run EOU detection. */
+interface RecognitionInternals {
+  runEOUDetection: (ctx: ChatContext) => void;
+  bounceEOUTask?: {
+    cancelAndWait: () => Promise<void>;
+    cancel: () => void;
+    result: Promise<void>;
+  };
+  audioTranscript: string;
+  finalTranscriptConfidence: number[];
+  lastFinalTranscriptTime: number;
+  lastSpeakingTime?: number;
+  vad?: VAD;
+}
+
+/** Creates RecognitionHooks backed by vitest spies and hands back the onEndOfTurn spy. */
+function makeHooks(): { hooks: RecognitionHooks; onEndOfTurn: ReturnType<typeof vi.fn> } {
+  const hooks = {
+    onEndOfTurn: vi.fn(async () => true),
+    retrieveChatCtx: () => ChatContext.empty(),
+    onInterruption: vi.fn(),
+    onStartOfSpeech: vi.fn(),
+    onVADInferenceDone: vi.fn(),
+    onEndOfSpeech: vi.fn(),
+    onInterimTranscript: vi.fn(),
+    onFinalTranscript: vi.fn(),
+    onPreemptiveGeneration: vi.fn(),
+    onUserTurnExceeded: vi.fn(),
+  };
+  return { hooks, onEndOfTurn: hooks.onEndOfTurn };
+}
+
+/**
+ * Builds an AudioRecognition without STT/VAD whose min and max endpointing delays are equal.
+ */
+function makeRecognition(opts: {
+  turnDetectionMode: TurnDetectionMode;
+  minEndpointingDelay: number;
+}): {
+  recognition: AudioRecognition;
+  internals: RecognitionInternals;
+  onEndOfTurn: ReturnType<typeof vi.fn>;
+} {
+  const { hooks, onEndOfTurn } = makeHooks();
+  const options: AudioRecognitionOptions = {
+    recognitionHooks: hooks,
+    turnDetectionMode: opts.turnDetectionMode,
+    minEndpointingDelay: opts.minEndpointingDelay,
+    maxEndpointingDelay: opts.minEndpointingDelay,
+    stt: undefined,
+    vad: undefined,
+    interruptionDetection: undefined,
+    getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),
+  };
+  const recognition = new AudioRecognition(options);
+  const internals = recognition as unknown as RecognitionInternals;
+  return { recognition, internals, onEndOfTurn };
+}
+
+describe('AudioRecognition bounceEOUTask endpointing delay (#1741)', () => {
+  initializeLogger({ pretty: false, level: 'silent' });
+
+  it('VAD mode: minEndpointingDelay survives end-of-speech silence (regression for #1741)', async () => {
+    // #1741 scenario: bounceEOUTask is invoked at VAD END_OF_SPEECH, by which
+    // point Silero's minSilenceDuration (~550 ms) has already passed. The old
+    // `extraSleep += lastSpeakingTime - Date.now()` then left a wait of
+    // (configuredDelayMs − silenceBeforeEosMs) ≈ −250 ms, i.e. the turn was
+    // committed at once and a mid-sentence pause was split into two segments.
+    const configuredDelayMs = 300;
+    const silenceBeforeEosMs = 550;
+
+    const { internals, onEndOfTurn } = makeRecognition({
+      minEndpointingDelay: configuredDelayMs,
+      turnDetectionMode: 'vad',
+    });
+    internals.audioTranscript = '';
+    internals.finalTranscriptConfidence = [];
+    internals.lastFinalTranscriptTime = 0;
+    // A truthy vad is required for vadBaseTurnDetection to select the fixed branch.
+    internals.vad = {} as VAD;
+    internals.lastSpeakingTime = Date.now() - silenceBeforeEosMs;
+
+    const startedAt = Date.now();
+    internals.runEOUDetection(ChatContext.empty());
+    // Block until the bounce task finishes; a rejection is deliberately ignored.
+    await internals.bounceEOUTask!.result.catch(() => {});
+    const waitedMs = Date.now() - startedAt;
+
+    expect(onEndOfTurn).toHaveBeenCalledTimes(1);
+    // The wait must track the configured delay regardless of how long Silero
+    // held END_OF_SPEECH back; the bounds leave room for timer jitter.
+    const earliestMs = configuredDelayMs - 50;
+    const latestMs = configuredDelayMs + 250;
+    expect(waitedMs).toBeGreaterThanOrEqual(earliestMs);
+    expect(waitedMs).toBeLessThan(latestMs);
+  }, 10_000);
+
+  it('STT mode: endpointing delay still compensates for transcription latency', async () => {
+    // STT mode keeps `extraSleep += lastSpeakingTime - Date.now()` on purpose, so
+    // the wait shrinks by the time already elapsed since lastSpeakingTime.
+    // elapsedSinceSpeech is chosen so that `expected + 250` stays below minDelay:
+    // if the adjustment is ever skipped here (~minDelay wait), the upper bound fails.
+    const minDelay = 400;
+    const elapsedSinceSpeech = 300;
+
+    const { internals, onEndOfTurn } = makeRecognition({
+      turnDetectionMode: 'stt',
+      minEndpointingDelay: minDelay,
+    });
+    // No VAD — STT mode path. (vad undefined keeps vadBaseTurnDetection false.)
+    internals.vad = undefined;
+    internals.lastSpeakingTime = Date.now() - elapsedSinceSpeech;
+    internals.lastFinalTranscriptTime = 0;
+    internals.audioTranscript = '';
+    internals.finalTranscriptConfidence = [];
+
+    const start = Date.now();
+    internals.runEOUDetection(ChatContext.empty());
+    await internals.bounceEOUTask!.result.catch(() => {});
+    const elapsed = Date.now() - start;
+
+    const expected = minDelay - elapsedSinceSpeech;
+    expect(onEndOfTurn).toHaveBeenCalledTimes(1);
+    expect(elapsed).toBeGreaterThanOrEqual(expected - 50);
+    expect(elapsed).toBeLessThan(expected + 250);
+  }, 10_000);
+});