From d715765941ea317edeebb38ffd7b0974ab38b23f Mon Sep 17 00:00:00 2001 From: tsushanth <78000697+tsushanth@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:37:50 -0700 Subject: [PATCH] fix(voice): VAD-mode minEndpointingDelay collapses to ~0 (#1741) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In VAD-based turn detection, bounceEOUTask runs at VAD END_OF_SPEECH, which Silero emits `minSilenceDuration` (~550 ms) after the user actually stops. lastSpeakingTime is stamped earlier — at VAD INFERENCE_DONE. The post-EOS delay was computed as extraSleep = endpointingDelay + (lastSpeakingTime - Date.now()) so it collapsed to `endpointingDelay - elapsedSilence` ≈ −50 ms with the defaults (minDelay=500, minSilenceDuration=550). The turn committed the instant END_OF_SPEECH fired and any natural mid-sentence pause — or even any silence shorter than the configured min delay — split into two segments. With realtime models using manual activity detection, the second segment's userTurnCompleted never fires and the agent never responds. Skip the elapsed-since-speech adjustment in VAD mode so `minDelay` actually provides a real post-EOS grouping window that an upcoming START_OF_SPEECH can cancel. STT mode keeps the adjustment — there it correctly compensates for transcription latency between INFERENCE_DONE and END_OF_SPEECH on the STT side. Adds two regression tests in audio_recognition_endpointing_delay.test.ts: a #1741 repro that fails on main (~2 ms vs the required ≥250 ms), and a guard for the STT path so the fix can't regress that branch. 
Closes #1741 --- .../fix-vad-endpointing-delay-collapse.md | 23 +++ agents/src/voice/audio_recognition.ts | 16 +- ...udio_recognition_endpointing_delay.test.ts | 142 ++++++++++++++++++ 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 .changeset/fix-vad-endpointing-delay-collapse.md create mode 100644 agents/src/voice/audio_recognition_endpointing_delay.test.ts diff --git a/.changeset/fix-vad-endpointing-delay-collapse.md b/.changeset/fix-vad-endpointing-delay-collapse.md new file mode 100644 index 000000000..7f1b6e9e6 --- /dev/null +++ b/.changeset/fix-vad-endpointing-delay-collapse.md @@ -0,0 +1,23 @@ +--- +'@livekit/agents': patch +--- + +Fix `minEndpointingDelay` being silently ignored in VAD-based turn +detection. `bounceEOUTask` runs at VAD `END_OF_SPEECH`, which Silero +emits `minSilenceDuration` (~550 ms) after the user stops, but the +post-EOS delay was computed as +`extraSleep = endpointingDelay + (lastSpeakingTime - Date.now())`, +shrinking the post-EOS grouping window to roughly `minDelay − minSilenceDuration` +and committing the turn the instant `END_OF_SPEECH` fired whenever that is ≤ 0. With the +default `minDelay = 500` and `minSilenceDuration = 550`, the effective +post-EOS window was `~−50 ms` (i.e. none at all) — so a natural mid-sentence pause (and +even silences shorter than the configured min delay) split into two +segments. With realtime models using manual activity detection, the +second segment's `userTurnCompleted` never fires and the agent never +responds (#1741). + +Skip the elapsed-since-speech adjustment in VAD mode so `minDelay` +provides a real post-EOS grouping window that an upcoming +`START_OF_SPEECH` can cancel. STT mode keeps the adjustment — there +it correctly compensates for transcription latency, and a new +regression test guards that path. 
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index c85d926bd..9eae6f201 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -1153,7 +1153,21 @@ export class AudioRecognition { } let extraSleep = endpointingDelay; - if (lastSpeakingTime !== undefined) { + // In STT-based turn detection, lastSpeakingTime is roughly when the + // user stopped, but bounceEOUTask runs from STT's INFERENCE_DONE + // event — subtracting elapsed time compensates for transcription + // latency so the post-speech window stays ~endpointingDelay long. + // + // In VAD-based turn detection, lastSpeakingTime is stamped at + // VAD INFERENCE_DONE and bounceEOUTask runs at VAD END_OF_SPEECH, + // which Silero emits `minSilenceDuration` later. Subtracting that + // elapsed silence collapses the grouping window to ~0 (or negative) + // and commits the turn the instant END_OF_SPEECH fires — so a + // mid-sentence pause splits into two segments and the post-EOS + // start-of-speech can no longer cancel the pending commit. Skip + // the adjustment in VAD mode so `minEndpointingDelay` actually + // provides a real post-EOS grouping window. + if (lastSpeakingTime !== undefined && !this.vadBaseTurnDetection) { extraSleep += lastSpeakingTime - Date.now(); } diff --git a/agents/src/voice/audio_recognition_endpointing_delay.test.ts b/agents/src/voice/audio_recognition_endpointing_delay.test.ts new file mode 100644 index 000000000..fdc9c9b45 --- /dev/null +++ b/agents/src/voice/audio_recognition_endpointing_delay.test.ts @@ -0,0 +1,142 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { ParticipantKind } from '@livekit/rtc-node'; +import { describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import type { VAD } from '../vad.js'; +import { + AudioRecognition, + type AudioRecognitionOptions, + type RecognitionHooks, + type TurnDetectionMode, +} from './audio_recognition.js'; + +/** Private members of AudioRecognition the tests poke at to drive the EOU task. */ +interface RecognitionInternals { + vad?: VAD; + lastSpeakingTime?: number; + lastFinalTranscriptTime: number; + audioTranscript: string; + finalTranscriptConfidence: number[]; + bounceEOUTask?: { + result: Promise<void>; + cancel: () => void; + cancelAndWait: () => Promise<void>; + }; + runEOUDetection: (ctx: ChatContext) => void; +} + +function makeHooks(): { hooks: RecognitionHooks; onEndOfTurn: ReturnType<typeof vi.fn> } { + const onEndOfTurn = vi.fn(async () => true); + const hooks: RecognitionHooks = { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onPreemptiveGeneration: vi.fn(), + onUserTurnExceeded: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + onEndOfTurn, + }; + return { hooks, onEndOfTurn }; +} + +function makeRecognition(opts: { + turnDetectionMode: TurnDetectionMode; + minEndpointingDelay: number; +}): { + recognition: AudioRecognition; + internals: RecognitionInternals; + onEndOfTurn: ReturnType<typeof vi.fn>; +} { + const { hooks, onEndOfTurn } = makeHooks(); + const recognitionOpts: AudioRecognitionOptions = { + recognitionHooks: hooks, + stt: undefined, + vad: undefined, + interruptionDetection: undefined, + turnDetectionMode: opts.turnDetectionMode, + minEndpointingDelay: opts.minEndpointingDelay, + maxEndpointingDelay: opts.minEndpointingDelay, + getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT 
}), + }; + const recognition = new AudioRecognition(recognitionOpts); + return { + recognition, + internals: recognition as unknown as RecognitionInternals, + onEndOfTurn, + }; +} + +describe('AudioRecognition bounceEOUTask endpointing delay (#1741)', () => { + initializeLogger({ pretty: false, level: 'silent' }); + + it('VAD mode: minEndpointingDelay survives end-of-speech silence (regression for #1741)', async () => { + // Repro: Silero's minSilenceDuration (~550 ms) has already elapsed by + // the time bounceEOUTask is invoked at VAD END_OF_SPEECH. Before the + // fix, `extraSleep += lastSpeakingTime - Date.now()` collapsed the + // post-EOS window to (minDelay − elapsedSilence) ≈ −250 ms with the + // values below — so the turn committed the instant END_OF_SPEECH + // fired and any mid-sentence pause split into two segments. + const minDelay = 300; + const elapsedSilence = 550; + + const { internals, onEndOfTurn } = makeRecognition({ + turnDetectionMode: 'vad', + minEndpointingDelay: minDelay, + }); + // VAD must be truthy for vadBaseTurnDetection to take the fix branch. + internals.vad = {} as VAD; + internals.lastSpeakingTime = Date.now() - elapsedSilence; + internals.lastFinalTranscriptTime = 0; + internals.audioTranscript = ''; + internals.finalTranscriptConfidence = []; + + const start = Date.now(); + internals.runEOUDetection(ChatContext.empty()); + // Wait for the task to settle. + await internals.bounceEOUTask!.result.catch(() => {}); + const elapsed = Date.now() - start; + + expect(onEndOfTurn).toHaveBeenCalledTimes(1); + // The post-EOS grouping window must be roughly the configured minDelay, + // independent of how long Silero waited before emitting END_OF_SPEECH. + // Allow generous slack for timer scheduling jitter. 
+ expect(elapsed).toBeGreaterThanOrEqual(minDelay - 50); + expect(elapsed).toBeLessThan(minDelay + 250); + }, 10_000); + + it('STT mode: endpointing delay still compensates for transcription latency', async () => { + // STT mode's adjustment is intentional — bounceEOUTask runs from STT's + // INFERENCE_DONE event, so subtracting elapsed time keeps the post- + // speech window roughly `minDelay` long even when transcription took + // a while. This test guards the fix from regressing STT-mode behaviour. + const minDelay = 400; + const elapsedSinceSpeech = 150; + + const { internals, onEndOfTurn } = makeRecognition({ + turnDetectionMode: 'stt', + minEndpointingDelay: minDelay, + }); + // No VAD — STT mode path. (vad undefined keeps vadBaseTurnDetection false.) + internals.vad = undefined; + internals.lastSpeakingTime = Date.now() - elapsedSinceSpeech; + internals.lastFinalTranscriptTime = 0; + internals.audioTranscript = ''; + internals.finalTranscriptConfidence = []; + + const start = Date.now(); + internals.runEOUDetection(ChatContext.empty()); + await internals.bounceEOUTask!.result.catch(() => {}); + const elapsed = Date.now() - start; + + const expected = minDelay - elapsedSinceSpeech; + expect(onEndOfTurn).toHaveBeenCalledTimes(1); + expect(elapsed).toBeGreaterThanOrEqual(expected - 50); + expect(elapsed).toBeLessThan(expected + 250); + }, 10_000); +});