diff --git a/.changeset/audio-eou.md b/.changeset/audio-eou.md new file mode 100644 index 000000000..3afe5b4fc --- /dev/null +++ b/.changeset/audio-eou.md @@ -0,0 +1,17 @@ +--- +"@livekit/agents": patch +"@livekit/agents-plugin-silero": patch +"@livekit/agents-plugins-livekit": patch +--- + +feat(core): audio end-of-turn detection with cloud → local fallback (AGT-2520) + +- New `inference.TurnDetector`: WebSocket cloud EOT transport (`version: 'v1'`, model name `turn-detector-v1`) with automatic fallback to the local native model (`version: 'v1-mini'`, model name `turn-detector-v1-mini`) via `@livekit/local-inference`. Auto-selects `'v1'` when `LIVEKIT_REMOTE_EOT_URL` is set, `'v1-mini'` otherwise. The `version` is the constructor knob; telemetry/billing report the full model name via `detector.model`. +- The local EOT model runs in the shared inference process (the same `InferenceProcExecutor` the text turn detector uses), loaded once per worker host (~138 MB) instead of in every job worker. The runner is registered by default when the native binding is available, so the inference process spawns on worker startup; on platforms where the binding can't load, local EOT degrades to a positive-default prediction and the worker still starts. (This is a JS-specific divergence from Python, which keeps EOT in-process and relies on forkserver COW sharing.) +- No prewarm helpers: EOT auto-warms in the inference process; the in-process silero VAD lazy-loads on first stream. (The `inference.prewarm*` helpers added during development were removed before release.) +- New `inference.VAD` (local-only streaming VAD via `@livekit/local-inference`). +- `AgentSession` now auto-provisions a bundled silero VAD when `vad` is omitted (`isDefault=true`). Pass `vad: null` to opt out. +- `@livekit/agents-plugin-silero` is deprecated; pass `vad: null` to opt out of the bundled default, or use `inference.VAD({ model: 'silero', ... })` to customise. 
+- `@livekit/agents-plugins-livekit` turn detector is deprecated in favor of `inference.TurnDetector`. +- New `EOTInferenceMetrics` and `EOTModelUsage`; new telemetry span attributes (`lk.eou.source`, `lk.eou.from_cache`, `lk.eou.detection_delay`); new `eot_prediction` event forwarded over remote sessions. +- Requires `@livekit/protocol` >= 1.46.5 (exposes the `AgentInference` message namespace used by the cloud transport, including the server-provided `SessionCreated` default thresholds). diff --git a/MODEL_LICENSE b/MODEL_LICENSE new file mode 100644 index 000000000..44bea4802 --- /dev/null +++ b/MODEL_LICENSE @@ -0,0 +1,113 @@ +LIVEKIT MODEL LICENSE AGREEMENT + +1. Introduction + + LiveKit Incorporated ("LiveKit") is making available its proprietary models for + use pursuant to the terms and conditions of this Agreement. As further + described below, you may use these LiveKit models freely but can only use them + together with the LiveKit Agents framework. You cannot use the LiveKit models + on a standalone basis or with any other frameworks. + + BY CLICKING "I ACCEPT," OR BY DOWNLOADING, INSTALLING, OR OTHERWISE ACCESSING + OR USING THE LIVEKIT MATERIALS, YOU AGREE THAT YOU HAVE READ AND UNDERSTOOD, + AND, AS A CONDITION TO YOUR USE OF THE LIVEKIT MATERIALS, YOU AGREE TO BE + BOUND BY, THE FOLLOWING TERMS AND CONDITIONS. + +2. Definitions + + "Agreement" means this LiveKit Model License Agreement. + + "Documentation" means the specifications, manuals, and documentation + accompanying any LiveKit Model and distributed by LiveKit. + + "Licensee" or "you" means the individual or entity agreeing to be bound by + this Agreement. + + "LiveKit Agents" means the proprietary LiveKit software framework for building + real-time multimodal AI applications with programmable backend participants. + + "LiveKit Materials" means, collectively, the LiveKit Models and Documentation. 
+ + "LiveKit Model" means any of LiveKit's proprietary software models or + algorithms, including machine-learning software code, model weights, + inference-enabling software code, training-enabling software code, and + fine-tuning enabling software code. Any derivative works of a LiveKit Model, + whether developed by LiveKit, you, or any third party, will be deemed the + "LiveKit Model" for the purposes of this Agreement. + +3. License Rights + + Right to Use LiveKit Materials. Subject to the terms and conditions of this + Agreement, including the requirements of Section 3.b, LiveKit grants you a + nonexclusive, nontransferable, worldwide, royalty-free license under LiveKit's + intellectual property rights to use, reproduce, distribute, copy, and create + derivative works of the LiveKit Materials. + + Limitation on Use. As a condition to your use of the LiveKit Materials, you + agree: (i) not to use any LiveKit Models on a standalone basis or with any + frameworks other than LiveKit Agents; (ii) not to use any LiveKit Materials or + any output from, or results of using, LiveKit Models (including any derivative + works thereof) to improve or otherwise develop any other models that are not + LiveKit Models; or (iii) distribute or otherwise make available the LiveKit + Materials (including any derivative works thereof) except (x) pursuant to the + terms of this Agreement, and (y) you reproduce the above copyright notice. + +4. Intellectual Property + + The LiveKit Materials are owned by LiveKit and its licensors. Except for the + rights granted to you under this Agreement, all rights are reserved and no + other express or implied rights are granted. + + You will own any derivative works that you created from the LiveKit Materials, + subject to the terms of this Agreement. + +5. 
Disclaimer + + UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING, LIVEKIT PROVIDES + THE LIVEKIT MATERIALS, AND ANY OUTPUT OR RESULTS THEREFROM, ON AN "AS IS" + BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, + INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, + NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU + ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR + REDISTRIBUTING THE LIVEKIT MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR + USE OF THE LIVEKIT MATERIALS AND ANY OUTPUT AND RESULTS. + +6. Limitation of Liability + + IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), + CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS DELIBERATE + AND GROSSLY NEGLIGENT ACTS) OR AGREED TO IN WRITING, WILL LIVEKIT BE LIABLE TO + YOU FOR INDIRECT DAMAGES, INCLUDING ANY SPECIAL, INCIDENTAL, OR CONSEQUENTIAL + DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS AGREEMENT OR OUT OF THE + USE OR INABILITY TO USE THE LIVEKIT MATERIALS OR ANY OUTPUT OR RESULTS + THEREFROM (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK + STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL + DAMAGES OR LOSSES), EVEN IF LIVEKIT HAS BEEN ADVISED OF THE POSSIBILITY OF + SUCH DAMAGES. + +7. Trademarks + + This Agreement does not grant permission to use the trade names, trademarks, + service marks, or product names of LiveKit, except as required for reasonable + and customary use in describing the origin of the LiveKit Materials. + +8. Term and Termination + + The term of this Agreement commences upon your acceptance of this Agreement + and continues in effect until you cease using the LiveKit Materials or it is + terminated by either party (on immediate written notice to the other party). + This Agreement will automatically terminate if you breach any of its terms. 
+ Upon termination, you must immediately cease all use of the LiveKit Materials. + Sections 4, 5, 6, and 9 will survive termination. + +9. Governing Law and Venue + + This Agreement is subject to the laws of the State of California, without + regard to its conflict of laws principles. The UN Convention on Contracts for + the International Sale of Goods does not apply to this Agreement. The courts + located in San Francisco, California, have exclusive jurisdiction for any + dispute arising out of this Agreement. + ++ + + + + +Last Updated: November 25, 2024 diff --git a/README.md b/README.md index 8b7de522f..04c044a81 100644 --- a/README.md +++ b/README.md @@ -368,6 +368,8 @@ To connect and talk to your agent: This project is licensed under `Apache-2.0`, and is [REUSE-3.2](https://reuse.software) compliant. Refer to [the license](LICENSES/Apache-2.0.txt) for details. +The LiveKit turn detection models are licensed under the [LiveKit Model License](MODEL_LICENSE). +
diff --git a/REUSE.toml b/REUSE.toml index d2a802cee..1ed6c844d 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -7,6 +7,12 @@ SPDX-PackageName = "agents-js" SPDX-PackageSupplier = "LiveKit, Inc. " SPDX-PackageDownloadLocation = "https://github.com/livekit/agents-js" +# model license +[[annotations]] +path = ["MODEL_LICENSE"] +SPDX-FileCopyrightText = "2024 LiveKit, Inc." +SPDX-License-Identifier = "Apache-2.0" + # trivial files [[annotations]] path = [".gitignore", "flake.lock", ".envrc", "packages/livekit-rtc/.gitignore", ".changeset/**", "**/CHANGELOG.md", "NOTICE", ".github/**"] diff --git a/agents/package.json b/agents/package.json index 1ada14451..e39309524 100644 --- a/agents/package.json +++ b/agents/package.json @@ -52,8 +52,9 @@ "dependencies": { "@bufbuild/protobuf": "^1.10.0", "@ffmpeg-installer/ffmpeg": "^1.1.0", + "@livekit/local-inference": "^0.2.5", "@livekit/mutex": "^1.1.1", - "@livekit/protocol": "^1.46.4", + "@livekit/protocol": "^1.46.5", "@livekit/throws-transformer": "0.1.8", "@livekit/typed-emitter": "^3.0.0", "@opentelemetry/api": "^1.9.0", diff --git a/agents/src/inference/_warmup.ts b/agents/src/inference/_warmup.ts new file mode 100644 index 000000000..0c77e6814 --- /dev/null +++ b/agents/src/inference/_warmup.ts @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Loader for the bundled `@livekit/local-inference` native binding. + * + * Memory model (measured ~138 MB for the EOT model, ~2 MB for VAD): Node has + * no forkserver/COW, so anything loaded in a job worker is private to that + * worker. To avoid paying ~138 MB per worker, the EOT model is NOT loaded in + * job workers — it runs in the shared `InferenceProcExecutor` (see + * `inference/eot/runner.ts`), loaded once per host. The VAD stays in-process + * (it's small and runs continuously) and is reached via this loader. 
+ * + * There are intentionally no public `prewarm*` helpers: EOT auto-warms via + * the inference runner's `initialize()` at proc startup, and the VAD lazy- + * loads on first stream. + */ +import { createRequire } from 'node:module'; +import { log } from '../log.js'; + +const cjsRequire = createRequire(import.meta.url); + +let nativeMod: typeof import('@livekit/local-inference') | undefined; +let triedLoad = false; + +function getNative(): typeof import('@livekit/local-inference') | undefined { + if (triedLoad) return nativeMod; + triedLoad = true; + try { + nativeMod = cjsRequire('@livekit/local-inference') as typeof import('@livekit/local-inference'); + return nativeMod; + } catch (err) { + log().warn( + { err: err instanceof Error ? err.message : String(err) }, + '@livekit/local-inference native binding not loadable; local VAD/EOT paths disabled', + ); + return undefined; + } +} + +/** @internal Returns the loaded native module, or `undefined` if unavailable. */ +export function _getLocalInferenceModule(): typeof import('@livekit/local-inference') | undefined { + return getNative(); +} diff --git a/agents/src/inference/eot/base.test.ts b/agents/src/inference/eot/base.test.ts new file mode 100644 index 000000000..e3c00445f --- /dev/null +++ b/agents/src/inference/eot/base.test.ts @@ -0,0 +1,269 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Inference-request lifecycle tests for `BaseStreamingTurnDetectorStream`. + * + * The stream is a thin transport-facing surface: per-request state is one + * `(requestId, requestFut)` pair. `predict` starts a request and returns its + * future, superseding any previous request; the transport's single prediction + * completes the request by resolving the future; `cancelInference`/`flush` + * close a pending request, resolving its future with a default event so + * waiters never hang. 
All policy (when to start a request, await timeout, turn commits) lives in
+ * `AudioRecognition` and is covered by
+ * `voice/audio_recognition_turn_detection.test.ts`.
+ *
+ * Port of Python `tests/test_turn_detection_fsm.py`.
+ */
+import type { AudioFrame } from '@livekit/rtc-node';
+import { describe, expect, it } from 'vitest';
+import type { Future } from '../../utils.js';
+import {
+  BaseStreamingTurnDetector,
+  type BaseStreamingTurnDetectorOptions,
+  BaseStreamingTurnDetectorStream,
+  type FlushSentinel,
+  type StreamingTurnDetectionTransport,
+  type TurnDetectionEvent,
+} from './base.js';
+import { ThresholdOptions, type TurnDetectorModel } from './languages.js';
+
+class FakeTransport implements StreamingTurnDetectionTransport {
+  events: Array<[string, string]> = [];
+  private _stream: BaseStreamingTurnDetectorStream | undefined;
+
+  attach(stream: BaseStreamingTurnDetectorStream): void {
+    this._stream = stream;
+  }
+  async run(): Promise<void> {
+    if (this._stream === undefined) {
+      throw new Error('stream not bound');
+    }
+    await this._stream._drainAudioChannel();
+  }
+  runInference(requestId: string): void {
+    this.events.push(['run_inference', requestId]);
+  }
+  async pushFrame(_frame: AudioFrame): Promise<void> {
+    // no-op
+  }
+  async flush(_sentinel: FlushSentinel): Promise<void> {
+    // no-op
+  }
+  detach(): void {
+    // no-op
+  }
+}
+
+class FakeDetector extends BaseStreamingTurnDetector {
+  // Mirror Python's `_make_stream` default (the local mini model) so the
+  // timed-out-cancel test sees a non-cloud model and skips the fallback.
+  get model(): TurnDetectorModel {
+    return 'turn-detector-v1-mini';
+  }
+  stream(): BaseStreamingTurnDetectorStream {
+    throw new Error('unused in request-lifecycle tests');
+  }
+}
+
+class FakeBackend extends BaseStreamingTurnDetectorStream {
+  fakeTransport: FakeTransport;
+
+  constructor(opts: BaseStreamingTurnDetectorOptions) {
+    const transport = new FakeTransport();
+    super({ detector: new FakeDetector(opts), opts, transport });
+    this.fakeTransport = transport;
+  }
+
+  get events(): Array<[string, string]> {
+    return this.fakeTransport.events;
+  }
+
+  /** Mirror what a transport would do: hand the prediction to the stream. */
+  simulatePrediction(requestId: string, probability: number): void {
+    this._resolvePrediction(requestId, probability);
+  }
+
+  // Exposed for assertions.
+  get requestId(): string | undefined {
+    return this._requestId;
+  }
+  get requestFut(): Future<TurnDetectionEvent> | undefined {
+    return this._requestFut;
+  }
+}
+
+function makeOpts(thresholds: Record<string, number> = {}): BaseStreamingTurnDetectorOptions {
+  // Seed the resolved thresholds via a local-model dict override so `lookup`
+  // returns them (unmapped languages fall back to the shipped local table).
+  return {
+    sampleRate: 16000,
+    thresholds: new ThresholdOptions('turn-detector-v1-mini', thresholds),
+  };
+}
+
+function makeStream(thresholds: Record<string, number> = {}): FakeBackend {
+  return new FakeBackend(makeOpts(thresholds));
+}
+
+const countRunInference = (events: Array<[string, string]>) =>
+  events.filter((e) => e[0] === 'run_inference').length;
+
+describe('AudioTurnDetectionRequests', () => {
+  it('predict starts inference', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      expect(s.requestId).toBeDefined();
+      expect(fut.done).toBe(false);
+      expect(s.events).toEqual([['run_inference', s.requestId!]]);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('predict supersedes previous request', async () => {
+    const s = makeStream();
+    try {
+      const oldFut = s.predict();
+      const oldId = s.requestId;
+      const newFut = s.predict();
+
+      expect(newFut).not.toBe(oldFut);
+      expect(s.requestId).not.toBe(oldId);
+      expect(oldFut.done).toBe(true);
+      expect((await oldFut.await).endOfTurnProbability).toBe(0.0);
+      expect(countRunInference(s.events)).toBe(2);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('cancelInference closes the request', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      s.cancelInference();
+
+      expect(s.requestId).toBeUndefined();
+      expect(fut.done).toBe(true);
+      expect((await fut.await).endOfTurnProbability).toBe(0.0);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('cancelInference when idle is a no-op', async () => {
+    const s = makeStream();
+    try {
+      s.cancelInference();
+      expect(s.events).toEqual([]);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('late prediction after cancelInference is dropped', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      const cancelledId = s.requestId!;
+      expect(cancelledId).toBeDefined();
+
+      s.cancelInference();
+      s.simulatePrediction(cancelledId, 0.9);
+      // cancelInference default (0.0), not the late 0.9.
+ expect((await fut.await).endOfTurnProbability).toBe(0.0); + + const nextFut = s.predict(); + expect(nextFut).not.toBe(fut); + expect(nextFut.done).toBe(false); + expect(countRunInference(s.events)).toBe(2); + } finally { + await s.aclose(); + } + }); + + it('prediction completes the request', async () => { + const s = makeStream(); + try { + const fut = s.predict(); + const requestId = s.requestId!; + expect(requestId).toBeDefined(); + + s.simulatePrediction(requestId, 0.3); + expect(fut.done).toBe(true); + expect((await fut.await).endOfTurnProbability).toBe(0.3); + expect(s.requestId).toBeUndefined(); + } finally { + await s.aclose(); + } + }); + + it('flush closes the request', async () => { + const s = makeStream(); + try { + const fut = s.predict(); + s.flush('turn committed'); + expect(s.requestId).toBeUndefined(); + expect((await fut.await).endOfTurnProbability).toBe(0.0); + } finally { + await s.aclose(); + } + }); + + it('flush does not overwrite a resolved prediction', async () => { + const s = makeStream(); + try { + const fut = s.predict(); + const requestId = s.requestId!; + expect(requestId).toBeDefined(); + s.simulatePrediction(requestId, 0.7); + + s.flush('turn committed'); + expect((await fut.await).endOfTurnProbability).toBe(0.7); + expect(s.requestId).toBeUndefined(); + } finally { + await s.aclose(); + } + }); + + it('predict after endInput returns a resolved default', async () => { + const s = makeStream(); + try { + s.endInput(); + // `endInput` closes the audio channel asynchronously; wait for it. 
+ await new Promise((resolve) => setTimeout(resolve, 20)); + const fut = s.predict(); + expect(fut.done).toBe(true); + expect((await fut.await).endOfTurnProbability).toBe(1.0); + expect(s.events.some((e) => e[0] === 'run_inference')).toBe(false); + } finally { + await s.aclose(); + } + }); + + it('aclose resolves a pending future', async () => { + const s = makeStream(); + const fut = s.predict(); + await s.aclose(); + expect(fut.done).toBe(true); + expect((await fut.await).endOfTurnProbability).toBe(0.0); + }); + + it('timed-out cancelInference does not fall back for the local model', async () => { + // `timedOut: true` only promotes the cloud→local fallback for the cloud + // model; the base stream (mini model) just closes the request — the cloud + // case is covered in detector.test.ts. + const s = makeStream(); + try { + const fut = s.predict(); + s.cancelInference({ timedOut: true }); + expect((await fut.await).endOfTurnProbability).toBe(0.0); + expect(s.model).toBe('turn-detector-v1-mini'); + } finally { + await s.aclose(); + } + }); +}); diff --git a/agents/src/inference/eot/base.ts b/agents/src/inference/eot/base.ts new file mode 100644 index 000000000..9ff17b983 --- /dev/null +++ b/agents/src/inference/eot/base.ts @@ -0,0 +1,543 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Audio EOT (end-of-turn) detector base, the per-window inference stream, and + * the transport interface that concrete cloud/local backends implement. + * + * Concrete implementations live in `agents/src/inference/eot/`. + * + * Port of Python `livekit.agents.voice.turn.audio`. 
+ */ +import type { AudioFrame } from '@livekit/rtc-node'; +import { AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node'; +import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter'; +import { EventEmitter } from 'node:events'; +import type { LanguageCode } from '../../language.js'; +import { log } from '../../log.js'; +import type { EOTInferenceMetrics } from '../../metrics/base.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { Future, Task, cancelAndWait, shortuuid } from '../../utils.js'; +import type { ThresholdOptions, TurnDetectorModel } from './languages.js'; + +export const DEFAULT_SAMPLE_RATE = 16000; +export const MIN_SILENCE_DURATION_MS = 200; + +/** + * Options shared by the audio EOT stream and every transport. + * + * Cloud-only transport concerns (base URL, credentials, conn options) + * live on a separate options class owned by the cloud transport. + */ +export interface BaseStreamingTurnDetectorOptions { + sampleRate: number; + thresholds: ThresholdOptions; +} + +/** + * Event emitted on each EOT prediction. + */ +export interface TurnDetectionEvent { + type: 'eot_prediction'; + endOfTurnProbability: number; + /** Wall-clock time when the prediction landed (milliseconds since epoch). */ + lastSpeakingTimeMs: number; + /** Latest input-audio creation time → prediction receive time (ms). */ + detectionDelay?: number; + /** Server-side model inference time (ms). */ + inferenceDuration?: number; +} + +/** + * Sentinel value carried alongside flush requests. Signals a turn boundary + * to the transport so it can clear its buffered audio. 
+ */
+export interface FlushSentinel {
+  readonly kind: 'flush';
+  reason?: string;
+}
+
+export function isFlushSentinel(value: unknown): value is FlushSentinel {
+  return typeof value === 'object' && value !== null && (value as FlushSentinel).kind === 'flush';
+}
+
+/**
+ * Transport adapter for `BaseStreamingTurnDetectorStream` — owns the I/O (WebSocket
+ * session, in-process predict, etc.). The stream calls these methods
+ * directly; transports report predictions back via
+ * `stream._resolvePrediction(requestId, probability, ...)`.
+ */
+export interface StreamingTurnDetectionTransport {
+  attach(stream: BaseStreamingTurnDetectorStream): void;
+  run(): Promise<void>;
+  runInference(requestId: string): void;
+  pushFrame(frame: AudioFrame): Promise<void>;
+  flush(sentinel: FlushSentinel): Promise<void>;
+  detach(): void;
+}
+
+export type BaseStreamingTurnDetectorCallbacks = {
+  metrics_collected: (metrics: EOTInferenceMetrics) => void;
+};
+
+/**
+ * Abstract base for audio EOT detectors. Holds the threshold table and
+ * provides `stream()` to create a per-turn FSM instance.
+ *
+ * Subclasses (`TurnDetector` in `inference/eot/detector.ts`) wire up
+ * concrete transports.
+ */
+export abstract class BaseStreamingTurnDetector extends (EventEmitter as new () => TypedEmitter<BaseStreamingTurnDetectorCallbacks>) {
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  /**
+   * Active streams the detector tracks for bulk teardown via `aclose()`.
+   * `Set` rather than `WeakSet` because we need iteration; each stream
+   * removes itself on its own `aclose` (see `BaseStreamingTurnDetectorStream.aclose`)
+   * so the strong refs are released without requiring the caller to call
+   * `detector.aclose()`.
+   */
+  protected _streams: Set<BaseStreamingTurnDetectorStream> = new Set();
+
+  constructor(opts: BaseStreamingTurnDetectorOptions) {
+    super();
+    this._opts = opts;
+  }
+
+  /** @internal Stream lifecycle hook — called by the stream itself on close. */
+  _unregisterStream(stream: BaseStreamingTurnDetectorStream): void {
+    this._streams.delete(stream);
+  }
+
+  abstract get model(): TurnDetectorModel;
+
+  get provider(): string {
+    return 'livekit';
+  }
+
+  /** Most-recent materialized threshold map (after any cloud→local fallback
+   * rescale or server-default adoption). */
+  get thresholds(): Readonly<Record<string, number>> {
+    return this._opts.thresholds.thresholds;
+  }
+
+  /** Threshold below which the detector treats the prediction as "unlikely
+   * to be end-of-turn". Returns `undefined` when the language isn't covered. */
+  async unlikelyThreshold(language: LanguageCode | undefined): Promise<number | undefined> {
+    return this._opts.thresholds.lookup(language);
+  }
+
+  async supportsLanguage(language: LanguageCode | undefined): Promise<boolean> {
+    return this._opts.thresholds.supports(language);
+  }
+
+  abstract stream(): BaseStreamingTurnDetectorStream;
+
+  async aclose(): Promise<void> {
+    const streams = Array.from(this._streams);
+    this._streams.clear();
+    await Promise.allSettled(streams.map((s) => s.aclose()));
+  }
+}
+
+export class SwapAbortError extends Error {
+  constructor() {
+    super('__swap__'); // rejection sentinel thrown by `_raceWithSwap` on swap-abort
+    this.name = 'SwapAbortError';
+  }
+}
+
+/**
+ * Per-window inference stream. A thin transport-facing surface: per-request
+ * state is one `(requestId, requestFut)` pair.
+ *
+ * - `predict()` starts a request and returns its future, superseding any
+ *   previous request.
+ * - the transport's single prediction completes the request by resolving the
+ *   future via `_resolvePrediction`.
+ * - `cancelInference()` / `flush(reason)` close a pending request, resolving
+ *   its future with a default event so waiters never hang.
+ *
+ * All policy (when to start a request, await timeout, turn commits) lives in
+ * `AudioRecognition`.
+ */
+export class BaseStreamingTurnDetectorStream {
+  protected _detector: BaseStreamingTurnDetector;
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  protected _transport: StreamingTurnDetectionTransport;
+
+  private _audioInputSampleRate: number | undefined;
+  private _audioInputNumChannels: number | undefined;
+  private _audioResampler: AudioResampler | undefined;
+  private _audioChannel: StreamChannel<AudioFrame | FlushSentinel> = createStreamChannel();
+
+  /** Id of the in-flight inference request, or `undefined` when idle. */
+  protected _requestId: string | undefined;
+  /** Future for the in-flight request; resolves to the prediction event (or
+   * a default event when the request is cancelled / flushed). */
+  protected _requestFut: Future<TurnDetectionEvent> | undefined;
+
+  protected _mainTask: Task<void>;
+  protected _logger = log();
+  /**
+   * Aborted whenever the main loop needs to retry on a new transport (e.g.
+   * fallback). The base FSM also aborts it from `aclose()` so idle
+   * transports that are awaiting forever can be unstuck. Listeners check
+   * `signal.aborted` and surface a sentinel rejection so the `_run` loop
+   * can decide whether to continue or exit.
+ */ + protected _swapController = new AbortController(); + + constructor(args: { + detector: BaseStreamingTurnDetector; + opts: BaseStreamingTurnDetectorOptions; + transport: StreamingTurnDetectionTransport; + }) { + this._detector = args.detector; + this._opts = args.opts; + this._transport = args.transport; + this._transport.attach(this); + + this._mainTask = Task.from((controller) => this._mainTaskBody(controller)); + } + + // region: _TurnDetector protocol proxies + + get model(): TurnDetectorModel { + return this._detector.model; + } + + get provider(): string { + return this._detector.provider; + } + + /** @internal Shared threshold resolver — the cloud transport reads it to + * adopt the server-sent defaults from `SessionCreated`. */ + get thresholdsOptions(): ThresholdOptions { + return this._opts.thresholds; + } + + async unlikelyThreshold(language: LanguageCode | undefined): Promise { + return this._opts.thresholds.lookup(language); + } + + async supportsLanguage(language: LanguageCode | undefined): Promise { + return this._opts.thresholds.supports(language); + } + + // endregion + + // region: inference requests + + /** Start a new inference request and return its future, superseding any + * previous request. */ + predict(): Future { + if (this._audioChannel.closed) { + const fut = new Future(); + fut.resolve(BaseStreamingTurnDetectorStream._defaultEvent(1.0)); + return fut; + } + + this.cancelInference(); // supersede any previous request + const fut = new Future(); + this._requestId = shortuuid('turn_request_'); + this._requestFut = fut; + // A transport may resolve synchronously (e.g. the local no-executor path + // defaults to 1.0 inline), which clears `_requestFut` via + // `_resolvePrediction`. Hold a local reference so we still return the + // resolved future rather than `undefined`. 
/**
 * Close the current inference request (new speech, turn boundary, prediction
 * timeout, mode change) and fall back if needed.
 *
 * A request future that is still pending is resolved with a default event
 * carrying probability 0.0.
 *
 * @param opts.timedOut - when true, `_onPredictTimeout()` runs after the
 *   request has been closed.
 */
cancelInference(opts: { timedOut?: boolean } = {}): void {
  if (this._requestId !== undefined) {
    const pending = this._requestFut;
    this._requestId = undefined;
    this._requestFut = undefined;
    if (pending !== undefined && !pending.done) {
      pending.resolve(BaseStreamingTurnDetectorStream._defaultEvent(0.0));
    }
  }

  // trigger fallback immediately (the subclass timeout hook checks the
  // model + signals the transport swap; the base hook is a no-op).
  if (opts.timedOut) {
    this._onPredictTimeout();
  }
}

/**
 * Drain the resampler, enqueue a flush sentinel and close the in-flight
 * request. Does nothing once the audio channel is closed.
 *
 * @param reason - optional label copied onto the sentinel.
 */
flush(reason?: string): void {
  // Idempotent: a second call sends another sentinel that transports
  // treat as a no-op (cloud: redundant session_flush; local: empty trim).
  if (this._audioChannel.closed) {
    return;
  }
  for (const resampled of this._flushAudioResampler()) {
    void this._audioChannel.write(resampled);
  }
  const sentinel: FlushSentinel = {
    kind: 'flush',
    reason,
  };
  void this._audioChannel.write(sentinel);
  this.cancelInference();
}

/** Build an `eot_prediction` event with the given probability, stamped with the current time. */
protected static _defaultEvent(probability: number): TurnDetectionEvent {
  return {
    type: 'eot_prediction',
    endOfTurnProbability: probability,
    lastSpeakingTimeMs: Date.now(),
  };
}

// endregion

// region: audio ingress

/** Resample `frame` if needed and enqueue it; dropped once the channel is closed. */
pushAudio(frame: AudioFrame): void {
  if (this._audioChannel.closed) {
    return;
  }
  for (const resampled of this._resampleAudioFrame(frame)) {
    void this._audioChannel.write(resampled);
  }
}

/** Flush pending audio, then close the audio channel. */
endInput(): void {
  this.flush();
  void this._audioChannel.close();
}

/**
 * Convert `frame` to the configured sample rate.
 *
 * The first frame after a reset fixes the expected input format and creates a
 * resampler only when its rate differs from `_opts.sampleRate`. A later frame
 * with a different rate or channel count is logged and dropped.
 *
 * @returns the frames to enqueue (possibly empty).
 */
private _resampleAudioFrame(frame: AudioFrame): AudioFrame[] {
  if (this._audioInputSampleRate === undefined || this._audioInputNumChannels === undefined) {
    this._audioInputSampleRate = frame.sampleRate;
    this._audioInputNumChannels = frame.channels;
    if (this._audioInputSampleRate !== this._opts.sampleRate) {
      this._audioResampler = new AudioResampler(
        this._audioInputSampleRate,
        this._opts.sampleRate,
        this._audioInputNumChannels,
        AudioResamplerQuality.QUICK,
      );
    }
  } else if (
    frame.sampleRate !== this._audioInputSampleRate ||
    frame.channels !== this._audioInputNumChannels
  ) {
    this._logger.error(
      {
        sampleRate: frame.sampleRate,
        expectedSampleRate: this._audioInputSampleRate,
        numChannels: frame.channels,
        expectedNumChannels: this._audioInputNumChannels,
      },
      'a frame with different audio format was already pushed',
    );
    return [];
  }
  if (this._audioResampler === undefined) {
    return [frame];
  }
  return this._audioResampler.push(frame);
}

/** Flush whatever the resampler still buffers, then forget the input format. */
private _flushAudioResampler(): AudioFrame[] {
  const frames = this._audioResampler?.flush() ?? [];
  this._resetAudioResampler();
  return frames;
}

/** Drop the resampler and the remembered input format so the next frame re-detects it. */
private _resetAudioResampler(): void {
  this._audioResampler = undefined;
  this._audioInputSampleRate = undefined;
  this._audioInputNumChannels = undefined;
}

// endregion

// region: results

/**
 * Accept a prediction from a transport. A stale response (request id
 * mismatch) is ignored; otherwise the in-flight future resolves with the
 * full `TurnDetectionEvent` and the request completes.
 *
 * @param requestId - id of the request the transport is answering.
 * @param probability - end-of-turn probability to report.
 * @param opts - optional timings copied onto the event.
 */
_resolvePrediction(
  requestId: string,
  probability: number,
  opts: { inferenceDuration?: number; detectionDelay?: number } = {},
): void {
  // Drop predictions that land after teardown — an in-flight transport
  // predict can resolve after `aclose` closed the channels.
  if (this._closing) {
    return;
  }
  if (requestId !== this._requestId) {
    return;
  }
  const pending = this._requestFut;
  this._requestId = undefined;
  this._requestFut = undefined;
  if (pending !== undefined && !pending.done) {
    pending.resolve({
      type: 'eot_prediction',
      endOfTurnProbability: probability,
      lastSpeakingTimeMs: Date.now(),
      detectionDelay: opts.detectionDelay,
      inferenceDuration: opts.inferenceDuration,
    });
  }
}

// endregion

// region: teardown

/**
 * Synchronously release this stream's registration on its owning detector,
 * so a replacement stream can be created before this one's async teardown
 * finishes. Base is a no-op; detectors that enforce single-stream ownership
 * override it. Idempotent.
 */
detach(): void {
  return;
}

/**
 * Tear the stream down: end input, mark it closing, abort any pending swap
 * race, wait for the main task, and unregister from the owning detector.
 */
async aclose(): Promise<void> {
  this.endInput(); // the flush inside closes the in-flight request
  this._closing = true;
  this._swapController.abort();
  await cancelAndWait([this._mainTask]);
  this.cancelInference(); // defensive, normally a no-op
  // Drop our strong reference on the parent detector so callers that
  // forget `detector.aclose()` don't leak the stream graph.
  this._detector._unregisterStream(this);
}

/** True once `aclose()` has been called. The `_run` loop uses this to
 * distinguish swap-aborts (continue with new transport) from teardown
 * aborts (exit). */
protected _closing = false;

// endregion

// region: main task scaffolding

/** Main-task entry point; delegates to `_run()` and ignores the controller it is given. */
private async _mainTaskBody(_controller: AbortController): Promise<void> {
  await this._run();
}
/**
 * Drain the shared audio channel into the current transport.
 *
 * Only one reader may hold the channel stream's lock at a time. When `signal`
 * aborts (a transport being swapped out, e.g. cloud→local fallback), the
 * reader lock is released right away: a pending `read()` rejects and the
 * swapped-in transport's drain can re-acquire the lock. Without this an
 * orphaned drain would hold the lock forever and the next `getReader()`
 * would throw "ReadableStream is locked".
 *
 * @param signal - optional abort signal that ends the drain early.
 */
async _drainAudioChannel(signal?: AbortSignal): Promise<void> {
  const stream = this._audioChannel.stream();
  const reader = stream.getReader();
  const release = () => {
    try {
      reader.releaseLock();
    } catch {
      // already released
    }
  };
  if (signal?.aborted) {
    release();
    return;
  }
  signal?.addEventListener('abort', release, { once: true });
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) return;
      if (isFlushSentinel(value)) {
        await this._transport.flush(value);
      } else {
        await this._transport.pushFrame(value);
      }
    }
  } catch (err) {
    // The pending `read()` rejects when `release()` runs on abort — a clean
    // swap-driven exit, not a drain failure.
    if (signal?.aborted) return;
    throw err;
  } finally {
    signal?.removeEventListener('abort', release);
    release();
  }
}

// endregion

// region: subclass hooks

/** Default: hand control to the transport. Subclasses override for
 * cross-transport orchestration (e.g. cloud→local fallback). */
protected async _run(): Promise<void> {
  await this._raceWithSwap(this._transport.run());
}

/**
 * Race `inner` against `_swapController.signal`. If the signal aborts
 * while `inner` is still pending, throw a `SwapAbortError` so the
 * subclass loop can decide whether to continue or exit. Resets the
 * controller after a swap-abort so subsequent races have a fresh signal.
 *
 * `aclose()` aborts during teardown — subclasses observe `_closing` to
 * exit cleanly instead of looping.
 *
 * @param inner - the promise to await.
 * @returns whatever `inner` resolves to.
 * @throws SwapAbortError when the swap signal fires first.
 */
protected async _raceWithSwap<T>(inner: Promise<T>): Promise<T> {
  const signal = this._swapController.signal;
  let onAbort: (() => void) | undefined;
  const abortPromise = new Promise<never>((_, reject) => {
    if (signal.aborted) {
      reject(new SwapAbortError());
      return;
    }
    onAbort = () => reject(new SwapAbortError());
    signal.addEventListener('abort', onAbort, { once: true });
  });
  // If `inner` wins the race, the abort listener would otherwise stay
  // registered and reject this now-orphaned promise when `aclose()` later
  // aborts the controller — surfacing as an unhandledRejection. Swallow it
  // (the race result is already settled) and remove the listener below.
  abortPromise.catch(() => {});
  try {
    return await Promise.race([inner, abortPromise]);
  } finally {
    if (onAbort !== undefined) {
      signal.removeEventListener('abort', onAbort);
    }
    if (signal.aborted) {
      // Reset for the next iteration of the subclass loop.
      this._swapController = new AbortController();
    }
  }
}

/** @internal Wake up an idle transport so the main loop can pick up a
 * new one after fallback. Subclasses call this from their swap logic. */
protected _signalSwap(): void {
  this._swapController.abort();
}

/** `predictEndOfTurn` timed out. Subclasses may override to react (e.g.
 * promote local on cloud timeout). */
protected _onPredictTimeout(): void {
  return;
}

// endregion
+ * - Explicit-cloud missing creds throws. + * - Cloud → local fallback triggers (transport raise, predict timeout). + * - Fallback persistence across turns. + * - Local-failure handling (default 1.0, retry on next turn). + * - Per-session warning dedupe (one warning per failure mode). + * - Server-provided default thresholds adopted from `SessionCreated`. + * - Override resolution (scalar / dict / none) against the server defaults, the + * override warning, runtime `updateOptions`, and the degenerate + * (no usable thresholds) → fallback path. + * - Threshold rescaling against the server defaults on actual fallback. + * + * Port of Python `tests/test_audio_turn_detector_fallback.py`. + */ +import { AudioFrame } from '@livekit/rtc-node'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { APIConnectionError, APIError } from '../../_exceptions.js'; +import type { InferenceExecutor } from '../../ipc/inference_executor.js'; +import { log } from '../../log.js'; +import { DEFAULT_API_CONNECT_OPTIONS } from '../../types.js'; +import type { BaseStreamingTurnDetectorStream } from './base.js'; +import { + type BaseStreamingTurnDetectorOptions, + type FlushSentinel, + type StreamingTurnDetectionTransport, +} from './base.js'; +import { TurnDetector, TurnDetectorStreamImpl } from './detector.js'; +import { LOCAL_LANGUAGES, ThresholdOptions } from './languages.js'; +import { EOT_INFERENCE_METHOD } from './runner.js'; +import { LocalTransport } from './transports.js'; + +// Stand-in for the per-language defaults a gateway returns in `SessionCreated`. 
// Per-language thresholds used as the simulated `SessionCreated` defaults.
const SERVER_THRESHOLDS: Record<string, number> = { en: 0.56, ja: 0.37, fr: 0.575 };
const SERVER_DEFAULT_THRESHOLD = 0.5;

/**
 * Yield to the event loop until `predicate()` is true or `ticks` turns have
 * elapsed. Returns silently on exhaustion, so callers must assert afterwards.
 */
async function waitFor(predicate: () => boolean, ticks = 50): Promise<void> {
  for (let i = 0; i < ticks; i++) {
    if (predicate()) return;
    await new Promise((r) => setImmediate(r));
  }
}

interface ScriptedTransportOptions {
  runBehavior?: 'idle' | 'raise' | 'return';
  runExc?: Error;
}

/**
 * Transport double whose `run()` idles, throws `runExc`, or returns, and which
 * records every other call in `events` as a `[name, payload]` pair.
 */
class ScriptedTransport implements StreamingTurnDetectionTransport {
  runBehavior: 'idle' | 'raise' | 'return';
  runExc: Error | undefined;
  runCalls = 0;
  events: Array<[string, unknown]> = [];
  private _stream: BaseStreamingTurnDetectorStream | undefined;

  constructor(opts: ScriptedTransportOptions = {}) {
    this.runBehavior = opts.runBehavior ?? 'idle';
    this.runExc = opts.runExc;
  }

  attach(stream: BaseStreamingTurnDetectorStream): void {
    this._stream = stream;
  }

  async run(): Promise<void> {
    this.runCalls += 1;
    if (this.runBehavior === 'raise') {
      if (!this.runExc) throw new Error('runExc not set');
      throw this.runExc;
    }
    if (this.runBehavior === 'return') {
      return;
    }
    // idle — this promise never settles; the tests end it by closing the
    // parent stream via `aclose`.
    await new Promise(() => undefined);
  }

  runInference(requestId: string): void {
    this.events.push(['run_inference', requestId]);
  }

  async pushFrame(frame: AudioFrame): Promise<void> {
    this.events.push(['push_frame', frame]);
  }

  async flush(sentinel: FlushSentinel): Promise<void> {
    this.events.push(['flush', sentinel]);
  }

  detach(): void {
    this.events.push(['detach', null]);
  }
}

/** Read the detector's `_opts` field, which is not publicly typed. */
function detectorOpts(detector: TurnDetector): BaseStreamingTurnDetectorOptions {
  return (detector as unknown as { _opts: BaseStreamingTurnDetectorOptions })._opts;
}

interface MakeStreamOpts {
  model?: 'turn-detector-v1' | 'turn-detector-v1-mini';
  userThreshold?: number | Record<string, number>;
  detector?: TurnDetector;
}

/**
 * Construct a stream wired to a scripted transport. The detector and stream
 * share one `ThresholdOptions` (as in production). The cloud model starts with
 * empty thresholds (its defaults arrive via `SessionCreated` — call
 * `stream.thresholdsOptions._updateDefaults` to simulate that). The local mini
 * model resolves its thresholds against `LOCAL_LANGUAGES` up front.
 */
function makeStreamWithTransport(
  transport: StreamingTurnDetectionTransport,
  opts: MakeStreamOpts = {},
): TurnDetectorStreamImpl {
  const model = opts.model ?? 'turn-detector-v1';
  const detector = opts.detector ?? makeMockDetector(model, opts.userThreshold);
  const stream = new TurnDetectorStreamImpl({
    detector,
    opts: detectorOpts(detector),
    cloudOpts:
      model === 'turn-detector-v1'
        ? {
            baseUrl: 'ws://test',
            apiKey: 'x',
            apiSecret: 'x',
            connOptions: DEFAULT_API_CONNECT_OPTIONS,
          }
        : undefined,
    model,
    transport,
  });
  return stream;
}
/**
 * Build a `TurnDetector` for assertions without relying on the ambient
 * environment: construct it under a temporary env, then overwrite the internal
 * model and shared threshold options with the requested ones.
 *
 * The environment is restored even when the constructor throws.
 */
function makeMockDetector(
  model: 'turn-detector-v1' | 'turn-detector-v1-mini',
  userThreshold?: number | Record<string, number>,
): TurnDetector {
  const originalEnv = { ...process.env };
  let det: TurnDetector;
  try {
    if (model === 'turn-detector-v1-mini') {
      delete process.env.LIVEKIT_REMOTE_EOT_URL;
    } else {
      process.env.LIVEKIT_REMOTE_EOT_URL = 'ws://test';
      process.env.LIVEKIT_API_KEY = 'x';
      process.env.LIVEKIT_API_SECRET = 'x';
    }
    det = new TurnDetector();
  } finally {
    // Restore on every path so a throwing constructor cannot leak env changes
    // into later tests.
    process.env = originalEnv;
  }
  const internals = det as unknown as {
    _model: typeof model;
    _opts: BaseStreamingTurnDetectorOptions;
  };
  internals._model = model;
  internals._opts = { ...internals._opts, thresholds: new ThresholdOptions(model, userThreshold) };
  return det;
}

/**
 * Run `fn` with `overrides` applied to `process.env` (an `undefined` value
 * deletes the key), restoring the previous environment afterwards.
 *
 * If `fn` returns a promise, restoration happens when it settles and the
 * promise is returned; otherwise restoration is synchronous. A synchronous
 * throw restores the environment and is rethrown.
 */
function withEnv(
  overrides: Record<string, string | undefined>,
  fn: () => void | Promise<void>,
): void | Promise<void> {
  const original = { ...process.env };
  for (const [k, v] of Object.entries(overrides)) {
    if (v === undefined) delete process.env[k];
    else process.env[k] = v;
  }
  try {
    const result = fn();
    if (result instanceof Promise) {
      return result.finally(() => {
        process.env = original;
      });
    }
    process.env = original;
    return result;
  } catch (err) {
    process.env = original;
    throw err;
  }
}
// Stub `LocalTransport.run` so a swap to the local transport resolves at once
// instead of entering a real drain loop.
let runSpy: ReturnType<typeof vi.spyOn>;
beforeEach(() => {
  runSpy = vi.spyOn(LocalTransport.prototype, 'run').mockImplementation(async () => undefined);
});
afterEach(() => {
  runSpy.mockRestore();
});

// Model auto-selection when no `version` is passed to the constructor.
describe('AutoSelect', () => {
  it('selects local when no remote EOT url', () => {
    void withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
      const detector = new TurnDetector();
      expect(detector.model).toBe('turn-detector-v1-mini');
    });
  });

  it('selects cloud when remote EOT url set', () => {
    void withEnv(
      {
        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
        LIVEKIT_API_KEY: 'k',
        LIVEKIT_API_SECRET: 's',
      },
      () => {
        const detector = new TurnDetector();
        expect(detector.model).toBe('turn-detector-v1');
      },
    );
  });

  it('downgrades to local when creds missing', () => {
    void withEnv(
      {
        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
        LIVEKIT_API_KEY: undefined,
        LIVEKIT_API_SECRET: undefined,
        LIVEKIT_INFERENCE_API_KEY: undefined,
        LIVEKIT_INFERENCE_API_SECRET: undefined,
      },
      () => {
        const detector = new TurnDetector();
        expect(detector.model).toBe('turn-detector-v1-mini');
      },
    );
  });
});

// An explicitly requested cloud version must fail when credentials are absent.
describe('ExplicitModelErrors', () => {
  it('explicit cloud missing creds throws', () => {
    void withEnv(
      {
        LIVEKIT_REMOTE_EOT_URL: undefined,
        LIVEKIT_API_KEY: undefined,
        LIVEKIT_API_SECRET: undefined,
        LIVEKIT_INFERENCE_API_KEY: undefined,
        LIVEKIT_INFERENCE_API_SECRET: undefined,
      },
      () => {
        expect(() => new TurnDetector({ version: 'v1' })).toThrow();
      },
    );
  });
});
// Cloud → local fallback triggers and their persistence.
describe('Fallback', () => {
  // Cloud transport whose `run()` rejects with a connection error.
  const failingCloud = (): ScriptedTransport =>
    new ScriptedTransport({
      runBehavior: 'raise',
      runExc: new APIConnectionError({ message: 'boom' }),
    });

  it('fallback on transport error swaps to local', async () => {
    const cloud = failingCloud();
    const s = makeStreamWithTransport(cloud);
    await waitFor(() => s.model === 'turn-detector-v1-mini');
    expect(s.model).toBe('turn-detector-v1-mini');
    expect(s.isFallback).toBe(true);
    expect(s.warnedCloudFailure).toBe(true);
    expect(cloud.events).toContainEqual(['detach', null]);
    await s.aclose();
  });

  it('fallback on timed-out cancelInference', async () => {
    const cloud = new ScriptedTransport({ runBehavior: 'idle' });
    const s = makeStreamWithTransport(cloud);
    const pending = s.predict();
    // A timed-out cancel (driven by AudioRecognition's eou bounce) closes the
    // request and promotes the cloud→local fallback.
    s.cancelInference({ timedOut: true });
    const event = await pending.await;
    expect(event.endOfTurnProbability).toBe(0.0);
    await waitFor(() => s.model === 'turn-detector-v1-mini');
    expect(s.model).toBe('turn-detector-v1-mini');
    expect(s.isFallback).toBe(true);
    await s.aclose();
  });

  it('fallback persists across turns', async () => {
    const cloud = failingCloud();
    const s = makeStreamWithTransport(cloud);
    await waitFor(() => s.model === 'turn-detector-v1-mini');
    expect(cloud.runCalls).toBe(1);
    s.predict();
    expect(s.model).toBe('turn-detector-v1-mini');
    await s.aclose();
  });
});

describe('MultiStreamOwnership', () => {
  it('multiple streams can be opened off one detector', async () => {
    let detector!: TurnDetector;
    withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
      detector = new TurnDetector({ version: 'v1-mini' });
    });
    // Only one stream is active at a time in production; the detector still
    // permits constructing several (they share its `ThresholdOptions`).
    const first = detector.stream();
    const second = detector.stream();
    await first.aclose();
    await second.aclose();
  });
});

describe('DetectorViewAfterFallback', () => {
  it('detector model + threshold follow the fallback (shared ThresholdOptions)', async () => {
    const cloudEnv = {
      LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
      LIVEKIT_API_KEY: 'k',
      LIVEKIT_API_SECRET: 's',
    };
    let detector!: TurnDetector;
    withEnv(cloudEnv, () => {
      detector = new TurnDetector({ unlikelyThreshold: 0.5 });
    });
    expect(detector.model).toBe('turn-detector-v1');
    // scalar override is resolvable pre-session via the catch-all
    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(0.5);

    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = new TurnDetectorStreamImpl({
      detector,
      opts: detectorOpts(detector),
      cloudOpts: undefined,
      model: 'turn-detector-v1',
      transport: idle,
    });
    // server defaults arrive, then the cloud session fails
    s.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    s._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
    await waitFor(() => s.model === 'turn-detector-v1-mini');

    // Both the stream and the detector (sharing one ThresholdOptions) reflect it.
    expect(s.model).toBe('turn-detector-v1-mini');
    expect(detector.model).toBe('turn-detector-v1-mini');
    const rescaled = LOCAL_LANGUAGES.en! * (0.5 / SERVER_THRESHOLDS.en!);
    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(rescaled);
    await s.aclose();
  });
});

describe('LocalFailureRetry', () => {
  it('local failure emits default and retries on next turn', async () => {
    const local = new ScriptedTransport({
      runBehavior: 'raise',
      runExc: new Error('local boom'),
    });
    const s = makeStreamWithTransport(local, { model: 'turn-detector-v1-mini' });
    await waitFor(() => s.warnedLocalFailure);
    expect(s.model).toBe('turn-detector-v1-mini');
    expect(s.isFallback).toBe(false);
    expect(s.warnedLocalFailure).toBe(true);
    expect(s.transport).toBe(local);
    await s.aclose();
  });
});

describe('WarningDedupe', () => {
  it('cloud→local warning logged once per session', async () => {
    const cloud = new ScriptedTransport({
      runBehavior: 'raise',
      runExc: new APIConnectionError({ message: 'boom' }),
    });
    const s = makeStreamWithTransport(cloud);
    await waitFor(() => s.model === 'turn-detector-v1-mini');
    // Trigger a second fallback path directly.
    s._fallBackToLocal(new APIConnectionError({ message: 'boom2' }));
    // Across both invocations only one warning was emitted — tracked by
    // the `warnedCloudFailure` flag staying flipped after the first call.
    expect(s.warnedCloudFailure).toBe(true);
    await s.aclose();
  });

  it('local warning logged once per session', async () => {
    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = makeStreamWithTransport(idle, { model: 'turn-detector-v1-mini' });
    for (const message of ['a', 'b']) {
      s._onLocalFailure(new Error(message));
    }
    expect(s.warnedLocalFailure).toBe(true);
    await s.aclose();
  });
});
describe('ResolveThresholds', () => {
  // Cloud-override resolution against the server defaults, via ThresholdOptions.
  function cloud(overrides?: number | Record<string, number>): ThresholdOptions {
    const opts = new ThresholdOptions('turn-detector-v1', overrides);
    opts._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    return opts;
  }

  it('no override adopts server map + fallback default', () => {
    const opts = cloud();
    expect(opts.thresholds).toEqual(SERVER_THRESHOLDS);
    expect(opts.defaultThreshold).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
  });

  it('scalar override replaces with empty map', () => {
    const opts = cloud(0.8);
    // empty map → every language resolves through the scalar fallback
    expect(opts.thresholds).toEqual({});
    expect(opts.defaultThreshold).toBeCloseTo(0.8);
  });

  it('dict override layers on server map', () => {
    const opts = cloud({ en: 0.7 });
    expect(opts.thresholds.en).toBeCloseTo(0.7);
    // unmapped languages keep the server values + server fallback
    expect(opts.thresholds.ja).toBeCloseTo(SERVER_THRESHOLDS.ja!);
    expect(opts.defaultThreshold).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
  });

  it('dict keys normalized', () => {
    const opts = cloud({ English: 0.7, 'en-US': 0.7 });
    expect(opts.thresholds.en).toBeCloseTo(0.7);
  });
});

// Threshold behavior of a cloud stream before and after server defaults arrive.
describe('ServerDefaults', () => {
  it('cloud thresholds pending before session created', async () => {
    const transport = new ScriptedTransport({ runBehavior: 'idle' });
    const stream = makeStreamWithTransport(transport);
    // A cloud detector has no per-language threshold until `SessionCreated`,
    // but reports the language as supported so the first turn isn't skipped.
    expect(await stream.unlikelyThreshold('en')).toBeUndefined();
    expect(await stream.supportsLanguage('en')).toBe(true);
    await stream.aclose();
  });

  it('cloud adopts server defaults', async () => {
    const transport = new ScriptedTransport({ runBehavior: 'idle' });
    const stream = makeStreamWithTransport(transport);
    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(SERVER_THRESHOLDS.en!);
    // language absent from the server map → catch-all default
    expect(await stream.unlikelyThreshold('de')).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
    await stream.aclose();
  });

  it('dict override layers on server defaults', async () => {
    const transport = new ScriptedTransport({ runBehavior: 'idle' });
    const stream = makeStreamWithTransport(transport, { userThreshold: { en: 0.7, ja: 0.2 } });
    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(0.7);
    expect(await stream.unlikelyThreshold('ja')).toBeCloseTo(0.2);
    // fr not overridden → server default for fr
    expect(await stream.unlikelyThreshold('fr')).toBeCloseTo(SERVER_THRESHOLDS.fr!);
    await stream.aclose();
  });

  it('degenerate session created throws without override', async () => {
    const transport = new ScriptedTransport({ runBehavior: 'idle' });
    const stream = makeStreamWithTransport(transport);
    expect(() => stream.thresholdsOptions._updateDefaults({}, 0.0)).toThrow(APIError);
    await stream.aclose();
  });

  it('degenerate session created throws even with override', async () => {
    const transport = new ScriptedTransport({ runBehavior: 'idle' });
    const stream = makeStreamWithTransport(transport, { userThreshold: 0.8 });
    expect(() => stream.thresholdsOptions._updateDefaults({}, 0.0)).toThrow(APIError);
    await stream.aclose();
  });
});

// The constructor warns only when a threshold override is supplied.
describe('OverrideWarning', () => {
  it('warns on construction with override', () => {
    const warnSpy = vi.spyOn(log(), 'warn');
    try {
      withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
        new TurnDetector({ unlikelyThreshold: 0.5 });
      });
      const warned = warnSpy.mock.calls.some((c) =>
        JSON.stringify(c).includes('non-default turn detection threshold'),
      );
      expect(warned).toBe(true);
    } finally {
      warnSpy.mockRestore();
    }
  });

  it('no warning without override', () => {
    const warnSpy = vi.spyOn(log(), 'warn');
    try {
      withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
        new TurnDetector();
      });
      const warned = warnSpy.mock.calls.some((c) =>
        JSON.stringify(c).includes('non-default turn detection threshold'),
      );
      expect(warned).toBe(false);
    } finally {
      warnSpy.mockRestore();
    }
  });
});
// Runtime threshold overrides through `detector.updateOptions`.
describe('UpdateOptions', () => {
  it('re-resolves an active cloud stream against cached server defaults', async () => {
    const cloudEnv = {
      LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
      LIVEKIT_API_KEY: 'k',
      LIVEKIT_API_SECRET: 's',
    };
    let detector!: TurnDetector;
    withEnv(cloudEnv, () => {
      detector = new TurnDetector();
    });
    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = new TurnDetectorStreamImpl({
      detector,
      opts: detectorOpts(detector),
      cloudOpts: undefined,
      model: 'turn-detector-v1',
      transport: idle,
    });
    s.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(SERVER_THRESHOLDS.en!);

    detector.updateOptions({ unlikelyThreshold: 0.7 });
    // the shared resolver re-resolves against the cached server defaults
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(0.7);
    await s.aclose();
  });

  it('local model updateOptions', async () => {
    let detector!: TurnDetector;
    withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
      detector = new TurnDetector();
    });
    expect(detector.model).toBe('turn-detector-v1-mini');
    detector.updateOptions({ unlikelyThreshold: 0.42 });
    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(0.42);
    await detector.aclose();
  });
});

// How user overrides are rescaled when the stream falls back to the local model.
describe('ThresholdRescaleOnFallback', () => {
  // Deliver the server defaults, force the fallback, and wait for the swap.
  async function fallBackAfterServerDefaults(s: TurnDetectorStreamImpl): Promise<void> {
    s.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
    s._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
    await waitFor(() => s.model === 'turn-detector-v1-mini');
  }

  it('scalar override rescaled against server on fallback', async () => {
    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = makeStreamWithTransport(idle, { userThreshold: 0.5 });
    await fallBackAfterServerDefaults(s);
    expect(s.isFallback).toBe(true);
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(
      LOCAL_LANGUAGES.en! * (0.5 / SERVER_THRESHOLDS.en!),
    );
    await s.aclose();
  });

  it('no override fallback uses local table', async () => {
    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = makeStreamWithTransport(idle);
    await fallBackAfterServerDefaults(s);
    // ratio 1.0 → local table unchanged
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(LOCAL_LANGUAGES.en!);
    await s.aclose();
  });

  it('dict override rescaled per language on fallback', async () => {
    const idle = new ScriptedTransport({ runBehavior: 'idle' });
    const s = makeStreamWithTransport(idle, { userThreshold: { en: 0.55, ja: 0.25 } });
    await fallBackAfterServerDefaults(s);
    expect(s.isFallback).toBe(true);
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(
      LOCAL_LANGUAGES.en! * (0.55 / SERVER_THRESHOLDS.en!),
    );
    expect(await s.unlikelyThreshold('ja')).toBeCloseTo(
      LOCAL_LANGUAGES.ja! * (0.25 / SERVER_THRESHOLDS.ja!),
    );
    // fr not in dict → server value as effective → plain local default
    expect(await s.unlikelyThreshold('fr')).toBeCloseTo(LOCAL_LANGUAGES.fr!);
    await s.aclose();
  });

  it('fallback before session created uses local table with override applied', async () => {
    // Cloud fails before any `SessionCreated` → no server map to rescale
    // against, so the local table (with the override applied) is used directly.
    const failing = new ScriptedTransport({
      runBehavior: 'raise',
      runExc: new APIConnectionError({ message: 'boom' }),
    });
    const s = makeStreamWithTransport(failing, { userThreshold: 0.42 });
    await waitFor(() => s.model === 'turn-detector-v1-mini');
    expect(s.isFallback).toBe(true);
    // scalar 0.42 → 0.42 for every language via the catch-all
    expect(await s.unlikelyThreshold('en')).toBeCloseTo(0.42);
    await s.aclose();
  });
});

// Local predictions go through the injected inference executor.
describe('LocalModelExecutor', () => {
  function pcmFrame(samples = 320): AudioFrame {
    return new AudioFrame(new Int16Array(samples), 16000, 1, samples);
  }

  it('routes local predict through the injected executor (base64 PCM)', async () => {
    const doInference = vi.fn(async (method: string, data: unknown) => {
      expect(method).toBe(EOT_INFERENCE_METHOD);
      expect(typeof (data as { pcm: string }).pcm).toBe('string');
      return { probability: 0.7, inferenceDurationMs: 5 };
    });
    const executor: InferenceExecutor = { doInference };
    const detector = new TurnDetector({ version: 'v1-mini', executor });
    const s = detector.stream();
    try {
      s.pushAudio(pcmFrame());
      const event = await s.predict().await;
      expect(event.endOfTurnProbability).toBe(0.7);
      expect(doInference).toHaveBeenCalledWith(EOT_INFERENCE_METHOD, expect.anything());
    } finally {
      await s.aclose();
    }
  });

  it('degrades to a positive default when no executor is available', async () => {
    // explicit undefined → constructor falls through to getJobContext()
    // (throws outside a job) → executor stays undefined.
    const detector = new TurnDetector({ version: 'v1-mini', executor: undefined });
    const s = detector.stream();
    try {
      const event = await s.predict().await;
      expect(event.endOfTurnProbability).toBe(1.0);
    } finally {
      await s.aclose();
    }
  });
});
export interface TurnDetectorOptions {
  /**
   * Which turn-detector version to run. `'v1'` is the full cloud model (served
   * over the inference gateway; model name `'turn-detector-v1'`); `'v1-mini'`
   * is the local model (`'turn-detector-v1-mini'`), run through `executor`.
   * When omitted, auto-selects `'v1'` on hosted/dev environments (falling back
   * to `'v1-mini'` if cloud settings are missing) and `'v1-mini'` otherwise.
   */
  version?: TurnDetectorVersion;
  /** Threshold override: one scalar for every language, or a per-language map. */
  unlikelyThreshold?: number | Record<string, number>;
  baseUrl?: string;
  apiKey?: string;
  apiSecret?: string;
  /** Sample rate (Hz). Defaults to 16000. */
  sampleRate?: number;
  connOptions?: APIConnectOptions;
  /**
   * Inference executor that runs the local `turn-detector-v1-mini` model in the
   * shared inference process. Defaults to the current job's
   * `getJobContext().inferenceExecutor`. `undefined` (no job context / binding
   * unavailable) degrades the local model to a positive-default prediction.
   * Mainly an override seam for tests.
   */
  executor?: InferenceExecutor;
}

/**
 * Turn detector that resolves which model to run at construction time and
 * hands out streams bound to that model, the cloud options and the executor.
 */
export class TurnDetector extends BaseStreamingTurnDetector {
  protected _model: TurnDetectorModel;
  protected _cloudOpts: CloudTransportOptions | undefined;
  protected _executor: InferenceExecutor | undefined;

  /**
   * @param opts - see {@link TurnDetectorOptions}.
   * @throws Error when `version: 'v1'` is requested explicitly and the base
   *   URL, API key or API secret cannot be resolved.
   */
  constructor(opts: TurnDetectorOptions = {}) {
    // auto = caller didn't pin a version; missing cloud creds warn-and-
    // fall-back instead of raising.
    const auto = opts.version === undefined;
    const resolvedVersion: TurnDetectorVersion =
      opts.version ?? (isHosted() || isDevMode() ? 'v1' : 'v1-mini');
    let resolvedModel: TurnDetectorModel = `turn-detector-${resolvedVersion}`;

    let cloudOpts: CloudTransportOptions | undefined;
    if (resolvedVersion === 'v1') {
      const baseUrl = resolveEnvVar(
        opts.baseUrl,
        ['LIVEKIT_INFERENCE_URL'],
        getDefaultInferenceUrl(),
      );
      const apiKey = resolveEnvVar(opts.apiKey, ['LIVEKIT_INFERENCE_API_KEY', 'LIVEKIT_API_KEY']);
      const apiSecret = resolveEnvVar(opts.apiSecret, [
        'LIVEKIT_INFERENCE_API_SECRET',
        'LIVEKIT_API_SECRET',
      ]);
      const missing: string[] = [];
      if (!baseUrl) missing.push('LIVEKIT_INFERENCE_URL');
      if (!apiKey) missing.push('LIVEKIT_API_KEY');
      if (!apiSecret) missing.push('LIVEKIT_API_SECRET');
      if (missing.length > 0) {
        if (auto) {
          // The message names no specific variable as "set": auto-selection is
          // driven by isHosted()/isDevMode(), and the URL may itself be missing.
          log().warn(
            { missing },
            "cloud turn detector 'v1' was auto-selected but required settings are missing; " +
              "falling back to 'v1-mini'",
          );
          resolvedModel = 'turn-detector-v1-mini';
        } else {
          throw new Error(
            `TurnDetector(version='v1') requires ${missing.join(', ')} ` +
              '(env or constructor argument).',
          );
        }
      } else {
        cloudOpts = {
          baseUrl,
          apiKey,
          apiSecret,
          connOptions: opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
        };
      }
    }

    const detectorOpts: BaseStreamingTurnDetectorOptions = {
      sampleRate: opts.sampleRate ?? DEFAULT_SAMPLE_RATE,
      thresholds: new ThresholdOptions(resolvedModel, opts.unlikelyThreshold),
    };
    super(detectorOpts);
    this._model = resolvedModel;
    this._cloudOpts = cloudOpts;
    this._warnThresholdOverride();
    // Default to the current job's shared inference executor. `getJobContext`
    // throws outside a job (tests, standalone) — degrade to `undefined`
    // (the local model then resolves a positive default) rather than throwing.
    if (opts.executor !== undefined) {
      this._executor = opts.executor;
    } else {
      try {
        this._executor = getJobContext().inferenceExecutor;
      } catch {
        this._executor = undefined;
      }
    }
  }

  /** Current model name. Starts at the construction-time selection and flips to
   * `'turn-detector-v1-mini'` after a cloud→local fallback: the detector and its
   * (single) active stream share one mutable `ThresholdOptions`, and the
   * stream writes the swap back here so EOU metrics and `audio_recognition`
   * see a consistent view. The fallback is one-way and sticky. */
  override get model(): TurnDetectorModel {
    return this._model;
  }

  /** @internal Written by the active stream on cloud→local fallback. */
  _setModel(model: TurnDetectorModel): void {
    this._model = model;
  }

  /** Log a warning when a user threshold override is present. */
  protected _warnThresholdOverride(): void {
    const overrides = this._opts.thresholds.overrides;
    if (overrides !== undefined) {
      log().warn(
        { unlikelyThreshold: overrides },
        'a non-default turn detection threshold was provided; the server provides calibrated ' +
          'defaults and overriding them may be suboptimal',
      );
    }
  }

  /** Replace the user threshold override at runtime. The shared
   * `ThresholdOptions` re-resolves against the current (server or shipped)
   * defaults, so an active stream picks it up immediately. */
  updateOptions(opts: { unlikelyThreshold?: number | Record<string, number> } = {}): void {
    this._opts.thresholds.updateOverrides(opts.unlikelyThreshold);
    this._warnThresholdOverride();
  }

  /**
   * Create and register a stream for this detector.
   *
   * @param opts.connOptions - per-stream connection options; replaces the
   *   detector-level ones when cloud options exist.
   */
  override stream(opts: { connOptions?: APIConnectOptions } = {}): BaseStreamingTurnDetectorStream {
    const cloudOpts =
      this._cloudOpts !== undefined
        ? { ...this._cloudOpts, connOptions: opts.connOptions ?? this._cloudOpts.connOptions }
        : undefined;
    const stream = new TurnDetectorStreamImpl({
      detector: this,
      opts: this._opts,
      cloudOpts,
      model: this._model,
      executor: this._executor,
    });
    this._streams.add(stream);
    return stream;
  }
}

export interface TurnDetectorStreamImplArgs {
  detector: TurnDetector;
  opts: BaseStreamingTurnDetectorOptions;
  cloudOpts: CloudTransportOptions | undefined;
  model: TurnDetectorModel;
  /** Shared inference executor for the `turn-detector-v1-mini` (local) model
   * (undefined degrades to a positive-default prediction). */
  executor?: InferenceExecutor;
  /** Optional transport override (for tests). When omitted, a transport is
   * constructed from `model` + `cloudOpts`. */
  transport?: StreamingTurnDetectionTransport;
}
new CloudTransport({ + detector: args.detector, + opts: args.opts, + cloudOpts: args.cloudOpts!, + }) + : new LocalTransport({ opts: args.opts, executor: args.executor })); + super({ detector: args.detector, opts: args.opts, transport }); + this._model = args.model; + this._cloudOpts = args.cloudOpts; + this._executor = args.executor; + } + + /** This stream's *current* model name (flips to `'turn-detector-v1-mini'` + * after a cloud→local fallback). The swap is also written back to the owning + * detector, which shares this stream's mutable `ThresholdOptions`. */ + override get model(): TurnDetectorModel { + return this._model; + } + + get isFallback(): boolean { + return this._isFallback; + } + + /** @internal Test-visible. */ + get warnedCloudFailure(): boolean { + return this._warnedCloudFailure; + } + /** @internal Test-visible. */ + get warnedLocalFailure(): boolean { + return this._warnedLocalFailure; + } + /** @internal Test-visible. */ + get transport(): StreamingTurnDetectionTransport { + return this._transport; + } + + /** @internal Test-visible: same logic as the path taken when `_run` catches + * a cloud transport error. Tests call this directly to verify the warning + * dedupe across multiple invocations on the same stream. */ + _fallBackToLocal(reason: Error): void { + if (!this._warnedCloudFailure) { + this._detLogger.warn( + { reason: reason.message }, + 'cloud turn detector failed; falling back to local mini model', + ); + this._warnedCloudFailure = true; + } + this._emitDefaultForInflight(); + try { + this._transport.detach(); + } catch { + // ignore detach errors during swap + } + // Mutate the shared `ThresholdOptions` in place so the rescaled local + // thresholds + model swap are visible to the owning detector (read by EOU + // metrics and `audio_recognition`) without a copy-back. Safe because only + // one active stream per detector is supported, and the swap is sticky. 
+ this._opts.thresholds._toLocalFallback(); + if (this._detector instanceof TurnDetector) { + this._detector._setModel('turn-detector-v1-mini'); + } + this._transport = new LocalTransport({ opts: this._opts, executor: this._executor }); + this._transport.attach(this); + this._model = 'turn-detector-v1-mini'; + this._isFallback = true; + } + + /** @internal Test-visible: same logic as the path taken when `_run` sees a + * local transport error. */ + _onLocalFailure(reason: Error): void { + if (!this._warnedLocalFailure) { + this._detLogger.warn( + { reason: reason.message }, + 'local audio turn detector failed; defaulting to 1.0 and retrying on next turn', + ); + this._warnedLocalFailure = true; + } + this._emitDefaultForInflight(); + } + + protected _emitDefaultForInflight(): void { + // Positive default so any waiter commits after minEndpointingDelay. + const requestId = this._requestId; + if (requestId !== undefined) { + this._resolvePrediction(requestId, 1.0); + } + } + + override async aclose(): Promise<void> { + // Detach the transport first so the cloud send channel closes and its + // background sender/recv tasks tear down, then run the base teardown + // (which closes the audio channel and cancels the main task). + try { + this._transport.detach(); + } catch { + // ignore detach errors during teardown + } + await super.aclose(); + } + + protected override async _run(): Promise<void> { + while (true) { + try { + await this._raceWithSwap(this._transport.run()); + return; + } catch (err) { + if (err instanceof SwapAbortError) { + if (this._closing) return; + // A swap already happened (e.g. predict timeout → fallback). + // The new transport is mounted; loop and run it. Routing the + // swap through `SwapAbortError` (rather than through the + // cloud/local branch below) is what prevents the "timeout + // flips model mid-await" misclassification — the catch + // exits early before ever consulting `_model`. + continue; + } + const e = err instanceof Error ?
err : new Error(String(err)); + if (this._model === 'turn-detector-v1') { + this._fallBackToLocal(e); + continue; + } + this._onLocalFailure(e); + return; + } + } + } + + protected override _onPredictTimeout(): void { + if (this._model === 'turn-detector-v1') { + // Signal the swap BEFORE mutating model/transport state. The + // race in `_raceWithSwap` is rejected with `SwapAbortError` + // immediately, so the main loop exits through the + // SwapAbortError branch and never consults `_model` for a + // classification that would race with the assignment below. + this._signalSwap(); + this._fallBackToLocal(new Error('predict_end_of_turn')); + } + } +} diff --git a/agents/src/inference/eot/index.ts b/agents/src/inference/eot/index.ts new file mode 100644 index 000000000..44483947e --- /dev/null +++ b/agents/src/inference/eot/index.ts @@ -0,0 +1,8 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +export { TurnDetector, TurnDetectorStreamImpl } from './detector.js'; +export type { TurnDetectorOptions } from './detector.js'; +export { LOCAL_LANGUAGES, ThresholdOptions } from './languages.js'; +export type { ThresholdOverride, TurnDetectorModel, TurnDetectorVersion } from './languages.js'; +export { CloudTransport, LocalTransport, type CloudTransportOptions } from './transports.js'; diff --git a/agents/src/inference/eot/languages.ts b/agents/src/inference/eot/languages.ts new file mode 100644 index 000000000..3f38fce9d --- /dev/null +++ b/agents/src/inference/eot/languages.ts @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Per-language `unlikely` thresholds for the mini detector. + * + * The cloud `turn-detector-v1` model receives calibrated defaults from the + * inference gateway (via the `SessionCreated` message); only the local + * `turn-detector-v1-mini` model ships a hardcoded table here. 
+ */ +import { APIError } from '../../_exceptions.js'; +import type { LanguageCode } from '../../language.js'; + +/** Full model name (used for telemetry/billing via `detector.model`). */ +export type TurnDetectorModel = 'turn-detector-v1' | 'turn-detector-v1-mini'; + +/** Public `version` constructor argument; maps to a {@link TurnDetectorModel}. */ +export type TurnDetectorVersion = 'v1' | 'v1-mini'; + +export const LOCAL_LANGUAGES: Readonly<Record<string, number>> = { + ar: 0.35, + de: 0.245, + en: 0.36, + es: 0.35, + fr: 0.285, + hi: 0.305, + id: 0.345, + it: 0.23, + ja: 0.295, + ko: 0.4, + nl: 0.2, + pt: 0.32, + tr: 0.255, + zh: 0.355, +}; + +/** + * BCP-47 language tag (or human-readable name) → ISO 639-1 two-letter code. + * + * Minimal port of Python's `LanguageCode` — covers the languages present in + * the threshold tables. Unknown inputs are returned lowercased and unchanged + * (callers should pass `en`, `en-US`, `en_US`, `English`, etc.). + */ +function normalizeLanguage(input: string): string { + const lower = input.toLowerCase().trim(); + if (lower.length === 2) return lower; + const sepIdx = lower.search(/[-_]/); + if (sepIdx === 2) return lower.slice(0, 2); + // long-name aliases for languages in our tables + const aliases: Record<string, string> = { + arabic: 'ar', + german: 'de', + english: 'en', + spanish: 'es', + french: 'fr', + hindi: 'hi', + indonesian: 'id', + italian: 'it', + japanese: 'ja', + korean: 'ko', + dutch: 'nl', + portuguese: 'pt', + turkish: 'tr', + chinese: 'zh', + mandarin: 'zh', + }; + return aliases[lower] ?? lower; +} + +const round4 = (value: number): number => Math.round(value * 1e4) / 1e4; + +/** + * User-supplied threshold override: a single value applied to every language, + * a per-language map, or `undefined` (Python `NOT_GIVEN` — use the defaults).
+ */ +export type ThresholdOverride = number | Record<string, number> | undefined; + +function normalizeOverrides(overrides: ThresholdOverride): ThresholdOverride { + if (overrides === undefined || overrides === null || typeof overrides !== 'object') { + return overrides ?? undefined; // JS callers may pass null; treat it as "not given" + } + const out: Record<string, number> = {}; + for (const [k, v] of Object.entries(overrides)) { + out[normalizeLanguage(k)] = Number(v); + } + return out; +} + +/** + * Resolves per-language `unlikely` thresholds for the audio EOT detector. + * + * Holds three layers and re-materializes the effective map whenever any of + * them changes: + * + * - **overrides** — what the user passed (`unlikelyThreshold`), normalized. + * - **server/shipped defaults** — for `turn-detector-v1-mini` these are the + * shipped `LOCAL_LANGUAGES` table; for the cloud `turn-detector-v1` they arrive + * from the gateway via `_updateDefaults` (the `SessionCreated` message) and + * are `undefined` until then. + * - **materialized** — `thresholds` (per-language map) + `defaultThreshold` + * (catch-all for languages absent from the map). + * + * The detector and its (single) active stream share one instance; the + * cloud→local fallback mutates it in place via `_toLocalFallback`.
+ */ +export class ThresholdOptions { + private _model: TurnDetectorModel; + private _overrides: ThresholdOverride; + + // server/shipped defaults + private _serverThresholds: Record<string, number> | undefined; + private _serverDefault: number | undefined; + + // materialized values + private _thresholds: Record<string, number> = {}; + private _default: number | undefined = undefined; + + constructor(model: TurnDetectorModel, overrides: ThresholdOverride = undefined) { + this._model = model; + this._overrides = normalizeOverrides(overrides); + if (model === 'turn-detector-v1-mini') { + this._serverThresholds = { ...LOCAL_LANGUAGES }; + this._serverDefault = LOCAL_LANGUAGES.en; + } + this._resolve(); + } + + get model(): TurnDetectorModel { + return this._model; + } + + get overrides(): ThresholdOverride { + return this._overrides; + } + + get thresholds(): Readonly<Record<string, number>> { + return this._thresholds; + } + + get defaultThreshold(): number | undefined { + return this._default; + } + + lookup(language: LanguageCode | string | undefined): number | undefined { + const key = language ? normalizeLanguage(language) : 'en'; + // `key in map`, not `?? default` — a legitimate override of 0 must not + // fall through to the catch-all default. + return key in this._thresholds ? this._thresholds[key] : this._default; + } + + supports(language: LanguageCode | string | undefined): boolean { + // A cloud detector reports every language as supported until its server + // defaults arrive, so the first turn (before `SessionCreated`) isn't + // skipped by the `audio_recognition` short-circuit. + const pending = this._model === 'turn-detector-v1' && this._serverThresholds === undefined; + return pending || this.lookup(language) !== undefined; + } + + updateOverrides(overrides: ThresholdOverride): void { + this._overrides = normalizeOverrides(overrides); + this._resolve(); + } + + /** + * @internal Adopt the calibrated defaults a `turn-detector` gateway sends in + * `SessionCreated`.
Raises (non-retryable) when the server produced no usable + * thresholds — the caller degrades the session to the local model. + */ + _updateDefaults(serverThresholds: Record<string, number>, serverDefault: number): void { + if (!serverThresholds || Object.keys(serverThresholds).length === 0 || serverDefault <= 0) { + throw new APIError('turn detector session created without usable default thresholds', { + retryable: false, + }); + } + const norm: Record<string, number> = {}; + for (const [lang, value] of Object.entries(serverThresholds)) { + norm[normalizeLanguage(lang)] = round4(value); + } + this._serverThresholds = norm; + this._serverDefault = round4(serverDefault); + this._resolve(); + } + + /** + * @internal Promote to the local mini model on cloud→local fallback, + * preserving the user's effective-vs-default ratio per language: + * `local = LOCAL[lang] * (effective_t / server[lang])`. + */ + _toLocalFallback(): void { + if (this._model === 'turn-detector-v1-mini') { + return; + } + + let rescaled: Record<string, number> | undefined; + const server = this._serverThresholds; + if (server) { + rescaled = {}; + for (const lang of Object.keys(server)) { + const activeT = this.lookup(lang); + const local = LOCAL_LANGUAGES[lang]; + if (activeT !== undefined && local !== undefined && server[lang] !== 0) { + rescaled[lang] = local * (activeT / server[lang]!); + } + } + } + + this._model = 'turn-detector-v1-mini'; + this._serverThresholds = { ...LOCAL_LANGUAGES }; + this._serverDefault = LOCAL_LANGUAGES.en; + this._resolve(); + + if (rescaled !== undefined) { + this._thresholds = rescaled; + this._default = this.lookup('en'); + } + } + + private _resolve(): void { + const scalarOverride = typeof this._overrides === 'number'; + if (this._serverThresholds === undefined || this._serverDefault === undefined) { + // cloud defaults not received yet; only a scalar override resolves up front + this._thresholds = {}; + this._default = scalarOverride ?
(this._overrides as number) : undefined; + return; + } + + if (this._overrides === undefined) { + this._thresholds = { ...this._serverThresholds }; + this._default = this._serverDefault; + return; + } + + if (scalarOverride) { + this._thresholds = {}; + this._default = this._overrides as number; + return; + } + + this._thresholds = { + ...this._serverThresholds, + ...(this._overrides as Record), + }; + this._default = this._serverDefault; + } +} diff --git a/agents/src/inference/eot/runner.test.ts b/agents/src/inference/eot/runner.test.ts new file mode 100644 index 000000000..4058ca9d9 --- /dev/null +++ b/agents/src/inference/eot/runner.test.ts @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { afterEach, describe, expect, it, vi } from 'vitest'; +import * as warmup from '../_warmup.js'; +import EotRunner from './runner.js'; + +describe('EotRunner', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('initializes the native EOT model and predicts on decoded PCM', async () => { + const received: Int16Array[] = []; + const fakeMod = { + initEot: vi.fn(), + initVad: vi.fn(), + createVad: vi.fn(), + VAD_WINDOW_SAMPLES: 512, + predict: vi.fn(async (pcm: Int16Array) => { + received.push(pcm); + return 0.83; + }), + }; + vi.spyOn(warmup, '_getLocalInferenceModule').mockReturnValue( + fakeMod as unknown as ReturnType, + ); + + const runner = new EotRunner(); + await runner.initialize(); + expect(fakeMod.initEot).toHaveBeenCalledOnce(); + + // 4 samples of s16le PCM → base64 + const samples = Int16Array.from([1, -2, 3, -4]); + const pcm = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength).toString( + 'base64', + ); + + const out = await runner.run({ pcm }); + expect(out.probability).toBe(0.83); + expect(out.inferenceDurationMs).toBeGreaterThanOrEqual(0); + + // the runner decoded the base64 back to the same samples + expect(received).toHaveLength(1); + 
expect(Array.from(received[0]!)).toEqual([1, -2, 3, -4]); + + await runner.close(); + }); + + it('throws on initialize when the native binding is unavailable', async () => { + vi.spyOn(warmup, '_getLocalInferenceModule').mockReturnValue(undefined); + const runner = new EotRunner(); + await expect(runner.initialize()).rejects.toThrow(/native binding unavailable/); + }); +}); diff --git a/agents/src/inference/eot/runner.ts b/agents/src/inference/eot/runner.ts new file mode 100644 index 000000000..d79ed7ad5 --- /dev/null +++ b/agents/src/inference/eot/runner.ts @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Audio EOT inference runner — runs inside the shared `InferenceProcExecutor` + * so the ~138 MB native model loads once per host instead of once per job + * worker. Job-side transports reach it via `executor.doInference(...)`. + * + * The inference proc instantiates this with `new Runner()` (no args) and + * calls `initialize()` once at startup, then dispatches `run(data)` per + * request — see `ipc/inference_proc_lazy_main.ts`. Hence the default export + * + no-arg constructor. + */ +import { InferenceRunner } from '../../inference_runner.js'; +import { log } from '../../log.js'; +import { _getLocalInferenceModule } from '../_warmup.js'; + +/** Inference method id used to register + dispatch the audio EOT runner. */ +export const EOT_INFERENCE_METHOD = 'lk_eot_audio'; + +/** Request payload: base64-encoded 16 kHz s16le PCM (up to 1.2 s). 
*/ +export interface EotInferenceInput { + pcm: string; +} + +export interface EotInferenceOutput { + probability: number; + inferenceDurationMs: number; +} + +export default class EotRunner extends InferenceRunner { + #logger = log(); + #mod: ReturnType<typeof _getLocalInferenceModule>; + + async initialize(): Promise<void> { + this.#mod = _getLocalInferenceModule(); + if (this.#mod === undefined) { + throw new Error( + 'EotRunner: @livekit/local-inference native binding unavailable in the inference process', + ); + } + // Eagerly page in the EOT model singleton (~138 MB) so the first + // request doesn't pay the load on the hot path. + this.#mod.initEot(); + } + + async run(data: EotInferenceInput): Promise<EotInferenceOutput> { + if (this.#mod === undefined) { + throw new Error('EotRunner not initialized'); + } + // base64 → bytes → Int16Array view (PCM is 16 kHz s16le) + const bytes = Buffer.from(data.pcm, 'base64'); + const pcm = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2)); + const t0 = performance.now(); + let probability = 0.0; + try { + probability = await this.#mod.predict(pcm); + } catch (err) { + this.#logger.error( + { err: err instanceof Error ? err.message : String(err) }, + 'local audio EOT prediction failed', + ); + } + return { probability, inferenceDurationMs: performance.now() - t0 }; + } + + async close(): Promise<void> { + return; + } +} diff --git a/agents/src/inference/eot/transports.test.ts b/agents/src/inference/eot/transports.test.ts new file mode 100644 index 000000000..7db4838c2 --- /dev/null +++ b/agents/src/inference/eot/transports.test.ts @@ -0,0 +1,234 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Tests for `CloudTransport` (cloud WS body, driven by the unified + * `TurnDetectorStreamImpl` stream). + * + * Uses an in-process fake WebSocket to drive the transport + * deterministically.
Covers: + * + * - Retry counter resets after a successful connect (so transient drops + * across the session lifetime don't accumulate toward `maxRetry`). + * - All outbound messages are FIFO-ordered on the wire, even when `runInference` + * hooks fire synchronously between two awaited audio frames. + * + * Port of Python `tests/test_turn_detection_cloud_stream.py`. + */ +import { AgentInference } from '@livekit/protocol'; +import { AudioFrame } from '@livekit/rtc-node'; +import { describe, expect, it } from 'vitest'; +import { APIConnectionError } from '../../_exceptions.js'; +import { DEFAULT_API_CONNECT_OPTIONS } from '../../types.js'; +import { BaseStreamingTurnDetector, type BaseStreamingTurnDetectorOptions } from './base.js'; +import { TurnDetectorStreamImpl } from './detector.js'; +import { ThresholdOptions, type TurnDetectorModel } from './languages.js'; +import { CloudTransport, type CloudWebSocket } from './transports.js'; + +const { ClientMessage } = AgentInference; + +/** Fake WebSocket capturing outbound frames as parsed `ClientMessage`s. 
*/ +class FakeWS implements CloudWebSocket { + sent: InstanceType[] = []; + readyState = 1; // OPEN + private closeCbs: Array<() => void> = []; + + send(data: Uint8Array): void { + if (this.readyState !== 1) throw new Error('ws closed'); + this.sent.push(ClientMessage.fromBinary(data)); + } + close(): void { + this.readyState = 3; // CLOSED + for (const cb of this.closeCbs) cb(); + } + on(event: 'message' | 'close' | 'error', cb: (...args: never[]) => void): void { + if (event === 'close') this.closeCbs.push(cb as () => void); + // message/error not driven in these tests + } +} + +class FakeDetector extends BaseStreamingTurnDetector { + get model(): TurnDetectorModel { + return 'turn-detector-v1'; + } + stream(): never { + throw new Error('unused'); + } +} + +interface MakeStreamResult { + stream: TurnDetectorStreamImpl; + fakeWs: FakeWS; + transport: CloudTransport; +} + +function makeStream(opts: { + connectScript?: Array; + maxRetry?: number; + retryIntervalMs?: number; +}): MakeStreamResult { + const fakeWs = new FakeWS(); + const script = [...(opts.connectScript ?? [])]; + const turnOpts: BaseStreamingTurnDetectorOptions = { + sampleRate: 16000, + thresholds: new ThresholdOptions('turn-detector-v1'), + }; + const detector = new FakeDetector(turnOpts); + const cloudOpts = { + baseUrl: '', + apiKey: 'x', + apiSecret: 'x', + connOptions: { + ...DEFAULT_API_CONNECT_OPTIONS, + maxRetry: opts.maxRetry ?? 3, + retryIntervalMs: opts.retryIntervalMs ?? 0, + }, + }; + // Scripted connect: consume the script left-to-right. An Error rejects; + // null (or exhausted) returns the fake ws. 
+ const connect = async (): Promise => { + if (script.length > 0) { + const r = script.shift(); + if (r instanceof Error) throw r; + } + fakeWs.readyState = 1; + return fakeWs; + }; + const transport = new CloudTransport({ detector, opts: turnOpts, cloudOpts, connect }); + const stream = new TurnDetectorStreamImpl({ + detector, + opts: turnOpts, + cloudOpts, + model: 'turn-detector-v1', + transport, + }); + return { stream, fakeWs, transport }; +} + +async function tick(): Promise { + await new Promise((r) => setImmediate(r)); +} + +async function waitUntilConnected(transport: CloudTransport, ticks = 50): Promise { + for (let i = 0; i < ticks; i++) { + if (transport.transportReady()) return; + await tick(); + } + throw new Error('transport did not connect within timeout'); +} + +async function drainSendQueue(_transport: CloudTransport, ticks = 50): Promise { + // Let the sender task flush the buffered ClientMsgs to the fake socket. + for (let i = 0; i < ticks; i++) { + await tick(); + } +} + +async function waitForCond(predicate: () => boolean, ticks = 50): Promise { + for (let i = 0; i < ticks; i++) { + if (predicate()) return; + await tick(); + } +} + +function pcmFrame(samples = 320): AudioFrame { + return new AudioFrame(new Int16Array(samples), 16000, 1, samples); +} + +describe('CloudStreamRetry', () => { + it('num retries resets after a successful connect', async () => { + const { stream, transport } = makeStream({ + connectScript: [new APIConnectionError({ message: 'transient' }), null], + maxRetry: 3, + retryIntervalMs: 0, + }); + try { + await waitUntilConnected(transport); + // Two attempts: first raised (counter 0→1), second succeeded → reset to 0. 
+ expect(transport.connectCalls).toBe(2); + expect(transport.numRetries).toBe(0); + } finally { + await stream.aclose(); + } + }); +}); + +describe('CloudToLocalFallback', () => { + it('releases the shared audio reader lock on fallback (regression)', async () => { + const { stream, transport } = makeStream({ connectScript: [null] }); + try { + await waitUntilConnected(transport); + // Drive a frame so the cloud drain task is actively parked on + // `reader.read()`, holding the audio channel's single reader lock. + stream.pushAudio(pcmFrame()); + await tick(); + + // A timed-out cancelInference triggers a cloud→local fallback. The + // orphaned cloud drain must release the shared reader lock before the + // real `LocalTransport.run()` re-acquires it — otherwise `getReader()` + // throws "ReadableStream is locked", which is mis-reported as a local + // failure. + const fut = stream.predict(); + stream.cancelInference({ timedOut: true }); + await fut.await; + + await waitForCond(() => stream.model === 'turn-detector-v1-mini'); + expect(stream.isFallback).toBe(true); + + // Let the swapped-in LocalTransport.run() re-acquire the reader and start + // draining. A freed lock ⇒ no "ReadableStream is locked" TypeError ⇒ no + // local failure flagged. 
+ for (let i = 0; i < 10; i++) await tick(); + expect(stream.warnedLocalFailure).toBe(false); + } finally { + await stream.aclose(); + } + }); +}); + +describe('CloudStreamSendOrdering', () => { + it('inferenceStart precedes inputAudio (FIFO)', async () => { + const { stream, fakeWs, transport } = makeStream({ connectScript: [null] }); + try { + await waitUntilConnected(transport); + stream.predict(); + stream.pushAudio(pcmFrame()); + await drainSendQueue(transport); + + const kinds = fakeWs.sent.map((m) => m.message.case); + const startIdx = kinds.indexOf('inferenceStart'); + const audioIdx = kinds.indexOf('inputAudio'); + expect(startIdx).toBeGreaterThanOrEqual(0); + expect(audioIdx).toBeGreaterThanOrEqual(0); + expect(startIdx).toBeLessThan(audioIdx); + } finally { + await stream.aclose(); + } + }); + + it('consecutive inferenceStarts are serialized in call order', async () => { + // Two `runInference` hooks back-to-back (a predict superseding another) + // used to race at `ws.send`; the unified send channel serializes them in + // call order. + const { stream, fakeWs, transport } = makeStream({ connectScript: [null] }); + try { + await waitUntilConnected(transport); + stream.predict(); + const firstId = (stream as unknown as { _requestId?: string })._requestId; + stream.predict(); + const secondId = (stream as unknown as { _requestId?: string })._requestId; + await drainSendQueue(transport); + + const startIds: (string | undefined)[] = []; + for (const m of fakeWs.sent) { + if (m.message.case === 'inferenceStart') { + startIds.push(m.message.value.requestId); + } + } + expect(startIds).toEqual([firstId, secondId]); + } finally { + await stream.aclose(); + } + }); +}); diff --git a/agents/src/inference/eot/transports.ts b/agents/src/inference/eot/transports.ts new file mode 100644 index 000000000..ffbe3f991 --- /dev/null +++ b/agents/src/inference/eot/transports.ts @@ -0,0 +1,628 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Audio EOT transports: cloud (WebSocket) + local (@livekit/local-inference). + * + * Port of Python `livekit.agents.inference.eot.transports`. + */ +import { type Duration, Timestamp } from '@bufbuild/protobuf'; +import { AgentInference } from '@livekit/protocol'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { APIConnectionError, APIError, APIStatusError } from '../../_exceptions.js'; +import type { InferenceExecutor } from '../../ipc/inference_executor.js'; +import { log } from '../../log.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { type APIConnectOptions, intervalForRetry } from '../../types.js'; +import { Task, delay } from '../../utils.js'; +import { buildMetadataHeaders, connectWs, createAccessToken } from '../utils.js'; +import { + type BaseStreamingTurnDetectorOptions, + type BaseStreamingTurnDetectorStream, + DEFAULT_SAMPLE_RATE, + type FlushSentinel, + type StreamingTurnDetectionTransport, +} from './base.js'; +import type { TurnDetector } from './detector.js'; +import { EOT_INFERENCE_METHOD } from './runner.js'; + +const AudioEncoding = AgentInference.AudioEncoding; +const ClientMessageCtor = AgentInference.ClientMessage; +const ServerMessageCtor = AgentInference.ServerMessage; +const InferenceStart = AgentInference.InferenceStart; +const InputAudio = AgentInference.InputAudio; +const SessionClose = AgentInference.SessionClose; +const SessionCreate = AgentInference.SessionCreate; +const SessionFlush = AgentInference.SessionFlush; +const SessionSettings = AgentInference.SessionSettings; +type ClientMsg = InstanceType; +type ServerMsg = InstanceType; + +export interface CloudTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + connOptions: APIConnectOptions; +} + +/** + * Minimal WebSocket shape both the real `ws` socket and test fakes satisfy. 
+ * The cloud transport only needs send/close/readyState + the three events. + */ +export interface CloudWebSocket { + send(data: Uint8Array): void; + close(): void; + readonly readyState: number; + on(event: 'message', cb: (data: Buffer | ArrayBuffer | Buffer[]) => void): void; + on(event: 'close', cb: () => void): void; + on(event: 'error', cb: (err: Error) => void): void; +} + +const WS_OPEN = 1; + +function nowTimestamp(): Timestamp { + const now = Date.now(); + return new Timestamp({ + seconds: BigInt(Math.floor(now / 1000)), + nanos: (now % 1000) * 1_000_000, + }); +} + +function timestampToMs(ts?: Timestamp): number { + if (ts === undefined) return 0; + return Number(ts.seconds) * 1000 + Math.floor(ts.nanos / 1_000_000); +} + +function durationToMs(d?: Duration): number { + if (d === undefined) return 0; + return Number(d.seconds) * 1000 + Math.floor(d.nanos / 1_000_000); +} + +// Native model operates on up to 1.2 s of 16 kHz s16le PCM per predict. +const CLIENT_BUFFER_SECONDS = 1.2; +const CLIENT_BUFFER_SAMPLES = Math.floor(CLIENT_BUFFER_SECONDS * DEFAULT_SAMPLE_RATE); + +/** + * Append-only ring buffer of 16-bit PCM samples used by the local transport + * to keep the last ~1.2 s of audio available for per-window prediction. + */ +class PcmRingBuffer { + private buf: Int16Array; + private writeIdx = 0; + private filled = 0; + + constructor(public readonly capacity: number) { + this.buf = new Int16Array(capacity); + } + + pushFrame(frame: AudioFrame): void { + const src = frame.data; // Int16Array + for (let i = 0; i < src.length; i++) { + this.buf[this.writeIdx] = src[i]!; + this.writeIdx = (this.writeIdx + 1) % this.capacity; + } + this.filled = Math.min(this.filled + src.length, this.capacity); + } + + /** Returns a contiguous Int16Array snapshot of the last `filled` samples. 
*/ + read(): Int16Array { + const out = new Int16Array(this.filled); + const start = (this.writeIdx - this.filled + this.capacity) % this.capacity; + if (start + this.filled <= this.capacity) { + out.set(this.buf.subarray(start, start + this.filled)); + } else { + const tail = this.capacity - start; + out.set(this.buf.subarray(start, this.capacity), 0); + out.set(this.buf.subarray(0, this.filled - tail), tail); + } + return out; + } + + /** Drop the oldest `n` samples. */ + shift(n: number): void { + this.filled = Math.max(0, this.filled - n); + } + + get length(): number { + return this.filled; + } +} + +/** + * Transport for the local `turn-detector-v1-mini` model. + * + * The native model runs in the shared `InferenceProcExecutor` (one load per + * host, ~138 MB) rather than in every job worker. Audio is buffered locally + * in the job process (no per-frame IPC); on each inference window the last + * ~1.2 s is snapshotted, base64-encoded, and sent over IPC to the runner + * (`inference/eot/runner.ts`) via `executor.doInference(...)`. + * + * When no executor is available (binding couldn't load on this platform), + * predictions resolve to a positive default (1.0) so the session still + * commits turns after `minDelay` — same as the existing local-failure path. 
+ */ +export class LocalTransport implements StreamingTurnDetectionTransport { + protected _opts: BaseStreamingTurnDetectorOptions; + protected _executor: InferenceExecutor | undefined; + protected _buf: PcmRingBuffer; + protected _streamRef: WeakRef | undefined; + protected _tasks = new Set>(); + protected _warnedNoExecutor = false; + protected _logger = log(); + + constructor(opts: { + opts: BaseStreamingTurnDetectorOptions; + executor: InferenceExecutor | undefined; + }) { + this._opts = opts.opts; + this._executor = opts.executor; + this._buf = new PcmRingBuffer(CLIENT_BUFFER_SAMPLES); + } + + attach(stream: BaseStreamingTurnDetectorStream): void { + this._streamRef = new WeakRef(stream); + } + + runInference(requestId: string): void { + const snapshot = this._buf.read(); + const task = this._predict(requestId, snapshot); + this._tasks.add(task); + void task.finally(() => this._tasks.delete(task)); + } + + protected async _predict(requestId: string, pcmSnapshot: Int16Array): Promise { + const stream = this._streamRef?.deref(); + if (stream === undefined) return; + + if (this._executor === undefined) { + if (!this._warnedNoExecutor) { + this._warnedNoExecutor = true; + this._logger.warn( + 'local audio EOT unavailable (no inference executor / native binding); ' + + 'defaulting predictions to 1.0 so turns still commit after minDelay', + ); + } + stream._resolvePrediction(requestId, 1.0); + return; + } + + // base64-encode the s16le PCM so it survives the default JSON IPC + // serialization compactly (a raw Int16Array would balloon to an + // array-of-numbers). Only the snapshot crosses the boundary. 
+ const pcm = Buffer.from( + pcmSnapshot.buffer, + pcmSnapshot.byteOffset, + pcmSnapshot.byteLength, + ).toString('base64'); + + let prob = 0.0; + let inferenceDurationMs = 0; + try { + const out = (await this._executor.doInference(EOT_INFERENCE_METHOD, { + pcm, + })) as { probability: number; inferenceDurationMs: number }; + prob = out.probability; + inferenceDurationMs = out.inferenceDurationMs; + } catch (err) { + this._logger.error( + { err: err instanceof Error ? err.message : String(err) }, + 'local audio EOT inference (executor) failed', + ); + } + const freshStream = this._streamRef?.deref(); + if (freshStream === undefined) return; + freshStream._resolvePrediction(requestId, prob, { inferenceDuration: inferenceDurationMs }); + } + + async pushFrame(frame: AudioFrame): Promise { + this._buf.pushFrame(frame); + } + + async flush(_sentinel: FlushSentinel): Promise { + if (this._buf.length > 0) { + this._buf.shift(this._buf.length); + } + } + + detach(): void { + // We drop our references to the in-flight predicts, but the underlying IPC + // `doInference` calls aren't cancellable, so they run to completion in the + // inference process. Their results are harmless: `_predict` re-derefs the + // (now-gone) stream via `_streamRef.deref()` and the stream's request-id / + // closing guards discard any late prediction. (Python cancels the tasks; + // our IPC executor has no AbortSignal to thread through, so we can't.) + this._tasks.clear(); + } + + async run(): Promise { + const stream = this._streamRef?.deref(); + if (stream === undefined) return; + await stream._drainAudioChannel(); + } +} + +/** + * WebSocket transport for the `turn-detector-v1` (cloud) model. + * + * Maintains one inference session against the LiveKit Agent Gateway: + * connect → `SessionCreate` → three concurrent tasks (drain audio, send, + * receive) → protobuf encode/decode → `stream._resolvePrediction(...)` + + * `EOTInferenceMetrics` on the detector. Mirrors Python `_CloudTransport`. 
+ * + * All outbound messages flow through a single FIFO send channel so control + * hooks fired synchronously between two awaited audio frames (e.g. + * `inferenceStart` then `inputAudio`) reach the wire in call order. + */ +export class CloudTransport implements StreamingTurnDetectionTransport { + protected _detectorRef: WeakRef; + protected _opts: BaseStreamingTurnDetectorOptions; + protected _cloudOpts: CloudTransportOptions; + protected _connOptions: APIConnectOptions; + protected _streamRef: WeakRef | undefined; + protected _ws: CloudWebSocket | undefined; + protected _numRetries = 0; + protected _connectCalls = 0; + /** Outbound FIFO for the active connection; recreated per `_runOnce`. */ + protected _sendChannel: StreamChannel | undefined; + /** Set by `detach()`; stops the retry loop and suppresses the + * connection-closed throw so a teardown can't trigger a reconnect. */ + protected _detached = false; + /** Aborted by `detach()` to release the audio-drain reader lock so a + * swapped-in transport can re-acquire the shared audio stream. */ + protected _runAbort: AbortController | undefined; + protected _logger = log(); + /** Optional connect override for tests; defaults to a real WS handshake. */ + private _connectImpl: (() => Promise) | undefined; + + constructor(args: { + detector: TurnDetector; + opts: BaseStreamingTurnDetectorOptions; + cloudOpts: CloudTransportOptions; + /** @internal test seam — supply a fake WebSocket factory. */ + connect?: (transport: CloudTransport) => Promise; + }) { + this._detectorRef = new WeakRef(args.detector); + this._opts = args.opts; + this._cloudOpts = args.cloudOpts; + this._connOptions = args.cloudOpts.connOptions; + this._connectImpl = args.connect ? () => args.connect!(this) : undefined; + } + + /** @internal Test-visible: number of connect attempts. */ + get connectCalls(): number { + return this._connectCalls; + } + /** @internal Test-visible: retry counter (resets to 0 after a connect). 
*/ + get numRetries(): number { + return this._numRetries; + } + + attach(stream: BaseStreamingTurnDetectorStream): void { + this._streamRef = new WeakRef(stream); + } + + /** @internal Test-visible: true once the WS handshake is open. Not part of + * the transport interface — the stream FSM no longer gates on this. */ + transportReady(): boolean { + return this._ws !== undefined && this._ws.readyState === WS_OPEN; + } + + runInference(requestId: string): void { + this._enqueue( + new ClientMessageCtor({ + message: { case: 'inferenceStart', value: new InferenceStart({ requestId }) }, + }), + ); + } + + async pushFrame(frame: AudioFrame): Promise { + if (frame.data.byteLength === 0) return; + this._enqueue( + new ClientMessageCtor({ + message: { + case: 'inputAudio', + value: new InputAudio({ + audio: new Uint8Array(frame.data.buffer, frame.data.byteOffset, frame.data.byteLength), + numSamples: frame.samplesPerChannel, + createdAt: nowTimestamp(), + }), + }, + }), + ); + } + + async flush(_sentinel: FlushSentinel): Promise { + this._enqueue( + new ClientMessageCtor({ message: { case: 'sessionFlush', value: new SessionFlush() } }), + ); + } + + detach(): void { + this._detached = true; + // Abort the active run: this releases the audio-drain reader lock (held by + // `stream._drainAudioChannel`) so a swapped-in transport can re-acquire the + // shared audio stream, and unblocks the recv/send tasks below. + this._runAbort?.abort(); + void this._sendChannel?.close(); + const ws = this._ws; + this._ws = undefined; + try { + ws?.close(); + } catch { + // ignore + } + } + + private _enqueue(msg: ClientMsg): void { + // The WS handle is cleared synchronously by `detach()` while + // `_sendChannel.close()` is still in flight (its `closed` flag flips + // asynchronously). Gate on `_ws` to drop late control hooks that the + // stream FSM may fire after the transport is being torn down. 
+ if (this._ws === undefined || this._ws.readyState !== WS_OPEN) return; + const channel = this._sendChannel; + if (channel === undefined || channel.closed) return; + void channel.write(msg).catch(() => {}); + } + + private async _defaultConnect(): Promise { + let baseUrl = this._cloudOpts.baseUrl; + if (baseUrl.startsWith('http://')) baseUrl = baseUrl.replace('http://', 'ws://'); + else if (baseUrl.startsWith('https://')) baseUrl = baseUrl.replace('https://', 'wss://'); + const token = await createAccessToken(this._cloudOpts.apiKey, this._cloudOpts.apiSecret); + const headers = { ...buildMetadataHeaders(), Authorization: `Bearer ${token}` }; + const ws = await connectWs(`${baseUrl}/eot`, headers, this._connOptions.timeoutMs); + return ws as unknown as CloudWebSocket; + } + + private _warnTransportLatency(msg: ServerMsg): void { + const clientCreatedAtMs = timestampToMs(msg.clientCreatedAt); + const transportLatency = Date.now() - clientCreatedAtMs; + if (transportLatency > 500 && clientCreatedAtMs > 0) { + this._logger.warn( + { transportLatencyMs: transportLatency }, + 'turn detection transport latency is too high', + ); + } + } + + protected _processServerMessage(msg: ServerMsg): void { + const stream = this._streamRef?.deref(); + if (stream === undefined) return; + const kind = msg.message.case; + if (kind === 'eotPrediction') { + const prediction = msg.message.value; + const stats = prediction.inferenceStats; + const requestSentAtMs = timestampToMs(stats?.latestClientCreatedAt); + const detectionDelayMs = requestSentAtMs > 0 ? Date.now() - requestSentAtMs : 0; + const inferenceDurationMs = durationToMs(stats?.serverE2eLatency); + stream._resolvePrediction(msg.requestId ?? 
'', prediction.probability, { + detectionDelay: detectionDelayMs, + inferenceDuration: inferenceDurationMs, + }); + const detector = this._detectorRef.deref(); + if (detector !== undefined) { + detector.emit('metrics_collected', { + type: 'eot_inference_metrics', + timestamp: Date.now(), + totalDuration: durationToMs(stats?.clientE2eLatency), + predictionDuration: inferenceDurationMs, + detectionDelay: detectionDelayMs, + numRequests: 1, + metadata: { modelName: detector.model, modelProvider: detector.provider }, + }); + } + } else if (kind === 'error') { + const err = msg.message.value; + throw new APIStatusError({ + message: err.message, + options: { statusCode: err.code, requestId: msg.requestId }, + }); + } else if (kind === 'sessionCreated') { + this._warnTransportLatency(msg); + const created = msg.message.value; + // Adopt the gateway's calibrated default thresholds. A degenerate + // response (no usable thresholds) throws a non-retryable `APIError` that + // propagates out of the recv task → `run()` → the stream's cloud→local + // fallback. + stream.thresholdsOptions._updateDefaults(created.defaultThresholds, created.defaultThreshold); + this._logger.debug( + { + model: stream.thresholdsOptions.model, + thresholds: stream.thresholdsOptions.thresholds, + defaultThreshold: stream.thresholdsOptions.defaultThreshold, + overrides: stream.thresholdsOptions.overrides, + }, + 'audio turn detector initialized', + ); + } else if ( + kind === 'sessionClosed' || + kind === 'inferenceStarted' || + kind === 'inferenceStopped' + ) { + this._warnTransportLatency(msg); + } else { + this._logger.warn({ kind }, 'unexpected turn detector message'); + } + } + + async run(): Promise { + const maxRetries = this._connOptions.maxRetry; + while (!this._detached && this._numRetries <= maxRetries) { + try { + await this._runOnce(); + return; + } catch (err) { + // A detach (e.g. 
cloud→local fallback) tears the session down; don't + // surface that as a connection error or retry into a reconnect. + if (this._detached) return; + if (!(err instanceof APIError) || maxRetries === 0 || !err.retryable) throw err; + if (this._numRetries === maxRetries) { + throw new APIConnectionError({ + message: `failed to connect livekit turn detector after ${this._numRetries} attempts`, + }); + } + const retryIntervalMs = intervalForRetry(this._connOptions, this._numRetries); + this._logger.warn( + { err: err.message, attempt: this._numRetries, retryIntervalMs }, + 'livekit turn detector connection failed; retrying', + ); + await delay(retryIntervalMs); + this._numRetries += 1; + } + } + } + + protected async _runOnce(): Promise { + const stream = this._streamRef?.deref(); + if (stream === undefined) return; + + // Per-run abort: `detach()` fires it to release the audio-drain reader + // lock and stop the recv/send tasks without a spurious "closed" throw. + const runAbort = new AbortController(); + this._runAbort = runAbort; + + this._connectCalls += 1; + const ws = await (this._connectImpl ?? this._defaultConnect.bind(this))(); + + // Detached while the handshake was in flight — don't revive the session. + if (this._detached) { + try { + ws.close(); + } catch { + // ignore + } + return; + } + + // Successful connect — reset transient-failure counter so drops across + // the session lifetime don't accumulate toward maxRetry. + this._numRetries = 0; + this._ws = ws; + const sendChannel = createStreamChannel(); + this._sendChannel = sendChannel; + + // Send the SessionCreate handshake first, before any queued control msg. 
+ ws.send( + new ClientMessageCtor({ + message: { + case: 'sessionCreate', + value: new SessionCreate({ + settings: new SessionSettings({ + sampleRate: this._opts.sampleRate, + encoding: AudioEncoding.PCM_S16LE, + }), + }), + }, + createdAt: nowTimestamp(), + }).toBinary(), + ); + + let closingWs = false; + let socketErr: Error | undefined; + // Closing the recv channel makes the reader drain buffered frames and then + // observe `done`; we use it (not `abort`) on socket close/error so the + // post-drain throw below still decides the outcome. + const recvChannel = createStreamChannel(); + + ws.on('message', (data) => { + const chunk = + data instanceof Buffer + ? new Uint8Array(data.buffer, data.byteOffset, data.byteLength) + : Array.isArray(data) + ? new Uint8Array(Buffer.concat(data)) + : new Uint8Array(data); + void recvChannel.write(chunk).catch(() => {}); + }); + ws.on('close', () => { + void recvChannel.close(); + void sendChannel.close(); + }); + ws.on('error', (err) => { + socketErr = err; + void recvChannel.close(); + void sendChannel.close(); + }); + + const drainAudioTask = Task.from(async () => { + await stream._drainAudioChannel(runAbort.signal); + // Detached mid-drain (fallback/teardown): the lock is already released; + // skip the graceful sessionClose — the session is being abandoned. + if (runAbort.signal.aborted) return; + closingWs = true; + this._enqueue( + new ClientMessageCtor({ message: { case: 'sessionClose', value: new SessionClose() } }), + ); + // Close after enqueue so the sender flushes `sessionClose` before exiting. 
+ await sendChannel.close(); + }); + + const senderTask = Task.from(async () => { + const reader = sendChannel.stream().getReader(); + try { + while (true) { + const { done, value: msg } = await reader.read(); + if (done) return; + if (msg.createdAt === undefined) msg.createdAt = nowTimestamp(); + if (ws.readyState !== WS_OPEN) return; + try { + ws.send(msg.toBinary()); + } catch { + return; + } + } + } finally { + reader.releaseLock(); + } + }); + + const recvTask = Task.from(async () => { + const reader = recvChannel.stream().getReader(); + try { + while (true) { + const { done, value: chunk } = await reader.read(); + if (done) break; + this._processServerMessage(ServerMessageCtor.fromBinary(chunk)); + } + } finally { + reader.releaseLock(); + } + // A detach-driven ws close is expected teardown, not a failure. + if (socketErr !== undefined && !closingWs && !runAbort.signal.aborted) { + throw new APIConnectionError({ + message: `turn detector connection error: ${socketErr.message}`, + options: { retryable: false }, + }); + } + if (!closingWs && !runAbort.signal.aborted) { + throw new APIStatusError({ + message: 'turn detector connection closed unexpectedly', + options: { statusCode: -1, retryable: false }, + }); + } + }); + + try { + await Promise.all([drainAudioTask.result, senderTask.result, recvTask.result]); + } finally { + drainAudioTask.cancel(); + senderTask.cancel(); + recvTask.cancel(); + void sendChannel.close(); + void recvChannel.close(); + this._ws = undefined; + try { + ws.close(); + } catch { + // ignore + } + } + } +} + +// Re-export the transport interface from the FSM module so callers that +// import `StreamingTurnDetectionTransport` from this package barrel see the +// same type. +export type { StreamingTurnDetectionTransport }; +// Expose APIError so detector + fallback code can narrow on it. 
+export type { APIError }; diff --git a/agents/src/inference/index.ts b/agents/src/inference/index.ts index e77981c79..28bd2b1ad 100644 --- a/agents/src/inference/index.ts +++ b/agents/src/inference/index.ts @@ -1,10 +1,28 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import * as eot from './eot/index.js'; import * as llm from './llm.js'; import * as stt from './stt.js'; import * as tts from './tts.js'; +export { eot }; +export { + TurnDetector, + TurnDetectorStreamImpl, + LOCAL_LANGUAGES, + CloudTransport, + LocalTransport, + ThresholdOptions, + type TurnDetectorOptions, + type CloudTransportOptions, + type ThresholdOverride, + type TurnDetectorModel, + type TurnDetectorVersion, +} from './eot/index.js'; + +export { VAD, type VADOptions, type VADModels } from './vad.js'; + export { LLM, LLMStream, diff --git a/agents/src/inference/stt.test.ts b/agents/src/inference/stt.test.ts index 97a499daa..6e119cab5 100644 --- a/agents/src/inference/stt.test.ts +++ b/agents/src/inference/stt.test.ts @@ -13,6 +13,7 @@ import { normalizeSTTFallback, parseSTTModelString, } from './stt.js'; +import { VAD as InferenceVAD } from './vad.js'; beforeAll(() => { initializeLogger({ level: 'silent', pretty: false }); @@ -343,9 +344,10 @@ describe('STT VAD handling for Speechmatics models', () => { await expect(stt.vadPromise).resolves.toBeUndefined(); }); - it('speechmatics model with no user vad sets up a silero loader', () => { + it('speechmatics model with no user vad falls back to the inference VAD', async () => { const stt = makeStt({ model: 'speechmatics/enhanced' }); - expect(typeof stt['vad']).toBe('function'); + expect(stt['vad']).toBeInstanceOf(InferenceVAD); + await expect(stt.vadPromise).resolves.toBe(stt['vad']); }); it('speechmatics model with user vad uses that vad', async () => { @@ -372,11 +374,11 @@ describe('STT VAD handling for Speechmatics models', () => { await expect(stt.vadPromise).resolves.toBeUndefined(); }); - 
it('updateOptions non-speechmatics → speechmatics sets up silero loader', () => { + it('updateOptions non-speechmatics → speechmatics falls back to the inference VAD', () => { const stt = makeStt({ model: 'deepgram/nova-3' }); expect(stt['vad']).toBeUndefined(); stt.updateOptions({ model: 'speechmatics/enhanced' }); - expect(typeof stt['vad']).toBe('function'); + expect(stt['vad']).toBeInstanceOf(InferenceVAD); }); }); diff --git a/agents/src/inference/stt.ts b/agents/src/inference/stt.ts index 2acb1bc80..d0d7bf7a5 100644 --- a/agents/src/inference/stt.ts +++ b/agents/src/inference/stt.ts @@ -27,6 +27,7 @@ import { sttServerEventSchema, } from './api_protos.js'; import { type AnyString, connectWs, createAccessToken, getDefaultInferenceUrl } from './utils.js'; +import { VAD as InferenceVAD } from './vad.js'; export type DeepgramModels = | 'deepgram/nova-3' @@ -281,41 +282,20 @@ export function normalizeSTTFallback( return [makeFallback(fallback)]; } -type VADSource = VAD | (() => Promise); - function isSpeechmaticsModel(model: string | undefined): boolean { return model?.startsWith('speechmatics/') ?? 
false; } -function loadSileroVAD(model: string): () => Promise { - return async () => { - try { - const dynamicImport = (specifier: string) => - import(specifier) as Promise<{ VAD: { load(): Promise } }>; - const { VAD: SileroVAD } = await dynamicImport('@livekit/agents-plugin-silero'); - return SileroVAD.load(); - } catch (e) { - throw new Error( - `@livekit/agents-plugin-silero is required: model ${JSON.stringify( - model, - )} does not handle endpointing server-side.`, - { cause: e }, - ); - } - }; -} - -function resolveVADForModel( - model: string | undefined, - vad: VAD | undefined, -): VADSource | undefined { +function resolveVADForModel(model: string | undefined, vad: VAD | undefined): VAD | undefined { const speechmatics = isSpeechmaticsModel(model); if (vad && !speechmatics) { log().warn({ model }, '`vad` will be ignored: model handles endpointing server-side'); return undefined; } if (speechmatics && vad === undefined) { - return loadSileroVAD(model!); + // Speechmatics doesn't endpoint server-side, so fall back to the in-tree + // local inference VAD rather than the deprecated silero plugin. + return new InferenceVAD(); } return vad; } @@ -345,17 +325,16 @@ export interface InferenceSTTOptions { export class STT extends BaseSTT { private opts: InferenceSTTOptions; private streams: Set> = new Set(); - private vad?: VADSource; + private vad?: VAD; private _vadPromise?: Promise; /** * Resolves to the VAD instance for the current model, or `undefined` if the model - * handles endpointing server-side. Lazily computed on first read so callers that - * never need VAD don't pay the cost of loading Silero. + * handles endpointing server-side. Lazily computed on first read. */ get vadPromise(): Promise { if (this._vadPromise === undefined) { - this._vadPromise = typeof this.vad === 'function' ? 
this.vad() : Promise.resolve(this.vad); + this._vadPromise = Promise.resolve(this.vad); } return this._vadPromise; } @@ -488,10 +467,7 @@ export class STT extends BaseSTT { }; if (nextOpts.model !== undefined) { - this.vad = resolveVADForModel( - nextOpts.model, - this.vad && typeof this.vad !== 'function' ? this.vad : undefined, - ); + this.vad = resolveVADForModel(nextOpts.model, this.vad); this._vadPromise = undefined; } diff --git a/agents/src/inference/utils.ts b/agents/src/inference/utils.ts index 2849ed9ab..e1c545470 100644 --- a/agents/src/inference/utils.ts +++ b/agents/src/inference/utils.ts @@ -4,7 +4,7 @@ import { ThrowsPromise } from '@livekit/throws-transformer/throws'; import { AccessToken } from 'livekit-server-sdk'; import { WebSocket } from 'ws'; -import { APIConnectionError, APIStatusError } from '../_exceptions.js'; +import { APIConnectionError, APIStatusError, APITimeoutError } from '../_exceptions.js'; import { getJobContext } from '../job.js'; import { version } from '../version.js'; @@ -97,7 +97,7 @@ export async function connectWs( const socket = new WebSocket(url, { headers: { ...buildMetadataHeaders(), ...headers } }); const timeout = setTimeout(() => { - reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' })); + reject(new APITimeoutError({ message: 'Timeout connecting to LiveKit WebSocket' })); }, timeoutMs); const onOpen = () => { diff --git a/agents/src/inference/vad.test.ts b/agents/src/inference/vad.test.ts new file mode 100644 index 000000000..fdfa926b8 --- /dev/null +++ b/agents/src/inference/vad.test.ts @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { beforeAll, describe, expect, it } from 'vitest'; +import { initializeLogger } from '../log.js'; +import type { VADStream } from '../vad.js'; +import { VAD, type VADOptions } from './vad.js'; + +beforeAll(() => { + initializeLogger({ level: 'silent', pretty: false }); +}); + +/** White-box view of an `InferenceVADStream`'s internal buffer state. */ +type StreamInternals = { + _opts: VADOptions; + _speechBuffer: Int16Array | null; + _prefixPaddingSamples: number; + _inputSampleRate: number; +}; + +const internals = (stream: VADStream): StreamInternals => stream as unknown as StreamInternals; + +describe('inference.VAD updateOptions propagation', () => { + it('fans out option changes to live streams', () => { + const vad = new VAD({ minSilenceDuration: 250 }); + const stream = vad.stream(); + try { + expect(internals(stream)._opts.minSilenceDuration).toBe(250); + + vad.updateOptions({ minSilenceDuration: 800 }); + + // The already-created stream observes the new value, not a stale snapshot. + expect(internals(stream)._opts.minSilenceDuration).toBe(800); + } finally { + stream.close(); + } + }); + + it('resizes a live stream speech buffer once the sample rate is known', () => { + const sampleRate = 16000; + const vad = new VAD({ maxBufferedSpeech: 10_000, prefixPaddingDuration: 500 }); + const stream = vad.stream(); + try { + // Simulate a stream that has already seen its first frame. 
+ const s = internals(stream); + s._inputSampleRate = sampleRate; + s._prefixPaddingSamples = Math.trunc((500 * sampleRate) / 1000); + s._speechBuffer = new Int16Array( + Math.trunc((10_000 * sampleRate) / 1000) + s._prefixPaddingSamples, + ); + + vad.updateOptions({ maxBufferedSpeech: 20_000, prefixPaddingDuration: 1000 }); + + const expectedPrefix = Math.trunc((1000 * sampleRate) / 1000); + expect(s._prefixPaddingSamples).toBe(expectedPrefix); + expect(s._speechBuffer?.length).toBe( + Math.trunc((20_000 * sampleRate) / 1000) + expectedPrefix, + ); + } finally { + stream.close(); + } + }); +}); diff --git a/agents/src/inference/vad.ts b/agents/src/inference/vad.ts new file mode 100644 index 000000000..ab1034174 --- /dev/null +++ b/agents/src/inference/vad.ts @@ -0,0 +1,452 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Voice Activity Detection backed by `@livekit/local-inference`. + * + * Provides the same streaming VAD shape as `plugins/silero` but routes + * inference through the bundled native model so a default instance can be + * auto-provisioned by `AgentSession` without an explicit plugin import. + * + * Port of Python `livekit.agents.inference.vad`. + */ +import { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node'; +import { log } from '../log.js'; +import { VAD as BaseVAD, VADStream as BaseVADStream, VADEventType } from '../vad.js'; +import { _getLocalInferenceModule } from './_warmup.js'; + +const SLOW_INFERENCE_THRESHOLD_MS = 200; +const MODEL_SAMPLE_RATE = 16000; + +export type VADModels = 'silero'; + +export interface VADOptions { + /** Minimum speech duration (ms) before reporting START_OF_SPEECH. */ + minSpeechDuration: number; + /** Trailing silence (ms) before reporting END_OF_SPEECH. */ + minSilenceDuration: number; + /** Pre-roll (ms) included in the speech buffer ahead of START_OF_SPEECH. 
*/ + prefixPaddingDuration: number; + /** Maximum (ms) of buffered speech per utterance. */ + maxBufferedSpeech: number; + /** Sigmoid probability threshold for activation. */ + activationThreshold: number; + /** Sigmoid probability threshold for deactivation (defaults to + * `max(activationThreshold - 0.15, 0.01)`). */ + deactivationThreshold: number; +} + +const defaultVADOptions: VADOptions = { + minSpeechDuration: 50, + // 250ms (= MIN_SILENCE_DURATION_MS + 50) so the default satisfies the audio + // end-of-turn detector's silence-window requirement out of the box. + minSilenceDuration: 250, + prefixPaddingDuration: 500, + maxBufferedSpeech: 60_000, + activationThreshold: 0.5, + deactivationThreshold: 0.35, +}; + +export class VAD extends BaseVAD { + protected _opts: VADOptions; + protected _model: VADModels; + label = 'inference.VAD'; + // Live streams, tracked weakly so they don't outlive their consumers. JS + // `WeakSet` isn't iterable, so we hold `WeakRef`s in a `Set` and prune dead + // entries on iteration — the iterable equivalent of Python's `weakref.WeakSet`. + #streams = new Set>(); + + constructor(opts: Partial & { model?: VADModels } = {}) { + super({ updateInterval: 32 }); + const model: VADModels = opts.model ?? 'silero'; + if (model !== 'silero') { + throw new Error(`Unknown VAD model: ${String(model)}. Supported: 'silero'.`); + } + if (opts.deactivationThreshold !== undefined && opts.deactivationThreshold <= 0) { + throw new Error('deactivationThreshold must be greater than 0'); + } + this._model = model; + const activation = opts.activationThreshold ?? defaultVADOptions.activationThreshold; + this._opts = { + ...defaultVADOptions, + ...opts, + activationThreshold: activation, + deactivationThreshold: opts.deactivationThreshold ?? 
Math.max(activation - 0.15, 0.01), + }; + } + + get model(): string { + return this._model; + } + + get provider(): string { + return 'livekit-local-inference'; + } + + override get minSilenceDuration(): number { + return this._opts.minSilenceDuration; + } + + /** Update one or more knobs at runtime, propagating to live streams. */ + updateOptions(opts: Partial): void { + this._opts = { ...this._opts, ...opts }; + for (const ref of this.#streams) { + const stream = ref.deref(); + if (stream === undefined) { + this.#streams.delete(ref); + continue; + } + stream.updateOptions(opts); + } + } + + stream(): BaseVADStream { + // Each stream owns its own options snapshot so its `updateOptions` can read + // the prior `maxBufferedSpeech` before this VAD's copy is mutated. + const stream = new InferenceVADStream(this, { ...this._opts }); + this.#streams.add(new WeakRef(stream)); + return stream; + } +} + +class InferenceVADStream extends BaseVADStream { + private _opts: VADOptions; + private _logger = log(); + private _nativeVad: + | ReturnType>['createVad']> + | undefined; + private _windowSamples: number; + private _inputSampleRate = 0; + private _resampler: AudioResampler | undefined; + private _speechBuffer: Int16Array | null = null; + private _speechBufferMaxReached = false; + private _prefixPaddingSamples = 0; + private _pumpTask: Promise; + + constructor(parent: VAD, opts: VADOptions) { + super(parent); + this._opts = opts; + const mod = _getLocalInferenceModule(); + if (mod === undefined) { + this._logger.warn( + 'inference.VAD created without @livekit/local-inference; stream will be a no-op', + ); + this._windowSamples = 512; + } else { + this._nativeVad = mod.createVad(); + this._windowSamples = mod.VAD_WINDOW_SAMPLES; + } + this._pumpTask = this._pump().catch((err) => { + this._logger.error( + { err: err instanceof Error ? err.message : String(err) }, + 'VAD pump failed', + ); + }); + } + + /** + * Apply updated options to this live stream. 
Once the input sample rate is + * known, recomputes the prefix-padding pre-roll and resizes the speech + * buffer in place, preserving any audio already accumulated. + */ + updateOptions(opts: Partial): void { + const oldMaxBufferedSpeech = this._opts.maxBufferedSpeech; + this._opts = { ...this._opts, ...opts }; + + if (this._inputSampleRate && this._speechBuffer !== null) { + this._prefixPaddingSamples = Math.trunc( + (this._opts.prefixPaddingDuration * this._inputSampleRate) / 1000, + ); + const bufferSize = + Math.trunc((this._opts.maxBufferedSpeech * this._inputSampleRate) / 1000) + + this._prefixPaddingSamples; + const resized = new Int16Array(bufferSize); + resized.set(this._speechBuffer.subarray(0, Math.min(this._speechBuffer.length, bufferSize))); + this._speechBuffer = resized; + + if (this._opts.maxBufferedSpeech > oldMaxBufferedSpeech) { + this._speechBufferMaxReached = false; + } + } + } + + private async _pump(): Promise { + let pubSpeaking = false; + let pubSpeechDurationMs = 0; + let pubSilenceDurationMs = 0; + let pubCurrentSample = 0; + let pubTimestampMs = 0; + let speechThresholdDurationMs = 0; + let silenceThresholdDurationMs = 0; + let inputFrames: AudioFrame[] = []; + let inferenceFrames: AudioFrame[] = []; + let inputCopyRemainingFrac = 0; + let extraInferenceTime = 0; + // Write cursor into `_speechBuffer`. The buffer holds: + // [ ...prefix-padding (sliding pre-roll) ..., ...active speech... ] + // and is reset on END_OF_SPEECH (and on silence while idle) so the next + // turn starts from a fresh pre-roll window. + let speechBufferIndex = 0; + + const resetWriteCursor = () => { + if (this._speechBuffer === null) return; + if (speechBufferIndex <= this._prefixPaddingSamples) return; + // Slide the most-recent `prefixPaddingSamples` samples to the head + // of the buffer so the next utterance has continuous pre-roll + // context (the audio that immediately preceded START_OF_SPEECH). 
+ const paddingData = this._speechBuffer.subarray( + speechBufferIndex - this._prefixPaddingSamples, + speechBufferIndex, + ); + this._speechBuffer.set(paddingData, 0); + speechBufferIndex = this._prefixPaddingSamples; + this._speechBufferMaxReached = false; + }; + + const resetState = () => { + this._nativeVad?.reset(); + + speechBufferIndex = 0; + this._speechBufferMaxReached = false; + this._speechBuffer?.fill(0); + + pubSpeaking = false; + pubSpeechDurationMs = 0; + pubSilenceDurationMs = 0; + pubCurrentSample = 0; + pubTimestampMs = 0; + speechThresholdDurationMs = 0; + silenceThresholdDurationMs = 0; + + inputFrames = []; + inferenceFrames = []; + inputCopyRemainingFrac = 0; + extraInferenceTime = 0; + + this._resampler?.close?.(); + if (this._inputSampleRate && this._inputSampleRate !== MODEL_SAMPLE_RATE) { + this._resampler = new AudioResampler( + this._inputSampleRate, + MODEL_SAMPLE_RATE, + 1, + AudioResamplerQuality.QUICK, + ); + } else { + this._resampler = undefined; + } + }; + + const copySpeechBuffer = (): AudioFrame => { + if (this._speechBuffer === null) { + return new AudioFrame(new Int16Array(0), this._inputSampleRate, 1, 0); + } + return new AudioFrame( + this._speechBuffer.subarray(0, speechBufferIndex), + this._inputSampleRate, + 1, + speechBufferIndex, + ); + }; + + while (!this.closed) { + const { done, value: frame } = await this.inputReader.read(); + if (done) break; + if (typeof frame === 'symbol') { + resetState(); + continue; + } + + if (!this._inputSampleRate) { + this._inputSampleRate = frame.sampleRate; + this._prefixPaddingSamples = Math.trunc( + (this._opts.prefixPaddingDuration * this._inputSampleRate) / 1000, + ); + const bufferSize = + Math.trunc((this._opts.maxBufferedSpeech * this._inputSampleRate) / 1000) + + this._prefixPaddingSamples; + this._speechBuffer = new Int16Array(bufferSize); + if (this._inputSampleRate !== MODEL_SAMPLE_RATE) { + this._resampler = new AudioResampler( + this._inputSampleRate, + MODEL_SAMPLE_RATE, + 
1, + AudioResamplerQuality.QUICK, + ); + } + } else if (frame.sampleRate !== this._inputSampleRate) { + this._logger.error('a frame with a different sample rate was already pushed'); + continue; + } + + if (this._speechBuffer === null) continue; + + inputFrames.push(frame); + if (this._resampler !== undefined) { + inferenceFrames.push(...this._resampler.push(frame)); + } else { + inferenceFrames.push(frame); + } + + while (!this.closed) { + const startTime = performance.now(); + const availableInferenceSamples = inferenceFrames.reduce( + (acc, f) => acc + f.samplesPerChannel, + 0, + ); + if (availableInferenceSamples < this._windowSamples) break; + + const inputFrame = mergeFrames(inputFrames); + const inferenceFrame = mergeFrames(inferenceFrames); + const inferenceWindow = inferenceFrame.data.subarray(0, this._windowSamples); + + let p = 0.0; + if (this._nativeVad !== undefined) { + p = await this._nativeVad.predict(inferenceWindow); + } + + const windowDurationMs = (this._windowSamples / MODEL_SAMPLE_RATE) * 1000; + pubCurrentSample += this._windowSamples; + pubTimestampMs += windowDurationMs; + const resamplingRatio = this._inputSampleRate / MODEL_SAMPLE_RATE; + const toCopy = this._windowSamples * resamplingRatio + inputCopyRemainingFrac; + const toCopyInt = Math.trunc(toCopy); + inputCopyRemainingFrac = toCopy - toCopyInt; + + // Append the input-rate samples we just consumed into the + // speech buffer so START_OF_SPEECH / END_OF_SPEECH events can + // hand downstream consumers (STT, transcription) the prefix- + // padded audio they need. 
+ const availableSpace = this._speechBuffer.length - speechBufferIndex; + const toCopyBuffer = Math.min(toCopyInt, availableSpace); + if (toCopyBuffer > 0) { + this._speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex); + speechBufferIndex += toCopyBuffer; + } else if (!this._speechBufferMaxReached) { + this._speechBufferMaxReached = true; + this._logger.warn( + 'maxBufferedSpeech reached, ignoring further data for the current speech input', + ); + } + + const inferenceDuration = performance.now() - startTime; + extraInferenceTime = Math.max(0, extraInferenceTime + inferenceDuration - windowDurationMs); + // Guard on the per-window inference duration (not the accumulated slack) + // to match Python; the accumulated value is still surfaced as the delay. + if (inferenceDuration > SLOW_INFERENCE_THRESHOLD_MS) { + this._logger.warn( + { extraInferenceTimeMs: extraInferenceTime }, + 'VAD slower than realtime', + ); + } + + if (pubSpeaking) pubSpeechDurationMs += windowDurationMs; + else pubSilenceDurationMs += windowDurationMs; + + this.sendVADEvent({ + type: VADEventType.INFERENCE_DONE, + samplesIndex: pubCurrentSample, + timestamp: pubTimestampMs, + silenceDuration: pubSilenceDurationMs, + speechDuration: pubSpeechDurationMs, + probability: p, + inferenceDuration, + frames: [ + new AudioFrame( + inputFrame.data.subarray(0, toCopyInt), + this._inputSampleRate, + 1, + toCopyInt, + ), + ], + speaking: pubSpeaking, + rawAccumulatedSilence: silenceThresholdDurationMs, + rawAccumulatedSpeech: speechThresholdDurationMs, + }); + + if ( + p >= this._opts.activationThreshold || + (pubSpeaking && p > this._opts.deactivationThreshold) + ) { + speechThresholdDurationMs += windowDurationMs; + silenceThresholdDurationMs = 0; + if (!pubSpeaking && speechThresholdDurationMs >= this._opts.minSpeechDuration) { + pubSpeaking = true; + pubSilenceDurationMs = 0; + pubSpeechDurationMs = speechThresholdDurationMs; + this.sendVADEvent({ + type: 
VADEventType.START_OF_SPEECH, + samplesIndex: pubCurrentSample, + timestamp: pubTimestampMs, + silenceDuration: pubSilenceDurationMs, + speechDuration: pubSpeechDurationMs, + probability: p, + inferenceDuration, + frames: [copySpeechBuffer()], + speaking: true, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }); + } + } else { + silenceThresholdDurationMs += windowDurationMs; + speechThresholdDurationMs = 0; + // Keep a sliding pre-roll window while we're not in active + // speech — without this the buffer would fill with idle + // silence and the next START_OF_SPEECH would lose its + // prefix-padding context. + if (!pubSpeaking) resetWriteCursor(); + if (pubSpeaking && silenceThresholdDurationMs >= this._opts.minSilenceDuration) { + pubSpeaking = false; + pubSilenceDurationMs = silenceThresholdDurationMs; + this.sendVADEvent({ + type: VADEventType.END_OF_SPEECH, + samplesIndex: pubCurrentSample, + timestamp: pubTimestampMs, + silenceDuration: pubSilenceDurationMs, + speechDuration: Math.max(0, pubSpeechDurationMs - silenceThresholdDurationMs), + probability: p, + inferenceDuration, + frames: [copySpeechBuffer()], + speaking: false, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }); + pubSpeechDurationMs = 0; + resetWriteCursor(); + } + } + + inputFrames = []; + inferenceFrames = []; + if (inputFrame.data.length > toCopyInt) { + const data = inputFrame.data.subarray(toCopyInt); + inputFrames.push(new AudioFrame(data, this._inputSampleRate, 1, Math.trunc(data.length))); + } + if (inferenceFrame.data.length > this._windowSamples) { + const data = inferenceFrame.data.subarray(this._windowSamples); + inferenceFrames.push(new AudioFrame(data, MODEL_SAMPLE_RATE, 1, Math.trunc(data.length))); + } + } + } + this._resampler?.close?.(); + } +} + +/** Minimal frame-merging helper. The silero plugin uses `mergeFrames` from + * the agents package — for the inference VAD we keep a local copy to avoid + * an import cycle through `index.ts`. 
*/ +function mergeFrames(frames: AudioFrame[]): AudioFrame { + if (frames.length === 1) return frames[0]!; + const sampleRate = frames[0]!.sampleRate; + const channels = frames[0]!.channels; + let total = 0; + for (const f of frames) total += f.samplesPerChannel; + const buf = new Int16Array(total * channels); + let offset = 0; + for (const f of frames) { + buf.set(f.data, offset); + offset += f.samplesPerChannel * channels; + } + return new AudioFrame(buf, sampleRate, channels, total); +} diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts index f6af79ec9..3ae5546ac 100644 --- a/agents/src/metrics/base.ts +++ b/agents/src/metrics/base.ts @@ -15,6 +15,7 @@ export type AgentMetrics = | TTSMetrics | VADMetrics | EOUMetrics + | EOTInferenceMetrics | RealtimeModelMetrics | InterruptionMetrics | AvatarMetrics; @@ -197,6 +198,25 @@ export type RealtimeModelMetrics = { metadata?: MetricsMetadata; }; +/** + * Per-prediction telemetry for the audio EOT (end-of-turn) detector. Emitted + * by transports on each cloud or local prediction so we can track detection + * latency and inference time per call. + */ +export type EOTInferenceMetrics = { + type: 'eot_inference_metrics'; + timestamp: number; + /** Latest round-trip time (RTT) taken to perform inference, in milliseconds. */ + totalDuration: number; + /** Latest time taken by the model side, in milliseconds. */ + predictionDuration: number; + /** Latest total time from audio-frame creation to prediction receive, in milliseconds. */ + detectionDelay: number; + /** Number of prediction requests served (incremental).
*/ + numRequests: number; + metadata?: MetricsMetadata; +}; + export type InterruptionMetrics = { type: 'interruption_metrics'; timestamp: number; diff --git a/agents/src/metrics/index.ts b/agents/src/metrics/index.ts index 0438b0219..9f2726460 100644 --- a/agents/src/metrics/index.ts +++ b/agents/src/metrics/index.ts @@ -5,6 +5,7 @@ export type { AgentMetrics, AvatarMetrics, + EOTInferenceMetrics, EOUMetrics, InterruptionMetrics, LLMMetrics, @@ -17,6 +18,7 @@ export type { export { filterZeroValues, ModelUsageCollector, + type EOTModelUsage, type InterruptionModelUsage, type LLMModelUsage, type ModelUsage, diff --git a/agents/src/metrics/model_usage.ts b/agents/src/metrics/model_usage.ts index 5e723fb51..2a7da58ca 100644 --- a/agents/src/metrics/model_usage.ts +++ b/agents/src/metrics/model_usage.ts @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 import type { AgentMetrics, + EOTInferenceMetrics, InterruptionMetrics, LLMMetrics, RealtimeModelMetrics, @@ -84,7 +85,23 @@ export type InterruptionModelUsage = { totalRequests: number; }; -export type ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage | InterruptionModelUsage; +/** Aggregate per-provider usage for the audio EOT detector. */ +export type EOTModelUsage = { + type: 'eot_usage'; + /** The provider name (e.g., 'livekit'). */ + provider: string; + /** The model name (e.g., 'turn-detector-v1' for cloud, 'turn-detector-v1-mini' for local). */ + model: string; + /** Total number of EOT prediction requests served. */ + totalRequests: number; +}; + +export type ModelUsage = + | LLMModelUsage + | TTSModelUsage + | STTModelUsage + | InterruptionModelUsage + | EOTModelUsage; export function filterZeroValues(usage: T): Partial { const result: Partial = {} as Partial; @@ -102,10 +119,17 @@ export class ModelUsageCollector { private sttUsage: Map = new Map(); private interruptionUsage: Map = new Map(); + private eotUsage: Map = new Map(); /** Extract provider and model from metrics metadata. 
*/ private extractProviderModel( - metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics | InterruptionMetrics, + metrics: + | LLMMetrics + | STTMetrics + | TTSMetrics + | RealtimeModelMetrics + | InterruptionMetrics + | EOTInferenceMetrics, ): [string, string] { let provider = ''; let model = ''; @@ -195,6 +219,21 @@ export class ModelUsageCollector { return usage; } + private getEotUsage(provider: string, model: string): EOTModelUsage { + const key = `${provider}:${model}`; + let usage = this.eotUsage.get(key); + if (!usage) { + usage = { + type: 'eot_usage', + provider, + model, + totalRequests: 0, + }; + this.eotUsage.set(key, usage); + } + return usage; + } + /** Collect metrics and aggregate usage by model/provider. */ collect(metrics: AgentMetrics): void { if (metrics.type === 'llm_metrics') { @@ -239,8 +278,13 @@ export class ModelUsageCollector { const [provider, model] = this.extractProviderModel(metrics); const usage = this.getInterruptionUsage(provider, model); usage.totalRequests += metrics.numRequests; + } else if (metrics.type === 'eot_inference_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getEotUsage(provider, model); + usage.totalRequests += metrics.numRequests; } - // VAD and EOU metrics are not aggregated for usage tracking. + // VAD and EOU (session-level summary) metrics are not aggregated for + // usage tracking; only per-prediction EOT inference metrics are. 
} flatten(): ModelUsage[] { @@ -257,6 +301,9 @@ export class ModelUsageCollector { for (const u of this.interruptionUsage.values()) { result.push({ ...u }); } + for (const u of this.eotUsage.values()) { + result.push({ ...u }); + } return result; } } diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index 7c1bb159a..1f79eca63 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -65,6 +65,13 @@ export const ATTR_EOU_PROBABILITY = 'lk.eou.probability'; export const ATTR_EOU_UNLIKELY_THRESHOLD = 'lk.eou.unlikely_threshold'; export const ATTR_EOU_DELAY = 'lk.eou.endpointing_delay'; export const ATTR_EOU_LANGUAGE = 'lk.eou.language'; +/** Which signal triggered the EOU detection: 'vad' | 'stt' | 'manual'. */ +export const ATTR_EOU_SOURCE = 'lk.eou.source'; +/** True when the audio EOT detector resolved this prediction from its + * inference-window cache instead of running a fresh predict. */ +export const ATTR_EOU_FROM_CACHE = 'lk.eou.from_cache'; +/** Latest input-audio creation time → prediction receive time (ms). */ +export const ATTR_EOU_DETECTION_DELAY = 'lk.eou.detection_delay'; export const ATTR_USER_TRANSCRIPT = 'lk.user_transcript'; export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence'; export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay'; diff --git a/agents/src/utils.ts b/agents/src/utils.ts index e537ea4e5..56d561025 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -1418,6 +1418,33 @@ export function asError(maybeError: unknown): Error { return new Error(String(maybeError)); } +/** + * Resolve a value that may come from an explicit argument, one of several + * environment variables (checked in order), or a final default. + * + * Mirrors Python `livekit.agents.utils.resolve_env_var`. Used by inference + * transports to plumb credentials and URLs (e.g. `LIVEKIT_REMOTE_EOT_URL`, + * `LIVEKIT_INFERENCE_API_KEY`). 
+ */ +export function resolveEnvVar( + value: string | undefined, + envVars: readonly string[], + defaultValue = '', +): string { + // An explicit empty string is a provided value, returned as-is; only + // `undefined` falls through to env resolution. + if (value !== undefined) { + return value; + } + for (const name of envVars) { + const v = process.env[name]; + if (v !== undefined && v !== '') { + return v; + } + } + return defaultValue; +} + /** * Tagged template literal that strips common leading indentation from every line, * trims the first empty line and any trailing whitespace. diff --git a/agents/src/utils_env.test.ts b/agents/src/utils_env.test.ts new file mode 100644 index 000000000..71ecb531a --- /dev/null +++ b/agents/src/utils_env.test.ts @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Tests for the `resolveEnvVar` helper contract. + * + * Port of Python `tests/test_utils_env.py`. + */ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { resolveEnvVar } from './utils.js'; + +const ENV_KEYS = ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'] as const; +const saved: Record = {}; + +beforeEach(() => { + for (const k of ENV_KEYS) { + saved[k] = process.env[k]; + delete process.env[k]; + } +}); + +afterEach(() => { + for (const k of ENV_KEYS) { + if (saved[k] === undefined) delete process.env[k]; + else process.env[k] = saved[k]; + } +}); + +describe('resolveEnvVar', () => { + it('returns empty string when no env or default', () => { + expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'])).toBe(''); + }); + + it('returns default when no matching env exists', () => { + expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe( + 'https://default.example.com', + ); + }); + + it('returns first matching env value', () => { + process.env.LIVEKIT_INFERENCE_URL = 'https://inference.example.com'; + process.env.LIVEKIT_URL = 
'https://livekit.example.com'; + expect( + resolveEnvVar( + undefined, + ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'], + 'https://default.example.com', + ), + ).toBe('https://inference.example.com'); + }); + + it('falls back to later env when earlier env missing', () => { + process.env.LIVEKIT_URL = 'https://livekit.example.com'; + expect( + resolveEnvVar( + undefined, + ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'], + 'https://default.example.com', + ), + ).toBe('https://livekit.example.com'); + }); + + it('prefers explicit value over environment', () => { + process.env.LIVEKIT_INFERENCE_URL = 'https://env.example.com'; + expect( + resolveEnvVar( + 'https://explicit.example.com', + ['LIVEKIT_INFERENCE_URL'], + 'https://default.example.com', + ), + ).toBe('https://explicit.example.com'); + }); + + it('treats empty env value as missing', () => { + process.env.LIVEKIT_INFERENCE_URL = ''; + expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe( + 'https://default.example.com', + ); + }); + + it('treats whitespace env value as set', () => { + process.env.LIVEKIT_INFERENCE_URL = ' '; + expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe( + ' ', + ); + }); +}); diff --git a/agents/src/vad.ts b/agents/src/vad.ts index e946ae213..c0a2e8c16 100644 --- a/agents/src/vad.ts +++ b/agents/src/vad.ts @@ -28,11 +28,11 @@ export interface VADEvent { * Index of the audio sample where the event occurred, relative to the inference sample rate. */ samplesIndex: number; - /** Timestamp when the event was fired. */ + /** Timestamp (milliseconds since epoch) when the event was fired. */ timestamp: number; - /** Duration of the speech segment in seconds. */ + /** Duration of the speech segment in milliseconds. */ speechDuration: number; - /** Duration of the silence segment in seconds. */ + /** Duration of the silence segment in milliseconds. 
*/ silenceDuration: number; /** * List of audio frames associated with the speech. @@ -45,7 +45,7 @@ export interface VADEvent { frames: AudioFrame[]; /** Probability that speech is present (only for `INFERENCE_DONE` events). */ probability: number; - /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */ + /** Time taken to perform the inference, in milliseconds (only for `INFERENCE_DONE` events). */ inferenceDuration: number; /** Indicates whether speech was detected in the frames. */ speaking: boolean; @@ -77,6 +77,19 @@ export abstract class VAD extends (EventEmitter as new () => TypedEmitter 0) { const outputs = await ThrowsPromise.allSettled(tasks); @@ -226,6 +241,14 @@ export class AgentActivity implements RecognitionHooks { private isInterruptionByAudioActivityEnabled: boolean; private isDefaultInterruptionByAudioActivityEnabled: boolean; + /** + * Validated turn detection for this activity. Equals `this.turnDetection` + * except when a `BaseStreamingTurnDetector` instance fails the runtime preconditions + * (no VAD, or RealtimeModel with server-side turn detection enabled), in + * which case it is downgraded to `undefined` and a warning is logged. + */ + private _resolvedTurnDetection: TurnDetectionMode | undefined; + // for false interruption handling private pausedSpeech?: PausedSpeechInfo; private falseInterruptionTimer?: NodeJS.Timeout; @@ -292,8 +315,9 @@ export class AgentActivity implements RecognitionHooks { }); this.q_updated = new Future(); + this._resolvedTurnDetection = this._resolveTurnDetection(this.turnDetection); this.turnDetectionMode = - typeof this.turnDetection === 'string' ? this.turnDetection : undefined; + typeof this._resolvedTurnDetection === 'string' ? 
this._resolvedTurnDetection : undefined; if (this.turnDetectionMode === 'vad' && this.vad === undefined) { this.logger.warn( @@ -342,10 +366,13 @@ export class AgentActivity implements RecognitionHooks { this.turnDetectionMode = undefined; } - // fallback to VAD if server side turn detection is disabled and VAD is available + // fallback to VAD if server side turn detection is disabled and the + // user explicitly supplied a VAD. The bundled-default VAD is treated + // as absent here so behavior matches "no vad passed" sessions. if ( !this.llm.capabilities.turnDetection && this.vad && + !this.usingDefaultVad && this.turnDetectionMode === undefined ) { this.turnDetectionMode = 'vad'; @@ -516,12 +543,27 @@ export class AgentActivity implements RecognitionHooks { this.vad.on('metrics_collected', this.onMetricsCollected); } + if (this._resolvedTurnDetection instanceof BaseStreamingTurnDetector) { + this._resolvedTurnDetection.on('metrics_collected', this.onMetricsCollected); + } + + // Bundled-default VAD is treated as absent when the RealtimeModel does + // its own server-side turn detection — the realtime session is already + // canonical and an extra audio pipeline would just pay the native model + // load for no behavioral gain. User-supplied VADs still flow through + // (e.g. when the user wants adaptive interruption). + const realtimeUsesServerVad = + this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection === true; + const recognitionVad = this.usingDefaultVad && realtimeUsesServerVad ? undefined : this.vad; + this.audioRecognition = new AudioRecognition({ recognitionHooks: this, // Disable stt node if stt is not provided stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined, - vad: this.vad, - turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, + vad: recognitionVad, + usingDefaultVad: this.usingDefaultVad, + turnDetector: + typeof this._resolvedTurnDetection === 'string' ? 
undefined : this._resolvedTurnDetection, turnDetectionMode: this.turnDetectionMode, interruptionDetection: this.interruptionDetector, backchannelBoundary: @@ -538,20 +580,29 @@ export class AgentActivity implements RecognitionHooks { shouldDiscardAudioForStt: () => this.shouldDiscardInputAudio(), }); - if (reuseResources?.sttPipeline) { + const sttPipeline = reuseResources?.sttPipeline; + // carry the input epoch along with the reused pipeline: its stream clock + // is cumulative, so re-stamping inputStartedAt here would push STT-derived + // timestamps into the future and stall end-of-turn after every handoff + // (1.4.5 silence regression from #1603; see agent_task_handoff_eou.test.ts) + const sttInputStartedAt = reuseResources?.sttInputStartedAt; + const turnDetectorStream = reuseResources?.turnDetectorStream; + if (sttPipeline) { this.logger.debug('reusing STT pipeline from previous activity'); - // carry the input epoch along with the reused pipeline: its stream clock - // is cumulative, so re-stamping inputStartedAt here would push STT-derived - // timestamps into the future and stall end-of-turn after every handoff - // (1.4.5 silence regression from #1603; see agent_task_handoff_eou.test.ts) - await this.audioRecognition.start({ - sttPipeline: reuseResources.sttPipeline, - inputStartedAt: reuseResources.sttInputStartedAt, - }); - reuseResources.sttPipeline = undefined; // ownership transferred + } + if (turnDetectorStream) { + this.logger.debug('reusing turn detector stream from previous activity'); + } + await this.audioRecognition.start({ + sttPipeline, + inputStartedAt: sttInputStartedAt, + turnDetectorStream, + }); + if (reuseResources) { + // ownership transferred to the new AudioRecognition + reuseResources.sttPipeline = undefined; reuseResources.sttInputStartedAt = undefined; - } else { - await this.audioRecognition.start(); + reuseResources.turnDetectorStream = undefined; } this.started = true; @@ -592,6 +643,15 @@ export class AgentActivity 
implements RecognitionHooks { resources.sttInputStartedAt = this.audioRecognition.inputStartedAt; } + // reuse the turn detector stream during a handoff whenever we can + if ( + this.audioRecognition && + this._resolvedTurnDetection instanceof BaseStreamingTurnDetector && + this._resolvedTurnDetection === newActivity._resolvedTurnDetection + ) { + resources.turnDetectorStream = this.audioRecognition.detachTurnDetector(); + } + // rt session if ( this.realtimeSession && @@ -655,6 +715,18 @@ export class AgentActivity implements RecognitionHooks { return this.agent.vad || this.agentSession.vad; } + /** + * True iff the effective VAD for this activity is the framework-auto-provisioned + * default. False when the user passed `vad=` to either the agent or the + * session, even if the value happens to be the same silero model. + */ + get usingDefaultVad(): boolean { + if (this.agent.vad !== undefined) { + return false; + } + return this.agentSession._usingDefaultVad; + } + get stt(): STT | undefined { return this.agent.stt || this.agentSession.stt; } @@ -980,7 +1052,13 @@ export class AgentActivity implements RecognitionHooks { // -- Metrics and errors -- private onMetricsCollected = ( - ev: STTMetrics | TTSMetrics | VADMetrics | LLMMetrics | RealtimeModelMetrics, + ev: + | STTMetrics + | TTSMetrics + | VADMetrics + | LLMMetrics + | RealtimeModelMetrics + | EOTInferenceMetrics, ) => { const speechHandle = speechHandleStorage.getStore(); if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) { @@ -1032,7 +1110,11 @@ export class AgentActivity implements RecognitionHooks { onInputSpeechStarted(_ev: InputSpeechStartedEvent): void { this.logger.info('onInputSpeechStarted'); - if (!this.vad) { + // Bundled-default VAD is treated as absent here so the realtime + // session's own server-side turn detection drives the user-state / + // overlap-detection update, identical to a session that didn't + // configure any VAD. 
+ if (!this.vad || this.usingDefaultVad) { this.agentSession._updateUserState('speaking'); if (this.isInterruptionDetectionEnabled && this.audioRecognition) { this.audioRecognition.onStartOfOverlapSpeech( @@ -1058,7 +1140,7 @@ export class AgentActivity implements RecognitionHooks { onInputSpeechStopped(ev: InputSpeechStoppedEvent): void { this.logger.info(ev, 'onInputSpeechStopped'); - if (!this.vad) { + if (!this.vad || this.usingDefaultVad) { if (this.isInterruptionDetectionEnabled && this.audioRecognition) { this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); } @@ -1402,6 +1484,12 @@ export class AgentActivity implements RecognitionHooks { this.cancelSpeechPauseTask = this.cancelSpeechPause(); } + /** Forward audio EOT predictions up to the session so listeners (e.g. + * remote-session forwarders) can observe them. */ + onEotPrediction(ev: EotPredictionEvent): void { + this.agentSession.emit(AgentSessionEventTypes.EotPrediction, ev); + } + onPreemptiveGeneration(info: PreemptiveGenerationInfo): void { const preemptiveOpts = this.agentSession.sessionOptions.turnHandling.preemptiveGeneration; if ( @@ -3916,6 +4004,29 @@ export class AgentActivity implements RecognitionHooks { } } + private _resolveTurnDetection( + turnDetection: TurnDetectionMode | undefined, + ): TurnDetectionMode | undefined { + if (turnDetection !== undefined && typeof turnDetection !== 'string') { + if (turnDetection instanceof BaseStreamingTurnDetector) { + if (this.vad === undefined) { + this.logger.warn( + 'TurnDetector requires a VAD model. 
Pass vad=inference.VAD() to AgentSession/Agent or turnDetection=null to disable the default TurnDetector', + ); + return undefined; + } + if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) { + this.logger.warn( + 'turnDetection is a TurnDetector, but the LLM is a RealtimeModel with server-side turn detection enabled, ignoring the turnDetection setting', + ); + return undefined; + } + } + return turnDetection; + } + return turnDetection; + } + private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined { const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode; const sessionInterruptionDetection = this.agentSession.interruptionDetection; @@ -3924,7 +4035,7 @@ export class AgentActivity implements RecognitionHooks { this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && - this.vad && + this.vad !== undefined && this.turnDetection !== 'manual' && this.turnDetection !== 'realtime_llm' && !(this.llm instanceof RealtimeModel) @@ -4182,6 +4293,10 @@ export class AgentActivity implements RecognitionHooks { this.vad.off('metrics_collected', this.onMetricsCollected); } + if (this._resolvedTurnDetection instanceof BaseStreamingTurnDetector) { + this._resolvedTurnDetection.off('metrics_collected', this.onMetricsCollected); + } + this.detachAudioInput(); this.realtimeSpans?.clear(); await this.realtimeSession?.close(); diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 3cade1f2f..09c83a088 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -12,10 +12,13 @@ import { context as otelContext, trace } from '@opentelemetry/api'; import { EventEmitter } from 'node:events'; import type { ReadableStream } from 'node:stream/web'; import type { z } from 'zod'; +import type { BaseStreamingTurnDetector } from '../inference/eot/base.js'; import { LLM as InferenceLLM, STT as InferenceSTT, TTS as InferenceTTS, + 
TurnDetector as InferenceTurnDetector, + VAD as InferenceVAD, type LLMModels, type STTModelString, type TTSModelString, @@ -63,6 +66,7 @@ import { type CloseEvent, CloseReason, type ConversationItemAddedEvent, + type EotPredictionEvent, type ErrorEvent, type FunctionToolsExecutedEvent, type MetricsCollectedEvent, @@ -199,7 +203,13 @@ export type VoiceOptions = { maxEndpointingDelay?: number; }; -export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; +export type TurnDetectionMode = + | 'stt' + | 'vad' + | 'realtime_llm' + | 'manual' + | _TurnDetector + | BaseStreamingTurnDetector; export type AgentSessionCallbacks = { [AgentSessionEventTypes.UserInputTranscribed]: (ev: UserInputTranscribedEvent) => void; @@ -215,11 +225,18 @@ export type AgentSessionCallbacks = { [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void; [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void; [AgentSessionEventTypes.OverlappingSpeech]: (ev: OverlappingSpeechEvent) => void; + [AgentSessionEventTypes.EotPrediction]: (ev: EotPredictionEvent) => void; }; export type AgentSessionOptions = { stt?: STT | STTModelString; - vad?: VAD; + /** + * Voice Activity Detection. When omitted, `AgentSession` auto-provisions a + * bundled `inference.VAD({ model: 'silero' })` and marks it as the default + * (so sites that previously distinguished "user supplied a VAD" continue + * to treat the bundled one as absent). Pass `null` to opt out entirely. + */ + vad?: VAD | null; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; userData?: UserData; @@ -362,6 +379,15 @@ export class AgentSession< private _interruptionDetection?: InterruptionOptions['mode']; + /** + * True iff this session auto-provisioned the bundled silero VAD because the + * caller passed no `vad=`. Set once in the constructor; immutable from then + * on. Read it via `AgentActivity.usingDefaultVad` from voice-pipeline code. 
+ * + * @internal + */ + _usingDefaultVad: boolean = false; + /** @internal */ _usageCollector: ModelUsageCollector = new ModelUsageCollector(); @@ -438,7 +464,19 @@ export class AgentSession< DEFAULT_SESSION_CONNECT_OPTIONS.maxUnrecoverableErrors, }; - this.vad = vad; + // VAD: undefined → auto-provision bundled inference.VAD (silero). The + // `_usingDefaultVad` marker is the single source of truth for "this VAD + // was framework-provisioned" — code paths that should ignore a default + // VAD read it via `AgentActivity.usingDefaultVad`. null → leave VAD off + // entirely. Otherwise use what the caller supplied. + this._usingDefaultVad = vad === undefined; + if (vad === undefined) { + this.vad = new InferenceVAD({ model: 'silero' }); + } else if (vad === null) { + this.vad = undefined; + } else { + this.vad = vad; + } if (typeof stt === 'string') { this.stt = InferenceSTT.fromModelString(stt); @@ -458,7 +496,16 @@ export class AgentSession< this.tts = tts; } - this.turnDetection = resolvedSessionOptions.turnHandling.turnDetection; + // Default turn_detection: when the caller didn't pin a mode or supply a + // detector instance (`undefined`/not-given), fall back to a fresh + // inference.TurnDetector so every session ships with audio EOT + // out of the box. An explicit `null` opts out entirely — no detector is + // built (mirrors Python `None` vs `NOT_GIVEN`). + const configuredTurnDetection = resolvedSessionOptions.turnHandling.turnDetection; + this.turnDetection = + configuredTurnDetection === null + ? undefined + : configuredTurnDetection ?? 
new InferenceTurnDetector(); this._interruptionDetection = resolvedSessionOptions.turnHandling.interruption?.mode; this._userData = userData; diff --git a/agents/src/voice/agent_session_default_vad.test.ts b/agents/src/voice/agent_session_default_vad.test.ts new file mode 100644 index 000000000..c48025805 --- /dev/null +++ b/agents/src/voice/agent_session_default_vad.test.ts @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Tests for the bundled-default VAD and turn-detection behavior on `AgentSession`. + * + * Port of six test additions on `tests/test_agent_session.py`: + * + * - `test_default_vad_is_auto_provisioned` + * - `test_explicit_vad_none_opts_out` + * - `test_user_supplied_vad_keeps_using_default_false` + * - `test_default_turn_detection_builds_default_eot` + * - `test_turn_detection_none_opts_out` + * - `test_user_supplied_turn_detector_passes_through` + */ +import { describe, expect, it } from 'vitest'; +import { TurnDetector } from '../inference/eot/detector.js'; +import type { VADStream } from '../vad.js'; +import { VAD as BaseVAD } from '../vad.js'; +import { AgentSession } from './agent_session.js'; + +class FakeVAD extends BaseVAD { + label = 'FakeVAD'; + constructor() { + super({ updateInterval: 32 }); + } + stream(): VADStream { + throw new Error('not used in this test'); + } +} + +describe('AgentSession default VAD', () => { + it('auto-provisions a default VAD when none passed', async () => { + const session = new AgentSession(); + try { + expect(session.vad).toBeDefined(); + expect(session._usingDefaultVad).toBe(true); + } finally { + await session.close().catch(() => {}); + } + }); + + it('explicit `vad: null` opts out', async () => { + const session = new AgentSession({ vad: null }); + try { + expect(session.vad).toBeUndefined(); + expect(session._usingDefaultVad).toBe(false); + } finally { + await session.close().catch(() => {}); + } + }); + + it('user-supplied VAD keeps 
_usingDefaultVad false', async () => { + const userVad = new FakeVAD(); + const session = new AgentSession({ vad: userVad }); + try { + expect(session.vad).toBe(userVad); + expect(session._usingDefaultVad).toBe(false); + } finally { + await session.close().catch(() => {}); + } + }); +}); + +describe('AgentSession default turn detection', () => { + it('auto-provisions a default TurnDetector when none given', async () => { + const session = new AgentSession(); + try { + expect(session.turnDetection).toBeInstanceOf(TurnDetector); + } finally { + await session.close().catch(() => {}); + } + }); + + it('explicit `turnDetection: null` opts out (no default detector built)', async () => { + // `null` is the explicit opt-out, distinct from `undefined` (not given); + // mirrors Python `turn_detection=None`. + const session = new AgentSession({ turnHandling: { turnDetection: null } }); + try { + expect(session.turnDetection).toBeUndefined(); + } finally { + await session.close().catch(() => {}); + } + }); + + it('passes a user-supplied turn detector through unchanged', async () => { + const userDetector = new TurnDetector({ version: 'v1-mini' }); + const session = new AgentSession({ turnHandling: { turnDetection: userDetector } }); + try { + expect(session.turnDetection).toBe(userDetector); + } finally { + await session.close().catch(() => {}); + } + }); +}); diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 8933173ae..d94d84a84 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -14,6 +14,12 @@ import { import type { ReadableStream, WritableStreamDefaultWriter } from 'node:stream/web'; import { TransformStream } from 'node:stream/web'; import { isAPIError } from '../_exceptions.js'; +import { + BaseStreamingTurnDetector, + BaseStreamingTurnDetectorStream, + MIN_SILENCE_DURATION_MS, + type TurnDetectionEvent, +} from '../inference/eot/base.js'; import { apiConnectDefaults, 
intervalForRetry } from '../inference/interruption/defaults.js'; import { InterruptionDetectionError } from '../inference/interruption/errors.js'; import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; @@ -32,10 +38,16 @@ import { type StreamChannel, createStreamChannel } from '../stream/stream_channe import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; import { splitWords } from '../tokenize/basic/word.js'; -import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js'; +import type { Future } from '../utils.js'; +import { Event, Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; -import { type UserTurnExceededEvent, createUserTurnExceededEvent } from './events.js'; +import { + type EotPredictionEvent, + type UserTurnExceededEvent, + createEotPredictionEvent, + createUserTurnExceededEvent, +} from './events.js'; import type { STTNode } from './io.js'; import { type BaseEndpointing, @@ -86,6 +98,7 @@ export interface RecognitionHooks { onInterimTranscript: (ev: SpeechEvent, speaking: boolean | undefined) => void; onFinalTranscript: (ev: SpeechEvent, speaking: boolean | undefined) => void; onEndOfTurn: (info: EndOfTurnInfo) => Promise; + onEotPrediction: (ev: EotPredictionEvent) => void; onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void; onUserTurnExceeded: (ev: UserTurnExceededEvent) => void; @@ -98,6 +111,42 @@ interface UserTurnTracker { startedAt?: number; } +/** + * Edge-triggered event with an abort-aware `waitOnce` helper. + * + * Used by the audio-EOT bounce race: the bounce task awaits either the + * endpointing delay or a fresh "user started speaking" signal. 
We extend + * the base `Event` rather than reimplementing it because the base already + * handles the resolver / waiter bookkeeping; this subclass just layers a + * `waitOnce(signal)` that rejects on cancel so the race can tear down + * cleanly when the parent task is aborted. + */ +class SpeakingEvent extends Event { + /** + * Resolves on the next `set()`. Rejects (and cleans up the listener) if + * `signal` aborts first. Returns immediately if the event is already set. + */ + async waitOnce(signal: AbortSignal): Promise { + if (this.isSet) return; + let abortListener: (() => void) | undefined; + try { + await Promise.race([ + this.wait().then(() => undefined), + new Promise((_resolve, reject) => { + if (signal.aborted) { + reject(signal.reason ?? new Error('aborted')); + return; + } + abortListener = () => reject(signal.reason ?? new Error('aborted')); + signal.addEventListener('abort', abortListener, { once: true }); + }), + ]); + } finally { + if (abortListener !== undefined) signal.removeEventListener('abort', abortListener); + } + } +} + export class STTPipeline { static readonly PUMP_TASK_CANCEL_TIMEOUT = 5000; @@ -148,7 +197,11 @@ export interface _TurnDetector { readonly provider: string; unlikelyThreshold: (language?: LanguageCode) => Promise; supportsLanguage: (language?: LanguageCode) => Promise; - predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise; + /** + * @param timeoutMs - Optional inference wait budget in milliseconds. The audio + * EOT detector honors it; text-based detectors currently ignore it. + */ + predictEndOfTurn(chatCtx: ChatContext, timeoutMs?: number): Promise; } export interface AudioRecognitionOptions { @@ -158,8 +211,17 @@ export interface AudioRecognitionOptions { stt?: STTNode; /** Voice activity detection. */ vad?: VAD; - /** Turn detector for end-of-turn prediction. */ - turnDetector?: _TurnDetector; + /** + * True iff the wired VAD was auto-provisioned by `AgentSession` rather than + * supplied by the caller. 
Read at every "is VAD configured?" call site so + * a framework-default VAD behaves like no VAD for downstream eligibility + * decisions (e.g. STT-hook `speaking=` payload). + */ + usingDefaultVad?: boolean; + /** Turn detector for end-of-turn prediction. Accepts text-based detectors + * via `_TurnDetector` (e.g. plugins/livekit) or audio-based detectors via + * `BaseStreamingTurnDetector` (e.g. `inference.TurnDetector`). */ + turnDetector?: _TurnDetector | BaseStreamingTurnDetector; /** Turn detection mode. */ turnDetectionMode?: TurnDetectionMode; interruptionDetection?: AdaptiveInterruptionDetector; @@ -206,7 +268,42 @@ export class AudioRecognition { private stt?: STTNode; private sttPipeline?: STTPipeline; private vad?: VAD; - private turnDetector?: _TurnDetector; + private usingDefaultVad: boolean; + private turnDetector?: _TurnDetector | BaseStreamingTurnDetector; + private turnDetectorStream?: BaseStreamingTurnDetectorStream; + /** + * Future for the in-flight audio-EOT inference request. Recognition owns the + * request lifecycle: it starts a request on the VAD silence tick, holds the + * future here, awaits it (with the endpointing delay) in the eou bounce, and + * clears it on turn boundaries / superseding speech. + */ + private turnDetectorPredictionFut?: Future; + /** + * True between a turn flush (commit / clearUserTurn) and the next VAD + * start-of-speech. While set, a late stt final won't start a fresh request; + * the eou bounce short-circuits via `onMissingEotPrediction`. + */ + private turnDetectorFlushed = false; + /** Warn once per recognition when the eou bounce runs after a flush. */ + private turnDetectorLatePredictionWarned = false; + /** + * The last `TurnDetectionEvent` we forwarded via `onEotPrediction`, kept + * by reference to dedupe: both EOU triggers in a turn read the same + * resolved prediction future, but the event should fire once per request.
+ */ + private lastEmittedEotPrediction?: TurnDetectionEvent; + /** + * Edge-triggered "user is speaking" event used by the audio-EOT bounce + * race. Set on VAD `START_OF_SPEECH` (and on any `INFERENCE_DONE` with + * accumulated speech), cleared on `END_OF_SPEECH`. Mirrors Python + * `_user_speaking_event`. + * + * `Event.set()` is idempotent (re-setting an already-set event resolves + * any new waiters immediately); cleared on EOS so subsequent waiters + * park until the next utterance. + */ + private userSpeakingEvent = new SpeakingEvent(); + private warnedTurnDetectorPushFailure = false; private turnDetectionMode?: TurnDetectionMode; private endpointing: BaseEndpointing; private userTurnLimit?: UserTurnLimitOptions; @@ -287,7 +384,12 @@ export class AudioRecognition { this.hooks = opts.recognitionHooks; this.stt = opts.stt; this.vad = opts.vad; + this.usingDefaultVad = opts.usingDefaultVad ?? false; this.turnDetector = opts.turnDetector; + this.checkVadSilenceRequirement(); + // The FSM stream is opened on `start()` so callers can hand off the + // previous activity's stream (cloud↔local fallback state, in-flight + // inference) instead of forcing a cold restart. this.turnDetectionMode = opts.turnDetectionMode; this.userTurnLimit = opts.userTurnLimit; this.endpointing = @@ -336,6 +438,26 @@ export class AudioRecognition { { transform: (chunk, controller) => { controller.enqueue(chunk); + // Fan the same frame into the audio EOT detector stream when + // one is attached. The FSM accepts arbitrary-rate input and + // resamples internally. `pushAudio` is a no-op when the stream's + // internal channel is closed; any actual throw indicates a bug + // (e.g. resampler init failure, sample-rate mismatch). Log once + // when we hit that path so a regression doesn't silently drop + // every audio frame. 
+ if (this.turnDetectorStream !== undefined) { + try { + this.turnDetectorStream.pushAudio(chunk); + } catch (err) { + if (!this.warnedTurnDetectorPushFailure) { + this.warnedTurnDetectorPushFailure = true; + this.logger.warn( + { err: err instanceof Error ? err.message : String(err) }, + 'audio EOT stream pushAudio failed; dropping frames for this turn', + ); + } + } + } if (this.subscriberWriters.length === 0) return; for (const writer of this.subscriberWriters) { writer.write(chunk).catch(() => { @@ -414,7 +536,163 @@ export class AudioRecognition { } } - async start(options?: { sttPipeline?: STTPipeline; inputStartedAt?: number }) { + /** True iff the user supplied their own VAD (default-VAD is treated as + * absent at sites that decide between "use VAD signal" and "STT-derived + * speaking"). */ + private get hasUserVad(): boolean { + return this.vad !== undefined && !this.usingDefaultVad; + } + + /** + * Swap the active turn detector at runtime. When a `BaseStreamingTurnDetector` + * is provided, opens a per-turn FSM stream after retiring the prior one. + * + * When `stream` is provided it is adopted as-is (handoff reuse) instead of + * opening a fresh stream on `detector`; the live transport stream — and its + * per-session cloud→local fallback state — survives the handoff. + */ + updateTurnDetector( + detector: _TurnDetector | BaseStreamingTurnDetector | undefined, + options?: { stream?: BaseStreamingTurnDetectorStream }, + ): void { + // Validate against the incoming detector before swapping in so the error + // — when raised — names the configuration that failed. + this.checkVadSilenceRequirement(detector); + this.turnDetector = detector; + + const reuseStream = options?.stream; + // Retire the prior stream before creating the new one. `detach()` frees + // the detector's single-stream slot synchronously (so `stream()` below + // won't throw if the same detector is reused), while the network teardown + // runs in the background.
+ const oldStream = this.turnDetectorStream; + if (oldStream !== undefined && oldStream !== reuseStream) { + oldStream.detach(); + void oldStream.aclose().catch(() => undefined); + } + // Cross-detector state should not leak: the cached speaking signal + // from the prior detector's turn must not race the new detector's + // first bounce. + this.userSpeakingEvent.clear(); + const newStream = + reuseStream !== undefined + ? reuseStream + : detector instanceof BaseStreamingTurnDetector + ? detector.stream() + : undefined; + // A different stream means a fresh request lifecycle: drop any held + // prediction future and re-arm so the adopting recognition starts its own + // request on the next VAD event. + if (this.turnDetectorStream !== newStream) { + this.turnDetectorPredictionFut = undefined; + this.turnDetectorFlushed = false; + } + this.turnDetectorStream = newStream; + } + + /** + * Detach the turn detector stream for handoff to another AudioRecognition. + * + * Returns the live stream (transport run loop intact) without closing it. + * The caller passes it to the new AudioRecognition via + * `start({ turnDetectorStream })`. The stream stays attached to its + * detector, retaining the detector's single-stream slot, so the new + * AudioRecognition must adopt it rather than open a second stream. + */ + detachTurnDetector(): BaseStreamingTurnDetectorStream | undefined { + const stream = this.turnDetectorStream; + this.turnDetectorStream = undefined; + // The adopting recognition starts a fresh request on its next VAD event, + // superseding any request that survived the handoff. + this.turnDetectorPredictionFut = undefined; + return stream; + } + + /** + * The audio EOT detector needs a wider silence window than typical VAD + * defaults. Rather than mutate the VAD's knob, require the caller to + * configure it: raise if the bound VAD exposes `minSilenceDuration` and it + * is below the floor. VADs that don't expose the knob are left untouched. 
+ */ + private checkVadSilenceRequirement( + detector: _TurnDetector | BaseStreamingTurnDetector | undefined = this.turnDetector, + ): void { + if (!(detector instanceof BaseStreamingTurnDetector) || this.vad === undefined) { + return; + } + const current = this.vad.minSilenceDuration; + if (current === null) { + return; + } + const required = MIN_SILENCE_DURATION_MS + 50; + if (current < required) { + throw new Error( + `vad minSilenceDuration=${current}ms is too low for the TurnDetector. ` + + `Raise the VAD's minSilenceDuration to at least ${required}ms.`, + ); + } + } + + /** + * Speaking-guard wrapper for the bounce-EOU task, mirroring Python's + * `_bounce_eou_task_with_speaking_guard`. When a `BaseStreamingTurnDetector` + * is active, the bounce task races against the `userSpeakingEvent`: + * + * - if the user is already speaking, skip the EOU outright; + * - if the user starts speaking during the endpointing delay (e.g. + * the LLM hadn't returned yet but the user added another phrase), + * abort the inner bounce so the next turn drives the decision. + * + * VAD `START_OF_SPEECH` also calls `bounceEOUTask?.cancel()`, but the + * cancel path only races VAD sessions. STT-only audio-EOT setups need + * the explicit event-driven race here. + */ + private async bounceEOUTaskWithSpeakingGuard( + controller: AbortController, + inner: (innerController: AbortController) => Promise, + context: { + lastSpeakingTime: number | undefined; + lastFinalTranscriptTime: number; + speechStartTime: number | undefined; + }, + ): Promise { + if (this.speaking) { + this.logger.debug(context, 'user is still speaking, skipping end of turn task'); + return; + } + const innerController = new AbortController(); + // Propagate outer cancellation into the inner task.
+ const onOuterAbort = () => innerController.abort(); + controller.signal.addEventListener('abort', onOuterAbort, { once: true }); + + let speakingWon = false; + try { + const innerPromise = inner(innerController); + // When the speaking branch wins, the race settles before finally aborts + // innerController, leaving innerPromise's rejection uncaught without this. + void innerPromise.catch(() => {}); + await Promise.race([ + innerPromise, + this.userSpeakingEvent.waitOnce(controller.signal).then(() => { + speakingWon = true; + }), + ]); + if (speakingWon) { + this.logger.debug(context, 'user spoke during endpointing, cancelling end of turn task'); + } + } finally { + controller.signal.removeEventListener('abort', onOuterAbort); + // If the speaking-event branch won (or the outer was aborted), tear + // down the inner bounce so it doesn't keep awaiting the delay. + innerController.abort(); + } + } + + async start(options?: { + sttPipeline?: STTPipeline; + inputStartedAt?: number; + turnDetectorStream?: BaseStreamingTurnDetectorStream; + }) { this.startSttTasks(options?.sttPipeline, options?.inputStartedAt); this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal)); @@ -428,6 +706,14 @@ export class AudioRecognition { this.interruptionTask.result.catch((err) => { this.logger.error(`Error running interruption task: ${err}`); }); + + // Open (or adopt) the audio EOT detector stream now that the activity is + // running. We only call `updateTurnDetector` for BaseStreamingTurnDetector / + // undefined detectors — plugin-based `_TurnDetector` instances are + // text-only and don't carry a stream. 
+ if (this.turnDetector instanceof BaseStreamingTurnDetector || this.turnDetector === undefined) { + this.updateTurnDetector(this.turnDetector, { stream: options?.turnDetectorStream }); + } } async stop() { @@ -435,6 +721,11 @@ export class AudioRecognition { await this.sttForwardTask?.cancelAndWait(); await this.vadTask?.cancelAndWait(); await this.interruptionTask?.cancelAndWait(); + if (this.turnDetectorStream !== undefined) { + const stream = this.turnDetectorStream; + this.turnDetectorStream = undefined; + await stream.aclose().catch(() => undefined); + } } async disableInterruptionDetection(): Promise { @@ -850,7 +1141,7 @@ export class AudioRecognition { this.hooks.onFinalTranscript( ev, - this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, + this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, ); this.logger.debug( @@ -869,7 +1160,7 @@ export class AudioRecognition { this.audioInterimTranscript = ''; this.audioPreflightTranscript = ''; - if (!this.vad || this.lastSpeakingTime === undefined) { + if (!this.hasUserVad || this.lastSpeakingTime === undefined) { // vad disabled or missed a speech, use stt timestamp this.lastSpeakingTime = sttLastSpeakingTime; } @@ -896,14 +1187,14 @@ export class AudioRecognition { if (!this.speaking) { const chatCtx = this.hooks.retrieveChatCtx(); this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT'); - this.runEOUDetection(chatCtx); + this.runEOUDetection(chatCtx, 'stt'); } } break; case SpeechEventType.PREFLIGHT_TRANSCRIPT: this.hooks.onInterimTranscript( ev, - this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, + this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, ); const preflightTranscript = ev.alternatives?.[0]?.text ?? ''; const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 
0; @@ -936,7 +1227,7 @@ export class AudioRecognition { `${this.audioTranscript} ${preflightTranscript}`.trimStart(); this.audioInterimTranscript = preflightTranscript; - if (!this.vad || this.lastSpeakingTime === undefined) { + if (!this.hasUserVad || this.lastSpeakingTime === undefined) { // vad disabled or missed a speech, use stt timestamp this.lastSpeakingTime = sttLastSpeakingTime; } @@ -966,7 +1257,7 @@ export class AudioRecognition { this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript'); this.hooks.onInterimTranscript( ev, - this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, + this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined, ); this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? ''; break; @@ -996,6 +1287,10 @@ export class AudioRecognition { } this.speaking = true; this.lastSpeakingTime = sttLastSpeakingTime; + // STT-only sessions never see VAD events; surface the speaking + // signal here so the audio-EOT bounce race can still abort on a + // mid-window fresh utterance. + this.userSpeakingEvent.set(); this.bounceEOUTask?.cancel(); break; @@ -1032,7 +1327,9 @@ export class AudioRecognition { // and user state won't be updated until a new VAD SOS is received. // Reset VAD so that incorrect end of turn from STT can be corrected by VAD interruption. // If user is still speaking (an immediate VAD SOS will interrupt the agent). - if (this.vad && this.vadSpeechStarted) { + // Default-bundled VAD is treated as absent here — only user-supplied VADs + // are reset/flushed, matching the other `hasUserVad` call sites in this class.
+ if (this.hasUserVad && this.vadSpeechStarted) { if (this.vadStream) { this.vadStream.flush(); } else { @@ -1048,8 +1345,9 @@ export class AudioRecognition { ); } this.speaking = false; + this.userSpeakingEvent.clear(); this.userTurnCommitted = true; - if (!this.vad || this.lastSpeakingTime === undefined) { + if (!this.hasUserVad || this.lastSpeakingTime === undefined) { // vad disabled or missed a speech, use stt timestamp this.lastSpeakingTime = sttLastSpeakingTime; } @@ -1057,7 +1355,7 @@ export class AudioRecognition { if (!this.speaking) { const chatCtx = this.hooks.retrieveChatCtx(); this.logger.debug('running EOU detection on stt END_OF_SPEECH'); - this.runEOUDetection(chatCtx); + this.runEOUDetection(chatCtx, 'stt'); } } } @@ -1077,7 +1375,25 @@ export class AudioRecognition { } } - private runEOUDetection(chatCtx: ChatContext) { + private onMissingEotPrediction(): void { + if (this.turnDetectorFlushed) { + if (!this.turnDetectorLatePredictionWarned) { + this.turnDetectorLatePredictionWarned = true; + this.logger.warn( + 'eou detection ran after the audio eot turn was already flushed ' + + '(likely a late stt final). consider raising `minDelay` in the ' + + 'endpointing options to accommodate slow stt. 
subsequent ' + + 'occurrences will log at debug level.', + ); + } else { + this.logger.debug('stt transcript arrived after a turn flush, skipping eot prediction'); + } + } else { + this.logger.debug('no eot inference request in flight, skipping eot prediction'); + } + } + + private runEOUDetection(chatCtx: ChatContext, trigger: 'vad' | 'stt' | 'manual' = 'vad') { this.logger.debug( { stt: this.stt, @@ -1094,11 +1410,32 @@ export class AudioRecognition { } chatCtx = chatCtx.copy(); - chatCtx.addMessage({ role: 'user', content: this.audioTranscript }); + if (this.audioTranscript) { + chatCtx.addMessage({ role: 'user', content: this.audioTranscript }); + } - const turnDetector = - // disable EOU model if manual turn detection enabled - this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined; + // Pick the right detector: + // - manual mode: no detector (turn boundary decided externally) + // - audio EOT detector: prefer the per-turn stream (it caches the + // prediction for the current inference window so the bounce task + // can short-circuit on cache) + // - text-based detector: only run when we have a transcript to score + const hasAudioDetector = this.turnDetector instanceof BaseStreamingTurnDetector; + const useDetector = + this.turnDetectionMode !== 'manual' && (this.audioTranscript || hasAudioDetector); + // The unified type only covers the predict surface; the audio + // detector's per-turn stream stands in for the parent when one is + // attached so the cached prediction is available. + let turnDetector: _TurnDetector | BaseStreamingTurnDetectorStream | undefined; + if (!useDetector) { + turnDetector = undefined; + } else if (hasAudioDetector) { + turnDetector = this.turnDetectorStream; + } else { + // text-based detector — `this.turnDetector` cannot be the audio + // base class here, because `hasAudioDetector` already screened it. 
+ turnDetector = this.turnDetector as _TurnDetector | undefined; + } const bounceEOUTask = ( @@ -1113,47 +1450,155 @@ export class AudioRecognition { const userTurnCtx = this.userTurnContext(userTurnSpan); if (turnDetector) { - await tracer.startActiveSpan( - async (span) => { - this.logger.debug('Running turn detector model'); + if (!(await turnDetector.supportsLanguage(this.lastLanguage))) { + // Unsupported language: produce no span and emit no prediction event. + this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`); + } else { + await tracer.startActiveSpan( + async (span) => { + this.logger.debug('Running turn detector model'); + + // undefined => the prediction never resolved (e.g. timed out + // or inference threw); gates the span attributes and the emit + // below. + let endOfTurnProbability: number | undefined; + let unlikelyThreshold: number | undefined; + // True when the held future was already resolved when this + // bounce started — i.e. the prediction was served from the + // request the silence tick warmed, not awaited fresh. + let fromCache = false; + // The resolved prediction event for this turn, shared by + // reference across both EOU triggers (vad + stt final) so the + // emit can dedupe. + let predictionEvent: TurnDetectionEvent | undefined; + + if (turnDetector instanceof BaseStreamingTurnDetectorStream) { + const fut = this.turnDetectorPredictionFut; + if (fut === undefined) { + this.onMissingEotPrediction(); + } else { + fromCache = fut.done; + // Await the held future against the endpointing delay. + let timeoutId: ReturnType | undefined; + const winner = await Promise.race([ + fut.await.then((ev) => ({ kind: 'value', ev }) as const), + new Promise<{ kind: 'timeout' }>((resolve) => { + timeoutId = setTimeout( + () => resolve({ kind: 'timeout' }), + endpointingDelay, + ); + }), + ]); + if (timeoutId !== undefined) clearTimeout(timeoutId); + + // A newer trigger calls `bounceEOUTask?.cancel()`. 
A JS abort + // does NOT interrupt the await above, so bail here before + // touching shared state so the superseded bounce doesn't + // clobber a freshly-armed future or double-emit. + if (controller.signal.aborted) return; + + if (winner.kind === 'value') { + predictionEvent = winner.ev; + endOfTurnProbability = predictionEvent.endOfTurnProbability; + unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage); + } else { + this.logger.warn( + { timeoutMs: endpointingDelay }, + 'eot prediction timed out, committing without a prediction', + ); + turnDetector.cancelInference({ timedOut: true }); + this.turnDetectorPredictionFut = undefined; + } + } + } else { + try { + endOfTurnProbability = await turnDetector.predictEndOfTurn( + chatCtx, + endpointingDelay, + ); + unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage); + } catch (error) { + this.logger.error(error, 'Error predicting end of turn'); + } + // See the streaming-branch note: bail if a newer trigger + // superseded this bounce while it awaited. 
+ if (controller.signal.aborted) return; + } - let endOfTurnProbability = 0.0; - let unlikelyThreshold: number | undefined; + if ( + endOfTurnProbability !== undefined && + unlikelyThreshold !== undefined && + endOfTurnProbability < unlikelyThreshold + ) { + endpointingDelay = this.endpointing.maxDelay; + } - if (!(await turnDetector.supportsLanguage(this.lastLanguage))) { - this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`); - } else { - try { - endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx); - unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage); + this.logger.debug( + { + endOfTurnProbability, + unlikelyThreshold, + endpointingDelay, + language: this.lastLanguage, + trigger, + fromCache, + }, + 'eot prediction', + ); - this.logger.debug( - { endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage }, - 'end of turn probability', - ); + const prediction = predictionEvent; - if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.endpointing.maxDelay; - } - } catch (error) { - this.logger.error(error, 'Error predicting end of turn'); + span.setAttribute( + traceTypes.ATTR_CHAT_CTX, + JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })), + ); + if (endOfTurnProbability !== undefined) { + span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability); + } + if (unlikelyThreshold !== undefined) { + span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold); + } + span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay); + span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? ''); + span.setAttribute(traceTypes.ATTR_EOU_FROM_CACHE, fromCache); + span.setAttribute(traceTypes.ATTR_EOU_SOURCE, trigger); + + // Emit once the prediction resolved (a timeout / failed + // inference emits nothing). 
Both EOU triggers in a turn (vad + + // stt final) read the same resolved `TurnDetectionEvent`; dedupe + // by reference so the event fires once per request. The abort + // guard above drops a superseded bounce; this reference check + // catches the race where the first bounce completes (and emits) + // just before the second trigger fires. Text detectors have no + // shared event (`prediction === undefined`), so they always emit. + if ( + endOfTurnProbability !== undefined && + unlikelyThreshold !== undefined && + (prediction === undefined || prediction !== this.lastEmittedEotPrediction) + ) { + this.lastEmittedEotPrediction = prediction; + const inferenceDurationMs = prediction?.inferenceDuration ?? 0; + const delayMs = + lastSpeakingTime !== undefined ? Date.now() - lastSpeakingTime : 0; + this.hooks.onEotPrediction( + createEotPredictionEvent({ + probability: endOfTurnProbability, + threshold: unlikelyThreshold, + inferenceDurationMs, + delayMs, + }), + ); } - } - span.setAttribute( - traceTypes.ATTR_CHAT_CTX, - JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })), - ); - span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability); - span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0); - span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay); - span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? 
''); - }, - { - name: 'eou_detection', - context: userTurnCtx, - }, - ); + if (prediction?.detectionDelay !== undefined) { + span.setAttribute(traceTypes.ATTR_EOU_DETECTION_DELAY, prediction.detectionDelay); + } + }, + { + name: 'eou_detection', + context: userTurnCtx, + }, + ); + } } let extraSleep = endpointingDelay; @@ -1219,6 +1664,15 @@ export class AudioRecognition { this.vadSpeechStarted = false; this.lastSpeakingTime = undefined; } + + // Flush the in-flight request and write the turn-boundary sentinel to + // the transport so the next turn's predict starts fresh — the normal + // EOU-commit path, mirroring clearUserTurn()'s flush on interrupt. + if (this.turnDetectorStream !== undefined) { + this.turnDetectorStream.flush('turn committed'); + this.turnDetectorPredictionFut = undefined; + this.turnDetectorFlushed = true; + } } this.userTurnCommitted = false; @@ -1227,9 +1681,24 @@ export class AudioRecognition { // cancel any existing EOU task this.bounceEOUTask?.cancel(); // copy the values before awaiting (the values can change) - this.bounceEOUTask = Task.from( - bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.userTurnStart), - ); + const lastSpeakingTime = this.lastSpeakingTime; + const lastFinalTranscriptTime = this.lastFinalTranscriptTime; + const speechStartTime = this.userTurnStart; + + // Audio-EOT detectors get a speaking-guard wrapper: if the user starts + // speaking again during the endpointing delay, abort the EOU and let + // the next turn drive the decision. Text-based detectors (no audio + // pipeline) keep the simpler bounce task — they can't race against + // mid-window utterances anyway since they don't run during silence. + const factory = hasAudioDetector + ? 
(controller: AbortController) => + this.bounceEOUTaskWithSpeakingGuard( + controller, + bounceEOUTask(lastSpeakingTime, lastFinalTranscriptTime, speechStartTime), + { lastSpeakingTime, lastFinalTranscriptTime, speechStartTime }, + ) + : bounceEOUTask(lastSpeakingTime, lastFinalTranscriptTime, speechStartTime); + this.bounceEOUTask = Task.from(factory); this.bounceEOUTask.result .then(() => { @@ -1382,6 +1851,13 @@ export class AudioRecognition { otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev)); } this.speaking = true; + this.userSpeakingEvent.set(); + + // Audio EOT: tear down any in-flight inference for the now-stale + // prior window and re-arm so the next silence tick starts fresh. + this.turnDetectorStream?.cancelInference(); + this.turnDetectorPredictionFut = undefined; + this.turnDetectorFlushed = false; // Capture sample rate from the first VAD event if not already set if (ev.frames.length > 0 && ev.frames[0]) { @@ -1401,6 +1877,38 @@ export class AudioRecognition { // ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS). this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech; } + // Wake any speaking-guard waiter — STT-only sessions don't + // see START_OF_SPEECH but do see INFERENCE_DONE-with-speech. + this.userSpeakingEvent.set(); + + // A short intra-segment pause can resolve a request before VAD + // emits END_OF_SPEECH. When speech resumes (without a new SOS), + // drop that request so the next pause gets a fresh window. + if (this.speaking && this.turnDetectorPredictionFut !== undefined) { + this.turnDetectorStream?.cancelInference(); + this.turnDetectorPredictionFut = undefined; + } + } else if (!this.speaking) { + // A sub-threshold speech spike can set `userSpeakingEvent` without + // ever reaching START_OF_SPEECH, so no END_OF_SPEECH will fire to + // clear it. Clear it here once speech drops back to zero (confirmed + // turns are cleared by EOS). 
+ this.userSpeakingEvent.clear(); + } + + // Audio EOT: start an inference request once we've seen enough + // trailing silence (matches Python's `MIN_SILENCE_DURATION_MS`), + // but only when no request is already in flight. The silence tick + // is the sole request trigger — and it warms even while the agent + // is speaking so an overlapping/interrupting turn still gets a + // window. + if ( + ev.rawAccumulatedSilence >= MIN_SILENCE_DURATION_MS && + this.speaking && + this.turnDetectorStream !== undefined && + this.turnDetectorPredictionFut === undefined + ) { + this.turnDetectorPredictionFut = this.turnDetectorStream.predict(); } break; case VADEventType.END_OF_SPEECH: @@ -1421,13 +1929,19 @@ export class AudioRecognition { // when VAD fires END_OF_SPEECH, it already waited for the silence_duration this.vadSpeechStarted = false; this.speaking = false; + this.userSpeakingEvent.clear(); + this.lastSpeakingTime = Date.now() - ev.silenceDuration - ev.inferenceDuration; + + // Audio EOT: END_OF_SPEECH no longer starts a request — the + // silence tick owns that. It consumes the already-armed future + // (if any) and runs the eou bounce. if ( this.vadBaseTurnDetection || (this.turnDetectionMode === 'stt' && this.userTurnCommitted) ) { const chatCtx = this.hooks.retrieveChatCtx(); - this.runEOUDetection(chatCtx); + this.runEOUDetection(chatCtx, 'vad'); } break; } @@ -1627,6 +2141,19 @@ export class AudioRecognition { this.speaking = false; this.userTurnCommitted = false; this.userTurnTracker = { words: 0, transcript: '' }; + // Clear the speaking event so a stale `set()` from the just-finished + // turn doesn't immediately trip the next speaking-guard race. + this.userSpeakingEvent.clear(); + // New turn → allow the next window's prediction to emit. + this.lastEmittedEotPrediction = undefined; + + // Any in-flight request on the audio stream belongs to the turn we + // just cleared — flush it so the next predict starts fresh. 
+ if (this.turnDetectorStream !== undefined) { + this.turnDetectorStream.flush('clear_user_turn'); + this.turnDetectorPredictionFut = undefined; + this.turnDetectorFlushed = true; + } if (this.userTurnSpan?.isRecording()) { this.userTurnSpan.end(); @@ -1700,7 +2227,7 @@ export class AudioRecognition { const chatCtx = this.hooks.retrieveChatCtx(); this.logger.debug('running EOU detection on commitUserTurn'); - this.runEOUDetection(chatCtx); + this.runEOUDetection(chatCtx, 'manual'); this.userTurnCommitted = true; }; @@ -1747,6 +2274,13 @@ export class AudioRecognition { await this.vadTask?.cancelAndWait(); await this.bounceEOUTask?.cancelAndWait(); await this.interruptionTask?.cancelAndWait(); + + if (this.turnDetectorStream !== undefined) { + const stream = this.turnDetectorStream; + this.turnDetectorStream = undefined; + await stream.aclose().catch(() => undefined); + } + await this.interruptionStreamChannel?.close(); this.cancelBackchannelBoundary(); } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index cfe92a821..5ce592042 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ b/agents/src/voice/audio_recognition_span.test.ts @@ -110,6 +110,7 @@ describe('AudioRecognition user_turn span parity', () => { onInterimTranscript: vi.fn(), onFinalTranscript: vi.fn(), onPreemptiveGeneration: vi.fn(), + onEotPrediction: vi.fn(), retrieveChatCtx: () => ChatContext.empty(), onEndOfTurn: vi.fn(async () => true), }; @@ -191,6 +192,7 @@ describe('AudioRecognition user_turn span parity', () => { onInterimTranscript: vi.fn(), onFinalTranscript: vi.fn(), onPreemptiveGeneration: vi.fn(), + onEotPrediction: vi.fn(), retrieveChatCtx: () => ChatContext.empty(), onEndOfTurn: vi.fn(async () => true), }; diff --git a/agents/src/voice/audio_recognition_turn_detection.test.ts b/agents/src/voice/audio_recognition_turn_detection.test.ts new file mode 100644 index 000000000..216e398fe --- /dev/null +++ 
b/agents/src/voice/audio_recognition_turn_detection.test.ts @@ -0,0 +1,643 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** + * Integration tests for `AudioRecognition` audio turn-detection wiring. + * + * Recognition owns all streaming turn-detection policy: it holds the in-flight + * inference request's future (`turnDetectorPredictionFut`), starts requests on + * VAD events only, awaits the future with the endpointing `minDelay` in the eou + * bounce, and flushes the stream on turn commits. Covered here: + * + * 1. The speaking-guard race in `runEOUDetection`: setting `userSpeakingEvent` + * mid-bounce must abort the commit so a late-arriving SOS doesn't ship the + * prior turn. + * 2. `onEotPrediction` dedup across the vad-EOS and stt-final triggers that + * share one resolved prediction future. + * 3. The prediction-future lifecycle against VAD events: requests start + * exclusively on the silence tick, resumed speech rearms the next pause, SOS + * teardown, the flushed-turn short-circuit for late stt finals, and the + * predict-timeout fallback signal. + * 4. The `minSilenceDuration` validation guarding an audio-EOT + VAD pairing. + * + * The stream-side request lifecycle lives in `inference/eot/base.test.ts`. + * + * Port of Python `tests/test_audio_recognition_turn_detection.py`. 
+ */ +import { ParticipantKind } from '@livekit/rtc-node'; +import { describe, expect, it, vi } from 'vitest'; +import { + BaseStreamingTurnDetector, + BaseStreamingTurnDetectorStream, + MIN_SILENCE_DURATION_MS, + type TurnDetectionEvent, +} from '../inference/eot/base.js'; +import { ChatContext } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import { Future } from '../utils.js'; +import { type VAD, type VADEvent, VADEventType } from '../vad.js'; +import { + AudioRecognition, + type AudioRecognitionOptions, + type RecognitionHooks, + type _TurnDetector, +} from './audio_recognition.js'; + +initializeLogger({ pretty: false, level: 'silent' }); + +/** White-box view of the `AudioRecognition` internals these tests drive. */ +interface RecognitionInternals { + speaking: boolean; + isAgentSpeaking: boolean; + vad?: VAD; + turnDetector?: _TurnDetector | BaseStreamingTurnDetector; + turnDetectorStream?: BaseStreamingTurnDetectorStream; + turnDetectorPredictionFut?: Future<TurnDetectionEvent>; + turnDetectorFlushed: boolean; + turnDetectorLatePredictionWarned: boolean; + lastEmittedEotPrediction?: TurnDetectionEvent; + lastSpeakingTime?: number; + audioTranscript: string; + audioInterimTranscript: string; + audioPreflightTranscript: string; + sttRequestIds: string[]; + userSpeakingEvent: { isSet: boolean; set: () => void; clear: () => void }; + bounceEOUTask?: { + result: Promise<void>; + cancel: () => void; + cancelAndWait: () => Promise<void>; + done: boolean; + }; + runEOUDetection: (chatCtx: ChatContext, trigger?: 'vad' | 'stt' | 'manual') => void; + createVadTask: (vad: VAD | undefined, signal: AbortSignal) => Promise<void>; + checkVadSilenceRequirement: (detector?: _TurnDetector | BaseStreamingTurnDetector) => void; + updateTurnDetector: (detector: _TurnDetector | BaseStreamingTurnDetector | undefined) => void; + clearUserTurn: () => void; +} + +function makeHooks(): RecognitionHooks { + return { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: 
vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onEotPrediction: vi.fn(), + onPreemptiveGeneration: vi.fn(), + onUserTurnExceeded: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + onEndOfTurn: vi.fn(async () => false), // don't commit by default + }; +} + +function makeRecognition(opts: Partial<AudioRecognitionOptions> = {}): { + recognition: AudioRecognition; + internals: RecognitionInternals; + hooks: RecognitionHooks; +} { + const hooks = makeHooks(); + const full: AudioRecognitionOptions = { + recognitionHooks: hooks, + stt: undefined, + vad: undefined, + interruptionDetection: undefined, + turnDetectionMode: 'vad', + minEndpointingDelay: 10, + maxEndpointingDelay: 500, + getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }), + ...opts, + }; + const recognition = new AudioRecognition(full); + return { recognition, internals: recognition as unknown as RecognitionInternals, hooks }; +} + +/** + * A fake audio-EOT detector stream that passes `instanceof BaseStreamingTurnDetectorStream` + * (so `runEOUDetection` selects the audio path). `predict` hands out a fresh + * pending future each call, mirroring the real stream; tests install + * resolved/pending futures directly on `internals.turnDetectorPredictionFut` to + * model cached/awaiting predictions. 
+ */ +function makeAudioStream(): BaseStreamingTurnDetectorStream { + const stream = Object.create(BaseStreamingTurnDetectorStream.prototype); + stream.supportsLanguage = vi.fn(async () => true); + stream.unlikelyThreshold = vi.fn(async () => 0.5); + stream.predict = vi.fn(() => new Future()); + stream.cancelInference = vi.fn(); + stream.flush = vi.fn(); + return stream as BaseStreamingTurnDetectorStream; +} + +function makeAudioDetector(stream: BaseStreamingTurnDetectorStream): BaseStreamingTurnDetector { + const detector = Object.create(BaseStreamingTurnDetector.prototype); + detector.stream = vi.fn(() => stream); + return detector as BaseStreamingTurnDetector; +} + +/** A resolved prediction future, as if the transport already answered. */ +function resolvedPrediction( + probability: number, + opts: { inferenceDuration?: number; detectionDelay?: number } = {}, +): { fut: Future<TurnDetectionEvent>; event: TurnDetectionEvent } { + const event: TurnDetectionEvent = { + type: 'eot_prediction', + endOfTurnProbability: probability, + lastSpeakingTimeMs: 0, + inferenceDuration: opts.inferenceDuration, + detectionDelay: opts.detectionDelay, + }; + const fut = new Future<TurnDetectionEvent>(); + fut.resolve(event); + return { fut, event }; +} + +function predictMock(stream: BaseStreamingTurnDetectorStream): ReturnType<typeof vi.fn> { + return stream.predict as unknown as ReturnType<typeof vi.fn>; +} + +function cancelInferenceMock(stream: BaseStreamingTurnDetectorStream): ReturnType<typeof vi.fn> { + return stream.cancelInference as unknown as ReturnType<typeof vi.fn>; +} + +function inferenceDone(rawAccumulatedSpeech: number, rawAccumulatedSilence = 0): VADEvent { + return { + type: VADEventType.INFERENCE_DONE, + samplesIndex: 0, + timestamp: 0, + speechDuration: 0, + silenceDuration: 0, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: false, + rawAccumulatedSilence, + rawAccumulatedSpeech, + }; +} + +function startOfSpeech(): VADEvent { + return { + type: VADEventType.START_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 500, + 
silenceDuration: 0, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: true, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 500, + }; +} + +function endOfSpeech(): VADEvent { + return { + type: VADEventType.END_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 0, + silenceDuration: 300, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: false, + rawAccumulatedSilence: 300, + rawAccumulatedSpeech: 0, + }; +} + +/** Let queued microtasks + the VAD loop body run to completion. */ +function flush(): Promise<void> { + return new Promise((resolve) => setImmediate(resolve)); +} + +/** + * Drive `createVadTask` against a scripted VAD stream so VAD events flow + * through the real handler. `feed()` resolves once the event has been processed + * and the loop has parked awaiting the next one. + */ +function runScriptedVad(internals: RecognitionInternals): { + feed: (ev: VADEvent) => Promise<void>; + stop: () => Promise<void>; +} { + let resolveNext: ((r: IteratorResult<VADEvent>) => void) | null = null; + const buffered: VADEvent[] = []; + let closed = false; + + const stream = { + updateInputStream(_s: unknown) {}, + detachInputStream() {}, + close() { + closed = true; + if (resolveNext) { + resolveNext({ done: true, value: undefined as never }); + resolveNext = null; + } + }, + [Symbol.asyncIterator](): AsyncIterator<VADEvent> { + return { + next(): Promise<IteratorResult<VADEvent>> { + if (buffered.length > 0) { + return Promise.resolve({ done: false, value: buffered.shift()! 
}); + } + if (closed) { + return Promise.resolve({ done: true, value: undefined as never }); + } + return new Promise((res) => { + resolveNext = res; + }); + }, + }; + }, + }; + + const vad = { stream: () => stream } as unknown as VAD; + const controller = new AbortController(); + const task = internals.createVadTask(vad, controller.signal); + + return { + async feed(ev: VADEvent) { + if (resolveNext) { + const res = resolveNext; + resolveNext = null; + res({ done: false, value: ev }); + } else { + buffered.push(ev); + } + await flush(); + await flush(); + }, + async stop() { + controller.abort(); + await task.catch(() => {}); + }, + }; +} + +describe('TestSpeakingGuardRace', () => { + it('cancels the in-flight bounce when speaking starts during endpointing', async () => { + const { internals, hooks } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + // sub-threshold prediction (0.2 < 0.5) extends endpointing to maxDelay + internals.turnDetectorPredictionFut = resolvedPrediction(0.2).fut; + + internals.runEOUDetection(ChatContext.empty(), 'vad'); + + // The bounce is parked in the ~500ms endpointing delay. Fire the speaking + // event well inside that window: the guard's race resolves with the + // speaking branch and the bounce is aborted before it can commit. 
+ await new Promise((r) => setTimeout(r, 50)); + internals.userSpeakingEvent.set(); + + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + + expect(hooks.onEndOfTurn).not.toHaveBeenCalled(); + }); + + it('short-circuits without spawning the bounce when already speaking', async () => { + const { internals, hooks } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.speaking = true; + + internals.runEOUDetection(ChatContext.empty(), 'vad'); + + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + + expect(hooks.onEndOfTurn).not.toHaveBeenCalled(); + // The guard bailed before the bounce body ran, so no request was awaited. + expect(predictMock(stream).mock.calls.length).toBe(0); + }); +}); + +describe('TestEotPredictionDedup', () => { + it('emits onEotPrediction once across vad then stt triggers', async () => { + const { internals, hooks } = makeRecognition(); + // One prediction per inference request — both triggers read this event by + // reference from the held future. + const { fut, event } = resolvedPrediction(0.2, { inferenceDuration: 50, detectionDelay: 100 }); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.turnDetectorPredictionFut = fut; + + // vad trigger: bounce emits, then parks in the endpointing sleep. + internals.runEOUDetection(ChatContext.empty(), 'vad'); + await flush(); + await flush(); + expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1); + + // stt trigger: cancels the parked vad bounce and runs a fresh one that + // reads the same resolved future. Dedup must suppress a second emit. 
+ internals.runEOUDetection(ChatContext.empty(), 'stt'); + await flush(); + await flush(); + + expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1); + expect(internals.lastEmittedEotPrediction).toBe(event); + + await internals.bounceEOUTask?.cancelAndWait().catch(() => {}); + }); + + it('emits on every bounce for a text-based detector', async () => { + const { internals, hooks } = makeRecognition(); + // A text detector is not a BaseStreamingTurnDetector → no streaming window, + // so there's no shared prediction event and dedup never applies. + const textDetector: _TurnDetector = { + model: 'fake', + provider: 'fake', + supportsLanguage: vi.fn(async () => true), + unlikelyThreshold: vi.fn(async () => 0.5), + predictEndOfTurn: vi.fn(async () => 0.2), + }; + internals.turnDetector = textDetector; + internals.turnDetectorStream = undefined; + internals.audioTranscript = 'hello there'; + + internals.runEOUDetection(ChatContext.empty(), 'vad'); + await flush(); + await flush(); + expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1); + + internals.runEOUDetection(ChatContext.empty(), 'stt'); + await flush(); + await flush(); + expect(hooks.onEotPrediction).toHaveBeenCalledTimes(2); + + await internals.bounceEOUTask?.cancelAndWait().catch(() => {}); + }); + + it('clearUserTurn resets the dedup guard so the next turn emits again', () => { + const { internals } = makeRecognition(); + internals.lastEmittedEotPrediction = resolvedPrediction(0.2).event; + internals.audioInterimTranscript = ''; + internals.audioPreflightTranscript = ''; + internals.sttRequestIds = []; + + internals.clearUserTurn(); + + expect(internals.lastEmittedEotPrediction).toBeUndefined(); + }); +}); + +describe('TestPredictionFutureLifecycle', () => { + it('silence tick starts a request once', async () => { + const { internals } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.speaking = 
true; + + const vad = runScriptedVad(internals); + try { + await vad.feed(inferenceDone(0, 300)); + await vad.feed(inferenceDone(0, 400)); + + expect(predictMock(stream).mock.calls.length).toBe(1); + expect(internals.turnDetectorPredictionFut).toBeDefined(); + } finally { + await vad.stop(); + } + }); + + it('resumed speech without SOS rearms the next pause', async () => { + const { internals } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.speaking = true; + + const vad = runScriptedVad(internals); + try { + await vad.feed(inferenceDone(0, 300)); + const firstFut = internals.turnDetectorPredictionFut; + expect(firstFut).toBeDefined(); + firstFut!.resolve(resolvedPrediction(0.1).event); + + // Speech resumes inside the still-open VAD segment → drop the request. + await vad.feed(inferenceDone(1, 0)); + expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1); + expect(cancelInferenceMock(stream)).toHaveBeenCalledWith(); + expect(internals.turnDetectorPredictionFut).toBeUndefined(); + + // The next pause gets a fresh window. + await vad.feed(inferenceDone(0, 300)); + expect(predictMock(stream).mock.calls.length).toBe(2); + expect(internals.turnDetectorPredictionFut).toBeDefined(); + expect(internals.turnDetectorPredictionFut).not.toBe(firstFut); + } finally { + await vad.stop(); + } + }); + + it('silence tick starts a request even while the agent is speaking', async () => { + // The agent-speaking gate was dropped: the silence tick warms a prediction + // during the user's pause even while the agent is still speaking. 
+ const { internals } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.speaking = true; + internals.isAgentSpeaking = true; + + const vad = runScriptedVad(internals); + try { + await vad.feed(inferenceDone(0, 300)); + expect(predictMock(stream).mock.calls.length).toBe(1); + expect(internals.turnDetectorPredictionFut).toBeDefined(); + } finally { + await vad.stop(); + } + }); + + it('EOS consumes the silence-tick request without predicting', async () => { + const { internals, hooks } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.speaking = true; + const { fut } = resolvedPrediction(0.9); + internals.turnDetectorPredictionFut = fut; + + const vad = runScriptedVad(internals); + try { + await vad.feed(endOfSpeech()); + expect(predictMock(stream).mock.calls.length).toBe(0); + expect(internals.turnDetectorPredictionFut).toBe(fut); + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1); + } finally { + await vad.stop(); + } + }); + + it('SOS tears down the request and rearms', async () => { + const { internals } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.turnDetectorPredictionFut = new Future(); + internals.turnDetectorFlushed = true; + + const vad = runScriptedVad(internals); + try { + await vad.feed(startOfSpeech()); + expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1); + expect(cancelInferenceMock(stream)).toHaveBeenCalledWith(); + expect(internals.turnDetectorPredictionFut).toBeUndefined(); + expect(internals.turnDetectorFlushed).toBe(false); + } finally { + await vad.stop(); + } + }); + + it('EOS never 
starts a request', async () => { + const { internals } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + + const vad = runScriptedVad(internals); + try { + await vad.feed(endOfSpeech()); + expect(predictMock(stream).mock.calls.length).toBe(0); + expect(internals.turnDetectorPredictionFut).toBeUndefined(); + + const { fut } = resolvedPrediction(0.9); + internals.turnDetectorPredictionFut = fut; + await vad.feed(endOfSpeech()); + expect(predictMock(stream).mock.calls.length).toBe(0); + expect(internals.turnDetectorPredictionFut).toBe(fut); + } finally { + await vad.stop(); + } + }); + + it('late stt final after flush short-circuits and warns once', async () => { + const { internals, hooks } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + internals.turnDetectorFlushed = true; + + for (let i = 0; i < 2; i++) { + internals.runEOUDetection(ChatContext.empty(), 'stt'); + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + } + + expect(predictMock(stream).mock.calls.length).toBe(0); + expect(hooks.onEotPrediction).not.toHaveBeenCalled(); + // Warn-once: the flag flips on the first late prediction, debug after. + expect(internals.turnDetectorLatePredictionWarned).toBe(true); + }); + + it('predict timeout signals fallback and drops the future', async () => { + const { internals, hooks } = makeRecognition(); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + // A pending future that never resolves → times out at minDelay. 
+ internals.turnDetectorPredictionFut = new Future(); + + internals.runEOUDetection(ChatContext.empty(), 'vad'); + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + + expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1); + expect(cancelInferenceMock(stream)).toHaveBeenCalledWith({ timedOut: true }); + expect(internals.turnDetectorPredictionFut).toBeUndefined(); + expect(hooks.onEotPrediction).not.toHaveBeenCalled(); + expect(stream.unlikelyThreshold).not.toHaveBeenCalled(); + expect(hooks.onEndOfTurn).toHaveBeenCalledTimes(1); + }); + + it('commit flushes the stream and marks the turn flushed', async () => { + const { internals, hooks } = makeRecognition(); + (hooks.onEndOfTurn as ReturnType<typeof vi.fn>).mockResolvedValue(true); + const stream = makeAudioStream(); + internals.turnDetectorStream = stream; + internals.turnDetector = makeAudioDetector(stream); + // confident → no maxDelay extension + internals.turnDetectorPredictionFut = resolvedPrediction(0.9).fut; + + internals.runEOUDetection(ChatContext.empty(), 'vad'); + expect(internals.bounceEOUTask).toBeDefined(); + await internals.bounceEOUTask!.result.catch(() => {}); + + expect(stream.flush).toHaveBeenCalledWith('turn committed'); + expect(internals.turnDetectorPredictionFut).toBeUndefined(); + expect(internals.turnDetectorFlushed).toBe(true); + }); +}); + +describe('TestVadMinSilenceRequirement', () => { + // The audio EOT detector needs ~200ms of trailing silence, so the VAD must + // report END_OF_SPEECH no earlier than that floor + a 50ms margin. 
+ const requiredMs = MIN_SILENCE_DURATION_MS + 50; + const fakeVad = (minSilenceDuration: number | null): VAD => + ({ minSilenceDuration }) as unknown as VAD; + + it('raises when min silence is too low for an audio detector', () => { + const { internals } = makeRecognition(); + internals.vad = fakeVad(requiredMs - 1); + internals.turnDetector = makeAudioDetector(makeAudioStream()); + + expect(() => internals.checkVadSilenceRequirement()).toThrow(/minSilenceDuration/); + }); + + it('passes when min silence is adequate', () => { + const { internals } = makeRecognition(); + internals.vad = fakeVad(requiredMs + 250); + internals.turnDetector = makeAudioDetector(makeAudioStream()); + + expect(() => internals.checkVadSilenceRequirement()).not.toThrow(); + }); + + it('skips validation for a non-audio detector', () => { + const { internals } = makeRecognition(); + internals.vad = fakeVad(requiredMs - 1); + internals.turnDetector = { model: 'x', provider: 'x' } as unknown as _TurnDetector; + + expect(() => internals.checkVadSilenceRequirement()).not.toThrow(); + }); + + it('skips validation when there is no VAD', () => { + const { internals } = makeRecognition(); + internals.vad = undefined; + internals.turnDetector = makeAudioDetector(makeAudioStream()); + + expect(() => internals.checkVadSilenceRequirement()).not.toThrow(); + }); + + it('skips validation when the VAD exposes no min-silence knob', () => { + const { internals } = makeRecognition(); + // A VAD whose minSilenceDuration is null can't be validated → allowed. 
+ internals.vad = fakeVad(null); + internals.turnDetector = makeAudioDetector(makeAudioStream()); + + expect(() => internals.checkVadSilenceRequirement()).not.toThrow(); + }); + + it('updateTurnDetector validates the pairing before building a stream', () => { + const { internals } = makeRecognition(); + internals.vad = fakeVad(requiredMs - 1); + const stream = makeAudioStream(); + const detector = makeAudioDetector(stream); + + expect(() => internals.updateTurnDetector(detector)).toThrow(/minSilenceDuration/); + + // Aborted before adopting the detector or opening a stream. + expect(internals.turnDetectorStream).toBeUndefined(); + expect((detector.stream as ReturnType<typeof vi.fn>).mock.calls.length).toBe(0); + }); +}); diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts index a6923c866..e30cae983 100644 --- a/agents/src/voice/events.ts +++ b/agents/src/voice/events.ts @@ -34,6 +34,8 @@ export enum AgentSessionEventTypes { SpeechCreated = 'speech_created', AgentFalseInterruption = 'agent_false_interruption', OverlappingSpeech = 'overlapping_speech', + /** The session's turn detector emitted an end-of-turn prediction. */ + EotPrediction = 'eot_prediction', Error = 'error', Close = 'close', } @@ -246,6 +248,46 @@ export const createSpeechCreatedEvent = ({ createdAt, }); +/** + * End-of-turn prediction from the session's turn detector. Audio detectors emit once per + * inference request (vad + stt-final triggers are deduped); text detectors emit on every bounce. + * + * Port of Python `EotPredictionEvent`. + */ +export type EotPredictionEvent = { + type: 'eot_prediction'; + /** End-of-turn probability in [0, 1] returned by the detector. */ + probability: number; + /** Threshold below which the detector treats the prediction as unlikely. */ + threshold: number; + /** Model-side inference time, in milliseconds. */ + inferenceDurationMs: number; + /** End-of-speech → prediction receive time, in milliseconds. 
*/ + delayMs: number; + createdAt: number; +}; + +export const createEotPredictionEvent = ({ + probability, + threshold, + inferenceDurationMs, + delayMs, + createdAt = Date.now(), +}: { + probability: number; + threshold: number; + inferenceDurationMs: number; + delayMs: number; + createdAt?: number; +}): EotPredictionEvent => ({ + type: 'eot_prediction', + probability, + threshold, + inferenceDurationMs, + delayMs, + createdAt, +}); + export type UserTurnExceededEvent = { type: 'user_turn_exceeded'; /** Transcript from the current uncommitted user turn only. */ diff --git a/agents/src/voice/remote_session.ts b/agents/src/voice/remote_session.ts index efe0b0547..235925b76 100644 --- a/agents/src/voice/remote_session.ts +++ b/agents/src/voice/remote_session.ts @@ -19,6 +19,7 @@ import { isInstructions, renderInstructions } from '../llm/chat_context.js'; import { type ToolContext, sortedToolNames } from '../llm/tool_context.js'; import { log } from '../log.js'; import type { + EOTModelUsage, InterruptionModelUsage, LLMModelUsage, STTModelUsage, @@ -33,6 +34,7 @@ import { type AgentState, type AgentStateChangedEvent, type ConversationItemAddedEvent, + type EotPredictionEvent, type ErrorEvent, type FunctionToolsExecutedEvent, type MetricsCollectedEvent, @@ -63,6 +65,7 @@ export type RemoteSessionEventTypes = | 'function_tools_executed' | 'overlapping_speech' | 'amd_prediction' + | 'eot_prediction' | 'session_usage' | 'debug_message' | 'error'; @@ -76,6 +79,7 @@ export type RemoteSessionCallbacks = { function_tools_executed: (ev: pb.AgentSessionEvent_FunctionToolsExecuted) => void; overlapping_speech: (ev: pb.AgentSessionEvent_OverlappingSpeech) => void; amd_prediction: (ev: pb.AgentSessionEvent_AmdPrediction) => void; + eot_prediction: (ev: pb.AgentSessionEvent_EotPrediction) => void; session_usage: (ev: pb.AgentSessionEvent_SessionUsageUpdated) => void; debug_message: (ev: pb.DebugMessage) => void; error: (ev: pb.AgentSessionEvent_Error) => void; @@ -584,6 +588,22 @@ 
function sessionUsageToProto(usage: AgentSessionUsage): pb.AgentSessionUsage { ); break; } + case 'eot_usage': { + const eu = mu as Partial<EOTModelUsage>; + modelUsages.push( + new pb.ModelUsage({ + usage: { + case: 'eot', + value: new pb.EotModelUsage({ + provider: eu.provider ?? '', + model: eu.model ?? '', + totalRequests: eu.totalRequests ?? 0, + }), + }, + }), + ); + break; + } } } return new pb.AgentSessionUsage({ modelUsage: modelUsages }); @@ -640,6 +660,7 @@ export class SessionHost { session.on(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted); session.on(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected); session.on(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech); + session.on(AgentSessionEventTypes.EotPrediction, this.onEotPrediction); session.on(AgentSessionEventTypes.Error, this.onHostError); session.on(AgentSessionEventTypes.DebugMessage, this.onDebugMessage); } @@ -669,6 +690,7 @@ export class SessionHost { this.session.off(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted); this.session.off(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected); this.session.off(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech); + this.session.off(AgentSessionEventTypes.EotPrediction, this.onEotPrediction); this.session.off(AgentSessionEventTypes.Error, this.onHostError); this.session.off(AgentSessionEventTypes.DebugMessage, this.onDebugMessage); } @@ -797,6 +819,10 @@ export class SessionHost { ); }; + private onEotPrediction = (event: EotPredictionEvent): void => { + this._onEotPrediction(event); + }; + private onOverlappingSpeech = (event: OverlappingSpeechEvent): void => { const value = new pb.AgentSessionEvent_OverlappingSpeech({ isInterruption: event.isInterruption, @@ -856,6 +882,23 @@ export class SessionHost { }); } + /** + * @internal — forwards an audio-EOT prediction to the connected + * {@link RemoteSession} peer.
Mirrors python + * `SessionHost._on_eot_prediction`. + */ + _onEotPrediction(event: EotPredictionEvent): void { + this.emitEvent({ + case: 'eotPrediction', + value: new pb.AgentSessionEvent_EotPrediction({ + probability: event.probability, + threshold: event.threshold, + inferenceDuration: msToDuration(event.inferenceDurationMs), + delay: msToDuration(event.delayMs), + }), + }); + } + private async handleRequestSafe(req: pb.SessionRequest): Promise { try { await this.handleRequest(req); @@ -1153,6 +1196,9 @@ export class RemoteSession extends (EventEmitter as new () => TypedEventEmitter< case 'amdPrediction': this.emit('amd_prediction', ev.value); break; + case 'eotPrediction': + this.emit('eot_prediction', ev.value); + break; case 'sessionUsageUpdated': this.emit('session_usage', ev.value); break; diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts index a7d3a51be..d3a557efa 100644 --- a/agents/src/voice/turn_config/turn_handling.ts +++ b/agents/src/voice/turn_config/turn_handling.ts @@ -36,11 +36,15 @@ export interface TurnHandlingOptions { * - `"realtime_llm"` – use server-side detection from a realtime LLM * - `"manual"` – caller controls turn boundaries explicitly * - * If not set, the session chooses the best available mode in priority order - * `realtime_llm → vad → stt → manual`; it automatically falls back if the necessary model - * is missing. + * - `undefined` (not set) – the session auto-provisions a default + * `inference.TurnDetector`, then chooses the best available mode in + * priority order `realtime_llm → vad → stt → manual`, falling back if the + * necessary model is missing. + * - `null` – explicitly opt out of turn detection (no default detector built). + * + * The `null`-vs-`undefined` distinction mirrors Python's `None` vs `NOT_GIVEN`. */ - turnDetection: TurnDetectionMode | undefined; + turnDetection: TurnDetectionMode | null | undefined; /** * Configuration for endpointing. 
*/ diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 8db23d49a..fbc3a4016 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -73,7 +73,13 @@ export function migrateLegacyOptions(legacyOptions: AgentSessionOption ...sessionOptions.turnHandling?.userTurnLimit, }, - turnDetection: sessionOptions?.turnHandling?.turnDetection ?? turnDetection, + // Preserve an explicit `null` (opt-out) distinctly from `undefined` (not + // given). `??` would collapse both, so only fall back to the deprecated + // top-level `turnDetection` when `turnHandling.turnDetection` is absent. + turnDetection: + sessionOptions?.turnHandling?.turnDetection !== undefined + ? sessionOptions.turnHandling.turnDetection + : turnDetection, } as const; if ( @@ -134,7 +140,12 @@ export function stripUndefined(obj: T): Partial { export function mergeWithDefaults(config: TurnHandlingOptions) { return { - turnDetection: config.turnDetection ?? defaultTurnHandlingOptions.turnDetection, + // Keep an explicit `null` (opt-out) — only an absent value falls back to + // the default, so the constructor can tell opt-out from not-given. + turnDetection: + config.turnDetection === undefined + ? 
defaultTurnHandlingOptions.turnDetection + : config.turnDetection, endpointing: { ...defaultEndpointingOptions, ...stripUndefined(config.endpointing) }, interruption: { ...defaultInterruptionOptions, ...stripUndefined(config.interruption) }, preemptiveGeneration: { diff --git a/agents/src/worker.ts b/agents/src/worker.ts index 8238a185e..b18a26a9a 100644 --- a/agents/src/worker.ts +++ b/agents/src/worker.ts @@ -15,10 +15,13 @@ import type { ParticipantInfo } from 'livekit-server-sdk'; import { AccessToken, RoomServiceClient } from 'livekit-server-sdk'; import { EventEmitter } from 'node:events'; import { availableParallelism } from 'node:os'; +import { extname } from 'node:path'; import { WebSocket } from 'ws'; import { APIStatusError } from './_exceptions.js'; import { getCpuMonitor } from './cpu.js'; import { HTTPServer } from './http_server.js'; +import { _getLocalInferenceModule } from './inference/_warmup.js'; +import { EOT_INFERENCE_METHOD } from './inference/eot/runner.js'; import { InferenceRunner } from './inference_runner.js'; import { InferenceProcExecutor } from './ipc/inference_proc_executor.js'; import { ProcPool } from './ipc/proc_pool.js'; @@ -33,6 +36,32 @@ const ASSIGNMENT_TIMEOUT = 7.5 * 1000; const UPDATE_LOAD_INTERVAL = 2.5 * 1000; const PROJECT_TYPE = 'nodejs'; +let localEotRunnerRegistered = false; +/** + * Register the local audio-EOT inference runner so it runs in the shared + * inference process. Idempotent and guarded by native-binding availability; + * a no-op (with a one-time warning) when `@livekit/local-inference` can't be + * loaded so the worker still starts on unsupported platforms. 
+ */ +function maybeRegisterLocalEotRunner(): void { + if (localEotRunnerRegistered) return; + localEotRunnerRegistered = true; + if (InferenceRunner.registeredRunners[EOT_INFERENCE_METHOD]) return; + if (_getLocalInferenceModule() === undefined) { + log().warn( + '@livekit/local-inference native binding unavailable; local audio EOT disabled ' + + '(predictions will degrade to a positive default). cloud EOT and other turn ' + + 'detection modes are unaffected.', + ); + return; + } + const ext = extname(import.meta.url); // '.js' (built) or '.ts' (tsx/ts-node) + InferenceRunner.registerRunner( + EOT_INFERENCE_METHOD, + new URL(`./inference/eot/runner${ext}`, import.meta.url).toString(), + ); +} + class Default { static loadThreshold(production: boolean): number { if (production) { @@ -328,6 +357,13 @@ export class AgentServer { } } + // Register the local audio-EOT runner so it runs in the shared inference + // process (loaded once per host, ~138 MB) instead of in every job worker. + // Guarded by binding availability: on a platform where + // `@livekit/local-inference` can't load, skip registration so the worker + // still starts (local EOT then degrades to a positive-default prediction). 
+ maybeRegisterLocalEotRunner(); + if (Object.entries(InferenceRunner.registeredRunners).length) { this.#inferenceExecutor = new InferenceProcExecutor({ runners: InferenceRunner.registeredRunners, diff --git a/examples/src/anam_realtime_agent.ts b/examples/src/anam_realtime_agent.ts index bc2fd33e4..fd4957faa 100644 --- a/examples/src/anam_realtime_agent.ts +++ b/examples/src/anam_realtime_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,17 +11,12 @@ import { voice, } from '@livekit/agents'; import * as anam from '@livekit/agents-plugin-anam'; -import * as livekit from '@livekit/agents-plugin-livekit'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; // Uses OpenAI Advanced Voice (Realtime), so no separate STT/TTS/VAD. export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { initializeLogger({ pretty: true }); @@ -31,7 +25,6 @@ export default defineAgent({ }); const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', @@ -45,7 +38,6 @@ export default defineAgent({ turnDetection: null, inputAudioTranscription: null, }), - turnDetection: new livekit.turnDetector.EnglishModel(), }); await session.start({ diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 95ecddb9a..10c8e5028 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -13,16 +12,14 @@ import { metrics, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; +// No prewarm hook needed: the local EOT model runs in the shared inference +// process (loaded once per host), and the silero VAD (~2MB, in-process) +// lazy-loads on first stream. export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: @@ -43,9 +40,6 @@ export default defineAgent({ const logger = log(); const session = new voice.AgentSession({ - // VAD and turn detection are used to determine when the user is speaking and when the agent should respond - // See more at https://docs.livekit.io/agents/build/turns - vad: ctx.proc.userData.vad! 
as silero.VAD, // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand // See all available models at https://docs.livekit.io/agents/models/stt/ stt: new inference.STT({ @@ -69,7 +63,8 @@ export default defineAgent({ }), ttsTextTransforms: ['filter_markdown', 'filter_emoji'], turnHandling: { - turnDetection: new livekit.turnDetector.MultilingualModel(), + // turn detection defaults to the audio inference.TurnDetector when unset. + // See https://docs.livekit.io/agents/build/turns interruption: { // Enable false-interruption auto-resume behavior. resumeFalseInterruption: true, @@ -118,7 +113,7 @@ export default defineAgent({ }); session.on(voice.AgentSessionEventTypes.OverlappingSpeech, (ev) => { - logger.warn({ type: ev.type, isInterruption: ev.isInterruption }, 'user overlapping speech'); + logger.info({ type: ev.type, isInterruption: ev.isInterruption }, 'user overlapping speech'); }); await session.start({ diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index aacbeee5c..0549f4197 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,7 +11,6 @@ import { voice, } from '@livekit/agents'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -110,12 +108,8 @@ class SurveyAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3' }), llm: new openai.responses.LLM({ useWebSocket: true }), tts: new inference.TTS({ diff --git 
a/examples/src/basic_task_group.ts b/examples/src/basic_task_group.ts index d40befe2a..0c24c2059 100644 --- a/examples/src/basic_task_group.ts +++ b/examples/src/basic_task_group.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, beta, cli, @@ -13,7 +12,6 @@ import { voice, } from '@livekit/agents'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -120,12 +118,8 @@ class TaskGroupDemoAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3' }), llm: new openai.responses.LLM({ model: 'gpt-5.2', diff --git a/examples/src/basic_tool_call_agent.ts b/examples/src/basic_tool_call_agent.ts index 5642ef488..8d576cf1f 100644 --- a/examples/src/basic_tool_call_agent.ts +++ b/examples/src/basic_tool_call_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,8 +10,6 @@ import { llm, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -39,9 +36,6 @@ class GameAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const getWeather = llm.tool({ description: ' Called when the user asks about the weather.', @@ -133,17 +127,13 @@ export default 
defineAgent({ }, }); - const vad = ctx.proc.userData.vad! as silero.VAD; - const session = new voice.AgentSession({ - vad, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), llm: new inference.LLM({ model: 'google/gemini-3-flash-preview' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnDetection: new livekit.turnDetector.MultilingualModel(), userData: { number: 0 }, voiceOptions: { preemptiveGeneration: true, diff --git a/examples/src/cartesia.ts b/examples/src/cartesia.ts index 34cd47fd4..e49bb623d 100644 --- a/examples/src/cartesia.ts +++ b/examples/src/cartesia.ts @@ -4,7 +4,6 @@ import type { llm as llmModule } from '@livekit/agents'; import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -16,14 +15,10 @@ import { import * as cartesia from '@livekit/agents-plugin-cartesia'; import * as google from '@livekit/agents-plugin-google'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: @@ -31,8 +26,6 @@ export default defineAgent({ }); const logger = log(); - const vad = - ctx.proc.userData.vad instanceof silero.VAD ? 
ctx.proc.userData.vad : await silero.VAD.load(); const apiKey = process.env.CARTESIA_API_KEY; @@ -67,7 +60,6 @@ export default defineAgent({ } const session = new voice.AgentSession({ - vad, stt: new cartesia.STT({ model: 'ink-2', apiKey }), llm, tts: new cartesia.TTS({ model: 'sonic-3.5', apiKey }), diff --git a/examples/src/comprehensive_test.ts b/examples/src/comprehensive_test.ts index 6e9fc8f07..9e9e25225 100644 --- a/examples/src/comprehensive_test.ts +++ b/examples/src/comprehensive_test.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, dedent, @@ -21,7 +20,6 @@ import * as livekit from '@livekit/agents-plugin-livekit'; import * as neuphonic from '@livekit/agents-plugin-neuphonic'; import * as openai from '@livekit/agents-plugin-openai'; import * as resemble from '@livekit/agents-plugin-resemble'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -238,14 +236,9 @@ class TestAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log(); - const vad = ctx.proc.userData.vad! 
as silero.VAD; const session = new voice.AgentSession({ - vad, userData: { testedSttChoices: new Set(), testedTtsChoices: new Set(), diff --git a/examples/src/custom_text_handler.ts b/examples/src/custom_text_handler.ts index 5ba65e773..5cc10e668 100644 --- a/examples/src/custom_text_handler.ts +++ b/examples/src/custom_text_handler.ts @@ -3,15 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, inference, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; @@ -39,26 +36,19 @@ const customTextInputHandler = (session: voice.AgentSession, event: voice.TextIn }; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: "You are a helpful assistant, you can hear the user's message and respond to it.", }); - const vad = ctx.proc.userData.vad! as silero.VAD; - const session = new voice.AgentSession({ - vad, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnDetection: new livekit.turnDetector.MultilingualModel(), }); await session.start({ diff --git a/examples/src/drive-thru/drivethru_agent.ts b/examples/src/drive-thru/drivethru_agent.ts index 9882f6fcd..c9a534dec 100644 --- a/examples/src/drive-thru/drivethru_agent.ts +++ b/examples/src/drive-thru/drivethru_agent.ts @@ -1,20 +1,10 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 -import { - type JobContext, - type JobProcess, - ServerOptions, - cli, - defineAgent, - llm, - voice, -} from '@livekit/agents'; +import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents'; import * as deepgram from '@livekit/agents-plugin-deepgram'; import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; -import * as livekit from '@livekit/agents-plugin-livekit'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; import { @@ -376,19 +366,13 @@ export async function newUserData(): Promise { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const userdata = await newUserData(); - const vad = ctx.proc.userData.vad! as silero.VAD; const session = new voice.AgentSession({ - vad, stt: new deepgram.STT(), llm: new openai.LLM({ model: 'gpt-4.1', temperature: 0.45 }), tts: new elevenlabs.TTS(), - turnDetection: new livekit.turnDetector.MultilingualModel(), userData: userdata, voiceOptions: { maxToolSteps: 10, diff --git a/examples/src/elevenlabs_scribe_v2.ts b/examples/src/elevenlabs_scribe_v2.ts index d0574c02c..dbb4ac1fa 100644 --- a/examples/src/elevenlabs_scribe_v2.ts +++ b/examples/src/elevenlabs_scribe_v2.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,13 +10,9 @@ import { voice, } from '@livekit/agents'; import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const stt = new 
elevenlabs.STT({ useRealtime: true, @@ -32,7 +27,6 @@ export default defineAgent({ const session = new voice.AgentSession({ voiceOptions: { allowInterruptions: true }, - vad: ctx.proc.userData.vad! as silero.VAD, stt, llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), tts: new inference.TTS({ model: 'cartesia/sonic-3' }), diff --git a/examples/src/frontdesk/frontdesk_agent.ts b/examples/src/frontdesk/frontdesk_agent.ts index d5d2e1ab1..8e9e50f42 100644 --- a/examples/src/frontdesk/frontdesk_agent.ts +++ b/examples/src/frontdesk/frontdesk_agent.ts @@ -1,20 +1,10 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { - type JobContext, - type JobProcess, - ServerOptions, - cli, - defineAgent, - llm, - voice, -} from '@livekit/agents'; +import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents'; import * as deepgram from '@livekit/agents-plugin-deepgram'; import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; -import * as livekit from '@livekit/agents-plugin-livekit'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -196,9 +186,6 @@ You must infer the appropriate range implicitly from the conversational context } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const timezone = 'UTC'; @@ -220,13 +207,11 @@ export default defineAgent({ const userdata: Userdata = { cal }; const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, stt: new deepgram.STT(), llm: new openai.LLM({ model: 'gpt-4.1', }), tts: new elevenlabs.TTS(), - turnDetection: new livekit.turnDetector.MultilingualModel(), userData: userdata, voiceOptions: { maxToolSteps: 1, diff --git a/examples/src/gemini_realtime_agent.ts b/examples/src/gemini_realtime_agent.ts index 60cbb443e..3b82c8ec0 100644 --- a/examples/src/gemini_realtime_agent.ts +++ b/examples/src/gemini_realtime_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, dedent, @@ -12,7 +11,6 @@ import { voice, } from '@livekit/agents'; import * as google from '@livekit/agents-plugin-google'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -117,14 +115,10 @@ class StoryAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const userdata: StoryData = {}; const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, llm: new google.realtime.RealtimeModel({ thinkingConfig: { // Making the thoughts false to speed up the realtime response diff --git a/examples/src/hume_tts.ts b/examples/src/hume_tts.ts index fbf05c689..86daa1854 100644 --- a/examples/src/hume_tts.ts +++ b/examples/src/hume_tts.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,15 +11,10 @@ import { voice, } from '@livekit/agents'; import * as hume from '@livekit/agents-plugin-hume'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: @@ -39,8 +33,6 @@ export default defineAgent({ stt: 'deepgram/nova-3', llm: 'openai/gpt-4.1-mini', tts, - vad: ctx.proc.userData.vad! as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), voiceOptions: { preemptiveGeneration: true, }, diff --git a/examples/src/idle_user_timeout_example.ts b/examples/src/idle_user_timeout_example.ts index 47b9d2643..d326c881b 100644 --- a/examples/src/idle_user_timeout_example.ts +++ b/examples/src/idle_user_timeout_example.ts @@ -8,7 +8,6 @@ */ import { type JobContext, - type JobProcess, ServerOptions, Task, cli, @@ -19,21 +18,15 @@ import { log, voice, } from '@livekit/agents'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; initializeLogger({ pretty: true }); export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log(); - const vad = ctx.proc.userData.vad! 
as silero.VAD; const session = new voice.AgentSession({ - vad, llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), tts: new inference.TTS({ diff --git a/examples/src/instructions_per_modality.ts b/examples/src/instructions_per_modality.ts index 71f2f3f04..793f9819c 100644 --- a/examples/src/instructions_per_modality.ts +++ b/examples/src/instructions_per_modality.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,7 +11,6 @@ import { log, voice, } from '@livekit/agents'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -79,12 +77,8 @@ class SchedulingAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3' }), llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), tts: new inference.TTS({ diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts index 9a4ddf2a4..ba9f6a888 100644 --- a/examples/src/inworld_tts.ts +++ b/examples/src/inworld_tts.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,15 +11,10 @@ import { voice, } from '@livekit/agents'; import * as inworld from '@livekit/agents-plugin-inworld'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: @@ -69,10 +63,6 @@ export default defineAgent({ // Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear // See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/ tts, - // VAD and turn detection are used to determine when the user is speaking and when the agent should respond - // See more at https://docs.livekit.io/agents/build/turns - vad: ctx.proc.userData.vad! 
as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), voiceOptions: { diff --git a/examples/src/lemonslice_realtime_avatar.ts b/examples/src/lemonslice_realtime_avatar.ts index b2afc544b..c5d7c9ab8 100644 --- a/examples/src/lemonslice_realtime_avatar.ts +++ b/examples/src/lemonslice_realtime_avatar.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,16 +11,11 @@ import { voice, } from '@livekit/agents'; import * as lemonslice from '@livekit/agents-plugin-lemonslice'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; initializeLogger({ pretty: true }); export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { try { const agent = new voice.Agent({ @@ -40,8 +34,6 @@ export default defineAgent({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnDetection: new livekit.turnDetector.MultilingualModel(), - vad: ctx.proc.userData.vad! 
as silero.VAD, turnHandling: { interruption: { resumeFalseInterruption: false, diff --git a/examples/src/liveavatar_avatar.ts b/examples/src/liveavatar_avatar.ts index fe502d3b3..bc428142d 100644 --- a/examples/src/liveavatar_avatar.ts +++ b/examples/src/liveavatar_avatar.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -13,14 +12,9 @@ import { voice, } from '@livekit/agents'; import * as liveavatar from '@livekit/agents-plugin-liveavatar'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log().child({ example: 'liveavatar_avatar' }); @@ -39,8 +33,6 @@ export default defineAgent({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnDetection: new livekit.turnDetector.MultilingualModel(), - vad: ctx.proc.userData.vad! 
as silero.VAD, voiceOptions: { preemptiveGeneration: true, }, diff --git a/examples/src/llm_fallback_adapter.ts b/examples/src/llm_fallback_adapter.ts index d053464dc..d3d407214 100644 --- a/examples/src/llm_fallback_adapter.ts +++ b/examples/src/llm_fallback_adapter.ts @@ -16,26 +16,14 @@ * - Configurable timeouts and retry behavior * - Event emission when provider availability changes */ -import { - type JobContext, - type JobProcess, - ServerOptions, - cli, - defineAgent, - llm, - voice, -} from '@livekit/agents'; +import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents'; import * as deepgram from '@livekit/agents-plugin-deepgram'; import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { // Create multiple LLM instances for fallback // The FallbackAdapter will try them in order: primary -> secondary -> tertiary @@ -85,7 +73,6 @@ export default defineAgent({ }); const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, stt: new deepgram.STT(), tts: new elevenlabs.TTS(), llm: fallbackLLM, // Use the FallbackAdapter instead of a single LLM diff --git a/examples/src/manual_shutdown.ts b/examples/src/manual_shutdown.ts index 96bedb901..56f770ca0 100644 --- a/examples/src/manual_shutdown.ts +++ b/examples/src/manual_shutdown.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,16 +10,11 @@ import { llm, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const agent = new voice.Agent({ instructions: @@ -66,8 +60,6 @@ export default defineAgent({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - vad: ctx.proc.userData.vad! 
as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), voiceOptions: { preemptiveGeneration: true, }, diff --git a/examples/src/multi_agent.ts b/examples/src/multi_agent.ts index 7f4819bed..263ba6093 100644 --- a/examples/src/multi_agent.ts +++ b/examples/src/multi_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, dedent, @@ -12,8 +11,6 @@ import { llm, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -72,14 +69,10 @@ class StoryAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const userdata: StoryData = {}; const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', @@ -89,7 +82,6 @@ export default defineAgent({ // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), userData: userdata, - turnDetection: new livekit.turnDetector.EnglishModel(), }); await session.start({ diff --git a/examples/src/push_to_talk.ts b/examples/src/push_to_talk.ts index ecba61363..06dbba40a 100644 --- a/examples/src/push_to_talk.ts +++ b/examples/src/push_to_talk.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,7 +10,6 @@ import { initializeLogger, voice, } from '@livekit/agents'; -import * as silero from '@livekit/agents-plugin-silero'; import type { ChatContext, ChatMessage } from 'agents/dist/llm/chat_context.js'; import { fileURLToPath } from 'node:url'; @@ -25,14 +23,10 @@ 
class MyAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { initializeLogger({ pretty: true }); const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), tts: new inference.TTS({ diff --git a/examples/src/raw_function_description.ts b/examples/src/raw_function_description.ts index 6548fd011..6a1d744a5 100644 --- a/examples/src/raw_function_description.ts +++ b/examples/src/raw_function_description.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,8 +10,6 @@ import { llm, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; function createRawFunctionAgent() { @@ -48,14 +45,8 @@ function createRawFunctionAgent() { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { - const vad = ctx.proc.userData.vad! 
as silero.VAD; - const session = new voice.AgentSession({ - vad, stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en', @@ -68,7 +59,6 @@ export default defineAgent({ // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), userData: { number: 0 }, - turnDetection: new livekit.turnDetector.EnglishModel(), }); await session.start({ diff --git a/examples/src/realtime_agent.ts b/examples/src/realtime_agent.ts index b30171776..a6879262b 100644 --- a/examples/src/realtime_agent.ts +++ b/examples/src/realtime_agent.ts @@ -1,17 +1,8 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { - type JobContext, - type JobProcess, - ServerOptions, - cli, - defineAgent, - llm, - voice, -} from '@livekit/agents'; +import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { readFileSync } from 'node:fs'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -19,9 +10,6 @@ import { z } from 'zod'; const roomNameSchema = z.enum(['bedroom', 'living room', 'kitchen', 'bathroom', 'office']); export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const getWeather = llm.tool({ description: ' Called when the user asks about the weather.', diff --git a/examples/src/realtime_turn_detector.ts b/examples/src/realtime_turn_detector.ts index 6e6ff90dd..7057eac31 100644 --- a/examples/src/realtime_turn_detector.ts +++ b/examples/src/realtime_turn_detector.ts @@ -3,26 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, + inference, voice, } from '@livekit/agents'; import * as deepgram from '@livekit/agents-plugin-deepgram'; 
import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; -import * as livekit from '@livekit/agents-plugin-livekit'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! as silero.VAD, stt: new deepgram.STT(), tts: new elevenlabs.TTS(), // To use OpenAI Realtime API @@ -33,7 +27,7 @@ export default defineAgent({ turnDetection: null, inputAudioTranscription: null, }), - turnDetection: new livekit.turnDetector.EnglishModel(), + turnDetection: new inference.TurnDetector(), }); await session.start({ diff --git a/examples/src/realtime_with_tts.ts b/examples/src/realtime_with_tts.ts index d87db7853..05df047be 100644 --- a/examples/src/realtime_with_tts.ts +++ b/examples/src/realtime_with_tts.ts @@ -1,27 +1,14 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 -import { - type JobContext, - type JobProcess, - ServerOptions, - cli, - defineAgent, - llm, - log, - voice, -} from '@livekit/agents'; +import { type JobContext, ServerOptions, cli, defineAgent, llm, log, voice } from '@livekit/agents'; import * as cartesia from '@livekit/agents-plugin-cartesia'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log(); diff --git a/examples/src/restaurant_agent.ts b/examples/src/restaurant_agent.ts index d9faaf9a5..081552c1a 100644 --- a/examples/src/restaurant_agent.ts +++ b/examples/src/restaurant_agent.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, dedent, @@ -12,7 +11,6 @@ import { llm, voice, } from '@livekit/agents'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -358,9 +356,6 @@ function createCheckoutAgent(menu: string) { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const menu = 'Pizza: $10, Salad: $5, Ice Cream: $3, Coffee: $2'; const userData = createUserData({ @@ -370,9 +365,10 @@ export default defineAgent({ checkout: createCheckoutAgent(menu), }); - const vad = ctx.proc.userData.vad! as silero.VAD; const session = new voice.AgentSession({ - vad, + // VAD is auto-provisioned by AgentSession (bundled silero via + // @livekit/local-inference). Pass `vad: null` to opt out, or pass + // your own `new inference.VAD({ ... 
})` to customise. stt: new inference.STT({ model: 'deepgram/nova-3' }), llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), tts: new inference.TTS({ model: 'cartesia/sonic-3' }), diff --git a/examples/src/runway_avatar.ts b/examples/src/runway_avatar.ts index 3d3cde0bd..dd0c3aaf5 100644 --- a/examples/src/runway_avatar.ts +++ b/examples/src/runway_avatar.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -13,17 +12,12 @@ import { } from '@livekit/agents'; import * as google from '@livekit/agents-plugin-google'; import * as runway from '@livekit/agents-plugin-runway'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log(); const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad! 
as silero.VAD, llm: new google.realtime.RealtimeModel({ thinkingConfig: { includeThoughts: false }, }), diff --git a/examples/src/telephony_amd.ts b/examples/src/telephony_amd.ts index 424221935..d22e8e3d1 100644 --- a/examples/src/telephony_amd.ts +++ b/examples/src/telephony_amd.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -11,8 +10,6 @@ import { log, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { TrackKind } from '@livekit/rtc-node'; import { RoomServiceClient, SipClient } from 'livekit-server-sdk'; import { fileURLToPath } from 'node:url'; @@ -41,9 +38,6 @@ class MyAgent extends voice.Agent { * SIP_PARTICIPANT_IDENTITY — identity to assign the dialed participant */ export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const logger = log().child({ room: ctx.room.name }); @@ -57,10 +51,6 @@ export default defineAgent({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnHandling: { - turnDetection: new livekit.turnDetector.MultilingualModel(), - }, - vad: ctx.proc.userData.vad! 
as silero.VAD, preemptiveGeneration: true, }); diff --git a/examples/src/tool_call_disfluency.ts b/examples/src/tool_call_disfluency.ts index 8f92183a8..c89917d54 100644 --- a/examples/src/tool_call_disfluency.ts +++ b/examples/src/tool_call_disfluency.ts @@ -4,7 +4,6 @@ import { AutoSubscribe, type JobContext, - type JobProcess, ServerOptions, cli, defineAgent, @@ -12,9 +11,7 @@ import { voice, } from '@livekit/agents'; import * as elevenlabs from '@livekit/agents-plugin-elevenlabs'; -import * as livekit from '@livekit/agents-plugin-livekit'; import * as openai from '@livekit/agents-plugin-openai'; -import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -30,13 +27,9 @@ class VoiceAgent extends voice.Agent { } export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { await ctx.connect(undefined, AutoSubscribe.AUDIO_ONLY, undefined); await ctx.waitForParticipant(); - const vad = ctx.proc.userData.vad! 
as silero.VAD; const getWeather = llm.tool({ description: ' Called when the user asks about the weather.', @@ -61,10 +54,8 @@ export default defineAgent({ }); const session = new voice.AgentSession({ - vad, llm: new openai.realtime.RealtimeModel(), tts: new elevenlabs.TTS(), - turnDetection: new livekit.turnDetector.MultilingualModel(), }); await session.start({ diff --git a/examples/src/warm_transfer.ts b/examples/src/warm_transfer.ts index d2d56e7f1..546993724 100644 --- a/examples/src/warm_transfer.ts +++ b/examples/src/warm_transfer.ts @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 import { type JobContext, - type JobProcess, ServerOptions, beta, cli, @@ -13,8 +12,6 @@ import { log, voice, } from '@livekit/agents'; -import * as livekit from '@livekit/agents-plugin-livekit'; -import * as silero from '@livekit/agents-plugin-silero'; import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; @@ -95,20 +92,19 @@ Examples on when the tool should be called: } } +// No prewarm hook needed: the local EOT model runs in the shared inference +// process (loaded once per host), and the inference VAD (~2MB, in-process) +// lazy-loads on first stream. 
export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, entry: async (ctx: JobContext) => { const session = new voice.AgentSession({ - vad: ctx.proc.userData.vad as silero.VAD, + vad: new inference.VAD(), llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', }), - turnDetection: new livekit.turnDetector.MultilingualModel(), }); await session.start({ diff --git a/plugins/livekit/src/turn_detector/base.ts b/plugins/livekit/src/turn_detector/base.ts index 93ecdd7f9..3fa1dd139 100644 --- a/plugins/livekit/src/turn_detector/base.ts +++ b/plugins/livekit/src/turn_detector/base.ts @@ -231,8 +231,11 @@ export abstract class EOUModel { return (await this.unlikelyThreshold(language)) !== undefined; } + // `_timeoutMs` is part of the unified `_TurnDetector` contract (milliseconds, + // matching the audio EOT detector). Text-based inference is bounded by the IPC + // executor itself, so this detector does not use the value. // eslint-disable-next-line @typescript-eslint/no-unused-vars - async predictEndOfTurn(chatCtx: llm.ChatContext, timeout: number = 3): Promise { + async predictEndOfTurn(chatCtx: llm.ChatContext, _timeoutMs?: number): Promise { let messages: RawChatItem[] = []; for (const message of chatCtx.items) { diff --git a/plugins/livekit/src/turn_detector/index.ts b/plugins/livekit/src/turn_detector/index.ts index 8ffad4c1b..fe64920aa 100644 --- a/plugins/livekit/src/turn_detector/index.ts +++ b/plugins/livekit/src/turn_detector/index.ts @@ -6,6 +6,13 @@ import { extname } from 'node:path'; import { INFERENCE_METHOD_EN } from './english.js'; import { INFERENCE_METHOD_MULTILINGUAL } from './multilingual.js'; +console.warn( + 'The text-based turn detector from @livekit/agents-plugin-livekit is deprecated. 
' + + 'The audio EOT detector in `@livekit/agents` inference (TurnDetector) replaces ' + + 'it and runs natively on-device via @livekit/local-inference. ' + + 'This text-based path will be removed in a future release.', +); + export { EOUModel } from './base.js'; export { EnglishModel } from './english.js'; export { MultilingualModel } from './multilingual.js'; diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts index 57e94ba8d..cd0423913 100644 --- a/plugins/livekit/src/turn_detector/multilingual.ts +++ b/plugins/livekit/src/turn_detector/multilingual.ts @@ -68,10 +68,10 @@ export class MultilingualModel extends EOUModel { return threshold; } - async predictEndOfTurn(chatCtx: llm.ChatContext, timeout: number = 3): Promise { + async predictEndOfTurn(chatCtx: llm.ChatContext, timeoutMs?: number): Promise { const url = remoteInferenceUrl(); if (!url) { - return await super.predictEndOfTurn(chatCtx, timeout); + return await super.predictEndOfTurn(chatCtx, timeoutMs); } // Copy and process chat context similar to Python implementation diff --git a/plugins/silero/src/index.ts b/plugins/silero/src/index.ts index 2b5b67fb6..41a4dc96e 100644 --- a/plugins/silero/src/index.ts +++ b/plugins/silero/src/index.ts @@ -5,6 +5,14 @@ import { Plugin } from '@livekit/agents'; export { VAD, VADStream } from './vad.js'; +console.warn( + '@livekit/agents-plugin-silero is deprecated and will be removed in v2.0. ' + + 'AgentSession now defaults to the bundled silero VAD (via @livekit/local-inference); ' + + 'drop the explicit `vad=` argument entirely, pass `vad: null` to opt out, or use ' + + "`import { inference } from '@livekit/agents'; new inference.VAD({ model: 'silero', ... 
})` " + + 'to customise options.', +); + class SileroPlugin extends Plugin { constructor() { super({ diff --git a/plugins/silero/src/vad.test.ts b/plugins/silero/src/vad.test.ts index ac59ba5cf..89b1df17b 100644 --- a/plugins/silero/src/vad.test.ts +++ b/plugins/silero/src/vad.test.ts @@ -1,12 +1,18 @@ // SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { AudioByteStream, type VADEvent, VADEventType, mergeFrames } from '@livekit/agents'; +import { + AudioByteStream, + type VADEvent, + VADEventType, + type VADStream, + mergeFrames, +} from '@livekit/agents'; import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import { readFileSync } from 'node:fs'; import { join } from 'node:path'; import { describe, expect, it } from 'vitest'; -import { VAD, type VADStream } from './vad.js'; +import { VAD } from './vad.js'; const TARGET_SAMPLE_RATE = 16000; const CHUNK_DURATION_MS = 10; diff --git a/plugins/silero/src/vad.ts b/plugins/silero/src/vad.ts index b78d87a47..970017611 100644 --- a/plugins/silero/src/vad.ts +++ b/plugins/silero/src/vad.ts @@ -6,6 +6,7 @@ import { VADEventType, VADStream as baseStream, VAD as baseVAD, + inference, log, mergeFrames, } from '@livekit/agents'; @@ -97,12 +98,41 @@ export class VAD extends baseVAD { * @param options - * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming. */ - static async load(opts: Partial = {}): Promise { + static async load(opts: Partial = {}): Promise { const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts }; + + // When the requested settings are compatible with the bundled native + // implementation in `@livekit/local-inference`, delegate to + // `inference.VAD({ model: 'silero' })` so existing call sites transparently + // get the faster native path as part of the silero deprecation. + // The native lib only ships the 16 kHz model, so any other sample rate + // falls back to the legacy onnxruntime path below. 
+ if (mergedOpts.sampleRate === 16000) { + if (!mergedOpts.forceCPU) { + log().warn( + 'forceCPU=false is ignored when using the bundled native VAD; the ' + + 'model runs CPU-only. Use a non-16kHz sampleRate to keep the legacy ' + + 'onnxruntime path that honors forceCPU.', + ); + } + return new inference.VAD({ + model: 'silero', + minSpeechDuration: mergedOpts.minSpeechDuration, + minSilenceDuration: mergedOpts.minSilenceDuration, + prefixPaddingDuration: mergedOpts.prefixPaddingDuration, + maxBufferedSpeech: mergedOpts.maxBufferedSpeech, + activationThreshold: mergedOpts.activationThreshold, + }); + } + const session = await newInferenceSession(mergedOpts.forceCPU); return new VAD(session, mergedOpts); } + override get minSilenceDuration(): number { + return this.#opts.minSilenceDuration; + } + stream(): VADStream { const stream = new VADStream( this, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 834973df5..108d9fdd8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -48,7 +48,7 @@ importers: version: 6.21.0(eslint@8.57.0)(typescript@5.9.3) '@vitest/coverage-v8': specifier: 4.0.17 - version: 4.0.17(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))) + version: 4.0.17(vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0)) eslint: specifier: ^8.56.0 version: 8.57.0 @@ -102,7 +102,7 @@ importers: version: 7.3.2(@types/node@22.19.1)(tsx@4.21.0) vitest: specifier: ^4.0.17 - version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) + version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0) agents: dependencies: @@ -112,12 +112,15 @@ importers: '@ffmpeg-installer/ffmpeg': specifier: ^1.1.0 version: 1.1.0 + '@livekit/local-inference': + specifier: ^0.2.5 + version: 0.2.5 '@livekit/mutex': specifier: ^1.1.1 version: 1.1.1 '@livekit/protocol': - specifier: ^1.46.4 - version: 1.46.4 + specifier: ^1.46.5 + version: 1.46.6 
'@livekit/throws-transformer': specifier: 0.1.8 version: 0.1.8(typescript@5.9.3) @@ -374,7 +377,7 @@ importers: version: 5.9.3 vitest: specifier: ^4.0.17 - version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) + version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0) zod: specifier: ^4.1.12 version: 4.3.6 @@ -993,7 +996,7 @@ importers: version: 5.9.3 vitest: specifier: ^4.0.17 - version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0)) + version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(tsx@4.21.0) plugins/neuphonic: dependencies: @@ -1328,7 +1331,7 @@ importers: version: 1.0.16 vitest: specifier: ^4.0.17 - version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) + version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0) devDependencies: '@livekit/agents': specifier: workspace:* @@ -2130,6 +2133,35 @@ packages: '@livekit/changesets-changelog-github@0.0.4': resolution: {integrity: sha512-MXaiLYwgkYciZb8G2wkVtZ1pJJzZmVx5cM30Q+ClslrIYyAqQhRbPmZDM79/5CGxb1MTemR/tfOM25tgJgAK0g==} + '@livekit/local-inference-darwin-arm64@0.2.5': + resolution: {integrity: sha512-tdAGJRiYwko0rOmeI/dXf7Mo5TF+oeWDsK55Ga/2PZ/SHuYZ8jkJAPRaG1k78ePsJ119lySWZsxnJdVnOJowRA==} + cpu: [arm64] + os: [darwin] + + '@livekit/local-inference-darwin-x64@0.2.5': + resolution: {integrity: sha512-FeJUHbx1swyAssS/X9CoI8s4OqeSrYJy/xhKhL0VnH1b5tlVfc6V5OjkLNZl55Jw9JYj0YkYpt0m0OIg3SvYRw==} + cpu: [x64] + os: [darwin] + + '@livekit/local-inference-linux-arm64-gnu@0.2.5': + resolution: {integrity: sha512-hXigtVBLS55wT6oOfpDl2Xh6mhfzsrMxvkLftFFfttjFfFjSouuxkxG5NgQTGP01DGAvYO6mnIP8ASK6livr1w==} + cpu: [arm64] + os: [linux] + + '@livekit/local-inference-linux-x64-gnu@0.2.5': + resolution: {integrity: sha512-3unNMNNc9rLCvGH6f3W6DKd4AlF5Z63mdOh9bGtEDZdPon/h7O3oWo9+6N/sHgULfHyD/vZn2NtT4MLtuhoJIw==} + cpu: [x64] + os: [linux] 
+ + '@livekit/local-inference-win32-x64-msvc@0.2.5': + resolution: {integrity: sha512-3s9paiOPwU+TQYPHNLzMxm/xCoZ8swzt8GF2BZSofI/jL2ao4SK1J3D23JEZuQfuZF4iLZm2dlIxMqAodQ9TCA==} + cpu: [x64] + os: [win32] + + '@livekit/local-inference@0.2.5': + resolution: {integrity: sha512-0n2m4pld1jMqgeZyHs4+3q9gPzq0ousrx3wA8kULAoia/464uIsJ3JqrVGnH8yD4P/yrGeK11VpZ87S+hKeMAQ==} + engines: {node: '>=18.0.0'} + '@livekit/mutex@1.1.1': resolution: {integrity: sha512-EsshAucklmpuUAfkABPxJNhzj9v2sG7JuzFDL4ML1oJQSV14sqrpTYnsaOudMAw9yOaW53NU3QQTlUQoRs4czw==} @@ -2161,8 +2193,8 @@ packages: cpu: [x64] os: [win32] - '@livekit/protocol@1.46.4': - resolution: {integrity: sha512-yJZ8xvyVcs9CczK2V/EQQrSW0MA9VaZ1vL+FI6fd85KhIjfOg26HvrdUl2LZPT78Tu4R4opV4AW58eN5vgmzqg==} + '@livekit/protocol@1.46.6': + resolution: {integrity: sha512-upzlHP1vi/kZ/QqALZTFskQ0ifqc2f15RKucHYOsIHJsaXvEYanG75mAb7o+Yomfs4XhQ4BaRsdY+TFHXpaqrg==} '@livekit/rtc-ffi-bindings-darwin-arm64@0.12.60': resolution: {integrity: sha512-YHXqybkYfaTc3txJXXWoVogiSP3yKJdkaZlIlZ6IDMGnN9elUoHDYU+ZSn/rbdGu0pp4HUOzffXkbkItN735Bw==} @@ -2697,8 +2729,8 @@ packages: '@types/argparse@1.0.38': resolution: {integrity: sha512-ebDJ9b0e702Yr7pWgB0jzm+CX4Srzz8RcXtLJDJB+BSccqMa36uyH/zUsSYao5+BD1ytv3k3rPYCq4mAE1hsXA==} - '@types/chai@5.2.3': - resolution: {integrity: sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==} + '@types/chai@5.2.2': + resolution: {integrity: sha512-8kB30R7Hwqf40JPiKhVzodJs2Qc1ZJ5zuT3uzw5Hq/dhNCl3G3l83jfpdI1e20BP348+fV7VIL/+FxaXkqBmWg==} '@types/deep-eql@4.0.2': resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==} @@ -2706,9 +2738,6 @@ packages: '@types/estree@1.0.8': resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==} - '@types/estree@1.0.9': - resolution: {integrity: 
sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg==} - '@types/fluent-ffmpeg@2.1.28': resolution: {integrity: sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw==} @@ -2816,14 +2845,14 @@ packages: '@vitest/browser': optional: true - '@vitest/expect@4.1.0': - resolution: {integrity: sha512-EIxG7k4wlWweuCLG9Y5InKFwpMEOyrMb6ZJ1ihYu02LVj/bzUwn2VMU+13PinsjRW75XnITeFrQBMH5+dLvCDA==} + '@vitest/expect@4.0.17': + resolution: {integrity: sha512-mEoqP3RqhKlbmUmntNDDCJeTDavDR+fVYkSOw8qRwJFaW/0/5zA9zFeTrHqNtcmwh6j26yMmwx2PqUDPzt5ZAQ==} - '@vitest/mocker@4.1.0': - resolution: {integrity: sha512-evxREh+Hork43+Y4IOhTo+h5lGmVRyjqI739Rz4RlUPqwrkFFDF6EMvOOYjTx4E8Tl6gyCLRL8Mu7Ry12a13Tw==} + '@vitest/mocker@4.0.17': + resolution: {integrity: sha512-+ZtQhLA3lDh1tI2wxe3yMsGzbp7uuJSWBM1iTIKCbppWTSBN09PUC+L+fyNlQApQoR+Ps8twt2pbSSXg2fQVEQ==} peerDependencies: msw: ^2.4.9 - vite: ^6.0.0 || ^7.0.0 || ^8.0.0-0 + vite: ^6.0.0 || ^7.0.0-0 peerDependenciesMeta: msw: optional: true @@ -2833,24 +2862,18 @@ packages: '@vitest/pretty-format@4.0.17': resolution: {integrity: sha512-Ah3VAYmjcEdHg6+MwFE17qyLqBHZ+ni2ScKCiW2XrlSBV4H3Z7vYfPfz7CWQ33gyu76oc0Ai36+kgLU3rfF4nw==} - '@vitest/pretty-format@4.1.0': - resolution: {integrity: sha512-3RZLZlh88Ib0J7NQTRATfc/3ZPOnSUn2uDBUoGNn5T36+bALixmzphN26OUD3LRXWkJu4H0s5vvUeqBiw+kS0A==} + '@vitest/runner@4.0.17': + resolution: {integrity: sha512-JmuQyf8aMWoo/LmNFppdpkfRVHJcsgzkbCA+/Bk7VfNH7RE6Ut2qxegeyx2j3ojtJtKIbIGy3h+KxGfYfk28YQ==} - '@vitest/runner@4.1.0': - resolution: {integrity: sha512-Duvx2OzQ7d6OjchL+trw+aSrb9idh7pnNfxrklo14p3zmNL4qPCDeIJAK+eBKYjkIwG96Bc6vYuxhqDXQOWpoQ==} + '@vitest/snapshot@4.0.17': + resolution: {integrity: sha512-npPelD7oyL+YQM2gbIYvlavlMVWUfNNGZPcu0aEUQXt7FXTuqhmgiYupPnAanhKvyP6Srs2pIbWo30K0RbDtRQ==} - '@vitest/snapshot@4.1.0': - resolution: {integrity: sha512-0Vy9euT1kgsnj1CHttwi9i9o+4rRLEaPRSOJ5gyv579GJkNpgJK+B4HSv/rAWixx2wdAFci1X4CEPjiu2bXIMg==} - 
- '@vitest/spy@4.1.0': - resolution: {integrity: sha512-pz77k+PgNpyMDv2FV6qmk5ZVau6c3R8HC8v342T2xlFxQKTrSeYw9waIJG8KgV9fFwAtTu4ceRzMivPTH6wSxw==} + '@vitest/spy@4.0.17': + resolution: {integrity: sha512-I1bQo8QaP6tZlTomQNWKJE6ym4SHf3oLS7ceNjozxxgzavRAgZDc06T7kD8gb9bXKEgcLNt00Z+kZO6KaJ62Ew==} '@vitest/utils@4.0.17': resolution: {integrity: sha512-RG6iy+IzQpa9SB8HAFHJ9Y+pTzI+h8553MrciN9eC6TFBErqrQaTas4vG+MVj8S4uKk8uTT2p0vgZPnTdxd96w==} - '@vitest/utils@4.1.0': - resolution: {integrity: sha512-XfPXT6a8TZY3dcGY8EdwsBulFCIw+BeeX0RZn2x/BtiY/75YGh8FeWGG8QISN/WhaqSrE2OrlDgtF8q5uhOTmw==} - abort-controller@3.0.0: resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} engines: {node: '>=6.5'} @@ -2958,10 +2981,6 @@ packages: resolution: {integrity: sha512-bMxMKAjg13EBSVscxTaYA4mRc5t1UAXa2kXiGTNfZ079HIWXEkKmkgFrh/nJqamaLSrXO5H4WFFkPEaLJWbs3A==} engines: {node: '>= 0.4'} - assertion-error@2.0.1: - resolution: {integrity: sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==} - engines: {node: '>=12'} - ast-types-flow@0.0.8: resolution: {integrity: sha512-OH/2E5Fg20h2aPrbe+QL8JZQFko0YZaF+j4mnQ7BGhfavO7OpSLa8a0y9sBwomHdSbkhTS8TQNayBfnW5DwbvQ==} @@ -3128,9 +3147,6 @@ packages: resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==} engines: {node: ^14.18.0 || >=16.10.0} - convert-source-map@2.0.0: - resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==} - cross-spawn@7.0.6: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} @@ -3286,8 +3302,8 @@ packages: resolution: {integrity: sha512-zoMwbCcH5hwUkKJkT8kDIBZSz9I6mVG//+lDCinLCGov4+r7NIy0ld8o03M0cJxl2spVf6ESYVS6/gpIfq1FFw==} engines: {node: '>= 0.4'} - es-module-lexer@2.1.0: - resolution: {integrity: 
sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ==} + es-module-lexer@1.7.0: + resolution: {integrity: sha512-jEQoCwk8hyb2AZziIOLhDqpm5+2ww5uIE6lkO/6jcOCusfk6LhMHpXXfBLXTZ7Ydyt0j4VoUQv6uGNYbdW+kBA==} es-object-atoms@1.1.1: resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==} @@ -4246,6 +4262,10 @@ packages: resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + node-gyp-build@4.8.4: + resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==} + hasBin: true + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -4747,9 +4767,6 @@ packages: std-env@3.10.0: resolution: {integrity: sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==} - std-env@4.1.0: - resolution: {integrity: sha512-Rq7ybcX2RuC55r9oaPVEW7/xu3tj8u4GeBYHBWCychFtzMIr86A7e3PPEBPT37sHStKX3+TiX/Fr/ACmJLVlLQ==} - string-argv@0.3.2: resolution: {integrity: sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==} engines: {node: '>=0.6.19'} @@ -4855,26 +4872,18 @@ packages: tinyexec@0.3.2: resolution: {integrity: sha512-KQQR9yN7R5+OSwaK0XQoj22pwHoTlgYqmUscPYoknOoWCWfj/5/ABTMRi69FrKU5ffPVh5QcFikpWJI/P1ocHA==} - tinyexec@1.2.4: - resolution: {integrity: sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg==} + tinyexec@1.0.2: + resolution: {integrity: sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==} engines: {node: '>=18'} tinyglobby@0.2.16: resolution: {integrity: 
sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==} engines: {node: '>=12.0.0'} - tinyglobby@0.2.17: - resolution: {integrity: sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==} - engines: {node: '>=12.0.0'} - tinyrainbow@3.0.3: resolution: {integrity: sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==} engines: {node: '>=14.0.0'} - tinyrainbow@3.1.0: - resolution: {integrity: sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==} - engines: {node: '>=14.0.0'} - to-fast-properties@2.0.0: resolution: {integrity: sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==} engines: {node: '>=4'} @@ -5051,21 +5060,20 @@ packages: yaml: optional: true - vitest@4.1.0: - resolution: {integrity: sha512-YbDrMF9jM2Lqc++2530UourxZHmkKLxrs4+mYhEwqWS97WJ7wOYEkcr+QfRgJ3PW9wz3odRijLZjHEaRLTNbqw==} + vitest@4.0.17: + resolution: {integrity: sha512-FQMeF0DJdWY0iOnbv466n/0BudNdKj1l5jYgl5JVTwjSsZSlqyXFt/9+1sEyhR6CLowbZpV7O1sCHrzBhucKKg==} engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} hasBin: true peerDependencies: '@edge-runtime/vm': '*' '@opentelemetry/api': ^1.9.0 '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0 - '@vitest/browser-playwright': 4.1.0 - '@vitest/browser-preview': 4.1.0 - '@vitest/browser-webdriverio': 4.1.0 - '@vitest/ui': 4.1.0 + '@vitest/browser-playwright': 4.0.17 + '@vitest/browser-preview': 4.0.17 + '@vitest/browser-webdriverio': 4.0.17 + '@vitest/ui': 4.0.17 happy-dom: '*' jsdom: '*' - vite: ^6.0.0 || ^7.0.0 || ^8.0.0-0 peerDependenciesMeta: '@edge-runtime/vm': optional: true @@ -5833,6 +5841,31 @@ snapshots: transitivePeerDependencies: - encoding + '@livekit/local-inference-darwin-arm64@0.2.5': + optional: true + + '@livekit/local-inference-darwin-x64@0.2.5': + optional: true + + '@livekit/local-inference-linux-arm64-gnu@0.2.5': + optional: true + + 
'@livekit/local-inference-linux-x64-gnu@0.2.5': + optional: true + + '@livekit/local-inference-win32-x64-msvc@0.2.5': + optional: true + + '@livekit/local-inference@0.2.5': + dependencies: + node-gyp-build: 4.8.4 + optionalDependencies: + '@livekit/local-inference-darwin-arm64': 0.2.5 + '@livekit/local-inference-darwin-x64': 0.2.5 + '@livekit/local-inference-linux-arm64-gnu': 0.2.5 + '@livekit/local-inference-linux-x64-gnu': 0.2.5 + '@livekit/local-inference-win32-x64-msvc': 0.2.5 + '@livekit/mutex@1.1.1': {} '@livekit/noise-cancellation-darwin-arm64@0.1.9': @@ -5861,7 +5894,7 @@ snapshots: '@livekit/noise-cancellation-win32-x64@0.1.9': optional: true - '@livekit/protocol@1.46.4': + '@livekit/protocol@1.46.6': dependencies: '@bufbuild/protobuf': 1.10.1 @@ -6461,17 +6494,14 @@ snapshots: '@types/argparse@1.0.38': {} - '@types/chai@5.2.3': + '@types/chai@5.2.2': dependencies: '@types/deep-eql': 4.0.2 - assertion-error: 2.0.1 '@types/deep-eql@4.0.2': {} '@types/estree@1.0.8': {} - '@types/estree@1.0.9': {} - '@types/fluent-ffmpeg@2.1.28': dependencies: '@types/node': 22.19.1 @@ -6593,7 +6623,7 @@ snapshots: '@ungap/structured-clone@1.2.0': {} - '@vitest/coverage-v8@4.0.17(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)))': + '@vitest/coverage-v8@4.0.17(vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0))': dependencies: '@bcoe/v8-coverage': 1.0.2 '@vitest/utils': 4.0.17 @@ -6605,28 +6635,28 @@ snapshots: obug: 2.1.1 std-env: 3.10.0 tinyrainbow: 3.0.3 - vitest: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) + vitest: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0) - '@vitest/expect@4.1.0': + '@vitest/expect@4.0.17': dependencies: '@standard-schema/spec': 1.1.0 - '@types/chai': 5.2.3 - '@vitest/spy': 4.1.0 - '@vitest/utils': 4.1.0 + '@types/chai': 5.2.2 + '@vitest/spy': 4.0.17 + '@vitest/utils': 4.0.17 chai: 6.2.2 - 
tinyrainbow: 3.1.0 + tinyrainbow: 3.0.3 - '@vitest/mocker@4.1.0(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))': + '@vitest/mocker@4.0.17(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))': dependencies: - '@vitest/spy': 4.1.0 + '@vitest/spy': 4.0.17 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: vite: 7.3.2(@types/node@22.19.1)(tsx@4.21.0) - '@vitest/mocker@4.1.0(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))': + '@vitest/mocker@4.0.17(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))': dependencies: - '@vitest/spy': 4.1.0 + '@vitest/spy': 4.0.17 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: @@ -6636,35 +6666,24 @@ snapshots: dependencies: tinyrainbow: 3.0.3 - '@vitest/pretty-format@4.1.0': + '@vitest/runner@4.0.17': dependencies: - tinyrainbow: 3.1.0 - - '@vitest/runner@4.1.0': - dependencies: - '@vitest/utils': 4.1.0 + '@vitest/utils': 4.0.17 pathe: 2.0.3 - '@vitest/snapshot@4.1.0': + '@vitest/snapshot@4.0.17': dependencies: - '@vitest/pretty-format': 4.1.0 - '@vitest/utils': 4.1.0 + '@vitest/pretty-format': 4.0.17 magic-string: 0.30.21 pathe: 2.0.3 - '@vitest/spy@4.1.0': {} + '@vitest/spy@4.0.17': {} '@vitest/utils@4.0.17': dependencies: '@vitest/pretty-format': 4.0.17 tinyrainbow: 3.0.3 - '@vitest/utils@4.1.0': - dependencies: - '@vitest/pretty-format': 4.1.0 - convert-source-map: 2.0.0 - tinyrainbow: 3.1.0 - abort-controller@3.0.0: dependencies: event-target-shim: 5.0.1 @@ -6794,8 +6813,6 @@ snapshots: is-array-buffer: 3.0.4 is-shared-array-buffer: 1.0.3 - assertion-error@2.0.1: {} - ast-types-flow@0.0.8: {} ast-v8-to-istanbul@0.3.10: @@ -6945,8 +6962,6 @@ snapshots: consola@3.4.2: {} - convert-source-map@2.0.0: {} - cross-spawn@7.0.6: dependencies: path-key: 3.1.1 @@ -7135,7 +7150,7 @@ snapshots: iterator.prototype: 1.1.2 safe-array-concat: 1.1.2 - es-module-lexer@2.1.0: {} + es-module-lexer@1.7.0: {} es-object-atoms@1.1.1: dependencies: @@ -7490,7 +7505,7 @@ snapshots: estree-walker@3.0.3: dependencies: - '@types/estree': 1.0.9 + 
'@types/estree': 1.0.8 esutils@2.0.3: {} @@ -8069,7 +8084,7 @@ snapshots: livekit-server-sdk@2.14.1: dependencies: '@bufbuild/protobuf': 1.10.1 - '@livekit/protocol': 1.46.4 + '@livekit/protocol': 1.46.6 camelcase-keys: 9.1.3 jose: 5.2.4 @@ -8204,6 +8219,8 @@ snapshots: fetch-blob: 3.2.0 formdata-polyfill: 4.0.10 + node-gyp-build@4.8.4: {} + object-assign@4.1.1: {} object-inspect@1.13.1: {} @@ -8808,8 +8825,6 @@ snapshots: std-env@3.10.0: {} - std-env@4.1.0: {} - string-argv@0.3.2: {} string-width@4.2.3: @@ -8931,22 +8946,15 @@ snapshots: tinyexec@0.3.2: {} - tinyexec@1.2.4: {} + tinyexec@1.0.2: {} tinyglobby@0.2.16: dependencies: fdir: 6.5.0(picomatch@4.0.4) picomatch: 4.0.4 - tinyglobby@0.2.17: - dependencies: - fdir: 6.5.0(picomatch@4.0.4) - picomatch: 4.0.4 - tinyrainbow@3.0.3: {} - tinyrainbow@3.1.0: {} - to-fast-properties@2.0.0: {} to-regex-range@5.0.1: @@ -9152,61 +9160,81 @@ snapshots: fsevents: 2.3.3 tsx: 4.21.0 - vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)): + vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0): dependencies: - '@vitest/expect': 4.1.0 - '@vitest/mocker': 4.1.0(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) - '@vitest/pretty-format': 4.1.0 - '@vitest/runner': 4.1.0 - '@vitest/snapshot': 4.1.0 - '@vitest/spy': 4.1.0 - '@vitest/utils': 4.1.0 - es-module-lexer: 2.1.0 + '@vitest/expect': 4.0.17 + '@vitest/mocker': 4.0.17(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)) + '@vitest/pretty-format': 4.0.17 + '@vitest/runner': 4.0.17 + '@vitest/snapshot': 4.0.17 + '@vitest/spy': 4.0.17 + '@vitest/utils': 4.0.17 + es-module-lexer: 1.7.0 expect-type: 1.3.0 magic-string: 0.30.21 obug: 2.1.1 pathe: 2.0.3 picomatch: 4.0.4 - std-env: 4.1.0 + std-env: 3.10.0 tinybench: 2.9.0 - tinyexec: 1.2.4 - tinyglobby: 0.2.17 - tinyrainbow: 3.1.0 + tinyexec: 1.0.2 + tinyglobby: 0.2.16 + tinyrainbow: 3.0.3 vite: 7.3.2(@types/node@22.19.1)(tsx@4.21.0) why-is-node-running: 2.3.0 
optionalDependencies: '@opentelemetry/api': 1.9.0 '@types/node': 22.19.1 transitivePeerDependencies: + - jiti + - less + - lightningcss - msw + - sass + - sass-embedded + - stylus + - sugarss + - terser + - tsx + - yaml - vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0)): + vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(tsx@4.21.0): dependencies: - '@vitest/expect': 4.1.0 - '@vitest/mocker': 4.1.0(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0)) - '@vitest/pretty-format': 4.1.0 - '@vitest/runner': 4.1.0 - '@vitest/snapshot': 4.1.0 - '@vitest/spy': 4.1.0 - '@vitest/utils': 4.1.0 - es-module-lexer: 2.1.0 + '@vitest/expect': 4.0.17 + '@vitest/mocker': 4.0.17(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0)) + '@vitest/pretty-format': 4.0.17 + '@vitest/runner': 4.0.17 + '@vitest/snapshot': 4.0.17 + '@vitest/spy': 4.0.17 + '@vitest/utils': 4.0.17 + es-module-lexer: 1.7.0 expect-type: 1.3.0 magic-string: 0.30.21 obug: 2.1.1 pathe: 2.0.3 picomatch: 4.0.4 - std-env: 4.1.0 + std-env: 3.10.0 tinybench: 2.9.0 - tinyexec: 1.2.4 - tinyglobby: 0.2.17 - tinyrainbow: 3.1.0 + tinyexec: 1.0.2 + tinyglobby: 0.2.16 + tinyrainbow: 3.0.3 vite: 7.3.2(@types/node@25.6.0)(tsx@4.21.0) why-is-node-running: 2.3.0 optionalDependencies: '@opentelemetry/api': 1.9.0 '@types/node': 25.6.0 transitivePeerDependencies: + - jiti + - less + - lightningcss - msw + - sass + - sass-embedded + - stylus + - sugarss + - terser + - tsx + - yaml vscode-oniguruma@1.7.0: {} diff --git a/turbo.json b/turbo.json index b0bc90527..0a25e3eea 100644 --- a/turbo.json +++ b/turbo.json @@ -42,6 +42,7 @@ "LIVEKIT_INFERENCE_URL", "LIVEKIT_OUTBOUND_TRUNK_ID", "LIVEKIT_URL", + "LIVEKIT_WORKER_TOKEN", "LLAMA_API_KEY", "LIVEKIT_AGENT_ID", "LIVEKIT_AGENT_NAME",
LiveKit Ecosystem