diff --git a/.changeset/audio-eou.md b/.changeset/audio-eou.md
new file mode 100644
index 000000000..3afe5b4fc
--- /dev/null
+++ b/.changeset/audio-eou.md
@@ -0,0 +1,17 @@
+---
+"@livekit/agents": patch
+"@livekit/agents-plugin-silero": patch
+"@livekit/agents-plugin-livekit": patch
+---
+
+feat(core): audio end-of-turn detection with cloud → local fallback (AGT-2520)
+
+- New `inference.TurnDetector`: WebSocket cloud EOT transport (`version: 'v1'`, model name `turn-detector-v1`) with automatic fallback to the local native model (`version: 'v1-mini'`, model name `turn-detector-v1-mini`) via `@livekit/local-inference`. Auto-selects `'v1'` when `LIVEKIT_REMOTE_EOT_URL` is set, `'v1-mini'` otherwise. The `version` is the constructor knob; telemetry/billing report the full model name via `detector.model`.
+- The local EOT model runs in the shared inference process (the same `InferenceProcExecutor` the text turn detector uses), loaded once per worker host (~138 MB) instead of in every job worker. The runner is registered by default when the native binding is available, so the inference process spawns on worker startup; on platforms where the binding can't load, local EOT degrades to a positive-default prediction and the worker still starts. (This is a JS-specific divergence from Python, which keeps EOT in-process and relies on forkserver COW sharing.)
+- No prewarm helpers: EOT auto-warms in the inference process; the in-process silero VAD lazy-loads on first stream. (The `inference.prewarm*` helpers added during development were removed before release.)
+- New `inference.VAD` (local-only streaming VAD via `@livekit/local-inference`).
+- `AgentSession` now auto-provisions a bundled silero VAD when `vad` is omitted (`isDefault=true`). Pass `vad: null` to opt out.
+- `@livekit/agents-plugin-silero` is deprecated; pass `vad: null` to opt out of the bundled default, or use `inference.VAD({ model: 'silero', ... })` to customize.
+- The `@livekit/agents-plugin-livekit` turn detector is deprecated in favor of `inference.TurnDetector`.
+- New `EOTInferenceMetrics` and `EOTModelUsage`; new telemetry span attributes (`lk.eou.source`, `lk.eou.from_cache`, `lk.eou.detection_delay`); new `eot_prediction` event forwarded over remote sessions.
+- Requires `@livekit/protocol` >= 1.46.5 (exposes the `AgentInference` message namespace used by the cloud transport, including the server-provided `SessionCreated` default thresholds).
diff --git a/MODEL_LICENSE b/MODEL_LICENSE
new file mode 100644
index 000000000..44bea4802
--- /dev/null
+++ b/MODEL_LICENSE
@@ -0,0 +1,113 @@
+LIVEKIT MODEL LICENSE AGREEMENT
+
+1. Introduction
+
+   LiveKit Incorporated ("LiveKit") is making available its proprietary models for
+   use pursuant to the terms and conditions of this Agreement. As further
+   described below, you may use these LiveKit models freely but can only use them
+   together with the LiveKit Agents framework. You cannot use the LiveKit models
+   on a standalone basis or with any other frameworks.
+
+   BY CLICKING "I ACCEPT," OR BY DOWNLOADING, INSTALLING, OR OTHERWISE ACCESSING
+   OR USING THE LIVEKIT MATERIALS, YOU AGREE THAT YOU HAVE READ AND UNDERSTOOD,
+   AND, AS A CONDITION TO YOUR USE OF THE LIVEKIT MATERIALS, YOU AGREE TO BE
+   BOUND BY, THE FOLLOWING TERMS AND CONDITIONS.
+
+2. Definitions
+
+   "Agreement" means this LiveKit Model License Agreement.
+
+   "Documentation" means the specifications, manuals, and documentation
+   accompanying any LiveKit Model and distributed by LiveKit.
+
+   "Licensee" or "you" means the individual or entity agreeing to be bound by
+   this Agreement.
+
+   "LiveKit Agents" means the proprietary LiveKit software framework for building
+   real-time multimodal AI applications with programmable backend participants.
+
+   "LiveKit Materials" means, collectively, the LiveKit Models and Documentation.
+
+   "LiveKit Model" means any of LiveKit's proprietary software models or
+   algorithms, including machine-learning software code, model weights,
+   inference-enabling software code, training-enabling software code, and
+   fine-tuning enabling software code. Any derivative works of a LiveKit Model,
+   whether developed by LiveKit, you, or any third party, will be deemed the
+   "LiveKit Model" for the purposes of this Agreement.
+
+3. License Rights
+
+   Right to Use LiveKit Materials. Subject to the terms and conditions of this
+   Agreement, including the requirements of Section 3.b, LiveKit grants you a
+   nonexclusive, nontransferable, worldwide, royalty-free license under LiveKit's
+   intellectual property rights to use, reproduce, distribute, copy, and create
+   derivative works of the LiveKit Materials.
+
+   Limitation on Use. As a condition to your use of the LiveKit Materials, you
+   agree: (i) not to use any LiveKit Models on a standalone basis or with any
+   frameworks other than LiveKit Agents; (ii) not to use any LiveKit Materials or
+   any output from, or results of using, LiveKit Models (including any derivative
+   works thereof) to improve or otherwise develop any other models that are not
+   LiveKit Models; or (iii) distribute or otherwise make available the LiveKit
+   Materials (including any derivative works thereof) except (x) pursuant to the
+   terms of this Agreement, and (y) you reproduce the above copyright notice.
+
+4. Intellectual Property
+
+   The LiveKit Materials are owned by LiveKit and its licensors. Except for the
+   rights granted to you under this Agreement, all rights are reserved and no
+   other express or implied rights are granted.
+
+   You will own any derivative works that you created from the LiveKit Materials,
+   subject to the terms of this Agreement.
+
+5. Disclaimer
+
+   UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING, LIVEKIT PROVIDES
+   THE LIVEKIT MATERIALS, AND ANY OUTPUT OR RESULTS THEREFROM, ON AN "AS IS"
+   BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+   INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE,
+   NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU
+   ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR
+   REDISTRIBUTING THE LIVEKIT MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR
+   USE OF THE LIVEKIT MATERIALS AND ANY OUTPUT AND RESULTS.
+
+6. Limitation of Liability
+
+   IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE),
+   CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS DELIBERATE
+   AND GROSSLY NEGLIGENT ACTS) OR AGREED TO IN WRITING, WILL LIVEKIT BE LIABLE TO
+   YOU FOR INDIRECT DAMAGES, INCLUDING ANY SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
+   DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS AGREEMENT OR OUT OF THE
+   USE OR INABILITY TO USE THE LIVEKIT MATERIALS OR ANY OUTPUT OR RESULTS
+   THEREFROM (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK
+   STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL
+   DAMAGES OR LOSSES), EVEN IF LIVEKIT HAS BEEN ADVISED OF THE POSSIBILITY OF
+   SUCH DAMAGES.
+
+7. Trademarks
+
+   This Agreement does not grant permission to use the trade names, trademarks,
+   service marks, or product names of LiveKit, except as required for reasonable
+   and customary use in describing the origin of the LiveKit Materials.
+
+8. Term and Termination
+
+   The term of this Agreement commences upon your acceptance of this Agreement
+   and continues in effect until you cease using the LiveKit Materials or it is
+   terminated by either party (on immediate written notice to the other party).
+   This Agreement will automatically terminate if you breach any of its terms.
+   Upon termination, you must immediately cease all use of the LiveKit Materials.
+   Sections 4, 5, 6, and 9 will survive termination.
+
+9. Governing Law and Venue
+
+   This Agreement is subject to the laws of the State of California, without
+   regard to its conflict of laws principles. The UN Convention on Contracts for
+   the International Sale of Goods does not apply to this Agreement. The courts
+   located in San Francisco, California, have exclusive jurisdiction for any
+   dispute arising out of this Agreement.
+
++ + + +
+
+Last Updated: November 25, 2024
diff --git a/README.md b/README.md
index 8b7de522f..04c044a81 100644
--- a/README.md
+++ b/README.md
@@ -368,6 +368,8 @@ To connect and talk to your agent:
 This project is licensed under `Apache-2.0`, and is [REUSE-3.2](https://reuse.software) compliant.
 Refer to [the license](LICENSES/Apache-2.0.txt) for details.
 
+The LiveKit turn detection models are licensed under the [LiveKit Model License](MODEL_LICENSE).
+
 <!--BEGIN_REPO_NAV-->
 <br/><table>
 <thead><tr><th colspan="2">LiveKit Ecosystem</th></tr></thead>
diff --git a/REUSE.toml b/REUSE.toml
index d2a802cee..1ed6c844d 100644
--- a/REUSE.toml
+++ b/REUSE.toml
@@ -7,6 +7,12 @@ SPDX-PackageName = "agents-js"
 SPDX-PackageSupplier = "LiveKit, Inc. <https://livekit.io>"
 SPDX-PackageDownloadLocation = "https://github.com/livekit/agents-js"
 
+# model license
+[[annotations]]
+path = ["MODEL_LICENSE"]
+SPDX-FileCopyrightText = "2024 LiveKit, Inc."
+SPDX-License-Identifier = "Apache-2.0"
+
 # trivial files
 [[annotations]]
 path = [".gitignore", "flake.lock", ".envrc", "packages/livekit-rtc/.gitignore", ".changeset/**", "**/CHANGELOG.md", "NOTICE", ".github/**"]
diff --git a/agents/package.json b/agents/package.json
index 1ada14451..e39309524 100644
--- a/agents/package.json
+++ b/agents/package.json
@@ -52,8 +52,9 @@
   "dependencies": {
     "@bufbuild/protobuf": "^1.10.0",
     "@ffmpeg-installer/ffmpeg": "^1.1.0",
+    "@livekit/local-inference": "^0.2.5",
     "@livekit/mutex": "^1.1.1",
-    "@livekit/protocol": "^1.46.4",
+    "@livekit/protocol": "^1.46.5",
     "@livekit/throws-transformer": "0.1.8",
     "@livekit/typed-emitter": "^3.0.0",
     "@opentelemetry/api": "^1.9.0",
diff --git a/agents/src/inference/_warmup.ts b/agents/src/inference/_warmup.ts
new file mode 100644
index 000000000..0c77e6814
--- /dev/null
+++ b/agents/src/inference/_warmup.ts
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Loader for the bundled `@livekit/local-inference` native binding.
+ *
+ * Memory model (measured ~138 MB for the EOT model, ~2 MB for VAD): Node has
+ * no forkserver/COW, so anything loaded in a job worker is private to that
+ * worker. To avoid paying ~138 MB per worker, the EOT model is NOT loaded in
+ * job workers — it runs in the shared `InferenceProcExecutor` (see
+ * `inference/eot/runner.ts`), loaded once per host. The VAD stays in-process
+ * (it's small and runs continuously) and is reached via this loader.
+ *
+ * There are intentionally no public `prewarm*` helpers: EOT auto-warms via
+ * the inference runner's `initialize()` at proc startup, and the VAD lazy-
+ * loads on first stream.
+ */
+import { createRequire } from 'node:module';
+import { log } from '../log.js';
+
+const cjsRequire = createRequire(import.meta.url);
+
+let nativeMod: typeof import('@livekit/local-inference') | undefined;
+let triedLoad = false;
+
+function getNative(): typeof import('@livekit/local-inference') | undefined {
+  if (triedLoad) return nativeMod;
+  triedLoad = true; // memoize the attempt so a failed load is never retried
+  try {
+    nativeMod = cjsRequire('@livekit/local-inference') as typeof import('@livekit/local-inference');
+  } catch (cause) {
+    const err = cause instanceof Error ? cause.message : String(cause);
+    log().warn(
+      { err },
+      '@livekit/local-inference native binding not loadable; local VAD/EOT paths disabled',
+    );
+  }
+  return nativeMod;
+}
+
+/** @internal Memoized native-module accessor; `undefined` if the binding cannot load. */
+export function _getLocalInferenceModule(): typeof import('@livekit/local-inference') | undefined {
+  return getNative();
+}
diff --git a/agents/src/inference/eot/base.test.ts b/agents/src/inference/eot/base.test.ts
new file mode 100644
index 000000000..e3c00445f
--- /dev/null
+++ b/agents/src/inference/eot/base.test.ts
@@ -0,0 +1,269 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Inference-request lifecycle tests for `BaseStreamingTurnDetectorStream`.
+ *
+ * The stream is a thin transport-facing surface: per-request state is one
+ * `(requestId, requestFut)` pair. `predict` starts a request and returns its
+ * future, superseding any previous request; the transport's single prediction
+ * completes the request by resolving the future; `cancelInference`/`flush`
+ * close a pending request, resolving its future with a default event so
+ * waiters never hang. All policy (when to start a request, await timeout, turn
+ * commits) lives in `AudioRecognition` and is covered by
+ * `voice/audio_recognition_turn_detection.test.ts`.
+ *
+ * Port of Python `tests/test_turn_detection_fsm.py`.
+ */
+import type { AudioFrame } from '@livekit/rtc-node';
+import { describe, expect, it } from 'vitest';
+import type { Future } from '../../utils.js';
+import {
+  BaseStreamingTurnDetector,
+  type BaseStreamingTurnDetectorOptions,
+  BaseStreamingTurnDetectorStream,
+  type FlushSentinel,
+  type StreamingTurnDetectionTransport,
+  type TurnDetectionEvent,
+} from './base.js';
+import { ThresholdOptions, type TurnDetectorModel } from './languages.js';
+
+/** Transport double: records `runInference` calls; every other hook is inert. */
+class FakeTransport implements StreamingTurnDetectionTransport {
+  events: Array<[string, string]> = [];
+  private _bound: BaseStreamingTurnDetectorStream | undefined;
+
+  attach(stream: BaseStreamingTurnDetectorStream): void {
+    this._bound = stream;
+  }
+  async run(): Promise<void> {
+    const bound = this._bound;
+    if (bound === undefined) throw new Error('stream not bound');
+    await bound._drainAudioChannel();
+  }
+  runInference(requestId: string): void {
+    this.events.push(['run_inference', requestId]);
+  }
+  async pushFrame(_frame: AudioFrame): Promise<void> {
+    // intentionally empty: frames are discarded
+  }
+  async flush(_sentinel: FlushSentinel): Promise<void> {
+    // intentionally empty: nothing is buffered
+  }
+  detach(): void {
+    // intentionally empty: nothing to release
+  }
+}
+
+class FakeDetector extends BaseStreamingTurnDetector {
+  stream(): BaseStreamingTurnDetectorStream {
+    throw new Error('unused in request-lifecycle tests');
+  }
+  // Report the local mini model (Python's `_make_stream` default) so the
+  // timed-out-cancel test runs against a non-cloud model and skips the fallback.
+  get model(): TurnDetectorModel {
+    return 'turn-detector-v1-mini';
+  }
+}
+
+class FakeBackend extends BaseStreamingTurnDetectorStream {
+  fakeTransport: FakeTransport;
+
+  constructor(opts: BaseStreamingTurnDetectorOptions) {
+    const transport = new FakeTransport();
+    const detector = new FakeDetector(opts);
+    super({ detector, opts, transport });
+    this.fakeTransport = transport;
+  }
+
+  // Read-only views over the protected request state, exposed for assertions.
+  get requestId(): string | undefined {
+    return this._requestId;
+  }
+  get requestFut(): Future<TurnDetectionEvent> | undefined {
+    return this._requestFut;
+  }
+  get events(): Array<[string, string]> {
+    return this.fakeTransport.events;
+  }
+
+  /** Play the transport's role: deliver a prediction for `requestId` to the stream. */
+  simulatePrediction(requestId: string, probability: number): void {
+    this._resolvePrediction(requestId, probability);
+  }
+}
+
+/** Stream options (sampleRate 16000) carrying optional threshold overrides. */
+function makeOpts(thresholds: Record<string, number> = {}): BaseStreamingTurnDetectorOptions {
+  // The overrides ride on a local-model `ThresholdOptions` so `lookup` returns
+  // them (unmapped languages fall back to the shipped local table).
+  const resolved = new ThresholdOptions('turn-detector-v1-mini', thresholds);
+  return { sampleRate: 16000, thresholds: resolved };
+}
+
+function makeStream(thresholds: Record<string, number> = {}): FakeBackend {
+  const opts = makeOpts(thresholds);
+  return new FakeBackend(opts);
+}
+
+const countRunInference = (events: Array<[string, string]>) =>
+  events.reduce((total, [kind]) => (kind === 'run_inference' ? total + 1 : total), 0);
+
+describe('AudioTurnDetectionRequests', () => {
+  it('predict starts inference', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      expect(s.requestId).toBeDefined();
+      expect(fut.done).toBe(false);
+      expect(s.events).toEqual([['run_inference', s.requestId!]]);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('predict supersedes previous request', async () => {
+    const s = makeStream();
+    try {
+      const oldFut = s.predict();
+      const oldId = s.requestId;
+      const newFut = s.predict();
+
+      expect(newFut).not.toBe(oldFut);
+      expect(s.requestId).not.toBe(oldId);
+      expect(oldFut.done).toBe(true);
+      expect((await oldFut.await).endOfTurnProbability).toBe(0.0);
+      expect(countRunInference(s.events)).toBe(2);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('cancelInference closes the request', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      s.cancelInference();
+
+      expect(s.requestId).toBeUndefined();
+      expect(fut.done).toBe(true);
+      expect((await fut.await).endOfTurnProbability).toBe(0.0);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('cancelInference when idle is a no-op', async () => {
+    const s = makeStream();
+    try {
+      s.cancelInference();
+      expect(s.events).toEqual([]);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('late prediction after cancelInference is dropped', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      const cancelledId = s.requestId!;
+      expect(cancelledId).toBeDefined();
+
+      s.cancelInference();
+      s.simulatePrediction(cancelledId, 0.9);
+      // The future keeps cancelInference's default (0.0); the late 0.9 is dropped.
+      expect((await fut.await).endOfTurnProbability).toBe(0.0);
+
+      const nextFut = s.predict();
+      expect(nextFut).not.toBe(fut);
+      expect(nextFut.done).toBe(false);
+      expect(countRunInference(s.events)).toBe(2);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('prediction completes the request', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      const requestId = s.requestId!;
+      expect(requestId).toBeDefined();
+
+      s.simulatePrediction(requestId, 0.3);
+      expect(fut.done).toBe(true);
+      expect((await fut.await).endOfTurnProbability).toBe(0.3);
+      expect(s.requestId).toBeUndefined();
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('flush closes the request', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      s.flush('turn committed');
+      expect(s.requestId).toBeUndefined();
+      expect((await fut.await).endOfTurnProbability).toBe(0.0);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('flush does not overwrite a resolved prediction', async () => {
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      const requestId = s.requestId!;
+      expect(requestId).toBeDefined();
+      s.simulatePrediction(requestId, 0.7);
+
+      s.flush('turn committed');
+      expect((await fut.await).endOfTurnProbability).toBe(0.7);
+      expect(s.requestId).toBeUndefined();
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('predict after endInput returns a resolved default', async () => {
+    const s = makeStream();
+    try {
+      s.endInput();
+      // `endInput` does not await the channel close; give it a moment to land.
+      await new Promise((resolve) => setTimeout(resolve, 20));
+      const fut = s.predict();
+      expect(fut.done).toBe(true);
+      expect((await fut.await).endOfTurnProbability).toBe(1.0);
+      expect(s.events.some((e) => e[0] === 'run_inference')).toBe(false);
+    } finally {
+      await s.aclose();
+    }
+  });
+
+  it('aclose resolves a pending future', async () => {
+    const s = makeStream();
+    const fut = s.predict();
+    await s.aclose();
+    expect(fut.done).toBe(true);
+    expect((await fut.await).endOfTurnProbability).toBe(0.0);
+  });
+
+  it('timed-out cancelInference does not fall back for the local model', async () => {
+    // `timedOut: true` only triggers the cloud→local fallback when the model is
+    // the cloud one; on this mini-model stream it merely closes the request.
+    // The cloud case is covered in detector.test.ts.
+    const s = makeStream();
+    try {
+      const fut = s.predict();
+      s.cancelInference({ timedOut: true });
+      expect((await fut.await).endOfTurnProbability).toBe(0.0);
+      expect(s.model).toBe('turn-detector-v1-mini');
+    } finally {
+      await s.aclose();
+    }
+  });
+});
diff --git a/agents/src/inference/eot/base.ts b/agents/src/inference/eot/base.ts
new file mode 100644
index 000000000..9ff17b983
--- /dev/null
+++ b/agents/src/inference/eot/base.ts
@@ -0,0 +1,543 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Audio EOT (end-of-turn) detector base, the per-window inference stream, and
+ * the transport interface that concrete cloud/local backends implement.
+ *
+ * Concrete implementations live in `agents/src/inference/eot/`.
+ *
+ * Port of Python `livekit.agents.voice.turn.audio`.
+ */
+import type { AudioFrame } from '@livekit/rtc-node';
+import { AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';
+import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
+import { EventEmitter } from 'node:events';
+import type { LanguageCode } from '../../language.js';
+import { log } from '../../log.js';
+import type { EOTInferenceMetrics } from '../../metrics/base.js';
+import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js';
+import { Future, Task, cancelAndWait, shortuuid } from '../../utils.js';
+import type { ThresholdOptions, TurnDetectorModel } from './languages.js';
+
+export const DEFAULT_SAMPLE_RATE = 16000;
+export const MIN_SILENCE_DURATION_MS = 200;
+
+/**
+ * Options shared by the audio EOT stream and every transport.
+ *
+ * Cloud-only transport concerns (base URL, credentials, conn options)
+ * live on a separate options class owned by the cloud transport.
+ */
+export interface BaseStreamingTurnDetectorOptions {
+  sampleRate: number; // rate that pushed audio is resampled to before it is queued
+  thresholds: ThresholdOptions; // resolver behind `unlikelyThreshold()` / `supportsLanguage()`
+}
+
+/**
+ * Event emitted on each EOT prediction.
+ */
+export interface TurnDetectionEvent {
+  type: 'eot_prediction';
+  endOfTurnProbability: number;
+  /** Wall-clock time when the prediction landed (milliseconds since epoch). */
+  lastSpeakingTimeMs: number;
+  /** Latest input-audio creation time → prediction receive time (ms). */
+  detectionDelay?: number;
+  /** Server-side model inference time (ms). */
+  inferenceDuration?: number;
+}
+
+/**
+ * Sentinel value carried alongside flush requests. Signals a turn boundary
+ * to the transport so it can clear its buffered audio.
+ */
+export interface FlushSentinel {
+  readonly kind: 'flush';
+  reason?: string;
+}
+
+export function isFlushSentinel(value: unknown): value is FlushSentinel {
+  return typeof value === 'object' && (value as FlushSentinel | null)?.kind === 'flush';
+}
+
+/**
+ * Transport adapter for `BaseStreamingTurnDetectorStream` — owns the I/O (WebSocket
+ * session, in-process predict, etc.). The stream calls these methods
+ * directly; transports report predictions back via
+ * `stream._resolvePrediction(requestId, probability, ...)`.
+ */
+export interface StreamingTurnDetectionTransport {
+  attach(stream: BaseStreamingTurnDetectorStream): void;
+  run(): Promise<void>;
+  runInference(requestId: string): void;
+  pushFrame(frame: AudioFrame): Promise<void>;
+  flush(sentinel: FlushSentinel): Promise<void>;
+  detach(): void;
+}
+
+export type BaseStreamingTurnDetectorCallbacks = {
+  metrics_collected: (metrics: EOTInferenceMetrics) => void;
+};
+
+/**
+ * Abstract base for audio EOT detectors. Holds the shared options (sample
+ * rate, threshold table) and declares `stream()` for creating inference streams.
+ *
+ * Subclasses (`TurnDetector` in `inference/eot/detector.ts`) wire up
+ * concrete transports.
+ */
+export abstract class BaseStreamingTurnDetector extends (EventEmitter as new () => TypedEmitter<BaseStreamingTurnDetectorCallbacks>) {
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  /**
+   * Active streams the detector tracks for bulk teardown via `aclose()`.
+   * `Set` rather than `WeakSet` because we need iteration; each stream
+   * removes itself on its own `aclose` (see `BaseStreamingTurnDetectorStream.aclose`)
+   * so the strong refs are released without requiring the caller to call
+   * `detector.aclose()`.
+   */
+  protected _streams: Set<BaseStreamingTurnDetectorStream> = new Set();
+
+  constructor(opts: BaseStreamingTurnDetectorOptions) {
+    super();
+    this._opts = opts;
+  }
+
+  /** @internal Stream lifecycle hook — called by the stream itself on close. */
+  _unregisterStream(stream: BaseStreamingTurnDetectorStream): void {
+    this._streams.delete(stream);
+  }
+
+  abstract get model(): TurnDetectorModel;
+
+  get provider(): string {
+    return 'livekit';
+  }
+
+  /** Most-recent materialized threshold map (after any cloud→local fallback
+   * rescale or server-default adoption). */
+  get thresholds(): Readonly<Record<string, number>> {
+    return this._opts.thresholds.thresholds;
+  }
+
+  /** Threshold below which the detector treats the prediction as "unlikely
+   * to be end-of-turn". Returns `undefined` when the language isn't covered. */
+  async unlikelyThreshold(language: LanguageCode | undefined): Promise<number | undefined> {
+    return this._opts.thresholds.lookup(language);
+  }
+
+  async supportsLanguage(language: LanguageCode | undefined): Promise<boolean> {
+    return this._opts.thresholds.supports(language);
+  }
+
+  abstract stream(): BaseStreamingTurnDetectorStream;
+
+  async aclose(): Promise<void> {
+    const streams = Array.from(this._streams); // snapshot before clearing the live set
+    this._streams.clear();
+    await Promise.allSettled(streams.map((s) => s.aclose())); // settles even if a close rejects
+  }
+}
+
+/** Presumably the sentinel rejection surfaced when `_swapController` aborts — TODO confirm. */
+export class SwapAbortError extends Error {
+  constructor() {
+    super('__swap__');
+    this.name = 'SwapAbortError';
+  }
+}
+
+/**
+ * Per-window inference stream. A thin transport-facing surface: per-request
+ * state is one `(requestId, requestFut)` pair.
+ *
+ * - `predict()` starts a request and returns its future, superseding any
+ *   previous request.
+ * - the transport's single prediction completes the request by resolving the
+ *   future via `_resolvePrediction`.
+ * - `cancelInference()` / `flush(reason)` close a pending request, resolving
+ *   its future with a default event so waiters never hang.
+ *
+ * All policy (when to start a request, await timeout, turn commits) lives in `AudioRecognition`.
+ */
+export class BaseStreamingTurnDetectorStream {
+  protected _detector: BaseStreamingTurnDetector;
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  protected _transport: StreamingTurnDetectionTransport;
+
+  private _audioInputSampleRate: number | undefined;
+  private _audioInputNumChannels: number | undefined;
+  private _audioResampler: AudioResampler | undefined;
+  private _audioChannel: StreamChannel<AudioFrame | FlushSentinel> = createStreamChannel();
+
+  /** Id of the in-flight inference request, or `undefined` when idle. */
+  protected _requestId: string | undefined;
+  /** Future for the in-flight request; resolves to the prediction event (or
+   * a default event when the request is cancelled / flushed). */
+  protected _requestFut: Future<TurnDetectionEvent> | undefined;
+
+  protected _mainTask: Task<void>;
+  protected _logger = log();
+  /**
+   * Aborted whenever the main loop needs to retry on a new transport (e.g.
+   * fallback). The base FSM also aborts it from `aclose()` so idle
+   * transports that are awaiting forever can be unstuck. Listeners check
+   * `signal.aborted` and surface a sentinel rejection so the `_run` loop
+   * can decide whether to continue or exit.
+   */
+  protected _swapController = new AbortController();
+
+  constructor(args: {
+    detector: BaseStreamingTurnDetector;
+    opts: BaseStreamingTurnDetectorOptions;
+    transport: StreamingTurnDetectionTransport;
+  }) {
+    const { detector, opts, transport } = args;
+    this._detector = detector;
+    this._opts = opts;
+    this._transport = transport;
+    transport.attach(this);
+    this._mainTask = Task.from((controller) => this._mainTaskBody(controller));
+  }
+
+  // region: _TurnDetector protocol proxies
+
+  get model(): TurnDetectorModel {
+    return this._detector.model;
+  }
+
+  get provider(): string {
+    return this._detector.provider;
+  }
+
+  /** @internal Shared threshold resolver — the cloud transport reads it to
+   * adopt the server-sent defaults from `SessionCreated`. */
+  get thresholdsOptions(): ThresholdOptions {
+    return this._opts.thresholds;
+  }
+
+  async unlikelyThreshold(language: LanguageCode | undefined): Promise<number | undefined> {
+    return this._opts.thresholds.lookup(language);
+  }
+
+  async supportsLanguage(language: LanguageCode | undefined): Promise<boolean> {
+    return this._opts.thresholds.supports(language);
+  }
+
+  // endregion
+
+  // region: inference requests
+
+  /** Open a fresh inference request (replacing any in-flight one) and return its
+   * future; on a closed audio channel the future is already resolved to 1.0. */
+  predict(): Future<TurnDetectionEvent> {
+    const pending = new Future<TurnDetectionEvent>();
+    if (this._audioChannel.closed) {
+      pending.resolve(BaseStreamingTurnDetectorStream._defaultEvent(1.0));
+      return pending;
+    }
+
+    // Close out whatever request is still pending before opening the new one.
+    this.cancelInference();
+    const requestId = shortuuid('turn_request_');
+    this._requestId = requestId;
+    this._requestFut = pending;
+    // Return the local `pending`, never `this._requestFut`: a transport that
+    // answers synchronously (e.g. the local no-executor path defaulting to 1.0
+    // inline) clears that field through `_resolvePrediction` before we return.
+    this._transport.runInference(requestId);
+    return pending;
+  }
+
+  /** Close the in-flight request (if any), resolving its future with a 0.0
+   * default; with `timedOut` set, also invoke the `_onPredictTimeout()` hook. */
+  cancelInference(opts: { timedOut?: boolean } = {}): void {
+    const hadRequest = this._requestId !== undefined;
+    const pending = this._requestFut;
+    if (hadRequest) {
+      this._requestId = undefined;
+      this._requestFut = undefined;
+      // Settle the waiter with the default rather than leaving it hanging.
+      if (pending !== undefined && !pending.done) {
+        pending.resolve(BaseStreamingTurnDetectorStream._defaultEvent(0.0));
+      }
+    }
+
+    // trigger fallback immediately (the subclass timeout hook checks the
+    // model + signals the transport swap; the base hook is a no-op).
+    if (opts.timedOut) this._onPredictTimeout();
+  }
+
+  /** Mark a turn boundary: drain the resampler, queue a flush sentinel, then
+   * close any pending request. Does nothing once the audio channel is closed. */
+  flush(reason?: string): void {
+    if (this._audioChannel.closed) return;
+
+    // Repeat calls are harmless: each one sends another sentinel, which
+    // transports treat as a no-op (cloud: redundant session_flush; local: empty trim).
+    const channel = this._audioChannel;
+    this._flushAudioResampler().forEach((tail) => {
+      void channel.write(tail);
+    });
+
+    const sentinel: FlushSentinel = { kind: 'flush', reason };
+    void channel.write(sentinel);
+    this.cancelInference();
+  }
+
+  /** Build the stand-in event used when a request ends without a transport
+   * prediction (cancelled, flushed, or `predict()` on a closed channel). */
+  protected static _defaultEvent(probability: number): TurnDetectionEvent {
+    // Stamped with the current wall-clock time.
+    const lastSpeakingTimeMs = Date.now();
+    return { type: 'eot_prediction', endOfTurnProbability: probability, lastSpeakingTimeMs };
+  }
+
+  // endregion
+
+  // region: audio ingress
+
+  /** Resample `frame` as needed and write the result to the audio channel. */
+  pushAudio(frame: AudioFrame): void {
+    // Audio pushed after the channel has closed is silently dropped.
+    if (this._audioChannel.closed) return;
+    this._resampleAudioFrame(frame).forEach((converted) => {
+      void this._audioChannel.write(converted);
+    });
+  }
+
+  endInput(): void {
+    this.flush(); // must precede close(): flush() returns early on a closed channel
+    void this._audioChannel.close(); // not awaited, so `closed` may flip slightly later
+  }
+
+  private _resampleAudioFrame(frame: AudioFrame): AudioFrame[] {
+    const { sampleRate, channels } = frame;
+    const formatUnknown =
+      this._audioInputSampleRate === undefined || this._audioInputNumChannels === undefined;
+    if (formatUnknown) {
+      // First frame (or first after a flush): latch format, maybe build resampler.
+      this._audioInputSampleRate = sampleRate;
+      this._audioInputNumChannels = channels;
+      if (sampleRate !== this._opts.sampleRate) {
+        this._audioResampler = new AudioResampler(
+          sampleRate,
+          this._opts.sampleRate,
+          channels,
+          AudioResamplerQuality.QUICK,
+        );
+      }
+    } else if (
+      sampleRate !== this._audioInputSampleRate ||
+      channels !== this._audioInputNumChannels
+    ) {
+      // Mid-stream format change: report it and drop the frame.
+      const context = {
+        sampleRate,
+        expectedSampleRate: this._audioInputSampleRate,
+        numChannels: channels,
+        expectedNumChannels: this._audioInputNumChannels,
+      };
+      this._logger.error(context, 'a frame with different audio format was already pushed');
+      return [];
+    }
+    return this._audioResampler === undefined ? [frame] : this._audioResampler.push(frame);
+  }
+
+  private _flushAudioResampler(): AudioFrame[] {
+    const frames = this._audioResampler?.flush() ?? []; // no resampler → empty list
+    this._resetAudioResampler(); // forget the input format; the next frame re-latches it
+    return frames;
+  }
+
+  private _resetAudioResampler(): void {
+    this._audioResampler = undefined; // drops the instance; it is not flushed here
+    this._audioInputSampleRate = undefined; // cleared with the channel count below so
+    this._audioInputNumChannels = undefined; // `_resampleAudioFrame` re-detects the format
+  }
+
+  // endregion
+
+  // region: results
+
+  /**
+   * Deliver a transport's prediction to the in-flight request. Ignored when
+   * the stream is closing or `requestId` is not the current request;
+   * otherwise the request slot is cleared and its future resolved.
+   */
+  _resolvePrediction(
+    requestId: string,
+    probability: number,
+    opts: { inferenceDuration?: number; detectionDelay?: number } = {},
+  ): void {
+    // Late results (after `aclose` began) and stale ids are both discarded.
+    const stale = requestId !== this._requestId;
+    if (this._closing || stale) return;
+
+    const { detectionDelay, inferenceDuration } = opts;
+    const pending = this._requestFut;
+    this._requestId = undefined;
+    this._requestFut = undefined;
+    // The slot is cleared even if the future was already settled elsewhere.
+    if (pending === undefined || pending.done) return;
+
+    // Stamp the event at resolution time.
+    const event: TurnDetectionEvent = {
+      type: 'eot_prediction',
+      endOfTurnProbability: probability,
+      lastSpeakingTimeMs: Date.now(),
+      detectionDelay,
+      inferenceDuration,
+    };
+    pending.resolve(event);
+  }
+
+  // endregion
+
+  // region: teardown
+
+  /**
+   * Synchronously release this stream's registration on its owning detector,
+   * so a replacement stream can be created before this one's async teardown
+   * finishes. Base is a no-op; detectors that enforce single-stream ownership
+   * override it. Idempotent.
+   */
+  detach(): void {
+    return;
+  }
+
+  async aclose(): Promise<void> {
+    this.endInput(); // the flush inside closes the in-flight request
+    this._closing = true;
+    this._swapController.abort();
+    // Clean up via finally(): a rejecting main task must not leak this stream.
+    await cancelAndWait([this._mainTask]).finally(() => {
+      this.cancelInference(); // defensive, normally a no-op
+      this._detector._unregisterStream(this); // drop the detector's strong ref
+    });
+  }
+
+  /** True once `aclose()` has been called. The `_run` loop uses this to
+   * distinguish swap-aborts (continue with new transport) from teardown
+   * aborts (exit). */
+  protected _closing = false;
+
+  // endregion
+
+  // region: main task scaffolding
+
+  private async _mainTaskBody(_controller: AbortController): Promise<void> {
+    await this._run(); // the task's `_controller` is not consulted here
+  }
+
+  /**
+   * Drain the shared audio channel into the current transport.
+   *
+   * The audio channel exposes a single `ReadableStream` (one underlying
+   * `transform.readable`), so only one reader may hold its lock at a time.
+   * When `signal` aborts (a transport being swapped out — e.g. cloud→local
+   * fallback — fires it via `detach()`), we release the reader lock right
+   * away: on a pending `read()` this rejects that read and frees the lock so
+   * the swapped-in transport's `_drainAudioChannel` can re-acquire it.
+   * Without this an orphaned drain would hold the lock forever and the next
+   * `getReader()` would throw "ReadableStream is locked".
+   */
+  async _drainAudioChannel(signal?: AbortSignal): Promise<void> {
+    const stream = this._audioChannel.stream();
+    const reader = stream.getReader();
+    const release = () => {
+      try {
+        reader.releaseLock();
+      } catch {
+        // best-effort: any error from releasing (e.g. already released) is ignored
+      }
+    };
+    if (signal?.aborted) {
+      release(); // aborted before we started: give the lock straight back
+      return;
+    }
+    signal?.addEventListener('abort', release, { once: true });
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) return; // channel closed and fully drained
+        if (isFlushSentinel(value)) {
+          await this._transport.flush(value); // `this._transport` is re-read per item
+        } else {
+          await this._transport.pushFrame(value);
+        }
+      }
+    } catch (err) {
+      // The pending `read()` rejects when `release()` runs on abort — a clean
+      // swap-driven exit, not a drain failure.
+      if (signal?.aborted) return;
+      throw err;
+    } finally {
+      signal?.removeEventListener('abort', release);
+      release(); // every exit path frees the lock for the next drain
+    }
+  }
+
+  // endregion
+
+  // region: subclass hooks
+
+  /** Default: hand control to the transport. Subclasses override for
+   * cross-transport orchestration (e.g. cloud→local fallback). */
+  protected async _run(): Promise<void> {
+    await this._raceWithSwap(this._transport.run()); // SwapAbortError if aborted first
+  }
+
+  /**
+   * Race `inner` against `_swapController.signal`. If the signal aborts (or
+   * already has) before `inner` settles, reject with `SwapAbortError` so the
+   * subclass loop can continue or exit; `inner` itself is not cancelled. An
+   * aborted controller is replaced on exit so later races get a fresh signal.
+   *
+   * `aclose()` aborts during teardown — subclasses observe `_closing` to
+   * exit cleanly instead of looping.
+   */
+  protected async _raceWithSwap<T>(inner: Promise<T>): Promise<T> {
+    const signal = this._swapController.signal;
+    let onAbort: (() => void) | undefined;
+    const abortPromise = new Promise<never>((_, reject) => {
+      if (signal.aborted) {
+        reject(new SwapAbortError()); // no listener is registered in this case
+        return;
+      }
+      onAbort = () => reject(new SwapAbortError());
+      signal.addEventListener('abort', onAbort, { once: true });
+    });
+    // If `inner` wins the race, the abort listener would otherwise stay
+    // registered and reject this now-orphaned promise when `aclose()` later
+    // aborts the controller — surfacing as an unhandledRejection. Swallow it
+    // (the race result is already settled) and remove the listener below.
+    abortPromise.catch(() => {});
+    try {
+      return await Promise.race([inner, abortPromise]);
+    } finally {
+      if (onAbort !== undefined) {
+        signal.removeEventListener('abort', onAbort);
+      }
+      if (signal.aborted) {
+        // Reset for the next iteration of the subclass loop.
+        this._swapController = new AbortController(); // also runs on teardown aborts
+      }
+    }
+  }
+
+  /** @internal Wake up an idle transport so the main loop can pick up a
+   * new one after fallback. Subclasses call this from their swap logic. */
+  protected _signalSwap(): void {
+    this._swapController.abort(); // a pending `_raceWithSwap` rejects with SwapAbortError
+  }
+
+  /** `predictEndOfTurn` timed out. Subclasses may override to react (e.g.
+   * promote local on cloud timeout). */
+  protected _onPredictTimeout(): void {
+    return; // base hook is a no-op; invoked by `cancelInference({ timedOut: true })`
+  }
+
+  // endregion
+}
diff --git a/agents/src/inference/eot/detector.test.ts b/agents/src/inference/eot/detector.test.ts
new file mode 100644
index 000000000..17b7bc09f
--- /dev/null
+++ b/agents/src/inference/eot/detector.test.ts
@@ -0,0 +1,650 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Tests for the unified `TurnDetector` (auto-select + fallback + server defaults).
+ *
+ * Covers:
+ *
+ * - Auto-select via `LIVEKIT_REMOTE_EOT_URL` env var (with creds present,
+ *   with creds missing → silent downgrade).
+ * - Explicit-cloud missing creds throws.
+ * - Cloud → local fallback triggers (transport raise, predict timeout).
+ * - Fallback persistence across turns.
+ * - Local-failure handling (default 1.0, retry on next turn).
+ * - Per-session warning dedupe (one warning per failure mode).
+ * - Server-provided default thresholds adopted from `SessionCreated`.
+ * - Override resolution (scalar / dict / none) against the server defaults, the
+ *   override warning, runtime `updateOptions`, and the degenerate
+ *   (no usable thresholds) → fallback path.
+ * - Threshold rescaling against the server defaults on actual fallback.
+ *
+ * Port of Python `tests/test_audio_turn_detector_fallback.py`.
+ */
+import { AudioFrame } from '@livekit/rtc-node';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import { APIConnectionError, APIError } from '../../_exceptions.js';
+import type { InferenceExecutor } from '../../ipc/inference_executor.js';
+import { log } from '../../log.js';
+import { DEFAULT_API_CONNECT_OPTIONS } from '../../types.js';
+import type { BaseStreamingTurnDetectorStream } from './base.js';
+import {
+  type BaseStreamingTurnDetectorOptions,
+  type FlushSentinel,
+  type StreamingTurnDetectionTransport,
+} from './base.js';
+import { TurnDetector, TurnDetectorStreamImpl } from './detector.js';
+import { LOCAL_LANGUAGES, ThresholdOptions } from './languages.js';
+import { EOT_INFERENCE_METHOD } from './runner.js';
+import { LocalTransport } from './transports.js';
+
+// Stand-in for the per-language defaults a gateway returns in `SessionCreated`.
+const SERVER_THRESHOLDS: Record<string, number> = { en: 0.56, ja: 0.37, fr: 0.575 };
+const SERVER_DEFAULT_THRESHOLD = 0.5;
+
+async function waitFor(predicate: () => boolean, ticks = 50): Promise<void> {
+  let remaining = ticks; // poll budget: one check per macrotask, then give up silently
+  while (remaining-- > 0 && !predicate()) {
+    await new Promise<void>((resolve) => setImmediate(resolve));
+  }
+}
+
+interface ScriptedTransportOptions {
+  runBehavior?: 'idle' | 'raise' | 'return';
+  runExc?: Error;
+}
+
+class ScriptedTransport implements StreamingTurnDetectionTransport {
+  runBehavior: 'idle' | 'raise' | 'return';
+  runExc: Error | undefined;
+  runCalls = 0; // incremented on every `run()` call
+  events: Array<[string, unknown]> = []; // ordered [name, payload] log of transport calls
+  private _stream: BaseStreamingTurnDetectorStream | undefined;
+
+  constructor(opts: ScriptedTransportOptions = {}) {
+    this.runBehavior = opts.runBehavior ?? 'idle';
+    this.runExc = opts.runExc;
+  }
+
+  attach(stream: BaseStreamingTurnDetectorStream): void {
+    this._stream = stream;
+  }
+  async run(): Promise<void> {
+    this.runCalls += 1;
+    if (this.runBehavior === 'raise') {
+      if (!this.runExc) throw new Error('runExc not set');
+      throw this.runExc;
+    }
+    if (this.runBehavior === 'return') {
+      return;
+    }
+    // idle — this promise never settles. `detach()` below only records an
+    // event, so the pending `run()` is simply abandoned once the owning
+    // stream stops waiting on it (swap abort or `aclose`).
+    await new Promise(() => undefined);
+  }
+  runInference(requestId: string): void {
+    this.events.push(['run_inference', requestId]);
+  }
+  async pushFrame(frame: AudioFrame): Promise<void> {
+    this.events.push(['push_frame', frame]);
+  }
+  async flush(sentinel: FlushSentinel): Promise<void> {
+    this.events.push(['flush', sentinel]);
+  }
+  detach(): void {
+    this.events.push(['detach', null]);
+  }
+}
+
+function detectorOpts(detector: TurnDetector): BaseStreamingTurnDetectorOptions {
+  return (detector as unknown as { _opts: BaseStreamingTurnDetectorOptions })._opts;
+}
+
+interface MakeStreamOpts {
+  model?: 'turn-detector-v1' | 'turn-detector-v1-mini';
+  userThreshold?: number | Record<string, number>;
+  detector?: TurnDetector;
+}
+
+/**
+ * Construct a stream wired to a scripted transport. The detector and stream
+ * share one `ThresholdOptions` (as in production). The cloud model starts with
+ * empty thresholds (its defaults arrive via `SessionCreated` — call
+ * `stream.thresholdsOptions._updateDefaults` to simulate that). The local mini
+ * model resolves its thresholds against `LOCAL_LANGUAGES` up front.
+ */
+function makeStreamWithTransport(
+  transport: StreamingTurnDetectionTransport,
+  opts: MakeStreamOpts = {},
+): TurnDetectorStreamImpl {
+  const model = opts.model ?? 'turn-detector-v1';
+  const detector = opts.detector ?? makeMockDetector(model, opts.userThreshold);
+  // Only the cloud model carries gateway options; the values below are
+  // placeholders for tests that inject their own transport.
+  const placeholderCloudOpts = {
+    baseUrl: 'ws://test',
+    apiKey: 'x',
+    apiSecret: 'x',
+    connOptions: DEFAULT_API_CONNECT_OPTIONS,
+  };
+  const isCloud = model === 'turn-detector-v1';
+  return new TurnDetectorStreamImpl({
+    detector,
+    opts: detectorOpts(detector),
+    cloudOpts: isCloud ? placeholderCloudOpts : undefined,
+    model,
+    transport,
+  });
+}
+
+/** Build a `TurnDetector` for assertions without going through env
+ * resolution — seed a specific model + threshold override for a stream we'll
+ * build separately. */
+function makeMockDetector(
+  model: 'turn-detector-v1' | 'turn-detector-v1-mini',
+  userThreshold?: number | Record<string, number>,
+): TurnDetector {
+  // Construct via the public constructor under a scoped env, then override the
+  // internal model + shared threshold options to match the assertion. Going
+  // through `withEnv` restores `process.env` even if the constructor throws,
+  // so a failure here cannot leak env vars into later tests.
+  const env: Record<string, string | undefined> =
+    model === 'turn-detector-v1-mini'
+      ? { LIVEKIT_REMOTE_EOT_URL: undefined }
+      : { LIVEKIT_REMOTE_EOT_URL: 'ws://test', LIVEKIT_API_KEY: 'x', LIVEKIT_API_SECRET: 'x' };
+  let det!: TurnDetector;
+  void withEnv(env, () => {
+    det = new TurnDetector();
+  });
+  const internals = det as unknown as {
+    _model: typeof model;
+    _opts: BaseStreamingTurnDetectorOptions;
+  };
+  internals._model = model;
+  internals._opts = { ...internals._opts, thresholds: new ThresholdOptions(model, userThreshold) };
+  return det;
+}
+
+function withEnv(
+  overrides: Record<string, string | undefined>,
+  fn: () => void | Promise<void>,
+): void | Promise<void> {
+  const snapshot = { ...process.env };
+  const restore = (): void => {
+    process.env = snapshot; // swap the pre-call copy back in
+  };
+  Object.entries(overrides).forEach(([key, value]) => {
+    if (value === undefined) delete process.env[key];
+    else process.env[key] = value;
+  });
+  let outcome: void | Promise<void>;
+  try {
+    outcome = fn();
+  } catch (err) {
+    restore();
+    throw err;
+  }
+  if (outcome instanceof Promise) return outcome.finally(restore); // async: restore on settle
+  restore();
+  return outcome;
+}
+
+// Stub `LocalTransport.run` so the fallback FSM doesn't hang on a real
+// drain loop. The behavior under test is the swap, not the post-swap I/O.
+let runSpy: ReturnType<typeof vi.spyOn>;
+beforeEach(() => {
+  runSpy = vi.spyOn(LocalTransport.prototype, 'run').mockImplementation(async () => undefined);
+});
+afterEach(() => {
+  runSpy.mockRestore();
+});
+
+describe('AutoSelect', () => {
+  it('selects local when no remote EOT url', () => {
+    void withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
+      const detector = new TurnDetector();
+      expect(detector.model).toBe('turn-detector-v1-mini');
+    });
+  });
+
+  it('selects cloud when remote EOT url set', () => {
+    void withEnv(
+      {
+        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
+        LIVEKIT_API_KEY: 'k',
+        LIVEKIT_API_SECRET: 's',
+      },
+      () => {
+        const detector = new TurnDetector();
+        expect(detector.model).toBe('turn-detector-v1');
+      },
+    );
+  });
+
+  it('downgrades to local when creds missing', () => {
+    void withEnv(
+      {
+        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
+        LIVEKIT_API_KEY: undefined,
+        LIVEKIT_API_SECRET: undefined,
+        LIVEKIT_INFERENCE_API_KEY: undefined,
+        LIVEKIT_INFERENCE_API_SECRET: undefined,
+      },
+      () => {
+        const detector = new TurnDetector();
+        expect(detector.model).toBe('turn-detector-v1-mini');
+      },
+    );
+  });
+});
+
+describe('ExplicitModelErrors', () => {
+  it('explicit cloud missing creds throws', () => {
+    void withEnv(
+      {
+        LIVEKIT_REMOTE_EOT_URL: undefined,
+        LIVEKIT_API_KEY: undefined,
+        LIVEKIT_API_SECRET: undefined,
+        LIVEKIT_INFERENCE_API_KEY: undefined,
+        LIVEKIT_INFERENCE_API_SECRET: undefined,
+      },
+      () => {
+        expect(() => new TurnDetector({ version: 'v1' })).toThrow();
+      },
+    );
+  });
+});
+
+describe('Fallback', () => {
+  it('fallback on transport error swaps to local', async () => {
+    const transport = new ScriptedTransport({
+      runBehavior: 'raise',
+      runExc: new APIConnectionError({ message: 'boom' }),
+    });
+    const stream = makeStreamWithTransport(transport);
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(stream.model).toBe('turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(true);
+    expect(stream.warnedCloudFailure).toBe(true);
+    expect(transport.events).toContainEqual(['detach', null]);
+    await stream.aclose();
+  });
+
+  it('fallback on timed-out cancelInference', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport);
+    const fut = stream.predict();
+    // A timed-out cancel (driven by AudioRecognition's eou bounce) closes the
+    // request and promotes the cloud→local fallback.
+    stream.cancelInference({ timedOut: true });
+    expect((await fut.await).endOfTurnProbability).toBe(0.0);
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(stream.model).toBe('turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(true);
+    await stream.aclose();
+  });
+
+  it('fallback persists across turns', async () => {
+    const transport = new ScriptedTransport({
+      runBehavior: 'raise',
+      runExc: new APIConnectionError({ message: 'boom' }),
+    });
+    const stream = makeStreamWithTransport(transport);
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(transport.runCalls).toBe(1);
+    stream.predict();
+    expect(stream.model).toBe('turn-detector-v1-mini');
+    await stream.aclose();
+  });
+});
+
+describe('MultiStreamOwnership', () => {
+  it('multiple streams can be opened off one detector', async () => {
+    let detector!: TurnDetector;
+    withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
+      detector = new TurnDetector({ version: 'v1-mini' });
+    });
+    // Only one stream is active at a time in production; the detector still
+    // permits constructing several (they share its `ThresholdOptions`).
+    const s1 = detector.stream();
+    const s2 = detector.stream();
+    await s1.aclose();
+    await s2.aclose();
+  });
+});
+
+describe('DetectorViewAfterFallback', () => {
+  it('detector model + threshold follow the fallback (shared ThresholdOptions)', async () => {
+    let detector!: TurnDetector;
+    withEnv(
+      {
+        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
+        LIVEKIT_API_KEY: 'k',
+        LIVEKIT_API_SECRET: 's',
+      },
+      () => {
+        detector = new TurnDetector({ unlikelyThreshold: 0.5 });
+      },
+    );
+    expect(detector.model).toBe('turn-detector-v1');
+    // scalar override is resolvable pre-session via the catch-all
+    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(0.5);
+
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = new TurnDetectorStreamImpl({
+      detector,
+      opts: detectorOpts(detector),
+      cloudOpts: undefined,
+      model: 'turn-detector-v1',
+      transport,
+    });
+    // server defaults arrive, then the cloud session fails
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    stream._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+
+    // Both the stream and the detector (sharing one ThresholdOptions) reflect it.
+    expect(stream.model).toBe('turn-detector-v1-mini');
+    expect(detector.model).toBe('turn-detector-v1-mini');
+    const expected = LOCAL_LANGUAGES.en! * (0.5 / SERVER_THRESHOLDS.en!);
+    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(expected);
+    await stream.aclose();
+  });
+});
+
+describe('LocalFailureRetry', () => {
+  it('local failure emits default and retries on next turn', async () => {
+    const transport = new ScriptedTransport({
+      runBehavior: 'raise',
+      runExc: new Error('local boom'),
+    });
+    const stream = makeStreamWithTransport(transport, { model: 'turn-detector-v1-mini' });
+    await waitFor(() => stream.warnedLocalFailure);
+    expect(stream.model).toBe('turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(false);
+    expect(stream.warnedLocalFailure).toBe(true);
+    expect(stream.transport).toBe(transport);
+    await stream.aclose();
+  });
+});
+
+describe('WarningDedupe', () => {
+  it('cloud→local warning logged once per session', async () => {
+    const transport = new ScriptedTransport({
+      runBehavior: 'raise',
+      runExc: new APIConnectionError({ message: 'boom' }),
+    });
+    const stream = makeStreamWithTransport(transport);
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    // Trigger a second fallback path directly.
+    stream._fallBackToLocal(new APIConnectionError({ message: 'boom2' }));
+    // Only the dedupe latch (`warnedCloudFailure`) is asserted: it must stay
+    // flipped after the first call. NOTE(review): the log count is not checked.
+    expect(stream.warnedCloudFailure).toBe(true);
+    await stream.aclose();
+  });
+
+  it('local warning logged once per session', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport, { model: 'turn-detector-v1-mini' });
+    stream._onLocalFailure(new Error('a'));
+    stream._onLocalFailure(new Error('b')); // second failure must leave the latch set
+    expect(stream.warnedLocalFailure).toBe(true);
+    await stream.aclose();
+  });
+});
+
+describe('ResolveThresholds', () => {
+  // Cloud-override resolution against the server defaults, via ThresholdOptions.
+  function cloud(overrides?: number | Record<string, number>): ThresholdOptions {
+    const opts = new ThresholdOptions('turn-detector-v1', overrides);
+    opts._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    return opts;
+  }
+
+  it('no override adopts server map + fallback default', () => {
+    const opts = cloud();
+    expect(opts.thresholds).toEqual(SERVER_THRESHOLDS);
+    expect(opts.defaultThreshold).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
+  });
+
+  it('scalar override replaces with empty map', () => {
+    const opts = cloud(0.8);
+    // empty map → every language resolves through the scalar fallback
+    expect(opts.thresholds).toEqual({});
+    expect(opts.defaultThreshold).toBeCloseTo(0.8);
+  });
+
+  it('dict override layers on server map', () => {
+    const opts = cloud({ en: 0.7 });
+    expect(opts.thresholds.en).toBeCloseTo(0.7);
+    // unmapped languages keep the server values + server fallback
+    expect(opts.thresholds.ja).toBeCloseTo(SERVER_THRESHOLDS.ja!);
+    expect(opts.defaultThreshold).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
+  });
+
+  it('dict keys normalized', () => {
+    const opts = cloud({ English: 0.7, 'en-US': 0.7 });
+    expect(opts.thresholds.en).toBeCloseTo(0.7);
+  });
+});
+
+describe('ServerDefaults', () => {
+  it('cloud thresholds pending before session created', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport);
+    // A cloud detector has no per-language threshold until `SessionCreated`,
+    // but reports the language as supported so the first turn isn't skipped.
+    expect(await stream.unlikelyThreshold('en')).toBeUndefined();
+    expect(await stream.supportsLanguage('en')).toBe(true);
+    await stream.aclose();
+  });
+
+  it('cloud adopts server defaults', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport);
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(SERVER_THRESHOLDS.en!);
+    // language absent from the server map → catch-all default
+    expect(await stream.unlikelyThreshold('de')).toBeCloseTo(SERVER_DEFAULT_THRESHOLD);
+    await stream.aclose();
+  });
+
+  it('dict override layers on server defaults', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport, { userThreshold: { en: 0.7, ja: 0.2 } });
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(0.7);
+    expect(await stream.unlikelyThreshold('ja')).toBeCloseTo(0.2);
+    // fr not overridden → server default for fr
+    expect(await stream.unlikelyThreshold('fr')).toBeCloseTo(SERVER_THRESHOLDS.fr!);
+    await stream.aclose();
+  });
+
+  it('degenerate session created throws without override', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport);
+    expect(() => stream.thresholdsOptions._updateDefaults({}, 0.0)).toThrow(APIError);
+    await stream.aclose();
+  });
+
+  it('degenerate session created throws even with override', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport, { userThreshold: 0.8 });
+    expect(() => stream.thresholdsOptions._updateDefaults({}, 0.0)).toThrow(APIError);
+    await stream.aclose();
+  });
+});
+
+describe('OverrideWarning', () => {
+  it('warns on construction with override', () => {
+    const warnSpy = vi.spyOn(log(), 'warn');
+    try {
+      withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
+        new TurnDetector({ unlikelyThreshold: 0.5 });
+      });
+      const warned = warnSpy.mock.calls.some((c) =>
+        JSON.stringify(c).includes('non-default turn detection threshold'),
+      );
+      expect(warned).toBe(true);
+    } finally {
+      warnSpy.mockRestore();
+    }
+  });
+
+  it('no warning without override', () => {
+    const warnSpy = vi.spyOn(log(), 'warn');
+    try {
+      withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
+        new TurnDetector();
+      });
+      const warned = warnSpy.mock.calls.some((c) =>
+        JSON.stringify(c).includes('non-default turn detection threshold'),
+      );
+      expect(warned).toBe(false);
+    } finally {
+      warnSpy.mockRestore();
+    }
+  });
+});
+
+describe('UpdateOptions', () => {
+  it('re-resolves an active cloud stream against cached server defaults', async () => {
+    let detector!: TurnDetector;
+    withEnv(
+      {
+        LIVEKIT_REMOTE_EOT_URL: 'ws://gateway',
+        LIVEKIT_API_KEY: 'k',
+        LIVEKIT_API_SECRET: 's',
+      },
+      () => {
+        detector = new TurnDetector();
+      },
+    );
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = new TurnDetectorStreamImpl({
+      detector,
+      opts: detectorOpts(detector),
+      cloudOpts: undefined,
+      model: 'turn-detector-v1',
+      transport,
+    });
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(SERVER_THRESHOLDS.en!);
+
+    detector.updateOptions({ unlikelyThreshold: 0.7 });
+    // the shared resolver re-resolves against the cached server defaults
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(0.7);
+    await stream.aclose();
+  });
+
+  it('local model updateOptions', async () => {
+    let detector!: TurnDetector;
+    withEnv({ LIVEKIT_REMOTE_EOT_URL: undefined }, () => {
+      detector = new TurnDetector();
+    });
+    expect(detector.model).toBe('turn-detector-v1-mini');
+    detector.updateOptions({ unlikelyThreshold: 0.42 });
+    expect(await detector.unlikelyThreshold('en')).toBeCloseTo(0.42);
+    await detector.aclose();
+  });
+});
+
+describe('ThresholdRescaleOnFallback', () => {
+  it('scalar override rescaled against server on fallback', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport, { userThreshold: 0.5 });
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    stream._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(true);
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(
+      LOCAL_LANGUAGES.en! * (0.5 / SERVER_THRESHOLDS.en!),
+    );
+    await stream.aclose();
+  });
+
+  it('no override fallback uses local table', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport);
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    stream._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    // ratio 1.0 → local table unchanged
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(LOCAL_LANGUAGES.en!);
+    await stream.aclose();
+  });
+
+  it('dict override rescaled per language on fallback', async () => {
+    const transport = new ScriptedTransport({ runBehavior: 'idle' });
+    const stream = makeStreamWithTransport(transport, { userThreshold: { en: 0.55, ja: 0.25 } });
+    stream.thresholdsOptions._updateDefaults({ ...SERVER_THRESHOLDS }, SERVER_DEFAULT_THRESHOLD);
+    stream._fallBackToLocal(new APIConnectionError({ message: 'boom' }));
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(true);
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(
+      LOCAL_LANGUAGES.en! * (0.55 / SERVER_THRESHOLDS.en!),
+    );
+    expect(await stream.unlikelyThreshold('ja')).toBeCloseTo(
+      LOCAL_LANGUAGES.ja! * (0.25 / SERVER_THRESHOLDS.ja!),
+    );
+    // fr not in dict → server value as effective → plain local default
+    expect(await stream.unlikelyThreshold('fr')).toBeCloseTo(LOCAL_LANGUAGES.fr!);
+    await stream.aclose();
+  });
+
+  it('fallback before session created uses local table with override applied', async () => {
+    // Cloud fails before any `SessionCreated` → no server map to rescale
+    // against, so the local table (with the override applied) is used directly.
+    const transport = new ScriptedTransport({
+      runBehavior: 'raise',
+      runExc: new APIConnectionError({ message: 'boom' }),
+    });
+    const stream = makeStreamWithTransport(transport, { userThreshold: 0.42 });
+    await waitFor(() => stream.model === 'turn-detector-v1-mini');
+    expect(stream.isFallback).toBe(true);
+    // scalar 0.42 → 0.42 for every language via the catch-all
+    expect(await stream.unlikelyThreshold('en')).toBeCloseTo(0.42);
+    await stream.aclose();
+  });
+});
+
+describe('LocalModelExecutor', () => {
+  function pcmFrame(samples = 320): AudioFrame {
+    return new AudioFrame(new Int16Array(samples), 16000, 1, samples);
+  }
+
+  it('routes local predict through the injected executor (base64 PCM)', async () => {
+    const doInference = vi.fn(async (method: string, data: unknown) => {
+      expect(method).toBe(EOT_INFERENCE_METHOD);
+      expect(typeof (data as { pcm: string }).pcm).toBe('string');
+      return { probability: 0.7, inferenceDurationMs: 5 };
+    });
+    const executor: InferenceExecutor = { doInference };
+    const detector = new TurnDetector({ version: 'v1-mini', executor });
+    const stream = detector.stream();
+    try {
+      stream.pushAudio(pcmFrame());
+      const ev = await stream.predict().await;
+      expect(ev.endOfTurnProbability).toBe(0.7);
+      expect(doInference).toHaveBeenCalledWith(EOT_INFERENCE_METHOD, expect.anything());
+    } finally {
+      await stream.aclose();
+    }
+  });
+
+  it('degrades to a positive default when no executor is available', async () => {
+    // explicit undefined → constructor falls through to getJobContext()
+    // (throws outside a job) → executor stays undefined.
+    const detector = new TurnDetector({ version: 'v1-mini', executor: undefined });
+    const stream = detector.stream();
+    try {
+      const ev = await stream.predict().await;
+      expect(ev.endOfTurnProbability).toBe(1.0);
+    } finally {
+      await stream.aclose();
+    }
+  });
+});
diff --git a/agents/src/inference/eot/detector.ts b/agents/src/inference/eot/detector.ts
new file mode 100644
index 000000000..a7a9ca115
--- /dev/null
+++ b/agents/src/inference/eot/detector.ts
@@ -0,0 +1,349 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Audio end-of-turn detector with `turn-detector-v1` → `turn-detector-v1-mini`
+ * (cloud → local) fallback.
+ *
+ * Port of Python `livekit.agents.inference.eot.detector`.
+ */
+import type { InferenceExecutor } from '../../ipc/inference_executor.js';
+import { getJobContext } from '../../job.js';
+import { log } from '../../log.js';
+import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../../types.js';
+import { isDevMode, isHosted, resolveEnvVar } from '../../utils.js';
+import { getDefaultInferenceUrl } from '../utils.js';
+import {
+  BaseStreamingTurnDetector,
+  type BaseStreamingTurnDetectorOptions,
+  BaseStreamingTurnDetectorStream,
+  DEFAULT_SAMPLE_RATE,
+  type StreamingTurnDetectionTransport,
+  SwapAbortError,
+} from './base.js';
+import { ThresholdOptions, type TurnDetectorModel, type TurnDetectorVersion } from './languages.js';
+import { CloudTransport, type CloudTransportOptions, LocalTransport } from './transports.js';
+
+export interface TurnDetectorOptions {
+  /**
+   * Which turn-detector version to run. `'v1'` is the full cloud model (served
+   * over the inference gateway; model name `'turn-detector-v1'`); `'v1-mini'`
+   * is the local model (`'turn-detector-v1-mini'`, see `executor`). When omitted,
+   * auto-selects `'v1'` on hosted/dev environments (falling back to `'v1-mini'`
+   * if cloud creds are missing) and `'v1-mini'` otherwise.
+   */
+  version?: TurnDetectorVersion;
+  unlikelyThreshold?: number | Record<string, number>; // scalar, or per-language map
+  baseUrl?: string; // 'v1' only; env fallback: LIVEKIT_INFERENCE_URL
+  apiKey?: string; // 'v1' only; env fallback: LIVEKIT_INFERENCE_API_KEY, LIVEKIT_API_KEY
+  apiSecret?: string; // 'v1' only; env: LIVEKIT_INFERENCE_API_SECRET, LIVEKIT_API_SECRET
+  /** Sample rate (Hz). Defaults to 16000. */
+  sampleRate?: number;
+  connOptions?: APIConnectOptions; // cloud transport; defaults to DEFAULT_API_CONNECT_OPTIONS
+  /**
+   * Inference executor that runs the local `turn-detector-v1-mini` model in the
+   * shared inference process. Defaults to the current job's
+   * `getJobContext().inferenceExecutor`. `undefined` (no job context / binding
+   * unavailable) degrades the local model to a positive-default prediction.
+   * Mainly an override seam for tests.
+   */
+  executor?: InferenceExecutor;
+}
+
+export class TurnDetector extends BaseStreamingTurnDetector {
+  protected _model: TurnDetectorModel;
+  protected _cloudOpts: CloudTransportOptions | undefined; // set only when 'v1' creds resolved
+  protected _executor: InferenceExecutor | undefined; // handed to every stream for local runs
+
+  constructor(opts: TurnDetectorOptions = {}) {
+    // auto = caller didn't pin a version: missing cloud creds warn and fall back.
+    // NOTE(review): auto-selection keys off isHosted()/isDevMode(); confirm the warn text.
+    const auto = opts.version === undefined;
+    const resolvedVersion: TurnDetectorVersion =
+      opts.version ?? (isHosted() || isDevMode() ? 'v1' : 'v1-mini');
+    let resolvedModel: TurnDetectorModel = `turn-detector-${resolvedVersion}`;
+
+    let cloudOpts: CloudTransportOptions | undefined;
+    if (resolvedVersion === 'v1') {
+      const baseUrl = resolveEnvVar(
+        opts.baseUrl,
+        ['LIVEKIT_INFERENCE_URL'],
+        getDefaultInferenceUrl(),
+      );
+      const apiKey = resolveEnvVar(opts.apiKey, ['LIVEKIT_INFERENCE_API_KEY', 'LIVEKIT_API_KEY']);
+      const apiSecret = resolveEnvVar(opts.apiSecret, [
+        'LIVEKIT_INFERENCE_API_SECRET',
+        'LIVEKIT_API_SECRET',
+      ]);
+      const missing: string[] = [];
+      if (!baseUrl) missing.push('LIVEKIT_INFERENCE_URL');
+      if (!apiKey) missing.push('LIVEKIT_API_KEY');
+      if (!apiSecret) missing.push('LIVEKIT_API_SECRET');
+      if (missing.length > 0) {
+        if (auto) {
+          log().warn(
+            { missing },
+            "LIVEKIT_INFERENCE_URL is set but creds are missing; falling back to 'v1-mini'",
+          );
+          resolvedModel = 'turn-detector-v1-mini'; // cloudOpts stays undefined
+        } else {
+          throw new Error(
+            `TurnDetector(version='v1') requires ${missing.join(', ')} ` +
+              '(env or constructor argument).',
+          );
+        }
+      } else {
+        cloudOpts = {
+          baseUrl,
+          apiKey,
+          apiSecret,
+          connOptions: opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
+        };
+      }
+    }
+
+    const detectorOpts: BaseStreamingTurnDetectorOptions = {
+      sampleRate: opts.sampleRate ?? DEFAULT_SAMPLE_RATE,
+      thresholds: new ThresholdOptions(resolvedModel, opts.unlikelyThreshold),
+    };
+    super(detectorOpts);
+    this._model = resolvedModel;
+    this._cloudOpts = cloudOpts;
+    this._warnThresholdOverride();
+    // An explicit `opts.executor` wins. Otherwise use the current job's shared
+    // inference executor; `getJobContext` throws outside a job (tests, standalone),
+    // in which case the executor stays `undefined` rather than failing construction.
+    if (opts.executor !== undefined) {
+      this._executor = opts.executor;
+    } else {
+      try {
+        this._executor = getJobContext().inferenceExecutor;
+      } catch {
+        this._executor = undefined;
+      }
+    }
+  }
+
+  /** Current model name. Starts at the construction-time selection and flips to
+   * `'turn-detector-v1-mini'` after a cloud→local fallback: the detector and its
+   * (single) active stream share one mutable `ThresholdOptions`, and the
+   * stream writes the swap back here so EOU metrics and `audio_recognition`
+   * see a consistent view. The fallback is one-way and sticky. */
+  override get model(): TurnDetectorModel {
+    return this._model;
+  }
+
+  /** @internal Written by the active stream on cloud→local fallback. */
+  _setModel(model: TurnDetectorModel): void {
+    this._model = model;
+  }
+
+  protected _warnThresholdOverride(): void {
+    const overrides = this._opts.thresholds.overrides;
+    if (overrides !== undefined) { // any user override, scalar or per-language map
+      log().warn(
+        { unlikelyThreshold: overrides },
+        'a non-default turn detection threshold was provided; the server provides calibrated ' +
+          'defaults and overriding them may be suboptimal',
+      );
+    }
+  }
+
+  /** Replace the user threshold override at runtime. The shared
+   * `ThresholdOptions` re-resolves against the current (server or shipped)
+   * defaults, so an active stream picks it up immediately. */
+  updateOptions(opts: { unlikelyThreshold?: number | Record<string, number> } = {}): void {
+    this._opts.thresholds.updateOverrides(opts.unlikelyThreshold);
+    this._warnThresholdOverride();
+  }
+
+  override stream(opts: { connOptions?: APIConnectOptions } = {}): BaseStreamingTurnDetectorStream {
+    const cloudOpts = // per-call connOptions take precedence over the detector's
+      this._cloudOpts !== undefined
+        ? { ...this._cloudOpts, connOptions: opts.connOptions ?? this._cloudOpts.connOptions }
+        : undefined;
+    const stream = new TurnDetectorStreamImpl({
+      detector: this,
+      opts: this._opts,
+      cloudOpts,
+      model: this._model,
+      executor: this._executor,
+    });
+    this._streams.add(stream);
+    return stream;
+  }
+}
+
+export interface TurnDetectorStreamImplArgs {
+  detector: TurnDetector; // owning detector; receives the model swap on fallback
+  opts: BaseStreamingTurnDetectorOptions; // shared with the detector, not copied
+  cloudOpts: CloudTransportOptions | undefined; // required when `model` is the cloud model
+  model: TurnDetectorModel; // model the stream starts on
+  /** Shared inference executor for the `turn-detector-v1-mini` (local) model
+   * (undefined degrades to a positive-default prediction). */
+  executor?: InferenceExecutor;
+  /** Optional transport override (for tests). When omitted, a transport is
+   * constructed from `model` + `cloudOpts`. */
+  transport?: StreamingTurnDetectionTransport;
+}
+
+/**
+ * Stream that owns the `turn-detector-v1` → `turn-detector-v1-mini` (cloud →
+ * local) fallback FSM. On cloud transport failure (`transport.run()` raises, or
+ * a prediction times out via `_onPredictTimeout`), the stream swaps the transport,
+ * rescales per-language thresholds in place on the shared `ThresholdOptions`, and
+ * writes the model swap back to the owning detector so its view stays consistent.
+ */
+export class TurnDetectorStreamImpl extends BaseStreamingTurnDetectorStream {
+  protected _model: TurnDetectorModel;
+  protected _cloudOpts: CloudTransportOptions | undefined;
+  protected _executor: InferenceExecutor | undefined;
+  protected _isFallback = false; // true once the local transport was swapped in
+  protected _warnedCloudFailure = false; // dedupes the cloud-failure warning
+  protected _warnedLocalFailure = false; // dedupes the local-failure warning
+  private _detLogger = log();
+
+  constructor(args: TurnDetectorStreamImplArgs) {
+    const transport =
+      args.transport ??
+      (args.model === 'turn-detector-v1'
+        ? new CloudTransport({
+            detector: args.detector,
+            opts: args.opts,
+            cloudOpts: args.cloudOpts!, // caller must supply these for the cloud model
+          })
+        : new LocalTransport({ opts: args.opts, executor: args.executor }));
+    super({ detector: args.detector, opts: args.opts, transport });
+    this._model = args.model;
+    this._cloudOpts = args.cloudOpts;
+    this._executor = args.executor;
+  }
+
+  /** This stream's *current* model name (flips to `'turn-detector-v1-mini'`
+   * after a cloud→local fallback). The swap is also written back to the owning
+   * detector, which shares this stream's mutable `ThresholdOptions`. */
+  override get model(): TurnDetectorModel {
+    return this._model;
+  }
+
+  get isFallback(): boolean {
+    return this._isFallback;
+  }
+
+  /** @internal Test-visible. */
+  get warnedCloudFailure(): boolean {
+    return this._warnedCloudFailure;
+  }
+  /** @internal Test-visible. */
+  get warnedLocalFailure(): boolean {
+    return this._warnedLocalFailure;
+  }
+  /** @internal Test-visible. */
+  get transport(): StreamingTurnDetectionTransport {
+    return this._transport;
+  }
+
+  /** @internal Test-visible: same logic as the path taken when `_run` catches
+   * a cloud transport error. Tests call this directly to verify the warning
+   * dedupe across multiple invocations on the same stream. */
+  _fallBackToLocal(reason: Error): void {
+    if (!this._warnedCloudFailure) {
+      this._detLogger.warn(
+        { reason: reason.message },
+        'cloud turn detector failed; falling back to local mini model',
+      );
+      this._warnedCloudFailure = true;
+    }
+    this._emitDefaultForInflight();
+    try {
+      this._transport.detach();
+    } catch {
+      // ignore detach errors during swap
+    }
+    // Mutate the shared `ThresholdOptions` in place so the rescaled local
+    // thresholds + model swap are visible to the owning detector (read by EOU
+    // metrics and `audio_recognition`) without a copy-back. Safe because only
+    // one active stream per detector is supported, and the swap is sticky.
+    this._opts.thresholds._toLocalFallback();
+    if (this._detector instanceof TurnDetector) {
+      this._detector._setModel('turn-detector-v1-mini');
+    }
+    this._transport = new LocalTransport({ opts: this._opts, executor: this._executor });
+    this._transport.attach(this);
+    this._model = 'turn-detector-v1-mini';
+    this._isFallback = true;
+  }
+
+  /** @internal Test-visible: path taken when `_run` sees a local transport
+   * error. Warns once, then resolves any in-flight prediction with 1.0. */
+  _onLocalFailure(reason: Error): void {
+    if (!this._warnedLocalFailure) {
+      this._detLogger.warn(
+        { reason: reason.message },
+        'local audio turn detector failed; defaulting to 1.0 and retrying on next turn',
+      );
+      this._warnedLocalFailure = true;
+    }
+    this._emitDefaultForInflight();
+  }
+
+  protected _emitDefaultForInflight(): void {
+    // Positive default (1.0) so any waiter commits after minEndpointingDelay.
+    const requestId = this._requestId;
+    if (requestId !== undefined) {
+      this._resolvePrediction(requestId, 1.0);
+    }
+  }
+
+  override async aclose(): Promise<void> {
+    // Detach the transport first so the cloud send channel closes and its
+    // background sender/recv tasks tear down, then run the base teardown
+    // (which closes the audio channel and cancels the main task).
+    try {
+      this._transport.detach();
+    } catch {
+      // ignore detach errors during teardown
+    }
+    await super.aclose();
+  }
+
+  protected override async _run(): Promise<void> {
+    while (true) {
+      try {
+        await this._raceWithSwap(this._transport.run());
+        return;
+      } catch (err) {
+        if (err instanceof SwapAbortError) {
+          if (this._closing) return;
+          // A swap already happened (e.g. predict timeout → fallback).
+          // The new transport is mounted; loop and run it. Routing the
+          // swap through `SwapAbortError` (rather than through the
+          // cloud/local branch below) is what prevents the "timeout
+          // flips model mid-await" misclassification — the catch
+          // exits early before ever consulting `_model`.
+          continue;
+        }
+        const e = err instanceof Error ? err : new Error(String(err));
+        if (this._model === 'turn-detector-v1') {
+          this._fallBackToLocal(e);
+          continue;
+        }
+        this._onLocalFailure(e);
+        return; // NOTE(review): loop ends here; confirm what restarts it "next turn"
+      }
+    }
+  }
+
+  protected override _onPredictTimeout(): void {
+    if (this._model === 'turn-detector-v1') {
+      // Signal the swap BEFORE mutating model/transport state. The
+      // race in `_raceWithSwap` is rejected with `SwapAbortError`
+      // immediately, so the main loop exits through the
+      // SwapAbortError branch and never consults `_model` for a
+      // classification that would race with the assignment below.
+      this._signalSwap();
+      this._fallBackToLocal(new Error('predict_end_of_turn'));
+    }
+  }
+}
diff --git a/agents/src/inference/eot/index.ts b/agents/src/inference/eot/index.ts
new file mode 100644
index 000000000..44483947e
--- /dev/null
+++ b/agents/src/inference/eot/index.ts
@@ -0,0 +1,8 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+export { TurnDetector, TurnDetectorStreamImpl } from './detector.js';
+export type { TurnDetectorOptions } from './detector.js';
+export { LOCAL_LANGUAGES, ThresholdOptions } from './languages.js';
+export type { ThresholdOverride, TurnDetectorModel, TurnDetectorVersion } from './languages.js';
+export { CloudTransport, LocalTransport, type CloudTransportOptions } from './transports.js';
diff --git a/agents/src/inference/eot/languages.ts b/agents/src/inference/eot/languages.ts
new file mode 100644
index 000000000..3f38fce9d
--- /dev/null
+++ b/agents/src/inference/eot/languages.ts
@@ -0,0 +1,246 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Per-language `unlikely` thresholds for the mini detector.
+ *
+ * The cloud `turn-detector-v1` model receives calibrated defaults from the
+ * inference gateway (via the `SessionCreated` message); only the local
+ * `turn-detector-v1-mini` model ships a hardcoded table here.
+ */
+import { APIError } from '../../_exceptions.js';
+import type { LanguageCode } from '../../language.js';
+
+/** Full model name (used for telemetry/billing via `detector.model`). */
+export type TurnDetectorModel = 'turn-detector-v1' | 'turn-detector-v1-mini';
+
+/** Public `version` constructor argument; maps to a {@link TurnDetectorModel}. */
+export type TurnDetectorVersion = 'v1' | 'v1-mini';
+
+export const LOCAL_LANGUAGES: Readonly<Record<string, number>> = {
+  ar: 0.35,
+  de: 0.245,
+  en: 0.36,
+  es: 0.35,
+  fr: 0.285,
+  hi: 0.305,
+  id: 0.345,
+  it: 0.23,
+  ja: 0.295,
+  ko: 0.4,
+  nl: 0.2,
+  pt: 0.32,
+  tr: 0.255,
+  zh: 0.355,
+};
+
+const LANGUAGE_ALIASES: Readonly<Record<string, string>> = {
+  arabic: 'ar',
+  german: 'de',
+  english: 'en',
+  spanish: 'es',
+  french: 'fr',
+  hindi: 'hi',
+  indonesian: 'id',
+  italian: 'it',
+  japanese: 'ja',
+  korean: 'ko',
+  dutch: 'nl',
+  portuguese: 'pt',
+  turkish: 'tr',
+  chinese: 'zh',
+  mandarin: 'zh',
+};
+/**
+ * BCP-47 language tag (or human-readable name) → ISO 639-1 two-letter code.
+ *
+ * Minimal port of Python's `LanguageCode` — covers the languages present in
+ * the threshold tables (long names resolve via `LANGUAGE_ALIASES`). Unknown
+ * inputs are returned lowercased and trimmed, always as a string.
+ */
+function normalizeLanguage(input: string): string {
+  const lower = input.toLowerCase().trim();
+  if (lower.length === 2) return lower;
+  if (lower.indexOf('-') === 2) return lower.slice(0, 2);
+  // Own-key test: a plain index would also return inherited members (`constructor`).
+  const known = Object.prototype.hasOwnProperty.call(LANGUAGE_ALIASES, lower);
+  return known ? LANGUAGE_ALIASES[lower]! : lower;
+}
+
+const round4 = (x: number): number => Math.round(x * 10000) / 10000; // 4 decimal places
+
+/**
+ * User-supplied threshold override: a single value applied to every language,
+ * a per-language map, or `undefined` (Python `NOT_GIVEN` — use the defaults).
+ */
+export type ThresholdOverride = number | Record<string, number> | undefined;
+
+function normalizeOverrides(overrides: ThresholdOverride): ThresholdOverride {
+  // Scalars and `undefined` pass through untouched; only maps get re-keyed.
+  if (typeof overrides !== 'object' || overrides === undefined) return overrides;
+  const normalized: Record<string, number> = {};
+  Object.entries(overrides).forEach(([tag, threshold]) => {
+    // Later entries win when two tags normalize to the same code.
+    normalized[normalizeLanguage(tag)] = Number(threshold);
+  });
+  return normalized;
+}
+
+/**
+ * Resolves per-language `unlikely` thresholds for the audio EOT detector.
+ *
+ * Holds three layers and re-materializes the effective map whenever any of
+ * them changes:
+ *
+ * - **overrides** — what the user passed (`unlikelyThreshold`), normalized.
+ * - **server/shipped defaults** — for `turn-detector-v1-mini` these are the
+ *   shipped `LOCAL_LANGUAGES` table; for the cloud `turn-detector-v1` they arrive
+ *   from the gateway via `_updateDefaults` (the `SessionCreated` message) and
+ *   are `undefined` until then.
+ * - **materialized** — `thresholds` (per-language map) + `defaultThreshold`
+ *   (catch-all for languages absent from the map).
+ *
+ * The detector and its (single) active stream share one instance; the
+ * cloud→local fallback mutates it in place via `_toLocalFallback`.
+ */
+export class ThresholdOptions {
+  private _model: TurnDetectorModel;
+  private _overrides: ThresholdOverride;
+
+  // server/shipped defaults
+  private _serverThresholds: Record<string, number> | undefined;
+  private _serverDefault: number | undefined;
+
+  // materialized values
+  private _thresholds: Record<string, number> = {};
+  private _default: number | undefined = undefined;
+
+  constructor(model: TurnDetectorModel, overrides: ThresholdOverride = undefined) {
+    this._model = model;
+    this._overrides = normalizeOverrides(overrides);
+    if (model === 'turn-detector-v1-mini') {
+      this._serverThresholds = { ...LOCAL_LANGUAGES };
+      this._serverDefault = LOCAL_LANGUAGES.en;
+    }
+    this._resolve();
+  }
+
+  get model(): TurnDetectorModel {
+    return this._model;
+  }
+
+  get overrides(): ThresholdOverride {
+    return this._overrides;
+  }
+
+  get thresholds(): Readonly<Record<string, number>> {
+    return this._thresholds;
+  }
+
+  get defaultThreshold(): number | undefined {
+    return this._default;
+  }
+
+  lookup(language: LanguageCode | string | undefined): number | undefined {
+    const key = language ? normalizeLanguage(language) : 'en';
+    // Own keys only: `in` would also match inherited names such as `constructor`.
+    const own = Object.prototype.hasOwnProperty.call(this._thresholds, key);
+    return own ? this._thresholds[key] : this._default;
+  }
+
+  supports(language: LanguageCode | string | undefined): boolean {
+    // A cloud detector reports every language as supported until its server
+    // defaults arrive, so the first turn (before `SessionCreated`) isn't
+    // skipped by the `audio_recognition` short-circuit.
+    const pending = this._model === 'turn-detector-v1' && this._serverThresholds === undefined;
+    return pending || this.lookup(language) !== undefined;
+  }
+
+  updateOverrides(overrides: ThresholdOverride): void {
+    this._overrides = normalizeOverrides(overrides);
+    this._resolve();
+  }
+
+  /**
+   * @internal Adopt the calibrated defaults a `turn-detector` gateway sends in
+   * `SessionCreated`. Throws a non-retryable `APIError` when the map is empty or
+   * the default is not a positive number — the caller degrades to the local model.
+   */
+  _updateDefaults(serverThresholds: Record<string, number>, serverDefault: number): void {
+    if (!serverThresholds || Object.keys(serverThresholds).length === 0 || !(serverDefault > 0)) {
+      throw new APIError('turn detector session created without usable default thresholds', {
+        retryable: false,
+      });
+    }
+    const norm: Record<string, number> = {};
+    for (const [lang, value] of Object.entries(serverThresholds)) {
+      norm[normalizeLanguage(lang)] = round4(value);
+    }
+    this._serverThresholds = norm;
+    this._serverDefault = round4(serverDefault);
+    this._resolve();
+  }
+
+  /**
+   * @internal Promote to the local mini model on cloud→local fallback,
+   * preserving the user's effective-vs-default ratio per language:
+   * `local = LOCAL[lang] * (effective_t / server[lang])`.
+   */
+  _toLocalFallback(): void {
+    if (this._model === 'turn-detector-v1-mini') {
+      return;
+    }
+
+    let rescaled: Record<string, number> | undefined;
+    const server = this._serverThresholds;
+    if (server) {
+      rescaled = {};
+      for (const lang of Object.keys(server)) {
+        const activeT = this.lookup(lang);
+        const local = LOCAL_LANGUAGES[lang];
+        if (activeT !== undefined && local !== undefined && server[lang] !== 0) {
+          rescaled[lang] = local * (activeT / server[lang]!);
+        }
+      }
+    }
+
+    this._model = 'turn-detector-v1-mini';
+    this._serverThresholds = { ...LOCAL_LANGUAGES };
+    this._serverDefault = LOCAL_LANGUAGES.en;
+    this._resolve();
+
+    if (rescaled !== undefined) {
+      this._thresholds = rescaled;
+      this._default = this.lookup('en');
+    }
+  }
+
+  private _resolve(): void {
+    const scalarOverride = typeof this._overrides === 'number';
+    if (this._serverThresholds === undefined || this._serverDefault === undefined) {
+      // cloud defaults not received yet; only a scalar override resolves up front
+      this._thresholds = {};
+      this._default = scalarOverride ? (this._overrides as number) : undefined;
+      return;
+    }
+
+    if (this._overrides === undefined) {
+      this._thresholds = { ...this._serverThresholds };
+      this._default = this._serverDefault;
+      return;
+    }
+
+    if (scalarOverride) {
+      this._thresholds = {};
+      this._default = this._overrides as number;
+      return;
+    }
+
+    this._thresholds = {
+      ...this._serverThresholds,
+      ...(this._overrides as Record<string, number>),
+    };
+    this._default = this._serverDefault;
+  }
+}
diff --git a/agents/src/inference/eot/runner.test.ts b/agents/src/inference/eot/runner.test.ts
new file mode 100644
index 000000000..4058ca9d9
--- /dev/null
+++ b/agents/src/inference/eot/runner.test.ts
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { afterEach, describe, expect, it, vi } from 'vitest';
+import * as warmup from '../_warmup.js';
+import EotRunner from './runner.js';
+
+describe('EotRunner', () => {
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('initializes the native EOT model and predicts on decoded PCM', async () => {
+    const seen: Int16Array[] = [];
+    const nativeStub = {
+      initEot: vi.fn(),
+      initVad: vi.fn(),
+      createVad: vi.fn(),
+      VAD_WINDOW_SAMPLES: 512,
+      predict: vi.fn(async (pcm: Int16Array) => {
+        seen.push(pcm);
+        return 0.83;
+      }),
+    };
+    vi.spyOn(warmup, '_getLocalInferenceModule').mockReturnValue(
+      nativeStub as unknown as ReturnType<typeof warmup._getLocalInferenceModule>,
+    );
+
+    const runner = new EotRunner();
+    await runner.initialize();
+    expect(nativeStub.initEot).toHaveBeenCalledOnce();
+
+    // Request payload: four s16 samples, base64-encoded from the
+    // typed array's underlying bytes.
+    const expected = [1, -2, 3, -4];
+    const s16 = Int16Array.from(expected);
+    const pcm = Buffer.from(s16.buffer, s16.byteOffset, s16.byteLength).toString('base64');
+
+    const result = await runner.run({ pcm });
+    expect(result.probability).toBe(0.83);
+    expect(result.inferenceDurationMs).toBeGreaterThanOrEqual(0);
+
+    // The stub must have received exactly one buffer holding the same samples.
+    expect(seen).toHaveLength(1);
+    expect(Array.from(seen[0]!)).toEqual(expected);
+
+    await runner.close();
+  });
+
+  it('throws on initialize when the native binding is unavailable', async () => {
+    vi.spyOn(warmup, '_getLocalInferenceModule').mockReturnValue(undefined);
+    const runner = new EotRunner();
+    await expect(runner.initialize()).rejects.toThrow(/native binding unavailable/);
+  });
+});
diff --git a/agents/src/inference/eot/runner.ts b/agents/src/inference/eot/runner.ts
new file mode 100644
index 000000000..d79ed7ad5
--- /dev/null
+++ b/agents/src/inference/eot/runner.ts
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Audio EOT inference runner — runs inside the shared `InferenceProcExecutor`
+ * so the ~138 MB native model loads once per host instead of once per job
+ * worker. Job-side transports reach it via `executor.doInference(...)`.
+ *
+ * The inference proc instantiates this with `new Runner()` (no args) and
+ * calls `initialize()` once at startup, then dispatches `run(data)` per
+ * request — see `ipc/inference_proc_lazy_main.ts`. Hence the default export
+ * + no-arg constructor.
+ */
+import { InferenceRunner } from '../../inference_runner.js';
+import { log } from '../../log.js';
+import { _getLocalInferenceModule } from '../_warmup.js';
+
+/** Inference method id used to register + dispatch the audio EOT runner. */
+export const EOT_INFERENCE_METHOD = 'lk_eot_audio';
+
+/** Request payload: base64-encoded 16 kHz s16le PCM (up to 1.2 s). */
+export interface EotInferenceInput {
+  pcm: string;
+}
+
+export interface EotInferenceOutput {
+  probability: number; // value returned by the native `predict`; 0.0 if it threw
+  inferenceDurationMs: number; // `performance.now()` delta around the predict call
+}
+
+export default class EotRunner extends InferenceRunner<EotInferenceInput, EotInferenceOutput> {
+  #logger = log();
+  #mod: ReturnType<typeof _getLocalInferenceModule>;
+
+  async initialize(): Promise<void> {
+    const mod = _getLocalInferenceModule();
+    this.#mod = mod;
+    if (mod === undefined) {
+      throw new Error(
+        'EotRunner: @livekit/local-inference native binding unavailable in the inference process',
+      );
+    }
+    // Eagerly load the EOT model singleton (~138 MB) before the first request arrives.
+    mod.initEot();
+  }
+
+  async run(data: EotInferenceInput): Promise<EotInferenceOutput> {
+    const mod = this.#mod;
+    if (mod === undefined) throw new Error('EotRunner not initialized');
+    // Decode base64 to bytes, then view them as s16 samples (a trailing odd byte is dropped).
+    const bytes = Buffer.from(data.pcm, 'base64');
+    const sampleCount = Math.floor(bytes.byteLength / 2);
+    const pcm = new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
+    const startedAt = performance.now();
+    let probability = 0.0;
+    try {
+      probability = await mod.predict(pcm);
+    } catch (err) {
+      // Logged, not rethrown: the caller gets 0. NOTE(review): confirm 0 (not 1.0) is intended.
+      const reason = err instanceof Error ? err.message : String(err);
+      this.#logger.error({ err: reason }, 'local audio EOT prediction failed');
+    }
+    const inferenceDurationMs = performance.now() - startedAt;
+    return { probability, inferenceDurationMs };
+  }
+
+  async close(): Promise<void> {
+    // Nothing to release here.
+  }
+}
diff --git a/agents/src/inference/eot/transports.test.ts b/agents/src/inference/eot/transports.test.ts
new file mode 100644
index 000000000..7db4838c2
--- /dev/null
+++ b/agents/src/inference/eot/transports.test.ts
@@ -0,0 +1,234 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Tests for `CloudTransport` (cloud WS body, driven by the unified
+ * `TurnDetectorStreamImpl` stream).
+ *
+ * Uses an in-process fake WebSocket to drive the transport
+ * deterministically. Covers:
+ *
+ * - Retry counter resets after a successful connect (so transient drops
+ *   across the session lifetime don't accumulate toward `maxRetry`).
+ * - All outbound messages are FIFO-ordered on the wire, even when `runInference`
+ *   hooks fire synchronously between two awaited audio frames.
+ *
+ * Port of Python `tests/test_turn_detection_cloud_stream.py`.
+ */
+import { AgentInference } from '@livekit/protocol';
+import { AudioFrame } from '@livekit/rtc-node';
+import { describe, expect, it } from 'vitest';
+import { APIConnectionError } from '../../_exceptions.js';
+import { DEFAULT_API_CONNECT_OPTIONS } from '../../types.js';
+import { BaseStreamingTurnDetector, type BaseStreamingTurnDetectorOptions } from './base.js';
+import { TurnDetectorStreamImpl } from './detector.js';
+import { ThresholdOptions, type TurnDetectorModel } from './languages.js';
+import { CloudTransport, type CloudWebSocket } from './transports.js';
+
+const { ClientMessage } = AgentInference;
+
+/** In-memory `CloudWebSocket` that decodes every sent frame into a `ClientMessage`. */
+class FakeWS implements CloudWebSocket {
+  sent: InstanceType<typeof ClientMessage>[] = [];
+  readyState = 1; // OPEN
+  private closeListeners: Array<() => void> = [];
+
+  send(data: Uint8Array): void {
+    if (this.readyState !== 1) throw new Error('ws closed');
+    this.sent.push(ClientMessage.fromBinary(data));
+  }
+  close(): void {
+    this.readyState = 3; // CLOSED
+    for (const notify of this.closeListeners) notify();
+  }
+  on(event: 'message' | 'close' | 'error', cb: (...args: never[]) => void): void {
+    // Only `close` is driven in these tests; message/error listeners are dropped.
+    if (event === 'close') this.closeListeners.push(cb as () => void);
+  }
+}
+
+class FakeDetector extends BaseStreamingTurnDetector {
+  get model(): TurnDetectorModel {
+    return 'turn-detector-v1'; // always reports the cloud model
+  }
+  stream(): never {
+    throw new Error('unused'); // streams are built directly in `makeStream`
+  }
+}
+
+interface MakeStreamResult {
+  stream: TurnDetectorStreamImpl; // stream under test, built around `transport`
+  fakeWs: FakeWS; // the socket every successful scripted connect resolves to
+  transport: CloudTransport; // the instance injected into `stream`
+}
+
+function makeStream(opts: {
+  connectScript?: Array<Error | null>;
+  maxRetry?: number;
+  retryIntervalMs?: number;
+}): MakeStreamResult {
+  const pending = [...(opts.connectScript ?? [])]; // private copy; consumed by `connect`
+  // One socket instance is reused across every (re)connect.
+  const fakeWs = new FakeWS();
+  const connOptions = {
+    ...DEFAULT_API_CONNECT_OPTIONS,
+    maxRetry: opts.maxRetry ?? 3,
+    retryIntervalMs: opts.retryIntervalMs ?? 0,
+  };
+  const cloudOpts = {
+    baseUrl: '',
+    apiKey: 'x',
+    apiSecret: 'x',
+    connOptions,
+  };
+  const turnOpts: BaseStreamingTurnDetectorOptions = {
+    sampleRate: 16000,
+    thresholds: new ThresholdOptions('turn-detector-v1'),
+  };
+  const detector = new FakeDetector(turnOpts);
+  // Each connect attempt consumes one script entry: an Error rejects the attempt;
+  // `null`, or an exhausted script, hands back the (re-opened) fake socket.
+  const connect = async (): Promise<CloudWebSocket> => {
+    const outcome = pending.shift();
+    if (outcome instanceof Error) throw outcome;
+    fakeWs.readyState = 1;
+    return fakeWs;
+  };
+  const transport = new CloudTransport({ detector, opts: turnOpts, cloudOpts, connect });
+  const stream = new TurnDetectorStreamImpl({
+    detector,
+    opts: turnOpts,
+    cloudOpts,
+    model: 'turn-detector-v1',
+    transport,
+  });
+  return { stream, fakeWs, transport };
+}
+
+async function tick(): Promise<void> {
+  await new Promise<void>((wake) => void setImmediate(wake)); // yields via setImmediate
+}
+
+async function waitUntilConnected(transport: CloudTransport, ticks = 50): Promise<void> {
+  for (let left = ticks; left > 0; left--) {
+    if (transport.transportReady()) return;
+    await tick();
+  }
+  throw new Error('transport did not connect within timeout');
+}
+
+async function drainSendQueue(_transport: CloudTransport, ticks = 50): Promise<void> {
+  // Yield `ticks` times so the sender task can flush buffered ClientMsgs to the fake socket.
+  for (let left = ticks; left > 0; left--) {
+    await tick();
+  }
+}
+
+async function waitForCond(predicate: () => boolean, ticks = 50): Promise<void> {
+  // Best-effort poll: returns silently if `predicate` never holds within `ticks`.
+  for (let left = ticks; left > 0 && !predicate(); left--) {
+    await tick();
+  }
+}
+
+function pcmFrame(samples = 320): AudioFrame {
+  return new AudioFrame(new Int16Array(samples), 16000, 1, samples);
+}
+
+describe('CloudStreamRetry', () => {
+  it('num retries resets after a successful connect', async () => {
+    const { stream, transport } = makeStream({
+      connectScript: [new APIConnectionError({ message: 'transient' }), null],
+      maxRetry: 3,
+      retryIntervalMs: 0,
+    });
+    try {
+      await waitUntilConnected(transport);
+      // Two attempts: first raised (counter 0→1), second succeeded → reset to 0.
+      expect(transport.connectCalls).toBe(2);
+      expect(transport.numRetries).toBe(0);
+    } finally {
+      await stream.aclose();
+    }
+  });
+});
+
+describe('CloudToLocalFallback', () => {
+  it('releases the shared audio reader lock on fallback (regression)', async () => {
+    const { stream, transport } = makeStream({ connectScript: [null] });
+    try {
+      await waitUntilConnected(transport);
+      // Drive a frame so the cloud drain task is actively parked on
+      // `reader.read()`, holding the audio channel's single reader lock.
+      stream.pushAudio(pcmFrame());
+      await tick();
+
+      // A timed-out cancelInference triggers a cloud→local fallback. The
+      // orphaned cloud drain must release the shared reader lock before the
+      // real `LocalTransport.run()` re-acquires it — otherwise `getReader()`
+      // throws "ReadableStream is locked", which is mis-reported as a local
+      // failure.
+      const fut = stream.predict();
+      stream.cancelInference({ timedOut: true });
+      await fut.await;
+
+      await waitForCond(() => stream.model === 'turn-detector-v1-mini');
+      expect(stream.isFallback).toBe(true);
+
+      // Let the swapped-in LocalTransport.run() re-acquire the reader and start
+      // draining. A freed lock ⇒ no "ReadableStream is locked" TypeError ⇒ no
+      // local failure flagged.
+      for (let i = 0; i < 10; i++) await tick();
+      expect(stream.warnedLocalFailure).toBe(false);
+    } finally {
+      await stream.aclose();
+    }
+  });
+});
+
+describe('CloudStreamSendOrdering', () => {
+  it('inferenceStart precedes inputAudio (FIFO)', async () => {
+    const { stream, fakeWs, transport } = makeStream({ connectScript: [null] });
+    try {
+      await waitUntilConnected(transport);
+      stream.predict();
+      stream.pushAudio(pcmFrame());
+      await drainSendQueue(transport);
+
+      const kinds = fakeWs.sent.map((m) => m.message.case);
+      const startIdx = kinds.indexOf('inferenceStart');
+      const audioIdx = kinds.indexOf('inputAudio');
+      expect(startIdx).toBeGreaterThanOrEqual(0);
+      expect(audioIdx).toBeGreaterThanOrEqual(0);
+      expect(startIdx).toBeLessThan(audioIdx);
+    } finally {
+      await stream.aclose();
+    }
+  });
+
+  it('consecutive inferenceStarts are serialized in call order', async () => {
+    // Two `runInference` hooks back-to-back (a predict superseding another)
+    // used to race at `ws.send`; the unified send channel serializes them in
+    // call order.
+    const { stream, fakeWs, transport } = makeStream({ connectScript: [null] });
+    try {
+      await waitUntilConnected(transport);
+      stream.predict();
+      const firstId = (stream as unknown as { _requestId?: string })._requestId;
+      stream.predict();
+      const secondId = (stream as unknown as { _requestId?: string })._requestId;
+      await drainSendQueue(transport);
+
+      const startIds: (string | undefined)[] = [];
+      for (const m of fakeWs.sent) {
+        if (m.message.case === 'inferenceStart') {
+          startIds.push(m.message.value.requestId);
+        }
+      }
+      expect(startIds).toEqual([firstId, secondId]);
+    } finally {
+      await stream.aclose();
+    }
+  });
+});
diff --git a/agents/src/inference/eot/transports.ts b/agents/src/inference/eot/transports.ts
new file mode 100644
index 000000000..ffbe3f991
--- /dev/null
+++ b/agents/src/inference/eot/transports.ts
@@ -0,0 +1,628 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Audio EOT transports: cloud (WebSocket) + local (@livekit/local-inference).
+ *
+ * Port of Python `livekit.agents.inference.eot.transports`.
+ */
+import { type Duration, Timestamp } from '@bufbuild/protobuf';
+import { AgentInference } from '@livekit/protocol';
+import type { AudioFrame } from '@livekit/rtc-node';
+import { APIConnectionError, APIError, APIStatusError } from '../../_exceptions.js';
+import type { InferenceExecutor } from '../../ipc/inference_executor.js';
+import { log } from '../../log.js';
+import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js';
+import { type APIConnectOptions, intervalForRetry } from '../../types.js';
+import { Task, delay } from '../../utils.js';
+import { buildMetadataHeaders, connectWs, createAccessToken } from '../utils.js';
+import {
+  type BaseStreamingTurnDetectorOptions,
+  type BaseStreamingTurnDetectorStream,
+  DEFAULT_SAMPLE_RATE,
+  type FlushSentinel,
+  type StreamingTurnDetectionTransport,
+} from './base.js';
+import type { TurnDetector } from './detector.js';
+import { EOT_INFERENCE_METHOD } from './runner.js';
+
+const AudioEncoding = AgentInference.AudioEncoding;
+const ClientMessageCtor = AgentInference.ClientMessage;
+const ServerMessageCtor = AgentInference.ServerMessage;
+const InferenceStart = AgentInference.InferenceStart;
+const InputAudio = AgentInference.InputAudio;
+const SessionClose = AgentInference.SessionClose;
+const SessionCreate = AgentInference.SessionCreate;
+const SessionFlush = AgentInference.SessionFlush;
+const SessionSettings = AgentInference.SessionSettings;
+type ClientMsg = InstanceType<typeof AgentInference.ClientMessage>;
+type ServerMsg = InstanceType<typeof AgentInference.ServerMessage>;
+
+export interface CloudTransportOptions {
+  baseUrl: string;
+  apiKey: string;
+  apiSecret: string;
+  connOptions: APIConnectOptions;
+}
+
+/**
+ * Minimal WebSocket shape both the real `ws` socket and test fakes satisfy.
+ * The cloud transport only needs send/close/readyState + the three events.
+ */
+export interface CloudWebSocket {
+  send(data: Uint8Array): void;
+  close(): void;
+  readonly readyState: number;
+  on(event: 'message', cb: (data: Buffer | ArrayBuffer | Buffer[]) => void): void;
+  on(event: 'close', cb: () => void): void;
+  on(event: 'error', cb: (err: Error) => void): void;
+}
+
+const WS_OPEN = 1;
+
+/** Current `Date.now()` reading as a protobuf `Timestamp` (millisecond precision). */
+function nowTimestamp(): Timestamp {
+  const epochMs = Date.now();
+  const seconds = BigInt(Math.floor(epochMs / 1000));
+  const nanos = (epochMs % 1000) * 1_000_000;
+  return new Timestamp({ seconds, nanos });
+}
+
+/** Protobuf `Timestamp` → milliseconds (nanos floored to whole ms); 0 when absent. */
+function timestampToMs(ts?: Timestamp): number {
+  return ts === undefined ? 0 : Number(ts.seconds) * 1000 + Math.floor(ts.nanos / 1_000_000);
+}
+
+/** Protobuf `Duration` → milliseconds (nanos floored to whole ms); 0 when absent. */
+function durationToMs(d?: Duration): number {
+  return d === undefined ? 0 : Number(d.seconds) * 1000 + Math.floor(d.nanos / 1_000_000);
+}
+
+// Native model operates on up to 1.2 s of 16 kHz s16le PCM per predict.
+const CLIENT_BUFFER_SECONDS = 1.2;
+const CLIENT_BUFFER_SAMPLES = Math.floor(CLIENT_BUFFER_SECONDS * DEFAULT_SAMPLE_RATE);
+
+/**
+ * Fixed-capacity ring buffer of 16-bit PCM samples used by the local transport
+ * to keep the last ~1.2 s of audio available for per-window prediction.
+ */
+class PcmRingBuffer {
+  private buf: Int16Array;
+  private writeIdx = 0; // slot the next pushed sample is written to
+  private filled = 0; // number of readable samples, never above `capacity`
+
+  constructor(public readonly capacity: number) {
+    this.buf = new Int16Array(capacity);
+  }
+
+  pushFrame(frame: AudioFrame): void {
+    const src = frame.data; // Int16Array
+    for (let i = 0; i < src.length; i++) {
+      this.buf[this.writeIdx] = src[i]!;
+      this.writeIdx = (this.writeIdx + 1) % this.capacity; // wraps at capacity
+    }
+    this.filled = Math.min(this.filled + src.length, this.capacity);
+  }
+
+  /** Returns a contiguous Int16Array copy of the last `filled` samples, oldest first. */
+  read(): Int16Array {
+    const out = new Int16Array(this.filled);
+    const start = (this.writeIdx - this.filled + this.capacity) % this.capacity; // oldest sample
+    if (start + this.filled <= this.capacity) {
+      out.set(this.buf.subarray(start, start + this.filled));
+    } else {
+      const tail = this.capacity - start; // samples before the wrap point
+      out.set(this.buf.subarray(start, this.capacity), 0);
+      out.set(this.buf.subarray(0, this.filled - tail), tail);
+    }
+    return out;
+  }
+
+  /** Drop the oldest `n` samples. */
+  shift(n: number): void {
+    this.filled = Math.max(0, this.filled - n);
+  }
+
+  get length(): number {
+    return this.filled;
+  }
+}
+
+/**
+ * Transport for the local `turn-detector-v1-mini` model.
+ *
+ * The native model runs in the shared `InferenceProcExecutor` (one load per
+ * host, ~138 MB) rather than in every job worker. Audio is buffered locally
+ * in the job process (no per-frame IPC); on each inference window the last
+ * ~1.2 s is snapshotted, base64-encoded, and sent over IPC to the runner
+ * (`inference/eot/runner.ts`) via `executor.doInference(...)`.
+ *
+ * When no executor is available (binding couldn't load on this platform),
+ * predictions resolve to a positive default (1.0) so the session still
+ * commits turns after `minDelay` — same as the existing local-failure path.
+ */
+export class LocalTransport implements StreamingTurnDetectionTransport {
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  protected _executor: InferenceExecutor | undefined;
+  protected _buf: PcmRingBuffer;
+  protected _streamRef: WeakRef<BaseStreamingTurnDetectorStream> | undefined;
+  protected _tasks = new Set<Promise<void>>();
+  protected _warnedNoExecutor = false;
+  protected _logger = log();
+
+  constructor(opts: {
+    opts: BaseStreamingTurnDetectorOptions;
+    executor: InferenceExecutor | undefined;
+  }) {
+    this._opts = opts.opts;
+    this._executor = opts.executor;
+    this._buf = new PcmRingBuffer(CLIENT_BUFFER_SAMPLES);
+  }
+
+  attach(stream: BaseStreamingTurnDetectorStream): void {
+    this._streamRef = new WeakRef(stream);
+  }
+
+  runInference(requestId: string): void {
+    const snapshot = this._buf.read();
+    const task = this._predict(requestId, snapshot);
+    this._tasks.add(task);
+    void task.finally(() => this._tasks.delete(task));
+  }
+
+  protected async _predict(requestId: string, pcmSnapshot: Int16Array): Promise<void> {
+    const stream = this._streamRef?.deref();
+    if (stream === undefined) return;
+
+    if (this._executor === undefined) {
+      if (!this._warnedNoExecutor) {
+        this._warnedNoExecutor = true;
+        this._logger.warn(
+          'local audio EOT unavailable (no inference executor / native binding); ' +
+            'defaulting predictions to 1.0 so turns still commit after minDelay',
+        );
+      }
+      stream._resolvePrediction(requestId, 1.0);
+      return;
+    }
+
+    // base64-encode the s16le PCM so it survives the default JSON IPC
+    // serialization compactly (a raw Int16Array would balloon to an
+    // array-of-numbers). Only the snapshot crosses the boundary.
+    const pcm = Buffer.from(
+      pcmSnapshot.buffer,
+      pcmSnapshot.byteOffset,
+      pcmSnapshot.byteLength,
+    ).toString('base64');
+
+    let prob = 0.0;
+    let inferenceDurationMs = 0;
+    try {
+      const out = (await this._executor.doInference(EOT_INFERENCE_METHOD, {
+        pcm,
+      })) as { probability: number; inferenceDurationMs: number };
+      prob = out.probability;
+      inferenceDurationMs = out.inferenceDurationMs;
+    } catch (err) {
+      this._logger.error(
+        { err: err instanceof Error ? err.message : String(err) },
+        'local audio EOT inference (executor) failed',
+      );
+    }
+    const freshStream = this._streamRef?.deref();
+    if (freshStream === undefined) return;
+    freshStream._resolvePrediction(requestId, prob, { inferenceDuration: inferenceDurationMs });
+  }
+
+  async pushFrame(frame: AudioFrame): Promise<void> {
+    this._buf.pushFrame(frame);
+  }
+
+  async flush(_sentinel: FlushSentinel): Promise<void> {
+    if (this._buf.length > 0) {
+      this._buf.shift(this._buf.length);
+    }
+  }
+
+  detach(): void {
+    // We drop our references to the in-flight predicts, but the underlying IPC
+    // `doInference` calls aren't cancellable, so they run to completion in the
+    // inference process. Their results are harmless: `_predict` re-derefs the
+    // (now-gone) stream via `_streamRef.deref()` and the stream's request-id /
+    // closing guards discard any late prediction. (Python cancels the tasks;
+    // our IPC executor has no AbortSignal to thread through, so we can't.)
+    this._tasks.clear();
+  }
+
+  async run(): Promise<void> {
+    const stream = this._streamRef?.deref();
+    if (stream === undefined) return;
+    await stream._drainAudioChannel();
+  }
+}
+
+/**
+ * WebSocket transport for the `turn-detector-v1` (cloud) model.
+ *
+ * Maintains one inference session against the LiveKit Agent Gateway:
+ * connect → `SessionCreate` → three concurrent tasks (drain audio, send,
+ * receive) → protobuf encode/decode → `stream._resolvePrediction(...)` +
+ * `EOTInferenceMetrics` on the detector. Mirrors Python `_CloudTransport`.
+ *
+ * All outbound messages flow through a single FIFO send channel so control
+ * hooks fired synchronously between two awaited audio frames (e.g.
+ * `inferenceStart` then `inputAudio`) reach the wire in call order.
+ */
+export class CloudTransport implements StreamingTurnDetectionTransport {
+  protected _detectorRef: WeakRef<TurnDetector>;
+  protected _opts: BaseStreamingTurnDetectorOptions;
+  protected _cloudOpts: CloudTransportOptions;
+  protected _connOptions: APIConnectOptions;
+  protected _streamRef: WeakRef<BaseStreamingTurnDetectorStream> | undefined;
+  protected _ws: CloudWebSocket | undefined;
+  protected _numRetries = 0;
+  protected _connectCalls = 0;
+  /** Outbound FIFO for the active connection; recreated per `_runOnce`. */
+  protected _sendChannel: StreamChannel<ClientMsg> | undefined;
+  /** Set by `detach()`; stops the retry loop and suppresses the
+   * connection-closed throw so a teardown can't trigger a reconnect. */
+  protected _detached = false;
+  /** Aborted by `detach()` to release the audio-drain reader lock so a
+   * swapped-in transport can re-acquire the shared audio stream. */
+  protected _runAbort: AbortController | undefined;
+  protected _logger = log();
+  /** Optional connect override for tests; defaults to a real WS handshake. */
+  private _connectImpl: (() => Promise<CloudWebSocket>) | undefined;
+
+  constructor(args: {
+    detector: TurnDetector;
+    opts: BaseStreamingTurnDetectorOptions;
+    cloudOpts: CloudTransportOptions;
+    /** @internal test seam — supply a fake WebSocket factory. */
+    connect?: (transport: CloudTransport) => Promise<CloudWebSocket>;
+  }) {
+    this._detectorRef = new WeakRef(args.detector);
+    this._opts = args.opts;
+    this._cloudOpts = args.cloudOpts;
+    this._connOptions = args.cloudOpts.connOptions;
+    this._connectImpl = args.connect ? () => args.connect!(this) : undefined;
+  }
+
+  /** @internal Test-visible: number of connect attempts. */
+  get connectCalls(): number {
+    return this._connectCalls;
+  }
+  /** @internal Test-visible: retry counter (resets to 0 after a connect). */
+  get numRetries(): number {
+    return this._numRetries;
+  }
+
+  attach(stream: BaseStreamingTurnDetectorStream): void {
+    this._streamRef = new WeakRef(stream);
+  }
+
+  /** @internal Test-visible: true once the WS handshake is open. Not part of
+   * the transport interface — the stream FSM no longer gates on this. */
+  transportReady(): boolean {
+    return this._ws !== undefined && this._ws.readyState === WS_OPEN;
+  }
+
+  runInference(requestId: string): void {
+    this._enqueue(
+      new ClientMessageCtor({
+        message: { case: 'inferenceStart', value: new InferenceStart({ requestId }) },
+      }),
+    );
+  }
+
+  async pushFrame(frame: AudioFrame): Promise<void> {
+    if (frame.data.byteLength === 0) return;
+    this._enqueue(
+      new ClientMessageCtor({
+        message: {
+          case: 'inputAudio',
+          value: new InputAudio({
+            audio: new Uint8Array(frame.data.buffer, frame.data.byteOffset, frame.data.byteLength),
+            numSamples: frame.samplesPerChannel,
+            createdAt: nowTimestamp(),
+          }),
+        },
+      }),
+    );
+  }
+
+  async flush(_sentinel: FlushSentinel): Promise<void> {
+    this._enqueue(
+      new ClientMessageCtor({ message: { case: 'sessionFlush', value: new SessionFlush() } }),
+    );
+  }
+
+  detach(): void {
+    this._detached = true;
+    // Abort the active run: this releases the audio-drain reader lock (held by
+    // `stream._drainAudioChannel`) so a swapped-in transport can re-acquire the
+    // shared audio stream, and unblocks the recv/send tasks below.
+    this._runAbort?.abort();
+    void this._sendChannel?.close();
+    const ws = this._ws;
+    this._ws = undefined;
+    try {
+      ws?.close();
+    } catch {
+      // ignore
+    }
+  }
+
+  private _enqueue(msg: ClientMsg): void {
+    // The WS handle is cleared synchronously by `detach()` while
+    // `_sendChannel.close()` is still in flight (its `closed` flag flips
+    // asynchronously). Gate on `_ws` to drop late control hooks that the
+    // stream FSM may fire after the transport is being torn down.
+    if (this._ws === undefined || this._ws.readyState !== WS_OPEN) return;
+    const channel = this._sendChannel;
+    if (channel === undefined || channel.closed) return;
+    void channel.write(msg).catch(() => {});
+  }
+
+  /** Default connect: opens `<baseUrl>/eot` with a bearer token, mapping http(s) to ws(s). */
+  private async _defaultConnect(): Promise<CloudWebSocket> {
+    const { baseUrl, apiKey, apiSecret } = this._cloudOpts;
+    const wsBase = baseUrl.replace(/^http(s?):\/\//, 'ws$1://');
+    const token = await createAccessToken(apiKey, apiSecret);
+    const headers = { ...buildMetadataHeaders(), Authorization: `Bearer ${token}` };
+    const socket = await connectWs(`${wsBase}/eot`, headers, this._connOptions.timeoutMs);
+    return socket as unknown as CloudWebSocket;
+  }
+
+  /** Logs a warning when more than 500 ms elapsed since `msg.clientCreatedAt`. */
+  private _warnTransportLatency(msg: ServerMsg): void {
+    const sentAtMs = timestampToMs(msg.clientCreatedAt);
+    const transportLatencyMs = Date.now() - sentAtMs;
+    // `sentAtMs` is 0 when the message carries no client timestamp; skip those.
+    const tooSlow = sentAtMs > 0 && transportLatencyMs > 500;
+    if (!tooSlow) return;
+    const fields = { transportLatencyMs };
+    this._logger.warn(fields, 'turn detection transport latency is too high');
+  }
+
+  protected _processServerMessage(msg: ServerMsg): void {
+    const stream = this._streamRef?.deref();
+    if (stream === undefined) return;
+    const kind = msg.message.case;
+    if (kind === 'eotPrediction') {
+      const prediction = msg.message.value;
+      const stats = prediction.inferenceStats;
+      const requestSentAtMs = timestampToMs(stats?.latestClientCreatedAt);
+      const detectionDelayMs = requestSentAtMs > 0 ? Date.now() - requestSentAtMs : 0;
+      const inferenceDurationMs = durationToMs(stats?.serverE2eLatency);
+      stream._resolvePrediction(msg.requestId ?? '', prediction.probability, {
+        detectionDelay: detectionDelayMs,
+        inferenceDuration: inferenceDurationMs,
+      });
+      const detector = this._detectorRef.deref();
+      if (detector !== undefined) {
+        detector.emit('metrics_collected', {
+          type: 'eot_inference_metrics',
+          timestamp: Date.now(),
+          totalDuration: durationToMs(stats?.clientE2eLatency),
+          predictionDuration: inferenceDurationMs,
+          detectionDelay: detectionDelayMs,
+          numRequests: 1,
+          metadata: { modelName: detector.model, modelProvider: detector.provider },
+        });
+      }
+    } else if (kind === 'error') {
+      const err = msg.message.value;
+      throw new APIStatusError({
+        message: err.message,
+        options: { statusCode: err.code, requestId: msg.requestId },
+      });
+    } else if (kind === 'sessionCreated') {
+      this._warnTransportLatency(msg);
+      const created = msg.message.value;
+      // Adopt the gateway's calibrated default thresholds. A degenerate
+      // response (no usable thresholds) throws a non-retryable `APIError` that
+      // propagates out of the recv task → `run()` → the stream's cloud→local
+      // fallback.
+      stream.thresholdsOptions._updateDefaults(created.defaultThresholds, created.defaultThreshold);
+      this._logger.debug(
+        {
+          model: stream.thresholdsOptions.model,
+          thresholds: stream.thresholdsOptions.thresholds,
+          defaultThreshold: stream.thresholdsOptions.defaultThreshold,
+          overrides: stream.thresholdsOptions.overrides,
+        },
+        'audio turn detector initialized',
+      );
+    } else if (
+      kind === 'sessionClosed' ||
+      kind === 'inferenceStarted' ||
+      kind === 'inferenceStopped'
+    ) {
+      this._warnTransportLatency(msg);
+    } else {
+      this._logger.warn({ kind }, 'unexpected turn detector message');
+    }
+  }
+
+  async run(): Promise<void> {
+    const maxRetries = this._connOptions.maxRetry;
+    while (!this._detached && this._numRetries <= maxRetries) {
+      try {
+        await this._runOnce();
+        return;
+      } catch (err) {
+        // A detach (e.g. cloud→local fallback) tears the session down; don't
+        // surface that as a connection error or retry into a reconnect.
+        if (this._detached) return;
+        if (!(err instanceof APIError) || maxRetries === 0 || !err.retryable) throw err;
+        if (this._numRetries === maxRetries) {
+          throw new APIConnectionError({
+            message: `failed to connect livekit turn detector after ${this._numRetries} attempts`,
+          });
+        }
+        const retryIntervalMs = intervalForRetry(this._connOptions, this._numRetries);
+        this._logger.warn(
+          { err: err.message, attempt: this._numRetries, retryIntervalMs },
+          'livekit turn detector connection failed; retrying',
+        );
+        await delay(retryIntervalMs);
+        this._numRetries += 1;
+      }
+    }
+  }
+
+  /** One connect → `SessionCreate` → drain/send/recv cycle; rejects when the session fails. */
+  protected async _runOnce(): Promise<void> {
+    const stream = this._streamRef?.deref();
+    if (stream === undefined) return;
+
+    // Per-run abort: releases the audio-drain reader lock and mutes the recv "closed" throw.
+    const runAbort = new AbortController();
+    this._runAbort = runAbort;
+
+    this._connectCalls += 1;
+    const ws = await (this._connectImpl ?? this._defaultConnect.bind(this))();
+
+    // Detached while the handshake was in flight — don't revive the session.
+    if (this._detached) {
+      try {
+        ws.close();
+      } catch {
+        // ignore
+      }
+      return;
+    }
+
+    // Connected: reset the retry counter so drops over the session don't add up to maxRetry.
+    this._numRetries = 0;
+    this._ws = ws;
+    const sendChannel = createStreamChannel<ClientMsg>();
+    this._sendChannel = sendChannel;
+
+    // Send the SessionCreate handshake first, before any queued control msg.
+    ws.send(
+      new ClientMessageCtor({
+        message: {
+          case: 'sessionCreate',
+          value: new SessionCreate({
+            settings: new SessionSettings({
+              sampleRate: this._opts.sampleRate,
+              encoding: AudioEncoding.PCM_S16LE,
+            }),
+          }),
+        },
+        createdAt: nowTimestamp(),
+      }).toBinary(),
+    );
+
+    let closingWs = false;
+    let socketErr: Error | undefined;
+    // On socket close/error the recv channel is closed (not aborted): the reader drains
+    // buffered frames, observes `done`, and the post-drain checks decide the outcome.
+    const recvChannel = createStreamChannel<Uint8Array>();
+
+    ws.on('message', (data) => {
+      const chunk =
+        data instanceof Buffer
+          ? new Uint8Array(data.buffer, data.byteOffset, data.byteLength)
+          : Array.isArray(data)
+            ? new Uint8Array(Buffer.concat(data))
+            : new Uint8Array(data);
+      void recvChannel.write(chunk).catch(() => {});
+    });
+    ws.on('close', () => {
+      void recvChannel.close();
+      void sendChannel.close();
+    });
+    ws.on('error', (err) => {
+      socketErr = err;
+      void recvChannel.close();
+      void sendChannel.close();
+    });
+
+    const drainAudioTask = Task.from(async () => {
+      await stream._drainAudioChannel(runAbort.signal);
+      // Aborted mid-drain (fallback/teardown): the session is abandoned, skip sessionClose.
+      if (runAbort.signal.aborted) return;
+      closingWs = true;
+      this._enqueue(
+        new ClientMessageCtor({ message: { case: 'sessionClose', value: new SessionClose() } }),
+      );
+      // Close after enqueue so the sender flushes `sessionClose` before exiting.
+      await sendChannel.close();
+    });
+
+    const senderTask = Task.from(async () => {
+      const reader = sendChannel.stream().getReader();
+      try {
+        while (true) {
+          const { done, value: msg } = await reader.read();
+          if (done) return;
+          if (msg.createdAt === undefined) msg.createdAt = nowTimestamp();
+          if (ws.readyState !== WS_OPEN) return;
+          try {
+            ws.send(msg.toBinary());
+          } catch {
+            return;
+          }
+        }
+      } finally {
+        reader.releaseLock();
+      }
+    });
+
+    const recvTask = Task.from(async () => {
+      const reader = recvChannel.stream().getReader();
+      try {
+        while (true) {
+          const { done, value: chunk } = await reader.read();
+          if (done) break;
+          this._processServerMessage(ServerMessageCtor.fromBinary(chunk));
+        }
+      } finally {
+        reader.releaseLock();
+      }
+      // A detach-driven ws close is expected teardown, not a failure.
+      if (socketErr !== undefined && !closingWs && !runAbort.signal.aborted) {
+        throw new APIConnectionError({
+          message: `turn detector connection error: ${socketErr.message}`,
+          options: { retryable: false },
+        });
+      }
+      if (!closingWs && !runAbort.signal.aborted) {
+        throw new APIStatusError({
+          message: 'turn detector connection closed unexpectedly',
+          options: { statusCode: -1, retryable: false },
+        });
+      }
+    });
+
+    try {
+      await Promise.all([drainAudioTask.result, senderTask.result, recvTask.result]);
+    } finally {
+      // Abort on every exit (not only `detach()`): the drain only watches `runAbort.signal`,
+      // so without this a failed run keeps the shared audio reader lock and blocks a retry.
+      runAbort.abort();
+      drainAudioTask.cancel();
+      senderTask.cancel();
+      recvTask.cancel();
+      void sendChannel.close();
+      void recvChannel.close();
+      this._ws = undefined;
+      try {
+        ws.close();
+      } catch {
+        // ignore
+      }
+    }
+  }
+}
+
+// Re-export the transport interface from the FSM module so callers that
+// import `StreamingTurnDetectionTransport` from this package barrel see the
+// same type.
+export type { StreamingTurnDetectionTransport };
+// Expose APIError so detector + fallback code can narrow on it.
+export type { APIError };
diff --git a/agents/src/inference/index.ts b/agents/src/inference/index.ts
index e77981c79..28bd2b1ad 100644
--- a/agents/src/inference/index.ts
+++ b/agents/src/inference/index.ts
@@ -1,10 +1,28 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import * as eot from './eot/index.js';
 import * as llm from './llm.js';
 import * as stt from './stt.js';
 import * as tts from './tts.js';
 
+export { eot };
+export {
+  TurnDetector,
+  TurnDetectorStreamImpl,
+  LOCAL_LANGUAGES,
+  CloudTransport,
+  LocalTransport,
+  ThresholdOptions,
+  type TurnDetectorOptions,
+  type CloudTransportOptions,
+  type ThresholdOverride,
+  type TurnDetectorModel,
+  type TurnDetectorVersion,
+} from './eot/index.js';
+
+export { VAD, type VADOptions, type VADModels } from './vad.js';
+
 export {
   LLM,
   LLMStream,
diff --git a/agents/src/inference/stt.test.ts b/agents/src/inference/stt.test.ts
index 97a499daa..6e119cab5 100644
--- a/agents/src/inference/stt.test.ts
+++ b/agents/src/inference/stt.test.ts
@@ -13,6 +13,7 @@ import {
   normalizeSTTFallback,
   parseSTTModelString,
 } from './stt.js';
+import { VAD as InferenceVAD } from './vad.js';
 
 beforeAll(() => {
   initializeLogger({ level: 'silent', pretty: false });
@@ -343,9 +344,10 @@ describe('STT VAD handling for Speechmatics models', () => {
     await expect(stt.vadPromise).resolves.toBeUndefined();
   });
 
-  it('speechmatics model with no user vad sets up a silero loader', () => {
+  it('speechmatics model with no user vad falls back to the inference VAD', async () => {
     const stt = makeStt({ model: 'speechmatics/enhanced' });
-    expect(typeof stt['vad']).toBe('function');
+    expect(stt['vad']).toBeInstanceOf(InferenceVAD);
+    await expect(stt.vadPromise).resolves.toBe(stt['vad']);
   });
 
   it('speechmatics model with user vad uses that vad', async () => {
@@ -372,11 +374,11 @@ describe('STT VAD handling for Speechmatics models', () => {
     await expect(stt.vadPromise).resolves.toBeUndefined();
   });
 
-  it('updateOptions non-speechmatics → speechmatics sets up silero loader', () => {
+  it('updateOptions non-speechmatics → speechmatics falls back to the inference VAD', () => {
     const stt = makeStt({ model: 'deepgram/nova-3' });
     expect(stt['vad']).toBeUndefined();
 
     stt.updateOptions({ model: 'speechmatics/enhanced' });
-    expect(typeof stt['vad']).toBe('function');
+    expect(stt['vad']).toBeInstanceOf(InferenceVAD);
   });
 });
diff --git a/agents/src/inference/stt.ts b/agents/src/inference/stt.ts
index 2acb1bc80..d0d7bf7a5 100644
--- a/agents/src/inference/stt.ts
+++ b/agents/src/inference/stt.ts
@@ -27,6 +27,7 @@ import {
   sttServerEventSchema,
 } from './api_protos.js';
 import { type AnyString, connectWs, createAccessToken, getDefaultInferenceUrl } from './utils.js';
+import { VAD as InferenceVAD } from './vad.js';
 
 export type DeepgramModels =
   | 'deepgram/nova-3'
@@ -281,41 +282,20 @@ export function normalizeSTTFallback(
   return [makeFallback(fallback)];
 }
 
-type VADSource = VAD | (() => Promise<VAD>);
-
 function isSpeechmaticsModel(model: string | undefined): boolean {
   return model?.startsWith('speechmatics/') ?? false;
 }
 
-function loadSileroVAD(model: string): () => Promise<VAD> {
-  return async () => {
-    try {
-      const dynamicImport = (specifier: string) =>
-        import(specifier) as Promise<{ VAD: { load(): Promise<VAD> } }>;
-      const { VAD: SileroVAD } = await dynamicImport('@livekit/agents-plugin-silero');
-      return SileroVAD.load();
-    } catch (e) {
-      throw new Error(
-        `@livekit/agents-plugin-silero is required: model ${JSON.stringify(
-          model,
-        )} does not handle endpointing server-side.`,
-        { cause: e },
-      );
-    }
-  };
-}
-
-function resolveVADForModel(
-  model: string | undefined,
-  vad: VAD | undefined,
-): VADSource | undefined {
+function resolveVADForModel(model: string | undefined, vad: VAD | undefined): VAD | undefined {
   const speechmatics = isSpeechmaticsModel(model);
   if (vad && !speechmatics) {
     log().warn({ model }, '`vad` will be ignored: model handles endpointing server-side');
     return undefined;
   }
   if (speechmatics && vad === undefined) {
-    return loadSileroVAD(model!);
+    // Speechmatics doesn't endpoint server-side, so fall back to the in-tree
+    // local inference VAD rather than the deprecated silero plugin.
+    return new InferenceVAD();
   }
   return vad;
 }
@@ -345,17 +325,16 @@ export interface InferenceSTTOptions<TModel extends STTModels> {
 export class STT<TModel extends STTModels> extends BaseSTT {
   private opts: InferenceSTTOptions<TModel>;
   private streams: Set<SpeechStream<TModel>> = new Set();
-  private vad?: VADSource;
+  private vad?: VAD;
   private _vadPromise?: Promise<VAD | undefined>;
 
   /**
    * Resolves to the VAD instance for the current model, or `undefined` if the model
-   * handles endpointing server-side. Lazily computed on first read so callers that
-   * never need VAD don't pay the cost of loading Silero.
+   * handles endpointing server-side. Lazily computed on first read.
    */
   get vadPromise(): Promise<VAD | undefined> {
     if (this._vadPromise === undefined) {
-      this._vadPromise = typeof this.vad === 'function' ? this.vad() : Promise.resolve(this.vad);
+      this._vadPromise = Promise.resolve(this.vad);
     }
     return this._vadPromise;
   }
@@ -488,10 +467,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
     };
 
     if (nextOpts.model !== undefined) {
-      this.vad = resolveVADForModel(
-        nextOpts.model,
-        this.vad && typeof this.vad !== 'function' ? this.vad : undefined,
-      );
+      this.vad = resolveVADForModel(nextOpts.model, this.vad);
       this._vadPromise = undefined;
     }
 
diff --git a/agents/src/inference/utils.ts b/agents/src/inference/utils.ts
index 2849ed9ab..e1c545470 100644
--- a/agents/src/inference/utils.ts
+++ b/agents/src/inference/utils.ts
@@ -4,7 +4,7 @@
 import { ThrowsPromise } from '@livekit/throws-transformer/throws';
 import { AccessToken } from 'livekit-server-sdk';
 import { WebSocket } from 'ws';
-import { APIConnectionError, APIStatusError } from '../_exceptions.js';
+import { APIConnectionError, APIStatusError, APITimeoutError } from '../_exceptions.js';
 import { getJobContext } from '../job.js';
 import { version } from '../version.js';
 
@@ -97,7 +97,7 @@ export async function connectWs(
     const socket = new WebSocket(url, { headers: { ...buildMetadataHeaders(), ...headers } });
 
     const timeout = setTimeout(() => {
-      reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
+      reject(new APITimeoutError({ message: 'Timeout connecting to LiveKit WebSocket' }));
     }, timeoutMs);
 
     const onOpen = () => {
diff --git a/agents/src/inference/vad.test.ts b/agents/src/inference/vad.test.ts
new file mode 100644
index 000000000..fdfa926b8
--- /dev/null
+++ b/agents/src/inference/vad.test.ts
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { beforeAll, describe, expect, it } from 'vitest';
+import { initializeLogger } from '../log.js';
+import type { VADStream } from '../vad.js';
+import { VAD, type VADOptions } from './vad.js';
+
+beforeAll(() => {
+  initializeLogger({ level: 'silent', pretty: false });
+});
+
+/** White-box view of an `InferenceVADStream`'s internal buffer state. */
+type StreamInternals = {
+  _opts: VADOptions;
+  _speechBuffer: Int16Array | null;
+  _prefixPaddingSamples: number;
+  _inputSampleRate: number;
+};
+
+const internals = (stream: VADStream): StreamInternals => stream as unknown as StreamInternals;
+
+describe('inference.VAD updateOptions propagation', () => {
+  it('fans out option changes to live streams', () => {
+    const vad = new VAD({ minSilenceDuration: 250 });
+    const stream = vad.stream();
+    try {
+      expect(internals(stream)._opts.minSilenceDuration).toBe(250);
+
+      vad.updateOptions({ minSilenceDuration: 800 });
+
+      // The already-created stream observes the new value, not a stale snapshot.
+      expect(internals(stream)._opts.minSilenceDuration).toBe(800);
+    } finally {
+      stream.close();
+    }
+  });
+
+  it('resizes a live stream speech buffer once the sample rate is known', () => {
+    const sampleRate = 16000;
+    const vad = new VAD({ maxBufferedSpeech: 10_000, prefixPaddingDuration: 500 });
+    const stream = vad.stream();
+    try {
+      // Simulate a stream that has already seen its first frame.
+      const s = internals(stream);
+      s._inputSampleRate = sampleRate;
+      s._prefixPaddingSamples = Math.trunc((500 * sampleRate) / 1000);
+      s._speechBuffer = new Int16Array(
+        Math.trunc((10_000 * sampleRate) / 1000) + s._prefixPaddingSamples,
+      );
+
+      vad.updateOptions({ maxBufferedSpeech: 20_000, prefixPaddingDuration: 1000 });
+
+      const expectedPrefix = Math.trunc((1000 * sampleRate) / 1000);
+      expect(s._prefixPaddingSamples).toBe(expectedPrefix);
+      expect(s._speechBuffer?.length).toBe(
+        Math.trunc((20_000 * sampleRate) / 1000) + expectedPrefix,
+      );
+    } finally {
+      stream.close();
+    }
+  });
+});
diff --git a/agents/src/inference/vad.ts b/agents/src/inference/vad.ts
new file mode 100644
index 000000000..ab1034174
--- /dev/null
+++ b/agents/src/inference/vad.ts
@@ -0,0 +1,452 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Voice Activity Detection backed by `@livekit/local-inference`.
+ *
+ * Provides the same streaming VAD shape as `plugins/silero` but routes
+ * inference through the bundled native model so a default instance can be
+ * auto-provisioned by `AgentSession` without an explicit plugin import.
+ *
+ * Port of Python `livekit.agents.inference.vad`.
+ */
+import { AudioFrame, AudioResampler, AudioResamplerQuality } from '@livekit/rtc-node';
+import { log } from '../log.js';
+import { VAD as BaseVAD, VADStream as BaseVADStream, VADEventType } from '../vad.js';
+import { _getLocalInferenceModule } from './_warmup.js';
+
+const SLOW_INFERENCE_THRESHOLD_MS = 200;
+const MODEL_SAMPLE_RATE = 16000;
+
+export type VADModels = 'silero';
+
+export interface VADOptions {
+  /** Minimum speech duration (ms) before reporting START_OF_SPEECH. */
+  minSpeechDuration: number;
+  /** Trailing silence (ms) before reporting END_OF_SPEECH. */
+  minSilenceDuration: number;
+  /** Pre-roll (ms) included in the speech buffer ahead of START_OF_SPEECH. */
+  prefixPaddingDuration: number;
+  /** Maximum (ms) of buffered speech per utterance. */
+  maxBufferedSpeech: number;
+  /** Sigmoid probability threshold for activation. */
+  activationThreshold: number;
+  /** Sigmoid probability threshold for deactivation (defaults to
+   * `max(activationThreshold - 0.15, 0.01)`). */
+  deactivationThreshold: number;
+}
+
+const defaultVADOptions: VADOptions = {
+  minSpeechDuration: 50,
+  // 250ms (= MIN_SILENCE_DURATION_MS + 50) so the default satisfies the audio
+  // end-of-turn detector's silence-window requirement out of the box.
+  minSilenceDuration: 250,
+  prefixPaddingDuration: 500,
+  maxBufferedSpeech: 60_000,
+  activationThreshold: 0.5,
+  deactivationThreshold: 0.35,
+};
+
+export class VAD extends BaseVAD {
+  protected _opts: VADOptions;
+  protected _model: VADModels;
+  label = 'inference.VAD';
+  // Live streams, held through `WeakRef`s (a `WeakSet` isn't iterable) so this
+  // registry never keeps a stream alive; dead refs are pruned in `updateOptions`.
+  #streams = new Set<WeakRef<InferenceVADStream>>();
+
+  /** Keys that are `undefined` fall back to the defaults; throws on invalid input. */
+  constructor(opts: Partial<VADOptions> & { model?: VADModels } = {}) {
+    super({ updateInterval: 32 });
+    this._model = opts.model ?? 'silero';
+    if (this._model !== 'silero') {
+      throw new Error(`Unknown VAD model: ${String(this._model)}. Supported: 'silero'.`);
+    }
+    if (opts.deactivationThreshold !== undefined && opts.deactivationThreshold <= 0) {
+      throw new Error('deactivationThreshold must be greater than 0');
+    }
+    const given = VAD.definedOnly(opts);
+    this._opts = { ...defaultVADOptions, ...given };
+    // Unless set explicitly, deactivation trails the (possibly custom) activation.
+    this._opts.deactivationThreshold =
+      given.deactivationThreshold ?? Math.max(this._opts.activationThreshold - 0.15, 0.01);
+  }
+
+  get model(): string {
+    return this._model;
+  }
+
+  get provider(): string {
+    return 'livekit-local-inference';
+  }
+
+  override get minSilenceDuration(): number {
+    return this._opts.minSilenceDuration;
+  }
+
+  /** Update knobs at runtime and fan out to live streams; `undefined` keys are ignored. */
+  updateOptions(opts: Partial<VADOptions>): void {
+    const given = VAD.definedOnly(opts);
+    this._opts = { ...this._opts, ...given };
+    for (const ref of this.#streams) {
+      const stream = ref.deref();
+      if (stream) stream.updateOptions(given);
+      else this.#streams.delete(ref);
+    }
+  }
+
+  stream(): BaseVADStream {
+    // Each stream owns its own options snapshot so its `updateOptions` can read
+    // the prior `maxBufferedSpeech` before this VAD's copy is mutated.
+    const stream = new InferenceVADStream(this, { ...this._opts });
+    this.#streams.add(new WeakRef(stream));
+    return stream;
+  }
+
+  /** Shallow copy of `opts` without the keys whose value is `undefined`. */
+  private static definedOnly<T extends object>(opts: T): T {
+    return Object.fromEntries(Object.entries(opts).filter(([, v]) => v !== undefined)) as T;
+  }
+}
+
+class InferenceVADStream extends BaseVADStream {
+  private _opts: VADOptions;
+  private _logger = log();
+  private _nativeVad:
+    | ReturnType<NonNullable<ReturnType<typeof _getLocalInferenceModule>>['createVad']>
+    | undefined;
+  private _windowSamples: number;
+  private _inputSampleRate = 0;
+  private _resampler: AudioResampler | undefined;
+  private _speechBuffer: Int16Array | null = null;
+  private _speechBufferMaxReached = false;
+  private _prefixPaddingSamples = 0;
+  private _pumpTask: Promise<void>;
+
+  constructor(parent: VAD, opts: VADOptions) {
+    super(parent);
+    this._opts = opts;
+    const mod = _getLocalInferenceModule();
+    if (mod === undefined) {
+      this._logger.warn(
+        'inference.VAD created without @livekit/local-inference; stream will be a no-op',
+      );
+      this._windowSamples = 512;
+    } else {
+      this._nativeVad = mod.createVad();
+      this._windowSamples = mod.VAD_WINDOW_SAMPLES;
+    }
+    this._pumpTask = this._pump().catch((err) => {
+      this._logger.error(
+        { err: err instanceof Error ? err.message : String(err) },
+        'VAD pump failed',
+      );
+    });
+  }
+
+  /**
+   * Apply updated options to this live stream. Once the input sample rate is
+   * known, recomputes the prefix-padding pre-roll and, if the required size
+   * changed, reallocates the speech buffer, keeping the audio that still fits.
+   */
+  updateOptions(opts: Partial<VADOptions>): void {
+    const oldMaxBufferedSpeech = this._opts.maxBufferedSpeech;
+    this._opts = { ...this._opts, ...opts };
+    if (!this._inputSampleRate || this._speechBuffer === null) return;
+
+    const rate = this._inputSampleRate;
+    this._prefixPaddingSamples = Math.trunc((this._opts.prefixPaddingDuration * rate) / 1000);
+    const bufferSize =
+      Math.trunc((this._opts.maxBufferedSpeech * rate) / 1000) + this._prefixPaddingSamples;
+    // Updates that don't change the size (e.g. thresholds only) skip the
+    // reallocation and copy; `subarray` clamps its end, so shrinking is safe.
+    if (bufferSize !== this._speechBuffer.length) {
+      const resized = new Int16Array(bufferSize);
+      resized.set(this._speechBuffer.subarray(0, bufferSize));
+      this._speechBuffer = resized;
+    }
+    if (this._opts.maxBufferedSpeech > oldMaxBufferedSpeech) {
+      this._speechBufferMaxReached = false;
+    }
+  }
+
+  private async _pump(): Promise<void> {
+    let pubSpeaking = false;
+    let pubSpeechDurationMs = 0;
+    let pubSilenceDurationMs = 0;
+    let pubCurrentSample = 0;
+    let pubTimestampMs = 0;
+    let speechThresholdDurationMs = 0;
+    let silenceThresholdDurationMs = 0;
+    let inputFrames: AudioFrame[] = [];
+    let inferenceFrames: AudioFrame[] = [];
+    let inputCopyRemainingFrac = 0;
+    let extraInferenceTime = 0;
+    // Write cursor into `_speechBuffer`. The buffer holds:
+    //   [ ...prefix-padding (sliding pre-roll) ..., ...active speech... ]
+    // and is reset on END_OF_SPEECH (and on silence while idle) so the next
+    // turn starts from a fresh pre-roll window.
+    let speechBufferIndex = 0;
+
+    const resetWriteCursor = () => {
+      if (this._speechBuffer === null) return;
+      if (speechBufferIndex <= this._prefixPaddingSamples) return;
+      // Slide the most-recent `prefixPaddingSamples` samples to the head
+      // of the buffer so the next utterance has continuous pre-roll
+      // context (the audio that immediately preceded START_OF_SPEECH).
+      const paddingData = this._speechBuffer.subarray(
+        speechBufferIndex - this._prefixPaddingSamples,
+        speechBufferIndex,
+      );
+      this._speechBuffer.set(paddingData, 0);
+      speechBufferIndex = this._prefixPaddingSamples;
+      this._speechBufferMaxReached = false;
+    };
+
+    const resetState = () => {
+      this._nativeVad?.reset();
+
+      speechBufferIndex = 0;
+      this._speechBufferMaxReached = false;
+      this._speechBuffer?.fill(0);
+
+      pubSpeaking = false;
+      pubSpeechDurationMs = 0;
+      pubSilenceDurationMs = 0;
+      pubCurrentSample = 0;
+      pubTimestampMs = 0;
+      speechThresholdDurationMs = 0;
+      silenceThresholdDurationMs = 0;
+
+      inputFrames = [];
+      inferenceFrames = [];
+      inputCopyRemainingFrac = 0;
+      extraInferenceTime = 0;
+
+      this._resampler?.close?.();
+      if (this._inputSampleRate && this._inputSampleRate !== MODEL_SAMPLE_RATE) {
+        this._resampler = new AudioResampler(
+          this._inputSampleRate,
+          MODEL_SAMPLE_RATE,
+          1,
+          AudioResamplerQuality.QUICK,
+        );
+      } else {
+        this._resampler = undefined;
+      }
+    };
+
+    const copySpeechBuffer = (): AudioFrame => {
+      if (this._speechBuffer === null) {
+        return new AudioFrame(new Int16Array(0), this._inputSampleRate, 1, 0);
+      }
+      return new AudioFrame(
+        this._speechBuffer.subarray(0, speechBufferIndex),
+        this._inputSampleRate,
+        1,
+        speechBufferIndex,
+      );
+    };
+
+    while (!this.closed) {
+      const { done, value: frame } = await this.inputReader.read();
+      if (done) break;
+      if (typeof frame === 'symbol') {
+        resetState();
+        continue;
+      }
+
+      if (!this._inputSampleRate) {
+        this._inputSampleRate = frame.sampleRate;
+        this._prefixPaddingSamples = Math.trunc(
+          (this._opts.prefixPaddingDuration * this._inputSampleRate) / 1000,
+        );
+        const bufferSize =
+          Math.trunc((this._opts.maxBufferedSpeech * this._inputSampleRate) / 1000) +
+          this._prefixPaddingSamples;
+        this._speechBuffer = new Int16Array(bufferSize);
+        if (this._inputSampleRate !== MODEL_SAMPLE_RATE) {
+          this._resampler = new AudioResampler(
+            this._inputSampleRate,
+            MODEL_SAMPLE_RATE,
+            1,
+            AudioResamplerQuality.QUICK,
+          );
+        }
+      } else if (frame.sampleRate !== this._inputSampleRate) {
+        this._logger.error('a frame with a different sample rate was already pushed');
+        continue;
+      }
+
+      if (this._speechBuffer === null) continue;
+
+      inputFrames.push(frame);
+      if (this._resampler !== undefined) {
+        inferenceFrames.push(...this._resampler.push(frame));
+      } else {
+        inferenceFrames.push(frame);
+      }
+
+      while (!this.closed) {
+        const startTime = performance.now();
+        const availableInferenceSamples = inferenceFrames.reduce(
+          (acc, f) => acc + f.samplesPerChannel,
+          0,
+        );
+        if (availableInferenceSamples < this._windowSamples) break;
+
+        const inputFrame = mergeFrames(inputFrames);
+        const inferenceFrame = mergeFrames(inferenceFrames);
+        const inferenceWindow = inferenceFrame.data.subarray(0, this._windowSamples);
+
+        let p = 0.0;
+        if (this._nativeVad !== undefined) {
+          p = await this._nativeVad.predict(inferenceWindow);
+        }
+
+        const windowDurationMs = (this._windowSamples / MODEL_SAMPLE_RATE) * 1000;
+        pubCurrentSample += this._windowSamples;
+        pubTimestampMs += windowDurationMs;
+        const resamplingRatio = this._inputSampleRate / MODEL_SAMPLE_RATE;
+        const toCopy = this._windowSamples * resamplingRatio + inputCopyRemainingFrac;
+        const toCopyInt = Math.trunc(toCopy);
+        inputCopyRemainingFrac = toCopy - toCopyInt;
+
+        // Append the input-rate samples we just consumed into the
+        // speech buffer so START_OF_SPEECH / END_OF_SPEECH events can
+        // hand downstream consumers (STT, transcription) the prefix-
+        // padded audio they need.
+        const availableSpace = this._speechBuffer.length - speechBufferIndex;
+        const toCopyBuffer = Math.min(toCopyInt, availableSpace);
+        if (toCopyBuffer > 0) {
+          this._speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
+          speechBufferIndex += toCopyBuffer;
+        } else if (!this._speechBufferMaxReached) {
+          this._speechBufferMaxReached = true;
+          this._logger.warn(
+            'maxBufferedSpeech reached, ignoring further data for the current speech input',
+          );
+        }
+
+        const inferenceDuration = performance.now() - startTime;
+        extraInferenceTime = Math.max(0, extraInferenceTime + inferenceDuration - windowDurationMs);
+        // Guard on the per-window inference duration (not the accumulated slack)
+        // to match Python; the accumulated value is still surfaced as the delay.
+        if (inferenceDuration > SLOW_INFERENCE_THRESHOLD_MS) {
+          this._logger.warn(
+            { extraInferenceTimeMs: extraInferenceTime },
+            'VAD slower than realtime',
+          );
+        }
+
+        if (pubSpeaking) pubSpeechDurationMs += windowDurationMs;
+        else pubSilenceDurationMs += windowDurationMs;
+
+        this.sendVADEvent({
+          type: VADEventType.INFERENCE_DONE,
+          samplesIndex: pubCurrentSample,
+          timestamp: pubTimestampMs,
+          silenceDuration: pubSilenceDurationMs,
+          speechDuration: pubSpeechDurationMs,
+          probability: p,
+          inferenceDuration,
+          frames: [
+            new AudioFrame(
+              inputFrame.data.subarray(0, toCopyInt),
+              this._inputSampleRate,
+              1,
+              toCopyInt,
+            ),
+          ],
+          speaking: pubSpeaking,
+          rawAccumulatedSilence: silenceThresholdDurationMs,
+          rawAccumulatedSpeech: speechThresholdDurationMs,
+        });
+
+        if (
+          p >= this._opts.activationThreshold ||
+          (pubSpeaking && p > this._opts.deactivationThreshold)
+        ) {
+          speechThresholdDurationMs += windowDurationMs;
+          silenceThresholdDurationMs = 0;
+          if (!pubSpeaking && speechThresholdDurationMs >= this._opts.minSpeechDuration) {
+            pubSpeaking = true;
+            pubSilenceDurationMs = 0;
+            pubSpeechDurationMs = speechThresholdDurationMs;
+            this.sendVADEvent({
+              type: VADEventType.START_OF_SPEECH,
+              samplesIndex: pubCurrentSample,
+              timestamp: pubTimestampMs,
+              silenceDuration: pubSilenceDurationMs,
+              speechDuration: pubSpeechDurationMs,
+              probability: p,
+              inferenceDuration,
+              frames: [copySpeechBuffer()],
+              speaking: true,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          }
+        } else {
+          silenceThresholdDurationMs += windowDurationMs;
+          speechThresholdDurationMs = 0;
+          // Keep a sliding pre-roll window while we're not in active
+          // speech — without this the buffer would fill with idle
+          // silence and the next START_OF_SPEECH would lose its
+          // prefix-padding context.
+          if (!pubSpeaking) resetWriteCursor();
+          if (pubSpeaking && silenceThresholdDurationMs >= this._opts.minSilenceDuration) {
+            pubSpeaking = false;
+            pubSilenceDurationMs = silenceThresholdDurationMs;
+            this.sendVADEvent({
+              type: VADEventType.END_OF_SPEECH,
+              samplesIndex: pubCurrentSample,
+              timestamp: pubTimestampMs,
+              silenceDuration: pubSilenceDurationMs,
+              speechDuration: Math.max(0, pubSpeechDurationMs - silenceThresholdDurationMs),
+              probability: p,
+              inferenceDuration,
+              frames: [copySpeechBuffer()],
+              speaking: false,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+            pubSpeechDurationMs = 0;
+            resetWriteCursor();
+          }
+        }
+
+        inputFrames = [];
+        inferenceFrames = [];
+        if (inputFrame.data.length > toCopyInt) {
+          const data = inputFrame.data.subarray(toCopyInt);
+          inputFrames.push(new AudioFrame(data, this._inputSampleRate, 1, Math.trunc(data.length)));
+        }
+        if (inferenceFrame.data.length > this._windowSamples) {
+          const data = inferenceFrame.data.subarray(this._windowSamples);
+          inferenceFrames.push(new AudioFrame(data, MODEL_SAMPLE_RATE, 1, Math.trunc(data.length)));
+        }
+      }
+    }
+    this._resampler?.close?.();
+  }
+}
+
+/** Concatenates `frames` into a single frame; a lone frame is returned as-is.
+ * The silero plugin uses `mergeFrames` from the agents package — the inference VAD
+ * keeps this local copy to avoid an import cycle through `index.ts`. */
+function mergeFrames(frames: AudioFrame[]): AudioFrame {
+  const first = frames[0];
+  if (first === undefined) throw new TypeError('mergeFrames: frames must not be empty');
+  if (frames.length === 1) return first;
+  const { sampleRate, channels } = first;
+  const total = frames.reduce((sum, f) => sum + f.samplesPerChannel, 0);
+  const buf = new Int16Array(total * channels);
+  let offset = 0;
+  for (const f of frames) {
+    buf.set(f.data, offset);
+    offset += f.samplesPerChannel * channels;
+  }
+  return new AudioFrame(buf, sampleRate, channels, total);
+}
diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts
index f6af79ec9..3ae5546ac 100644
--- a/agents/src/metrics/base.ts
+++ b/agents/src/metrics/base.ts
@@ -15,6 +15,7 @@ export type AgentMetrics =
   | TTSMetrics
   | VADMetrics
   | EOUMetrics
+  | EOTInferenceMetrics
   | RealtimeModelMetrics
   | InterruptionMetrics
   | AvatarMetrics;
@@ -197,6 +198,25 @@ export type RealtimeModelMetrics = {
   metadata?: MetricsMetadata;
 };
 
+/**
+ * Per-prediction telemetry for the audio EOT (end-of-turn) detector. Emitted
+ * by transports on each cloud or local prediction so we can track detection
+ * latency and inference time per call.
+ */
+export type EOTInferenceMetrics = {
+  type: 'eot_inference_metrics';
+  timestamp: number;
+  /** Latest round-trip time (RTT) taken to perform inference, in milliseconds. */
+  totalDuration: number;
+  /** Latest time taken by the model side, in milliseconds. */
+  predictionDuration: number;
+  /** Latest total time from audio-frame creation to prediction receive, in milliseconds. */
+  detectionDelay: number;
+  /** Number of prediction requests served (incremental). */
+  numRequests: number;
+  metadata?: MetricsMetadata;
+};
+
 export type InterruptionMetrics = {
   type: 'interruption_metrics';
   timestamp: number;
diff --git a/agents/src/metrics/index.ts b/agents/src/metrics/index.ts
index 0438b0219..9f2726460 100644
--- a/agents/src/metrics/index.ts
+++ b/agents/src/metrics/index.ts
@@ -5,6 +5,7 @@
 export type {
   AgentMetrics,
   AvatarMetrics,
+  EOTInferenceMetrics,
   EOUMetrics,
   InterruptionMetrics,
   LLMMetrics,
@@ -17,6 +18,7 @@ export type {
 export {
   filterZeroValues,
   ModelUsageCollector,
+  type EOTModelUsage,
   type InterruptionModelUsage,
   type LLMModelUsage,
   type ModelUsage,
diff --git a/agents/src/metrics/model_usage.ts b/agents/src/metrics/model_usage.ts
index 5e723fb51..2a7da58ca 100644
--- a/agents/src/metrics/model_usage.ts
+++ b/agents/src/metrics/model_usage.ts
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 import type {
   AgentMetrics,
+  EOTInferenceMetrics,
   InterruptionMetrics,
   LLMMetrics,
   RealtimeModelMetrics,
@@ -84,7 +85,23 @@ export type InterruptionModelUsage = {
   totalRequests: number;
 };
 
-export type ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage | InterruptionModelUsage;
+/** Aggregate per-provider usage for the audio EOT detector. */
+export type EOTModelUsage = {
+  type: 'eot_usage';
+  /** The provider name (e.g., 'livekit'). */
+  provider: string;
+  /** The model name (e.g., 'turn-detector-v1' for cloud, 'turn-detector-v1-mini' for local). */
+  model: string;
+  /** Total number of EOT prediction requests served. */
+  totalRequests: number;
+};
+
+export type ModelUsage =
+  | LLMModelUsage
+  | TTSModelUsage
+  | STTModelUsage
+  | InterruptionModelUsage
+  | EOTModelUsage;
 
 export function filterZeroValues<T extends ModelUsage>(usage: T): Partial<T> {
   const result: Partial<T> = {} as Partial<T>;
@@ -102,10 +119,17 @@ export class ModelUsageCollector {
   private sttUsage: Map<string, STTModelUsage> = new Map();
 
   private interruptionUsage: Map<string, InterruptionModelUsage> = new Map();
+  private eotUsage: Map<string, EOTModelUsage> = new Map();
 
   /** Extract provider and model from metrics metadata. */
   private extractProviderModel(
-    metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics | InterruptionMetrics,
+    metrics:
+      | LLMMetrics
+      | STTMetrics
+      | TTSMetrics
+      | RealtimeModelMetrics
+      | InterruptionMetrics
+      | EOTInferenceMetrics,
   ): [string, string] {
     let provider = '';
     let model = '';
@@ -195,6 +219,21 @@ export class ModelUsageCollector {
     return usage;
   }
 
+  /**
+   * Look up the EOT usage aggregate for a provider/model pair, registering a
+   * zeroed entry the first time the pair is seen.
+   */
+  private getEotUsage(provider: string, model: string): EOTModelUsage {
+    const bucketKey = `${provider}:${model}`;
+    const existing = this.eotUsage.get(bucketKey);
+    if (existing) {
+      return existing;
+    }
+    const created: EOTModelUsage = { type: 'eot_usage', provider, model, totalRequests: 0 };
+    this.eotUsage.set(bucketKey, created);
+    return created;
+  }
+
   /** Collect metrics and aggregate usage by model/provider. */
   collect(metrics: AgentMetrics): void {
     if (metrics.type === 'llm_metrics') {
@@ -239,8 +278,13 @@ export class ModelUsageCollector {
       const [provider, model] = this.extractProviderModel(metrics);
       const usage = this.getInterruptionUsage(provider, model);
       usage.totalRequests += metrics.numRequests;
+    } else if (metrics.type === 'eot_inference_metrics') {
+      const [provider, model] = this.extractProviderModel(metrics);
+      const usage = this.getEotUsage(provider, model);
+      usage.totalRequests += metrics.numRequests;
     }
-    // VAD and EOU metrics are not aggregated for usage tracking.
+    // VAD and EOU (session-level summary) metrics are not aggregated for
+    // usage tracking; only per-prediction EOT inference metrics are.
   }
 
   flatten(): ModelUsage[] {
@@ -257,6 +301,9 @@ export class ModelUsageCollector {
     for (const u of this.interruptionUsage.values()) {
       result.push({ ...u });
     }
+    for (const u of this.eotUsage.values()) {
+      result.push({ ...u });
+    }
     return result;
   }
 }
diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts
index 7c1bb159a..1f79eca63 100644
--- a/agents/src/telemetry/trace_types.ts
+++ b/agents/src/telemetry/trace_types.ts
@@ -65,6 +65,13 @@ export const ATTR_EOU_PROBABILITY = 'lk.eou.probability';
 export const ATTR_EOU_UNLIKELY_THRESHOLD = 'lk.eou.unlikely_threshold';
 export const ATTR_EOU_DELAY = 'lk.eou.endpointing_delay';
 export const ATTR_EOU_LANGUAGE = 'lk.eou.language';
+/** Which signal triggered the EOU detection: 'vad' | 'stt' | 'manual'. */
+export const ATTR_EOU_SOURCE = 'lk.eou.source';
+/** True when the audio EOT detector resolved this prediction from its
+ * inference-window cache instead of running a fresh predict. */
+export const ATTR_EOU_FROM_CACHE = 'lk.eou.from_cache';
+/** Latest input-audio creation time → prediction receive time (ms). */
+export const ATTR_EOU_DETECTION_DELAY = 'lk.eou.detection_delay';
 export const ATTR_USER_TRANSCRIPT = 'lk.user_transcript';
 export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence';
 export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay';
diff --git a/agents/src/utils.ts b/agents/src/utils.ts
index e537ea4e5..56d561025 100644
--- a/agents/src/utils.ts
+++ b/agents/src/utils.ts
@@ -1418,6 +1418,33 @@ export function asError(maybeError: unknown): Error {
   return new Error(String(maybeError));
 }
 
+/**
+ * Pick a configuration string by precedence: the explicit `value`, then the
+ * first non-empty environment variable named in `envVars`, then `defaultValue`.
+ *
+ * Counterpart of Python `livekit.agents.utils.resolve_env_var`. Inference
+ * transports use it to plumb credentials and URLs (e.g.
+ * `LIVEKIT_REMOTE_EOT_URL`, `LIVEKIT_INFERENCE_API_KEY`).
+ *
+ * @param value - Explicit setting; any non-`undefined` value, `''` included, wins.
+ * @param envVars - Environment variable names, consulted in order.
+ * @param defaultValue - Returned when neither source yields a value.
+ */
+export function resolveEnvVar(
+  value: string | undefined,
+  envVars: readonly string[],
+  defaultValue = '',
+): string {
+  if (value === undefined) {
+    // Variables that are unset or set to '' both count as absent.
+    const fromEnv = envVars
+      .map((name) => process.env[name])
+      .find((candidate) => candidate !== undefined && candidate !== '');
+    return fromEnv ?? defaultValue;
+  }
+  return value;
+}
+
 /**
  * Tagged template literal that strips common leading indentation from every line,
  * trims the first empty line and any trailing whitespace.
diff --git a/agents/src/utils_env.test.ts b/agents/src/utils_env.test.ts
new file mode 100644
index 000000000..71ecb531a
--- /dev/null
+++ b/agents/src/utils_env.test.ts
@@ -0,0 +1,88 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Tests for the `resolveEnvVar` helper contract.
+ *
+ * Port of Python `tests/test_utils_env.py`.
+ */
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { resolveEnvVar } from './utils.js';
+
+const ENV_KEYS = ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'] as const;
+const saved: Record<string, string | undefined> = {};
+
+beforeEach(() => {
+  for (const k of ENV_KEYS) {
+    saved[k] = process.env[k];
+    delete process.env[k];
+  }
+});
+
+afterEach(() => {
+  for (const k of ENV_KEYS) {
+    if (saved[k] === undefined) delete process.env[k];
+    else process.env[k] = saved[k];
+  }
+});
+
+describe('resolveEnvVar', () => {
+  it('returns empty string when no env or default', () => {
+    expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'])).toBe('');
+  });
+
+  it('returns default when no matching env exists', () => {
+    expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe(
+      'https://default.example.com',
+    );
+  });
+
+  it('returns first matching env value', () => {
+    process.env.LIVEKIT_INFERENCE_URL = 'https://inference.example.com';
+    process.env.LIVEKIT_URL = 'https://livekit.example.com';
+    expect(
+      resolveEnvVar(
+        undefined,
+        ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'],
+        'https://default.example.com',
+      ),
+    ).toBe('https://inference.example.com');
+  });
+
+  it('falls back to later env when earlier env missing', () => {
+    process.env.LIVEKIT_URL = 'https://livekit.example.com';
+    expect(
+      resolveEnvVar(
+        undefined,
+        ['LIVEKIT_INFERENCE_URL', 'LIVEKIT_URL'],
+        'https://default.example.com',
+      ),
+    ).toBe('https://livekit.example.com');
+  });
+
+  it('prefers explicit value over environment', () => {
+    process.env.LIVEKIT_INFERENCE_URL = 'https://env.example.com';
+    expect(
+      resolveEnvVar(
+        'https://explicit.example.com',
+        ['LIVEKIT_INFERENCE_URL'],
+        'https://default.example.com',
+      ),
+    ).toBe('https://explicit.example.com');
+  });
+
+  it('treats empty env value as missing', () => {
+    process.env.LIVEKIT_INFERENCE_URL = '';
+    expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe(
+      'https://default.example.com',
+    );
+  });
+
+  it('treats whitespace env value as set', () => {
+    process.env.LIVEKIT_INFERENCE_URL = ' ';
+    expect(resolveEnvVar(undefined, ['LIVEKIT_INFERENCE_URL'], 'https://default.example.com')).toBe(
+      ' ',
+    );
+  });
+});
diff --git a/agents/src/vad.ts b/agents/src/vad.ts
index e946ae213..c0a2e8c16 100644
--- a/agents/src/vad.ts
+++ b/agents/src/vad.ts
@@ -28,11 +28,11 @@ export interface VADEvent {
    * Index of the audio sample where the event occurred, relative to the inference sample rate.
    */
   samplesIndex: number;
-  /** Timestamp when the event was fired. */
+  /** Timestamp (milliseconds since epoch) when the event was fired. */
   timestamp: number;
-  /** Duration of the speech segment in seconds. */
+  /** Duration of the speech segment in milliseconds. */
   speechDuration: number;
-  /** Duration of the silence segment in seconds. */
+  /** Duration of the silence segment in milliseconds. */
   silenceDuration: number;
   /**
    * List of audio frames associated with the speech.
@@ -45,7 +45,7 @@ export interface VADEvent {
   frames: AudioFrame[];
   /** Probability that speech is present (only for `INFERENCE_DONE` events). */
   probability: number;
-  /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */
+  /** Time taken to perform the inference, in milliseconds (only for `INFERENCE_DONE` events). */
   inferenceDuration: number;
   /** Indicates whether speech was detected in the frames. */
   speaking: boolean;
@@ -77,6 +77,19 @@ export abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCal
     return this.#capabilities;
   }
 
+  /**
+   * Minimum-silence duration configured on this VAD, in milliseconds, or
+   * `null` when the implementation does not expose that setting.
+   *
+   * `AudioRecognition` reads this when a `TurnDetector` is active: the EOT
+   * detector needs a wider silence window than typical VAD defaults, so the
+   * configured value is checked against the detector's required floor.
+   * Subclasses that cannot report the value should keep this `null` default.
+   */
+  get minSilenceDuration(): number | null {
+    return null;
+  }
+
   /**
    * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.
    */
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index 80660536d..f3b88351c 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -10,6 +10,10 @@ import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream, TransformStream } from 'node:stream/web';
 import type { Logger } from 'pino';
+import {
+  BaseStreamingTurnDetector,
+  type BaseStreamingTurnDetectorStream,
+} from '../inference/eot/base.js';
 import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
 import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
 import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
@@ -44,6 +48,7 @@ import type { LLMError } from '../llm/llm.js';
 import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
+  EOTInferenceMetrics,
   EOUMetrics,
   InterruptionMetrics,
   LLMMetrics,
@@ -88,7 +93,12 @@ import {
   type RecognitionHooks,
   type STTPipeline,
 } from './audio_recognition.js';
-import type { AgentState, AgentStateChangedEvent, UserTurnExceededEvent } from './events.js';
+import type {
+  AgentState,
+  AgentStateChangedEvent,
+  EotPredictionEvent,
+  UserTurnExceededEvent,
+} from './events.js';
 import {
   AgentSessionEventTypes,
   createAgentFalseInterruptionEvent,
@@ -129,6 +139,7 @@ export interface ReusableResources {
   sttPipeline?: STTPipeline;
   sttInputStartedAt?: number;
   rtSession?: RealtimeSession;
+  turnDetectorStream?: BaseStreamingTurnDetectorStream;
 }
 
 export class SchedulingPausedError extends Error {
@@ -155,6 +166,10 @@ export async function cleanupReusableResources(
     tasks.push(resources.rtSession.close());
     resources.rtSession = undefined;
   }
+  if (resources.turnDetectorStream) {
+    tasks.push(resources.turnDetectorStream.aclose());
+    resources.turnDetectorStream = undefined;
+  }
 
   if (tasks.length > 0) {
     const outputs = await ThrowsPromise.allSettled(tasks);
@@ -226,6 +241,14 @@ export class AgentActivity implements RecognitionHooks {
   private isInterruptionByAudioActivityEnabled: boolean;
   private isDefaultInterruptionByAudioActivityEnabled: boolean;
 
+  /**
+   * Validated turn detection for this activity. Equals `this.turnDetection`
+   * except when a `BaseStreamingTurnDetector` instance fails the runtime preconditions
+   * (no VAD, or RealtimeModel with server-side turn detection enabled), in
+   * which case it is downgraded to `undefined` and a warning is logged.
+   */
+  private _resolvedTurnDetection: TurnDetectionMode | undefined;
+
   // for false interruption handling
   private pausedSpeech?: PausedSpeechInfo;
   private falseInterruptionTimer?: NodeJS.Timeout;
@@ -292,8 +315,9 @@ export class AgentActivity implements RecognitionHooks {
     });
     this.q_updated = new Future();
 
+    this._resolvedTurnDetection = this._resolveTurnDetection(this.turnDetection);
     this.turnDetectionMode =
-      typeof this.turnDetection === 'string' ? this.turnDetection : undefined;
+      typeof this._resolvedTurnDetection === 'string' ? this._resolvedTurnDetection : undefined;
 
     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
@@ -342,10 +366,13 @@ export class AgentActivity implements RecognitionHooks {
         this.turnDetectionMode = undefined;
       }
 
-      // fallback to VAD if server side turn detection is disabled and VAD is available
+      // fallback to VAD if server side turn detection is disabled and the
+      // user explicitly supplied a VAD. The bundled-default VAD is treated
+      // as absent here so behavior matches "no vad passed" sessions.
       if (
         !this.llm.capabilities.turnDetection &&
         this.vad &&
+        !this.usingDefaultVad &&
         this.turnDetectionMode === undefined
       ) {
         this.turnDetectionMode = 'vad';
@@ -516,12 +543,27 @@ export class AgentActivity implements RecognitionHooks {
       this.vad.on('metrics_collected', this.onMetricsCollected);
     }
 
+    if (this._resolvedTurnDetection instanceof BaseStreamingTurnDetector) {
+      this._resolvedTurnDetection.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    // Bundled-default VAD is treated as absent when the RealtimeModel does
+    // its own server-side turn detection — the realtime session is already
+    // canonical and an extra audio pipeline would just pay the native model
+    // load for no behavioral gain. User-supplied VADs still flow through
+    // (e.g. when the user wants adaptive interruption).
+    const realtimeUsesServerVad =
+      this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection === true;
+    const recognitionVad = this.usingDefaultVad && realtimeUsesServerVad ? undefined : this.vad;
+
     this.audioRecognition = new AudioRecognition({
       recognitionHooks: this,
       // Disable stt node if stt is not provided
       stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
-      vad: this.vad,
-      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      vad: recognitionVad,
+      usingDefaultVad: this.usingDefaultVad,
+      turnDetector:
+        typeof this._resolvedTurnDetection === 'string' ? undefined : this._resolvedTurnDetection,
       turnDetectionMode: this.turnDetectionMode,
       interruptionDetection: this.interruptionDetector,
       backchannelBoundary:
@@ -538,20 +580,29 @@ export class AgentActivity implements RecognitionHooks {
       shouldDiscardAudioForStt: () => this.shouldDiscardInputAudio(),
     });
 
-    if (reuseResources?.sttPipeline) {
+    const sttPipeline = reuseResources?.sttPipeline;
+    // carry the input epoch along with the reused pipeline: its stream clock
+    // is cumulative, so re-stamping inputStartedAt here would push STT-derived
+    // timestamps into the future and stall end-of-turn after every handoff
+    // (1.4.5 silence regression from #1603; see agent_task_handoff_eou.test.ts)
+    const sttInputStartedAt = reuseResources?.sttInputStartedAt;
+    const turnDetectorStream = reuseResources?.turnDetectorStream;
+    if (sttPipeline) {
       this.logger.debug('reusing STT pipeline from previous activity');
-      // carry the input epoch along with the reused pipeline: its stream clock
-      // is cumulative, so re-stamping inputStartedAt here would push STT-derived
-      // timestamps into the future and stall end-of-turn after every handoff
-      // (1.4.5 silence regression from #1603; see agent_task_handoff_eou.test.ts)
-      await this.audioRecognition.start({
-        sttPipeline: reuseResources.sttPipeline,
-        inputStartedAt: reuseResources.sttInputStartedAt,
-      });
-      reuseResources.sttPipeline = undefined; // ownership transferred
+    }
+    if (turnDetectorStream) {
+      this.logger.debug('reusing turn detector stream from previous activity');
+    }
+    await this.audioRecognition.start({
+      sttPipeline,
+      inputStartedAt: sttInputStartedAt,
+      turnDetectorStream,
+    });
+    if (reuseResources) {
+      // ownership transferred to the new AudioRecognition
+      reuseResources.sttPipeline = undefined;
       reuseResources.sttInputStartedAt = undefined;
-    } else {
-      await this.audioRecognition.start();
+      reuseResources.turnDetectorStream = undefined;
     }
 
     this.started = true;
@@ -592,6 +643,15 @@ export class AgentActivity implements RecognitionHooks {
         resources.sttInputStartedAt = this.audioRecognition.inputStartedAt;
       }
 
+      // reuse the turn detector stream during a handoff whenever we can
+      if (
+        this.audioRecognition &&
+        this._resolvedTurnDetection instanceof BaseStreamingTurnDetector &&
+        this._resolvedTurnDetection === newActivity._resolvedTurnDetection
+      ) {
+        resources.turnDetectorStream = this.audioRecognition.detachTurnDetector();
+      }
+
       // rt session
       if (
         this.realtimeSession &&
@@ -655,6 +715,18 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.vad || this.agentSession.vad;
   }
 
+  /**
+   * Whether the VAD this activity ends up using is the one `AgentSession`
+   * provisioned on its own rather than one the caller passed in.
+   *
+   * A VAD set on the agent always counts as user-supplied, so the session's
+   * `_usingDefaultVad` marker is only consulted when the agent has none; a
+   * user-supplied VAD yields `false` regardless of which model it wraps.
+   */
+  get usingDefaultVad(): boolean {
+    return this.agent.vad === undefined && this.agentSession._usingDefaultVad;
+  }
+
   get stt(): STT | undefined {
     return this.agent.stt || this.agentSession.stt;
   }
@@ -980,7 +1052,13 @@ export class AgentActivity implements RecognitionHooks {
   // -- Metrics and errors --
 
   private onMetricsCollected = (
-    ev: STTMetrics | TTSMetrics | VADMetrics | LLMMetrics | RealtimeModelMetrics,
+    ev:
+      | STTMetrics
+      | TTSMetrics
+      | VADMetrics
+      | LLMMetrics
+      | RealtimeModelMetrics
+      | EOTInferenceMetrics,
   ) => {
     const speechHandle = speechHandleStorage.getStore();
     if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
@@ -1032,7 +1110,11 @@ export class AgentActivity implements RecognitionHooks {
   onInputSpeechStarted(_ev: InputSpeechStartedEvent): void {
     this.logger.info('onInputSpeechStarted');
 
-    if (!this.vad) {
+    // Bundled-default VAD is treated as absent here so the realtime
+    // session's own server-side turn detection drives the user-state /
+    // overlap-detection update, identical to a session that didn't
+    // configure any VAD.
+    if (!this.vad || this.usingDefaultVad) {
       this.agentSession._updateUserState('speaking');
       if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
         this.audioRecognition.onStartOfOverlapSpeech(
@@ -1058,7 +1140,7 @@ export class AgentActivity implements RecognitionHooks {
   onInputSpeechStopped(ev: InputSpeechStoppedEvent): void {
     this.logger.info(ev, 'onInputSpeechStopped');
 
-    if (!this.vad) {
+    if (!this.vad || this.usingDefaultVad) {
       if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
         this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
       }
@@ -1402,6 +1484,12 @@ export class AgentActivity implements RecognitionHooks {
     this.cancelSpeechPauseTask = this.cancelSpeechPause();
   }
 
+  /** Recognition hook: re-emits each audio EOT prediction as a session-level
+   * `EotPrediction` event so listeners (e.g. remote-session forwarders) see it. */
+  onEotPrediction(ev: EotPredictionEvent): void {
+    this.agentSession.emit(AgentSessionEventTypes.EotPrediction, ev);
+  }
+
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
     const preemptiveOpts = this.agentSession.sessionOptions.turnHandling.preemptiveGeneration;
     if (
@@ -3916,6 +4004,29 @@ export class AgentActivity implements RecognitionHooks {
     }
   }
 
+  /** Drops a streaming turn detector (with a warning) when it has no VAD or the
+   * RealtimeModel already does server-side turn detection; else a pass-through. */
+  private _resolveTurnDetection(
+    turnDetection: TurnDetectionMode | undefined,
+  ): TurnDetectionMode | undefined {
+    if (!(turnDetection instanceof BaseStreamingTurnDetector)) {
+      return turnDetection;
+    }
+    if (this.vad === undefined) {
+      this.logger.warn(
+        'TurnDetector requires a VAD model. Pass vad=inference.VAD() to AgentSession/Agent or turnDetection=null to disable the default TurnDetector',
+      );
+      return undefined;
+    }
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      this.logger.warn(
+        'turnDetection is a TurnDetector, but the LLM is a RealtimeModel with server-side turn detection enabled, ignoring the turnDetection setting',
+      );
+      return undefined;
+    }
+    return turnDetection;
+  }
+
   private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
     const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
     const sessionInterruptionDetection = this.agentSession.interruptionDetection;
@@ -3924,7 +4035,7 @@ export class AgentActivity implements RecognitionHooks {
         this.stt &&
         this.stt.capabilities.alignedTranscript &&
         this.stt.capabilities.streaming &&
-        this.vad &&
+        this.vad !== undefined &&
         this.turnDetection !== 'manual' &&
         this.turnDetection !== 'realtime_llm' &&
         !(this.llm instanceof RealtimeModel)
@@ -4182,6 +4293,10 @@ export class AgentActivity implements RecognitionHooks {
       this.vad.off('metrics_collected', this.onMetricsCollected);
     }
 
+    if (this._resolvedTurnDetection instanceof BaseStreamingTurnDetector) {
+      this._resolvedTurnDetection.off('metrics_collected', this.onMetricsCollected);
+    }
+
     this.detachAudioInput();
     this.realtimeSpans?.clear();
     await this.realtimeSession?.close();
diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
index 3cade1f2f..09c83a088 100644
--- a/agents/src/voice/agent_session.ts
+++ b/agents/src/voice/agent_session.ts
@@ -12,10 +12,13 @@ import { context as otelContext, trace } from '@opentelemetry/api';
 import { EventEmitter } from 'node:events';
 import type { ReadableStream } from 'node:stream/web';
 import type { z } from 'zod';
+import type { BaseStreamingTurnDetector } from '../inference/eot/base.js';
 import {
   LLM as InferenceLLM,
   STT as InferenceSTT,
   TTS as InferenceTTS,
+  TurnDetector as InferenceTurnDetector,
+  VAD as InferenceVAD,
   type LLMModels,
   type STTModelString,
   type TTSModelString,
@@ -63,6 +66,7 @@ import {
   type CloseEvent,
   CloseReason,
   type ConversationItemAddedEvent,
+  type EotPredictionEvent,
   type ErrorEvent,
   type FunctionToolsExecutedEvent,
   type MetricsCollectedEvent,
@@ -199,7 +203,13 @@ export type VoiceOptions = {
   maxEndpointingDelay?: number;
 };
 
-export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
+export type TurnDetectionMode =
+  | 'stt'
+  | 'vad'
+  | 'realtime_llm'
+  | 'manual'
+  | _TurnDetector
+  | BaseStreamingTurnDetector;
 
 export type AgentSessionCallbacks = {
   [AgentSessionEventTypes.UserInputTranscribed]: (ev: UserInputTranscribedEvent) => void;
@@ -215,11 +225,18 @@ export type AgentSessionCallbacks = {
   [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void;
   [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void;
   [AgentSessionEventTypes.OverlappingSpeech]: (ev: OverlappingSpeechEvent) => void;
+  [AgentSessionEventTypes.EotPrediction]: (ev: EotPredictionEvent) => void;
 };
 
 export type AgentSessionOptions<UserData = UnknownUserData> = {
   stt?: STT | STTModelString;
-  vad?: VAD;
+  /**
+   * Voice Activity Detection. When omitted, `AgentSession` auto-provisions a
+   * bundled `inference.VAD({ model: 'silero' })` and marks it as the default
+   * (so sites that previously distinguished "user supplied a VAD" continue
+   * to treat the bundled one as absent). Pass `null` to opt out entirely.
+   */
+  vad?: VAD | null;
   llm?: LLM | RealtimeModel | LLMModels;
   tts?: TTS | TTSModelString;
   userData?: UserData;
@@ -362,6 +379,15 @@ export class AgentSession<
 
   private _interruptionDetection?: InterruptionOptions['mode'];
 
+  /**
+   * True iff this session auto-provisioned the bundled silero VAD because the
+   * caller passed no `vad=`. Set once in the constructor; immutable from then
+   * on. Read it via `AgentActivity.usingDefaultVad` from voice-pipeline code.
+   *
+   * @internal
+   */
+  _usingDefaultVad: boolean = false;
+
   /** @internal */
   _usageCollector: ModelUsageCollector = new ModelUsageCollector();
 
@@ -438,7 +464,19 @@ export class AgentSession<
         DEFAULT_SESSION_CONNECT_OPTIONS.maxUnrecoverableErrors,
     };
 
-    this.vad = vad;
+    // VAD: undefined → auto-provision bundled inference.VAD (silero). The
+    // `_usingDefaultVad` marker is the single source of truth for "this VAD
+    // was framework-provisioned" — code paths that should ignore a default
+    // VAD read it via `AgentActivity.usingDefaultVad`. null → leave VAD off
+    // entirely. Otherwise use what the caller supplied.
+    this._usingDefaultVad = vad === undefined;
+    if (vad === undefined) {
+      this.vad = new InferenceVAD({ model: 'silero' });
+    } else if (vad === null) {
+      this.vad = undefined;
+    } else {
+      this.vad = vad;
+    }
 
     if (typeof stt === 'string') {
       this.stt = InferenceSTT.fromModelString(stt);
@@ -458,7 +496,16 @@ export class AgentSession<
       this.tts = tts;
     }
 
-    this.turnDetection = resolvedSessionOptions.turnHandling.turnDetection;
+    // Default turn_detection: when the caller didn't pin a mode or supply a
+    // detector instance (`undefined`/not-given), fall back to a fresh
+    // inference.TurnDetector so every session ships with audio EOT
+    // out of the box. An explicit `null` opts out entirely — no detector is
+    // built (mirrors Python `None` vs `NOT_GIVEN`).
+    const configuredTurnDetection = resolvedSessionOptions.turnHandling.turnDetection;
+    this.turnDetection =
+      configuredTurnDetection === null
+        ? undefined
+        : configuredTurnDetection ?? new InferenceTurnDetector();
     this._interruptionDetection = resolvedSessionOptions.turnHandling.interruption?.mode;
     this._userData = userData;
 
diff --git a/agents/src/voice/agent_session_default_vad.test.ts b/agents/src/voice/agent_session_default_vad.test.ts
new file mode 100644
index 000000000..c48025805
--- /dev/null
+++ b/agents/src/voice/agent_session_default_vad.test.ts
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Tests for the bundled-default VAD and turn-detection behavior on `AgentSession`.
+ *
+ * Port of six test additions on `tests/test_agent_session.py`:
+ *
+ * - `test_default_vad_is_auto_provisioned`
+ * - `test_explicit_vad_none_opts_out`
+ * - `test_user_supplied_vad_keeps_using_default_false`
+ * - `test_default_turn_detection_builds_default_eot`
+ * - `test_turn_detection_none_opts_out`
+ * - `test_user_supplied_turn_detector_passes_through`
+ */
+import { describe, expect, it } from 'vitest';
+import { TurnDetector } from '../inference/eot/detector.js';
+import type { VADStream } from '../vad.js';
+import { VAD as BaseVAD } from '../vad.js';
+import { AgentSession } from './agent_session.js';
+
+class FakeVAD extends BaseVAD {
+  label = 'FakeVAD';
+  constructor() {
+    super({ updateInterval: 32 });
+  }
+  stream(): VADStream {
+    throw new Error('not used in this test'); // tests only check which VAD the session holds
+  }
+}
+
+describe('AgentSession default VAD', () => {
+  it('auto-provisions a default VAD when none passed', async () => {
+    const session = new AgentSession(); // `vad` left undefined
+    try {
+      expect(session.vad).toBeDefined();
+      expect(session._usingDefaultVad).toBe(true);
+    } finally {
+      await session.close().catch(() => {}); // best-effort teardown; close errors are ignored
+    }
+  });
+
+  it('explicit `vad: null` opts out', async () => {
+    const session = new AgentSession({ vad: null }); // explicit opt-out, distinct from omitting `vad`
+    try {
+      expect(session.vad).toBeUndefined();
+      expect(session._usingDefaultVad).toBe(false);
+    } finally {
+      await session.close().catch(() => {});
+    }
+  });
+
+  it('user-supplied VAD keeps _usingDefaultVad false', async () => {
+    const userVad = new FakeVAD();
+    const session = new AgentSession({ vad: userVad });
+    try {
+      expect(session.vad).toBe(userVad); // same instance, not a replacement
+      expect(session._usingDefaultVad).toBe(false);
+    } finally {
+      await session.close().catch(() => {});
+    }
+  });
+});
+
+describe('AgentSession default turn detection', () => {
+  it('auto-provisions a default TurnDetector when none given', async () => {
+    const session = new AgentSession(); // `turnDetection` left undefined
+    try {
+      expect(session.turnDetection).toBeInstanceOf(TurnDetector);
+    } finally {
+      await session.close().catch(() => {}); // best-effort teardown; close errors are ignored
+    }
+  });
+
+  it('explicit `turnDetection: null` opts out (no default detector built)', async () => {
+    // `null` is the explicit opt-out, distinct from `undefined` (not given);
+    // mirrors Python `turn_detection=None`.
+    const session = new AgentSession({ turnHandling: { turnDetection: null } });
+    try {
+      expect(session.turnDetection).toBeUndefined();
+    } finally {
+      await session.close().catch(() => {});
+    }
+  });
+
+  it('passes a user-supplied turn detector through unchanged', async () => {
+    const userDetector = new TurnDetector({ version: 'v1-mini' });
+    const session = new AgentSession({ turnHandling: { turnDetection: userDetector } });
+    try {
+      expect(session.turnDetection).toBe(userDetector); // same instance, not a replacement
+    } finally {
+      await session.close().catch(() => {});
+    }
+  });
+});
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts
index 8933173ae..d94d84a84 100644
--- a/agents/src/voice/audio_recognition.ts
+++ b/agents/src/voice/audio_recognition.ts
@@ -14,6 +14,12 @@ import {
 import type { ReadableStream, WritableStreamDefaultWriter } from 'node:stream/web';
 import { TransformStream } from 'node:stream/web';
 import { isAPIError } from '../_exceptions.js';
+import {
+  BaseStreamingTurnDetector,
+  BaseStreamingTurnDetectorStream,
+  MIN_SILENCE_DURATION_MS,
+  type TurnDetectionEvent,
+} from '../inference/eot/base.js';
 import { apiConnectDefaults, intervalForRetry } from '../inference/interruption/defaults.js';
 import { InterruptionDetectionError } from '../inference/interruption/errors.js';
 import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
@@ -32,10 +38,16 @@ import { type StreamChannel, createStreamChannel } from '../stream/stream_channe
 import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
 import { traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
-import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js';
+import type { Future } from '../utils.js';
+import { Event, Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js';
 import { type VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';
 import type { TurnDetectionMode } from './agent_session.js';
-import { type UserTurnExceededEvent, createUserTurnExceededEvent } from './events.js';
+import {
+  type EotPredictionEvent,
+  type UserTurnExceededEvent,
+  createEotPredictionEvent,
+  createUserTurnExceededEvent,
+} from './events.js';
 import type { STTNode } from './io.js';
 import {
   type BaseEndpointing,
@@ -86,6 +98,7 @@ export interface RecognitionHooks {
   onInterimTranscript: (ev: SpeechEvent, speaking: boolean | undefined) => void;
   onFinalTranscript: (ev: SpeechEvent, speaking: boolean | undefined) => void;
   onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
+  onEotPrediction: (ev: EotPredictionEvent) => void;
   onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
   onUserTurnExceeded: (ev: UserTurnExceededEvent) => void;
 
@@ -98,6 +111,42 @@ interface UserTurnTracker {
   startedAt?: number;
 }
 
+/**
+ * Edge-triggered event with an abort-aware `waitOnce` helper.
+ *
+ * Used by the audio-EOT bounce race: the bounce task awaits either the
+ * endpointing delay or a fresh "user started speaking" signal. We extend
+ * the base `Event` rather than reimplementing it because the base already
+ * handles the resolver / waiter bookkeeping; this subclass just layers a
+ * `waitOnce(signal)` that rejects on cancel so the race can tear down
+ * cleanly when the parent task is aborted.
+ */
+class SpeakingEvent extends Event {
+  /**
+   * Resolves on the next `set()`; returns immediately if the event is already set.
+   *
+   * @param signal - Cancels the wait. If it aborts first (or is already aborted) the
+   *   promise rejects with `signal.reason`, or a generic `Error` when no reason is set.
+   */
+  async waitOnce(signal: AbortSignal): Promise<void> {
+    if (this.isSet) return;
+    // Bail out before `wait()` so an already-aborted signal parks no waiter.
+    if (signal.aborted) throw signal.reason ?? new Error('aborted');
+    let abortListener: (() => void) | undefined;
+    try {
+      await Promise.race([
+        this.wait().then(() => undefined),
+        new Promise<never>((_resolve, reject) => {
+          abortListener = () => reject(signal.reason ?? new Error('aborted'));
+          signal.addEventListener('abort', abortListener, { once: true });
+        }),
+      ]);
+    } finally {
+      if (abortListener !== undefined) signal.removeEventListener('abort', abortListener);
+    }
+  }
+}
+
 export class STTPipeline {
   static readonly PUMP_TASK_CANCEL_TIMEOUT = 5000;
 
@@ -148,7 +197,11 @@ export interface _TurnDetector {
   readonly provider: string;
   unlikelyThreshold: (language?: LanguageCode) => Promise<number | undefined>;
   supportsLanguage: (language?: LanguageCode) => Promise<boolean>;
-  predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
+  /**
+   * @param timeoutMs - Optional inference wait budget in milliseconds. The audio
+   *   EOT detector honors it; text-based detectors currently ignore it.
+   */
+  predictEndOfTurn(chatCtx: ChatContext, timeoutMs?: number): Promise<number>;
 }
 
 export interface AudioRecognitionOptions {
@@ -158,8 +211,17 @@ export interface AudioRecognitionOptions {
   stt?: STTNode;
   /** Voice activity detection. */
   vad?: VAD;
-  /** Turn detector for end-of-turn prediction. */
-  turnDetector?: _TurnDetector;
+  /**
+   * True iff the wired VAD was auto-provisioned by `AgentSession` rather than
+   * supplied by the caller. Read at every "is VAD configured?" call site so
+   * a framework-default VAD behaves like no VAD for downstream eligibility
+   * decisions (e.g. STT-hook `speaking=` payload).
+   */
+  usingDefaultVad?: boolean;
+  /** Turn detector for end-of-turn prediction. Accepts text-based detectors
+   * via `_TurnDetector` (e.g. plugins/livekit) or audio-based detectors via
+   * `BaseStreamingTurnDetector` (e.g. `inference.TurnDetector`). */
+  turnDetector?: _TurnDetector | BaseStreamingTurnDetector;
   /** Turn detection mode. */
   turnDetectionMode?: TurnDetectionMode;
   interruptionDetection?: AdaptiveInterruptionDetector;
@@ -206,7 +268,42 @@ export class AudioRecognition {
   private stt?: STTNode;
   private sttPipeline?: STTPipeline;
   private vad?: VAD;
-  private turnDetector?: _TurnDetector;
+  private usingDefaultVad: boolean;
+  private turnDetector?: _TurnDetector | BaseStreamingTurnDetector;
+  private turnDetectorStream?: BaseStreamingTurnDetectorStream;
+  /**
+   * Future for the in-flight audio-EOT inference request. Recognition owns the
+   * request lifecycle: it starts a request on the VAD silence tick, holds the
+   * future here, awaits it (with the endpointing delay) in the eou bounce, and
+   * clears it on turn boundaries / superseding speech.
+   */
+  private turnDetectorPredictionFut?: Future<TurnDetectionEvent>;
+  /**
+   * True between a turn flush (commit / clearUserTurn) and the next VAD
+   * start-of-speech. While set, a late stt final won't start a fresh request;
+   * the eou bounce short-circuits via `onMissingEotPrediction`.
+   */
+  private turnDetectorFlushed = false;
+  /** Warn once per recognition when the eou bounce runs after a flush. */
+  private turnDetectorLatePredictionWarned = false;
+  /**
+   * The last `TurnDetectionEvent` we forwarded via `onEotPrediction`, kept
+   * by reference to dedupe: both EOU triggers in a turn read the same
+   * resolved prediction future, but the event should fire once per request.
+   */
+  private lastEmittedEotPrediction?: TurnDetectionEvent;
+  /**
+   * Edge-triggered "user is speaking" event used by the audio-EOT bounce
+   * race. Set on VAD `START_OF_SPEECH` (and on any `INFERENCE_DONE` with
+   * accumulated speech), cleared on `END_OF_SPEECH`. Mirrors Python
+   * `_user_speaking_event`.
+   *
+   * `Event.set()` is idempotent (re-setting an already-set event resolves
+   * any new waiters immediately); cleared on EOS so subsequent waiters
+   * park until the next utterance.
+   */
+  private userSpeakingEvent = new SpeakingEvent();
+  private warnedTurnDetectorPushFailure = false;
   private turnDetectionMode?: TurnDetectionMode;
   private endpointing: BaseEndpointing;
   private userTurnLimit?: UserTurnLimitOptions;
@@ -287,7 +384,12 @@ export class AudioRecognition {
     this.hooks = opts.recognitionHooks;
     this.stt = opts.stt;
     this.vad = opts.vad;
+    this.usingDefaultVad = opts.usingDefaultVad ?? false;
     this.turnDetector = opts.turnDetector;
+    this.checkVadSilenceRequirement();
+    // The FSM stream is opened on `start()` so callers can hand off the
+    // previous activity's stream (cloud↔local fallback state, in-flight
+    // inference) instead of forcing a cold restart.
     this.turnDetectionMode = opts.turnDetectionMode;
     this.userTurnLimit = opts.userTurnLimit;
     this.endpointing =
@@ -336,6 +438,26 @@ export class AudioRecognition {
       {
         transform: (chunk, controller) => {
           controller.enqueue(chunk);
+          // Fan the same frame into the audio EOT detector stream when
+          // one is attached. The FSM accepts arbitrary-rate input and
+          // resamples internally. `pushAudio` is a no-op when the stream's
+          // internal channel is closed; any actual throw indicates a bug
+          // (e.g. resampler init failure, sample-rate mismatch). Log once
+          // when we hit that path so a regression doesn't silently drop
+          // every audio frame.
+          if (this.turnDetectorStream !== undefined) {
+            try {
+              this.turnDetectorStream.pushAudio(chunk);
+            } catch (err) {
+              if (!this.warnedTurnDetectorPushFailure) {
+                this.warnedTurnDetectorPushFailure = true;
+                this.logger.warn(
+                  { err: err instanceof Error ? err.message : String(err) },
+                  'audio EOT stream pushAudio failed; dropping frames for this turn',
+                );
+              }
+            }
+          }
           if (this.subscriberWriters.length === 0) return;
           for (const writer of this.subscriberWriters) {
             writer.write(chunk).catch(() => {
@@ -414,7 +536,163 @@ export class AudioRecognition {
     }
   }
 
-  async start(options?: { sttPipeline?: STTPipeline; inputStartedAt?: number }) {
+  /** Whether a caller-provided VAD is bound. The default VAD counts as
+   * "no VAD" wherever the code chooses between the VAD signal and the
+   * STT-derived speaking state. */
+  private get hasUserVad(): boolean {
+    return !(this.vad === undefined || this.usingDefaultVad);
+  }
+
+  /**
+   * Swap the active turn detector at runtime. A `BaseStreamingTurnDetector`
+   * gets a fresh per-turn FSM stream once the prior stream has been retired.
+   * @param detector - incoming detector; `undefined` clears the current one.
+   * @param options - `stream`, when set, is adopted as-is (handoff reuse) so
+   *   its per-session cloud→local fallback state survives the handoff.
+   * @throws Error when `detector` is streaming and the VAD silence is too low.
+   */
+  updateTurnDetector(
+    detector: _TurnDetector | BaseStreamingTurnDetector | undefined,
+    options?: { stream?: BaseStreamingTurnDetectorStream },
+  ): void {
+    // Validate the incoming detector before swapping it in. Skip `undefined`:
+    // it would trigger the helper's default and re-check the outgoing detector.
+    if (detector !== undefined) this.checkVadSilenceRequirement(detector);
+    this.turnDetector = detector;
+
+    const reuseStream = options?.stream;
+    // Retire the prior stream before creating the new one. `detach()` frees
+    // the detector's single-stream slot synchronously (so `stream()` below
+    // won't throw if the same detector is reused), while the network teardown
+    // runs in the background.
+    const oldStream = this.turnDetectorStream;
+    if (oldStream !== undefined && oldStream !== reuseStream) {
+      oldStream.detach();
+      void oldStream.aclose().catch(() => undefined);
+    }
+    // Cross-detector state should not leak: the cached speaking signal
+    // from the prior detector's turn must not race the new detector's
+    // first bounce.
+    this.userSpeakingEvent.clear();
+    const newStream =
+      reuseStream !== undefined
+        ? reuseStream
+        : detector instanceof BaseStreamingTurnDetector
+          ? detector.stream()
+          : undefined;
+    // A different stream means a fresh request lifecycle: drop any held
+    // prediction future and re-arm so the adopting recognition starts its own
+    // request on the next VAD event.
+    if (this.turnDetectorStream !== newStream) {
+      this.turnDetectorPredictionFut = undefined;
+      this.turnDetectorFlushed = false;
+    }
+    this.turnDetectorStream = newStream;
+  }
+
+  /**
+   * Hand the turn detector stream over to another AudioRecognition.
+   *
+   * The live stream is returned as-is, without being closed (its transport
+   * run loop keeps going). It also stays attached to its detector and keeps
+   * occupying the detector's single-stream slot, so the receiving
+   * AudioRecognition has to adopt it through `start({ turnDetectorStream })`
+   * instead of opening a second stream.
+   */
+  detachTurnDetector(): BaseStreamingTurnDetectorStream | undefined {
+    // Any request that survives the handoff is superseded: the adopting
+    // recognition issues a fresh one on its next VAD event.
+    this.turnDetectorPredictionFut = undefined;
+    const handedOff = this.turnDetectorStream;
+    this.turnDetectorStream = undefined;
+    return handedOff;
+  }
+
+  /**
+   * Guard for the audio EOT detector, which needs a longer silence window
+   * than common VAD defaults. The VAD's setting is never changed here: when
+   * the bound VAD reports a `minSilenceDuration` under the floor, an Error is
+   * thrown so the caller fixes the config. Defaults to the current detector.
+   */
+  private checkVadSilenceRequirement(
+    detector: _TurnDetector | BaseStreamingTurnDetector | undefined = this.turnDetector,
+  ): void {
+    // Only streaming (audio) detectors paired with a bound VAD are checked.
+    const vad = this.vad;
+    if (vad === undefined || !(detector instanceof BaseStreamingTurnDetector)) return;
+    // A `null` value is skipped (presumably a VAD without this knob).
+    const configuredMs = vad.minSilenceDuration;
+    if (configuredMs === null) return;
+
+    const floorMs = MIN_SILENCE_DURATION_MS + 50;
+    if (configuredMs < floorMs) {
+      throw new Error(
+        `vad minSilenceDuration=${configuredMs}ms is too low for the TurnDetector. ` +
+          `Raise the VAD's minSilenceDuration to at least ${floorMs}ms.`,
+      );
+    }
+  }
+
+  /**
+   * Speaking-guard wrapper for the bounce-EOU task, mirroring Python's
+   * `_bounce_eou_task_with_speaking_guard`. When a `BaseStreamingTurnDetector`
+   * is active, the bounce task races against the `userSpeakingEvent`:
+   *
+   * - if the user is already speaking, skip the EOU outright;
+   * - if the user starts speaking during the endpointing delay (e.g.
+   *   the LLM hadn't returned yet but the user added another phrase),
+   *   abort the inner bounce so the next turn drives the decision.
+   *
+   * VAD `START_OF_SPEECH` also calls `bounceEOUTask?.cancel()`, but the
+   * cancel path only races VAD sessions. STT-only audio-EOT setups need
+   * the explicit event-driven race here.
+   */
+  private async bounceEOUTaskWithSpeakingGuard(
+    controller: AbortController,
+    inner: (innerController: AbortController) => Promise<void>,
+    context: {
+      lastSpeakingTime: number | undefined;
+      lastFinalTranscriptTime: number;
+      speechStartTime: number | undefined;
+    },
+  ): Promise<void> {
+    if (this.speaking) {
+      this.logger.debug(context, 'user is still speaking, skipping end of turn task');
+      return;
+    }
+    const innerController = new AbortController();
+    // Forward an abort of the outer controller to the inner task's controller.
+    const onOuterAbort = () => innerController.abort();
+    controller.signal.addEventListener('abort', onOuterAbort, { once: true });
+
+    let speakingWon = false;
+    try {
+      const innerPromise = inner(innerController);
+      // When the speaking branch wins, the race settles before `finally` aborts
+      // innerController, leaving innerPromise's rejection uncaught without this.
+      void innerPromise.catch(() => {});
+      await Promise.race([
+        innerPromise,
+        this.userSpeakingEvent.waitOnce(controller.signal).then(() => {
+          speakingWon = true;
+        }),
+      ]);
+      if (speakingWon) {
+        this.logger.debug(context, 'user spoke during endpointing, cancelling end of turn task');
+      }
+    } finally {
+      controller.signal.removeEventListener('abort', onOuterAbort);
+      // If the speaking-event branch won (or the outer was aborted), tear
+      // down the inner bounce so it doesn't keep awaiting the delay.
+      innerController.abort();
+    }
+  }
+
+  async start(options?: {
+    sttPipeline?: STTPipeline;
+    inputStartedAt?: number;
+    turnDetectorStream?: BaseStreamingTurnDetectorStream;
+  }) {
     this.startSttTasks(options?.sttPipeline, options?.inputStartedAt);
 
     this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));
@@ -428,6 +706,14 @@ export class AudioRecognition {
     this.interruptionTask.result.catch((err) => {
       this.logger.error(`Error running interruption task: ${err}`);
     });
+
+    // Open (or adopt) the audio EOT detector stream now that the activity is
+    // running. We only call `updateTurnDetector` for BaseStreamingTurnDetector /
+    // undefined detectors — plugin-based `_TurnDetector` instances are
+    // text-only and don't carry a stream.
+    if (this.turnDetector instanceof BaseStreamingTurnDetector || this.turnDetector === undefined) {
+      this.updateTurnDetector(this.turnDetector, { stream: options?.turnDetectorStream });
+    }
   }
 
   async stop() {
@@ -435,6 +721,11 @@ export class AudioRecognition {
     await this.sttForwardTask?.cancelAndWait();
     await this.vadTask?.cancelAndWait();
     await this.interruptionTask?.cancelAndWait();
+    if (this.turnDetectorStream !== undefined) {
+      const stream = this.turnDetectorStream;
+      this.turnDetectorStream = undefined;
+      await stream.aclose().catch(() => undefined);
+    }
   }
 
   async disableInterruptionDetection(): Promise<void> {
@@ -850,7 +1141,7 @@ export class AudioRecognition {
 
         this.hooks.onFinalTranscript(
           ev,
-          this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
+          this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
         );
 
         this.logger.debug(
@@ -869,7 +1160,7 @@ export class AudioRecognition {
         this.audioInterimTranscript = '';
         this.audioPreflightTranscript = '';
 
-        if (!this.vad || this.lastSpeakingTime === undefined) {
+        if (!this.hasUserVad || this.lastSpeakingTime === undefined) {
           // vad disabled or missed a speech, use stt timestamp
           this.lastSpeakingTime = sttLastSpeakingTime;
         }
@@ -896,14 +1187,14 @@ export class AudioRecognition {
           if (!this.speaking) {
             const chatCtx = this.hooks.retrieveChatCtx();
             this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
-            this.runEOUDetection(chatCtx);
+            this.runEOUDetection(chatCtx, 'stt');
           }
         }
         break;
       case SpeechEventType.PREFLIGHT_TRANSCRIPT:
         this.hooks.onInterimTranscript(
           ev,
-          this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
+          this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
         );
         const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
         const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
@@ -936,7 +1227,7 @@ export class AudioRecognition {
           `${this.audioTranscript} ${preflightTranscript}`.trimStart();
         this.audioInterimTranscript = preflightTranscript;
 
-        if (!this.vad || this.lastSpeakingTime === undefined) {
+        if (!this.hasUserVad || this.lastSpeakingTime === undefined) {
           // vad disabled or missed a speech, use stt timestamp
           this.lastSpeakingTime = sttLastSpeakingTime;
         }
@@ -966,7 +1257,7 @@ export class AudioRecognition {
         this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
         this.hooks.onInterimTranscript(
           ev,
-          this.vad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
+          this.hasUserVad || this.turnDetectionMode === 'stt' ? this.speaking : undefined,
         );
         this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
         break;
@@ -996,6 +1287,10 @@ export class AudioRecognition {
         }
         this.speaking = true;
         this.lastSpeakingTime = sttLastSpeakingTime;
+        // STT-only sessions never see VAD events; surface the speaking
+        // signal here so the audio-EOT bounce race can still abort on a
+        // mid-window fresh utterance.
+        this.userSpeakingEvent.set();
 
         this.bounceEOUTask?.cancel();
         break;
@@ -1032,7 +1327,9 @@ export class AudioRecognition {
         // and user state won't be updated until a new VAD SOS is received.
         // Reset VAD so that incorrect end of turn from STT can be corrected by VAD interruption.
         // If user is still speaking (an immediate VAD SOS will interrupt the agent).
-        if (this.vad && this.vadSpeechStarted) {
+        // Default-bundled VAD is treated as absent here — only user-supplied VADs
+        // are reset/flushed, matching the matrix in PR_DESCRIPTION.
+        if (this.hasUserVad && this.vadSpeechStarted) {
           if (this.vadStream) {
             this.vadStream.flush();
           } else {
@@ -1048,8 +1345,9 @@ export class AudioRecognition {
           );
         }
         this.speaking = false;
+        this.userSpeakingEvent.clear();
         this.userTurnCommitted = true;
-        if (!this.vad || this.lastSpeakingTime === undefined) {
+        if (!this.hasUserVad || this.lastSpeakingTime === undefined) {
           // vad disabled or missed a speech, use stt timestamp
           this.lastSpeakingTime = sttLastSpeakingTime;
         }
@@ -1057,7 +1355,7 @@ export class AudioRecognition {
         if (!this.speaking) {
           const chatCtx = this.hooks.retrieveChatCtx();
           this.logger.debug('running EOU detection on stt END_OF_SPEECH');
-          this.runEOUDetection(chatCtx);
+          this.runEOUDetection(chatCtx, 'stt');
         }
     }
   }
@@ -1077,7 +1375,25 @@ export class AudioRecognition {
     }
   }
 
-  private runEOUDetection(chatCtx: ChatContext) {
+  /** Logs why no held EOT prediction was available (one warn, then debug). */
+  private onMissingEotPrediction(): void {
+    if (!this.turnDetectorFlushed) {
+      this.logger.debug('no eot inference request in flight, skipping eot prediction');
+    } else if (this.turnDetectorLatePredictionWarned) {
+      this.logger.debug('stt transcript arrived after a turn flush, skipping eot prediction');
+    } else {
+      // First post-flush miss: latch the flag so later ones log at debug.
+      this.turnDetectorLatePredictionWarned = true;
+      this.logger.warn(
+        'eou detection ran after the audio eot turn was already flushed ' +
+          '(likely a late stt final). consider raising `minDelay` in the ' +
+          'endpointing options to accommodate slow stt. subsequent ' +
+          'occurrences will log at debug level.',
+      );
+    }
+  }
+
+  private runEOUDetection(chatCtx: ChatContext, trigger: 'vad' | 'stt' | 'manual' = 'vad') {
     this.logger.debug(
       {
         stt: this.stt,
@@ -1094,11 +1410,32 @@ export class AudioRecognition {
     }
 
     chatCtx = chatCtx.copy();
-    chatCtx.addMessage({ role: 'user', content: this.audioTranscript });
+    if (this.audioTranscript) {
+      chatCtx.addMessage({ role: 'user', content: this.audioTranscript });
+    }
 
-    const turnDetector =
-      // disable EOU model if manual turn detection enabled
-      this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
+    // Pick the right detector:
+    //  - manual mode: no detector (turn boundary decided externally)
+    //  - audio EOT detector: prefer the per-turn stream (it caches the
+    //    prediction for the current inference window so the bounce task
+    //    can short-circuit on cache)
+    //  - text-based detector: only run when we have a transcript to score
+    const hasAudioDetector = this.turnDetector instanceof BaseStreamingTurnDetector;
+    const useDetector =
+      this.turnDetectionMode !== 'manual' && (this.audioTranscript || hasAudioDetector);
+    // The unified type only covers the predict surface; the audio
+    // detector's per-turn stream stands in for the parent when one is
+    // attached so the cached prediction is available.
+    let turnDetector: _TurnDetector | BaseStreamingTurnDetectorStream | undefined;
+    if (!useDetector) {
+      turnDetector = undefined;
+    } else if (hasAudioDetector) {
+      turnDetector = this.turnDetectorStream;
+    } else {
+      // text-based detector — `this.turnDetector` cannot be the audio
+      // base class here, because `hasAudioDetector` already screened it.
+      turnDetector = this.turnDetector as _TurnDetector | undefined;
+    }
 
     const bounceEOUTask =
       (
@@ -1113,47 +1450,155 @@ export class AudioRecognition {
         const userTurnCtx = this.userTurnContext(userTurnSpan);
 
         if (turnDetector) {
-          await tracer.startActiveSpan(
-            async (span) => {
-              this.logger.debug('Running turn detector model');
+          if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
+            // Unsupported language: produce no span and emit no prediction event.
+            this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
+          } else {
+            await tracer.startActiveSpan(
+              async (span) => {
+                this.logger.debug('Running turn detector model');
+
+                // undefined => the prediction never resolved (e.g. timed out
+                // or inference threw); gates the span attributes and the emit
+                // below.
+                let endOfTurnProbability: number | undefined;
+                let unlikelyThreshold: number | undefined;
+                // True when the held future was already resolved when this
+                // bounce started — i.e. the prediction was served from the
+                // request the silence tick warmed, not awaited fresh.
+                let fromCache = false;
+                // The resolved prediction event for this turn, shared by
+                // reference across both EOU triggers (vad + stt final) so the
+                // emit can dedupe.
+                let predictionEvent: TurnDetectionEvent | undefined;
+
+                if (turnDetector instanceof BaseStreamingTurnDetectorStream) {
+                  const fut = this.turnDetectorPredictionFut;
+                  if (fut === undefined) {
+                    this.onMissingEotPrediction();
+                  } else {
+                    fromCache = fut.done;
+                    // Await the held future against the endpointing delay.
+                    let timeoutId: ReturnType<typeof setTimeout> | undefined;
+                    const winner = await Promise.race([
+                      fut.await.then((ev) => ({ kind: 'value', ev }) as const),
+                      new Promise<{ kind: 'timeout' }>((resolve) => {
+                        timeoutId = setTimeout(
+                          () => resolve({ kind: 'timeout' }),
+                          endpointingDelay,
+                        );
+                      }),
+                    ]);
+                    if (timeoutId !== undefined) clearTimeout(timeoutId);
+
+                    // A newer trigger calls `bounceEOUTask?.cancel()`. A JS abort
+                    // does NOT interrupt the await above, so bail here before
+                    // touching shared state so the superseded bounce doesn't
+                    // clobber a freshly-armed future or double-emit.
+                    if (controller.signal.aborted) return;
+
+                    if (winner.kind === 'value') {
+                      predictionEvent = winner.ev;
+                      endOfTurnProbability = predictionEvent.endOfTurnProbability;
+                      unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
+                    } else {
+                      this.logger.warn(
+                        { timeoutMs: endpointingDelay },
+                        'eot prediction timed out, committing without a prediction',
+                      );
+                      turnDetector.cancelInference({ timedOut: true });
+                      this.turnDetectorPredictionFut = undefined;
+                    }
+                  }
+                } else {
+                  try {
+                    endOfTurnProbability = await turnDetector.predictEndOfTurn(
+                      chatCtx,
+                      endpointingDelay,
+                    );
+                    unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
+                  } catch (error) {
+                    this.logger.error(error, 'Error predicting end of turn');
+                  }
+                  // See the streaming-branch note: bail if a newer trigger
+                  // superseded this bounce while it awaited.
+                  if (controller.signal.aborted) return;
+                }
 
-              let endOfTurnProbability = 0.0;
-              let unlikelyThreshold: number | undefined;
+                if (
+                  endOfTurnProbability !== undefined &&
+                  unlikelyThreshold !== undefined &&
+                  endOfTurnProbability < unlikelyThreshold
+                ) {
+                  endpointingDelay = this.endpointing.maxDelay;
+                }
 
-              if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
-                this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
-              } else {
-                try {
-                  endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
-                  unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
+                this.logger.debug(
+                  {
+                    endOfTurnProbability,
+                    unlikelyThreshold,
+                    endpointingDelay,
+                    language: this.lastLanguage,
+                    trigger,
+                    fromCache,
+                  },
+                  'eot prediction',
+                );
 
-                  this.logger.debug(
-                    { endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage },
-                    'end of turn probability',
-                  );
+                const prediction = predictionEvent;
 
-                  if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
-                    endpointingDelay = this.endpointing.maxDelay;
-                  }
-                } catch (error) {
-                  this.logger.error(error, 'Error predicting end of turn');
+                span.setAttribute(
+                  traceTypes.ATTR_CHAT_CTX,
+                  JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
+                );
+                if (endOfTurnProbability !== undefined) {
+                  span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
+                }
+                if (unlikelyThreshold !== undefined) {
+                  span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold);
+                }
+                span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
+                span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
+                span.setAttribute(traceTypes.ATTR_EOU_FROM_CACHE, fromCache);
+                span.setAttribute(traceTypes.ATTR_EOU_SOURCE, trigger);
+
+                // Emit once the prediction resolved (a timeout / failed
+                // inference emits nothing). Both EOU triggers in a turn (vad +
+                // stt final) read the same resolved `TurnDetectionEvent`; dedupe
+                // by reference so the event fires once per request. The abort
+                // guard above drops a superseded bounce; this reference check
+                // catches the race where the first bounce completes (and emits)
+                // just before the second trigger fires. Text detectors have no
+                // shared event (`prediction === undefined`), so they always emit.
+                if (
+                  endOfTurnProbability !== undefined &&
+                  unlikelyThreshold !== undefined &&
+                  (prediction === undefined || prediction !== this.lastEmittedEotPrediction)
+                ) {
+                  this.lastEmittedEotPrediction = prediction;
+                  const inferenceDurationMs = prediction?.inferenceDuration ?? 0;
+                  const delayMs =
+                    lastSpeakingTime !== undefined ? Date.now() - lastSpeakingTime : 0;
+                  this.hooks.onEotPrediction(
+                    createEotPredictionEvent({
+                      probability: endOfTurnProbability,
+                      threshold: unlikelyThreshold,
+                      inferenceDurationMs,
+                      delayMs,
+                    }),
+                  );
                 }
-              }
 
-              span.setAttribute(
-                traceTypes.ATTR_CHAT_CTX,
-                JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
-              );
-              span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
-              span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0);
-              span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
-              span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
-            },
-            {
-              name: 'eou_detection',
-              context: userTurnCtx,
-            },
-          );
+                if (prediction?.detectionDelay !== undefined) {
+                  span.setAttribute(traceTypes.ATTR_EOU_DETECTION_DELAY, prediction.detectionDelay);
+                }
+              },
+              {
+                name: 'eou_detection',
+                context: userTurnCtx,
+              },
+            );
+          }
         }
 
         let extraSleep = endpointingDelay;
@@ -1219,6 +1664,15 @@ export class AudioRecognition {
             this.vadSpeechStarted = false;
             this.lastSpeakingTime = undefined;
           }
+
+          // Flush the in-flight request and write the turn-boundary sentinel to
+          // the transport so the next turn's predict starts fresh — the normal
+          // EOU-commit path, mirroring clearUserTurn()'s flush on interrupt.
+          if (this.turnDetectorStream !== undefined) {
+            this.turnDetectorStream.flush('turn committed');
+            this.turnDetectorPredictionFut = undefined;
+            this.turnDetectorFlushed = true;
+          }
         }
 
         this.userTurnCommitted = false;
@@ -1227,9 +1681,24 @@ export class AudioRecognition {
     // cancel any existing EOU task
     this.bounceEOUTask?.cancel();
     // copy the values before awaiting (the values can change)
-    this.bounceEOUTask = Task.from(
-      bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.userTurnStart),
-    );
+    const lastSpeakingTime = this.lastSpeakingTime;
+    const lastFinalTranscriptTime = this.lastFinalTranscriptTime;
+    const speechStartTime = this.userTurnStart;
+
+    // Audio-EOT detectors get a speaking-guard wrapper: if the user starts
+    // speaking again during the endpointing delay, abort the EOU and let
+    // the next turn drive the decision. Text-based detectors (no audio
+    // pipeline) keep the simpler bounce task — they can't race against
+    // mid-window utterances anyway since they don't run during silence.
+    const factory = hasAudioDetector
+      ? (controller: AbortController) =>
+          this.bounceEOUTaskWithSpeakingGuard(
+            controller,
+            bounceEOUTask(lastSpeakingTime, lastFinalTranscriptTime, speechStartTime),
+            { lastSpeakingTime, lastFinalTranscriptTime, speechStartTime },
+          )
+      : bounceEOUTask(lastSpeakingTime, lastFinalTranscriptTime, speechStartTime);
+    this.bounceEOUTask = Task.from(factory);
 
     this.bounceEOUTask.result
       .then(() => {
@@ -1382,6 +1851,13 @@ export class AudioRecognition {
               otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
             }
             this.speaking = true;
+            this.userSpeakingEvent.set();
+
+            // Audio EOT: tear down any in-flight inference for the now-stale
+            // prior window and re-arm so the next silence tick starts fresh.
+            this.turnDetectorStream?.cancelInference();
+            this.turnDetectorPredictionFut = undefined;
+            this.turnDetectorFlushed = false;
 
             // Capture sample rate from the first VAD event if not already set
             if (ev.frames.length > 0 && ev.frames[0]) {
@@ -1401,6 +1877,38 @@ export class AudioRecognition {
                 // ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS).
                 this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech;
               }
+              // Wake any speaking-guard waiter — STT-only sessions don't
+              // see START_OF_SPEECH but do see INFERENCE_DONE-with-speech.
+              this.userSpeakingEvent.set();
+
+              // A short intra-segment pause can resolve a request before VAD
+              // emits END_OF_SPEECH. When speech resumes (without a new SOS),
+              // drop that request so the next pause gets a fresh window.
+              if (this.speaking && this.turnDetectorPredictionFut !== undefined) {
+                this.turnDetectorStream?.cancelInference();
+                this.turnDetectorPredictionFut = undefined;
+              }
+            } else if (!this.speaking) {
+              // A sub-threshold speech spike can set `userSpeakingEvent` without
+              // ever reaching START_OF_SPEECH, so no END_OF_SPEECH will fire to
+              // clear it. Clear it here once speech drops back to zero (confirmed
+              // turns are cleared by EOS).
+              this.userSpeakingEvent.clear();
+            }
+
+            // Audio EOT: start an inference request once we've seen enough
+            // trailing silence (matches Python's `MIN_SILENCE_DURATION_MS`),
+            // but only when no request is already in flight. The silence tick
+            // is the sole request trigger — and it warms even while the agent
+            // is speaking so an overlapping/interrupting turn still gets a
+            // window.
+            if (
+              ev.rawAccumulatedSilence >= MIN_SILENCE_DURATION_MS &&
+              this.speaking &&
+              this.turnDetectorStream !== undefined &&
+              this.turnDetectorPredictionFut === undefined
+            ) {
+              this.turnDetectorPredictionFut = this.turnDetectorStream.predict();
             }
             break;
           case VADEventType.END_OF_SPEECH:
@@ -1421,13 +1929,19 @@ export class AudioRecognition {
             // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
             this.vadSpeechStarted = false;
             this.speaking = false;
+            this.userSpeakingEvent.clear();
+            this.lastSpeakingTime = Date.now() - ev.silenceDuration - ev.inferenceDuration;
+
+            // Audio EOT: END_OF_SPEECH no longer starts a request — the
+            // silence tick owns that. It consumes the already-armed future
+            // (if any) and runs the eou bounce.
 
             if (
               this.vadBaseTurnDetection ||
               (this.turnDetectionMode === 'stt' && this.userTurnCommitted)
             ) {
               const chatCtx = this.hooks.retrieveChatCtx();
-              this.runEOUDetection(chatCtx);
+              this.runEOUDetection(chatCtx, 'vad');
             }
             break;
         }
@@ -1627,6 +2141,19 @@ export class AudioRecognition {
     this.speaking = false;
     this.userTurnCommitted = false;
     this.userTurnTracker = { words: 0, transcript: '' };
+    // Clear the speaking event so a stale `set()` from the just-finished
+    // turn doesn't immediately trip the next speaking-guard race.
+    this.userSpeakingEvent.clear();
+    // New turn → allow the next window's prediction to emit.
+    this.lastEmittedEotPrediction = undefined;
+
+    // Any in-flight request on the audio stream belongs to the turn we
+    // just cleared — flush it so the next predict starts fresh.
+    if (this.turnDetectorStream !== undefined) {
+      this.turnDetectorStream.flush('clear_user_turn');
+      this.turnDetectorPredictionFut = undefined;
+      this.turnDetectorFlushed = true;
+    }
 
     if (this.userTurnSpan?.isRecording()) {
       this.userTurnSpan.end();
@@ -1700,7 +2227,7 @@ export class AudioRecognition {
 
         const chatCtx = this.hooks.retrieveChatCtx();
         this.logger.debug('running EOU detection on commitUserTurn');
-        this.runEOUDetection(chatCtx);
+        this.runEOUDetection(chatCtx, 'manual');
         this.userTurnCommitted = true;
       };
 
@@ -1747,6 +2274,13 @@ export class AudioRecognition {
     await this.vadTask?.cancelAndWait();
     await this.bounceEOUTask?.cancelAndWait();
     await this.interruptionTask?.cancelAndWait();
+
+    if (this.turnDetectorStream !== undefined) {
+      const stream = this.turnDetectorStream;
+      this.turnDetectorStream = undefined;
+      await stream.aclose().catch(() => undefined);
+    }
+
     await this.interruptionStreamChannel?.close();
     this.cancelBackchannelBoundary();
   }
diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts
index cfe92a821..5ce592042 100644
--- a/agents/src/voice/audio_recognition_span.test.ts
+++ b/agents/src/voice/audio_recognition_span.test.ts
@@ -110,6 +110,7 @@ describe('AudioRecognition user_turn span parity', () => {
       onInterimTranscript: vi.fn(),
       onFinalTranscript: vi.fn(),
       onPreemptiveGeneration: vi.fn(),
+      onEotPrediction: vi.fn(),
       retrieveChatCtx: () => ChatContext.empty(),
       onEndOfTurn: vi.fn(async () => true),
     };
@@ -191,6 +192,7 @@ describe('AudioRecognition user_turn span parity', () => {
       onInterimTranscript: vi.fn(),
       onFinalTranscript: vi.fn(),
       onPreemptiveGeneration: vi.fn(),
+      onEotPrediction: vi.fn(),
       retrieveChatCtx: () => ChatContext.empty(),
       onEndOfTurn: vi.fn(async () => true),
     };
diff --git a/agents/src/voice/audio_recognition_turn_detection.test.ts b/agents/src/voice/audio_recognition_turn_detection.test.ts
new file mode 100644
index 000000000..216e398fe
--- /dev/null
+++ b/agents/src/voice/audio_recognition_turn_detection.test.ts
@@ -0,0 +1,643 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Integration tests for `AudioRecognition` audio turn-detection wiring.
+ *
+ * Recognition owns all streaming turn-detection policy: it holds the in-flight
+ * inference request's future (`turnDetectorPredictionFut`), starts requests on
+ * VAD events only, awaits the future with the endpointing `minDelay` in the eou
+ * bounce, and flushes the stream on turn commits. Covered here:
+ *
+ * 1. The speaking-guard race in `runEOUDetection`: setting `userSpeakingEvent`
+ *    mid-bounce must abort the commit so a late-arriving SOS doesn't ship the
+ *    prior turn.
+ * 2. `onEotPrediction` dedup across the vad-EOS and stt-final triggers that
+ *    share one resolved prediction future.
+ * 3. The prediction-future lifecycle against VAD events: requests start
+ *    exclusively on the silence tick, resumed speech rearms the next pause, SOS
+ *    teardown, the flushed-turn short-circuit for late stt finals, and the
+ *    predict-timeout fallback signal.
+ * 4. The `minSilenceDuration` validation guarding an audio-EOT + VAD pairing.
+ *
+ * The stream-side request lifecycle lives in `inference/eot/base.test.ts`.
+ *
+ * Port of Python `tests/test_audio_recognition_turn_detection.py`.
+ */
+import { ParticipantKind } from '@livekit/rtc-node';
+import { describe, expect, it, vi } from 'vitest';
+import {
+  BaseStreamingTurnDetector,
+  BaseStreamingTurnDetectorStream,
+  MIN_SILENCE_DURATION_MS,
+  type TurnDetectionEvent,
+} from '../inference/eot/base.js';
+import { ChatContext } from '../llm/chat_context.js';
+import { initializeLogger } from '../log.js';
+import { Future } from '../utils.js';
+import { type VAD, type VADEvent, VADEventType } from '../vad.js';
+import {
+  AudioRecognition,
+  type AudioRecognitionOptions,
+  type RecognitionHooks,
+  type _TurnDetector,
+} from './audio_recognition.js';
+
+initializeLogger({ pretty: false, level: 'silent' });
+
+/** Structural view of the `AudioRecognition` internals that these tests read and drive. */
+interface RecognitionInternals {
+  speaking: boolean;
+  isAgentSpeaking: boolean;
+  lastSpeakingTime?: number;
+  userSpeakingEvent: { isSet: boolean; set(): void; clear(): void };
+  vad?: VAD;
+  turnDetector?: _TurnDetector | BaseStreamingTurnDetector;
+  turnDetectorStream?: BaseStreamingTurnDetectorStream;
+  turnDetectorPredictionFut?: Future<TurnDetectionEvent>;
+  turnDetectorFlushed: boolean;
+  turnDetectorLatePredictionWarned: boolean;
+  lastEmittedEotPrediction?: TurnDetectionEvent;
+  audioTranscript: string;
+  audioInterimTranscript: string;
+  audioPreflightTranscript: string;
+  sttRequestIds: string[];
+  bounceEOUTask?: {
+    done: boolean;
+    result: Promise<void>;
+    cancel(): void;
+    cancelAndWait(): Promise<void>;
+  };
+  clearUserTurn(): void;
+  createVadTask(vad: VAD | undefined, signal: AbortSignal): Promise<void>;
+  runEOUDetection(chatCtx: ChatContext, trigger?: 'vad' | 'stt' | 'manual'): void;
+  checkVadSilenceRequirement(detector?: _TurnDetector | BaseStreamingTurnDetector): void;
+  updateTurnDetector(detector: _TurnDetector | BaseStreamingTurnDetector | undefined): void;
+}
+
+// Recognition hooks backed by fresh vitest mocks. `retrieveChatCtx` returns an
+// empty context and `onEndOfTurn` resolves `false` unless a test overrides it.
+const makeHooks = (): RecognitionHooks => ({
+  onInterruption: vi.fn(),
+  onStartOfSpeech: vi.fn(),
+  onVADInferenceDone: vi.fn(),
+  onEndOfSpeech: vi.fn(),
+  onInterimTranscript: vi.fn(),
+  onFinalTranscript: vi.fn(),
+  onEotPrediction: vi.fn(),
+  onPreemptiveGeneration: vi.fn(),
+  onUserTurnExceeded: vi.fn(),
+  retrieveChatCtx: () => ChatContext.empty(),
+  onEndOfTurn: vi.fn(async () => false), // don't commit by default
+});
+
+function makeRecognition(opts: Partial<AudioRecognitionOptions> = {}): {
+  recognition: AudioRecognition;
+  internals: RecognitionInternals;
+  hooks: RecognitionHooks;
+} {
+  const hooks = makeHooks();
+  // Test defaults come first; caller-supplied `opts` are spread last and win.
+  const recognition = new AudioRecognition({
+    recognitionHooks: hooks,
+    stt: undefined,
+    vad: undefined,
+    interruptionDetection: undefined,
+    turnDetectionMode: 'vad',
+    minEndpointingDelay: 10,
+    maxEndpointingDelay: 500,
+    getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),
+    ...opts,
+  });
+  return { recognition, internals: recognition as unknown as RecognitionInternals, hooks };
+}
+
+/**
+ * Fake audio-EOT stream created from `BaseStreamingTurnDetectorStream.prototype`, so
+ * it passes `instanceof` (letting `runEOUDetection` select the audio path). `predict`
+ * returns a new pending future per call; tests model cached/awaiting predictions by
+ * assigning futures straight to `internals.turnDetectorPredictionFut`.
+ */
+function makeAudioStream(): BaseStreamingTurnDetectorStream {
+  const fake = Object.assign(Object.create(BaseStreamingTurnDetectorStream.prototype), {
+    supportsLanguage: vi.fn(async () => true),
+    unlikelyThreshold: vi.fn(async () => 0.5),
+    predict: vi.fn(() => new Future<TurnDetectionEvent>()),
+    cancelInference: vi.fn(),
+    flush: vi.fn(),
+  });
+  return fake as BaseStreamingTurnDetectorStream;
+}
+
+function makeAudioDetector(stream: BaseStreamingTurnDetector extends never ? never : BaseStreamingTurnDetectorStream): BaseStreamingTurnDetector {
+  const proto = BaseStreamingTurnDetector.prototype; // instanceof-compatible, constructor not run
+  const fake = Object.assign(Object.create(proto), { stream: vi.fn(() => stream) });
+  return fake as BaseStreamingTurnDetector;
+}
+
+/** Builds a prediction event plus a future already resolved with that same object. */
+function resolvedPrediction(
+  probability: number,
+  opts: { inferenceDuration?: number; detectionDelay?: number } = {},
+): { fut: Future<TurnDetectionEvent>; event: TurnDetectionEvent } {
+  const future = new Future<TurnDetectionEvent>();
+  const prediction: TurnDetectionEvent = {
+    type: 'eot_prediction',
+    endOfTurnProbability: probability,
+    lastSpeakingTimeMs: 0,
+    inferenceDuration: opts.inferenceDuration,
+    detectionDelay: opts.detectionDelay,
+  };
+  future.resolve(prediction);
+  return { fut: future, event: prediction };
+}
+
+// Typed accessor for the `predict` mock that `makeAudioStream` installs on the fake.
+const predictMock = (stream: BaseStreamingTurnDetectorStream): ReturnType<typeof vi.fn> =>
+  stream.predict as unknown as ReturnType<typeof vi.fn>;
+
+// Typed accessor for the `cancelInference` mock that `makeAudioStream` installs.
+const cancelInferenceMock = (stream: BaseStreamingTurnDetectorStream): ReturnType<typeof vi.fn> =>
+  stream.cancelInference as unknown as ReturnType<typeof vi.fn>;
+
+// INFERENCE_DONE event: every field is zero/empty/false except the two raw
+// accumulators, which take the arguments (speech first, silence second).
+const inferenceDone = (rawAccumulatedSpeech: number, rawAccumulatedSilence = 0): VADEvent => ({
+  type: VADEventType.INFERENCE_DONE,
+  samplesIndex: 0,
+  timestamp: 0,
+  speechDuration: 0,
+  silenceDuration: 0,
+  frames: [],
+  probability: 0,
+  inferenceDuration: 0,
+  speaking: false,
+  rawAccumulatedSilence,
+  rawAccumulatedSpeech,
+});
+
+// START_OF_SPEECH event with `speaking: true`; `speechDuration` and
+// `rawAccumulatedSpeech` are both 500, the silence fields are 0.
+const startOfSpeech = (): VADEvent => ({
+  type: VADEventType.START_OF_SPEECH,
+  samplesIndex: 0,
+  timestamp: 0,
+  speechDuration: 500,
+  silenceDuration: 0,
+  frames: [],
+  probability: 0,
+  inferenceDuration: 0,
+  speaking: true,
+  rawAccumulatedSilence: 0,
+  rawAccumulatedSpeech: 500,
+});
+
+// END_OF_SPEECH event with `speaking: false`; `silenceDuration` and
+// `rawAccumulatedSilence` are both 300, the speech fields are 0.
+const endOfSpeech = (): VADEvent => ({
+  type: VADEventType.END_OF_SPEECH,
+  samplesIndex: 0,
+  timestamp: 0,
+  speechDuration: 0,
+  silenceDuration: 300,
+  frames: [],
+  probability: 0,
+  inferenceDuration: 0,
+  speaking: false,
+  rawAccumulatedSilence: 300,
+  rawAccumulatedSpeech: 0,
+});
+
+/** Resolves on the next `setImmediate` turn, giving queued microtasks time to run first. */
+function flush(): Promise<void> {
+  return new Promise<void>((done) => setImmediate(done));
+}
+
+/**
+ * Drive `createVadTask` against a scripted VAD stream so VAD events flow through
+ * the real handler. `feed()` delivers one event, then awaits two `flush()` turns so
+ * the loop can process it; `stop()` aborts the task's signal and awaits the task.
+ */
+function runScriptedVad(internals: RecognitionInternals): {
+  feed: (ev: VADEvent) => Promise<void>;
+  stop: () => Promise<void>;
+} {
+  let resolveNext: ((r: IteratorResult<VADEvent>) => void) | null = null;
+  const buffered: VADEvent[] = []; // events fed while no `next()` call is pending
+  let closed = false; // set by `close()`; `next()` reports `done` once `buffered` is empty
+
+  const stream = {
+    updateInputStream(_s: unknown) {},
+    detachInputStream() {},
+    close() {
+      closed = true;
+      if (resolveNext) {
+        resolveNext({ done: true, value: undefined as never });
+        resolveNext = null;
+      }
+    },
+    [Symbol.asyncIterator](): AsyncIterator<VADEvent> {
+      return {
+        next(): Promise<IteratorResult<VADEvent>> {
+          if (buffered.length > 0) {
+            return Promise.resolve({ done: false, value: buffered.shift()! });
+          }
+          if (closed) {
+            return Promise.resolve({ done: true, value: undefined as never });
+          }
+          return new Promise((res) => {
+            resolveNext = res; // parked until `feed()` or `close()` resolves it
+          });
+        },
+      };
+    },
+  };
+
+  const vad = { stream: () => stream } as unknown as VAD;
+  const controller = new AbortController();
+  const task = internals.createVadTask(vad, controller.signal);
+
+  return {
+    async feed(ev: VADEvent) {
+      if (resolveNext) {
+        const res = resolveNext;
+        resolveNext = null;
+        res({ done: false, value: ev });
+      } else {
+        buffered.push(ev);
+      }
+      await flush();
+      await flush();
+    },
+    async stop() {
+      controller.abort();
+      await task.catch(() => {}); // a rejected task is deliberately ignored here
+    },
+  };
+}
+
+describe('TestSpeakingGuardRace', () => {
+  it('cancels the in-flight bounce when speaking starts during endpointing', async () => {
+    const { internals, hooks } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    // sub-threshold prediction (0.2 < 0.5) extends endpointing to maxDelay
+    internals.turnDetectorPredictionFut = resolvedPrediction(0.2).fut;
+
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+
+    // The bounce is parked in the ~500ms endpointing delay. Fire the speaking
+    // event well inside that window: the guard's race resolves with the
+    // speaking branch and the bounce is aborted before it can commit.
+    await new Promise((r) => setTimeout(r, 50));
+    internals.userSpeakingEvent.set();
+
+    expect(internals.bounceEOUTask).toBeDefined();
+    await internals.bounceEOUTask!.result.catch(() => {});
+
+    expect(hooks.onEndOfTurn).not.toHaveBeenCalled();
+  });
+
+  it('short-circuits before the bounce body runs when already speaking', async () => {
+    const { internals, hooks } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.speaking = true;
+
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+
+    expect(internals.bounceEOUTask).toBeDefined();
+    await internals.bounceEOUTask!.result.catch(() => {});
+
+    expect(hooks.onEndOfTurn).not.toHaveBeenCalled();
+    // The task exists (asserted above), but its guard bailed before any request was made.
+    expect(predictMock(stream).mock.calls.length).toBe(0);
+  });
+});
+
+describe('TestEotPredictionDedup', () => {
+  it('emits onEotPrediction once across vad then stt triggers', async () => {
+    const { internals, hooks } = makeRecognition();
+    // The future is pre-resolved with `event`; both triggers below read that same
+    // object from the held future (0.2 is under the mocked 0.5 threshold).
+    const { fut, event } = resolvedPrediction(0.2, { inferenceDuration: 50, detectionDelay: 100 });
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.turnDetectorPredictionFut = fut;
+
+    // vad trigger: bounce emits, then parks in the endpointing sleep.
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+    await flush();
+    await flush();
+    expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1);
+
+    // stt trigger: cancels the parked vad bounce and runs a fresh one that
+    // reads the same resolved future. Dedup must suppress a second emit.
+    internals.runEOUDetection(ChatContext.empty(), 'stt');
+    await flush();
+    await flush();
+
+    expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1);
+    expect(internals.lastEmittedEotPrediction).toBe(event); // identity check, not deep equality
+
+    await internals.bounceEOUTask?.cancelAndWait().catch(() => {}); // cleanup of the parked bounce
+  });
+
+  it('emits on every bounce for a text-based detector', async () => {
+    const { internals, hooks } = makeRecognition();
+    // A text detector is not a BaseStreamingTurnDetector → no streaming window,
+    // so there's no shared prediction event and dedup never applies.
+    const textDetector: _TurnDetector = {
+      model: 'fake',
+      provider: 'fake',
+      supportsLanguage: vi.fn(async () => true),
+      unlikelyThreshold: vi.fn(async () => 0.5),
+      predictEndOfTurn: vi.fn(async () => 0.2),
+    };
+    internals.turnDetector = textDetector;
+    internals.turnDetectorStream = undefined;
+    internals.audioTranscript = 'hello there';
+
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+    await flush();
+    await flush();
+    expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1);
+
+    internals.runEOUDetection(ChatContext.empty(), 'stt');
+    await flush();
+    await flush();
+    expect(hooks.onEotPrediction).toHaveBeenCalledTimes(2);
+
+    await internals.bounceEOUTask?.cancelAndWait().catch(() => {}); // cleanup of the parked bounce
+  });
+
+  it('clearUserTurn resets the dedup guard so the next turn emits again', () => {
+    const { internals } = makeRecognition();
+    internals.lastEmittedEotPrediction = resolvedPrediction(0.2).event;
+    internals.audioInterimTranscript = ''; // presumably read by clearUserTurn — TODO confirm
+    internals.audioPreflightTranscript = '';
+    internals.sttRequestIds = [];
+
+    internals.clearUserTurn();
+
+    expect(internals.lastEmittedEotPrediction).toBeUndefined();
+  });
+});
+
+describe('TestPredictionFutureLifecycle', () => {
+  it('silence tick starts a request once', async () => {
+    const { internals } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.speaking = true;
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(inferenceDone(0, 300)); // rawAccumulatedSpeech=0, rawAccumulatedSilence=300
+      await vad.feed(inferenceDone(0, 400)); // still silent → must not start a second request
+
+      expect(predictMock(stream).mock.calls.length).toBe(1);
+      expect(internals.turnDetectorPredictionFut).toBeDefined();
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('resumed speech without SOS rearms the next pause', async () => {
+    const { internals } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.speaking = true;
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(inferenceDone(0, 300)); // rawAccumulatedSpeech=0, rawAccumulatedSilence=300
+      const firstFut = internals.turnDetectorPredictionFut;
+      expect(firstFut).toBeDefined();
+      firstFut!.resolve(resolvedPrediction(0.1).event);
+
+      // Speech resumes inside the still-open VAD segment → drop the request.
+      await vad.feed(inferenceDone(1, 0)); // rawAccumulatedSpeech=1, rawAccumulatedSilence=0
+      expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1);
+      expect(cancelInferenceMock(stream)).toHaveBeenCalledWith();
+      expect(internals.turnDetectorPredictionFut).toBeUndefined();
+
+      // The next pause gets a fresh window.
+      await vad.feed(inferenceDone(0, 300));
+      expect(predictMock(stream).mock.calls.length).toBe(2);
+      expect(internals.turnDetectorPredictionFut).toBeDefined();
+      expect(internals.turnDetectorPredictionFut).not.toBe(firstFut);
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('silence tick starts a request even while the agent is speaking', async () => {
+    // The agent-speaking gate was dropped: the silence tick warms a prediction
+    // during the user's pause even while the agent is still speaking.
+    const { internals } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.speaking = true;
+    internals.isAgentSpeaking = true;
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(inferenceDone(0, 300)); // rawAccumulatedSpeech=0, rawAccumulatedSilence=300
+      expect(predictMock(stream).mock.calls.length).toBe(1);
+      expect(internals.turnDetectorPredictionFut).toBeDefined();
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('EOS consumes the silence-tick request without predicting', async () => {
+    const { internals, hooks } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.speaking = true;
+    const { fut } = resolvedPrediction(0.9);
+    internals.turnDetectorPredictionFut = fut;
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(endOfSpeech());
+      expect(predictMock(stream).mock.calls.length).toBe(0);
+      expect(internals.turnDetectorPredictionFut).toBe(fut);
+      expect(internals.bounceEOUTask).toBeDefined();
+      await internals.bounceEOUTask!.result.catch(() => {});
+      expect(hooks.onEotPrediction).toHaveBeenCalledTimes(1);
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('SOS tears down the request and rearms', async () => {
+    const { internals } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.turnDetectorPredictionFut = new Future<TurnDetectionEvent>();
+    internals.turnDetectorFlushed = true;
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(startOfSpeech());
+      expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1);
+      expect(cancelInferenceMock(stream)).toHaveBeenCalledWith();
+      expect(internals.turnDetectorPredictionFut).toBeUndefined();
+      expect(internals.turnDetectorFlushed).toBe(false);
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('EOS never starts a request', async () => {
+    const { internals } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+
+    const vad = runScriptedVad(internals);
+    try {
+      await vad.feed(endOfSpeech());
+      expect(predictMock(stream).mock.calls.length).toBe(0);
+      expect(internals.turnDetectorPredictionFut).toBeUndefined();
+
+      const { fut } = resolvedPrediction(0.9);
+      internals.turnDetectorPredictionFut = fut;
+      await vad.feed(endOfSpeech());
+      expect(predictMock(stream).mock.calls.length).toBe(0);
+      expect(internals.turnDetectorPredictionFut).toBe(fut);
+    } finally {
+      await vad.stop();
+    }
+  });
+
+  it('late stt final after flush short-circuits and warns once', async () => {
+    const { internals, hooks } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    internals.turnDetectorFlushed = true;
+
+    for (let i = 0; i < 2; i++) {
+      internals.runEOUDetection(ChatContext.empty(), 'stt');
+      expect(internals.bounceEOUTask).toBeDefined();
+      await internals.bounceEOUTask!.result.catch(() => {});
+    }
+
+    expect(predictMock(stream).mock.calls.length).toBe(0);
+    expect(hooks.onEotPrediction).not.toHaveBeenCalled();
+    // Only the flag is asserted here; the log level of later late predictions is not checked.
+    expect(internals.turnDetectorLatePredictionWarned).toBe(true);
+  });
+
+  it('predict timeout signals fallback and drops the future', async () => {
+    const { internals, hooks } = makeRecognition();
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    // A pending future that never resolves → times out at minDelay (10 in makeRecognition).
+    internals.turnDetectorPredictionFut = new Future<TurnDetectionEvent>();
+
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+    expect(internals.bounceEOUTask).toBeDefined();
+    await internals.bounceEOUTask!.result.catch(() => {});
+
+    expect(cancelInferenceMock(stream)).toHaveBeenCalledTimes(1);
+    expect(cancelInferenceMock(stream)).toHaveBeenCalledWith({ timedOut: true });
+    expect(internals.turnDetectorPredictionFut).toBeUndefined();
+    expect(hooks.onEotPrediction).not.toHaveBeenCalled();
+    expect(stream.unlikelyThreshold).not.toHaveBeenCalled();
+    expect(hooks.onEndOfTurn).toHaveBeenCalledTimes(1);
+  });
+
+  it('commit flushes the stream and marks the turn flushed', async () => {
+    const { internals, hooks } = makeRecognition();
+    (hooks.onEndOfTurn as ReturnType<typeof vi.fn>).mockResolvedValue(true);
+    const stream = makeAudioStream();
+    internals.turnDetectorStream = stream;
+    internals.turnDetector = makeAudioDetector(stream);
+    // confident (0.9 ≥ mocked 0.5 threshold) → no maxDelay extension
+    internals.turnDetectorPredictionFut = resolvedPrediction(0.9).fut;
+
+    internals.runEOUDetection(ChatContext.empty(), 'vad');
+    expect(internals.bounceEOUTask).toBeDefined();
+    await internals.bounceEOUTask!.result.catch(() => {});
+
+    expect(stream.flush).toHaveBeenCalledWith('turn committed');
+    expect(internals.turnDetectorPredictionFut).toBeUndefined();
+    expect(internals.turnDetectorFlushed).toBe(true);
+  });
+});
+
+describe('TestVadMinSilenceRequirement', () => {
+  // Silence requirement exercised here: MIN_SILENCE_DURATION_MS plus a 50ms
+  // margin. One below it must throw; comfortably above it must pass.
+  const floorMs = MIN_SILENCE_DURATION_MS + 50;
+  const vadWithMinSilence = (value: number | null): VAD =>
+    ({ minSilenceDuration: value }) as unknown as VAD;
+
+  it('raises when min silence is too low for an audio detector', () => {
+    const rec = makeRecognition().internals;
+    rec.vad = vadWithMinSilence(floorMs - 1);
+    rec.turnDetector = makeAudioDetector(makeAudioStream());
+
+    expect(() => rec.checkVadSilenceRequirement()).toThrow(/minSilenceDuration/);
+  });
+
+  it('passes when min silence is adequate', () => {
+    const rec = makeRecognition().internals;
+    rec.vad = vadWithMinSilence(floorMs + 250);
+    rec.turnDetector = makeAudioDetector(makeAudioStream());
+
+    expect(() => rec.checkVadSilenceRequirement()).not.toThrow();
+  });
+
+  it('skips validation for a non-audio detector', () => {
+    const rec = makeRecognition().internals;
+    rec.vad = vadWithMinSilence(floorMs - 1);
+    rec.turnDetector = { model: 'x', provider: 'x' } as unknown as _TurnDetector;
+
+    expect(() => rec.checkVadSilenceRequirement()).not.toThrow();
+  });
+
+  it('skips validation when there is no VAD', () => {
+    const rec = makeRecognition().internals;
+    rec.vad = undefined;
+    rec.turnDetector = makeAudioDetector(makeAudioStream());
+
+    expect(() => rec.checkVadSilenceRequirement()).not.toThrow();
+  });
+
+  it('skips validation when the VAD exposes no min-silence knob', () => {
+    const rec = makeRecognition().internals;
+    // With `minSilenceDuration: null` there is nothing to compare → must not throw.
+    rec.vad = vadWithMinSilence(null);
+    rec.turnDetector = makeAudioDetector(makeAudioStream());
+
+    expect(() => rec.checkVadSilenceRequirement()).not.toThrow();
+  });
+
+  it('updateTurnDetector validates the pairing before building a stream', () => {
+    const rec = makeRecognition().internals;
+    rec.vad = vadWithMinSilence(floorMs - 1);
+    const audioStream = makeAudioStream();
+    const audioDetector = makeAudioDetector(audioStream);
+
+    expect(() => rec.updateTurnDetector(audioDetector)).toThrow(/minSilenceDuration/);
+
+    // The throw happened before a stream was stored or `detector.stream()` was called.
+    expect(rec.turnDetectorStream).toBeUndefined();
+    expect((audioDetector.stream as ReturnType<typeof vi.fn>).mock.calls.length).toBe(0);
+  });
+});
diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts
index a6923c866..e30cae983 100644
--- a/agents/src/voice/events.ts
+++ b/agents/src/voice/events.ts
@@ -34,6 +34,8 @@ export enum AgentSessionEventTypes {
   SpeechCreated = 'speech_created',
   AgentFalseInterruption = 'agent_false_interruption',
   OverlappingSpeech = 'overlapping_speech',
+  /** Audio EOT detector emitted a per-turn prediction. */
+  EotPrediction = 'eot_prediction',
   Error = 'error',
   Close = 'close',
 }
@@ -246,6 +248,46 @@ export const createSpeechCreatedEvent = ({
   createdAt,
 });
 
+/**
+ * Payload of the `eot_prediction` session event (`AgentSessionEventTypes.EotPrediction`).
+ * Emitted once per turn boundary decision when a `TurnDetector` is wired into the session.
+ *
+ * Port of Python `EotPredictionEvent`.
+ */
+export type EotPredictionEvent = {
+  type: 'eot_prediction';
+  /** End-of-turn probability in [0, 1] returned by the detector. */
+  probability: number;
+  /** Threshold below which the detector treats the prediction as unlikely. */
+  threshold: number;
+  /** Model-side inference time, in milliseconds. */
+  inferenceDurationMs: number;
+  /** End-of-speech → prediction receive time, in milliseconds. */
+  delayMs: number;
+  createdAt: number; // `createEotPredictionEvent` defaults this to `Date.now()`
+};
+
+export const createEotPredictionEvent = (params: {
+  probability: number;
+  threshold: number;
+  inferenceDurationMs: number;
+  delayMs: number;
+  createdAt?: number;
+}): EotPredictionEvent => {
+  const { probability, threshold, inferenceDurationMs, delayMs } = params;
+  // Destructuring default: `Date.now()` is used only when `createdAt` is
+  // omitted or explicitly `undefined`.
+  const { createdAt = Date.now() } = params;
+  return {
+    type: 'eot_prediction',
+    probability,
+    threshold,
+    inferenceDurationMs,
+    delayMs,
+    createdAt,
+  };
+};
+
 export type UserTurnExceededEvent = {
   type: 'user_turn_exceeded';
   /** Transcript from the current uncommitted user turn only. */
diff --git a/agents/src/voice/remote_session.ts b/agents/src/voice/remote_session.ts
index efe0b0547..235925b76 100644
--- a/agents/src/voice/remote_session.ts
+++ b/agents/src/voice/remote_session.ts
@@ -19,6 +19,7 @@ import { isInstructions, renderInstructions } from '../llm/chat_context.js';
 import { type ToolContext, sortedToolNames } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
+  EOTModelUsage,
   InterruptionModelUsage,
   LLMModelUsage,
   STTModelUsage,
@@ -33,6 +34,7 @@ import {
   type AgentState,
   type AgentStateChangedEvent,
   type ConversationItemAddedEvent,
+  type EotPredictionEvent,
   type ErrorEvent,
   type FunctionToolsExecutedEvent,
   type MetricsCollectedEvent,
@@ -63,6 +65,7 @@ export type RemoteSessionEventTypes =
   | 'function_tools_executed'
   | 'overlapping_speech'
   | 'amd_prediction'
+  | 'eot_prediction'
   | 'session_usage'
   | 'debug_message'
   | 'error';
@@ -76,6 +79,7 @@ export type RemoteSessionCallbacks = {
   function_tools_executed: (ev: pb.AgentSessionEvent_FunctionToolsExecuted) => void;
   overlapping_speech: (ev: pb.AgentSessionEvent_OverlappingSpeech) => void;
   amd_prediction: (ev: pb.AgentSessionEvent_AmdPrediction) => void;
+  eot_prediction: (ev: pb.AgentSessionEvent_EotPrediction) => void;
   session_usage: (ev: pb.AgentSessionEvent_SessionUsageUpdated) => void;
   debug_message: (ev: pb.DebugMessage) => void;
   error: (ev: pb.AgentSessionEvent_Error) => void;
@@ -584,6 +588,22 @@ function sessionUsageToProto(usage: AgentSessionUsage): pb.AgentSessionUsage {
         );
         break;
       }
+      case 'eot_usage': {
+        const eu = mu as Partial<EOTModelUsage>;
+        modelUsages.push(
+          new pb.ModelUsage({
+            usage: {
+              case: 'eot',
+              value: new pb.EotModelUsage({
+                provider: eu.provider ?? '',
+                model: eu.model ?? '',
+                totalRequests: eu.totalRequests ?? 0,
+              }),
+            },
+          }),
+        );
+        break;
+      }
     }
   }
   return new pb.AgentSessionUsage({ modelUsage: modelUsages });
@@ -640,6 +660,7 @@ export class SessionHost {
       session.on(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted);
       session.on(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected);
       session.on(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech);
+      session.on(AgentSessionEventTypes.EotPrediction, this.onEotPrediction);
       session.on(AgentSessionEventTypes.Error, this.onHostError);
       session.on(AgentSessionEventTypes.DebugMessage, this.onDebugMessage);
     }
@@ -669,6 +690,7 @@ export class SessionHost {
       this.session.off(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted);
       this.session.off(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected);
       this.session.off(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech);
+      this.session.off(AgentSessionEventTypes.EotPrediction, this.onEotPrediction);
       this.session.off(AgentSessionEventTypes.Error, this.onHostError);
       this.session.off(AgentSessionEventTypes.DebugMessage, this.onDebugMessage);
     }
@@ -797,6 +819,10 @@ export class SessionHost {
     );
   };
 
+  private onEotPrediction = (ev: EotPredictionEvent): void => {
+    this._onEotPrediction(ev); // session listener; delegates to the forwarding method
+  };
+
   private onOverlappingSpeech = (event: OverlappingSpeechEvent): void => {
     const value = new pb.AgentSessionEvent_OverlappingSpeech({
       isInterruption: event.isInterruption,
@@ -856,6 +882,23 @@ export class SessionHost {
     });
   }
 
+  /**
+   * @internal — relays an audio-EOT prediction to the connected
+   * {@link RemoteSession} peer as an `eotPrediction` event, converting both
+   * millisecond fields with `msToDuration`. Mirrors python
+   * `SessionHost._on_eot_prediction`.
+   */
+  _onEotPrediction(event: EotPredictionEvent): void {
+    const { probability, threshold, inferenceDurationMs, delayMs } = event;
+    const value = new pb.AgentSessionEvent_EotPrediction({
+      probability,
+      threshold,
+      inferenceDuration: msToDuration(inferenceDurationMs),
+      delay: msToDuration(delayMs),
+    });
+    this.emitEvent({ case: 'eotPrediction', value });
+  }
+
   private async handleRequestSafe(req: pb.SessionRequest): Promise<void> {
     try {
       await this.handleRequest(req);
@@ -1153,6 +1196,9 @@ export class RemoteSession extends (EventEmitter as new () => TypedEventEmitter<
       case 'amdPrediction':
         this.emit('amd_prediction', ev.value);
         break;
+      case 'eotPrediction':
+        this.emit('eot_prediction', ev.value);
+        break;
       case 'sessionUsageUpdated':
         this.emit('session_usage', ev.value);
         break;
diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts
index a7d3a51be..d3a557efa 100644
--- a/agents/src/voice/turn_config/turn_handling.ts
+++ b/agents/src/voice/turn_config/turn_handling.ts
@@ -36,11 +36,15 @@ export interface TurnHandlingOptions {
    * - `"realtime_llm"` – use server-side detection from a realtime LLM
    * - `"manual"` – caller controls turn boundaries explicitly
    *
-   * If not set, the session chooses the best available mode in priority order
-   * `realtime_llm → vad → stt → manual`; it automatically falls back if the necessary model
-   * is missing.
+   * - `undefined` (not set) – the session auto-provisions a default
+   *   `inference.TurnDetector`, then chooses the best available mode in
+   *   priority order `realtime_llm → vad → stt → manual`, falling back if the
+   *   necessary model is missing.
+   * - `null` – explicitly opt out of turn detection (no default detector built).
+   *
+   * The `null`-vs-`undefined` distinction mirrors Python's `None` vs `NOT_GIVEN`.
    */
-  turnDetection: TurnDetectionMode | undefined;
+  turnDetection: TurnDetectionMode | null | undefined;
   /**
    * Configuration for endpointing.
    */
diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts
index 8db23d49a..fbc3a4016 100644
--- a/agents/src/voice/turn_config/utils.ts
+++ b/agents/src/voice/turn_config/utils.ts
@@ -73,7 +73,13 @@ export function migrateLegacyOptions<UserData>(legacyOptions: AgentSessionOption
       ...sessionOptions.turnHandling?.userTurnLimit,
     },
 
-    turnDetection: sessionOptions?.turnHandling?.turnDetection ?? turnDetection,
+    // Preserve an explicit `null` (opt-out) distinctly from `undefined` (not
+    // given). `??` would collapse both, so only fall back to the deprecated
+    // top-level `turnDetection` when `turnHandling.turnDetection` is absent.
+    turnDetection:
+      sessionOptions?.turnHandling?.turnDetection !== undefined
+        ? sessionOptions.turnHandling.turnDetection
+        : turnDetection,
   } as const;
 
   if (
@@ -134,7 +140,12 @@ export function stripUndefined<T extends object>(obj: T): Partial<T> {
 
 export function mergeWithDefaults(config: TurnHandlingOptions) {
   return {
-    turnDetection: config.turnDetection ?? defaultTurnHandlingOptions.turnDetection,
+    // Keep an explicit `null` (opt-out) — only an absent value falls back to
+    // the default, so the constructor can tell opt-out from not-given.
+    turnDetection:
+      config.turnDetection === undefined
+        ? defaultTurnHandlingOptions.turnDetection
+        : config.turnDetection,
     endpointing: { ...defaultEndpointingOptions, ...stripUndefined(config.endpointing) },
     interruption: { ...defaultInterruptionOptions, ...stripUndefined(config.interruption) },
     preemptiveGeneration: {
diff --git a/agents/src/worker.ts b/agents/src/worker.ts
index 8238a185e..b18a26a9a 100644
--- a/agents/src/worker.ts
+++ b/agents/src/worker.ts
@@ -15,10 +15,13 @@ import type { ParticipantInfo } from 'livekit-server-sdk';
 import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
 import { EventEmitter } from 'node:events';
 import { availableParallelism } from 'node:os';
+import { extname } from 'node:path';
 import { WebSocket } from 'ws';
 import { APIStatusError } from './_exceptions.js';
 import { getCpuMonitor } from './cpu.js';
 import { HTTPServer } from './http_server.js';
+import { _getLocalInferenceModule } from './inference/_warmup.js';
+import { EOT_INFERENCE_METHOD } from './inference/eot/runner.js';
 import { InferenceRunner } from './inference_runner.js';
 import { InferenceProcExecutor } from './ipc/inference_proc_executor.js';
 import { ProcPool } from './ipc/proc_pool.js';
@@ -33,6 +36,32 @@ const ASSIGNMENT_TIMEOUT = 7.5 * 1000;
 const UPDATE_LOAD_INTERVAL = 2.5 * 1000;
 const PROJECT_TYPE = 'nodejs';
 
+let localEotRunnerRegistered = false;
+/**
+ * Register the local audio-EOT inference runner so it runs in the shared
+ * inference process. Idempotent and guarded by native-binding availability;
+ * a no-op (with a one-time warning) when `@livekit/local-inference` can't be
+ * loaded so the worker still starts on unsupported platforms.
+ */
+function maybeRegisterLocalEotRunner(): void {
+  if (localEotRunnerRegistered) return;
+  localEotRunnerRegistered = true;
+  if (InferenceRunner.registeredRunners[EOT_INFERENCE_METHOD]) return;
+  if (_getLocalInferenceModule() === undefined) {
+    log().warn(
+      '@livekit/local-inference native binding unavailable; local audio EOT disabled ' +
+        '(predictions will degrade to a positive default). cloud EOT and other turn ' +
+        'detection modes are unaffected.',
+    );
+    return;
+  }
+  // Take the extension from the URL *path* so a query/fragment containing a dot
+  // can't leak into it: '.js' (built) or '.ts' (tsx/ts-node).
+  const ext = extname(new URL(import.meta.url).pathname);
+  const runnerUrl = new URL(`./inference/eot/runner${ext}`, import.meta.url);
+  InferenceRunner.registerRunner(EOT_INFERENCE_METHOD, runnerUrl.toString());
+}
+
 class Default {
   static loadThreshold(production: boolean): number {
     if (production) {
@@ -328,6 +357,13 @@ export class AgentServer {
       }
     }
 
+    // Register the local audio-EOT runner so it runs in the shared inference
+    // process (loaded once per host, ~138 MB) instead of in every job worker.
+    // Guarded by binding availability: on a platform where
+    // `@livekit/local-inference` can't load, skip registration so the worker
+    // still starts (local EOT then degrades to a positive-default prediction).
+    maybeRegisterLocalEotRunner();
+
     if (Object.entries(InferenceRunner.registeredRunners).length) {
       this.#inferenceExecutor = new InferenceProcExecutor({
         runners: InferenceRunner.registeredRunners,
diff --git a/examples/src/anam_realtime_agent.ts b/examples/src/anam_realtime_agent.ts
index bc2fd33e4..fd4957faa 100644
--- a/examples/src/anam_realtime_agent.ts
+++ b/examples/src/anam_realtime_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,17 +11,12 @@ import {
   voice,
 } from '@livekit/agents';
 import * as anam from '@livekit/agents-plugin-anam';
-import * as livekit from '@livekit/agents-plugin-livekit';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 // Uses OpenAI Advanced Voice (Realtime), so no separate STT/TTS/VAD.
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     initializeLogger({ pretty: true });
 
@@ -31,7 +25,6 @@ export default defineAgent({
     });
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       tts: new inference.TTS({
         model: 'cartesia/sonic-3',
@@ -45,7 +38,6 @@ export default defineAgent({
         turnDetection: null,
         inputAudioTranscription: null,
       }),
-      turnDetection: new livekit.turnDetector.EnglishModel(),
     });
 
     await session.start({
diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts
index 95ecddb9a..10c8e5028 100644
--- a/examples/src/basic_agent.ts
+++ b/examples/src/basic_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -13,16 +12,14 @@ import {
   metrics,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
+// No prewarm hook needed: the local EOT model runs in the shared inference
+// process (loaded once per host), and the silero VAD (~2MB, in-process)
+// lazy-loads on first stream.
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
@@ -43,9 +40,6 @@ export default defineAgent({
     const logger = log();
 
     const session = new voice.AgentSession({
-      // VAD and turn detection are used to determine when the user is speaking and when the agent should respond
-      // See more at https://docs.livekit.io/agents/build/turns
-      vad: ctx.proc.userData.vad! as silero.VAD,
       // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
       // See all available models at https://docs.livekit.io/agents/models/stt/
       stt: new inference.STT({
@@ -69,7 +63,8 @@ export default defineAgent({
       }),
       ttsTextTransforms: ['filter_markdown', 'filter_emoji'],
       turnHandling: {
-        turnDetection: new livekit.turnDetector.MultilingualModel(),
+        // turn detection defaults to the audio inference.TurnDetector when unset.
+        // See https://docs.livekit.io/agents/build/turns
         interruption: {
           // Enable false-interruption auto-resume behavior.
           resumeFalseInterruption: true,
@@ -118,7 +113,7 @@ export default defineAgent({
     });
 
     session.on(voice.AgentSessionEventTypes.OverlappingSpeech, (ev) => {
-      logger.warn({ type: ev.type, isInterruption: ev.isInterruption }, 'user overlapping speech');
+      logger.info({ type: ev.type, isInterruption: ev.isInterruption }, 'user overlapping speech');
     });
 
     await session.start({
diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts
index aacbeee5c..0549f4197 100644
--- a/examples/src/basic_agent_task.ts
+++ b/examples/src/basic_agent_task.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,7 +11,6 @@ import {
   voice,
 } from '@livekit/agents';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -110,12 +108,8 @@ class SurveyAgent extends voice.Agent {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3' }),
       llm: new openai.responses.LLM({ useWebSocket: true }),
       tts: new inference.TTS({
diff --git a/examples/src/basic_task_group.ts b/examples/src/basic_task_group.ts
index d40befe2a..0c24c2059 100644
--- a/examples/src/basic_task_group.ts
+++ b/examples/src/basic_task_group.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   beta,
   cli,
@@ -13,7 +12,6 @@ import {
   voice,
 } from '@livekit/agents';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -120,12 +118,8 @@ class TaskGroupDemoAgent extends voice.Agent {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3' }),
       llm: new openai.responses.LLM({
         model: 'gpt-5.2',
diff --git a/examples/src/basic_tool_call_agent.ts b/examples/src/basic_tool_call_agent.ts
index 5642ef488..8d576cf1f 100644
--- a/examples/src/basic_tool_call_agent.ts
+++ b/examples/src/basic_tool_call_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,8 +10,6 @@ import {
   llm,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
@@ -39,9 +36,6 @@ class GameAgent extends voice.Agent<UserData> {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const getWeather = llm.tool({
       description: ' Called when the user asks about the weather.',
@@ -133,17 +127,13 @@ export default defineAgent({
       },
     });
 
-    const vad = ctx.proc.userData.vad! as silero.VAD;
-
     const session = new voice.AgentSession({
-      vad,
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       llm: new inference.LLM({ model: 'google/gemini-3-flash-preview' }),
       tts: new inference.TTS({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       userData: { number: 0 },
       voiceOptions: {
         preemptiveGeneration: true,
diff --git a/examples/src/cartesia.ts b/examples/src/cartesia.ts
index 34cd47fd4..e49bb623d 100644
--- a/examples/src/cartesia.ts
+++ b/examples/src/cartesia.ts
@@ -4,7 +4,6 @@
 import type { llm as llmModule } from '@livekit/agents';
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -16,14 +15,10 @@ import {
 import * as cartesia from '@livekit/agents-plugin-cartesia';
 import * as google from '@livekit/agents-plugin-google';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
@@ -31,8 +26,6 @@ export default defineAgent({
     });
 
     const logger = log();
-    const vad =
-      ctx.proc.userData.vad instanceof silero.VAD ? ctx.proc.userData.vad : await silero.VAD.load();
 
     const apiKey = process.env.CARTESIA_API_KEY;
 
@@ -67,7 +60,6 @@ export default defineAgent({
     }
 
     const session = new voice.AgentSession({
-      vad,
       stt: new cartesia.STT({ model: 'ink-2', apiKey }),
       llm,
       tts: new cartesia.TTS({ model: 'sonic-3.5', apiKey }),
diff --git a/examples/src/comprehensive_test.ts b/examples/src/comprehensive_test.ts
index 6e9fc8f07..9e9e25225 100644
--- a/examples/src/comprehensive_test.ts
+++ b/examples/src/comprehensive_test.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   dedent,
@@ -21,7 +20,6 @@ import * as livekit from '@livekit/agents-plugin-livekit';
 import * as neuphonic from '@livekit/agents-plugin-neuphonic';
 import * as openai from '@livekit/agents-plugin-openai';
 import * as resemble from '@livekit/agents-plugin-resemble';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
@@ -238,14 +236,9 @@ class TestAgent extends voice.Agent<UserData> {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log();
-    const vad = ctx.proc.userData.vad! as silero.VAD;
     const session = new voice.AgentSession({
-      vad,
       userData: {
         testedSttChoices: new Set(),
         testedTtsChoices: new Set(),
diff --git a/examples/src/custom_text_handler.ts b/examples/src/custom_text_handler.ts
index 5ba65e773..5cc10e668 100644
--- a/examples/src/custom_text_handler.ts
+++ b/examples/src/custom_text_handler.ts
@@ -3,15 +3,12 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
   inference,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 
@@ -39,26 +36,19 @@ const customTextInputHandler = (session: voice.AgentSession, event: voice.TextIn
 };
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
         "You are a helpful assistant, you can hear the user's message and respond to it.",
     });
 
-    const vad = ctx.proc.userData.vad! as silero.VAD;
-
     const session = new voice.AgentSession({
-      vad,
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       tts: new inference.TTS({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
     });
 
     await session.start({
diff --git a/examples/src/drive-thru/drivethru_agent.ts b/examples/src/drive-thru/drivethru_agent.ts
index 9882f6fcd..c9a534dec 100644
--- a/examples/src/drive-thru/drivethru_agent.ts
+++ b/examples/src/drive-thru/drivethru_agent.ts
@@ -1,20 +1,10 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import {
-  type JobContext,
-  type JobProcess,
-  ServerOptions,
-  cli,
-  defineAgent,
-  llm,
-  voice,
-} from '@livekit/agents';
+import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents';
 import * as deepgram from '@livekit/agents-plugin-deepgram';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
-import * as livekit from '@livekit/agents-plugin-livekit';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 import {
@@ -376,19 +366,13 @@ export async function newUserData(): Promise<UserData> {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const userdata = await newUserData();
 
-    const vad = ctx.proc.userData.vad! as silero.VAD;
     const session = new voice.AgentSession({
-      vad,
       stt: new deepgram.STT(),
       llm: new openai.LLM({ model: 'gpt-4.1', temperature: 0.45 }),
       tts: new elevenlabs.TTS(),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       userData: userdata,
       voiceOptions: {
         maxToolSteps: 10,
diff --git a/examples/src/elevenlabs_scribe_v2.ts b/examples/src/elevenlabs_scribe_v2.ts
index d0574c02c..dbb4ac1fa 100644
--- a/examples/src/elevenlabs_scribe_v2.ts
+++ b/examples/src/elevenlabs_scribe_v2.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,13 +10,9 @@ import {
   voice,
 } from '@livekit/agents';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const stt = new elevenlabs.STT({
       useRealtime: true,
@@ -32,7 +27,6 @@ export default defineAgent({
 
     const session = new voice.AgentSession({
       voiceOptions: { allowInterruptions: true },
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt,
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       tts: new inference.TTS({ model: 'cartesia/sonic-3' }),
diff --git a/examples/src/frontdesk/frontdesk_agent.ts b/examples/src/frontdesk/frontdesk_agent.ts
index d5d2e1ab1..8e9e50f42 100644
--- a/examples/src/frontdesk/frontdesk_agent.ts
+++ b/examples/src/frontdesk/frontdesk_agent.ts
@@ -1,20 +1,10 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import {
-  type JobContext,
-  type JobProcess,
-  ServerOptions,
-  cli,
-  defineAgent,
-  llm,
-  voice,
-} from '@livekit/agents';
+import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents';
 import * as deepgram from '@livekit/agents-plugin-deepgram';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
-import * as livekit from '@livekit/agents-plugin-livekit';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
@@ -196,9 +186,6 @@ You must infer the appropriate range implicitly from the conversational context
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const timezone = 'UTC';
 
@@ -220,13 +207,11 @@ export default defineAgent({
     const userdata: Userdata = { cal };
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new deepgram.STT(),
       llm: new openai.LLM({
         model: 'gpt-4.1',
       }),
       tts: new elevenlabs.TTS(),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       userData: userdata,
       voiceOptions: {
         maxToolSteps: 1,
diff --git a/examples/src/gemini_realtime_agent.ts b/examples/src/gemini_realtime_agent.ts
index 60cbb443e..3b82c8ec0 100644
--- a/examples/src/gemini_realtime_agent.ts
+++ b/examples/src/gemini_realtime_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   dedent,
@@ -12,7 +11,6 @@ import {
   voice,
 } from '@livekit/agents';
 import * as google from '@livekit/agents-plugin-google';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -117,14 +115,10 @@ class StoryAgent extends voice.Agent<StoryData> {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const userdata: StoryData = {};
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       llm: new google.realtime.RealtimeModel({
         thinkingConfig: {
           // Making the thoughts false to speed up the realtime response
diff --git a/examples/src/hume_tts.ts b/examples/src/hume_tts.ts
index fbf05c689..86daa1854 100644
--- a/examples/src/hume_tts.ts
+++ b/examples/src/hume_tts.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,15 +11,10 @@ import {
   voice,
 } from '@livekit/agents';
 import * as hume from '@livekit/agents-plugin-hume';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
@@ -39,8 +33,6 @@ export default defineAgent({
       stt: 'deepgram/nova-3',
       llm: 'openai/gpt-4.1-mini',
       tts,
-      vad: ctx.proc.userData.vad! as silero.VAD,
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       voiceOptions: {
         preemptiveGeneration: true,
       },
diff --git a/examples/src/idle_user_timeout_example.ts b/examples/src/idle_user_timeout_example.ts
index 47b9d2643..d326c881b 100644
--- a/examples/src/idle_user_timeout_example.ts
+++ b/examples/src/idle_user_timeout_example.ts
@@ -8,7 +8,6 @@
  */
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   Task,
   cli,
@@ -19,21 +18,15 @@ import {
   log,
   voice,
 } from '@livekit/agents';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 initializeLogger({ pretty: true });
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log();
-    const vad = ctx.proc.userData.vad! as silero.VAD;
 
     const session = new voice.AgentSession({
-      vad,
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       tts: new inference.TTS({
diff --git a/examples/src/instructions_per_modality.ts b/examples/src/instructions_per_modality.ts
index 71f2f3f04..793f9819c 100644
--- a/examples/src/instructions_per_modality.ts
+++ b/examples/src/instructions_per_modality.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,7 +11,6 @@ import {
   log,
   voice,
 } from '@livekit/agents';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -79,12 +77,8 @@ class SchedulingAgent extends voice.Agent {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3' }),
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       tts: new inference.TTS({
diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts
index 9a4ddf2a4..ba9f6a888 100644
--- a/examples/src/inworld_tts.ts
+++ b/examples/src/inworld_tts.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,15 +11,10 @@ import {
   voice,
 } from '@livekit/agents';
 import * as inworld from '@livekit/agents-plugin-inworld';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
@@ -69,10 +63,6 @@ export default defineAgent({
       // Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
       // See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
       tts,
-      // VAD and turn detection are used to determine when the user is speaking and when the agent should respond
-      // See more at https://docs.livekit.io/agents/build/turns
-      vad: ctx.proc.userData.vad! as silero.VAD,
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       // to use realtime model, replace the stt, llm, tts and vad with the following
       // llm: new openai.realtime.RealtimeModel(),
       voiceOptions: {
diff --git a/examples/src/lemonslice_realtime_avatar.ts b/examples/src/lemonslice_realtime_avatar.ts
index b2afc544b..c5d7c9ab8 100644
--- a/examples/src/lemonslice_realtime_avatar.ts
+++ b/examples/src/lemonslice_realtime_avatar.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,16 +11,11 @@ import {
   voice,
 } from '@livekit/agents';
 import * as lemonslice from '@livekit/agents-plugin-lemonslice';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 initializeLogger({ pretty: true });
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     try {
       const agent = new voice.Agent({
@@ -40,8 +34,6 @@ export default defineAgent({
           model: 'cartesia/sonic-3',
           voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
         }),
-        turnDetection: new livekit.turnDetector.MultilingualModel(),
-        vad: ctx.proc.userData.vad! as silero.VAD,
         turnHandling: {
           interruption: {
             resumeFalseInterruption: false,
diff --git a/examples/src/liveavatar_avatar.ts b/examples/src/liveavatar_avatar.ts
index fe502d3b3..bc428142d 100644
--- a/examples/src/liveavatar_avatar.ts
+++ b/examples/src/liveavatar_avatar.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -13,14 +12,9 @@ import {
   voice,
 } from '@livekit/agents';
 import * as liveavatar from '@livekit/agents-plugin-liveavatar';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log().child({ example: 'liveavatar_avatar' });
 
@@ -39,8 +33,6 @@ export default defineAgent({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
-      vad: ctx.proc.userData.vad! as silero.VAD,
       voiceOptions: {
         preemptiveGeneration: true,
       },
diff --git a/examples/src/llm_fallback_adapter.ts b/examples/src/llm_fallback_adapter.ts
index d053464dc..d3d407214 100644
--- a/examples/src/llm_fallback_adapter.ts
+++ b/examples/src/llm_fallback_adapter.ts
@@ -16,26 +16,14 @@
  * - Configurable timeouts and retry behavior
  * - Event emission when provider availability changes
  */
-import {
-  type JobContext,
-  type JobProcess,
-  ServerOptions,
-  cli,
-  defineAgent,
-  llm,
-  voice,
-} from '@livekit/agents';
+import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents';
 import * as deepgram from '@livekit/agents-plugin-deepgram';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     // Create multiple LLM instances for fallback
     // The FallbackAdapter will try them in order: primary -> secondary -> tertiary
@@ -85,7 +73,6 @@ export default defineAgent({
     });
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new deepgram.STT(),
       tts: new elevenlabs.TTS(),
       llm: fallbackLLM, // Use the FallbackAdapter instead of a single LLM
diff --git a/examples/src/manual_shutdown.ts b/examples/src/manual_shutdown.ts
index 96bedb901..56f770ca0 100644
--- a/examples/src/manual_shutdown.ts
+++ b/examples/src/manual_shutdown.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,16 +10,11 @@ import {
   llm,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const agent = new voice.Agent({
       instructions:
@@ -66,8 +60,6 @@ export default defineAgent({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      vad: ctx.proc.userData.vad! as silero.VAD,
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
       voiceOptions: {
         preemptiveGeneration: true,
       },
diff --git a/examples/src/multi_agent.ts b/examples/src/multi_agent.ts
index 7f4819bed..263ba6093 100644
--- a/examples/src/multi_agent.ts
+++ b/examples/src/multi_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   dedent,
@@ -12,8 +11,6 @@ import {
   llm,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -72,14 +69,10 @@ class StoryAgent extends voice.Agent<StoryData> {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const userdata: StoryData = {};
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       tts: new inference.TTS({
         model: 'cartesia/sonic-3',
@@ -89,7 +82,6 @@ export default defineAgent({
       // to use realtime model, replace the stt, llm, tts and vad with the following
       // llm: new openai.realtime.RealtimeModel(),
       userData: userdata,
-      turnDetection: new livekit.turnDetector.EnglishModel(),
     });
 
     await session.start({
diff --git a/examples/src/push_to_talk.ts b/examples/src/push_to_talk.ts
index ecba61363..06dbba40a 100644
--- a/examples/src/push_to_talk.ts
+++ b/examples/src/push_to_talk.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,7 +10,6 @@ import {
   initializeLogger,
   voice,
 } from '@livekit/agents';
-import * as silero from '@livekit/agents-plugin-silero';
 import type { ChatContext, ChatMessage } from 'agents/dist/llm/chat_context.js';
 import { fileURLToPath } from 'node:url';
 
@@ -25,14 +23,10 @@ class MyAgent extends voice.Agent {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     initializeLogger({ pretty: true });
 
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       tts: new inference.TTS({
diff --git a/examples/src/raw_function_description.ts b/examples/src/raw_function_description.ts
index 6548fd011..6a1d744a5 100644
--- a/examples/src/raw_function_description.ts
+++ b/examples/src/raw_function_description.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,8 +10,6 @@ import {
   llm,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 function createRawFunctionAgent() {
@@ -48,14 +45,8 @@ function createRawFunctionAgent() {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
-    const vad = ctx.proc.userData.vad! as silero.VAD;
-
     const session = new voice.AgentSession({
-      vad,
       stt: new inference.STT({
         model: 'deepgram/nova-3',
         language: 'en',
@@ -68,7 +59,6 @@ export default defineAgent({
       // to use realtime model, replace the stt, llm, tts and vad with the following
       // llm: new openai.realtime.RealtimeModel(),
       userData: { number: 0 },
-      turnDetection: new livekit.turnDetector.EnglishModel(),
     });
 
     await session.start({
diff --git a/examples/src/realtime_agent.ts b/examples/src/realtime_agent.ts
index b30171776..a6879262b 100644
--- a/examples/src/realtime_agent.ts
+++ b/examples/src/realtime_agent.ts
@@ -1,17 +1,8 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import {
-  type JobContext,
-  type JobProcess,
-  ServerOptions,
-  cli,
-  defineAgent,
-  llm,
-  voice,
-} from '@livekit/agents';
+import { type JobContext, ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { readFileSync } from 'node:fs';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
@@ -19,9 +10,6 @@ import { z } from 'zod';
 const roomNameSchema = z.enum(['bedroom', 'living room', 'kitchen', 'bathroom', 'office']);
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const getWeather = llm.tool({
       description: ' Called when the user asks about the weather.',
diff --git a/examples/src/realtime_turn_detector.ts b/examples/src/realtime_turn_detector.ts
index 6e6ff90dd..7057eac31 100644
--- a/examples/src/realtime_turn_detector.ts
+++ b/examples/src/realtime_turn_detector.ts
@@ -3,26 +3,20 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
+  inference,
   voice,
 } from '@livekit/agents';
 import * as deepgram from '@livekit/agents-plugin-deepgram';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
-import * as livekit from '@livekit/agents-plugin-livekit';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       stt: new deepgram.STT(),
       tts: new elevenlabs.TTS(),
       // To use OpenAI Realtime API
@@ -33,7 +27,7 @@ export default defineAgent({
         turnDetection: null,
         inputAudioTranscription: null,
       }),
-      turnDetection: new livekit.turnDetector.EnglishModel(),
+      turnDetection: new inference.TurnDetector(),
     });
 
     await session.start({
diff --git a/examples/src/realtime_with_tts.ts b/examples/src/realtime_with_tts.ts
index d87db7853..05df047be 100644
--- a/examples/src/realtime_with_tts.ts
+++ b/examples/src/realtime_with_tts.ts
@@ -1,27 +1,14 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import {
-  type JobContext,
-  type JobProcess,
-  ServerOptions,
-  cli,
-  defineAgent,
-  llm,
-  log,
-  voice,
-} from '@livekit/agents';
+import { type JobContext, ServerOptions, cli, defineAgent, llm, log, voice } from '@livekit/agents';
 import * as cartesia from '@livekit/agents-plugin-cartesia';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log();
 
diff --git a/examples/src/restaurant_agent.ts b/examples/src/restaurant_agent.ts
index d9faaf9a5..081552c1a 100644
--- a/examples/src/restaurant_agent.ts
+++ b/examples/src/restaurant_agent.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   dedent,
@@ -12,7 +11,6 @@ import {
   llm,
   voice,
 } from '@livekit/agents';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -358,9 +356,6 @@ function createCheckoutAgent(menu: string) {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const menu = 'Pizza: $10, Salad: $5, Ice Cream: $3, Coffee: $2';
     const userData = createUserData({
@@ -370,9 +365,10 @@ export default defineAgent({
       checkout: createCheckoutAgent(menu),
     });
 
-    const vad = ctx.proc.userData.vad! as silero.VAD;
     const session = new voice.AgentSession({
-      vad,
+      // VAD is auto-provisioned by AgentSession (bundled silero via
+      // @livekit/local-inference). Pass `vad: null` to opt out, or pass
+      // your own `new inference.VAD({ ... })` to customise.
       stt: new inference.STT({ model: 'deepgram/nova-3' }),
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       tts: new inference.TTS({ model: 'cartesia/sonic-3' }),
diff --git a/examples/src/runway_avatar.ts b/examples/src/runway_avatar.ts
index 3d3cde0bd..dd0c3aaf5 100644
--- a/examples/src/runway_avatar.ts
+++ b/examples/src/runway_avatar.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -13,17 +12,12 @@ import {
 } from '@livekit/agents';
 import * as google from '@livekit/agents-plugin-google';
 import * as runway from '@livekit/agents-plugin-runway';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log();
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad! as silero.VAD,
       llm: new google.realtime.RealtimeModel({
         thinkingConfig: { includeThoughts: false },
       }),
diff --git a/examples/src/telephony_amd.ts b/examples/src/telephony_amd.ts
index 424221935..d22e8e3d1 100644
--- a/examples/src/telephony_amd.ts
+++ b/examples/src/telephony_amd.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -11,8 +10,6 @@ import {
   log,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { TrackKind } from '@livekit/rtc-node';
 import { RoomServiceClient, SipClient } from 'livekit-server-sdk';
 import { fileURLToPath } from 'node:url';
@@ -41,9 +38,6 @@ class MyAgent extends voice.Agent {
  *   SIP_PARTICIPANT_IDENTITY  — identity to assign the dialed participant
  */
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const logger = log().child({ room: ctx.room.name });
 
@@ -57,10 +51,6 @@ export default defineAgent({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      turnHandling: {
-        turnDetection: new livekit.turnDetector.MultilingualModel(),
-      },
-      vad: ctx.proc.userData.vad! as silero.VAD,
       preemptiveGeneration: true,
     });
 
diff --git a/examples/src/tool_call_disfluency.ts b/examples/src/tool_call_disfluency.ts
index 8f92183a8..c89917d54 100644
--- a/examples/src/tool_call_disfluency.ts
+++ b/examples/src/tool_call_disfluency.ts
@@ -4,7 +4,6 @@
 import {
   AutoSubscribe,
   type JobContext,
-  type JobProcess,
   ServerOptions,
   cli,
   defineAgent,
@@ -12,9 +11,7 @@ import {
   voice,
 } from '@livekit/agents';
 import * as elevenlabs from '@livekit/agents-plugin-elevenlabs';
-import * as livekit from '@livekit/agents-plugin-livekit';
 import * as openai from '@livekit/agents-plugin-openai';
-import * as silero from '@livekit/agents-plugin-silero';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
 
@@ -30,13 +27,9 @@ class VoiceAgent extends voice.Agent {
 }
 
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     await ctx.connect(undefined, AutoSubscribe.AUDIO_ONLY, undefined);
     await ctx.waitForParticipant();
-    const vad = ctx.proc.userData.vad! as silero.VAD;
 
     const getWeather = llm.tool({
       description: ' Called when the user asks about the weather.',
@@ -61,10 +54,8 @@ export default defineAgent({
     });
 
     const session = new voice.AgentSession({
-      vad,
       llm: new openai.realtime.RealtimeModel(),
       tts: new elevenlabs.TTS(),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
     });
 
     await session.start({
diff --git a/examples/src/warm_transfer.ts b/examples/src/warm_transfer.ts
index d2d56e7f1..546993724 100644
--- a/examples/src/warm_transfer.ts
+++ b/examples/src/warm_transfer.ts
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 import {
   type JobContext,
-  type JobProcess,
   ServerOptions,
   beta,
   cli,
@@ -13,8 +12,6 @@ import {
   log,
   voice,
 } from '@livekit/agents';
-import * as livekit from '@livekit/agents-plugin-livekit';
-import * as silero from '@livekit/agents-plugin-silero';
 import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
 import { fileURLToPath } from 'node:url';
 
@@ -95,20 +92,19 @@ Examples on when the tool should be called:
   }
 }
 
+// No prewarm hook needed: the local EOT model runs in the shared inference
+// process (loaded once per host), and the inference VAD (~2MB, in-process)
+// lazy-loads on first stream.
 export default defineAgent({
-  prewarm: async (proc: JobProcess) => {
-    proc.userData.vad = await silero.VAD.load();
-  },
   entry: async (ctx: JobContext) => {
     const session = new voice.AgentSession({
-      vad: ctx.proc.userData.vad as silero.VAD,
+      vad: new inference.VAD(),
       llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
       stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
       tts: new inference.TTS({
         model: 'cartesia/sonic-3',
         voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
       }),
-      turnDetection: new livekit.turnDetector.MultilingualModel(),
     });
 
     await session.start({
diff --git a/plugins/livekit/src/turn_detector/base.ts b/plugins/livekit/src/turn_detector/base.ts
index 93ecdd7f9..3fa1dd139 100644
--- a/plugins/livekit/src/turn_detector/base.ts
+++ b/plugins/livekit/src/turn_detector/base.ts
@@ -231,8 +231,11 @@ export abstract class EOUModel {
     return (await this.unlikelyThreshold(language)) !== undefined;
   }
 
+  // `_timeoutMs` is part of the unified `_TurnDetector` contract (milliseconds,
+  // matching the audio EOT detector). Text-based inference is bounded by the IPC
+  // executor itself, so this detector does not use the value.
   // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  async predictEndOfTurn(chatCtx: llm.ChatContext, timeout: number = 3): Promise<number> {
+  async predictEndOfTurn(chatCtx: llm.ChatContext, _timeoutMs?: number): Promise<number> {
     let messages: RawChatItem[] = [];
 
     for (const message of chatCtx.items) {
diff --git a/plugins/livekit/src/turn_detector/index.ts b/plugins/livekit/src/turn_detector/index.ts
index 8ffad4c1b..fe64920aa 100644
--- a/plugins/livekit/src/turn_detector/index.ts
+++ b/plugins/livekit/src/turn_detector/index.ts
@@ -6,6 +6,13 @@ import { extname } from 'node:path';
 import { INFERENCE_METHOD_EN } from './english.js';
 import { INFERENCE_METHOD_MULTILINGUAL } from './multilingual.js';
 
+console.warn(
+  'The text-based turn detector from @livekit/agents-plugin-livekit is deprecated. ' +
+    'The audio EOT detector in `@livekit/agents` inference (TurnDetector) replaces ' +
+    'it and runs natively on-device via @livekit/local-inference. ' +
+    'This text-based path will be removed in a future release.',
+);
+
 export { EOUModel } from './base.js';
 export { EnglishModel } from './english.js';
 export { MultilingualModel } from './multilingual.js';
diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts
index 57e94ba8d..cd0423913 100644
--- a/plugins/livekit/src/turn_detector/multilingual.ts
+++ b/plugins/livekit/src/turn_detector/multilingual.ts
@@ -68,10 +68,10 @@ export class MultilingualModel extends EOUModel {
     return threshold;
   }
 
-  async predictEndOfTurn(chatCtx: llm.ChatContext, timeout: number = 3): Promise<number> {
+  async predictEndOfTurn(chatCtx: llm.ChatContext, timeoutMs?: number): Promise<number> {
     const url = remoteInferenceUrl();
     if (!url) {
-      return await super.predictEndOfTurn(chatCtx, timeout);
+      return await super.predictEndOfTurn(chatCtx, timeoutMs);
     }
 
     // Copy and process chat context similar to Python implementation
diff --git a/plugins/silero/src/index.ts b/plugins/silero/src/index.ts
index 2b5b67fb6..41a4dc96e 100644
--- a/plugins/silero/src/index.ts
+++ b/plugins/silero/src/index.ts
@@ -5,6 +5,14 @@ import { Plugin } from '@livekit/agents';
 
 export { VAD, VADStream } from './vad.js';
 
+console.warn(
+  '@livekit/agents-plugin-silero is deprecated and will be removed in v2.0. ' +
+    'AgentSession now defaults to the bundled silero VAD (via @livekit/local-inference); ' +
+    'drop the explicit `vad=` argument entirely, pass `vad: null` to opt out, or use ' +
+    "`import { inference } from '@livekit/agents'; new inference.VAD({ model: 'silero', ... })` " +
+    'to customise options.',
+);
+
 class SileroPlugin extends Plugin {
   constructor() {
     super({
diff --git a/plugins/silero/src/vad.test.ts b/plugins/silero/src/vad.test.ts
index ac59ba5cf..89b1df17b 100644
--- a/plugins/silero/src/vad.test.ts
+++ b/plugins/silero/src/vad.test.ts
@@ -1,12 +1,18 @@
 // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { AudioByteStream, type VADEvent, VADEventType, mergeFrames } from '@livekit/agents';
+import {
+  AudioByteStream,
+  type VADEvent,
+  VADEventType,
+  type VADStream,
+  mergeFrames,
+} from '@livekit/agents';
 import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
 import { readFileSync } from 'node:fs';
 import { join } from 'node:path';
 import { describe, expect, it } from 'vitest';
-import { VAD, type VADStream } from './vad.js';
+import { VAD } from './vad.js';
 
 const TARGET_SAMPLE_RATE = 16000;
 const CHUNK_DURATION_MS = 10;
diff --git a/plugins/silero/src/vad.ts b/plugins/silero/src/vad.ts
index b78d87a47..970017611 100644
--- a/plugins/silero/src/vad.ts
+++ b/plugins/silero/src/vad.ts
@@ -6,6 +6,7 @@ import {
   VADEventType,
   VADStream as baseStream,
   VAD as baseVAD,
+  inference,
   log,
   mergeFrames,
 } from '@livekit/agents';
@@ -97,12 +98,41 @@ export class VAD extends baseVAD {
    * @param options -
    * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
    */
-  static async load(opts: Partial<VADOptions> = {}): Promise<VAD> {
+  static async load(opts: Partial<VADOptions> = {}): Promise<baseVAD> {
     const mergedOpts: VADOptions = { ...defaultVADOptions, ...opts };
+
+    // When the requested settings are compatible with the bundled native
+    // implementation in `@livekit/local-inference`, delegate to
+    // `inference.VAD({ model: 'silero' })` so existing call sites transparently
+    // get the bundled native (in-process) path as part of the silero deprecation.
+    // The native lib only ships the 16 kHz model, so any other sample rate
+    // falls back to the legacy onnxruntime path below.
+    if (mergedOpts.sampleRate === 16000) {
+      if (!mergedOpts.forceCPU) {
+        log().warn(
+          'forceCPU=false is ignored when using the bundled native VAD; the ' +
+            'model runs CPU-only. Use a non-16kHz sampleRate to keep the legacy ' +
+            'onnxruntime path that honors forceCPU.',
+        );
+      }
+      return new inference.VAD({
+        model: 'silero',
+        minSpeechDuration: mergedOpts.minSpeechDuration,
+        minSilenceDuration: mergedOpts.minSilenceDuration,
+        prefixPaddingDuration: mergedOpts.prefixPaddingDuration,
+        maxBufferedSpeech: mergedOpts.maxBufferedSpeech,
+        activationThreshold: mergedOpts.activationThreshold,
+      });
+    }
+
     const session = await newInferenceSession(mergedOpts.forceCPU);
     return new VAD(session, mergedOpts);
   }
 
+  override get minSilenceDuration(): number {
+    return this.#opts.minSilenceDuration;
+  }
+
   stream(): VADStream {
     const stream = new VADStream(
       this,
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 834973df5..108d9fdd8 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -48,7 +48,7 @@ importers:
         version: 6.21.0(eslint@8.57.0)(typescript@5.9.3)
       '@vitest/coverage-v8':
         specifier: 4.0.17
-        version: 4.0.17(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)))
+        version: 4.0.17(vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0))
       eslint:
         specifier: ^8.56.0
         version: 8.57.0
@@ -102,7 +102,7 @@ importers:
         version: 7.3.2(@types/node@22.19.1)(tsx@4.21.0)
       vitest:
         specifier: ^4.0.17
-        version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
+        version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0)
 
   agents:
     dependencies:
@@ -112,12 +112,15 @@ importers:
       '@ffmpeg-installer/ffmpeg':
         specifier: ^1.1.0
         version: 1.1.0
+      '@livekit/local-inference':
+        specifier: ^0.2.5
+        version: 0.2.5
       '@livekit/mutex':
         specifier: ^1.1.1
         version: 1.1.1
       '@livekit/protocol':
-        specifier: ^1.46.4
-        version: 1.46.4
+        specifier: ^1.46.5
+        version: 1.46.6
       '@livekit/throws-transformer':
         specifier: 0.1.8
         version: 0.1.8(typescript@5.9.3)
@@ -374,7 +377,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: ^4.0.17
-        version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
+        version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0)
       zod:
         specifier: ^4.1.12
         version: 4.3.6
@@ -993,7 +996,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: ^4.0.17
-        version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))
+        version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(tsx@4.21.0)
 
   plugins/neuphonic:
     dependencies:
@@ -1328,7 +1331,7 @@ importers:
         version: 1.0.16
       vitest:
         specifier: ^4.0.17
-        version: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
+        version: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0)
     devDependencies:
       '@livekit/agents':
         specifier: workspace:*
@@ -2130,6 +2133,35 @@ packages:
   '@livekit/changesets-changelog-github@0.0.4':
     resolution: {integrity: sha512-MXaiLYwgkYciZb8G2wkVtZ1pJJzZmVx5cM30Q+ClslrIYyAqQhRbPmZDM79/5CGxb1MTemR/tfOM25tgJgAK0g==}
 
+  '@livekit/local-inference-darwin-arm64@0.2.5':
+    resolution: {integrity: sha512-tdAGJRiYwko0rOmeI/dXf7Mo5TF+oeWDsK55Ga/2PZ/SHuYZ8jkJAPRaG1k78ePsJ119lySWZsxnJdVnOJowRA==}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@livekit/local-inference-darwin-x64@0.2.5':
+    resolution: {integrity: sha512-FeJUHbx1swyAssS/X9CoI8s4OqeSrYJy/xhKhL0VnH1b5tlVfc6V5OjkLNZl55Jw9JYj0YkYpt0m0OIg3SvYRw==}
+    cpu: [x64]
+    os: [darwin]
+
+  '@livekit/local-inference-linux-arm64-gnu@0.2.5':
+    resolution: {integrity: sha512-hXigtVBLS55wT6oOfpDl2Xh6mhfzsrMxvkLftFFfttjFfFjSouuxkxG5NgQTGP01DGAvYO6mnIP8ASK6livr1w==}
+    cpu: [arm64]
+    os: [linux]
+
+  '@livekit/local-inference-linux-x64-gnu@0.2.5':
+    resolution: {integrity: sha512-3unNMNNc9rLCvGH6f3W6DKd4AlF5Z63mdOh9bGtEDZdPon/h7O3oWo9+6N/sHgULfHyD/vZn2NtT4MLtuhoJIw==}
+    cpu: [x64]
+    os: [linux]
+
+  '@livekit/local-inference-win32-x64-msvc@0.2.5':
+    resolution: {integrity: sha512-3s9paiOPwU+TQYPHNLzMxm/xCoZ8swzt8GF2BZSofI/jL2ao4SK1J3D23JEZuQfuZF4iLZm2dlIxMqAodQ9TCA==}
+    cpu: [x64]
+    os: [win32]
+
+  '@livekit/local-inference@0.2.5':
+    resolution: {integrity: sha512-0n2m4pld1jMqgeZyHs4+3q9gPzq0ousrx3wA8kULAoia/464uIsJ3JqrVGnH8yD4P/yrGeK11VpZ87S+hKeMAQ==}
+    engines: {node: '>=18.0.0'}
+
   '@livekit/mutex@1.1.1':
     resolution: {integrity: sha512-EsshAucklmpuUAfkABPxJNhzj9v2sG7JuzFDL4ML1oJQSV14sqrpTYnsaOudMAw9yOaW53NU3QQTlUQoRs4czw==}
 
@@ -2161,8 +2193,8 @@ packages:
     cpu: [x64]
     os: [win32]
 
-  '@livekit/protocol@1.46.4':
-    resolution: {integrity: sha512-yJZ8xvyVcs9CczK2V/EQQrSW0MA9VaZ1vL+FI6fd85KhIjfOg26HvrdUl2LZPT78Tu4R4opV4AW58eN5vgmzqg==}
+  '@livekit/protocol@1.46.6':
+    resolution: {integrity: sha512-upzlHP1vi/kZ/QqALZTFskQ0ifqc2f15RKucHYOsIHJsaXvEYanG75mAb7o+Yomfs4XhQ4BaRsdY+TFHXpaqrg==}
 
   '@livekit/rtc-ffi-bindings-darwin-arm64@0.12.60':
     resolution: {integrity: sha512-YHXqybkYfaTc3txJXXWoVogiSP3yKJdkaZlIlZ6IDMGnN9elUoHDYU+ZSn/rbdGu0pp4HUOzffXkbkItN735Bw==}
@@ -2697,8 +2729,8 @@ packages:
   '@types/argparse@1.0.38':
     resolution: {integrity: sha512-ebDJ9b0e702Yr7pWgB0jzm+CX4Srzz8RcXtLJDJB+BSccqMa36uyH/zUsSYao5+BD1ytv3k3rPYCq4mAE1hsXA==}
 
-  '@types/chai@5.2.3':
-    resolution: {integrity: sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==}
+  '@types/chai@5.2.2':
+    resolution: {integrity: sha512-8kB30R7Hwqf40JPiKhVzodJs2Qc1ZJ5zuT3uzw5Hq/dhNCl3G3l83jfpdI1e20BP348+fV7VIL/+FxaXkqBmWg==}
 
   '@types/deep-eql@4.0.2':
     resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==}
@@ -2706,9 +2738,6 @@ packages:
   '@types/estree@1.0.8':
     resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==}
 
-  '@types/estree@1.0.9':
-    resolution: {integrity: sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg==}
-
   '@types/fluent-ffmpeg@2.1.28':
     resolution: {integrity: sha512-5ovxsDwBcPfJ+eYs1I/ZpcYCnkce7pvH9AHSvrZllAp1ZPpTRDZAFjF3TRFbukxSgIYTTNYePbS0rKUmaxVbXw==}
 
@@ -2816,14 +2845,14 @@ packages:
       '@vitest/browser':
         optional: true
 
-  '@vitest/expect@4.1.0':
-    resolution: {integrity: sha512-EIxG7k4wlWweuCLG9Y5InKFwpMEOyrMb6ZJ1ihYu02LVj/bzUwn2VMU+13PinsjRW75XnITeFrQBMH5+dLvCDA==}
+  '@vitest/expect@4.0.17':
+    resolution: {integrity: sha512-mEoqP3RqhKlbmUmntNDDCJeTDavDR+fVYkSOw8qRwJFaW/0/5zA9zFeTrHqNtcmwh6j26yMmwx2PqUDPzt5ZAQ==}
 
-  '@vitest/mocker@4.1.0':
-    resolution: {integrity: sha512-evxREh+Hork43+Y4IOhTo+h5lGmVRyjqI739Rz4RlUPqwrkFFDF6EMvOOYjTx4E8Tl6gyCLRL8Mu7Ry12a13Tw==}
+  '@vitest/mocker@4.0.17':
+    resolution: {integrity: sha512-+ZtQhLA3lDh1tI2wxe3yMsGzbp7uuJSWBM1iTIKCbppWTSBN09PUC+L+fyNlQApQoR+Ps8twt2pbSSXg2fQVEQ==}
     peerDependencies:
       msw: ^2.4.9
-      vite: ^6.0.0 || ^7.0.0 || ^8.0.0-0
+      vite: ^6.0.0 || ^7.0.0-0
     peerDependenciesMeta:
       msw:
         optional: true
@@ -2833,24 +2862,18 @@ packages:
   '@vitest/pretty-format@4.0.17':
     resolution: {integrity: sha512-Ah3VAYmjcEdHg6+MwFE17qyLqBHZ+ni2ScKCiW2XrlSBV4H3Z7vYfPfz7CWQ33gyu76oc0Ai36+kgLU3rfF4nw==}
 
-  '@vitest/pretty-format@4.1.0':
-    resolution: {integrity: sha512-3RZLZlh88Ib0J7NQTRATfc/3ZPOnSUn2uDBUoGNn5T36+bALixmzphN26OUD3LRXWkJu4H0s5vvUeqBiw+kS0A==}
+  '@vitest/runner@4.0.17':
+    resolution: {integrity: sha512-JmuQyf8aMWoo/LmNFppdpkfRVHJcsgzkbCA+/Bk7VfNH7RE6Ut2qxegeyx2j3ojtJtKIbIGy3h+KxGfYfk28YQ==}
 
-  '@vitest/runner@4.1.0':
-    resolution: {integrity: sha512-Duvx2OzQ7d6OjchL+trw+aSrb9idh7pnNfxrklo14p3zmNL4qPCDeIJAK+eBKYjkIwG96Bc6vYuxhqDXQOWpoQ==}
+  '@vitest/snapshot@4.0.17':
+    resolution: {integrity: sha512-npPelD7oyL+YQM2gbIYvlavlMVWUfNNGZPcu0aEUQXt7FXTuqhmgiYupPnAanhKvyP6Srs2pIbWo30K0RbDtRQ==}
 
-  '@vitest/snapshot@4.1.0':
-    resolution: {integrity: sha512-0Vy9euT1kgsnj1CHttwi9i9o+4rRLEaPRSOJ5gyv579GJkNpgJK+B4HSv/rAWixx2wdAFci1X4CEPjiu2bXIMg==}
-
-  '@vitest/spy@4.1.0':
-    resolution: {integrity: sha512-pz77k+PgNpyMDv2FV6qmk5ZVau6c3R8HC8v342T2xlFxQKTrSeYw9waIJG8KgV9fFwAtTu4ceRzMivPTH6wSxw==}
+  '@vitest/spy@4.0.17':
+    resolution: {integrity: sha512-I1bQo8QaP6tZlTomQNWKJE6ym4SHf3oLS7ceNjozxxgzavRAgZDc06T7kD8gb9bXKEgcLNt00Z+kZO6KaJ62Ew==}
 
   '@vitest/utils@4.0.17':
     resolution: {integrity: sha512-RG6iy+IzQpa9SB8HAFHJ9Y+pTzI+h8553MrciN9eC6TFBErqrQaTas4vG+MVj8S4uKk8uTT2p0vgZPnTdxd96w==}
 
-  '@vitest/utils@4.1.0':
-    resolution: {integrity: sha512-XfPXT6a8TZY3dcGY8EdwsBulFCIw+BeeX0RZn2x/BtiY/75YGh8FeWGG8QISN/WhaqSrE2OrlDgtF8q5uhOTmw==}
-
   abort-controller@3.0.0:
     resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==}
     engines: {node: '>=6.5'}
@@ -2958,10 +2981,6 @@ packages:
     resolution: {integrity: sha512-bMxMKAjg13EBSVscxTaYA4mRc5t1UAXa2kXiGTNfZ079HIWXEkKmkgFrh/nJqamaLSrXO5H4WFFkPEaLJWbs3A==}
     engines: {node: '>= 0.4'}
 
-  assertion-error@2.0.1:
-    resolution: {integrity: sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==}
-    engines: {node: '>=12'}
-
   ast-types-flow@0.0.8:
     resolution: {integrity: sha512-OH/2E5Fg20h2aPrbe+QL8JZQFko0YZaF+j4mnQ7BGhfavO7OpSLa8a0y9sBwomHdSbkhTS8TQNayBfnW5DwbvQ==}
 
@@ -3128,9 +3147,6 @@ packages:
     resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==}
     engines: {node: ^14.18.0 || >=16.10.0}
 
-  convert-source-map@2.0.0:
-    resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==}
-
   cross-spawn@7.0.6:
     resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
     engines: {node: '>= 8'}
@@ -3286,8 +3302,8 @@ packages:
     resolution: {integrity: sha512-zoMwbCcH5hwUkKJkT8kDIBZSz9I6mVG//+lDCinLCGov4+r7NIy0ld8o03M0cJxl2spVf6ESYVS6/gpIfq1FFw==}
     engines: {node: '>= 0.4'}
 
-  es-module-lexer@2.1.0:
-    resolution: {integrity: sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ==}
+  es-module-lexer@1.7.0:
+    resolution: {integrity: sha512-jEQoCwk8hyb2AZziIOLhDqpm5+2ww5uIE6lkO/6jcOCusfk6LhMHpXXfBLXTZ7Ydyt0j4VoUQv6uGNYbdW+kBA==}
 
   es-object-atoms@1.1.1:
     resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==}
@@ -4246,6 +4262,10 @@ packages:
     resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==}
     engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
 
+  node-gyp-build@4.8.4:
+    resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==}
+    hasBin: true
+
   object-assign@4.1.1:
     resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
     engines: {node: '>=0.10.0'}
@@ -4747,9 +4767,6 @@ packages:
   std-env@3.10.0:
     resolution: {integrity: sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==}
 
-  std-env@4.1.0:
-    resolution: {integrity: sha512-Rq7ybcX2RuC55r9oaPVEW7/xu3tj8u4GeBYHBWCychFtzMIr86A7e3PPEBPT37sHStKX3+TiX/Fr/ACmJLVlLQ==}
-
   string-argv@0.3.2:
     resolution: {integrity: sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==}
     engines: {node: '>=0.6.19'}
@@ -4855,26 +4872,18 @@ packages:
   tinyexec@0.3.2:
     resolution: {integrity: sha512-KQQR9yN7R5+OSwaK0XQoj22pwHoTlgYqmUscPYoknOoWCWfj/5/ABTMRi69FrKU5ffPVh5QcFikpWJI/P1ocHA==}
 
-  tinyexec@1.2.4:
-    resolution: {integrity: sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg==}
+  tinyexec@1.0.2:
+    resolution: {integrity: sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==}
     engines: {node: '>=18'}
 
   tinyglobby@0.2.16:
     resolution: {integrity: sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==}
     engines: {node: '>=12.0.0'}
 
-  tinyglobby@0.2.17:
-    resolution: {integrity: sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==}
-    engines: {node: '>=12.0.0'}
-
   tinyrainbow@3.0.3:
     resolution: {integrity: sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==}
     engines: {node: '>=14.0.0'}
 
-  tinyrainbow@3.1.0:
-    resolution: {integrity: sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==}
-    engines: {node: '>=14.0.0'}
-
   to-fast-properties@2.0.0:
     resolution: {integrity: sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==}
     engines: {node: '>=4'}
@@ -5051,21 +5060,20 @@ packages:
       yaml:
         optional: true
 
-  vitest@4.1.0:
-    resolution: {integrity: sha512-YbDrMF9jM2Lqc++2530UourxZHmkKLxrs4+mYhEwqWS97WJ7wOYEkcr+QfRgJ3PW9wz3odRijLZjHEaRLTNbqw==}
+  vitest@4.0.17:
+    resolution: {integrity: sha512-FQMeF0DJdWY0iOnbv466n/0BudNdKj1l5jYgl5JVTwjSsZSlqyXFt/9+1sEyhR6CLowbZpV7O1sCHrzBhucKKg==}
     engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0}
     hasBin: true
     peerDependencies:
       '@edge-runtime/vm': '*'
       '@opentelemetry/api': ^1.9.0
       '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0
-      '@vitest/browser-playwright': 4.1.0
-      '@vitest/browser-preview': 4.1.0
-      '@vitest/browser-webdriverio': 4.1.0
-      '@vitest/ui': 4.1.0
+      '@vitest/browser-playwright': 4.0.17
+      '@vitest/browser-preview': 4.0.17
+      '@vitest/browser-webdriverio': 4.0.17
+      '@vitest/ui': 4.0.17
       happy-dom: '*'
       jsdom: '*'
-      vite: ^6.0.0 || ^7.0.0 || ^8.0.0-0
     peerDependenciesMeta:
       '@edge-runtime/vm':
         optional: true
@@ -5833,6 +5841,31 @@ snapshots:
     transitivePeerDependencies:
       - encoding
 
+  '@livekit/local-inference-darwin-arm64@0.2.5':
+    optional: true
+
+  '@livekit/local-inference-darwin-x64@0.2.5':
+    optional: true
+
+  '@livekit/local-inference-linux-arm64-gnu@0.2.5':
+    optional: true
+
+  '@livekit/local-inference-linux-x64-gnu@0.2.5':
+    optional: true
+
+  '@livekit/local-inference-win32-x64-msvc@0.2.5':
+    optional: true
+
+  '@livekit/local-inference@0.2.5':
+    dependencies:
+      node-gyp-build: 4.8.4
+    optionalDependencies:
+      '@livekit/local-inference-darwin-arm64': 0.2.5
+      '@livekit/local-inference-darwin-x64': 0.2.5
+      '@livekit/local-inference-linux-arm64-gnu': 0.2.5
+      '@livekit/local-inference-linux-x64-gnu': 0.2.5
+      '@livekit/local-inference-win32-x64-msvc': 0.2.5
+
   '@livekit/mutex@1.1.1': {}
 
   '@livekit/noise-cancellation-darwin-arm64@0.1.9':
@@ -5861,7 +5894,7 @@ snapshots:
   '@livekit/noise-cancellation-win32-x64@0.1.9':
     optional: true
 
-  '@livekit/protocol@1.46.4':
+  '@livekit/protocol@1.46.6':
     dependencies:
       '@bufbuild/protobuf': 1.10.1
 
@@ -6461,17 +6494,14 @@ snapshots:
 
   '@types/argparse@1.0.38': {}
 
-  '@types/chai@5.2.3':
+  '@types/chai@5.2.2':
     dependencies:
       '@types/deep-eql': 4.0.2
-      assertion-error: 2.0.1
 
   '@types/deep-eql@4.0.2': {}
 
   '@types/estree@1.0.8': {}
 
-  '@types/estree@1.0.9': {}
-
   '@types/fluent-ffmpeg@2.1.28':
     dependencies:
       '@types/node': 22.19.1
@@ -6593,7 +6623,7 @@ snapshots:
 
   '@ungap/structured-clone@1.2.0': {}
 
-  '@vitest/coverage-v8@4.0.17(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)))':
+  '@vitest/coverage-v8@4.0.17(vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0))':
     dependencies:
       '@bcoe/v8-coverage': 1.0.2
       '@vitest/utils': 4.0.17
@@ -6605,28 +6635,28 @@ snapshots:
       obug: 2.1.1
       std-env: 3.10.0
       tinyrainbow: 3.0.3
-      vitest: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
+      vitest: 4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0)
 
-  '@vitest/expect@4.1.0':
+  '@vitest/expect@4.0.17':
     dependencies:
       '@standard-schema/spec': 1.1.0
-      '@types/chai': 5.2.3
-      '@vitest/spy': 4.1.0
-      '@vitest/utils': 4.1.0
+      '@types/chai': 5.2.2
+      '@vitest/spy': 4.0.17
+      '@vitest/utils': 4.0.17
       chai: 6.2.2
-      tinyrainbow: 3.1.0
+      tinyrainbow: 3.0.3
 
-  '@vitest/mocker@4.1.0(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))':
+  '@vitest/mocker@4.0.17(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))':
     dependencies:
-      '@vitest/spy': 4.1.0
+      '@vitest/spy': 4.0.17
       estree-walker: 3.0.3
       magic-string: 0.30.21
     optionalDependencies:
       vite: 7.3.2(@types/node@22.19.1)(tsx@4.21.0)
 
-  '@vitest/mocker@4.1.0(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))':
+  '@vitest/mocker@4.0.17(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))':
     dependencies:
-      '@vitest/spy': 4.1.0
+      '@vitest/spy': 4.0.17
       estree-walker: 3.0.3
       magic-string: 0.30.21
     optionalDependencies:
@@ -6636,35 +6666,24 @@ snapshots:
     dependencies:
       tinyrainbow: 3.0.3
 
-  '@vitest/pretty-format@4.1.0':
+  '@vitest/runner@4.0.17':
     dependencies:
-      tinyrainbow: 3.1.0
-
-  '@vitest/runner@4.1.0':
-    dependencies:
-      '@vitest/utils': 4.1.0
+      '@vitest/utils': 4.0.17
       pathe: 2.0.3
 
-  '@vitest/snapshot@4.1.0':
+  '@vitest/snapshot@4.0.17':
     dependencies:
-      '@vitest/pretty-format': 4.1.0
-      '@vitest/utils': 4.1.0
+      '@vitest/pretty-format': 4.0.17
       magic-string: 0.30.21
       pathe: 2.0.3
 
-  '@vitest/spy@4.1.0': {}
+  '@vitest/spy@4.0.17': {}
 
   '@vitest/utils@4.0.17':
     dependencies:
       '@vitest/pretty-format': 4.0.17
       tinyrainbow: 3.0.3
 
-  '@vitest/utils@4.1.0':
-    dependencies:
-      '@vitest/pretty-format': 4.1.0
-      convert-source-map: 2.0.0
-      tinyrainbow: 3.1.0
-
   abort-controller@3.0.0:
     dependencies:
       event-target-shim: 5.0.1
@@ -6794,8 +6813,6 @@ snapshots:
       is-array-buffer: 3.0.4
       is-shared-array-buffer: 1.0.3
 
-  assertion-error@2.0.1: {}
-
   ast-types-flow@0.0.8: {}
 
   ast-v8-to-istanbul@0.3.10:
@@ -6945,8 +6962,6 @@ snapshots:
 
   consola@3.4.2: {}
 
-  convert-source-map@2.0.0: {}
-
   cross-spawn@7.0.6:
     dependencies:
       path-key: 3.1.1
@@ -7135,7 +7150,7 @@ snapshots:
       iterator.prototype: 1.1.2
       safe-array-concat: 1.1.2
 
-  es-module-lexer@2.1.0: {}
+  es-module-lexer@1.7.0: {}
 
   es-object-atoms@1.1.1:
     dependencies:
@@ -7490,7 +7505,7 @@ snapshots:
 
   estree-walker@3.0.3:
     dependencies:
-      '@types/estree': 1.0.9
+      '@types/estree': 1.0.8
 
   esutils@2.0.3: {}
 
@@ -8069,7 +8084,7 @@ snapshots:
   livekit-server-sdk@2.14.1:
     dependencies:
       '@bufbuild/protobuf': 1.10.1
-      '@livekit/protocol': 1.46.4
+      '@livekit/protocol': 1.46.6
       camelcase-keys: 9.1.3
       jose: 5.2.4
 
@@ -8204,6 +8219,8 @@ snapshots:
       fetch-blob: 3.2.0
       formdata-polyfill: 4.0.10
 
+  node-gyp-build@4.8.4: {}
+
   object-assign@4.1.1: {}
 
   object-inspect@1.13.1: {}
@@ -8808,8 +8825,6 @@ snapshots:
 
   std-env@3.10.0: {}
 
-  std-env@4.1.0: {}
-
   string-argv@0.3.2: {}
 
   string-width@4.2.3:
@@ -8931,22 +8946,15 @@ snapshots:
 
   tinyexec@0.3.2: {}
 
-  tinyexec@1.2.4: {}
+  tinyexec@1.0.2: {}
 
   tinyglobby@0.2.16:
     dependencies:
       fdir: 6.5.0(picomatch@4.0.4)
       picomatch: 4.0.4
 
-  tinyglobby@0.2.17:
-    dependencies:
-      fdir: 6.5.0(picomatch@4.0.4)
-      picomatch: 4.0.4
-
   tinyrainbow@3.0.3: {}
 
-  tinyrainbow@3.1.0: {}
-
   to-fast-properties@2.0.0: {}
 
   to-regex-range@5.0.1:
@@ -9152,61 +9160,81 @@ snapshots:
       fsevents: 2.3.3
       tsx: 4.21.0
 
-  vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0)):
+  vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@22.19.1)(tsx@4.21.0):
     dependencies:
-      '@vitest/expect': 4.1.0
-      '@vitest/mocker': 4.1.0(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
-      '@vitest/pretty-format': 4.1.0
-      '@vitest/runner': 4.1.0
-      '@vitest/snapshot': 4.1.0
-      '@vitest/spy': 4.1.0
-      '@vitest/utils': 4.1.0
-      es-module-lexer: 2.1.0
+      '@vitest/expect': 4.0.17
+      '@vitest/mocker': 4.0.17(vite@7.3.2(@types/node@22.19.1)(tsx@4.21.0))
+      '@vitest/pretty-format': 4.0.17
+      '@vitest/runner': 4.0.17
+      '@vitest/snapshot': 4.0.17
+      '@vitest/spy': 4.0.17
+      '@vitest/utils': 4.0.17
+      es-module-lexer: 1.7.0
       expect-type: 1.3.0
       magic-string: 0.30.21
       obug: 2.1.1
       pathe: 2.0.3
       picomatch: 4.0.4
-      std-env: 4.1.0
+      std-env: 3.10.0
       tinybench: 2.9.0
-      tinyexec: 1.2.4
-      tinyglobby: 0.2.17
-      tinyrainbow: 3.1.0
+      tinyexec: 1.0.2
+      tinyglobby: 0.2.16
+      tinyrainbow: 3.0.3
       vite: 7.3.2(@types/node@22.19.1)(tsx@4.21.0)
       why-is-node-running: 2.3.0
     optionalDependencies:
       '@opentelemetry/api': 1.9.0
       '@types/node': 22.19.1
     transitivePeerDependencies:
+      - jiti
+      - less
+      - lightningcss
       - msw
+      - sass
+      - sass-embedded
+      - stylus
+      - sugarss
+      - terser
+      - tsx
+      - yaml
 
-  vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0)):
+  vitest@4.0.17(@opentelemetry/api@1.9.0)(@types/node@25.6.0)(tsx@4.21.0):
     dependencies:
-      '@vitest/expect': 4.1.0
-      '@vitest/mocker': 4.1.0(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))
-      '@vitest/pretty-format': 4.1.0
-      '@vitest/runner': 4.1.0
-      '@vitest/snapshot': 4.1.0
-      '@vitest/spy': 4.1.0
-      '@vitest/utils': 4.1.0
-      es-module-lexer: 2.1.0
+      '@vitest/expect': 4.0.17
+      '@vitest/mocker': 4.0.17(vite@7.3.2(@types/node@25.6.0)(tsx@4.21.0))
+      '@vitest/pretty-format': 4.0.17
+      '@vitest/runner': 4.0.17
+      '@vitest/snapshot': 4.0.17
+      '@vitest/spy': 4.0.17
+      '@vitest/utils': 4.0.17
+      es-module-lexer: 1.7.0
       expect-type: 1.3.0
       magic-string: 0.30.21
       obug: 2.1.1
       pathe: 2.0.3
       picomatch: 4.0.4
-      std-env: 4.1.0
+      std-env: 3.10.0
       tinybench: 2.9.0
-      tinyexec: 1.2.4
-      tinyglobby: 0.2.17
-      tinyrainbow: 3.1.0
+      tinyexec: 1.0.2
+      tinyglobby: 0.2.16
+      tinyrainbow: 3.0.3
       vite: 7.3.2(@types/node@25.6.0)(tsx@4.21.0)
       why-is-node-running: 2.3.0
     optionalDependencies:
       '@opentelemetry/api': 1.9.0
       '@types/node': 25.6.0
     transitivePeerDependencies:
+      - jiti
+      - less
+      - lightningcss
       - msw
+      - sass
+      - sass-embedded
+      - stylus
+      - sugarss
+      - terser
+      - tsx
+      - yaml
 
   vscode-oniguruma@1.7.0: {}
 
diff --git a/turbo.json b/turbo.json
index b0bc90527..0a25e3eea 100644
--- a/turbo.json
+++ b/turbo.json
@@ -42,6 +42,7 @@
     "LIVEKIT_INFERENCE_URL",
     "LIVEKIT_OUTBOUND_TRUNK_ID",
     "LIVEKIT_URL",
+    "LIVEKIT_WORKER_TOKEN",
     "LLAMA_API_KEY",
     "LIVEKIT_AGENT_ID",
     "LIVEKIT_AGENT_NAME",