From 96ee459db7eb635ef9f0312f31ce3654af18e087 Mon Sep 17 00:00:00 2001
From: Bilal Tahir <bilal@jellypod.ai>
Date: Thu, 11 Jun 2026 00:40:29 +0000
Subject: [PATCH] feat(speechsdk): add speech-sdk multi-provider TTS plugin

New @livekit/agents-plugin-speechsdk package: non-streaming TTS across 15
providers through one provider/model string, including providers without a
dedicated plugin (Murf, Smallest.ai, fal.ai-hosted open-weight models).
Synthesis requests raw PCM and resamples to the configured frame rate with
AudioResampler when a provider's native rate differs. speech-sdk's internal
retry is disabled so the framework's ChunkedStream retry policy owns retries.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .changeset/speechsdk-tts-plugin.md   |   5 +
 CLAUDE.md                            |   2 +-
 README.md                            |   1 +
 plugins/speechsdk/README.md          |  42 ++++
 plugins/speechsdk/api-extractor.json |   5 +
 plugins/speechsdk/package.json       |  52 +++++
 plugins/speechsdk/src/index.ts       |  19 ++
 plugins/speechsdk/src/models.ts      |  28 +++
 plugins/speechsdk/src/tts.test.ts    |  39 ++++
 plugins/speechsdk/src/tts.ts         | 278 +++++++++++++++++++++++++++
 plugins/speechsdk/tsconfig.json      |  14 ++
 plugins/speechsdk/tsup.config.ts     |   7 +
 pnpm-lock.yaml                       |  81 ++++++++
 turbo.json                           |   1 +
 14 files changed, 573 insertions(+), 1 deletion(-)
 create mode 100644 .changeset/speechsdk-tts-plugin.md
 create mode 100644 plugins/speechsdk/README.md
 create mode 100644 plugins/speechsdk/api-extractor.json
 create mode 100644 plugins/speechsdk/package.json
 create mode 100644 plugins/speechsdk/src/index.ts
 create mode 100644 plugins/speechsdk/src/models.ts
 create mode 100644 plugins/speechsdk/src/tts.test.ts
 create mode 100644 plugins/speechsdk/src/tts.ts
 create mode 100644 plugins/speechsdk/tsconfig.json
 create mode 100644 plugins/speechsdk/tsup.config.ts

diff --git a/.changeset/speechsdk-tts-plugin.md b/.changeset/speechsdk-tts-plugin.md
new file mode 100644
index 000000000..8ec015f28
--- /dev/null
+++ b/.changeset/speechsdk-tts-plugin.md
@@ -0,0 +1,5 @@
+---
+'@livekit/agents-plugin-speechsdk': patch
+---
+
+Add the speech-sdk TTS plugin: synthesis across 15 providers via one `provider/model` string, including providers without a dedicated plugin (Murf, Smallest.ai, fal.ai-hosted open-weight models).
diff --git a/CLAUDE.md b/CLAUDE.md
index 14d8b5b82..1c52da0d6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -126,7 +126,7 @@ Plugin capabilities by type:
 
 - **LLM**: openai, google, baseten, mistralai
 - **STT**: deepgram (v1+v2), openai, baseten, sarvam (v1/v2/v3), mistralai, inworld, cartesia
-- **TTS**: cartesia, elevenlabs, deepgram, openai, neuphonic, resemble, rime, inworld, baseten, sarvam (v1/v2/v3), mistralai, fishaudio, hume
+- **TTS**: cartesia, elevenlabs, deepgram, openai, neuphonic, resemble, rime, inworld, baseten, sarvam (v1/v2/v3), mistralai, fishaudio, hume, speechsdk (multi-provider)
 - **VAD**: silero (ONNX-based, local)
 - **EOU/Turn Detection**: livekit (HuggingFace + ONNX)
 - **Realtime**: openai (+ responses/, ws/ modules), google (beta), xai, phonic
diff --git a/README.md b/README.md
index 8b7de522f..0b83b9dc0 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,7 @@ Currently, only the following plugins are supported:
 | [@livekit/agents-plugin-phonic](https://www.npmjs.com/package/@livekit/agents-plugin-phonic)         | Realtime      |
 | [@livekit/agents-plugin-fishaudio](https://www.npmjs.com/package/@livekit/agents-plugin-fishaudio)   | TTS           |
 | [@livekit/agents-plugin-hume](https://www.npmjs.com/package/@livekit/agents-plugin-hume)             | TTS           |
+| [@livekit/agents-plugin-speechsdk](https://www.npmjs.com/package/@livekit/agents-plugin-speechsdk)   | TTS           |
 
 ## Docs and guides
 
diff --git a/plugins/speechsdk/README.md b/plugins/speechsdk/README.md
new file mode 100644
index 000000000..0dc2f0b18
--- /dev/null
+++ b/plugins/speechsdk/README.md
@@ -0,0 +1,42 @@
+<!--
+SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+
+SPDX-License-Identifier: Apache-2.0
+-->
+# speech-sdk plugin for LiveKit Agents
+
+The Agents Framework is designed for building realtime, programmable
+participants that run on servers. Use it to create conversational, multi-modal
+voice agents that can see, hear, and understand.
+
+This package contains the [speech-sdk](https://github.com/Jellypod-Inc/speech-sdk)
+plugin, which allows for voice synthesis across 15 TTS providers through a
+single `provider/model` string, including providers without a dedicated
+LiveKit plugin (Murf, Smallest.ai, and fal.ai-hosted open-weight models such
+as Kokoro):
+
+```ts
+import * as speechsdk from '@livekit/agents-plugin-speechsdk';
+
+const tts = new speechsdk.TTS({ model: 'openai/gpt-4o-mini-tts', voice: 'alloy' });
+// or: { model: 'murf/FALCON', voice: 'en-US-amara' }
+// or: { model: 'fal-ai/kokoro/american-english', voice: 'af_heart' }
+```
+
+By default, calls go directly to the selected provider using your own API key,
+taken from the `apiKey` option or the provider's standard environment variable
+(`OPENAI_API_KEY`, `MURF_API_KEY`, and so on). Alternatively, setting
+`SPEECHBASE_API_KEY` routes the same `provider/model` strings through
+[speechbase.ai](https://speechbase.ai), a hosted gateway, so that one key
+covers every provider.
+
+Synthesis is non-streaming (`AgentSession` wraps it in a sentence-level
+`StreamAdapter` automatically). For latency-critical production agents, a
+dedicated provider plugin with native WebSocket streaming remains the better
+choice when one exists; this plugin is useful for evaluating providers and
+for reaching providers without a dedicated plugin. Frames are emitted as raw
+16-bit little-endian PCM at the configured `sampleRate` (24 kHz by default);
+audio that a provider returns at another rate is resampled to match.
+
+See the [repository](https://github.com/livekit/agents-js) for more information
+about the framework as a whole.
diff --git a/plugins/speechsdk/api-extractor.json b/plugins/speechsdk/api-extractor.json
new file mode 100644
index 000000000..cf6493161
--- /dev/null
+++ b/plugins/speechsdk/api-extractor.json
@@ -0,0 +1,5 @@
+{
+    "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json",
+    "extends": "../../api-extractor-shared.json",
+    "mainEntryPointFilePath": "./dist/index.d.ts"
+}
diff --git a/plugins/speechsdk/package.json b/plugins/speechsdk/package.json
new file mode 100644
index 000000000..1f3a3776e
--- /dev/null
+++ b/plugins/speechsdk/package.json
@@ -0,0 +1,52 @@
+{
+  "name": "@livekit/agents-plugin-speechsdk",
+  "version": "1.4.5",
+  "description": "speech-sdk multi-provider TTS plugin for LiveKit Node Agents",
+  "main": "dist/index.js",
+  "require": "dist/index.cjs",
+  "types": "dist/index.d.ts",
+  "exports": {
+    "import": {
+      "types": "./dist/index.d.ts",
+      "default": "./dist/index.js"
+    },
+    "require": {
+      "types": "./dist/index.d.cts",
+      "default": "./dist/index.cjs"
+    }
+  },
+  "author": "LiveKit",
+  "type": "module",
+  "repository": "git@github.com:livekit/agents-js.git",
+  "license": "Apache-2.0",
+  "files": [
+    "dist",
+    "src",
+    "README.md"
+  ],
+  "scripts": {
+    "build": "tsup --onSuccess \"pnpm build:types\"",
+    "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js",
+    "clean": "rm -rf dist",
+    "clean:build": "pnpm clean && pnpm build",
+    "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
+    "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
+    "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
+  },
+  "dependencies": {
+    "@speech-sdk/core": "^0.14.0"
+  },
+  "devDependencies": {
+    "@livekit/agents": "workspace:*",
+    "@livekit/agents-plugin-openai": "workspace:*",
+    "@livekit/agents-plugins-test": "workspace:*",
+    "@livekit/rtc-node": "catalog:",
+    "@microsoft/api-extractor": "^7.35.0",
+    "tsup": "^8.3.5",
+    "typescript": "^5.0.0"
+  },
+  "peerDependencies": {
+    "@livekit/agents": "workspace:*",
+    "@livekit/rtc-node": "catalog:"
+  }
+}
diff --git a/plugins/speechsdk/src/index.ts b/plugins/speechsdk/src/index.ts
new file mode 100644
index 000000000..8343608aa
--- /dev/null
+++ b/plugins/speechsdk/src/index.ts
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { Plugin } from '@livekit/agents';
+
+export * from './models.js';
+export * from './tts.js';
+/** Plugin descriptor built from the `__PACKAGE_VERSION__` and `__PACKAGE_NAME__` globals. */
+class SpeechSDKPlugin extends Plugin {
+  constructor() {
+    super({
+      title: 'speechsdk',
+      version: __PACKAGE_VERSION__,
+      package: __PACKAGE_NAME__,
+    });
+  }
+}
+// Import-time side effect: registers this plugin through `Plugin.registerPlugin`.
+Plugin.registerPlugin(new SpeechSDKPlugin());
diff --git a/plugins/speechsdk/src/models.ts b/plugins/speechsdk/src/models.ts
new file mode 100644
index 000000000..c242ed8c8
--- /dev/null
+++ b/plugins/speechsdk/src/models.ts
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * TTS providers supported by speech-sdk, used as the prefix of a `provider/model` string.
+ */
+export type TTSProviders =
+  | 'cartesia'
+  | 'deepgram'
+  | 'elevenlabs'
+  | 'fal-ai'
+  | 'fish-audio'
+  | 'google'
+  | 'hume'
+  | 'inworld'
+  | 'minimax'
+  | 'mistral'
+  | 'murf'
+  | 'openai'
+  | 'resemble'
+  | 'smallest-ai'
+  | 'xai';
+
+/**
+ * A `provider/model` string, e.g. `openai/gpt-4o-mini-tts` or `elevenlabs/eleven_flash_v2_5`.
+ */
+export type TTSModels = `${TTSProviders}/${string}`;
diff --git a/plugins/speechsdk/src/tts.test.ts b/plugins/speechsdk/src/tts.test.ts
new file mode 100644
index 000000000..112c33737
--- /dev/null
+++ b/plugins/speechsdk/src/tts.test.ts
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { STT } from '@livekit/agents-plugin-openai';
+import { tts } from '@livekit/agents-plugins-test';
+import { describe, expect, it } from 'vitest';
+import { TTS } from './tts.js';
+// `speechbaseApiKey: undefined` overrides the SPEECHBASE_API_KEY default so provider checks run.
+describe('SpeechSDK TTS model strings', () => {
+  it('rejects a model without a provider prefix', () => {
+    expect(() => new TTS({ model: 'gpt-4o-mini-tts', speechbaseApiKey: undefined })).toThrow(
+      /provider\/model/,
+    );
+  });
+
+  it('rejects an unknown provider prefix', () => {
+    expect(() => new TTS({ model: 'acme/some-model', speechbaseApiKey: undefined })).toThrow(
+      /Unknown speech-sdk provider/,
+    );
+  });
+
+  it('splits path-style model ids on the first slash only', () => {
+    const instance = new TTS({ model: 'fal-ai/kokoro/american-english' });
+    expect(instance.provider).toEqual('fal-ai');
+    expect(instance.model).toEqual('fal-ai/kokoro/american-english');
+  });
+});
+// Shared TTS suite with the default model and OpenAI STT; skipped without OPENAI_API_KEY.
+const hasOpenAIKey = Boolean(process.env.OPENAI_API_KEY);
+
+if (hasOpenAIKey) {
+  describe('SpeechSDK TTS', async () => {
+    await tts(new TTS(), new STT({ useRealtime: false, model: 'whisper-1' }), { streaming: false });
+  });
+} else {
+  describe('SpeechSDK TTS', () => {
+    it.skip('requires OPENAI_API_KEY', () => {});
+  });
+}
diff --git a/plugins/speechsdk/src/tts.ts b/plugins/speechsdk/src/tts.ts
new file mode 100644
index 000000000..0fff0d6aa
--- /dev/null
+++ b/plugins/speechsdk/src/tts.ts
@@ -0,0 +1,278 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  type APIConnectOptions,
+  APIError,
+  APIStatusError,
+  AudioByteStream,
+  shortuuid,
+  tts,
+} from '@livekit/agents';
+import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
+import type * as speechProviders from '@speech-sdk/core/providers';
+import type { ResolvedModel } from '@speech-sdk/core/types';
+import type { TTSModels, TTSProviders } from './models.js';
+
+const SPEECHSDK_TTS_CHANNELS = 1;
+const DEFAULT_SAMPLE_RATE = 24000;
+const DEFAULT_MODEL: TTSModels = 'openai/gpt-4o-mini-tts';
+const DEFAULT_VOICE = 'alloy';
+// Statuses marked retryable besides 5xx: 408 (Request Timeout) and 429 (Too Many Requests).
+const RETRYABLE_STATUS_CODES = new Set([408, 429]);
+const PCM_RATE_REGEX = /rate=(\d+)/; // captures the sample rate from the returned mediaType
+
+type ProvidersModule = typeof speechProviders;
+type SpeechModelFactory = (config: { apiKey?: string }) => (modelId: string) => ResolvedModel;
+// Maps each provider prefix to a getter that picks its factory off the providers module.
+const PROVIDER_FACTORIES: Record<TTSProviders, (mod: ProvidersModule) => SpeechModelFactory> = {
+  cartesia: (mod) => mod.createCartesia,
+  deepgram: (mod) => mod.createDeepgram,
+  elevenlabs: (mod) => mod.createElevenLabs,
+  'fal-ai': (mod) => mod.createFal,
+  'fish-audio': (mod) => mod.createFishAudio,
+  google: (mod) => mod.createGoogle,
+  hume: (mod) => mod.createHume,
+  inworld: (mod) => mod.createInworld,
+  minimax: (mod) => mod.createMiniMax,
+  mistral: (mod) => mod.createMistral,
+  murf: (mod) => mod.createMurf,
+  openai: (mod) => mod.createOpenAI,
+  resemble: (mod) => mod.createResemble,
+  'smallest-ai': (mod) => mod.createSmallestAI,
+  xai: (mod) => mod.createXai,
+};
+// Own-property check: `in` also matches inherited keys such as "toString" or "constructor".
+const isKnownProvider = (provider: string): provider is TTSProviders =>
+  Object.prototype.hasOwnProperty.call(PROVIDER_FACTORIES, provider);
+
+// Split at the first "/" only, since fal ids are paths ("kokoro/american-english").
+const parseModel = (model: string): { provider: string; modelId: string } => {
+  const [provider = '', ...rest] = model.split('/');
+  if (provider === '' || rest.join('/') === '') {
+    throw new Error(
+      `Invalid speech-sdk model "${model}": expected a "provider/model" string, e.g. "${DEFAULT_MODEL}"`,
+    );
+  }
+  return { provider, modelId: rest.join('/') };
+};
+/** Configuration for {@link TTS}. */
+export interface TTSOptions {
+  /**
+   * Model as a `provider/model` string, e.g. `openai/gpt-4o-mini-tts` or
+   * `elevenlabs/eleven_flash_v2_5`. The prefix selects which provider API is called.
+   */
+  model: TTSModels | string;
+  /** Voice ID, as defined by the selected provider. */
+  voice: string;
+  /** Sample rate of emitted frames in Hz. Audio returned at another native rate is resampled. */
+  sampleRate: number;
+  /**
+   * Provider API key for direct calls; not used when `speechbaseApiKey` is set. Defaults to the
+   * selected provider's standard environment variable (e.g. `OPENAI_API_KEY`, `MURF_API_KEY`).
+   */
+  apiKey?: string;
+  /**
+   * SpeechBase gateway API key, defaulting to `SPEECHBASE_API_KEY`. When set, `provider/model`
+   * strings are routed through the speechbase.ai gateway with this single key; when unset,
+   * calls go directly to the provider with your own key.
+   */
+  speechbaseApiKey?: string;
+  /** Additional provider-specific request options forwarded to speech-sdk. */
+  providerOptions?: Record<string, unknown>;
+}
+// Defaults applied under caller options; SPEECHBASE_API_KEY is read once, at module load.
+const defaultTTSOptions: TTSOptions = {
+  model: DEFAULT_MODEL,
+  voice: DEFAULT_VOICE,
+  sampleRate: DEFAULT_SAMPLE_RATE,
+  speechbaseApiKey: process.env.SPEECHBASE_API_KEY,
+};
+// Rejects malformed model strings and, unless a SpeechBase key is set, unknown providers.
+const validateModel = (opts: TTSOptions) => {
+  const { provider } = parseModel(opts.model);
+  if (opts.speechbaseApiKey || isKnownProvider(provider)) {
+    return;
+  }
+  const known = Object.keys(PROVIDER_FACTORIES).join(', ');
+  throw new Error(`Unknown speech-sdk provider "${provider}", expected one of: ${known}`);
+};
+// With a SpeechBase key the string passes through; otherwise the provider factory resolves it.
+const resolveSpeechModel = async (opts: TTSOptions): Promise<string | ResolvedModel> => {
+  if (opts.speechbaseApiKey) {
+    return opts.model;
+  }
+  const { provider, modelId } = parseModel(opts.model);
+  if (!isKnownProvider(provider)) {
+    throw new Error(`Unknown speech-sdk provider "${provider}"`);
+  }
+  const config = opts.apiKey === undefined ? {} : { apiKey: opts.apiKey };
+  const createProvider = PROVIDER_FACTORIES[provider](await import('@speech-sdk/core/providers'));
+  return createProvider(config)(modelId);
+};
+// Maps speech-sdk errors onto framework API errors; anything else is returned unchanged.
+const toAPIError = async (error: unknown): Promise<unknown> => {
+  const sdk = await import('@speech-sdk/core');
+  if (error instanceof sdk.ApiError) {
+    const { message, statusCode } = error;
+    // Retryable: any 5xx status, plus the statuses listed in RETRYABLE_STATUS_CODES.
+    const retryable = statusCode >= 500 || RETRYABLE_STATUS_CODES.has(statusCode);
+    return new APIStatusError({
+      message,
+      options: { statusCode, retryable },
+    });
+  }
+  if (error instanceof sdk.SpeechSDKError) {
+    return new APIError(error.message, { retryable: false });
+  }
+  return error;
+};
+/** speech-sdk TTS. Non-streaming: every `synthesize` call returns its own `ChunkedStream`. */
+export class TTS extends tts.TTS {
+  #opts: TTSOptions;
+  label = 'speechsdk.TTS';
+  private abortController = new AbortController();
+
+  /** The configured `provider/model` string. */
+  get model(): string {
+    return this.#opts.model;
+  }
+  /** The provider prefix of the configured model string. */
+  get provider(): string {
+    return parseModel(this.#opts.model).provider;
+  }
+
+  /**
+   * Create a new instance of speech-sdk TTS.
+   *
+   * @remarks
+   * Unless a SpeechBase key is used, the provider's API key must be in its standard environment
+   * variable (e.g. `OPENAI_API_KEY` for `openai/...` models) or passed via the `apiKey` option.
+   */
+  constructor(opts: Partial<TTSOptions> = {}) {
+    const resolved = { ...defaultTTSOptions, ...opts };
+    super(resolved.sampleRate, SPEECHSDK_TTS_CHANNELS, { streaming: false });
+    validateModel(resolved);
+    this.#opts = resolved;
+  }
+  /** Validates and applies a new model and/or voice; a failed validation changes nothing. */
+  updateOptions(opts: { model?: TTSModels | string; voice?: string }) {
+    const next = { ...this.#opts, ...opts };
+    validateModel(next);
+    this.#opts = next;
+  }
+  /** Synthesizes `text`; the stream's signal fires on `abortSignal` or on `close()`. */
+  synthesize(
+    text: string,
+    connOptions?: APIConnectOptions,
+    abortSignal?: AbortSignal,
+  ): ChunkedStream {
+    const ownSignal = this.abortController.signal;
+    const signal = abortSignal ? AbortSignal.any([abortSignal, ownSignal]) : ownSignal;
+    return new ChunkedStream(this, text, this.#opts, connOptions, signal);
+  }
+
+  stream(): tts.SynthesizeStream {
+    throw new Error('Streaming is not supported on SpeechSDK TTS');
+  }
+  /** Aborts the controller whose signal is handed to every stream from `synthesize`. */
+  async close(): Promise<void> {
+    this.abortController.abort();
+  }
+}
+/** One-shot synthesis: runs a single speech-sdk request and queues the audio as PCM frames. */
+export class ChunkedStream extends tts.ChunkedStream {
+  label = 'speechsdk.ChunkedStream';
+  #opts: TTSOptions;
+
+  constructor(
+    tts: TTS,
+    text: string,
+    opts: TTSOptions,
+    connOptions?: APIConnectOptions,
+    abortSignal?: AbortSignal,
+  ) {
+    super(text, tts, connOptions, abortSignal);
+    this.#opts = opts;
+  }
+
+  /**
+   * Requests the whole utterance as PCM, then queues the frames with `final` set on the last one.
+   * Aborts end the run silently; speech-sdk errors are rethrown as framework API errors.
+   */
+  protected async run() {
+    try {
+      const { generateSpeech } = await import('@speech-sdk/core');
+      const model = await resolveSpeechModel(this.#opts);
+      // maxRetries: 0 disables speech-sdk's internal retry; the retry policy in
+      // tts.ChunkedStream (connOptions) owns retries to avoid multiplying attempts.
+      const result = await generateSpeech({
+        model,
+        text: this.inputText,
+        voice: this.#opts.voice,
+        output: { format: 'pcm' },
+        providerOptions: this.#opts.providerOptions,
+        apiKey: this.#opts.speechbaseApiKey,
+        maxRetries: 0,
+        abortSignal: this.abortController.signal,
+      });
+
+      const requestId = shortuuid();
+      const frames = this.#toFrames(result.audio.uint8Array, result.audio.mediaType);
+      frames.forEach((frame, index) => {
+        const final = index === frames.length - 1;
+        this.queue.put({ requestId, segmentId: requestId, frame, final });
+      });
+    } catch (error) {
+      const aborted = error instanceof Error && error.name === 'AbortError';
+      if (aborted || this.abortController.signal.aborted) {
+        return;
+      }
+      throw await toAPIError(error);
+    } finally {
+      this.queue.close();
+    }
+  }
+
+  /**
+   * Slices PCM16 audio into frames at the configured sample rate, resampling when the rate
+   * advertised in `mediaType` (`rate=<hz>`) differs.
+   * @throws APIError (non-retryable) when `mediaType` carries no `rate=` parameter.
+   */
+  #toFrames(pcm: Uint8Array, mediaType: string): AudioFrame[] {
+    const rateMatch = PCM_RATE_REGEX.exec(mediaType);
+    if (!rateMatch) {
+      throw new APIError(`speech-sdk returned unexpected mediaType "${mediaType}", expected PCM`, {
+        retryable: false,
+      });
+    }
+    const nativeRate = Number(rateMatch[1]);
+    const bstream = new AudioByteStream(this.#opts.sampleRate, SPEECHSDK_TTS_CHANNELS);
+    if (nativeRate === this.#opts.sampleRate) {
+      return [...bstream.write(pcm), ...bstream.flush()];
+    }
+
+    // Int16Array needs a 2-byte-aligned offset; copy only when the provider's view is misaligned.
+    const aligned = pcm.byteOffset % 2 === 0 ? pcm : pcm.slice();
+    const sampleCount = Math.floor(aligned.byteLength / 2);
+    if (sampleCount === 0) {
+      return [];
+    }
+    const samples = new Int16Array(aligned.buffer, aligned.byteOffset, sampleCount);
+    const nativeFrame = new AudioFrame(samples, nativeRate, SPEECHSDK_TTS_CHANNELS, sampleCount);
+    const resampler = new AudioResampler(nativeRate, this.#opts.sampleRate, SPEECHSDK_TTS_CHANNELS);
+    const frames: AudioFrame[] = [];
+    try {
+      for (const { data } of [...resampler.push(nativeFrame), ...resampler.flush()]) {
+        // Pass a byte view, as the same-rate path does. NOTE(review): confirm how AudioByteStream
+        // treats an Int16Array; a per-element copy into bytes would truncate each sample.
+        const bytes = new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
+        frames.push(...bstream.write(bytes));
+      }
+    } finally {
+      resampler.close();
+    }
+    frames.push(...bstream.flush());
+    return frames;
+  }
+}
diff --git a/plugins/speechsdk/tsconfig.json b/plugins/speechsdk/tsconfig.json
new file mode 100644
index 000000000..41188b488
--- /dev/null
+++ b/plugins/speechsdk/tsconfig.json
@@ -0,0 +1,14 @@
+{
+    "extends": "../../tsconfig.json",
+    "include": ["./src"],
+    "compilerOptions": {
+      "rootDir": "./src",
+      "declarationDir": "./dist",
+      "outDir": "./dist"
+    },
+    "typedocOptions": {
+      "name": "plugins/agents-plugin-speechsdk",
+      "entryPointStrategy": "resolve",
+      "entryPoints": ["src/index.ts"]
+    }
+}
diff --git a/plugins/speechsdk/tsup.config.ts b/plugins/speechsdk/tsup.config.ts
new file mode 100644
index 000000000..8ca20961f
--- /dev/null
+++ b/plugins/speechsdk/tsup.config.ts
@@ -0,0 +1,7 @@
+import { defineConfig } from 'tsup';
+
+import defaults from '../../tsup.config';
+
+export default defineConfig({
+  ...defaults,
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7a3ebdd0c..dbec21809 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1296,6 +1296,34 @@ importers:
         specifier: ^5.0.0
         version: 5.9.3
 
+  plugins/speechsdk:
+    dependencies:
+      '@speech-sdk/core':
+        specifier: ^0.14.0
+        version: 0.14.0
+    devDependencies:
+      '@livekit/agents':
+        specifier: workspace:*
+        version: link:../../agents
+      '@livekit/agents-plugin-openai':
+        specifier: workspace:*
+        version: link:../openai
+      '@livekit/agents-plugins-test':
+        specifier: workspace:*
+        version: link:../test
+      '@livekit/rtc-node':
+        specifier: 'catalog:'
+        version: 0.13.29
+      '@microsoft/api-extractor':
+        specifier: ^7.35.0
+        version: 7.43.7(@types/node@25.6.0)
+      tsup:
+        specifier: ^8.3.5
+        version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@25.6.0))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3)
+      typescript:
+        specifier: ^5.0.0
+        version: 5.9.3
+
   plugins/tavus:
     dependencies:
       livekit-server-sdk:
@@ -2217,6 +2245,11 @@ packages:
   '@manypkg/get-packages@1.1.3':
     resolution: {integrity: sha512-fo+QhuU3qE/2TQMQmbVMqaQ6EWbMhi4ABWP+O4AM1NqPBuy0OrApV5LO6BrrgnhtAHS2NH6RrVk9OL181tTi8A==}
 
+  '@mediabunny/mp3-encoder@1.46.0':
+    resolution: {integrity: sha512-SG69nm0ntRR4yc5TP9LG1/oxrh5PcoIEZwFm8BXecj3xNF0V+PlnpyJ1IQVS/JEXRIOvBJQ09YFknVGnJNAJrQ==}
+    peerDependencies:
+      mediabunny: ^1.0.0
+
   '@microsoft/api-extractor-model@7.28.17':
     resolution: {integrity: sha512-b2AfLP33oEVtWLeNavSBRdyDa8sKlXjN4pdhBnC4HLontOtjILhL1ERAmZObF4PWSyChnnC2vjb47C9WKCFRGg==}
 
@@ -2652,6 +2685,9 @@ packages:
   '@rushstack/ts-command-line@4.21.0':
     resolution: {integrity: sha512-z38FLUCn8M9FQf19gJ9eltdwkvc47PxvJmVZS6aKwbBAa3Pis3r3A+ZcBCVPNb9h/Tbga+i0tHdzoSGUoji9GQ==}
 
+  '@speech-sdk/core@0.14.0':
+    resolution: {integrity: sha512-oxVzxpV2eQ9Y8I/tJCUulD3FYf8f4y4WM//eyz7MPPuhnawpAv1j54v49EW5XoxlEOPHJaDf1+Vwmr5X5gnk1Q==}
+
   '@standard-schema/spec@1.1.0':
     resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==}
 
@@ -2673,6 +2709,12 @@ packages:
   '@types/deep-eql@4.0.2':
     resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==}
 
+  '@types/dom-mediacapture-transform@0.1.11':
+    resolution: {integrity: sha512-Y2p+nGf1bF2XMttBnsVPHUWzRRZzqUoJAKmiP10b5umnO6DDrWI0BrGDJy1pOHoOULVmGSfFNkQrAlC5dcj6nQ==}
+
+  '@types/dom-webcodecs@0.1.13':
+    resolution: {integrity: sha512-O5hkiFIcjjszPIYyUSyvScyvrBoV3NOEEZx/pMlsu44TKzWNkLVBBxnxJz42in5n3QIolYOcBYFCPZZ0h8SkwQ==}
+
   '@types/estree@1.0.8':
     resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==}
 
@@ -3861,6 +3903,10 @@ packages:
     resolution: {integrity: sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==}
     engines: {node: '>= 0.4'}
 
+  is-network-error@1.3.2:
+    resolution: {integrity: sha512-PhBY86zaxNZUuWP6h13Vu5oFe0XY6/UlKzQnYFELzGVHygP3MxmvTfYSG7GN3aIab/iWudSMgjSnG9Dq+nHrgA==}
+    engines: {node: '>=16'}
+
   is-number-object@1.0.7:
     resolution: {integrity: sha512-k1U0IRzLMo7ZlYIfzRu23Oh6MiIFasgpb9X76eqfFZAqwH44UI4KTBvBYIZ1dSL9ZzChTB9ShHfLkR4pdW5krQ==}
     engines: {node: '>= 0.4'}
@@ -4131,6 +4177,9 @@ packages:
     resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
     engines: {node: '>= 0.4'}
 
+  mediabunny@1.46.0:
+    resolution: {integrity: sha512-HIaMQ6PVMndrFb9sNCh9/uAsLFBfLzyAgbk3kN7ZqlRRq//9RPYqIOQtXYpkbWBZU8ZJkaAUNLDtwTcdKo7yQw==}
+
   merge2@1.4.1:
     resolution: {integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==}
     engines: {node: '>= 8'}
@@ -4322,6 +4371,10 @@ packages:
     resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==}
     engines: {node: '>=8'}
 
+  p-retry@8.0.0:
+    resolution: {integrity: sha512-kFVqH1HxOHp8LupNsOys7bSV09VYTRLxarH/mokO4Rqhk6wGi70E0jh4VzvVGXfEVNggHoHLAMWsQqHyU1Ey9A==}
+    engines: {node: '>=22'}
+
   p-try@2.2.0:
     resolution: {integrity: sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==}
     engines: {node: '>=6'}
@@ -5920,6 +5973,10 @@ snapshots:
       globby: 11.1.0
       read-yaml-file: 1.1.0
 
+  '@mediabunny/mp3-encoder@1.46.0(mediabunny@1.46.0)':
+    dependencies:
+      mediabunny: 1.46.0
+
   '@microsoft/api-extractor-model@7.28.17(@types/node@22.19.1)':
     dependencies:
       '@microsoft/tsdoc': 0.14.2
@@ -6425,6 +6482,13 @@ snapshots:
     transitivePeerDependencies:
       - '@types/node'
 
+  '@speech-sdk/core@0.14.0':
+    dependencies:
+      '@mediabunny/mp3-encoder': 1.46.0(mediabunny@1.46.0)
+      mediabunny: 1.46.0
+      p-retry: 8.0.0
+      zod: 4.3.6
+
   '@standard-schema/spec@1.1.0': {}
 
   '@trivago/prettier-plugin-sort-imports@4.3.0(prettier@3.2.5)':
@@ -6448,6 +6512,12 @@ snapshots:
 
   '@types/deep-eql@4.0.2': {}
 
+  '@types/dom-mediacapture-transform@0.1.11':
+    dependencies:
+      '@types/dom-webcodecs': 0.1.13
+
+  '@types/dom-webcodecs@0.1.13': {}
+
   '@types/estree@1.0.8': {}
 
   '@types/estree@1.0.9': {}
@@ -7871,6 +7941,8 @@ snapshots:
 
   is-negative-zero@2.0.3: {}
 
+  is-network-error@1.3.2: {}
+
   is-number-object@1.0.7:
     dependencies:
       has-tostringtag: 1.0.2
@@ -8115,6 +8187,11 @@ snapshots:
 
   math-intrinsics@1.1.0: {}
 
+  mediabunny@1.46.0:
+    dependencies:
+      '@types/dom-mediacapture-transform': 0.1.11
+      '@types/dom-webcodecs': 0.1.13
+
   merge2@1.4.1: {}
 
   micromatch@4.0.8:
@@ -8307,6 +8384,10 @@ snapshots:
       '@types/retry': 0.12.0
       retry: 0.13.1
 
+  p-retry@8.0.0:
+    dependencies:
+      is-network-error: 1.3.2
+
   p-try@2.2.0: {}
 
   package-json-from-dist@1.0.1: {}
diff --git a/turbo.json b/turbo.json
index c3b1e3ee6..56986853f 100644
--- a/turbo.json
+++ b/turbo.json
@@ -95,6 +95,7 @@
     "MINIMAX_API_KEY",
     "MINIMAX_BASE_URL",
     "MISTRAL_API_KEY",
+    "SPEECHBASE_API_KEY",
     "VITEST"
   ],
   "pipeline": {