JacobLinCool · JacobLinCool · Apr 7, 2026 · Apr 7, 2026 · Copilot · Apr 7, 2026
diff --git a/apps/mentora/src/routes/conversations/[id]/+page.svelte b/apps/mentora/src/routes/conversations/[id]/+page.svelte
@@ -595,7 +595,7 @@
         showUserReplies = !showUserReplies;
     }
 
-    function playBase64Audio(base64: string, mimeType: string = "audio/mp3") {
+    function playBase64Audio(base64: string, mimeType: string) {
         try {
             const binary = atob(base64);
             const bytes = new Uint8Array(binary.length);
@@ -648,10 +648,7 @@
                     : m.conversation_error();
                 awaitingAiReply = false;
             } else if (res.data?.audio) {
-                playBase64Audio(
-                    res.data.audio,
-                    res.data.audioMimeType || "audio/mp3",
-                );
+                playBase64Audio(res.data.audio, res.data.audioMimeType);
             }
         } catch (e) {
             console.error("Error sending audio turn:", e);
@@ -699,10 +696,7 @@
                 messageInput = "";
                 showTextInput = false;
                 if (res.data?.audio) {
-                    playBase64Audio(
-                        res.data.audio,
-                        res.data.audioMimeType || "audio/mp3",
-                    );
+                    playBase64Audio(res.data.audio, res.data.audioMimeType);
                 }
             }
         } catch (e) {

diff --git a/packages/mentora-ai/src/executor/index.ts b/packages/mentora-ai/src/executor/index.ts
@@ -3,3 +3,4 @@ export { GeminiContentExecutor } from "./content-generator.js";
 export * from "./gemini.js";
 export { BaseTokenTracker } from "./token-tracker.js";
 export { GeminiTTSExecutor } from "./tts.js";
+export { encodePcm16AsWav } from "./wav.js";
diff --git a/packages/mentora-ai/src/executor/tts.ts b/packages/mentora-ai/src/executor/tts.ts
@@ -2,8 +2,45 @@
  * GeminiTTSExecutor implements text-to-speech using Google Gemini API
  */
 import type { GoogleGenAI } from "@google/genai";
-import type { TTSExecutor } from "../types.js";
+import type { SynthesizedAudio, TTSExecutor } from "../types.js";
 import { BaseTokenTracker } from "./token-tracker.js";
+import { encodePcm16AsWav } from "./wav.js";
+
+const PCM_MIME_TYPES = new Set(["audio/l16", "audio/pcm"]);
+
+function normalizeGeminiAudioResponse(audioPart: {
+    data?: string;
+    mimeType?: string;
+}): SynthesizedAudio {
+    const { data } = audioPart;
+    if (!data) {
+        throw new Error("No audio data received from TTS model");
+    }
+
+    const normalizedMimeType = audioPart.mimeType
+        ?.toLowerCase()
+        .split(";")[0]
+        ?.trim();
+
+    if (
+        normalizedMimeType === "audio/wav" ||
+        normalizedMimeType === "audio/wave"
+    ) {
+        return {
+            audioBase64: data,
+            mimeType: "audio/wav",
+        };
+    }
+
+    // Gemini preview TTS returns raw 24 kHz 16-bit mono PCM by default.
+    if (!normalizedMimeType || PCM_MIME_TYPES.has(normalizedMimeType)) {
+        return encodePcm16AsWav(data);
+    }
+
+    throw new Error(
+        `Unsupported TTS audio MIME type: ${audioPart.mimeType ?? "<missing>"}`,
+    );
+}
 
 /**
  * Gemini-based TTS Executor
@@ -20,9 +57,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor {
     /**
      * Synthesize text to speech
      * @param text - Text to synthesize
-     * @returns Base64 encoded audio (MP3 format)
+     * @returns Base64 encoded browser-playable audio
      */
-    async synthesize(text: string): Promise<string> {
+    async synthesize(text: string): Promise<SynthesizedAudio> {
         try {
             const response = await this.genai.models.generateContent({
                 model: this.model,
@@ -48,13 +85,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor {
             this.accumulateUsage(response.usageMetadata);
 
             const speech =
-                response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
-
-            if (!speech) {
-                throw new Error("No audio data received from TTS model");
-            }
+                response.candidates?.[0]?.content?.parts?.[0]?.inlineData;
 
-            return speech;
+            return normalizeGeminiAudioResponse(speech ?? {});
         } catch (error) {
             console.error(
                 "[GeminiTTSExecutor] Error synthesizing speech:",

diff --git a/packages/mentora-ai/src/executor/wav.ts b/packages/mentora-ai/src/executor/wav.ts
@@ -0,0 +1,57 @@
+import type { SynthesizedAudio } from "../types.js";
+
+const WAV_HEADER_BYTES = 44;
+
+function writeAscii(view: DataView, offset: number, value: string) {
+    for (let i = 0; i < value.length; i++) {
+        view.setUint8(offset + i, value.charCodeAt(i));
+    }
+}
+
+/**
+ * Wrap raw PCM16LE bytes in a WAV container so browsers can play the result.
+ */
+export function encodePcm16AsWav(
+    pcmBase64: string,
+    {
+        channelCount = 1,
+        sampleRate = 24_000,
+        bytesPerSample = 2,
+    }: {
+        channelCount?: number;
+        sampleRate?: number;
+        bytesPerSample?: number;
+    } = {},
+): SynthesizedAudio {
-        bytesPerSample?: number;
-    } = {},
-): SynthesizedAudio {
+        bytesPerSample?: 2;
+    } = {},
+): SynthesizedAudio {
+    if (bytesPerSample !== 2) {
+        throw new Error(
+            `encodePcm16AsWav only supports 16-bit PCM input (bytesPerSample must be 2, received ${bytesPerSample}).`,
+        );
+    }
+    const pcmBytes = Buffer.from(pcmBase64, "base64");
+    const wavBytes = new Uint8Array(WAV_HEADER_BYTES + pcmBytes.length);
+    const view = new DataView(
+        wavBytes.buffer,
+        wavBytes.byteOffset,
+        wavBytes.byteLength,
+    );
+
+    const byteRate = sampleRate * channelCount * bytesPerSample;
+    const blockAlign = channelCount * bytesPerSample;
+    const bitsPerSample = bytesPerSample * 8;
+
+    writeAscii(view, 0, "RIFF");
+    view.setUint32(4, 36 + pcmBytes.length, true);
+    writeAscii(view, 8, "WAVE");
+    writeAscii(view, 12, "fmt ");
+    view.setUint32(16, 16, true);
+    view.setUint16(20, 1, true);
+    view.setUint16(22, channelCount, true);
+    view.setUint32(24, sampleRate, true);
+    view.setUint32(28, byteRate, true);
+    view.setUint16(32, blockAlign, true);
+    view.setUint16(34, bitsPerSample, true);
+    writeAscii(view, 36, "data");
+    view.setUint32(40, pcmBytes.length, true);
+    wavBytes.set(pcmBytes, WAV_HEADER_BYTES);
+
+    return {
+        audioBase64: Buffer.from(wavBytes).toString("base64"),
+        mimeType: "audio/wav",
+    };
+}
diff --git a/packages/mentora-ai/src/types.ts b/packages/mentora-ai/src/types.ts
@@ -69,6 +69,16 @@ export interface TokenTracker {
     resetTokenUsage(): void;
 }
 
+/**
+ * Synthesized audio payload ready for downstream consumers
+ */
+export interface SynthesizedAudio {
+    /** Base64 encoded audio bytes */
+    audioBase64: string;
+    /** IANA MIME type for the audio payload */
+    mimeType: string;
+}
+
 /**
  * Executor interface for running prompts against an LLM
  */
@@ -113,7 +123,7 @@ export interface TTSExecutor extends TokenTracker {
     /**
      * Synthesize text to speech
      * @param text - Text to synthesize
-     * @returns Base64 encoded audio string
+     * @returns Base64 encoded audio payload and MIME type
      */
-    synthesize(text: string): Promise<string>;
+    synthesize(text: string): Promise<SynthesizedAudio>;
 }
diff --git a/packages/mentora-ai/tests/tts-audio.test.ts b/packages/mentora-ai/tests/tts-audio.test.ts
@@ -0,0 +1,24 @@
+import { describe, expect, it } from "vitest";
+import { encodePcm16AsWav } from "../src/executor/wav.js";
+
+describe("encodePcm16AsWav", () => {
+    it("wraps PCM bytes in a WAV container with the expected header", () => {
+        const pcmBytes = Uint8Array.from([0x01, 0x02, 0x03, 0x04]);
+        const result = encodePcm16AsWav(
+            Buffer.from(pcmBytes).toString("base64"),
+        );
+        const wavBytes = Buffer.from(result.audioBase64, "base64");
+
+        expect(result.mimeType).toBe("audio/wav");
+        expect(wavBytes.toString("ascii", 0, 4)).toBe("RIFF");
+        expect(wavBytes.toString("ascii", 8, 12)).toBe("WAVE");
+        expect(wavBytes.toString("ascii", 12, 16)).toBe("fmt ");
+        expect(wavBytes.readUInt16LE(20)).toBe(1);
+        expect(wavBytes.readUInt16LE(22)).toBe(1);
+        expect(wavBytes.readUInt32LE(24)).toBe(24_000);
+        expect(wavBytes.readUInt16LE(34)).toBe(16);
+        expect(wavBytes.toString("ascii", 36, 40)).toBe("data");
+        expect(wavBytes.readUInt32LE(40)).toBe(pcmBytes.length);
+        expect([...wavBytes.subarray(44)]).toEqual([...pcmBytes]);
+    });
+});
diff --git a/packages/mentora-api/src/lib/explorer/api-spec.ts b/packages/mentora-api/src/lib/explorer/api-spec.ts
@@ -810,7 +810,7 @@ export const apiModules: APIModule[] = [
 					response: {
 						text: 'Can you explain your reasoning in more detail?',
 						audio: '<base64>',
-						audioMimeType: 'audio/mp3'
+						audioMimeType: 'audio/wav'
 					}
 				}
 			}

diff --git a/packages/mentora-api/src/lib/server/application/conversation-service.ts b/packages/mentora-api/src/lib/server/application/conversation-service.ts
@@ -408,11 +408,13 @@ export class ConversationService {
 
 		const aiTurnId = randomUUID();
 		let aiAudioBase64: string;
-		const aiAudioMimeType = 'audio/mp3';
+		let aiAudioMimeType: string;
 		try {
 			const ttsExecutor = getTTSExecutor(requestApiKey);
 			ttsExecutor.resetTokenUsage();
-			aiAudioBase64 = await ttsExecutor.synthesize(llmResult.aiMessage);
+			const synthesizedAudio = await ttsExecutor.synthesize(llmResult.aiMessage);
+			aiAudioBase64 = synthesizedAudio.audioBase64;
+			aiAudioMimeType = synthesizedAudio.mimeType;
 			ttsUsageReport = createTokenUsageReport([
 				{
 					feature: TOKEN_USAGE_FEATURES.CONVERSATION_TTS,

diff --git a/packages/mentora-api/tests/conversation-service-asr.unit.test.ts b/packages/mentora-api/tests/conversation-service-asr.unit.test.ts
@@ -119,7 +119,10 @@ function createMockASRExecutor(transcribeResult: string | Error) {
 function createMockTTSExecutor() {
 	return {
 		resetTokenUsage: vi.fn(),
-		synthesize: vi.fn().mockResolvedValue('base64-audio-data'),
+		synthesize: vi.fn().mockResolvedValue({
+			audioBase64: 'base64-audio-data',
+			mimeType: 'audio/wav'
+		}),
 		getTokenUsage: vi.fn().mockReturnValue({
 			cachedContentTokenCount: 0,
 			candidatesTokenCount: 10,
@@ -215,7 +218,7 @@ describe('ConversationService.addTurn – ASR error handling', () => {
 
 		expect(result.text).toBe('AI response');
 		expect(result.audio).toBe('base64-audio-data');
-		expect(result.audioMimeType).toBe('audio/mp3');
+		expect(result.audioMimeType).toBe('audio/wav');
 
 		// Verify ASR was called with correct params
 		const asrExecutor = mockedGetASRExecutor.mock.results[0].value;

diff --git a/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts b/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts
@@ -151,7 +151,7 @@ describe('Mentora SDK Full Scenario (Integration)', () => {
 					JSON.stringify({
 						text: 'Mocked assistant response',
 						audio: 'ZmFrZS1hdWRpby1kYXRh',
-						audioMimeType: 'audio/mp3',
+						audioMimeType: 'audio/wav',
 						tokenUsage: {
 							byFeature: {
 								conversation_llm: {