diff --git a/apps/mentora/src/routes/conversations/[id]/+page.svelte b/apps/mentora/src/routes/conversations/[id]/+page.svelte index bbf04e6a..04292150 100644 --- a/apps/mentora/src/routes/conversations/[id]/+page.svelte +++ b/apps/mentora/src/routes/conversations/[id]/+page.svelte @@ -595,7 +595,7 @@ showUserReplies = !showUserReplies; } - function playBase64Audio(base64: string, mimeType: string = "audio/mp3") { + function playBase64Audio(base64: string, mimeType: string) { try { const binary = atob(base64); const bytes = new Uint8Array(binary.length); @@ -648,10 +648,7 @@ : m.conversation_error(); awaitingAiReply = false; } else if (res.data?.audio) { - playBase64Audio( - res.data.audio, - res.data.audioMimeType || "audio/mp3", - ); + playBase64Audio(res.data.audio, res.data.audioMimeType); } } catch (e) { console.error("Error sending audio turn:", e); @@ -699,10 +696,7 @@ messageInput = ""; showTextInput = false; if (res.data?.audio) { - playBase64Audio( - res.data.audio, - res.data.audioMimeType || "audio/mp3", - ); + playBase64Audio(res.data.audio, res.data.audioMimeType); } } } catch (e) { diff --git a/packages/mentora-ai/src/executor/index.ts b/packages/mentora-ai/src/executor/index.ts index 51bdd9f7..c4228b27 100644 --- a/packages/mentora-ai/src/executor/index.ts +++ b/packages/mentora-ai/src/executor/index.ts @@ -3,3 +3,4 @@ export { GeminiContentExecutor } from "./content-generator.js"; export * from "./gemini.js"; export { BaseTokenTracker } from "./token-tracker.js"; export { GeminiTTSExecutor } from "./tts.js"; +export { encodePcm16AsWav } from "./wav.js"; diff --git a/packages/mentora-ai/src/executor/tts.ts b/packages/mentora-ai/src/executor/tts.ts index 12734712..66c226bd 100644 --- a/packages/mentora-ai/src/executor/tts.ts +++ b/packages/mentora-ai/src/executor/tts.ts @@ -2,8 +2,45 @@ * GeminiTTSExecutor implements text-to-speech using Google Gemini API */ import type { GoogleGenAI } from "@google/genai"; -import type { TTSExecutor } from "../types.js"; +import type { SynthesizedAudio, TTSExecutor } from "../types.js"; import { BaseTokenTracker } from "./token-tracker.js"; +import { encodePcm16AsWav } from "./wav.js"; + +const PCM_MIME_TYPES = new Set(["audio/l16", "audio/pcm"]); + +function normalizeGeminiAudioResponse(audioPart: { + data?: string; + mimeType?: string; +}): SynthesizedAudio { + const { data } = audioPart; + if (!data) { + throw new Error("No audio data received from TTS model"); + } + + const normalizedMimeType = audioPart.mimeType + ?.toLowerCase() + .split(";")[0] + ?.trim(); + + if ( + normalizedMimeType === "audio/wav" || + normalizedMimeType === "audio/wave" + ) { + return { + audioBase64: data, + mimeType: "audio/wav", + }; + } + + // Gemini preview TTS returns raw 24 kHz 16-bit mono PCM by default. + if (!normalizedMimeType || PCM_MIME_TYPES.has(normalizedMimeType)) { + return encodePcm16AsWav(data); + } + + throw new Error( + `Unsupported TTS audio MIME type: ${audioPart.mimeType ?? ""}`, + ); +} /** * Gemini-based TTS Executor @@ -20,9 +57,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor { /** * Synthesize text to speech * @param text - Text to synthesize - * @returns Base64 encoded audio (MP3 format) + * @returns Base64 encoded browser-playable audio */ - async synthesize(text: string): Promise { + async synthesize(text: string): Promise { try { const response = await this.genai.models.generateContent({ model: this.model, @@ -48,13 +85,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor { this.accumulateUsage(response.usageMetadata); const speech = - response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data; - - if (!speech) { - throw new Error("No audio data received from TTS model"); - } + response.candidates?.[0]?.content?.parts?.[0]?.inlineData; - return speech; + return normalizeGeminiAudioResponse(speech ?? {}); } catch (error) { console.error( "[GeminiTTSExecutor] Error synthesizing speech:", diff --git a/packages/mentora-ai/src/executor/wav.ts b/packages/mentora-ai/src/executor/wav.ts new file mode 100644 index 00000000..1b38f378 --- /dev/null +++ b/packages/mentora-ai/src/executor/wav.ts @@ -0,0 +1,57 @@ +import type { SynthesizedAudio } from "../types.js"; + +const WAV_HEADER_BYTES = 44; + +function writeAscii(view: DataView, offset: number, value: string) { + for (let i = 0; i < value.length; i++) { + view.setUint8(offset + i, value.charCodeAt(i)); + } +} + +/** + * Wrap raw PCM16LE bytes in a WAV container so browsers can play the result. + */ +export function encodePcm16AsWav( + pcmBase64: string, + { + channelCount = 1, + sampleRate = 24_000, + bytesPerSample = 2, + }: { + channelCount?: number; + sampleRate?: number; + bytesPerSample?: number; + } = {}, +): SynthesizedAudio { + const pcmBytes = Buffer.from(pcmBase64, "base64"); + const wavBytes = new Uint8Array(WAV_HEADER_BYTES + pcmBytes.length); + const view = new DataView( + wavBytes.buffer, + wavBytes.byteOffset, + wavBytes.byteLength, + ); + + const byteRate = sampleRate * channelCount * bytesPerSample; + const blockAlign = channelCount * bytesPerSample; + const bitsPerSample = bytesPerSample * 8; + + writeAscii(view, 0, "RIFF"); + view.setUint32(4, 36 + pcmBytes.length, true); + writeAscii(view, 8, "WAVE"); + writeAscii(view, 12, "fmt "); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); + view.setUint16(22, channelCount, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, bitsPerSample, true); + writeAscii(view, 36, "data"); + view.setUint32(40, pcmBytes.length, true); + wavBytes.set(pcmBytes, WAV_HEADER_BYTES); + + return { + audioBase64: Buffer.from(wavBytes).toString("base64"), + mimeType: "audio/wav", + }; +} diff --git a/packages/mentora-ai/src/types.ts b/packages/mentora-ai/src/types.ts index 7f31b619..e005985a 100644 --- a/packages/mentora-ai/src/types.ts +++ b/packages/mentora-ai/src/types.ts @@ -69,6 +69,16 @@ export interface TokenTracker { resetTokenUsage(): void; } +/** + * Synthesized audio payload ready for downstream consumers + */ +export interface SynthesizedAudio { + /** Base64 encoded audio bytes */ + audioBase64: string; + /** IANA MIME type for the audio payload */ + mimeType: string; +} + /** * Executor interface for running prompts against an LLM */ @@ -113,7 +123,7 @@ export interface TTSExecutor extends TokenTracker { /** * Synthesize text to speech * @param text - Text to synthesize - * @returns Base64 encoded audio string + * @returns Base64 encoded audio payload and MIME type */ - synthesize(text: string): Promise; + synthesize(text: string): Promise; } diff --git a/packages/mentora-ai/tests/tts-audio.test.ts b/packages/mentora-ai/tests/tts-audio.test.ts new file mode 100644 index 00000000..8e49e15e --- /dev/null +++ b/packages/mentora-ai/tests/tts-audio.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, it } from "vitest"; +import { encodePcm16AsWav } from "../src/executor/wav.js"; + +describe("encodePcm16AsWav", () => { + it("wraps PCM bytes in a WAV container with the expected header", () => { + const pcmBytes = Uint8Array.from([0x01, 0x02, 0x03, 0x04]); + const result = encodePcm16AsWav( + Buffer.from(pcmBytes).toString("base64"), + ); + const wavBytes = Buffer.from(result.audioBase64, "base64"); + + expect(result.mimeType).toBe("audio/wav"); + expect(wavBytes.toString("ascii", 0, 4)).toBe("RIFF"); + expect(wavBytes.toString("ascii", 8, 12)).toBe("WAVE"); + expect(wavBytes.toString("ascii", 12, 16)).toBe("fmt "); + expect(wavBytes.readUInt16LE(20)).toBe(1); + expect(wavBytes.readUInt16LE(22)).toBe(1); + expect(wavBytes.readUInt32LE(24)).toBe(24_000); + expect(wavBytes.readUInt16LE(34)).toBe(16); + expect(wavBytes.toString("ascii", 36, 40)).toBe("data"); + expect(wavBytes.readUInt32LE(40)).toBe(pcmBytes.length); + expect([...wavBytes.subarray(44)]).toEqual([...pcmBytes]); + }); +}); diff --git a/packages/mentora-api/src/lib/explorer/api-spec.ts b/packages/mentora-api/src/lib/explorer/api-spec.ts index 5fc0174e..b30fe3ba 100644 --- a/packages/mentora-api/src/lib/explorer/api-spec.ts +++ b/packages/mentora-api/src/lib/explorer/api-spec.ts @@ -810,7 +810,7 @@ export const apiModules: APIModule[] = [ response: { text: 'Can you explain your reasoning in more detail?', audio: '', - audioMimeType: 'audio/mp3' + audioMimeType: 'audio/wav' } } } diff --git a/packages/mentora-api/src/lib/server/application/conversation-service.ts b/packages/mentora-api/src/lib/server/application/conversation-service.ts index 6ee573d4..4ce367f7 100644 --- a/packages/mentora-api/src/lib/server/application/conversation-service.ts +++ b/packages/mentora-api/src/lib/server/application/conversation-service.ts @@ -408,11 +408,13 @@ export class ConversationService { const aiTurnId = randomUUID(); let aiAudioBase64: string; - const aiAudioMimeType = 'audio/mp3'; + let aiAudioMimeType: string; try { const ttsExecutor = getTTSExecutor(requestApiKey); ttsExecutor.resetTokenUsage(); - aiAudioBase64 = await ttsExecutor.synthesize(llmResult.aiMessage); + const synthesizedAudio = await ttsExecutor.synthesize(llmResult.aiMessage); + aiAudioBase64 = synthesizedAudio.audioBase64; + aiAudioMimeType = synthesizedAudio.mimeType; ttsUsageReport = createTokenUsageReport([ { feature: TOKEN_USAGE_FEATURES.CONVERSATION_TTS, diff --git a/packages/mentora-api/tests/conversation-service-asr.unit.test.ts b/packages/mentora-api/tests/conversation-service-asr.unit.test.ts index cdc5326a..fb9cb619 100644 --- a/packages/mentora-api/tests/conversation-service-asr.unit.test.ts +++ b/packages/mentora-api/tests/conversation-service-asr.unit.test.ts @@ -119,7 +119,10 @@ function createMockASRExecutor(transcribeResult: string | Error) { function createMockTTSExecutor() { return { resetTokenUsage: vi.fn(), - synthesize: vi.fn().mockResolvedValue('base64-audio-data'), + synthesize: vi.fn().mockResolvedValue({ + audioBase64: 'base64-audio-data', + mimeType: 'audio/wav' + }), getTokenUsage: vi.fn().mockReturnValue({ cachedContentTokenCount: 0, candidatesTokenCount: 10, @@ -215,7 +218,7 @@ describe('ConversationService.addTurn – ASR error handling', () => { expect(result.text).toBe('AI response'); expect(result.audio).toBe('base64-audio-data'); - expect(result.audioMimeType).toBe('audio/mp3'); + expect(result.audioMimeType).toBe('audio/wav'); // Verify ASR was called with correct params const asrExecutor = mockedGetASRExecutor.mock.results[0].value; diff --git a/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts b/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts index c39b37c2..f3aaf3e3 100644 --- a/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts +++ b/packages/mentora-api/tests/sdk-full-scenario.integration.test.ts @@ -151,7 +151,7 @@ describe('Mentora SDK Full Scenario (Integration)', () => { JSON.stringify({ text: 'Mocked assistant response', audio: 'ZmFrZS1hdWRpby1kYXRh', - audioMimeType: 'audio/mp3', + audioMimeType: 'audio/wav', tokenUsage: { byFeature: { conversation_llm: {