Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions apps/mentora/src/routes/conversations/[id]/+page.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@
showUserReplies = !showUserReplies;
}

function playBase64Audio(base64: string, mimeType: string = "audio/mp3") {
function playBase64Audio(base64: string, mimeType: string) {
try {
const binary = atob(base64);
const bytes = new Uint8Array(binary.length);
Expand Down Expand Up @@ -648,10 +648,7 @@
: m.conversation_error();
awaitingAiReply = false;
} else if (res.data?.audio) {
playBase64Audio(
res.data.audio,
res.data.audioMimeType || "audio/mp3",
);
playBase64Audio(res.data.audio, res.data.audioMimeType);
}
} catch (e) {
console.error("Error sending audio turn:", e);
Expand Down Expand Up @@ -699,10 +696,7 @@
messageInput = "";
showTextInput = false;
if (res.data?.audio) {
playBase64Audio(
res.data.audio,
res.data.audioMimeType || "audio/mp3",
);
playBase64Audio(res.data.audio, res.data.audioMimeType);
}
}
} catch (e) {
Expand Down
1 change: 1 addition & 0 deletions packages/mentora-ai/src/executor/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ export { GeminiContentExecutor } from "./content-generator.js";
export * from "./gemini.js";
export { BaseTokenTracker } from "./token-tracker.js";
export { GeminiTTSExecutor } from "./tts.js";
export { encodePcm16AsWav } from "./wav.js";
51 changes: 42 additions & 9 deletions packages/mentora-ai/src/executor/tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,45 @@
* GeminiTTSExecutor implements text-to-speech using Google Gemini API
*/
import type { GoogleGenAI } from "@google/genai";
import type { TTSExecutor } from "../types.js";
import type { SynthesizedAudio, TTSExecutor } from "../types.js";
import { BaseTokenTracker } from "./token-tracker.js";
import { encodePcm16AsWav } from "./wav.js";

// Normalized (lower-cased, parameter-free) MIME types that
// normalizeGeminiAudioResponse treats as raw PCM and hands to encodePcm16AsWav.
const PCM_MIME_TYPES = new Set(["audio/l16", "audio/pcm"]);

/**
 * Turn the inline audio part of a Gemini TTS response into a SynthesizedAudio.
 *
 * - "audio/wav" / "audio/wave" data is returned unchanged, labelled "audio/wav".
 * - A missing/empty MIME type, or one listed in PCM_MIME_TYPES, is passed to
 *   encodePcm16AsWav with that helper's default options.
 * - Any other MIME type is rejected.
 *
 * @param audioPart - Object carrying the optional base64 `data` and `mimeType`.
 * @returns The base64 audio together with the MIME type "audio/wav".
 * @throws Error when `data` is missing or empty, or the MIME type is unsupported.
 */
function normalizeGeminiAudioResponse(audioPart: {
data?: string;
mimeType?: string;
}): SynthesizedAudio {
const { data } = audioPart;
// An empty string is treated the same as missing data.
if (!data) {
throw new Error("No audio data received from TTS model");
}

// Lower-case the type and drop everything after the first ";" so that, for
// example, "audio/L16;rate=24000" compares as "audio/l16".
// NOTE(review): parameters such as a sample rate are discarded here, so the
// PCM branch below always relies on encodePcm16AsWav's defaults.
const normalizedMimeType = audioPart.mimeType
?.toLowerCase()
.split(";")[0]
?.trim();

// Already WAV: pass the payload through and report the canonical type.
if (
normalizedMimeType === "audio/wav" ||
normalizedMimeType === "audio/wave"
) {
return {
audioBase64: data,
mimeType: "audio/wav",
};
}

// Gemini preview TTS returns raw 24 kHz 16-bit mono PCM by default.
if (!normalizedMimeType || PCM_MIME_TYPES.has(normalizedMimeType)) {
return encodePcm16AsWav(data);
}

// NOTE(review): a missing or empty MIME type is already handled by the PCM
// branch above, so the "<missing>" fallback below looks unreachable.
throw new Error(
`Unsupported TTS audio MIME type: ${audioPart.mimeType ?? "<missing>"}`,
);
Comment on lines +20 to +42
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

normalizeGeminiAudioResponse currently only accepts "audio/wav"/"audio/wave" and raw PCM (missing/"audio/l16"/"audio/pcm"). If Gemini returns other common but browser-playable audio types (e.g. audio/mpeg) or WAV aliases (e.g. audio/x-wav, audio/vnd.wave), this will throw and break TTS despite the PR goal of propagating MIME types. Consider normalizing additional WAV aliases and passing through supported codecs (or explicitly documenting/handling them) instead of throwing.

Copilot uses AI. Check for mistakes.
}

/**
* Gemini-based TTS Executor
Expand All @@ -20,9 +57,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor {
/**
* Synthesize text to speech
* @param text - Text to synthesize
* @returns Base64 encoded audio (MP3 format)
* @returns Base64 encoded browser-playable audio
*/
async synthesize(text: string): Promise<string> {
async synthesize(text: string): Promise<SynthesizedAudio> {
try {
const response = await this.genai.models.generateContent({
model: this.model,
Expand All @@ -48,13 +85,9 @@ export class GeminiTTSExecutor extends BaseTokenTracker implements TTSExecutor {
this.accumulateUsage(response.usageMetadata);

const speech =
response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;

if (!speech) {
throw new Error("No audio data received from TTS model");
}
response.candidates?.[0]?.content?.parts?.[0]?.inlineData;

return speech;
return normalizeGeminiAudioResponse(speech ?? {});
} catch (error) {
console.error(
"[GeminiTTSExecutor] Error synthesizing speech:",
Expand Down
57 changes: 57 additions & 0 deletions packages/mentora-ai/src/executor/wav.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import type { SynthesizedAudio } from "../types.js";

/** Number of header bytes encodePcm16AsWav writes in front of the PCM data. */
const WAV_HEADER_BYTES = 44;

/**
 * Copy `value` into `view`, one byte per UTF-16 code unit, starting at `offset`.
 * Used for the four-character chunk tags of the WAV header.
 *
 * @param view - Destination view; must have room for `value.length` bytes at `offset`.
 * @param offset - Byte position of the first character.
 * @param value - Text to write.
 */
function writeAscii(view: DataView, offset: number, value: string) {
    value.split("").forEach((char, index) => {
        view.setUint8(offset + index, char.charCodeAt(0));
    });
}

/**
 * Wrap raw PCM16LE bytes in a WAV container so browsers can play the result.
 *
 * Builds a 44-byte RIFF/WAVE header (one "fmt " chunk followed by one "data"
 * chunk) and appends the decoded PCM bytes unchanged.
 *
 * @param pcmBase64 - Base64 encoded raw PCM samples.
 * @param options - Optional format description; defaults are channelCount 1,
 *   sampleRate 24_000 and bytesPerSample 2. The values are written to the
 *   header as given and are not validated against the PCM data.
 * @returns The WAV file as base64 together with the MIME type "audio/wav".
 */
export function encodePcm16AsWav(
pcmBase64: string,
{
channelCount = 1,
sampleRate = 24_000,
bytesPerSample = 2,
}: {
channelCount?: number;
sampleRate?: number;
bytesPerSample?: number;
} = {},
): SynthesizedAudio {
Comment on lines +23 to +25
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

encodePcm16AsWav exposes bytesPerSample/channelCount/sampleRate options, so it can generate WAV headers for non-16-bit PCM as well. The current name implies it only supports PCM16; consider either asserting bytesPerSample===2 (and maybe channelCount/sampleRate expectations) or renaming the helper to reflect the broader capability to avoid misuse.

Suggested change
bytesPerSample?: number;
} = {},
): SynthesizedAudio {
bytesPerSample?: 2;
} = {},
): SynthesizedAudio {
if (bytesPerSample !== 2) {
throw new Error(
`encodePcm16AsWav only supports 16-bit PCM input (bytesPerSample must be 2, received ${bytesPerSample}).`,
);
}

Copilot uses AI. Check for mistakes.
// Decode the PCM payload and allocate one buffer for header plus samples.
const pcmBytes = Buffer.from(pcmBase64, "base64");
const wavBytes = new Uint8Array(WAV_HEADER_BYTES + pcmBytes.length);
const view = new DataView(
wavBytes.buffer,
wavBytes.byteOffset,
wavBytes.byteLength,
);

// Derived "fmt " fields.
const byteRate = sampleRate * channelCount * bytesPerSample;
const blockAlign = channelCount * bytesPerSample;
const bitsPerSample = bytesPerSample * 8;

// All multi-byte fields are written little-endian (third argument `true`).
writeAscii(view, 0, "RIFF");
// RIFF chunk size: total length (44 + data) minus the 8-byte RIFF id/size prefix.
view.setUint32(4, 36 + pcmBytes.length, true);
writeAscii(view, 8, "WAVE");
writeAscii(view, 12, "fmt ");
// Length of the "fmt " chunk body that follows (bytes 20-35).
view.setUint32(16, 16, true);
// Audio format tag 1 = uncompressed PCM.
view.setUint16(20, 1, true);
view.setUint16(22, channelCount, true);
view.setUint32(24, sampleRate, true);
view.setUint32(28, byteRate, true);
view.setUint16(32, blockAlign, true);
view.setUint16(34, bitsPerSample, true);
writeAscii(view, 36, "data");
// Size of the sample data that follows the header.
view.setUint32(40, pcmBytes.length, true);
// Copy the PCM samples in directly after the header.
wavBytes.set(pcmBytes, WAV_HEADER_BYTES);

return {
audioBase64: Buffer.from(wavBytes).toString("base64"),
mimeType: "audio/wav",
};
}
14 changes: 12 additions & 2 deletions packages/mentora-ai/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ export interface TokenTracker {
resetTokenUsage(): void;
}

/**
 * Synthesized audio payload ready for downstream consumers.
 *
 * Pairs the encoded audio with its MIME type so that consumers do not have to
 * assume a fixed format.
 */
export interface SynthesizedAudio {
/** Base64 encoded audio bytes */
audioBase64: string;
/** IANA MIME type for the audio payload (for example "audio/wav") */
mimeType: string;
}

/**
* Executor interface for running prompts against an LLM
*/
Expand Down Expand Up @@ -113,7 +123,7 @@ export interface TTSExecutor extends TokenTracker {
/**
* Synthesize text to speech
* @param text - Text to synthesize
* @returns Base64 encoded audio string
* @returns Base64 encoded audio payload and MIME type
*/
synthesize(text: string): Promise<string>;
synthesize(text: string): Promise<SynthesizedAudio>;
}
24 changes: 24 additions & 0 deletions packages/mentora-ai/tests/tts-audio.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { describe, expect, it } from "vitest";
import { encodePcm16AsWav } from "../src/executor/wav.js";

// Header layout checked below (little-endian, canonical 44-byte PCM WAV):
//   0 "RIFF" | 4 riffSize | 8 "WAVE" | 12 "fmt " | 16 fmtSize | 20 format
//   22 channels | 24 sampleRate | 28 byteRate | 32 blockAlign | 34 bitsPerSample
//   36 "data" | 40 dataSize | 44 samples
describe("encodePcm16AsWav", () => {
    it("wraps PCM bytes in a WAV container with the expected header", () => {
        const pcmBytes = Uint8Array.from([0x01, 0x02, 0x03, 0x04]);
        const result = encodePcm16AsWav(
            Buffer.from(pcmBytes).toString("base64"),
        );
        const wavBytes = Buffer.from(result.audioBase64, "base64");

        expect(result.mimeType).toBe("audio/wav");
        expect(wavBytes.length).toBe(44 + pcmBytes.length);
        expect(wavBytes.toString("ascii", 0, 4)).toBe("RIFF");
        // RIFF chunk size excludes the 8-byte "RIFF" + size prefix.
        expect(wavBytes.readUInt32LE(4)).toBe(36 + pcmBytes.length);
        expect(wavBytes.toString("ascii", 8, 12)).toBe("WAVE");
        expect(wavBytes.toString("ascii", 12, 16)).toBe("fmt ");
        expect(wavBytes.readUInt32LE(16)).toBe(16);
        expect(wavBytes.readUInt16LE(20)).toBe(1);
        expect(wavBytes.readUInt16LE(22)).toBe(1);
        expect(wavBytes.readUInt32LE(24)).toBe(24_000);
        // Defaults: 24 kHz * 1 channel * 2 bytes per sample.
        expect(wavBytes.readUInt32LE(28)).toBe(48_000);
        expect(wavBytes.readUInt16LE(32)).toBe(2);
        expect(wavBytes.readUInt16LE(34)).toBe(16);
        expect(wavBytes.toString("ascii", 36, 40)).toBe("data");
        expect(wavBytes.readUInt32LE(40)).toBe(pcmBytes.length);
        expect([...wavBytes.subarray(44)]).toEqual([...pcmBytes]);
    });

    it("reflects custom channel count and sample rate in the header", () => {
        const pcmBytes = Uint8Array.from([
            0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        ]);
        const result = encodePcm16AsWav(
            Buffer.from(pcmBytes).toString("base64"),
            { channelCount: 2, sampleRate: 48_000 },
        );
        const wavBytes = Buffer.from(result.audioBase64, "base64");

        expect(result.mimeType).toBe("audio/wav");
        expect(wavBytes.readUInt16LE(22)).toBe(2);
        expect(wavBytes.readUInt32LE(24)).toBe(48_000);
        // 48 kHz * 2 channels * 2 bytes per sample.
        expect(wavBytes.readUInt32LE(28)).toBe(192_000);
        expect(wavBytes.readUInt16LE(32)).toBe(4);
        expect(wavBytes.readUInt16LE(34)).toBe(16);
        expect(wavBytes.readUInt32LE(40)).toBe(pcmBytes.length);
        expect([...wavBytes.subarray(44)]).toEqual([...pcmBytes]);
    });
});
2 changes: 1 addition & 1 deletion packages/mentora-api/src/lib/explorer/api-spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ export const apiModules: APIModule[] = [
response: {
text: 'Can you explain your reasoning in more detail?',
audio: '<base64>',
audioMimeType: 'audio/mp3'
audioMimeType: 'audio/wav'
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,11 +408,13 @@ export class ConversationService {

const aiTurnId = randomUUID();
let aiAudioBase64: string;
const aiAudioMimeType = 'audio/mp3';
let aiAudioMimeType: string;
try {
const ttsExecutor = getTTSExecutor(requestApiKey);
ttsExecutor.resetTokenUsage();
aiAudioBase64 = await ttsExecutor.synthesize(llmResult.aiMessage);
const synthesizedAudio = await ttsExecutor.synthesize(llmResult.aiMessage);
aiAudioBase64 = synthesizedAudio.audioBase64;
aiAudioMimeType = synthesizedAudio.mimeType;
ttsUsageReport = createTokenUsageReport([
{
feature: TOKEN_USAGE_FEATURES.CONVERSATION_TTS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@ function createMockASRExecutor(transcribeResult: string | Error) {
function createMockTTSExecutor() {
return {
resetTokenUsage: vi.fn(),
synthesize: vi.fn().mockResolvedValue('base64-audio-data'),
synthesize: vi.fn().mockResolvedValue({
audioBase64: 'base64-audio-data',
mimeType: 'audio/wav'
}),
getTokenUsage: vi.fn().mockReturnValue({
cachedContentTokenCount: 0,
candidatesTokenCount: 10,
Expand Down Expand Up @@ -215,7 +218,7 @@ describe('ConversationService.addTurn – ASR error handling', () => {

expect(result.text).toBe('AI response');
expect(result.audio).toBe('base64-audio-data');
expect(result.audioMimeType).toBe('audio/mp3');
expect(result.audioMimeType).toBe('audio/wav');

// Verify ASR was called with correct params
const asrExecutor = mockedGetASRExecutor.mock.results[0].value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ describe('Mentora SDK Full Scenario (Integration)', () => {
JSON.stringify({
text: 'Mocked assistant response',
audio: 'ZmFrZS1hdWRpby1kYXRh',
audioMimeType: 'audio/mp3',
audioMimeType: 'audio/wav',
tokenUsage: {
byFeature: {
conversation_llm: {
Expand Down
Loading