11import Foundation
2+ import os. log
23import DialectCore
34
// File-private logger for the on-device TTS pipeline. Messages emitted through it
// in this file are prefixed with "[TTS]" and report per-stage progress and diagnostics.
5+ private let ttsLog = Logger ( subsystem: " com.babyap.tts " , category: " OnDeviceTTS " )
6+
47/// On-device TTS engine using ONNX Runtime for CosyVoice 3 inference.
58///
69/// Runs the full CosyVoice 3 pipeline locally:
@@ -74,15 +77,19 @@ actor OnDeviceTTSEngine: TTSEngine {
7477
7578 // Load voice data (speaker embedding, prompt tokens, prompt mel)
7679 let voiceData = try loadVoiceData ( for: voice)
80+ ttsLog. info ( " [TTS] Voice data loaded: emb= \( voiceData. speakerEmbedding. count) , tokens= \( voiceData. promptSpeechTokens. count) , melFrames= \( voiceData. promptMelFrames) " )
7781
7882 // Stage 1: LLM — text + prompt → speech tokens
83+ ttsLog. info ( " [TTS] Stage 1: Starting LLM inference for ' \( text. prefix ( 50 ) ) '... " )
7984 let speechTokens = try await llm. generateSpeechTokens (
8085 text: text,
8186 promptText: voiceData. promptText,
8287 promptSpeechTokens: voiceData. promptSpeechTokens
8388 )
89+ ttsLog. info ( " [TTS] Stage 1 complete: \( speechTokens. count) speech tokens generated " )
8490
8591 // Stage 2 + 3: Flow matching → mel → vocoder → PCM audio
92+ ttsLog. info ( " [TTS] Stage 2+3: Starting flow matching + vocoder... " )
8693 let pcmSamples = try await vocoder. generateAudio (
8794 speechTokens: speechTokens,
8895 speakerEmbedding: voiceData. speakerEmbedding,
@@ -91,11 +98,22 @@ actor OnDeviceTTSEngine: TTSEngine {
9198 promptMelFrames: voiceData. promptMelFrames
9299 )
93100
101+ // Diagnostic: check audio sample stats
102+ let absMax = pcmSamples. map { abs ( $0) } . max ( ) ?? 0
103+ let nonZero = pcmSamples. filter { abs ( $0) > 1e-6 } . count
104+ ttsLog. info ( " [TTS] Stage 2+3 complete: \( pcmSamples. count) samples, absMax= \( absMax) , nonZero= \( nonZero) / \( pcmSamples. count) " )
105+
106+ if absMax < 1e-4 {
107+ ttsLog. warning ( " [TTS] WARNING: Audio samples are near-zero — output will be silent! " )
108+ }
109+
94110 // Convert PCM Float32 → 16-bit WAV
95- return WAVEncoder . encode (
111+ let wavData = WAVEncoder . encode (
96112 samples: pcmSamples,
97113 sampleRate: FlowVocoderInference . sampleRate
98114 )
115+ ttsLog. info ( " [TTS] WAV encoded: \( wavData. count) bytes, duration ~ \( Double ( pcmSamples. count) / Double( FlowVocoderInference . sampleRate) ) s " )
116+ return wavData
99117 }
100118
101119 /// Release all cached data and ONNX sessions to free memory.
@@ -131,6 +149,7 @@ actor OnDeviceTTSEngine: TTSEngine {
131149 // Check app bundle first (voice data is small and bundled with the app),
132150 // then fall back to the downloaded models directory.
133151 let voicesDir = voiceDataDirectory ( for: voice. id)
152+ ttsLog. info ( " [TTS] Loading voice data from: \( voicesDir. path) " )
134153
135154 // Load speaker embedding (192 × Float32)
136155 let embPath = voicesDir. appendingPathComponent ( " speaker_embedding.bin " )
@@ -177,9 +196,13 @@ actor OnDeviceTTSEngine: TTSEngine {
177196 /// Resolve the directory containing voice data files for the given voice ID.
178197 /// Checks the app bundle first, then the downloaded models directory.
179198 private func voiceDataDirectory( for voiceId: String ) -> URL {
180- // App bundle: App/Resources/voices/{voiceId}/
181- if let bundlePath = Bundle . main. path ( forResource: " speaker_embedding " , ofType: " bin " , inDirectory: " voices/ \( voiceId) " ) {
182- return URL ( fileURLWithPath: bundlePath) . deletingLastPathComponent ( )
199+ // App bundle: voices/{voiceId}/ (folder reference)
200+ let bundleDir = Bundle . main. bundleURL
201+ . appendingPathComponent ( " voices " , isDirectory: true )
202+ . appendingPathComponent ( voiceId, isDirectory: true )
203+ let embFile = bundleDir. appendingPathComponent ( " speaker_embedding.bin " )
204+ if FileManager . default. fileExists ( atPath: embFile. path) {
205+ return bundleDir
183206 }
184207 // Fallback: downloaded models directory
185208 return ONNXSessionManager . defaultModelsDirectory
0 commit comments