fix: bundle path resolution + add TTS pipeline diagnostic logging

psypeal · claude · psypeal · commit aeb24cb21d0b · 2026-02-16T03:12:30.000-08:00
Fix voice data directory resolution to use Bundle.main.bundleURL
instead of Bundle.main.path(forResource:ofType:inDirectory:), which
doesn't work with folder references. Add comprehensive os.Logger
diagnostics across all pipeline stages (LLM, Flow, HiFT) to trace
audio generation issues via the Xcode console.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/App/Services/TTS/FlowVocoderInference.swift b/App/Services/TTS/FlowVocoderInference.swift
@@ -1,6 +1,9 @@
 import Foundation
+import os.log
 import OnnxRuntimeBindings
 
+private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "FlowVocoder")
+
 /// Runs CosyVoice 3 Flow Matching (Stage 2) and HiFT Vocoder (Stage 3) using ONNX Runtime.
 ///
 /// Stage 2 converts speech tokens → mel spectrogram via conditional flow matching.
@@ -75,6 +78,9 @@ actor FlowVocoderInference {
             promptMel: promptMel,
             promptMelFrames: promptMelFrames
         )
+        let melAbsMax = mel.data.map { abs($0) }.max() ?? 0
+        let melHasNaN = mel.data.contains { $0.isNaN || $0.isInfinite }
+        ttsLog.info("[Flow] Mel output: \(mel.melLen) frames, absMax=\(melAbsMax), hasNaN=\(melHasNaN)")
         return try await hiftInference(mel: mel.data, melLen: mel.melLen)
     }
 
@@ -221,16 +227,22 @@ actor FlowVocoderInference {
     private func hiftInference(mel: [Float], melLen: Int) async throws -> [Float] {
 
         // Step 1: Predict F0 — [1, 80, melLen] → [1, melLen]
+        ttsLog.info("[HiFT] Step 1: Predicting F0 from mel [1,80,\(melLen)]...")
         let f0 = try await runModel(
             hiftF0PredictorModel,
             inputs: ["mel": try makeFloatTensor(mel, shape: [1, Self.melBins, melLen])]
         )
+        let f0Max = f0.max() ?? 0
+        let f0Min = f0.min() ?? 0
+        ttsLog.info("[HiFT] F0: \(f0.count) values, range [\(f0Min), \(f0Max)]")
 
         // Step 2: Source generator — [1, 1, melLen] → [1, 1, timeUp]
+        ttsLog.info("[HiFT] Step 2: Source generation from F0...")
         let source = try await runModel(
             hiftSourceGeneratorModel,
             inputs: ["f0": try makeFloatTensor(f0, shape: [1, 1, f0.count])]
         )
+        ttsLog.info("[HiFT] Source: \(source.count) samples, absMax=\(source.map { abs($0) }.max() ?? 0)")
 
         // Step 3: STFT of source signal using AudioMath
         // n_fft=16, hop=4, center=true, Hann window → [18, stftFrames]
@@ -269,9 +281,11 @@ actor FlowVocoderInference {
         )
 
         // Step 6: Clip audio to [-0.99, 0.99]
+        let preClipMax = audio.map { abs($0) }.max() ?? 0
         for i in 0..<audio.count {
             audio[i] = min(max(audio[i], -Self.audioLimit), Self.audioLimit)
         }
+        ttsLog.info("[HiFT] Audio output: \(audio.count) samples, preClipAbsMax=\(preClipMax)")
 
         return audio
     }
diff --git a/App/Services/TTS/LLMInference.swift b/App/Services/TTS/LLMInference.swift
@@ -1,6 +1,9 @@
 import Foundation
+import os.log
 import OnnxRuntimeBindings
 
+private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "LLMInference")
+
 /// Runs the CosyVoice 3 LLM autoregressive inference pipeline using ONNX Runtime.
 ///
 /// This is Stage 1 of the CosyVoice 3 TTS pipeline:
@@ -105,6 +108,7 @@ actor LLMInference {
         // Step 8: Autoregressive decode loop
         let minTokens = max(10, ttsTextLen * 2)
         let maxTokens = min(500, ttsTextLen * 20)
+        ttsLog.info("[LLM] Starting decode loop: textLen=\(ttsTextLen), minTokens=\(minTokens), maxTokens=\(maxTokens)")
 
         var outputTokens: [Int64] = []
         var kvCache = initialKVCache
@@ -141,6 +145,7 @@ actor LLMInference {
             logits = try await runLLMDecoder(hiddenState: newHidden)
         }
 
+        ttsLog.info("[LLM] Decode complete: \(outputTokens.count) tokens generated. Range: [\(outputTokens.min() ?? 0), \(outputTokens.max() ?? 0)]")
         return outputTokens
     }
 
diff --git a/App/Services/TTS/OnDeviceTTSEngine.swift b/App/Services/TTS/OnDeviceTTSEngine.swift
@@ -1,6 +1,9 @@
 import Foundation
+import os.log
 import DialectCore
 
+private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "OnDeviceTTS")
+
 /// On-device TTS engine using ONNX Runtime for CosyVoice 3 inference.
 ///
 /// Runs the full CosyVoice 3 pipeline locally:
@@ -74,15 +77,19 @@ actor OnDeviceTTSEngine: TTSEngine {
 
         // Load voice data (speaker embedding, prompt tokens, prompt mel)
         let voiceData = try loadVoiceData(for: voice)
+        ttsLog.info("[TTS] Voice data loaded: emb=\(voiceData.speakerEmbedding.count), tokens=\(voiceData.promptSpeechTokens.count), melFrames=\(voiceData.promptMelFrames)")
 
         // Stage 1: LLM — text + prompt → speech tokens
+        ttsLog.info("[TTS] Stage 1: Starting LLM inference for '\(text.prefix(50))'...")
         let speechTokens = try await llm.generateSpeechTokens(
             text: text,
             promptText: voiceData.promptText,
             promptSpeechTokens: voiceData.promptSpeechTokens
         )
+        ttsLog.info("[TTS] Stage 1 complete: \(speechTokens.count) speech tokens generated")
 
         // Stage 2 + 3: Flow matching → mel → vocoder → PCM audio
+        ttsLog.info("[TTS] Stage 2+3: Starting flow matching + vocoder...")
         let pcmSamples = try await vocoder.generateAudio(
             speechTokens: speechTokens,
             speakerEmbedding: voiceData.speakerEmbedding,
@@ -91,11 +98,22 @@ actor OnDeviceTTSEngine: TTSEngine {
             promptMelFrames: voiceData.promptMelFrames
         )
 
+        // Diagnostic: check audio sample stats
+        let absMax = pcmSamples.map { abs($0) }.max() ?? 0
+        let nonZero = pcmSamples.filter { abs($0) > 1e-6 }.count
+        ttsLog.info("[TTS] Stage 2+3 complete: \(pcmSamples.count) samples, absMax=\(absMax), nonZero=\(nonZero)/\(pcmSamples.count)")
+
+        if absMax < 1e-4 {
+            ttsLog.warning("[TTS] WARNING: Audio samples are near-zero — output will be silent!")
+        }
+
         // Convert PCM Float32 → 16-bit WAV
-        return WAVEncoder.encode(
+        let wavData = WAVEncoder.encode(
             samples: pcmSamples,
             sampleRate: FlowVocoderInference.sampleRate
         )
+        ttsLog.info("[TTS] WAV encoded: \(wavData.count) bytes, duration ~\(Double(pcmSamples.count) / Double(FlowVocoderInference.sampleRate))s")
+        return wavData
     }
 
     /// Release all cached data and ONNX sessions to free memory.
@@ -131,6 +149,7 @@ actor OnDeviceTTSEngine: TTSEngine {
         // Check app bundle first (voice data is small and bundled with the app),
         // then fall back to the downloaded models directory.
         let voicesDir = voiceDataDirectory(for: voice.id)
+        ttsLog.info("[TTS] Loading voice data from: \(voicesDir.path)")
 
         // Load speaker embedding (192 × Float32)
         let embPath = voicesDir.appendingPathComponent("speaker_embedding.bin")
@@ -177,9 +196,13 @@ actor OnDeviceTTSEngine: TTSEngine {
     /// Resolve the directory containing voice data files for the given voice ID.
     /// Checks the app bundle first, then the downloaded models directory.
     private func voiceDataDirectory(for voiceId: String) -> URL {
-        // App bundle: App/Resources/voices/{voiceId}/
-        if let bundlePath = Bundle.main.path(forResource: "speaker_embedding", ofType: "bin", inDirectory: "voices/\(voiceId)") {
-            return URL(fileURLWithPath: bundlePath).deletingLastPathComponent()
+        // App bundle: voices/{voiceId}/ (folder reference)
+        let bundleDir = Bundle.main.bundleURL
+            .appendingPathComponent("voices", isDirectory: true)
+            .appendingPathComponent(voiceId, isDirectory: true)
+        let embFile = bundleDir.appendingPathComponent("speaker_embedding.bin")
+        if FileManager.default.fileExists(atPath: embFile.path) {
+            return bundleDir
         }
         // Fallback: downloaded models directory
         return ONNXSessionManager.defaultModelsDirectory