Skip to content

Commit aeb24cb

Browse files
psypeal authored and claude committed
fix: bundle path resolution + add TTS pipeline diagnostic logging
Fix voice data directory resolution to use Bundle.main.bundleURL instead of Bundle.main.path(forResource:) which doesn't work with folder references. Add comprehensive os.Logger diagnostics across all pipeline stages (LLM, Flow, HiFT) to trace audio generation issues via Xcode console. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f0e257b commit aeb24cb

3 files changed

Lines changed: 46 additions & 4 deletions

File tree

App/Services/TTS/FlowVocoderInference.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import Foundation
2+
import os.log
23
import OnnxRuntimeBindings
34

5+
private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "FlowVocoder")
6+
47
/// Runs CosyVoice 3 Flow Matching (Stage 2) and HiFT Vocoder (Stage 3) using ONNX Runtime.
58
///
69
/// Stage 2 converts speech tokens → mel spectrogram via conditional flow matching.
@@ -75,6 +78,9 @@ actor FlowVocoderInference {
7578
promptMel: promptMel,
7679
promptMelFrames: promptMelFrames
7780
)
81+
let melAbsMax = mel.data.map { abs($0) }.max() ?? 0
82+
let melHasNaN = mel.data.contains { $0.isNaN || $0.isInfinite }
83+
ttsLog.info("[Flow] Mel output: \(mel.melLen) frames, absMax=\(melAbsMax), hasNaN=\(melHasNaN)")
7884
return try await hiftInference(mel: mel.data, melLen: mel.melLen)
7985
}
8086

@@ -221,16 +227,22 @@ actor FlowVocoderInference {
221227
private func hiftInference(mel: [Float], melLen: Int) async throws -> [Float] {
222228

223229
// Step 1: Predict F0 — [1, 80, melLen] → [1, melLen]
230+
ttsLog.info("[HiFT] Step 1: Predicting F0 from mel [1,80,\(melLen)]...")
224231
let f0 = try await runModel(
225232
hiftF0PredictorModel,
226233
inputs: ["mel": try makeFloatTensor(mel, shape: [1, Self.melBins, melLen])]
227234
)
235+
let f0Max = f0.max() ?? 0
236+
let f0Min = f0.min() ?? 0
237+
ttsLog.info("[HiFT] F0: \(f0.count) values, range [\(f0Min), \(f0Max)]")
228238

229239
// Step 2: Source generator — [1, 1, melLen] → [1, 1, timeUp]
240+
ttsLog.info("[HiFT] Step 2: Source generation from F0...")
230241
let source = try await runModel(
231242
hiftSourceGeneratorModel,
232243
inputs: ["f0": try makeFloatTensor(f0, shape: [1, 1, f0.count])]
233244
)
245+
ttsLog.info("[HiFT] Source: \(source.count) samples, absMax=\(source.map { abs($0) }.max() ?? 0)")
234246

235247
// Step 3: STFT of source signal using AudioMath
236248
// n_fft=16, hop=4, center=true, Hann window → [18, stftFrames]
@@ -269,9 +281,11 @@ actor FlowVocoderInference {
269281
)
270282

271283
// Step 6: Clip audio to [-0.99, 0.99]
284+
let preClipMax = audio.map { abs($0) }.max() ?? 0
272285
for i in 0..<audio.count {
273286
audio[i] = min(max(audio[i], -Self.audioLimit), Self.audioLimit)
274287
}
288+
ttsLog.info("[HiFT] Audio output: \(audio.count) samples, preClipAbsMax=\(preClipMax)")
275289

276290
return audio
277291
}

App/Services/TTS/LLMInference.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import Foundation
2+
import os.log
23
import OnnxRuntimeBindings
34

5+
private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "LLMInference")
6+
47
/// Runs the CosyVoice 3 LLM autoregressive inference pipeline using ONNX Runtime.
58
///
69
/// This is Stage 1 of the CosyVoice 3 TTS pipeline:
@@ -105,6 +108,7 @@ actor LLMInference {
105108
// Step 8: Autoregressive decode loop
106109
let minTokens = max(10, ttsTextLen * 2)
107110
let maxTokens = min(500, ttsTextLen * 20)
111+
ttsLog.info("[LLM] Starting decode loop: textLen=\(ttsTextLen), minTokens=\(minTokens), maxTokens=\(maxTokens)")
108112

109113
var outputTokens: [Int64] = []
110114
var kvCache = initialKVCache
@@ -141,6 +145,7 @@ actor LLMInference {
141145
logits = try await runLLMDecoder(hiddenState: newHidden)
142146
}
143147

148+
ttsLog.info("[LLM] Decode complete: \(outputTokens.count) tokens generated. Range: [\(outputTokens.min() ?? 0), \(outputTokens.max() ?? 0)]")
144149
return outputTokens
145150
}
146151

App/Services/TTS/OnDeviceTTSEngine.swift

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import Foundation
2+
import os.log
23
import DialectCore
34

5+
private let ttsLog = Logger(subsystem: "com.babyap.tts", category: "OnDeviceTTS")
6+
47
/// On-device TTS engine using ONNX Runtime for CosyVoice 3 inference.
58
///
69
/// Runs the full CosyVoice 3 pipeline locally:
@@ -74,15 +77,19 @@ actor OnDeviceTTSEngine: TTSEngine {
7477

7578
// Load voice data (speaker embedding, prompt tokens, prompt mel)
7679
let voiceData = try loadVoiceData(for: voice)
80+
ttsLog.info("[TTS] Voice data loaded: emb=\(voiceData.speakerEmbedding.count), tokens=\(voiceData.promptSpeechTokens.count), melFrames=\(voiceData.promptMelFrames)")
7781

7882
// Stage 1: LLM — text + prompt → speech tokens
83+
ttsLog.info("[TTS] Stage 1: Starting LLM inference for '\(text.prefix(50))'...")
7984
let speechTokens = try await llm.generateSpeechTokens(
8085
text: text,
8186
promptText: voiceData.promptText,
8287
promptSpeechTokens: voiceData.promptSpeechTokens
8388
)
89+
ttsLog.info("[TTS] Stage 1 complete: \(speechTokens.count) speech tokens generated")
8490

8591
// Stage 2 + 3: Flow matching → mel → vocoder → PCM audio
92+
ttsLog.info("[TTS] Stage 2+3: Starting flow matching + vocoder...")
8693
let pcmSamples = try await vocoder.generateAudio(
8794
speechTokens: speechTokens,
8895
speakerEmbedding: voiceData.speakerEmbedding,
@@ -91,11 +98,22 @@ actor OnDeviceTTSEngine: TTSEngine {
9198
promptMelFrames: voiceData.promptMelFrames
9299
)
93100

101+
// Diagnostic: check audio sample stats
102+
let absMax = pcmSamples.map { abs($0) }.max() ?? 0
103+
let nonZero = pcmSamples.filter { abs($0) > 1e-6 }.count
104+
ttsLog.info("[TTS] Stage 2+3 complete: \(pcmSamples.count) samples, absMax=\(absMax), nonZero=\(nonZero)/\(pcmSamples.count)")
105+
106+
if absMax < 1e-4 {
107+
ttsLog.warning("[TTS] WARNING: Audio samples are near-zero — output will be silent!")
108+
}
109+
94110
// Convert PCM Float32 → 16-bit WAV
95-
return WAVEncoder.encode(
111+
let wavData = WAVEncoder.encode(
96112
samples: pcmSamples,
97113
sampleRate: FlowVocoderInference.sampleRate
98114
)
115+
ttsLog.info("[TTS] WAV encoded: \(wavData.count) bytes, duration ~\(Double(pcmSamples.count) / Double(FlowVocoderInference.sampleRate))s")
116+
return wavData
99117
}
100118

101119
/// Release all cached data and ONNX sessions to free memory.
@@ -131,6 +149,7 @@ actor OnDeviceTTSEngine: TTSEngine {
131149
// Check app bundle first (voice data is small and bundled with the app),
132150
// then fall back to the downloaded models directory.
133151
let voicesDir = voiceDataDirectory(for: voice.id)
152+
ttsLog.info("[TTS] Loading voice data from: \(voicesDir.path)")
134153

135154
// Load speaker embedding (192 × Float32)
136155
let embPath = voicesDir.appendingPathComponent("speaker_embedding.bin")
@@ -177,9 +196,13 @@ actor OnDeviceTTSEngine: TTSEngine {
177196
/// Resolve the directory containing voice data files for the given voice ID.
178197
/// Checks the app bundle first, then the downloaded models directory.
179198
private func voiceDataDirectory(for voiceId: String) -> URL {
180-
// App bundle: App/Resources/voices/{voiceId}/
181-
if let bundlePath = Bundle.main.path(forResource: "speaker_embedding", ofType: "bin", inDirectory: "voices/\(voiceId)") {
182-
return URL(fileURLWithPath: bundlePath).deletingLastPathComponent()
199+
// App bundle: voices/{voiceId}/ (folder reference)
200+
let bundleDir = Bundle.main.bundleURL
201+
.appendingPathComponent("voices", isDirectory: true)
202+
.appendingPathComponent(voiceId, isDirectory: true)
203+
let embFile = bundleDir.appendingPathComponent("speaker_embedding.bin")
204+
if FileManager.default.fileExists(atPath: embFile.path) {
205+
return bundleDir
183206
}
184207
// Fallback: downloaded models directory
185208
return ONNXSessionManager.defaultModelsDirectory

0 commit comments

Comments (0)