@@ -52,7 +52,7 @@ impl Default for PipelineConfig {
5252 default_lang : Lang :: Ru ,
5353 crossfade_ms : DEFAULT_CROSSFADE_MS ,
5454 chunk_tokens : 10 ,
55- max_seq_len : 4096 , // Max sequence length for audio generation
55+ max_seq_len : 400 , // Max sequence length for audio generation (~33 seconds at 12Hz)
5656 default_speaker : None ,
5757 is_custom_voice : false ,
5858 }
@@ -749,50 +749,30 @@ impl TtsPipeline {
749749 info ! ( "min_new_tokens={} (matching Python SDK)" , min_tokens) ;
750750
751751 // ========== COMPUTE trailing_text_hidden ==========
752- // Python SDK (modeling_qwen3_tts.py:2230-2232):
753- // trailing_text_hidden = torch.cat((self.talker.text_projection(
754- // self.talker.get_text_embeddings()(input_id[:, 4:-5])
755- // ), tts_eos_embed), dim=1)
752+ // In non_streaming_mode (which we use), ALL text tokens are already in prefill.
753+ // Python SDK (modeling_qwen3_tts.py:2227):
754+ // trailing_text_hidden = tts_pad_embed
756755 //
757- // This is: text_projection(text_tokens[1:]) concatenated with tts_eos_embed
758- // The first text token goes into prefill, remaining are for trailing conditioning
756+ // This is just a single tts_pad embedding that gets used for ALL generation steps.
757+ // The text conditioning comes from the prefill, not from trailing_text_hidden.
759758 //
760- // For text "Hello world" with tokens [15339, 1917]:
761- // - text_tokens[0] = 15339 goes into prefill (combined with codec embeddings)
762- // - text_tokens[1:] = [1917] + tts_eos becomes trailing_text_hidden
763- //
764- // During generation step i:
765- // - if i < len(trailing_text_hidden): use trailing_text_hidden[i]
766- // - else: use tts_pad_embed
767- // trailing_text_hidden: text conditioning for generation steps
768- // Each step uses trailing_text_hidden[step] until exhausted, then uses tts_pad_embed
769- let trailing_text_hidden = if text_tokens. len ( ) > 1 {
770- // Build trailing tokens: text_tokens[1:] + tts_eos
771- let mut trailing_tokens: Vec < u32 > = text_tokens[ 1 ..] . to_vec ( ) ;
772- trailing_tokens. push ( st. tts_eos_token_id ) ;
773-
774- let trailing_tensor = Tensor :: new ( trailing_tokens. as_slice ( ) , & self . device )
775- . map_err ( |e| TtsError :: inference ( format ! ( "tensor creation failed: {e}" ) ) ) ?
776- . unsqueeze ( 0 )
777- . map_err ( |e| TtsError :: inference ( format ! ( "unsqueeze failed: {e}" ) ) ) ?;
778-
779- // Get text embeddings with projection
780- let trailing_embed = model
781- . get_text_embedding ( & trailing_tensor)
782- . map_err ( |e| TtsError :: inference ( format ! ( "trailing text embed failed: {e}" ) ) ) ?;
783-
784- info ! (
785- "Built trailing_text_hidden from {} tokens (text[1:] + eos), shape: {:?}" ,
786- trailing_tokens. len( ) ,
787- trailing_embed. dims( )
788- ) ;
759+ // IMPORTANT: In non_streaming_mode, trailing_text_hidden is NOT the remaining text!
760+ // It's just tts_pad because all text is already encoded in the prefill.
761+ let tts_pad_tensor = Tensor :: new ( & [ st. tts_pad_token_id ] , & self . device )
762+ . map_err ( |e| TtsError :: inference ( format ! ( "tensor creation failed: {e}" ) ) ) ?
763+ . unsqueeze ( 0 )
764+ . map_err ( |e| TtsError :: inference ( format ! ( "unsqueeze failed: {e}" ) ) ) ?;
789765
790- Some ( trailing_embed)
791- } else {
792- // Only one text token, no trailing hidden needed
793- info ! ( "Single text token, no trailing_text_hidden" ) ;
794- None
795- } ;
766+ let tts_pad_embed = model
767+ . get_text_embedding ( & tts_pad_tensor)
768+ . map_err ( |e| TtsError :: inference ( format ! ( "tts_pad embed failed: {e}" ) ) ) ?;
769+
770+ info ! (
771+ "non_streaming_mode: trailing_text_hidden = tts_pad_embed only, shape: {:?}" ,
772+ tts_pad_embed. dims( )
773+ ) ;
774+
775+ let trailing_text_hidden = Some ( tts_pad_embed) ;
796776
797777 // Use new method with CodePredictor if available
798778 // This correctly sums all 16 codebook embeddings at each generation step
@@ -1247,37 +1227,34 @@ impl TtsPipeline {
12471227 "Acoustic tokens generated"
12481228 ) ;
12491229
1250- // 4. Filter out special tokens before decoding
1251- // Codec special tokens (2148-2157) are control tokens, not audio data:
1252- // - 2148: codec_pad_id
1253- // - 2149: codec_bos_id
1254- // - 2150: codec_eos_id
1255- // - 2154: codec_think_id
1256- // - 2155: codec_nothink_id
1257- // - 2156: codec_think_bos_id
1258- // - 2157: codec_think_eos_id
1259- // Audio tokens are in range 0-2047 (codec vocab_size)
1260- let filtered_tokens: Vec < u32 > = acoustic_tokens
1261- . iter ( )
1262- . filter ( |& & t| t < 2048 ) // valid audio tokens
1263- . copied ( )
1264- . collect ( ) ;
1230+ // 4. acoustic_tokens is in interleaved format: [c0_f0, c1_f0, ..., c15_f0, c0_f1, ...]
1231+ // Convert to multi-codebook format: Vec<Vec<u32>> where each inner Vec is one codebook
1232+ const NUM_CODEBOOKS : usize = 16 ;
1233+ let num_frames = acoustic_tokens. len ( ) / NUM_CODEBOOKS ;
12651234
1266- if filtered_tokens. is_empty ( ) {
1267- return Err ( TtsError :: inference (
1268- "no valid audio tokens generated (all were special tokens)" . to_string ( ) ,
1269- ) ) ;
1235+ if num_frames == 0 {
1236+ return Err ( TtsError :: inference ( "no audio frames generated" . to_string ( ) ) ) ;
1237+ }
1238+
1239+ // Reshape interleaved to [num_codebooks][num_frames]
1240+ let mut multi_tokens: Vec < Vec < u32 > > = vec ! [ Vec :: with_capacity( num_frames) ; NUM_CODEBOOKS ] ;
1241+ for frame_idx in 0 ..num_frames {
1242+ for cb_idx in 0 ..NUM_CODEBOOKS {
1243+ let token = acoustic_tokens[ frame_idx * NUM_CODEBOOKS + cb_idx] ;
1244+ // Replace out-of-range codec special tokens (>= 2048) with 0; valid audio tokens are 0-2047
1245+ let clamped = if token >= 2048 { 0 } else { token } ;
1246+ multi_tokens[ cb_idx] . push ( clamped) ;
1247+ }
12701248 }
12711249
12721250 debug ! (
1273- original = acoustic_tokens. len( ) ,
1274- filtered = filtered_tokens. len( ) ,
1275- removed = acoustic_tokens. len( ) - filtered_tokens. len( ) ,
1276- "Filtered special tokens"
1251+ num_frames = num_frames,
1252+ codebooks = NUM_CODEBOOKS ,
1253+ "Converted interleaved to multi-codebook format"
12771254 ) ;
12781255
1279- // 5. Decode to audio
1280- let audio = self . decode_audio ( & filtered_tokens ) ?;
1256+ // 5. Decode using multi-codebook decoder
1257+ let audio = self . codec . decode_multi ( & multi_tokens ) ?;
12811258 debug ! (
12821259 samples = audio. num_samples( ) ,
12831260 duration_ms = audio. duration_ms( ) ,
0 commit comments