Skip to content

Commit 42da55b

Browse files
committed
Fix model input data type
Signed-off-by: Edresson Casanova <edresson1@gmail.com>
1 parent e0af62d commit 42da55b

3 files changed

Lines changed: 8 additions & 8 deletions

File tree

nemo/collections/common/data/lhotse/cutset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1105,7 +1105,7 @@ def convert_cut_fn(cut: Cut) -> Cut:
11051105
assert new_cut.recording is old_target_audio, f"{new_cut.id}: recording object not swapped"
11061106
assert new_cut.target_audio is old_recording, f"{new_cut.id}: target_audio object not swapped"
11071107

1108-
new_cut.formatter = "s2s_duplex_reverse_role"
1108+
new_cut.task = "s2s_duplex_reverse_role"
11091109
return new_cut
11101110

11111111
cuts = cuts.map(convert_cut_fn)

nemo/collections/speechlm2/models/duplex_ear_tts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def get_codec_silence_frame_last_one(self):
131131
audio, audio_len = self.pad_audio_to_factor(audio, audio_len, self.target_samples_per_frame)
132132

133133
with ensures_target_precision(self.audio_codec_run_dtype), torch.no_grad():
134-
sil_codes, sil_codes_lens = self.audio_codec.encode(audio.unsqueeze(1), audio_len)
134+
sil_codes, sil_codes_lens = self.audio_codec.encode(audio.unsqueeze(1).to(self.audio_codec_run_dtype), audio_len)
135135
return sil_codes[0, -1]
136136

137137
def get_codec_silence_frame(self):
@@ -142,7 +142,7 @@ def get_codec_silence_frame(self):
142142
audio, audio_len = self.pad_audio_to_factor(audio, audio_len, self.target_samples_per_frame)
143143

144144
with ensures_target_precision(self.audio_codec_run_dtype), torch.no_grad():
145-
sil_codes, _ = self.audio_codec.encode(audio.unsqueeze(1), audio_len) # [1, T, C]
145+
sil_codes, _ = self.audio_codec.encode(audio.unsqueeze(1).to(self.audio_codec_run_dtype), audio_len) # [1, T, C]
146146
sil_codes = sil_codes[0] # [T, C]
147147

148148
# Convert each frame (C tokens) into a tuple
@@ -328,7 +328,7 @@ def prepare_inputs(self, batch: dict):
328328
target_audio, target_audio_lens, self.target_samples_per_frame, 1
329329
)
330330
with ensures_target_precision(self.audio_codec_run_dtype), torch.no_grad():
331-
target_codes, target_codes_lens = self.audio_codec.encode(target_audio.unsqueeze(1), target_audio_lens)
331+
target_codes, target_codes_lens = self.audio_codec.encode(target_audio.unsqueeze(1).to(self.audio_codec_run_dtype), target_audio_lens)
332332

333333
with fp32_precision():
334334
target_len = target_codes.shape[1]
@@ -1013,7 +1013,7 @@ def set_init_inputs(self, speaker_audio=None, speaker_audio_lens=None, system_pr
10131013
[target_audio.size(-1)] * target_audio.size(0), dtype=torch.long, device=self.device
10141014
)
10151015
with ensures_target_precision(self.audio_codec_run_dtype), torch.no_grad():
1016-
code, _ = self.audio_codec.encode(target_audio.unsqueeze(1), target_audio_len)
1016+
code, _ = self.audio_codec.encode(target_audio.unsqueeze(1).to(self.audio_codec_run_dtype), target_audio_len)
10171017

10181018
# get context hidden
10191019
if self.cfg.tts_config.context_hidden_size is not None:
@@ -1683,7 +1683,7 @@ def setup_audio_codec(model):
16831683
p.requires_grad = False
16841684

16851685
model.audio_codec.eval()
1686-
model.audio_codec.to(model.device) # force codec to run in the same device as the main model
1686+
model.audio_codec.to(model.device) # force codec to run in the same device as the main model
16871687

16881688
assert callable(model.tts_model.set_rvq_embs)
16891689

nemo/collections/speechlm2/modules/ear_tts_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,7 @@ def depthsum_embedding(self, code: Tensor) -> Tensor:
11491149
_, v, h = self.rvq_embs.size()
11501150
device = code.device
11511151

1152-
ret = torch.zeros((b, t, h), device=device)
1152+
ret = torch.zeros((b, t, h), device=device, dtype=self.rvq_embs.dtype)
11531153
embs = F.pad(self.rvq_embs, [0, 0, 0, 1])
11541154
for i in range(d):
11551155
emb = embs[i]
@@ -1203,7 +1203,7 @@ def _prepare_conditioning(
12031203
asr_speech_tokens_emb: Tensor | None,
12041204
) -> Tensor:
12051205
"""Computes the final conditioning tensor by combining all sources."""
1206-
cond = torch.zeros((1, 1, self.hidden_size), device=uncond_dec_flag.device)
1206+
cond = torch.zeros((1, 1, self.hidden_size), device=uncond_dec_flag.device, dtype=self.rvq_embs.dtype)
12071207

12081208
if self.embed_context is not None and context_hidden_state is not None:
12091209
cond = cond + self.embed_context(context_hidden_state)

0 commit comments

Comments (0)