From 58d2e70cfc0d1e507492d3860d96d21bb0ebd6c1 Mon Sep 17 00:00:00 2001
From: dujunling <dujunling@bytedance.com>
Date: Mon, 24 Nov 2025 16:15:08 +0800
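Subject: [PATCH 1/3] feat: support GPU inference and invoking speechscore from code

Run the DISTILL_MOS model on CUDA when it is available, and let the DNSMOS
ONNX sessions prefer CUDAExecutionProvider with CPUExecutionProvider as a
fallback. Resolve the DISTILL_MOS, DNSMOS and NISQA weight paths relative to
each module's own file instead of the current working directory, so the
scorers can be loaded when speechscore is invoked from other code.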

---
 speechscore/scores/distill_mos/distill_mos.py |  5 +++--
 speechscore/scores/distill_mos/sqa.py         |  2 +-
 speechscore/scores/dnsmos/dnsmos.py           | 10 ++++++----
 speechscore/scores/nisqa/nisqa.py             |  3 ++-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/speechscore/scores/distill_mos/distill_mos.py b/speechscore/scores/distill_mos/distill_mos.py
index a47e7762..a43ffe09 100644
--- a/speechscore/scores/distill_mos/distill_mos.py
+++ b/speechscore/scores/distill_mos/distill_mos.py
@@ -8,11 +8,12 @@ def __init__(self):
         super(DISTILL_MOS, self).__init__(name='DISTILL_MOS')
         self.intrusive = False
         self.score_rate = 16000
-        self.model = ConvTransformerSQAModel()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = ConvTransformerSQAModel().to(self.device)
         self.model.eval()
 
     def windowed_scoring(self, audios, score_rate):
     	
-        score = self.model(torch.from_numpy(np.expand_dims(audios[0], axis=0)).float())
+        score = self.model(torch.from_numpy(np.expand_dims(audios[0], axis=0)).float().to(self.device))
         score_np = score.detach().cpu().numpy()
         return score_np[0][0]
\ No newline at end of file
diff --git a/speechscore/scores/distill_mos/sqa.py b/speechscore/scores/distill_mos/sqa.py
index ce4aa6cc..4fb8a0a5 100644
--- a/speechscore/scores/distill_mos/sqa.py
+++ b/speechscore/scores/distill_mos/sqa.py
@@ -16,7 +16,7 @@
 SEQ_LEN = 122880
 MAX_HOP_LEN = 16000
 
-DEFAULT_WEIGHTS_CHKPT = os.path.join("scores/distill_mos/weights", "distill_mos_v7.pt")
+DEFAULT_WEIGHTS_CHKPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "weights", "distill_mos_v7.pt")
 
 
 def _complex_compressed(x, hop_length, win_length):
diff --git a/speechscore/scores/dnsmos/dnsmos.py b/speechscore/scores/dnsmos/dnsmos.py
index 358bdb7e..55098c47 100644
--- a/speechscore/scores/dnsmos/dnsmos.py
+++ b/speechscore/scores/dnsmos/dnsmos.py
@@ -11,14 +11,15 @@
 
 from basis import ScoreBasis
 
+module_dir = os.path.dirname(os.path.abspath(__file__))
 
 class DNSMOS(ScoreBasis):
     def __init__(self):
         super(DNSMOS, self).__init__(name='DNSMOS')
         self.intrusive = True
         self.score_rate = 16000
-        self.p808_model_path = os.path.join('scores/dnsmos/DNSMOS', 'model_v8.onnx')    
-        self.primary_model_path = os.path.join('scores/dnsmos/DNSMOS', 'sig_bak_ovr.onnx')
+        self.p808_model_path = os.path.join(module_dir, 'DNSMOS', 'model_v8.onnx')
+        self.primary_model_path = os.path.join(module_dir, 'DNSMOS', 'sig_bak_ovr.onnx')
         self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)
 
     def windowed_scoring(self, audios, rate):
@@ -26,8 +27,9 @@ def windowed_scoring(self, audios, rate):
 
 class ComputeScore:
     def __init__(self, primary_model_path, p808_model_path) -> None:
-        self.onnx_sess = ort.InferenceSession(primary_model_path)
-        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)
+        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.onnx_sess = ort.InferenceSession(primary_model_path, providers=providers)
+        self.p808_onnx_sess = ort.InferenceSession(p808_model_path, providers=providers)
         
     def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
         mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels)
diff --git a/speechscore/scores/nisqa/nisqa.py b/speechscore/scores/nisqa/nisqa.py
index 4c5b619a..a2c73470 100644
--- a/speechscore/scores/nisqa/nisqa.py
+++ b/speechscore/scores/nisqa/nisqa.py
@@ -1,12 +1,13 @@
 from basis import ScoreBasis
 from scores.nisqa.cal_nisqa import load_nisqa_model
+import os
 
 class NISQA(ScoreBasis):
     def __init__(self):
         super(NISQA, self).__init__(name='NISQA')
         self.intrusive = False
         self.score_rate = 48000
-        self.model = load_nisqa_model("scores/nisqa/weights/nisqa.tar", device='cpu')
+        self.model = load_nisqa_model(os.path.join(os.path.dirname(os.path.abspath(__file__)), "weights/nisqa.tar"), device='cpu')
  
     def windowed_scoring(self, audios, score_rate):
         from scores.nisqa.cal_nisqa import cal_NISQA

From 8f2ae3e279685699afc5ea547650334a0b73e794 Mon Sep 17 00:00:00 2001
From: dujunling <dujunling@bytedance.com>
Date: Mon, 24 Nov 2025 18:04:10 +0800
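Subject: [PATCH 2/3] feat: load the NISQA model on GPU when available

Select CUDA for the NISQA model when it is available instead of always
loading it on the CPU, and wrap the long model call in DISTILL_MOS.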

---
 speechscore/scores/distill_mos/distill_mos.py | 3 ++-
 speechscore/scores/nisqa/nisqa.py             | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/speechscore/scores/distill_mos/distill_mos.py b/speechscore/scores/distill_mos/distill_mos.py
index a43ffe09..bacabdba 100644
--- a/speechscore/scores/distill_mos/distill_mos.py
+++ b/speechscore/scores/distill_mos/distill_mos.py
@@ -14,6 +14,7 @@ def __init__(self):
 
     def windowed_scoring(self, audios, score_rate):
     	
-        score = self.model(torch.from_numpy(np.expand_dims(audios[0], axis=0)).float().to(self.device))
+        score = self.model(
+            torch.from_numpy(np.expand_dims(audios[0], axis=0)).float().to(self.device))
         score_np = score.detach().cpu().numpy()
         return score_np[0][0]
\ No newline at end of file
diff --git a/speechscore/scores/nisqa/nisqa.py b/speechscore/scores/nisqa/nisqa.py
index a2c73470..0d9a1184 100644
--- a/speechscore/scores/nisqa/nisqa.py
+++ b/speechscore/scores/nisqa/nisqa.py
@@ -1,13 +1,14 @@
 from basis import ScoreBasis
 from scores.nisqa.cal_nisqa import load_nisqa_model
 import os
-
+import torch
 class NISQA(ScoreBasis):
     def __init__(self):
         super(NISQA, self).__init__(name='NISQA')
         self.intrusive = False
         self.score_rate = 48000
-        self.model = load_nisqa_model(os.path.join(os.path.dirname(os.path.abspath(__file__)), "weights/nisqa.tar"), device='cpu')
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = load_nisqa_model(os.path.join(os.path.dirname(os.path.abspath(__file__)), "weights/nisqa.tar"), device=device)
  
     def windowed_scoring(self, audios, score_rate):
         from scores.nisqa.cal_nisqa import cal_NISQA

From c88b2314cb99aae097fd7e9ac52dfd72f3cfe65b Mon Sep 17 00:00:00 2001
From: dujunling <dujunling@bytedance.com>
Date: Tue, 25 Nov 2025 11:56:41 +0800
Subject: [PATCH 3/3] feat: Support scoring for the trailing incomplete audio
 window

---
 speechscore/basis.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/speechscore/basis.py b/speechscore/basis.py
index 87ffe41c..90a55c4f 100644
--- a/speechscore/basis.py
+++ b/speechscore/basis.py
@@ -33,12 +33,17 @@ def scoring(self, data, window=None, score_rate=None):
                 audios[index] = audio
 
         if window is not None:
+            maxlen = len(audios[0])
             framer = Framing(window * score_rate, window * score_rate, maxlen)
             nwin = framer.nwin
             result = {}
             for (t, win) in enumerate(framer):
                 result_t = self.windowed_scoring([audio[win] for audio in audios], score_rate)
                 result[t] = result_t
+            if win and maxlen > win.stop:
+                last_win = slice(win.stop, maxlen)
+                result_t = self.windowed_scoring([audio[last_win] for audio in audios], score_rate) 
+                result[nwin] = result_t
         else:
             result = self.windowed_scoring(audios, score_rate)
         return result