Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ We include x mark if the metric is auto-installed in versa.
| 52 | | WV-MOS (MOS score prediction by fine-tuned wav2vec2.0 model) | wvmos | wvmos | [wvmos](https://github.com/AndreevP/wvmos) | [paper](https://arxiv.org/abs/2203.13086) |
| 53 | |SIG-MOS | sigmos | {SIGMOS_COL, SIGMOS_DISC, SIGMOS_LOUD, SIGMOS_REVERB, SIGMOS_SIG, SIGMOS_OVRL} | [sigmos](https://github.com/microsoft/SIG-Challenge/tree/main/ICASSP2024/sigmos) |[paper](https://arxiv.org/pdf/2309.07385) |
| 54 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) |
| 55 | x | SongEval (A Benchmark Dataset for Song Aesthetics Evaluation) | songeval | songeval | [SongEval](https://github.com/ASLP-lab/SongEval) | [paper](https://arxiv.org/abs/2505.10793) |


### Dependent Metrics
Expand Down
5 changes: 5 additions & 0 deletions egs/separate_metrics/songeval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# SongEval Config
# More info in https://github.com/ASLP-lab/SongEval
- name: songeval


1 change: 1 addition & 0 deletions versa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,4 @@
from versa.utterance_metrics.dpam_distance import dpam_metric, dpam_model_setup
from versa.utterance_metrics.cdpam_distance import cdpam_metric, cdpam_model_setup
from versa.utterance_metrics.vqscore import vqscore_metric, vqscore_setup
from versa.utterance_metrics.songeval import songeval_metric, songeval_model_setup
1 change: 1 addition & 0 deletions versa/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,5 @@
"arecho_wer",
"arecho_cer",
"arecho_nisqa_real_mos",
"songeval",
]
24 changes: 24 additions & 0 deletions versa/scorer_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,23 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
}
logging.info("Initiate pseudo MOS evaluation successfully.")

elif config["name"] == "songeval":
logging.info("Loading SongEval evaluation...")
from versa import songeval_metric, songeval_model_setup

model_dict = songeval_model_setup(
use_gpu=use_gpu
)
score_modules["songeval"] = {
"module": songeval_metric,
"args": {
"model_dict": model_dict,
"fs": 24000,
"use_gpu": use_gpu,
},
}
logging.info("Initiate SongEval evaluation successfully.")

elif config["name"] == "pesq":
if not use_gt:
logging.warning(
Expand Down Expand Up @@ -1101,6 +1118,12 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
score = score_modules[key]["module"](
gen_wav, gen_sr, **score_modules[key]["args"]
)
elif key == "songeval":
score = score_modules[key]["module"](
score_modules[key]["args"]["model_dict"],
gen_wav,
gen_sr,
)
elif key in ["pesq", "stoi", "estoi"]:
score = score_modules[key]["module"](gen_wav, gt_wav, gen_sr)
elif key == "visqol":
Expand Down Expand Up @@ -1208,6 +1231,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
gen_sr,
)


elif "qwen2_audio" in key:
if key == "qwen2_audio":
continue # skip the base model, only use the specific metrics
Expand Down
103 changes: 103 additions & 0 deletions versa/utterance_metrics/songeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
from pathlib import Path
import librosa
import torch
import numpy as np
from muq import MuQ
from omegaconf import OmegaConf
from safetensors.torch import load_file
import requests
import logging
import subprocess
from hydra.utils import instantiate


# Module-level logger. Handler/level configuration is deliberately left to the
# application: a library module must not call logging.basicConfig(), since that
# configures the process-wide root logger as an import side effect.
logger = logging.getLogger(__name__)


def songeval_model_setup(cache_dir="versa_cache", use_gpu=False):
    """Set up the SongEval aesthetics classifier and its MuQ encoder.

    Clones the SongEval repository into ``cache_dir`` on first use; the
    classifier checkpoint (``ckpt/model.safetensors``) and ``config.yaml``
    are expected to ship with the cloned repository.
    NOTE(review): if the checkpoint is stored via git-lfs, a plain clone may
    not fetch the real weights -- confirm against the upstream repo.

    Args:
        cache_dir (str): Directory used to cache the cloned repository.
        use_gpu (bool): Run on CUDA when True and a GPU is available.

    Returns:
        dict: {"model": classifier, "muq": MuQ encoder, "device": device str}

    Raises:
        FileNotFoundError: If the checkpoint or config file is missing.
        subprocess.CalledProcessError: If the git clone fails.
    """
    import sys

    device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"

    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    repo_url = "https://github.com/ASLP-lab/SongEval.git"
    songeval_dir = cache_dir / "SongEval"

    if not songeval_dir.exists():
        logger.info(f"Cloning SongEval repository into {cache_dir}")
        subprocess.run(["git", "clone", repo_url, str(songeval_dir)], check=True)
    else:
        logger.info(f"Using existing SongEval repository in {cache_dir}")

    # Make the cloned repo importable so hydra's `instantiate` can resolve the
    # model class; guard against inserting duplicate entries when setup runs
    # more than once in the same process.
    repo_path = str(songeval_dir)
    if repo_path not in sys.path:
        sys.path.insert(0, repo_path)

    model_path = songeval_dir / "ckpt" / "model.safetensors"
    config_path = songeval_dir / "config.yaml"

    if not model_path.exists():
        raise FileNotFoundError(f"Model file not found in {model_path}")
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found in {config_path}")

    # Build the classifier and load its weights on CPU first, then move to the
    # target device. (No torch.no_grad() needed here: no autograd-tracked ops
    # run during construction or state_dict loading.)
    train_config = OmegaConf.load(config_path)
    model = instantiate(train_config.generator)
    state_dict = load_file(model_path, device="cpu")
    # NOTE(review): strict=False silently tolerates missing/unexpected keys;
    # kept for parity with the upstream loader, but a bad checkpoint will not
    # raise here.
    model.load_state_dict(state_dict, strict=False)
    model = model.to(device).eval()

    # MuQ self-supervised encoder supplies the hidden features the classifier
    # scores.
    muq_model = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter")
    muq_model = muq_model.to(device).eval()

    return {"model": model, "muq": muq_model, "device": device}


def songeval_metric(model_dict, pred, fs):
    """Score a waveform on the five SongEval aesthetic dimensions.

    Args:
        model_dict (dict): Output of ``songeval_model_setup`` with keys
            "model" (classifier), "muq" (encoder) and "device".
        pred (np.ndarray): Mono waveform samples.
        fs (int): Sampling rate of ``pred`` in Hz.

    Returns:
        dict: Scores rounded to 4 decimals for the keys "Coherence",
        "Musicality", "Memorability", "Clarity" and "Naturalness".
    """
    device = model_dict["device"]
    model = model_dict["model"]
    muq_model = model_dict["muq"]

    # MuQ expects 24 kHz input; skip the (lossy, non-trivial) resample when the
    # audio is already at the target rate.
    if fs != 24000:
        pred = librosa.resample(pred, orig_sr=fs, target_sr=24000)

    # Pin float32 so a float64 waveform does not produce a float64 tensor that
    # mismatches the float32 model weights at inference time.
    audio = torch.tensor(pred, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        output = muq_model(audio, output_hidden_states=True)
        # SongEval scores features taken from MuQ hidden layer 6.
        hidden = output["hidden_states"][6]
        scores_g = model(hidden).squeeze(0)

    dimensions = ("Coherence", "Musicality", "Memorability", "Clarity", "Naturalness")
    return {name: round(scores_g[i].item(), 4) for i, name in enumerate(dimensions)}


if __name__ == "__main__":
    # Smoke test: score one second of random noise at the model's native rate.
    test_audio = np.random.rand(24000).astype(np.float32)
    model_dict = songeval_model_setup(use_gpu=True)
    print("metrics:", songeval_metric(model_dict, test_audio, 24000))