Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ We include x mark if the metric is auto-installed in versa.
| 52 | | WV-MOS (MOS score prediction by fine-tuned wav2vec2.0 model) | wvmos | wvmos | [wvmos](https://github.com/AndreevP/wvmos) | [paper](https://arxiv.org/abs/2203.13086) |
| 53 | |SIG-MOS | sigmos | {SIGMOS_COL, SIGMOS_DISC, SIGMOS_LOUD, SIGMOS_REVERB, SIGMOS_SIG, SIGMOS_OVRL} | [sigmos](https://github.com/microsoft/SIG-Challenge/tree/main/ICASSP2024/sigmos) |[paper](https://arxiv.org/pdf/2309.07385) |
| 54 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) |
| 55 | x | SongEval (A Benchmark Dataset for Song Aesthetics Evaluation) | songeval | songeval | [SongEval](https://github.com/ASLP-lab/SongEval) | [paper](https://arxiv.org/abs/2505.10793) |


### Dependent Metrics
Expand Down
5 changes: 5 additions & 0 deletions egs/separate_metrics/songeval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# SongEval Config
# More info in https://github.com/ASLP-lab/SongEval
- name: songeval


1 change: 1 addition & 0 deletions versa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,4 @@
from versa.utterance_metrics.dpam_distance import dpam_metric, dpam_model_setup
from versa.utterance_metrics.cdpam_distance import cdpam_metric, cdpam_model_setup
from versa.utterance_metrics.vqscore import vqscore_metric, vqscore_setup
from versa.utterance_metrics.songeval import songeval_metric, songeval_model_setup
1 change: 1 addition & 0 deletions versa/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,5 @@
"arecho_wer",
"arecho_cer",
"arecho_nisqa_real_mos",
"songeval",
]
24 changes: 24 additions & 0 deletions versa/scorer_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,23 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
}
logging.info("Initiate pseudo MOS evaluation successfully.")

elif config["name"] == "songeval":
logging.info("Loading SongEval evaluation...")
from versa import songeval_metric, songeval_model_setup

model_dict = songeval_model_setup(
use_gpu=use_gpu
)
score_modules["songeval"] = {
"module": songeval_metric,
"args": {
"model_dict": model_dict,
"fs": 24000,
"use_gpu": use_gpu,
},
}
logging.info("Initiate SongEval evaluation successfully.")

elif config["name"] == "pesq":
if not use_gt:
logging.warning(
Expand Down Expand Up @@ -1101,6 +1118,12 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
score = score_modules[key]["module"](
gen_wav, gen_sr, **score_modules[key]["args"]
)
elif key == "songeval":
score = score_modules[key]["module"](
score_modules[key]["args"]["model_dict"],
gen_wav,
gen_sr,
)
elif key in ["pesq", "stoi", "estoi"]:
score = score_modules[key]["module"](gen_wav, gt_wav, gen_sr)
elif key == "visqol":
Expand Down Expand Up @@ -1208,6 +1231,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
gen_sr,
)


elif "qwen2_audio" in key:
if key == "qwen2_audio":
continue # skip the base model, only use the specific metrics
Expand Down
103 changes: 103 additions & 0 deletions versa/utterance_metrics/songeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
from pathlib import Path
import librosa
import torch
import numpy as np
from muq import MuQ
from omegaconf import OmegaConf
from safetensors.torch import load_file
import requests
import logging
import subprocess
from hydra.utils import instantiate


# Module-level logger. Handler/level configuration is deliberately left to the
# application: a library module must not call logging.basicConfig(), since that
# configures the process-wide root logger as an import side effect.
logger = logging.getLogger(__name__)


def songeval_model_setup(cache_dir="versa_cache", use_gpu=False):
    """Set up the SongEval aesthetics classifier and its MuQ encoder.

    Clones the SongEval repository into ``cache_dir`` on first use; the
    classifier checkpoint (``ckpt/model.safetensors``) and ``config.yaml``
    are expected to ship with the cloned repository.
    NOTE(review): if the checkpoint is stored via git-lfs, a plain clone may
    not fetch the real weights -- confirm against the upstream repo.

    Args:
        cache_dir (str): Directory used to cache the cloned repository.
        use_gpu (bool): Run on CUDA when True and a GPU is available.

    Returns:
        dict: {"model": classifier, "muq": MuQ encoder, "device": device str}

    Raises:
        FileNotFoundError: If the checkpoint or config file is missing.
        subprocess.CalledProcessError: If the git clone fails.
    """
    import sys

    device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"

    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    repo_url = "https://github.com/ASLP-lab/SongEval.git"
    songeval_dir = cache_dir / "SongEval"

    if not songeval_dir.exists():
        logger.info(f"Cloning SongEval repository into {cache_dir}")
        subprocess.run(["git", "clone", repo_url, str(songeval_dir)], check=True)
    else:
        logger.info(f"Using existing SongEval repository in {cache_dir}")

    # Make the cloned repo importable so hydra's `instantiate` can resolve the
    # model class; guard against inserting duplicate entries when setup runs
    # more than once in the same process.
    repo_path = str(songeval_dir)
    if repo_path not in sys.path:
        sys.path.insert(0, repo_path)

    model_path = songeval_dir / "ckpt" / "model.safetensors"
    config_path = songeval_dir / "config.yaml"

    if not model_path.exists():
        raise FileNotFoundError(f"Model file not found in {model_path}")
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found in {config_path}")

    # Build the classifier and load its weights on CPU first, then move to the
    # target device. (No torch.no_grad() needed here: no autograd-tracked ops
    # run during construction or state_dict loading.)
    train_config = OmegaConf.load(config_path)
    model = instantiate(train_config.generator)
    state_dict = load_file(model_path, device="cpu")
    # NOTE(review): strict=False silently tolerates missing/unexpected keys;
    # kept for parity with the upstream loader, but a bad checkpoint will not
    # raise here.
    model.load_state_dict(state_dict, strict=False)
    model = model.to(device).eval()

    # MuQ self-supervised encoder supplies the hidden features the classifier
    # scores.
    muq_model = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter")
    muq_model = muq_model.to(device).eval()

    return {"model": model, "muq": muq_model, "device": device}


def songeval_metric(model_dict, pred, fs):
    """Score a waveform on the five SongEval aesthetic dimensions.

    Args:
        model_dict (dict): Output of ``songeval_model_setup`` with keys
            "model" (classifier), "muq" (encoder) and "device".
        pred (np.ndarray): Mono waveform samples.
        fs (int): Sampling rate of ``pred`` in Hz.

    Returns:
        dict: Scores rounded to 4 decimals for the keys "Coherence",
        "Musicality", "Memorability", "Clarity" and "Naturalness".
    """
    device = model_dict["device"]
    model = model_dict["model"]
    muq_model = model_dict["muq"]

    # MuQ expects 24 kHz input; skip the (lossy, non-trivial) resample when the
    # audio is already at the target rate.
    if fs != 24000:
        pred = librosa.resample(pred, orig_sr=fs, target_sr=24000)

    # Pin float32 so a float64 waveform does not produce a float64 tensor that
    # mismatches the float32 model weights at inference time.
    audio = torch.tensor(pred, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        output = muq_model(audio, output_hidden_states=True)
        # SongEval scores features taken from MuQ hidden layer 6.
        hidden = output["hidden_states"][6]
        scores_g = model(hidden).squeeze(0)

    dimensions = ("Coherence", "Musicality", "Memorability", "Clarity", "Naturalness")
    return {name: round(scores_g[i].item(), 4) for i, name in enumerate(dimensions)}


if __name__ == "__main__":
    # Smoke test: score one second of random noise at the model's native rate.
    test_audio = np.random.rand(24000).astype(np.float32)
    model_dict = songeval_model_setup(use_gpu=True)
    print("metrics:", songeval_metric(model_dict, test_audio, 24000))