Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,4 @@ fadtk/
scoreq/
fairseq/
UTMOSv2/
/data
40 changes: 30 additions & 10 deletions run.sh
Original file line number Diff line number Diff line change
@@ -1,38 +1,58 @@
stage=2
stage=0

# download data
if [ $stage -eq 0 ]; then
echo stage $stage: Prepare data

if [ ! -d data/LibriSpeech/test-clean ]; then
mkdir -p data
wget http://www.openslr.org/resources/12/test-clean.tar.gz -P ./data
(cd ./data && tar -xvzf test-clean.tar.gz)
# librispeech
if [ ! -d data//LibriSpeech/test-clean ]; then
mkdir -p data/
wget http://www.openslr.org/resources/12/test-clean.tar.gz -P data/
(cd data/ && tar -xvzf test-clean.tar.gz)
rm data/test-clean.tar.gz
fi

if [ ! -d data/LibriSpeech/test-clean/prepared ]; then
python scripts/prepare_librispeech-test-clean.py --root_dir data/LibriSpeech/test-clean
if [ ! -d data//LibriSpeech/test-clean/prepared ]; then
python scripts/prepare_librispeech-test-clean.py --root_dir data//LibriSpeech/test-clean
fi

# musdb
if [ ! -d data/musdb/test ]; then
wget https://zenodo.org/records/3338373/files/musdb18hq.zip -P data/
(cd data/ && unzip musdb18hq.zip -d ./musdb)
rm data/musdb18hq.zip
fi

if [ ! -d data/musdb/prepared ]; then
python scripts/prepare_musdb.py --main_directory data/musdb/ --output_dir data/musdb/prepared --chunk_length 5.0
fi

# audioset
if [ ! -d data/audioset ]; then
python scripts/prepare_audioset-test.py --output_dir data/audioset
fi

fi

# Evaluation
pred_path=data/LibriSpeech/test-clean/prepared/ori.scp
gt_path=data/LibriSpeech/test-clean/prepared/ori.scp
tag=musdb_encodec_24k_12bps
eval_sr=24000
if [ $stage -eq 1 ]; then
result_path="test_result"
result_path="test_result_${tag}"

echo stage $stage: Evaluation
if test -f ${result_path}; then
echo ${result_path} exists
else
python versa/bin/scorer.py \
--score_config egs/speech.yaml \
--score_config egs/general.yaml \
--use_gpu True \
--gt ${gt_path} \
--pred ${pred_path} \
--output_file ${result_path}
--output_file ${result_path} \
--eval_sr ${eval_sr} \ # change in versa necessary!
fi

python scripts/average_result.py --file_path ${result_path} >> ${result_path}
Expand Down
83 changes: 83 additions & 0 deletions scripts/prepare_audioset-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from huggingface_hub import HfApi
import os, argparse
import tarfile

# Initialize the Hugging Face API
api = HfApi()

def main():
    """Download the AudioSet *eval* split archives from Hugging Face and extract them.

    Lists the ``agkphysics/AudioSet`` dataset repo, downloads every ``.tar``
    shard under ``data/`` whose name contains ``eval``, extracts the audio
    files into ``<output_dir>/audio_files``, and removes each archive after a
    successful extraction.  Already-downloaded archives are not re-fetched.
    """
    parser = argparse.ArgumentParser(
        description="Download and extract the AudioSet eval split."
    )
    parser.add_argument(
        '--output_dir', type=str, required=True,
        help="Where AudioSet files will be saved."
    )
    args = parser.parse_args()

    # Hoisted out of the download loop (the original re-imported it per file).
    import requests

    # Define the repository details
    repo_id = "agkphysics/AudioSet"  # Dataset repository
    repo_path = "data"
    local_save_dir = args.output_dir
    audio_dump = os.path.join(local_save_dir, "audio_files")

    os.makedirs(local_save_dir, exist_ok=True)

    # repo_type="dataset" is required: the API defaults to model repos.
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")

    # Keep only the eval-split tar shards under data/.
    files_to_download = [
        file for file in repo_files
        if file.startswith(repo_path) and file.endswith(".tar") and "eval" in file
    ]

    print(f"Files to download: {files_to_download}")

    # Base URL for resolving raw dataset files.
    base_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/"

    for file_path in files_to_download:
        file_url = base_url + file_path
        local_file_path = os.path.join(local_save_dir, os.path.basename(file_path))
        if os.path.exists(local_file_path):
            print(f"File {local_file_path} already exists, skipping download.")
        else:
            print(f"Downloading {file_url} to {local_file_path}...")
            response = requests.get(file_url, stream=True)
            if response.status_code != 200:
                # BUG FIX: the original fell through here and tried to extract
                # (and delete) a file that was never written.  Skip this shard.
                print(f"Failed to download {file_url}, status code: {response.status_code}")
                continue
            with open(local_file_path, "wb") as f:
                # 1 MiB chunks; the original 1 KiB chunks make large shards slow.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)

        # Extract the .tar file (regular files only, directories are skipped).
        # NOTE(review): extractall on a downloaded archive is vulnerable to
        # path traversal for untrusted sources; consider filter="data" once
        # Python >= 3.12 is guaranteed.
        print(f"Extracting {local_file_path} to {audio_dump}...")
        try:
            with tarfile.open(local_file_path, "r") as tar:
                tar.extractall(
                    path=audio_dump,
                    members=[
                        member for member in tar.getmembers()
                        if member.isfile()
                    ],
                )
        except Exception as e:
            # BUG FIX: on extraction failure the original still deleted the
            # archive; keep it so the run can be retried.
            print(f"Error extracting {local_file_path}: {e}")
            continue

        # Delete the .tar file only after a successful extraction.
        print(f"Deleting {local_file_path}...")
        try:
            os.remove(local_file_path)
        except OSError as e:
            print(f"Error deleting {local_file_path}: {e}")

    print("All files downloaded and extracted.")

if __name__ == "__main__":
    main()
65 changes: 65 additions & 0 deletions scripts/prepare_musdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import argparse
import soundfile as sf
import numpy as np
from glob import glob
from tqdm import tqdm
import librosa

def split_audio(file_path, chunk_length, output_dir, parent_folder):
    """Split one audio file into non-overlapping chunks and save them as WAVs.

    Args:
        file_path: Path to the source audio file.
        chunk_length: Chunk duration in seconds; the final chunk may be shorter.
        output_dir: Directory the chunk files are written into.
        parent_folder: Filename prefix for the chunks (typically the track name).
    """
    print(f"Processing: {file_path}")
    # BUG FIX: pass sr=None to keep the native sample rate — librosa.load's
    # default would silently resample every file to 22050 Hz.
    waveform, sample_rate = librosa.load(file_path, sr=None, mono=True)
    total_length = len(waveform) / sample_rate  # Total length in seconds

    # Number of chunks, rounding up so the trailing remainder is kept.
    num_chunks = int(np.ceil(total_length / chunk_length))

    # Split and save each chunk
    for i in range(num_chunks):
        start_time = i * chunk_length
        end_time = min((i + 1) * chunk_length, total_length)

        # Sample indices for this chunk.
        start_sample = int(start_time * sample_rate)
        end_sample = int(end_time * sample_rate)

        chunk_waveform = waveform[start_sample:end_sample]

        # Filename format kept identical to the original
        # (float start/end times embedded in the name).
        output_filename = f"{parent_folder}_{start_time}_{end_time}.wav"
        output_file_path = os.path.join(output_dir, output_filename)

        sf.write(output_file_path, chunk_waveform, sample_rate)
    print(f"Saved chunks for : {parent_folder}")

def process_directory(main_directory, chunk_length, output_dir):
    """Find every 'mixture.wav' under main_directory and split it into chunks.

    The chunk filename prefix is the mixture's parent directory name with
    spaces replaced by underscores (the MUSDB track title).
    """
    pattern = os.path.join(main_directory, '**/mixture.wav')
    for mix_wav_path in tqdm(glob(pattern, recursive=True)):
        # BUG FIX: use os.path instead of splitting on '/', which breaks on
        # Windows path separators.
        parent_folder = os.path.basename(
            os.path.dirname(mix_wav_path)
        ).replace(' ', '_')
        split_audio(mix_wav_path, chunk_length, output_dir, parent_folder)

def main():
    """CLI entry point: split every MUSDB 'mixture.wav' into fixed-length chunks.

    Required arguments: --main_directory and --output_dir.  --chunk_length
    defaults to 5.0 seconds (the value used by run.sh).
    """
    parser = argparse.ArgumentParser(description="Split 'mix.wav' files into chunks.")
    # required=True makes misuse fail with a clean argparse error instead of a
    # later TypeError on a None path.
    parser.add_argument('--main_directory', type=str, required=True,
                        help="Path to the main directory containing subfolders.")
    parser.add_argument('--output_dir', type=str, required=True,
                        help="Directory where the chunks will be saved.")
    parser.add_argument('--chunk_length', type=float, default=5.0,
                        help="Length of each chunk in seconds (default: 5.0).")

    args = parser.parse_args()

    # Ensure output directory exists (idempotent).
    os.makedirs(args.output_dir, exist_ok=True)

    process_directory(args.main_directory, args.chunk_length, args.output_dir)

if __name__ == "__main__":
    main()
7 changes: 6 additions & 1 deletion versa/bin/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
def get_parser() -> argparse.Namespace:
"""Get argument parser."""
parser = argparse.ArgumentParser(description="Speech Evaluation Interface")
parser.add_argument(
"--eval_sr",
type=int,
help="All wfs wil lbe resampeld to eval_sr prior to eval.",
)
parser.add_argument(
"--pred",
type=str,
Expand Down Expand Up @@ -140,14 +145,14 @@ def main():
)

assert len(score_config) > 0, "no scoring function is provided"

score_info = list_scoring(
gen_files,
score_modules,
gt_files,
text_info,
output_file=args.output_file,
io=args.io,
eval_sr=args.eval_sr
)
logging.info("Summary: {}".format(load_summary(score_info)))

Expand Down
57 changes: 8 additions & 49 deletions versa/scorer_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def audio_loader_setup(audio, io):
audio_files = kaldiio.load_scp(audio)
elif io == "dir":
audio_files = find_files(audio)
elif io == "soundfile":
else:
audio_files = {}
with open(audio) as f:
for line in f.readlines():
Expand Down Expand Up @@ -362,40 +362,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
),
}
logging.info("Initiate Whisper WER calculation successfully")

elif config["name"] == "scoreq_ref":
if not use_gt:
logging.warning("Cannot use scoreq_ref because no gt audio is provided")
continue

logging.info("Loadding scoreq metrics with reference")
from versa import scoreq_ref_setup, scoreq_ref
model = scoreq_ref_setup(
data_domain=config.get("data_domain", "synthetic"),
cache_dir=config.get("model_cache", "./scoreq_pt-models"),
use_gpu=use_gpu,
)

score_modules["scoreq_ref"] = {
"module": scoreq_ref,
"model": model,
}
logging.info("Initiate scoreq (with reference) successfully")

elif config["name"] == "scoreq_nr":
logging.info("Loadding scoreq metrics without reference")
from versa import scoreq_nr_setup, scoreq_nr
model = scoreq_nr_setup(
data_domain=config.get("data_domain", "synthetic"),
cache_dir=config.get("model_cache", "./scoreq_pt-models"),
use_gpu=use_gpu,
)

score_modules["scoreq_nr"] = {
"module": scoreq_nr,
"model": model,
}
logging.info("Initiate scoreq (with reference) successfully")

return score_modules

Expand Down Expand Up @@ -451,14 +417,6 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
text,
gen_sr,
)
elif key == "scoreq_ref":
score = score_modules[key]["module"](
score_modules[key]["model"],
gen_wav, gt_wav, gen_sr)
elif key == "scoreq_nr":
score = score_modules[key]["module"](
score_modules[key]["model"],
gen_wav, gen_sr)
else:
raise NotImplementedError(f"Not supported {key}")

Expand All @@ -474,6 +432,7 @@ def list_scoring(
text_info=None,
output_file=None,
io="kaldi",
eval_sr=16_000
):
if output_file is not None:
f = open(output_file, "w", encoding="utf-8")
Expand Down Expand Up @@ -531,16 +490,16 @@ def list_scoring(
else:
text = None

if gt_sr is not None and gen_sr > gt_sr:
if gt_sr != eval_sr:
logging.warning(
"Resampling the generated audio to match the ground truth audio"
"Resampling the ground truth audio to match the eval sr"
)
gen_wav = librosa.resample(gen_wav, orig_sr=gen_sr, target_sr=gt_sr)
elif gt_sr is not None and gen_sr < gt_sr:
gt_wav = librosa.resample(gt_wav, orig_sr=gt_sr, target_sr=eval_sr)
if gen_sr != eval_sr:
logging.warning(
"Resampling the ground truth audio to match the generated audio"
"Resampling the generated audio to match the eval sr"
)
gt_wav = librosa.resample(gt_wav, orig_sr=gt_sr, target_sr=gen_sr)
gen_wav = librosa.resample(gen_wav, orig_sr=gen_sr, target_sr=eval_sr)

utt_score = {"key": key}

Expand Down