diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml index 5cfc5dd..e944c12 100644 --- a/.github/workflows/pypi_publish.yml +++ b/.github/workflows/pypi_publish.yml @@ -26,7 +26,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel env: - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*" CIBW_ARCHS: "x86_64" CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_BEFORE_BUILD: | diff --git a/.github/workflows/test_pypi_publish.yml b/.github/workflows/test_pypi_publish.yml index 8328b29..cbdf340 100644 --- a/.github/workflows/test_pypi_publish.yml +++ b/.github/workflows/test_pypi_publish.yml @@ -28,7 +28,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel env: - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*" CIBW_ARCHS: "x86_64" CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_BEFORE_BUILD: | diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 6c9bc9a..ff04d20 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: platform: [ubuntu-latest, macos-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] runs-on: ${{ matrix.platform }} steps: diff --git a/README.md b/README.md index 4750e38..d5d9706 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The full documentation for matchmaker is available online at [readthedocs.org](h ### Prerequisites -- Available Python version: 3.12 +- Available Python version: 3.11, 3.12, 3.13 - [Fluidsynth](https://www.fluidsynth.org/) - [PortAudio](http://www.portaudio.com/) diff --git a/matchmaker/dp/oltw_arzt.py b/matchmaker/dp/oltw_arzt.py index a6749cf..9f7e657 100644 --- a/matchmaker/dp/oltw_arzt.py +++ b/matchmaker/dp/oltw_arzt.py @@ -13,7 +13,8 @@ from matchmaker.base import OnlineAlignment from matchmaker.dp.dtw_loop import oltw_arzt_loop -from 
matchmaker.features.audio import FRAME_RATE, QUEUE_TIMEOUT +from matchmaker.features.audio import FRAME_RATE +from matchmaker.io.audio import QUEUE_TIMEOUT from matchmaker.utils import ( CYTHONIZED_METRICS_W_ARGUMENTS, CYTHONIZED_METRICS_WO_ARGUMENTS, @@ -26,10 +27,11 @@ RECVQueue, set_latency_stats, ) +from matchmaker.utils.stream import STREAM_END -STEP_SIZE: int = 5 -WINDOW_SIZE: int = 5 -START_WINDOW_SIZE: Union[float, int] = 0.25 +STEP_SIZE: int = 3 +WINDOW_SIZE: int = 10 +START_WINDOW_SIZE: Union[float, int] = 0.1 class OnlineTimeWarpingArzt(OnlineAlignment): @@ -98,9 +100,9 @@ def __init__( current_position: int = 0, frame_rate: int = FRAME_RATE, queue: Optional[RECVQueue] = None, - state_to_ref_time_map = None, - ref_to_state_time_map = None, - state_space = None, + state_to_ref_time_map=None, + ref_to_state_time_map=None, + state_space=None, **kwargs, ) -> None: super().__init__(reference_features=reference_features) @@ -154,7 +156,7 @@ def __init__( self.N_ref: int = self.reference_features.shape[0] self.frame_rate = frame_rate - self.window_size: int = window_size * self.frame_rate + self.window_size: int = int(np.round(window_size * self.frame_rate)) self.step_size: int = step_size self.start_window_size: int = int(np.round(start_window_size * frame_rate)) self.init_position: int = current_position @@ -178,12 +180,22 @@ def __init__( } self.state_to_ref_time_map = state_to_ref_time_map self.ref_to_state_time_map = ref_to_state_time_map - self.state_space = state_space #if state_space != None else np.unique(self.reference_features.note_array()["onset_beat"]) + self.state_space = state_space + self._ref_frame_to_beat: Optional[NDArray[np.float32]] = kwargs.get( + "ref_frame_to_beat", None + ) + + @property + def current_beat(self) -> float: + """Current score position in beats.""" + if self._ref_frame_to_beat is not None: + idx = min(self.current_position, len(self._ref_frame_to_beat) - 1) + return float(self._ref_frame_to_beat[idx]) + return 
float(self.current_position) @property - def warping_path(self) -> NDArray[np.int32]: - wp = (np.array(self._warping_path).T).astype(np.int32) - return wp + def warping_path(self) -> NDArray[np.float32]: + return np.array(self._warping_path).T def __call__(self, input: NDArray[np.float32]) -> int: self.step(input) @@ -211,10 +223,18 @@ def run(self, verbose: bool = True) -> Generator[int, None, NDArray[np.float32]] self.reset() if verbose: - pbar = progressbar.ProgressBar(max_value=self.N_ref, redirect_stdout=True) + pbar = progressbar.ProgressBar( + max_value=len(self.state_space), + redirect_stdout=True, + redirect_stderr=True, + ) + pbar.start() while self.is_still_following(): - features, f_time = self.queue.get(timeout=QUEUE_TIMEOUT) + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + features, f_time = item self.last_queue_update = time.time() self.input_features = ( np.concatenate((self.input_features, features)) @@ -224,13 +244,13 @@ def run(self, verbose: bool = True) -> Generator[int, None, NDArray[np.float32]] self.step(features) if verbose: - pbar.update(int(self.current_position)) + pbar.update(int(np.searchsorted(self.state_space, self.current_beat))) latency = time.time() - self.last_queue_update self.latency_stats = set_latency_stats( latency, self.latency_stats, self.input_index ) - yield self.current_position + yield self.current_beat if verbose: pbar.finish() @@ -289,23 +309,17 @@ def step(self, input_features: NDArray[np.float32]) -> None: min_index=min_index, ) - # adapt current_position: do not go backwards, - # but also go a maximum of N steps forward - - if self.input_index == 0: - # enforce the first time step to stay at the - # initial position - self.current_position = min( # TODO: Is this necessary? 
- max(self.current_position, min_index), - self.current_position, - ) - else: - self.current_position = min( - max(self.current_position, min_index), - self.current_position + self.step_size, + # Clamp new position: no backwards, max step_size forward per frame + if self.input_index > 0: + self.current_position = int( + np.clip( + min_index, + self.current_position, + self.current_position + self.step_size, + ) ) - self._warping_path.append((self.current_position, self.input_index)) + self._warping_path.append((self.current_beat, self.input_index)) # update input index self.input_index += 1 diff --git a/matchmaker/dp/oltw_dixon.py b/matchmaker/dp/oltw_dixon.py index 8c55a05..11e46eb 100644 --- a/matchmaker/dp/oltw_dixon.py +++ b/matchmaker/dp/oltw_dixon.py @@ -14,8 +14,10 @@ from numpy.typing import NDArray from matchmaker.base import OnlineAlignment -from matchmaker.features.audio import FRAME_RATE, QUEUE_TIMEOUT +from matchmaker.features.audio import FRAME_RATE +from matchmaker.io.audio import QUEUE_TIMEOUT from matchmaker.utils.misc import set_latency_stats +from matchmaker.utils.stream import STREAM_END class Direction(IntEnum): @@ -74,9 +76,9 @@ def __init__( max_run_count=MAX_RUN_COUNT, frame_per_seg=FRAME_PER_SEG, frame_rate=FRAME_RATE, - state_to_ref_time_map = None, - ref_to_state_time_map = None, - state_space = None, + state_to_ref_time_map=None, + ref_to_state_time_map=None, + state_space=None, **kwargs, ): super().__init__(reference_features=reference_features) @@ -90,6 +92,7 @@ def __init__( self.state_to_ref_time_map = state_to_ref_time_map self.ref_to_state_time_map = ref_to_state_time_map self.state_space = state_space + self._ref_frame_to_beat = kwargs.get("ref_frame_to_beat", None) self.reset() def reset(self): @@ -114,6 +117,14 @@ def reset(self): } self._initialized = False + @property + def current_beat(self) -> float: + """Current score position in beats.""" + if self._ref_frame_to_beat is not None: + idx = min(self.best_ref, 
len(self._ref_frame_to_beat) - 1) + return float(self._ref_frame_to_beat[idx]) + return float(self.best_ref) + @property def warping_path(self) -> NDArray[np.float32]: # [shape=(2, T)] return self.wp @@ -276,8 +287,8 @@ def get_expand_direction(self): return Direction.REF def save_history(self): - """Append current best alignment point to warping path.""" - new_point = np.array([[self.best_ref], [self.best_input]]) + """Append current best alignment point to warping path (beats, input_frame).""" + new_point = np.array([[self.current_beat], [self.best_input]]) self.wp = np.concatenate((self.wp, new_point), axis=1) def __call__(self, input_features: NDArray[np.float32]) -> int: @@ -363,21 +374,29 @@ def run(self, verbose=True): self.reset() if verbose: - pbar = progressbar.ProgressBar(max_value=self.N_ref, redirect_stdout=True) + pbar = progressbar.ProgressBar( + max_value=len(self.state_space), + redirect_stdout=True, + redirect_stderr=True, + ) + pbar.start() while self.is_still_following(): - input_feature, f_time = self.queue.get(timeout=QUEUE_TIMEOUT) + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + input_feature, f_time = item self.last_queue_update = time.time() self.step(input_feature) if verbose: - pbar.update(int(self.current_position)) + pbar.update(int(np.searchsorted(self.state_space, self.current_beat))) latency = time.time() - self.last_queue_update self.latency_stats = set_latency_stats( latency, self.latency_stats, self.input_index ) - yield self.current_position + yield self.current_beat if verbose: pbar.finish() diff --git a/matchmaker/features/audio.py b/matchmaker/features/audio.py index 3a9c27b..f68641d 100644 --- a/matchmaker/features/audio.py +++ b/matchmaker/features/audio.py @@ -20,7 +20,6 @@ DCT_TYPE = 2 NORM = np.inf FEATURES = "chroma" -QUEUE_TIMEOUT = 1 # Type hint for Input Audio frame. 
InputAudioSeries = np.ndarray diff --git a/matchmaker/io/audio.py b/matchmaker/io/audio.py index 043ecf9..ee04156 100644 --- a/matchmaker/io/audio.py +++ b/matchmaker/io/audio.py @@ -12,16 +12,21 @@ import numpy as np import pyaudio -from matchmaker.features.audio import HOP_LENGTH, SAMPLE_RATE, ChromagramProcessor +from matchmaker.features.audio import ( + HOP_LENGTH, + SAMPLE_RATE, + ChromagramProcessor, +) from matchmaker.utils.audio import ( get_audio_devices, get_default_input_device_index, get_device_index_from_name, ) from matchmaker.utils.misc import RECVQueue, set_latency_stats -from matchmaker.utils.stream import Stream +from matchmaker.utils.stream import STREAM_END, Stream CHANNELS = 1 +QUEUE_TIMEOUT = 10 class AudioStream(Stream): @@ -53,7 +58,7 @@ def __init__( hop_length: int = HOP_LENGTH, queue: Optional[RECVQueue] = None, device_name_or_index: Optional[Union[str, int]] = None, - wait: bool = True, + wait: bool = False, target_sr: int = SAMPLE_RATE, ): if processor is None: @@ -123,9 +128,14 @@ def __init__( "min_latency": float("inf"), } self.input_index = 0 + self._preloaded_audio = None if self.mock: self.run = self.run_offline + # Pre-load and resample audio so the stream thread can start + # producing frames immediately (avoids queue-timeout race condition + # when librosa.load takes longer than QUEUE_TIMEOUT). 
+ self._preload_audio() else: self.run = self.run_online @@ -159,6 +169,8 @@ def _process_frame( # initial y target_audio = np.frombuffer(data, dtype=np.float32) self._process_feature(target_audio, time_info["input_buffer_adc_time"]) + if not self.stream_start.is_set(): + self.stream_start.set() return (data, pyaudio.paContinue) @@ -225,6 +237,13 @@ def stop_listening(self) -> None: self.audio_interface.terminate() self.listen = False + def _preload_audio(self) -> None: + """Pre-load and resample audio file so run_offline can start immediately.""" + audio_y, sr = librosa.load(self.file_path, sr=None) + if sr != self.target_sr: + audio_y = librosa.resample(y=audio_y, orig_sr=sr, target_sr=self.target_sr) + self._preloaded_audio = audio_y + def run_offline(self) -> None: """Process audio file in offline mode. @@ -240,12 +259,17 @@ def run_offline(self) -> None: self.start_listening() self.init_time = time.time() - audio_y, sr = librosa.load(self.file_path, sr=None) - if sr != self.target_sr: - audio_y = librosa.resample(y=audio_y, orig_sr=sr, target_sr=self.target_sr) - sr = self.target_sr + if self._preloaded_audio is not None: + audio_y = self._preloaded_audio + self._preloaded_audio = None # free memory + else: + audio_y, sr = librosa.load(self.file_path, sr=None) + if sr != self.target_sr: + audio_y = librosa.resample( + y=audio_y, orig_sr=sr, target_sr=self.target_sr + ) + sr = self.target_sr - time_interval = self.hop_length / float(sr) # Pad to next hop_length boundary so no trailing samples are lost remainder = len(audio_y) % self.hop_length if remainder > 0: @@ -253,6 +277,7 @@ def run_offline(self) -> None: (audio_y, np.zeros(self.hop_length - remainder, dtype=np.float32)) ) trimmed_audio = audio_y + time_interval = self.hop_length / float(sr) # Do not stop early on digital silence (all-zeros tails). 
while trimmed_audio.size > 0: self.input_index += 1 @@ -260,11 +285,15 @@ def run_offline(self) -> None: target_audio = trimmed_audio[: self.hop_length] self._process_feature(target_audio, self.last_data_received) trimmed_audio = trimmed_audio[self.hop_length :] - elapsed_time = time.time() - self.last_data_received + + if not self.stream_start.is_set(): + self.stream_start.set() if self.wait: + elapsed_time = time.time() - self.last_data_received time.sleep(max(time_interval - elapsed_time, 0)) + self.queue.put(STREAM_END) self.stop_listening() def run_online(self) -> None: diff --git a/matchmaker/io/midi.py b/matchmaker/io/midi.py index cfdca59..489fec8 100644 --- a/matchmaker/io/midi.py +++ b/matchmaker/io/midi.py @@ -139,6 +139,8 @@ def _process_frame_message( self.queue.put(((data, c_time), output)) else: self.queue.put(output) + if not self.stream_start.is_set(): + self.stream_start.set() def _process_frame_window( self, diff --git a/matchmaker/matchmaker.py b/matchmaker/matchmaker.py index 776e081..8f772f8 100644 --- a/matchmaker/matchmaker.py +++ b/matchmaker/matchmaker.py @@ -1,13 +1,15 @@ import os import sys +import time from pathlib import Path from typing import Optional, Union import numpy as np import partitura +import scipy.interpolate from partitura.io.exportmidi import get_ppq -from partitura.score import Part, merge_parts from partitura.musicanalysis.performance_codec import get_time_maps_from_alignment +from partitura.score import Part, merge_parts from matchmaker.dp import OnlineTimeWarpingArzt, OnlineTimeWarpingDixon from matchmaker.features.audio import ( @@ -39,8 +41,7 @@ TOLERANCES_IN_BEATS, TOLERANCES_IN_MILLISECONDS, get_evaluation_results, - transfer_from_perf_to_predicted_score, - transfer_from_score_to_predicted_perf, + transfer_positions, ) from matchmaker.utils.misc import ( adjust_tempo_for_performance_file, @@ -52,39 +53,31 @@ ) from matchmaker.utils.tempo_models import KalmanTempoModel +PathLike = Union[str, bytes, os.PathLike] 
sys.setrecursionlimit(10_000) -PathLike = Union[str, bytes, os.PathLike] DEFAULT_TEMPO = 120 - - -DEFAULT_DISTANCE_FUNCS = { - "arzt": OnlineTimeWarpingArzt.DEFAULT_DISTANCE_FUNC, - "dixon": OnlineTimeWarpingDixon.DEFAULT_DISTANCE_FUNC, - "hmm": None, - "outerhmm": None, - "audio_outerhmm": None, - "pthmm": None, -} - DEFAULT_METHODS = { "audio": "arzt", "midi": "outerhmm", } - AVAILABLE_METHODS = ["arzt", "dixon", "hmm", "pthmm", "outerhmm", "audio_outerhmm"] KWARGS = { "audio": { "dixon": { + "feature_type": "lse", "window_size": 10, }, "arzt": { - "window_size": 5, - "start_window_size": 0.25, - "step_size" : 5,}, + "window_size": 10, + "start_window_size": 0.1, + "step_size": 3, + }, "audio_outerhmm": { + "feature_type": "cqt_spectral_flux", "sample_rate": 16000, - "frame_rate": 50, + "frame_rate": 25, + "s_j": 0.0, }, }, "midi": { @@ -151,39 +144,37 @@ def __init__( self, score_file: PathLike, performance_file: Union[PathLike, None] = None, - wait: bool = True, # only for offline option. 
For debugging or fast testing, set to False - input_type: str = "audio", # 'audio' or 'midi' - feature_type: str = None, + input_type: str = "audio", method: str = None, - distance_func: Optional[str] = None, + *, + feature_type: str = None, device_name_or_index: Union[str, int] = None, + tempo: Optional[float] = None, sample_rate: int = SAMPLE_RATE, frame_rate: int = FRAME_RATE, - tempo: Optional[float] = None, - kwargs=KWARGS, - unfold_score=True, auto_adjust_tempo: bool = False, + wait: bool = False, + unfold_score=True, + kwargs=KWARGS, ): self.score_file = str(score_file) self.performance_file = ( str(performance_file) if performance_file is not None else None ) - # if input_type not in ("audio", "midi"): - # raise ValueError(f"Invalid input_type {input_type}") self.input_type = input_type self.feature_type = feature_type self.frame_rate = frame_rate if input_type == "audio" else 1 self.sample_rate = sample_rate self.hop_length = sample_rate // self.frame_rate self.score_part: Optional[Part] = None - self.distance_func = distance_func self.device_name_or_index = device_name_or_index self.processor = None self.stream = None self.score_follower = None self.reference_features = None self._has_run = False + self.alignment_duration = None # validate method first if method is None: @@ -192,7 +183,7 @@ def __init__( raise ValueError(f"Invalid method. Available methods: {AVAILABLE_METHODS}") self.method = method - self.config = kwargs[self.input_type][self.method] + self.config = dict(kwargs[self.input_type][self.method]) self.auto_adjust_tempo = auto_adjust_tempo # Apply method-specific defaults from config (only if not explicitly provided by caller) @@ -213,8 +204,22 @@ def __init__( score = partitura.load_score(self.score_file) if unfold_score: - score = partitura.score.unfold_part_maximal(score, ignore_leaps=False) - self.score_part = merge_parts(score.parts) + try: + # Ensure recursion limit is high enough for deepcopy of + # complex scores. 
External libraries (e.g. madmom) may + # lower it during processing. + _prev_limit = sys.getrecursionlimit() + sys.setrecursionlimit(max(_prev_limit, 10_000)) + unfolded = partitura.score.unfold_part_maximal( + score, ignore_leaps=False + ) + self.score_part = merge_parts(unfolded.parts) + sys.setrecursionlimit(_prev_limit) + except Exception: + sys.setrecursionlimit(max(sys.getrecursionlimit(), 10_000)) + self.score_part = merge_parts(score.parts) + else: + self.score_part = merge_parts(score.parts) except Exception as e: raise ValueError(f"Invalid score file: {e}") @@ -229,12 +234,8 @@ def __init__( # setup feature processor if self.feature_type is None: - if input_type == "audio": - self.feature_type = ( - "cqt_spectral_flux" if method == "audio_outerhmm" else "chroma" - ) - else: - self.feature_type = "pitch_ioi" + default = "chroma" if input_type == "audio" else "pitch_ioi" + self.feature_type = self.config.get("feature_type", default) if self.feature_type == "chroma": self.processor = ChromagramProcessor( @@ -244,18 +245,22 @@ def __init__( elif self.feature_type == "mfcc": self.processor = MFCCProcessor( sample_rate=self.sample_rate, + hop_length=self.hop_length, ) elif self.feature_type == "cqt": self.processor = CQTProcessor( sample_rate=self.sample_rate, + hop_length=self.hop_length, ) elif self.feature_type == "mel": self.processor = MelSpectrogramProcessor( sample_rate=self.sample_rate, + hop_length=self.hop_length, ) elif self.feature_type == "lse": self.processor = LogSpectralEnergyProcessor( sample_rate=self.sample_rate, + hop_length=self.hop_length, ) elif self.feature_type == "pitch_ioi": self.processor = PitchIOIProcessor(piano_range=self.config["piano_range"]) @@ -281,9 +286,6 @@ def __init__( f"Invalid performance file. 
Expected MIDI file, but got {self.performance_file}" ) - # setup distance function - if distance_func is None: - distance_func = DEFAULT_DISTANCE_FUNCS[self.method] # setup stream device if self.input_type == "audio": @@ -308,67 +310,69 @@ def __init__( self.reference_features = self.preprocess_score() - if distance_func is None: - distance_func = DEFAULT_DISTANCE_FUNCS[method] - if method == "arzt": - state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() + try: + state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() + except Exception: + state_to_ref_time_map, ref_to_state_time_map = None, None self.score_follower = OnlineTimeWarpingArzt( reference_features=self.reference_features, queue=self.stream.queue, - distance_func=distance_func, frame_rate=self.frame_rate, - window_size=self.config["window_size"], - start_window_size=self.config["start_window_size"], state_to_ref_time_map=state_to_ref_time_map, ref_to_state_time_map=ref_to_state_time_map, - step_size=self.config["step_size"], - state_space=np.unique(self.score_part.note_array()["onset_beat"]) + state_space=np.unique(self.score_part.note_array()["onset_beat"]), + ref_frame_to_beat=self._build_ref_frame_to_beat(), + **self.config, ) elif method == "dixon": - state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() + try: + state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() + except Exception: + state_to_ref_time_map, ref_to_state_time_map = None, None self.score_follower = OnlineTimeWarpingDixon( reference_features=self.reference_features, queue=self.stream.queue, - distance_func=distance_func, frame_rate=self.frame_rate, - window_size=self.config["window_size"], state_to_ref_time_map=state_to_ref_time_map, ref_to_state_time_map=ref_to_state_time_map, - state_space=np.unique(self.score_part.note_array()["onset_beat"]) + state_space=np.unique(self.score_part.note_array()["onset_beat"]), + ref_frame_to_beat=self._build_ref_frame_to_beat(), + 
**self.config, ) elif method == "hmm" and self.input_type == "midi": self.score_follower = PitchIOIHMM( reference_features=self.reference_features, queue=self.stream.queue, - tempo_model=self.config["tempo_model"], has_insertions=True, - piano_range=self.config["piano_range"], + **self.config, ) elif method == "pthmm" and self.input_type == "audio": self.score_follower = GaussianAudioPitchTempoHMM( reference_features=self.reference_features, queue=self.stream.queue, + **self.config, ) elif method == "audio_outerhmm" and self.input_type == "audio": self.score_follower = AudioOuterProductHMM( reference_features=self.reference_features, queue=self.stream.queue, tempo=self.tempo, - sample_rate=self.sample_rate, hop_length=self.hop_length, + **self.config, ) elif method == "pthmm" and self.input_type == "midi": self.score_follower = PitchHMM( reference_features=self.reference_features, queue=self.stream.queue, has_insertions=True, - piano_range=self.config["piano_range"], + **self.config, ) elif method == "outerhmm" and self.input_type == "midi": self.score_follower = OuterProductHMM( reference_features=self.reference_features, queue=self.stream.queue, + **self.config, ) else: raise ValueError("Invalid method") @@ -381,7 +385,9 @@ def preprocess_score(self): ) if self.method in {"arzt", "dixon"}: - self.ppart = partitura.utils.music.performance_from_part(self.score_part, bpm=self.tempo) + self.ppart = partitura.utils.music.performance_from_part( + self.score_part, bpm=self.tempo + ) self.ppart.sustain_pedal_threshold = 127 if self.input_type == "audio": self.score_audio = generate_score_audio( @@ -406,22 +412,27 @@ def preprocess_score(self): return reference_features else: return self.score_part.note_array() - + def get_time_maps(self): - alignment = [{"label" : "match", "score_id" : nid, "performance_id": nid} for nid in self.score_part.note_array()["id"]] - return get_time_maps_from_alignment(self.ppart.note_array(), self.score_part.note_array(), alignment) + sna = 
self.score_part.note_array() + pna = self.ppart.note_array() + note_ids = sna["id"] + # If note IDs are missing, use index-based IDs + if len(set(note_ids)) <= 1: + synth_ids = [f"n{i}" for i in range(len(sna))] + sna = sna.copy() + sna["id"] = synth_ids + pna = pna.copy() + pna["id"] = synth_ids[: len(pna)] + note_ids = synth_ids + alignment = [ + {"label": "match", "score_id": nid, "performance_id": nid} + for nid in note_ids + ] + return get_time_maps_from_alignment(pna, sna, alignment) def _convert_frame_to_beat(self, current_frame: int) -> float: - """ - Convert frame number to relative beat position in the score. - - Parameters - ---------- - frame_rate : int - Frame rate of the audio stream - current_frame : int - Current frame number - """ + """Convert frame number to beat position in the score.""" tick = get_ppq(self.score_part) timeline_time = (current_frame / self.frame_rate) * tick * (self.tempo / 60) beat_position = np.round( @@ -430,6 +441,13 @@ def _convert_frame_to_beat(self, current_frame: int) -> float: ) return beat_position + def _build_ref_frame_to_beat(self) -> np.ndarray: + """Precompute beat position for each reference feature frame.""" + n_ref = self.reference_features.shape[0] + return np.array( + [self._convert_frame_to_beat(i) for i in range(n_ref)], + ) + def build_score_annotations( self, level="beat", @@ -533,37 +551,45 @@ def run_evaluation( self, perf_annotations: Union[PathLike, np.ndarray], level: str = "note", - tolerances: list = TOLERANCES_IN_MILLISECONDS, - musical_beat: bool = False, # beat annots are difference in some dataset + tolerances: list = None, + musical_beat: bool = False, debug: bool = False, save_dir: PathLike = None, run_name: str = None, - domain: str = "performance", # "score" or "performance" + domain: str = "score", + plot_dist_matrix: bool = True, ) -> dict: """ - Evaluate the score following process + Evaluate the score following process. 
+ + When domain="score" (default), returns beat-based metrics as primary + and ms-based metrics under "ms" key. When domain="performance", + returns ms-based metrics only (legacy behavior). Parameters ---------- perf_annotations : PathLike or np.ndarray - Path to the performance annotations file (tab-separated), - or numpy array of annotation times in seconds. + Path to the performance annotations file or numpy array of onset times (seconds). level : str - Level of annotations to use: bar, beat or note - tolerance : list - Tolerances to use for evaluation (in milliseconds) + Annotation level: "beat" or "note" + tolerances : list or None + Tolerances for evaluation. If None, uses default for the domain. + musical_beat : bool + Whether to use musical beat debug : bool - Whether to save the score and performance audio with beat annotations + Whether to save debug outputs domain : str - Evaluation domain, either "score" or "performance". - "score" domain evaluates in beat unit, "performance" domain evaluates in second unit. (Default: "performance") + "score" (default, beat-based primary) or "performance" (ms-based, legacy) Returns ------- dict - Evaluation results with mean, median, std, skewness, kurtosis, and - accuracy for each tolerance + Evaluation results. If domain="score", includes both beat and ms metrics. 
""" + if tolerances is None: + tolerances = ( + TOLERANCES_IN_BEATS if domain == "score" else TOLERANCES_IN_MILLISECONDS + ) if not self._has_run: raise ValueError("Must call run() before evaluation") @@ -577,65 +603,101 @@ def run_evaluation( original_perf_annots_counts = len(perf_annots) - min_length = min(len(score_annots), len(perf_annots)) - score_annots = score_annots[:min_length] - perf_annots = perf_annots[:min_length] + # min_length = min(len(score_annots), len(perf_annots)) + # score_annots = score_annots[:min_length] + # perf_annots = perf_annots[:min_length] - mode = ( - "state" - if (self.input_type == "midi" or self.method == "audio_outerhmm") - else "frame" + wp = self.score_follower.warping_path + score_annots_beats = self.build_score_annotations( + level, musical_beat, return_type="beats" ) - perf_annots_predicted = transfer_from_score_to_predicted_perf( - self.score_follower.warping_path, - score_annots, - frame_rate=self.frame_rate, - mode=mode, + + # --- Per-frame evaluation --- + # Build GT interpolator: score beat → perf time (seconds) + valid_gt = np.isfinite(perf_annots) + gt_interp = scipy.interpolate.interp1d( + score_annots_beats[valid_gt], + perf_annots[valid_gt], + bounds_error=False, + fill_value=np.nan, ) - score_annots_predicted = transfer_from_perf_to_predicted_score( - self.score_follower.warping_path, - perf_annots, + wp_score = wp[0].astype(float) + wp_perf = wp[1].astype(float) + + # Convert wp perf axis to seconds + if self.input_type == "midi": + # MIDI: wp_perf is IOI-accumulated from 0; shift by first note onset + _perf = partitura.load_performance_midi(self.performance_file) + midi_offset = float(_perf.note_array()["onset_sec"].min()) + wp_perf_sec = wp_perf + midi_offset + else: + # Audio: wp_perf is frame index + wp_perf_sec = wp_perf / self.frame_rate + + # For each wp entry: GT perf time for predicted beat vs actual perf time + gt_perf_times = gt_interp(wp_score) + perf_annots_predicted = transfer_positions( + wp, + 
score_annots_beats, frame_rate=self.frame_rate, - mode=mode, + domain="performance", ) - score_annots = score_annots[: len(score_annots_predicted)] - if original_perf_annots_counts != len(perf_annots_predicted): - print( - f"Length of the annotation changed: {original_perf_annots_counts} -> {len(perf_annots_predicted)}" - ) - - # Evaluation metrics if domain == "performance": eval_results = get_evaluation_results( - perf_annots, - perf_annots_predicted, - total_counts=original_perf_annots_counts, + gt_perf_times, + wp_perf_sec, + total_counts=len(wp_score), tolerances=tolerances, ) else: - score_annots_predicted = self.convert_timestamps_to_beats( - score_annots_predicted + # Score domain: beat-based (primary) + ms-based (secondary) + score_annots_predicted = transfer_positions( + wp, perf_annots, frame_rate=self.frame_rate, domain="score" ) - if tolerances == TOLERANCES_IN_MILLISECONDS: - tolerances = TOLERANCES_IN_BEATS - eval_results = get_evaluation_results( + score_annots = score_annots[: len(score_annots_predicted)] + beat_tolerances = ( + tolerances + if tolerances != TOLERANCES_IN_MILLISECONDS + else TOLERANCES_IN_BEATS + ) + beat_results = get_evaluation_results( score_annots, score_annots_predicted, total_counts=original_perf_annots_counts, - tolerances=tolerances, + tolerances=beat_tolerances, in_seconds=False, ) + ms_results = get_evaluation_results( + gt_perf_times, + wp_perf_sec, + total_counts=len(wp_score), + tolerances=TOLERANCES_IN_MILLISECONDS, + ) + eval_results = {"beat": beat_results, "ms": ms_results} + + # Real-Time Factor (domain-independent) + if self.alignment_duration is not None: + finite_perf = perf_annots[np.isfinite(perf_annots)] + if len(finite_perf) > 0: + perf_duration = float(np.max(finite_perf) - np.min(finite_perf)) + if perf_duration > 0: + eval_results["rtf"] = float( + f"{self.alignment_duration / perf_duration:.4f}" + ) + if self.input_type == "audio": latency_results = self.get_latency_stats() 
eval_results.update(latency_results) # Debug: save warping path TSV, results JSON, and plots if debug and save_dir is not None: + # For plot y-axis: use beats when wp[0] is in beats + debug_score_annots = score_annots_beats save_debug_results( warping_path=self.score_follower.warping_path, - score_annots=score_annots, + score_annots=debug_score_annots, perf_annots=perf_annots, perf_annots_predicted=perf_annots_predicted, eval_results=eval_results, @@ -643,16 +705,36 @@ def run_evaluation( save_dir=save_dir, run_name=run_name or "results", state_space=getattr(self.score_follower, "state_space", None), - ref_features=getattr(self.score_follower, "reference_features", None), - input_features=getattr(self.score_follower, "input_features", None), - distance_func=getattr(self.score_follower, "distance_func", None), + ref_features=( + getattr(self.score_follower, "reference_features", None) + if plot_dist_matrix + else None + ), + input_features=( + getattr(self.score_follower, "input_features", None) + if plot_dist_matrix + else None + ), + distance_func=( + getattr(self.score_follower, "distance_func", None) + if plot_dist_matrix + else None + ), + ref_frame_to_beat=getattr( + self.score_follower, "_ref_frame_to_beat", None + ), ) return eval_results def run(self, verbose: bool = True, wait: bool = True): """ - Run the score following process + Run the score following process. + + Measures wall-clock time as ``alignment_duration`` (seconds), + which covers both feature extraction (producer thread) and + score following inference (main thread) running concurrently. + RTF is computed as ``alignment_duration / performance_duration``. 
Yields ------ @@ -665,12 +747,11 @@ def run(self, verbose: bool = True, wait: bool = True): Alignment results with warping path """ with self.stream: + self.stream.stream_start.wait() + t0 = time.time() for current_position in self.score_follower.run(verbose=verbose): - if self.input_type == "audio" and self.method != "audio_outerhmm": - position_in_beat = self._convert_frame_to_beat(current_position) - yield position_in_beat - else: - yield float(self.score_follower.state_space[current_position]) + yield current_position + self.alignment_duration = time.time() - t0 self._has_run = True return self.score_follower.warping_path diff --git a/matchmaker/prob/hmm.py b/matchmaker/prob/hmm.py index bfbaf4c..c6210e2 100644 --- a/matchmaker/prob/hmm.py +++ b/matchmaker/prob/hmm.py @@ -47,6 +47,7 @@ DEFAULT_GUMBEL_AUDIO_SCALE = 0.05 QUEUE_TIMEOUT = 10 + class BaseHMM(HiddenMarkovModel): """ Base class for Hidden Markov Model alignment methods. @@ -193,6 +194,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, has_insertions: bool = True, piano_range: bool = True, + **kwargs, ) -> None: """ Initialize the object. 
@@ -283,9 +285,9 @@ def __init__( def __call__(self, input, *args, **kwargs): frame_index = args[0] if args else None - + pitch_obs = input - + current_state = self.forward_algorithm_step( observation=pitch_obs, log_probabilities=False, @@ -339,7 +341,7 @@ def _build_hmm_modules( transition_matrix = stable_transition_matrix( n_states=len(unique_onsets_s), dist=gumbel_l, - scale=1.0,#0.5, + scale=1.0, # 0.5, inserted_states=inserted_states, ) initial_probabilities = init_dist( @@ -547,7 +549,9 @@ def gumbel_transition_matrix( # TODO check works for audio (parameter) np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale ) else: - transition_matrix[i] = gumbel_l.pdf(np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale) + transition_matrix[i] = gumbel_l.pdf( + np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale + ) # Normalize transition matrix (so that it is a proper stochastic matrix): transition_matrix /= transition_matrix.sum(1, keepdims=True) @@ -555,10 +559,11 @@ def gumbel_transition_matrix( # TODO check works for audio (parameter) # Return the computed transition matrix: return transition_matrix + def stable_transition_matrix( # TODO check works for audio (parameter) n_states: int, mp_trans_state: int = 1, - dist = gumbel_l, + dist=gumbel_l, scale: float = 0.5, inserted_states: bool = False, ) -> NDArrayFloat: @@ -606,7 +611,9 @@ def stable_transition_matrix( # TODO check works for audio (parameter) np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale ) else: - transition_matrix[i] = dist.pdf(np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale) + transition_matrix[i] = dist.pdf( + np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale + ) # Normalize transition matrix (so that it is a proper stochastic matrix): transition_matrix /= transition_matrix.sum(1, keepdims=True) @@ -614,6 +621,7 @@ def stable_transition_matrix( # TODO check works for audio (parameter) # Return the computed 
transition matrix: return transition_matrix + def init_dist( n_states: int, dist=gumbel_l, @@ -784,6 +792,7 @@ def compute_discrete_pitch_profiles( return pitch_profiles + # Old version, to be deprecated. def compute_discrete_pitch_profiles_old( chord_pitches: NDArrayFloat, @@ -1131,7 +1140,6 @@ def __init__( self.states = np.arange(len(audio_features)) def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: - pitch_obs, tempo_est = observation if self.current_state is None: @@ -1152,7 +1160,6 @@ def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: obs_prob = pitch_prob * tempo_prob - return obs_prob @@ -1201,7 +1208,6 @@ def __init__( self.states = np.arange(len(audio_features)) def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: - pitch_obs, tempo_est = observation # ioi_idx = self.current_state if self.current_state is not None else 0 @@ -1298,6 +1304,7 @@ def __init__(self, pitch_profiles, ioi_matrix, ioi_precision): ioi_prob_args=ioi_prob_args, ) + class ACCPitchIOIObservationModel(ObservationModel): """ Computes the probabilities that an observation was emitted, i.e. the @@ -1489,6 +1496,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, has_insertions: bool = False, piano_range: bool = False, + **kwargs, ) -> None: """ Initialize the object. 
@@ -1656,17 +1664,17 @@ def _build_hmm_modules( self, piano_range: bool = False, inserted_states: bool = True, - observation_model = ACCPitchIOIObservationModel, - tempo_model = KalmanTempoModel, + observation_model=ACCPitchIOIObservationModel, + tempo_model=KalmanTempoModel, ): snote_array = self.reference_features - + unique_sonsets = np.unique(snote_array["onset_beat"]) unique_sonset_idxs = [ np.where(snote_array["onset_beat"] == ui)[0] for ui in unique_sonsets ] chord_pitches = [snote_array["pitch"][uix] for uix in unique_sonset_idxs] - + pitch_profiles = compute_discrete_pitch_profiles( chord_pitches=chord_pitches, piano_range=piano_range, @@ -1676,7 +1684,7 @@ def _build_hmm_modules( unique_onsets=unique_sonsets, inserted_states=inserted_states, ) - + observation_model = observation_model( pitch_profiles=pitch_profiles, ioi_matrix=ioi_matrix, @@ -1703,11 +1711,11 @@ def _build_hmm_modules( init_score_onset=unique_sonsets.min(), init_beat_period=60 / 100, ) - + transition_matrix = stable_transition_matrix( n_states=len(ioi_matrix[0]), dist=gumbel_l, - scale=1.0,#0.5, + scale=1.0, # 0.5, inserted_states=inserted_states, ) initial_probabilities = init_dist( @@ -1738,14 +1746,14 @@ def run(self, verbose: bool = True): # TODO: check self.queue.get() format. 
maybe this should actually be a tuple try: queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) - #features, f_time = queue_input - #print(f'{features=}, {f_time=}') + # features, f_time = queue_input + # print(f'{features=}, {f_time=}') except: break - #TODO: try MidiStream.return_midi_messages = True + # TODO: try MidiStream.return_midi_messages = True if queue_input is not None: - #print(f'pitch_ioi: {queue_input=}') + # print(f'pitch_ioi: {queue_input=}') current_state = self.__call__(queue_input) empty_counter = 0 if current_state == prev_state: @@ -1755,7 +1763,6 @@ def run(self, verbose: bool = True): break else: same_state_counter = 0 - if verbose: pbar.update(int(current_state)) @@ -1987,6 +1994,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, state_space: Optional[NDArray] = None, patience: int = 200, + **kwargs, ) -> None: """ Initialize the object. @@ -2090,7 +2098,6 @@ def __init__( self.input_features = None self.distance_func = "Euclidean" - BaseHMM.__init__( self, observation_model=observation_model, diff --git a/matchmaker/prob/outer_product_hmm.py b/matchmaker/prob/outer_product_hmm.py index adb3d57..fcdde1e 100644 --- a/matchmaker/prob/outer_product_hmm.py +++ b/matchmaker/prob/outer_product_hmm.py @@ -13,7 +13,6 @@ viterbi_step_cy = None import numpy as np - from partitura.score import Part, Score, ScoreLike NDArrayFloat = NDArray[np.float32] @@ -216,6 +215,7 @@ def __init__( r: Optional[np.ndarray] = None, other_prob: float = 1e-6, patience: int = 10, + **kwargs, ) -> None: """ Outer-product Hidden Markov Model for score following. 
diff --git a/matchmaker/prob/outer_product_hmm_audio.py b/matchmaker/prob/outer_product_hmm_audio.py index e6dc3b9..24bc8df 100644 --- a/matchmaker/prob/outer_product_hmm_audio.py +++ b/matchmaker/prob/outer_product_hmm_audio.py @@ -8,32 +8,27 @@ from partitura.score import Part, Score, ScoreLike from matchmaker.base import OnlineAlignment -from matchmaker.features.audio import QUEUE_TIMEOUT +from matchmaker.io.audio import QUEUE_TIMEOUT from matchmaker.utils.misc import RECVQueue, set_latency_stats +from matchmaker.utils.stream import STREAM_END NDArrayFloat = NDArray[np.float32] NDArrayInt = NDArray[np.int32] -DEFAULT_PITCH_ERROR_PROBS = { - "correct_pitch_prob": 0.9497, - "semi_tone_error_prob": 0.0145 / 2.0, - "whole_tone_error_prob": 0.0224 / 2.0, - "octave_error_prob": 0.0047 / 2.0, - "within_one_octave_error_prob": 0.0086 / 9.0 / 2.0, -} - -# DEFAULT_TRANSITIONS = [ -# (1, 1.0), # normal (i→i+1) -# (2, 1e-50), # deletion (i→i+2), HHMMState_simple.hpp: log10(-50) -# ] +# Nakamura et al. 2016 Section IV-B experimental parameters. +# Neighbourhood transitions a^{(nbh)}_{j,i} from nakamura_data.py: +# These are the "small transition probabilities" for the banded structure. +# Paper: a_{i,i} = 0 (self-transition handled by bottom HMM a00), +# a_{i,i+2} = 1e-50 (deletion, effectively 0). +# We use the empirical values from [13] (Nakamura JNMR 2014). 
DEFAULT_TRANSITIONS = [ - (-3, 0.001), - (-2, 0.001), - (-1, 0.002), - (0, 0.01342), - (1, 0.96), - (2, 0.01), - (3, 0.002), + (-3, 0.00509), + (-2, 0.00516), + (-1, 0.00886), + (0, 0.01342), # insertion (staying at same top state) + (1, 0.94531), # normal forward progression + (2, 0.00610), # deletion (skip one note) + (3, 0.00073), ] DEFAULT_D1 = 3 @@ -43,8 +38,11 @@ _FLUX_EXIT_BOOST: float = 1.0 _OTHER_PROB: float = 1e-6 -_PAUSE_ENTRY_PROB: float = 0.01 # probability of entering pause state from sound -_PAUSE_DURATION_SEC: float = 0.5 +# Paper IV-B: +# a_{0,1}^{(i)} = 1e-100 (pause entry: almost never enter pause) +# a_{1,1}^{(i)} = 0.999 (pause self-transition: once in pause, stay) +_PAUSE_ENTRY_PROB: float = 1e-100 +_PAUSE_SELF_TRANSITION: float = 0.999 _PAUSE_EMISSION_MAX: float = 1e-3 @@ -127,11 +125,13 @@ def __init__( reference_features: np.ndarray, queue: Optional[RECVQueue] = None, transitions: Optional[List[tuple[int, float]]] = None, - pitch_error_probs: Optional[dict[str, float]] = None, patience: int = 0, tempo: float = 120.0, sample_rate: int = 16000, hop_length: int = 320, + s_j: float = 1e-5, + r_i: Optional[np.ndarray] = None, + **kwargs, ) -> None: self.reference_features = reference_features OnlineAlignment.__init__( @@ -173,16 +173,10 @@ def __init__( self.transitions = ( transitions if transitions is not None else DEFAULT_TRANSITIONS ) - self.pitch_error_probs = ( - pitch_error_probs - if pitch_error_probs is not None - else DEFAULT_PITCH_ERROR_PROBS - ) self.other_prob = _OTHER_PROB self.sample_rate = int(sample_rate) self.hop_length = int(hop_length) self.pause_entry_prob = _PAUSE_ENTRY_PROB - self.pause_duration_sec = _PAUSE_DURATION_SEC self.pause_emission_max = _PAUSE_EMISSION_MAX # Transition setup with banded structure @@ -202,7 +196,23 @@ def __init__( row_sums = self.alpha.sum(axis=1, keepdims=True) self.alpha = self.alpha / row_sums - self.current_state = 0 + # Repeat/skip factorization (Nakamura Eq.11): + # a_{j,i} = 
a^{(nbh)}_{j,i} + s_j * r_i + # s_j: probability of stopping at event j before a repeat/skip + # r_i: probability of resuming at event i after a repeat/skip + self.S = np.full(self.n_states, float(s_j), dtype=float) + if r_i is not None: + self.r = np.asarray(r_i, dtype=float) + else: + self.r = np.ones(self.n_states, dtype=float) / self.n_states + + # Renormalize alpha to account for s_j mass: + # Σ_i a_{j,i} = Σ_i a^{(nbh)}_{j,i} + s_j * Σ_i r_i = 1 + # => Σ_i a^{(nbh)}_{j,i} = 1 - s_j (since Σ_i r_i = 1) + if s_j > 0: + self.alpha = self.alpha * (1.0 - self.S[:, None]) + + self.current_state_index = 0 self._warping_path = [] self._current_chord = np.zeros(88, dtype=int) self.patience = int(patience) @@ -223,13 +233,8 @@ def __init__( tempo=tempo, frame_rate=frame_rate, ) - self.a11 = float( - np.clip( - self._pause_self_transition_prob(self.pause_duration_sec, frame_rate), - 0.0, - 1.0, - ) - ) + # Paper IV-B: a_{1,1}^{(i)} = 0.999 (pause self-transition) + self.a11 = float(_PAUSE_SELF_TRANSITION) # Pause entry prob a01 (II-E) move_prob = 1.0 - self.a00 p_pause = float(np.clip(self.pause_entry_prob, 0.0, 1.0)) @@ -243,9 +248,21 @@ def __init__( self.e0 = np.clip(1.0 - self.a00 - self.a01, 1e-10, 1.0) self.e1 = float(np.clip(1.0 - self.a11, 1e-10, 1.0)) + # Precompute alpha diagonals and sliding window indices for vectorized forward_step + self._alpha_diags = [] + for d in range(-self.D2, self.D1 + 1): + self._alpha_diags.append(np.diagonal(self.alpha, offset=-d).copy()) + self._j_starts = np.maximum(0, np.arange(self.n_states) - self.D2) + self._j_ends = np.minimum(self.n_states, np.arange(self.n_states) + self.D1 + 1) + + @property + def warping_path(self) -> np.ndarray: + return np.array(self._warping_path).T + @property - def warping_path(self) -> NDArrayInt: - return (np.array(self._warping_path).T).astype(np.int32) + def current_position(self) -> float: + """Current score position in beats.""" + return float(self.state_space[self.current_state_index]) 
@staticmethod def _pause_self_transition_prob( @@ -286,8 +303,8 @@ def _compute_chord_self_transition_probs( return np.clip(1.0 - 1.0 / d_i, 1e-6, 1.0 - 1e-6) def is_still_following(self) -> bool: - if self.current_state is not None: - return self.current_state <= self.n_states - 1 + if self.current_state_index is not None: + return self.current_state_index <= self.n_states - 1 return False def __call__(self, input, *args, **kwargs) -> Optional[int]: @@ -321,10 +338,12 @@ def __call__(self, input, *args, **kwargs) -> Optional[int]: top_scores = probs[0::2] + probs[1::2] new_top = int(np.argmax(top_scores)) - self.current_state = new_top - self._warping_path.append((self.current_state, self.input_index)) + self.current_state_index = new_top + self._warping_path.append( + (float(self.state_space[self.current_state_index]), self.input_index) + ) self.input_index += 1 - return self.current_state + return self.current_state_index def compute_obs_likelihood( self, @@ -390,14 +409,31 @@ def forward_step( prev_sound = np.asarray(prev_probs[0::2], dtype=float) prev_pause = np.asarray(prev_probs[1::2], dtype=float) - # Emission - emit_sound = self.compute_obs_likelihood(observation) - emit_pause_scalar = self._compute_pause_emission(observation) + # --- Single _preprocess_obs call + inline emission computation --- + obs = _preprocess_obs(observation) + cqt = np.maximum(obs[:88] if obs.size >= 88 else obs, 0.0) + cqt_sum = cqt.sum() + + # Sound emission (from compute_obs_likelihood) + if cqt_sum <= 0: + emit_sound = np.full(N, 1e-300, dtype=float) + else: + cqt_norm = cqt / cqt_sum + em = self.chord_harmonic_mask @ cqt_norm + emit_sound = np.maximum(np.nan_to_num(em, nan=1e-12), 1e-12) + + # Pause emission (from _compute_pause_emission) + if cqt_sum <= 0: + emit_pause_scalar = min(1.0, self.pause_emission_max) + else: + var = float(np.var(cqt / cqt_sum)) + emit_pause_scalar = min( + max(1.0 / (1.0 + 200.0 * var), 1e-300), self.pause_emission_max + ) emit_pause = np.full(N, 
emit_pause_scalar, dtype=float) # Spectral-flux-driven exit boost - obs_flat = _preprocess_obs(observation) - flux = float(obs_flat[88]) if obs_flat.size > 88 else 0.0 + flux = float(obs[88]) if obs.size > 88 else 0.0 f = flux / (flux + 1.0) # [0,1) boost = 1.0 + _FLUX_EXIT_BOOST * f e0 = np.clip(self.e0 * boost, 1e-10, 1.0 - self.a01 - 1e-10) @@ -406,18 +442,27 @@ def forward_step( # Exit masses from each top state j (Eq.(6)) exit_mass = prev_sound * e0 + prev_pause * self.e1 # (N,) - # Compute neigh_sum_i for each i (banded, Eq.(9)) - neigh_sum = np.zeros(N, dtype=float) - for i in range(N): - j_start = max(0, i - self.D2) - j_end = min(N, i + self.D1 + 1) - ssum = 0.0 - for j in range(j_start, j_end): - a = float(self.alpha[j, i]) - if a <= 0: - continue - ssum += exit_mass[j] * a - neigh_sum[i] = ssum + # --- Vectorized neighbourhood sum (replaces O(N*(D1+D2)) Python loop) --- + # Global skip term: Σ_j exit_mass[j] * S[j] (O(N), computed once) + global_skip_sum = float(np.dot(exit_mass, self.S)) + + # Local neighbourhood transition: sum over diagonals of alpha + local_nbh = np.zeros(N, dtype=float) + for k, d in enumerate(range(-self.D2, self.D1 + 1)): + diag = self._alpha_diags[k] + L = len(diag) + src = max(0, d) # source index offset in exit_mass + dst = max(0, -d) # destination index offset in local_nbh + local_nbh[dst : dst + L] += exit_mass[src : src + L] * diag + + # Local skip via cumsum sliding window: O(N) instead of O(N*D) + eS = exit_mass * self.S + cumsum_eS = np.empty(N + 1, dtype=float) + cumsum_eS[0] = 0.0 + np.cumsum(eS, out=cumsum_eS[1:]) + local_skip = cumsum_eS[self._j_ends] - cumsum_eS[self._j_starts] + + neigh_sum = local_nbh + self.r * (global_skip_sum - local_skip) # Within-top bottom transitions within_sound = prev_sound * a00 @@ -448,21 +493,27 @@ def run( same_state_counter = 0 empty_counter = 0 if verbose: - pbar = progressbar.ProgressBar(maxval=self.n_states) + pbar = progressbar.ProgressBar( + maxval=len(self.state_space), + 
redirect_stdout=True, + redirect_stderr=True, + ) pbar.start() while self.is_still_following(): - prev_state = self.current_state + prev_state = self.current_state_index try: queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) except Empty: break + if queue_input is STREAM_END: + break self.last_queue_update = time.time() if queue_input is not None: - current_state = self(queue_input) + self(queue_input) empty_counter = 0 - if current_state == prev_state: + if self.current_state_index == prev_state: if self.patience > 0: if same_state_counter < self.patience: same_state_counter += 1 @@ -472,13 +523,12 @@ def run( same_state_counter = 0 if verbose: - if current_state is not None: - pbar.update(int(current_state) + 1) # states starts with 0 + pbar.update(self.current_state_index) latency = time.time() - self.last_queue_update self.latency_stats = set_latency_stats( latency, self.latency_stats, self.input_index ) - yield current_state + yield self.current_position if verbose: pbar.finish() diff --git a/matchmaker/utils/eval.py b/matchmaker/utils/eval.py index 20e5192..a88d7ac 100644 --- a/matchmaker/utils/eval.py +++ b/matchmaker/utils/eval.py @@ -1,5 +1,3 @@ -from typing import TypedDict, Union - import numpy as np import scipy @@ -11,159 +9,97 @@ def transfer_positions( wp, ref_anns, frame_rate, - reverse=False, *, - mode: str = "auto", - reducer: str = "min", - state_offset: Union[int, str] = "auto", - output: str = "seconds", + domain: str = "score", + aggregation_func=None, ): """ - Transfer the positions of the reference annotations to the target annotations using the warping path. - - This function supports two common warping-path conventions: - - - **frame mode** (classic DTW-style): wp[0] and wp[1] are frame indices for reference/target features. - - **state mode** (HMM/score-state): wp[0] contains *reference state indices* and wp[1] contains *target frame indices*. + Transfer positions between score and performance using the warping path. 
Parameters ---------- wp : np.array with shape (2, T) - array of warping path. - warping_path[0] is the index of the reference (score) feature and warping_path[1] is the index of the target(input) feature. - ref_ann : List[float] - In **frame mode**, reference annotations in seconds. - In **state mode**, a sequence whose length equals the number of reference states (e.g., score unique_onsets); - the values are not used except for determining the number of states. + Warping path. wp[0] = score beats, wp[1] = performance frame indices. + ref_anns : array-like + Query positions (seconds for domain="score", + beats for domain="performance"). frame_rate : int - frame rate of the audio. - reverse : bool - If True, swap the direction (target -> reference). - mode : {"auto", "frame", "state"} - Warping-path convention. "auto" picks "state" when wp[0] looks like small discrete state indices. - reducer : {"min", "max", "median", "mean"} - In **state mode**, how to select a single representative target frame for each state when multiple wp entries - map to the same state. - state_offset : {"auto"} or int - In **state mode**, wp[0] may start at 0 or 1 (or have a leading start-state). "auto" chooses the offset that - best matches the expected number of states. - output : {"seconds", "frames"} - Return unit. "seconds" divides frames by frame_rate; "frames" returns frame indices. + Frame rate of the audio. + domain : {"score", "performance"} + Domain of the output. + "score": perf→score lookup. Given performance times (seconds), + return predicted score positions (beats). + "performance": score→perf lookup. Given score beats, return + predicted performance times (seconds). + aggregation_func : callable or None + Function to aggregate multiple values sharing the same key + (e.g., np.max, np.min, np.mean). 
If None, defaults to: + - domain="score": last entry in temporal order (tracker's + final decision at that frame) + - domain="performance": np.min (earliest arrival at that beat, + i.e. first-crossing rule) Returns ------- - predicted_targets : np.array with shape (T,) - Predicted target positions (seconds or frames depending on output). + predicted : np.array + Predicted positions in the target domain. """ - if output not in {"seconds", "frames"}: - raise ValueError(f"Invalid output={output!r}. Use 'seconds' or 'frames'.") - - if reverse: - x, y = wp[1], wp[0] - else: - x, y = wp[0], wp[1] - - if mode not in {"auto", "frame", "state"}: - raise ValueError(f"Invalid mode={mode!r}. Use 'auto', 'frame', or 'state'.") - - # Heuristic: state paths have small discrete indices (often << target frames), - # while frame paths typically cover most reference frames (unique count is large). - if mode == "auto": - x_unique = np.unique(x) - n_ref = len(ref_anns) - looks_like_state = (x_unique.size <= max(4, 2 * n_ref)) and ( - int(np.max(x)) <= max(10, 5 * n_ref) - ) - mode = "state" if looks_like_state else "frame" - - if mode == "frame": - # Causal nearest neighbor interpolation (reference seconds -> reference frames -> target frames) - ref_anns_frame = np.round(np.asarray(ref_anns) * frame_rate) - predicted_targets = np.ones(len(ref_anns_frame), dtype=float) * np.nan - - for i, r in enumerate(ref_anns_frame): - # 1) Scan all x values less than or equal to r and find the largest x value - past_indices = np.where(x <= r)[0] - if past_indices.size > 0: - # Find indices corresponding to the largest x value - max_x_val = x[past_indices[-1]] - max_x_indices = np.where(x == max_x_val)[0] - - # 2) Among all y values mapped to this x value, select the minimum y value - corresponding_y_values = y[max_x_indices] - predicted_targets[i] = float(np.min(corresponding_y_values)) - - if output == "frames": - return predicted_targets - return np.asarray(predicted_targets) / frame_rate - - # 
mode == "state" - # Goal: for each reference state index, select representative target frame from wp. - num_states = len(ref_anns) - predicted_frames = np.ones(num_states, dtype=float) * np.nan - - x_int = np.asarray(x, dtype=int) - y_int = np.asarray(y, dtype=int) - - if reducer not in {"min", "max", "median", "mean"}: - raise ValueError( - f"Invalid reducer={reducer!r}. Use 'min', 'max', 'median', or 'mean'." - ) - - if state_offset == "auto": - # Choose offset that maximizes overlap between expected states and observed wp state indices. - observed = np.unique(x_int) - candidates = [] - for off in (0, 1, int(np.min(x_int))): - if off not in candidates: - candidates.append(off) - best_off = candidates[0] - best_overlap = -1 - for off in candidates: - expected = np.arange(off, off + num_states, dtype=int) - overlap = np.intersect1d(observed, expected).size - if overlap > best_overlap: - best_overlap = overlap - best_off = off - offset = best_off + if domain not in {"score", "performance"}: + raise ValueError(f"Invalid domain={domain!r}. Use 'score' or 'performance'.") + + wp_score = wp[0].astype(float) + wp_perf = wp[1].astype(float) + queries = np.asarray(ref_anns, dtype=float) + + def _last(arr): + return arr[-1] + + if aggregation_func is None: + aggregation_func = _last if domain == "score" else np.min + + if domain == "score": + # Perf → Score: "at perf time t, what is the tracker's score position?" + # Group by perf frame, take the last entry by default (tracker's final decision). 
+ query_frames = queries * frame_rate + + sort_idx = np.argsort(wp_perf, kind="stable") + wp_perf_sorted = wp_perf[sort_idx] + wp_score_sorted = wp_score[sort_idx] + + unique_frames, first_idx = np.unique(wp_perf_sorted, return_index=True) + reduced_scores = np.empty(len(unique_frames)) + for g in range(len(unique_frames)): + start = first_idx[g] + end = ( + first_idx[g + 1] if g + 1 < len(unique_frames) else len(wp_score_sorted) + ) + reduced_scores[g] = aggregation_func(wp_score_sorted[start:end]) + + # unique_frames is monotonic → searchsorted for last frame ≤ query + indices = np.searchsorted(unique_frames, query_frames, side="right") - 1 + predicted = np.full(len(queries), np.nan) + valid = indices >= 0 + predicted[valid] = reduced_scores[indices[valid]] + return predicted else: - offset = int(state_offset) - - for s in range(num_states): - wp_state = s + offset - idx = np.where(x_int == wp_state)[0] - if idx.size == 0: - continue - vals = y_int[idx].astype(float) - if reducer == "min": - predicted_frames[s] = float(np.min(vals)) - elif reducer == "max": - predicted_frames[s] = float(np.max(vals)) - elif reducer == "median": - predicted_frames[s] = float(np.median(vals)) - else: # mean - predicted_frames[s] = float(np.mean(vals)) - - if output == "frames": - return predicted_frames - return predicted_frames / frame_rate - - -def transfer_from_score_to_predicted_perf(wp, score_annots, frame_rate, mode="auto"): - predicted_perf_idx = transfer_positions( - wp, - score_annots, - frame_rate, - mode=mode, - ) - return predicted_perf_idx - - -def transfer_from_perf_to_predicted_score(wp, perf_annots, frame_rate, mode="auto"): - predicted_score_idx = transfer_positions( - wp, perf_annots, frame_rate, reverse=True, mode=mode - ) - return predicted_score_idx + # Score → Perf: "when did the tracker first reach beat b?" + # Group by score position, aggregate perf frame values per group. 
+ sort_idx = np.argsort(wp_score, kind="stable") + wp_score_sorted = wp_score[sort_idx] + wp_perf_sorted = wp_perf[sort_idx] + + unique_beats, first_idx = np.unique(wp_score_sorted, return_index=True) + reduced_perf = np.empty(len(unique_beats)) + for g in range(len(unique_beats)): + start = first_idx[g] + end = first_idx[g + 1] if g + 1 < len(unique_beats) else len(wp_perf_sorted) + reduced_perf[g] = aggregation_func(wp_perf_sorted[start:end]) + + indices = np.searchsorted(unique_beats, queries, side="left") + predicted = np.full(len(queries), np.nan) + valid = indices < len(unique_beats) + predicted[valid] = reduced_perf[indices[valid]] + return predicted / frame_rate def get_evaluation_results( @@ -171,23 +107,25 @@ def get_evaluation_results( predicted_annots, total_counts, tolerances=TOLERANCES_IN_MILLISECONDS, - pcr_threshold=2_000, # 2 seconds in_seconds=True, ): if in_seconds: - errors_in_delay = (gt_annots - predicted_annots) * 1000 # in milliseconds + errors_in_delay = (gt_annots - predicted_annots) * 1000 else: errors_in_delay = gt_annots - predicted_annots - filtered_errors_in_delay = errors_in_delay[np.abs(errors_in_delay) <= pcr_threshold] - filtered_abs_errors_in_delay = np.abs(filtered_errors_in_delay) + abs_errors_in_delay = np.abs(errors_in_delay) results = { - "mean": float(f"{np.nanmean(filtered_abs_errors_in_delay):.4f}"), - "median": float(f"{np.nanmedian(filtered_abs_errors_in_delay):.4f}"), - "std": float(f"{np.nanstd(filtered_abs_errors_in_delay):.4f}"), - "skewness": float(f"{scipy.stats.skew(filtered_errors_in_delay):.4f}"), - "kurtosis": float(f"{scipy.stats.kurtosis(filtered_errors_in_delay):.4f}"), + "mean": float(f"{np.nanmean(abs_errors_in_delay):.4f}"), + "median": float(f"{np.nanmedian(abs_errors_in_delay):.4f}"), + "std": float(f"{np.nanstd(abs_errors_in_delay):.4f}"), + "skewness": float( + f"{scipy.stats.skew(errors_in_delay, nan_policy='omit'):.4f}" + ), + "kurtosis": float( + f"{scipy.stats.kurtosis(errors_in_delay, 
nan_policy='omit'):.4f}" + ), } if in_seconds: @@ -201,6 +139,4 @@ def get_evaluation_results( f"{np.sum(np.abs(errors_in_delay) <= tau) / total_counts:.4f}" ) - results["pcr"] = float(f"{len(filtered_errors_in_delay) / total_counts:.4f}") - results["count"] = len(filtered_abs_errors_in_delay) return results diff --git a/matchmaker/utils/misc.py b/matchmaker/utils/misc.py index ce58269..df6c415 100644 --- a/matchmaker/utils/misc.py +++ b/matchmaker/utils/misc.py @@ -431,7 +431,7 @@ def adjust_tempo_for_performance_file( ): """ Adjust the tempo of the score part to match the performance file. - We round up the tempo to the nearest 20 bpm to avoid too much optimization. + We round the tempo to the nearest 10 bpm to avoid too much optimization. Parameters ---------- @@ -449,9 +449,7 @@ def adjust_tempo_for_performance_file( else: target_length = librosa.get_duration(path=str(performance_file)) ratio = target_length / source_length - rounded_tempo = int( - (default_tempo / ratio + 19) // 20 * 20 - ) # round up to nearest 20 + rounded_tempo = int(round(default_tempo / ratio / 10) * 10) # round to nearest 10 print( f"default tempo: {default_tempo} (score length: {source_length}) -> adjusted_tempo: {rounded_tempo} (perf length: {target_length})" ) @@ -521,6 +519,15 @@ def save_nparray_to_csv(array: NDArray, save_path: str): writer.writerows(array) +def _beats_to_frames( + beats: np.ndarray, + ref_frame_to_beat: np.ndarray, +) -> np.ndarray: + """Convert beat positions to (float) frame indices via inverse interpolation.""" + frames = np.arange(len(ref_frame_to_beat), dtype=float) + return np.interp(beats, ref_frame_to_beat, frames) + + def plot_alignment( warping_path: np.ndarray, perf_annots: np.ndarray, @@ -533,92 +540,91 @@ def plot_alignment( ref_features: Optional[np.ndarray] = None, input_features: Optional[np.ndarray] = None, distance_func=None, + ref_frame_to_beat: Optional[np.ndarray] = None, ): - """Plot warping path, GT annotations, and predicted points in one 
figure. - - Layers (back to front): distance matrix → warping path → predicted → GT. - """ + """Plot warping path, GT annotations, and predicted points.""" save_dir.mkdir(parents=True, exist_ok=True) gt = np.asarray(perf_annots, dtype=float) pred = np.asarray(perf_annots_predicted, dtype=float) n = min(len(gt), len(pred)) gt, pred = gt[:n], pred[:n] - has_dist_matrix = ( + fig, ax = plt.subplots(figsize=(30, 30)) + + # Distance matrix background + show_dist = False + if ( ref_features is not None and input_features is not None and distance_func is not None - ) + ): + try: + if isinstance(distance_func, str): + dist = scipy.spatial.distance.cdist( + ref_features, input_features, metric=distance_func + ) + else: + dist = np.array( + [ + [distance_func(r, i) for i in input_features] + for r in ref_features + ], + dtype=np.float32, + ) + ax.imshow( + dist, + aspect="auto", + origin="lower", + interpolation="nearest", + extent=(0, input_features.shape[0] - 1, 0, ref_features.shape[0] - 1), + ) + show_dist = True + except Exception: + pass - fig, ax = plt.subplots(figsize=(30, 30)) + # x-axis: performance time in frames + x_gt = gt * float(frame_rate) + wp_x = warping_path[1] - if has_dist_matrix: - # DTW mode: everything in frame space - dist = scipy.spatial.distance.cdist( - ref_features, - input_features, - metric=distance_func, - ) - ax.imshow( - dist, - aspect="auto", - origin="lower", - interpolation="nearest", - extent=(0, input_features.shape[0] - 1, 0, ref_features.shape[0] - 1), - ) - x_gt = gt * float(frame_rate) - x_pred = pred * float(frame_rate) - if score_y is not None: - y = np.asarray(score_y, dtype=float)[:n] * float(frame_rate) - else: - y = np.arange(n) - ylabel = "score (frames)" - wp_x = warping_path[1] - wp_y = warping_path[0] + # y-axis: score position (beats) + wp_in_beats = np.issubdtype(warping_path[0].dtype, np.floating) + if state_space is not None and not wp_in_beats: + wp_y = state_space[warping_path[0]] + elif show_dist and wp_in_beats and 
ref_frame_to_beat is not None: + wp_y = _beats_to_frames(warping_path[0], ref_frame_to_beat) else: - # HMM mode: x in frames, y in beats via state_space - x_gt = gt * float(frame_rate) - x_pred = pred * float(frame_rate) - if score_y is None: - y = np.arange(n) - ylabel = "annotation index" - else: - y = np.asarray(score_y, dtype=float)[:n] - ylabel = "score position (beats)" - wp_x = warping_path[1] - if state_space is not None: - wp_y = state_space[warping_path[0]] - else: - wp_y = warping_path[0] - - # 1. Warping path - if has_dist_matrix: - ax.plot( - wp_x, - wp_y, - ".", - color="white", - alpha=0.7, - markersize=15, - label="warping path", - zorder=2, - ) + wp_y = warping_path[0] + + # GT score positions (y-axis for annotation dots) + if score_y is not None: + y_gt = np.asarray(score_y, dtype=float)[:n] + if show_dist and wp_in_beats and ref_frame_to_beat is not None: + y_gt = _beats_to_frames(y_gt, ref_frame_to_beat) else: - ax.plot( - wp_x, - wp_y, - ".", - color="lime", - alpha=0.5, - markersize=15, - label="warping path", - zorder=2, - ) + y_gt = np.arange(n) - # 2. Predicted points + # Predicted score positions at GT perf times (perf→score direction) + wp_x_sorted = np.asarray(wp_x, dtype=float) + wp_y_sorted = np.asarray(wp_y, dtype=float) + if len(wp_x_sorted) > 1: + y_pred = np.interp(x_gt, wp_x_sorted, wp_y_sorted) + else: + y_pred = y_gt + + # Plot layers + ax.plot( + wp_x, + wp_y, + ".", + color="white" if show_dist else "lime", + alpha=0.7 if show_dist else 0.5, + markersize=15, + label="warping path", + zorder=2, + ) ax.scatter( - x_pred, - y, + x_gt, + y_pred, label="predicted", s=80, alpha=0.9, @@ -627,11 +633,9 @@ def plot_alignment( linewidths=0, zorder=3, ) - - # 3. 
GT annotations (front)
     ax.scatter(
         x_gt,
-        y,
+        y_gt,
         label="ground truth",
         s=120,
         alpha=0.9,
@@ -641,8 +645,26 @@ def plot_alignment(
         zorder=4,
     )
 
+    if show_dist:
+        ax.set_xlim(0, input_features.shape[0] - 1)
+        ax.set_ylim(0, ref_features.shape[0] - 1)
+
+    # Beat tick labels when projected to frame space
+    if show_dist and wp_in_beats and ref_frame_to_beat is not None:
+        finite_beats = ref_frame_to_beat[np.isfinite(ref_frame_to_beat)]
+        if len(finite_beats) > 0:
+            beat_min, beat_max = finite_beats[0], finite_beats[-1]
+        else:
+            beat_min, beat_max = 0.0, 1.0
+        n_ticks = max(2, min(12, int(beat_max - beat_min) + 1))
+        beat_ticks = np.unique(
+            np.round(np.linspace(beat_min, beat_max, n_ticks)).astype(int)
+        )
+        ax.set_yticks(_beats_to_frames(beat_ticks.astype(float), ref_frame_to_beat))
+        ax.set_yticklabels([str(b) for b in beat_ticks])
+
     ax.set_xlabel("performance frame")
-    ax.set_ylabel(ylabel)
+    ax.set_ylabel("score position (beats)")
     ax.set_title(f"[{save_dir.name}] alignment ({name})")
     ax.grid(True, alpha=0.2)
     ax.legend(loc="best")
@@ -664,28 +686,29 @@ def save_debug_results(
     ref_features: Optional[np.ndarray] = None,
     input_features: Optional[np.ndarray] = None,
     distance_func=None,
+    ref_frame_to_beat: Optional[np.ndarray] = None,
 ):
     """Save debug outputs: warping path TSV, results JSON, and alignment plot."""
     save_dir = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
 
-    # 1. Warping path TSV + results JSON
+    # 1. Warping path TSV + results JSON + GT annotations
     save_nparray_to_csv(warping_path.T, (save_dir / f"wp_{run_name}.tsv").as_posix())
+    gt_pairs = np.column_stack([score_annots, perf_annots])
+    save_nparray_to_csv(gt_pairs, (save_dir / f"gt_{run_name}.tsv").as_posix())
     import json
 
     with open(save_dir / f"{run_name}.json", "w") as f:
         json.dump(eval_results, f, indent=4)
 
     # 2. 
Alignment plot - if state_space is not None: - score_y = state_space - else: - sx = np.asarray(score_annots, dtype=float) - score_y = ( - sx - if sx.ndim == 1 and len(sx) == len(perf_annots) and np.all(np.diff(sx) >= 0) - else None - ) + # score_y = beat positions for each annotation (y-axis of the plot) + sx = np.asarray(score_annots, dtype=float) + score_y = ( + sx + if sx.ndim == 1 and len(sx) == len(perf_annots) and np.all(np.diff(sx) >= 0) + else None + ) plot_alignment( warping_path, perf_annots, @@ -698,4 +721,5 @@ def save_debug_results( ref_features=ref_features, input_features=input_features, distance_func=distance_func, + ref_frame_to_beat=ref_frame_to_beat, ) diff --git a/matchmaker/utils/stream.py b/matchmaker/utils/stream.py index ca51389..33753ea 100644 --- a/matchmaker/utils/stream.py +++ b/matchmaker/utils/stream.py @@ -6,11 +6,15 @@ from __future__ import annotations +import threading import time from threading import Thread from types import TracebackType from typing import TYPE_CHECKING, Any, Callable, Optional, Type, Union +STREAM_START = threading.Event # call STREAM_START() to create per-instance event +STREAM_END = object() # put into queue to signal end-of-stream + if TYPE_CHECKING: # pragma: no cover from matchmaker.utils.processor import Processor @@ -43,6 +47,7 @@ def __init__( self.mock = mock self.listen = False self.init_time = None + self.stream_start = STREAM_START() def start_listening(self): """ diff --git a/pyproject.toml b/pyproject.toml index 1dfd23a..005b31d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "pymatchmaker" version = "0.2.1" description = "A package for real-time music alignment" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache 2.0" } keywords = ["music", "alignment", "midi", "audio"] authors = [ @@ -18,6 +18,10 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Programming 
Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Cython", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", @@ -31,10 +35,10 @@ dependencies = [ "python-rtmidi>=1.5.8", "mido>=1.3.2", "numpy>=1.26.3,<2.0", - "scipy>=1.11.4,<1.15", + "scipy>=1.11.4", "librosa>=0.10.1", "pandas>=2.0.0", - "partitura>=1.7.0", + "partitura>=1.8.0", "progressbar2>=4.2.0", "python-hiddenmarkov>=0.1.3", "pyaudio>=0.2.14", diff --git a/run_examples.py b/run_examples.py index 4b13c59..245b1b7 100644 --- a/run_examples.py +++ b/run_examples.py @@ -31,6 +31,12 @@ def main(): group = parser.add_mutually_exclusive_group() group.add_argument("--audio", action="store_true", help="Use audio input mode") group.add_argument("--midi", action="store_true", help="Use MIDI input mode") + parser.add_argument( + "--method", + type=str, + default=None, + help="Score following method (e.g., arzt, dixon, outerhmm, audio_outerhmm)", + ) args = parser.parse_args() input_mode = "midi" if args.midi else "audio" @@ -39,7 +45,10 @@ def main(): print(f"Running matchmaker with the score file ({SCORE_FILE.name})...") print("-" * 50) - method = "outerhmm" if input_mode == "midi" else "arzt" + if args.method is not None: + method = args.method + else: + method = "outerhmm" if input_mode == "midi" else "arzt" # Initialize matchmaker (simulation mode) try: @@ -54,7 +63,7 @@ def main(): return # Run real-time score following - for current_position in mm.run(wait=True): + for current_position in mm.run(): timestamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3] print(f"[{timestamp}] Current beat position: {current_position}") diff --git a/tests/test_io_audio.py b/tests/test_io_audio.py index a313758..f4a55cd 100644 --- a/tests/test_io_audio.py +++ 
b/tests/test_io_audio.py @@ -227,7 +227,11 @@ def test_offline_input(self, mock_stdout=None): print("current time", self.stream.current_time) self.stream.join() - outputs = list(self.stream.queue.queue) + from matchmaker.utils.stream import STREAM_END + + outputs = [ + item for item in self.stream.queue.queue if item is not STREAM_END + ] for _, ftime in outputs: self.assertTrue(isinstance(ftime, float)) diff --git a/tests/test_matchmaker.py b/tests/test_matchmaker.py index 8a7d894..8de2704 100644 --- a/tests/test_matchmaker.py +++ b/tests/test_matchmaker.py @@ -5,6 +5,8 @@ import warnings from pathlib import Path +import numpy as np + from matchmaker import Matchmaker from matchmaker.dp import OnlineTimeWarpingArzt from matchmaker.dp.oltw_dixon import OnlineTimeWarpingDixon @@ -14,6 +16,7 @@ from matchmaker.io.midi import MidiStream from matchmaker.prob.hmm import PitchIOIHMM from matchmaker.prob.outer_product_hmm import OuterProductHMM +from matchmaker.prob.outer_product_hmm_audio import AudioOuterProductHMM warnings.filterwarnings("ignore", module="partitura") warnings.filterwarnings("ignore", module="librosa") @@ -22,21 +25,17 @@ class TestMatchmaker(unittest.TestCase): def setUp(self): # Set up paths to test files - self.score_file = "./tests/resources/Bach-fugue_bwv_858.musicxml" - self.performance_file_audio = "./tests/resources/Bach-fugue_bwv_858.mp3" - self.performance_file_midi = "./tests/resources/Bach-fugue_bwv_858.mid" + self.score_file = "./matchmaker/assets/simple_mozart_k265_var1.musicxml" + self.performance_file_audio = "./matchmaker/assets/simple_mozart_k265_var1.mp3" + self.performance_file_midi = "./matchmaker/assets/simple_mozart_k265_var1.mid" self.performance_file_annotations = ( - "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt" + "./matchmaker/assets/simple_mozart_k265_var1_note_annotations.txt" + ) + self.performance_file_beat_annotations = ( + "./matchmaker/assets/simple_mozart_k265_var1_beat_annotations.txt" ) 
self.test_datasets = [ - { - "name": "bach_fugue_bwv_858", - "score": "./tests/resources/Bach-fugue_bwv_858.musicxml", - "audio": "./tests/resources/Bach-fugue_bwv_858.mp3", - "midi": "./tests/resources/Bach-fugue_bwv_858.mid", - "annotations": "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt", - }, { "name": "simple_mozart_k265_var1", "score": "./matchmaker/assets/simple_mozart_k265_var1.musicxml", @@ -44,6 +43,13 @@ def setUp(self): "midi": "./matchmaker/assets/simple_mozart_k265_var1.mid", "annotations": "./matchmaker/assets/simple_mozart_k265_var1_note_annotations.txt", }, + { + "name": "bach_fugue_bwv_858", + "score": "./tests/resources/Bach-fugue_bwv_858.musicxml", + "audio": "./tests/resources/Bach-fugue_bwv_858.mp3", + "midi": "./tests/resources/Bach-fugue_bwv_858.mid", + "annotations": "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt", + }, ] def test_matchmaker_audio_init(self): @@ -72,6 +78,7 @@ def test_matchmaker_audio_run(self): # When & Then: running the alignment process, the yielded result should be a float values for position_in_beat in mm.run(verbose=False): self.assertIsInstance(position_in_beat, float) + break def test_matchmaker_audio_run_with_result(self): # Given: a Matchmaker instance with audio input @@ -95,12 +102,11 @@ def test_matchmaker_audio_run_with_result(self): def test_matchmaker_audio_run_with_evaluation(self): for dataset in self.test_datasets: - for method in ["arzt", "dixon"]: + for method in ["arzt", "dixon", "audio_outerhmm"]: with self.subTest(dataset=dataset["name"], method=method): mm = Matchmaker( score_file=dataset["score"], performance_file=dataset["audio"], - wait=False, input_type="audio", method=method, ) @@ -116,15 +122,15 @@ def test_matchmaker_audio_run_with_evaluation(self): current_test = f"{dataset['name']}_{method}" results = mm.run_evaluation( dataset["annotations"], - debug=True, - save_dir=Path("./tests/results"), - run_name=current_test, + debug=False, + # 
save_dir=Path("./tests/results"), + # run_name=current_test, ) print(f"[{current_test}] RESULTS: {json.dumps(results, indent=4)}") # Then: the results should at least be 0.5 for threshold in ["300ms", "500ms", "1000ms"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["ms"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_cqt(self): # Given: a Matchmaker instance with audio input @@ -134,7 +140,6 @@ def test_matchmaker_audio_run_with_evaluation_cqt(self): wait=False, input_type="audio", feature_type="cqt", - distance_func="Cosine", method="arzt", ) try: @@ -154,7 +159,7 @@ def test_matchmaker_audio_run_with_evaluation_cqt(self): # Then: the results should at least be 0.5 for threshold in ["300ms", "500ms", "1000ms"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["ms"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_in_beats(self): # Given: a Matchmaker instance with audio input @@ -172,18 +177,14 @@ def test_matchmaker_audio_run_with_evaluation_in_beats(self): mm._has_run = True results = mm.run_evaluation( - "./tests/resources/Bach-fugue_bwv_858_beat_annotations.txt", - level="beat", - debug=True, - save_dir=Path("./tests/results"), - run_name="test_matchmaker_audio_run_with_evaluation_in_beats", + self.performance_file_annotations, domain="score", ) print(f"RESULTS: {json.dumps(results, indent=4)}") # Then: the results should at least be 0.5 for threshold in ["0.3b", "0.5b", "1b"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["beat"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_before_run(self): # Given: a Matchmaker instance with audio input @@ -226,6 +227,47 @@ def test_matchmaker_audio_arzt_init(self): self.assertIsInstance(mm.stream, AudioStream) self.assertIsInstance(mm.score_follower, OnlineTimeWarpingArzt) + def test_matchmaker_audio_outerhmm_init(self): + mm = Matchmaker( + 
score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + method="audio_outerhmm", + ) + + self.assertIsInstance(mm.stream, AudioStream) + self.assertIsInstance(mm.score_follower, AudioOuterProductHMM) + + def test_matchmaker_audio_outerhmm_run(self): + mm = Matchmaker( + score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + method="audio_outerhmm", + ) + + for position_in_beat in mm.run(verbose=False): + self.assertIsInstance(position_in_beat, float) + break + + def test_matchmaker_audio_rtf(self): + for method in ["arzt", "dixon", "audio_outerhmm"]: + with self.subTest(method=method): + mm = Matchmaker( + score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + method=method, + ) + list(mm.run(verbose=False)) + + results = mm.run_evaluation( + self.performance_file_annotations, + ) + self.assertIn("rtf", results) + self.assertGreater(results["rtf"], 0) + self.assertLess(results["rtf"], 1.0) + def test_matchmaker_with_frame_rate(self): # Given: a Matchmaker instance with audio input mm = Matchmaker( @@ -233,12 +275,12 @@ def test_matchmaker_with_frame_rate(self): performance_file=self.performance_file_audio, wait=False, input_type="audio", - frame_rate=100, + frame_rate=50, ) - # Then: the frame rate should be 100 - self.assertEqual(mm.frame_rate, 100) - self.assertEqual(mm.score_follower.frame_rate, 100) + # Then: the frame rate should be 50 + self.assertEqual(mm.frame_rate, 50) + self.assertEqual(mm.score_follower.frame_rate, 50) def test_matchmaker_invalid_input_type(self): # Test Matchmaker with invalid input type @@ -293,10 +335,10 @@ def test_matchmaker_midi_run(self): ) # When & Then: running the alignment process, - # the yielded result should be a float values + # the yielded result should be numeric (int state index for MIDI) for position_in_beat in mm.run(): - self.assertIsInstance(position_in_beat, float) - if 
position_in_beat >= 130: + self.assertIsInstance(position_in_beat, (int, float, np.integer)) + if position_in_beat >= 10: break diff --git a/tests/test_oltw_arzt.py b/tests/test_oltw_arzt.py index 5bf6083..599a02b 100644 --- a/tests/test_oltw_arzt.py +++ b/tests/test_oltw_arzt.py @@ -22,7 +22,7 @@ RNG = np.random.RandomState(1984) -SCIPY_DISTANCES = [ +_ALL_SCIPY_DISTANCES = [ "braycurtis", "canberra", "chebyshev", @@ -36,13 +36,14 @@ "dice", "hamming", "jaccard", - "kulczynski1", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "yule", ] +# Filter to distances available in the installed scipy version +SCIPY_DISTANCES = [d for d in _ALL_SCIPY_DISTANCES if hasattr(sp_distance, d)] class TestOnlineTimeWarpingArzt(unittest.TestCase): diff --git a/tests/test_prob_hmm.py b/tests/test_prob_hmm.py index 9335866..b790820 100644 --- a/tests/test_prob_hmm.py +++ b/tests/test_prob_hmm.py @@ -197,7 +197,7 @@ def test_init(self): "./tests/resources/Bach-fugue_bwv_858_annotations.txt" ) - self.performance = process_audio_offline( - perf_info=self.performance_file_audio, + self.performance = process_midi_offline( + perf_info=self.performance_file_midi, processor=PitchProcessor(piano_range=True), )