diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml index 5cfc5dd..e944c12 100644 --- a/.github/workflows/pypi_publish.yml +++ b/.github/workflows/pypi_publish.yml @@ -26,7 +26,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel env: - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*" CIBW_ARCHS: "x86_64" CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_BEFORE_BUILD: | diff --git a/.github/workflows/test_pypi_publish.yml b/.github/workflows/test_pypi_publish.yml index 8328b29..cbdf340 100644 --- a/.github/workflows/test_pypi_publish.yml +++ b/.github/workflows/test_pypi_publish.yml @@ -28,7 +28,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel env: - CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*" CIBW_ARCHS: "x86_64" CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_BEFORE_BUILD: | diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 6c9bc9a..ff04d20 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: platform: [ubuntu-latest, macos-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] runs-on: ${{ matrix.platform }} steps: diff --git a/README.md b/README.md index 4750e38..e845174 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The full documentation for matchmaker is available online at [readthedocs.org](h ### Prerequisites -- Available Python version: 3.12 +- Available Python version: 3.10, 3.11, 3.12, 3.13 - [Fluidsynth](https://www.fluidsynth.org/) - [PortAudio](http://www.portaudio.com/) @@ -121,6 +121,7 @@ from matchmaker import Matchmaker mm = Matchmaker( score_file="path/to/score", + input_type="audio", device_name_or_index="MacBookPro Microphone", ) for current_position in mm.run(): @@ -132,18 +133,18 @@ for current_position in mm.run(): The repository includes a 
ready-to-use example script that demonstrates the complete workflow: ```bash -# Run with audio input (default) -python run_examples.py +# Run with audio input (uses arzt method as default) +python run_examples.py --audio -# Run with MIDI input -python run_examples.py --midi +# Run with MIDI input and specific method +python run_examples.py --midi --method hmm ``` This script runs a complete example with score following and evaluation, saving results to the `results/` directory. ### Testing with Different Methods or Features -For testing with Audio input, you can specify the alignment method as follows: +You can specify the alignment method and feature processor as follows: ```python from matchmaker import Matchmaker @@ -151,37 +152,42 @@ from matchmaker import Matchmaker mm = Matchmaker( score_file="path/to/score", input_type="audio", - method="dixon", # or "arzt" (default) + method="arzt", # see Alignment Methods section + processor="chroma", # see Features section ) for current_position in mm.run(): print(current_position) ``` For options regarding the `method`, please refer to the [Alignment Methods](#alignment-methods) section. -For options regarding the `feature_type`, please refer to the [Features](#features) section. +For options regarding the `processor`, please refer to the [Features](#features) section. ## Alignment Methods Matchmaker currently supports the following alignment methods: -- `"dixon"`: On-line time warping algorithm by S. Dixon (2005). Currently supports audio input; MIDI support coming soon. -- `"arzt"`: On-line time warping algorithm adapted from Brazier and Widmer (2020) (based on the work by Arzt et al. (2010)). Currently supports audio input; MIDI support coming soon. -- `"hmm"`: Hidden Markov Model-based score follower by Cancino-Chacón et al. (2023), based on the state-space score followers by Duan et al. (2011) and Jiang and Raphael (2020). Currently supports MIDI input; Audio support coming soon. 
+- `"arzt"`: On-line time warping algorithm adapted from Brazier and Widmer (2020) (based on the work by Arzt et al. (2010)). Supports audio and MIDI input. +- `"dixon"`: On-line time warping algorithm by S. Dixon (2005). Supports audio and MIDI input. +- `"outerhmm"`: Outer-product HMM score follower by E. Nakamura (2014). Supports audio and MIDI input. +- `"hmm"`: Hidden Markov Model-based score follower by Cancino-Chacón et al. (2023), based on the state-space score followers by Duan et al. (2011) and Jiang and Raphael (2020). Supports MIDI input. +- `"pthmm"`: Pitch-based HMM score follower. Supports MIDI input. ## Features Matchmaker currently supports the following feature types: - For audio: - - `"chroma"`: Chroma features. Default feature type for audio input. + - `"chroma"`: Chroma features. Default for audio input. - `"mfcc"`: Mel-frequency cepstral coefficients. - - `"mel"`: Mel-Spectrogram. - - `"logspectral"`: Log-spectral features used in Dixon (2005). + - `"cqt"`: Constant-Q transform. + - `"mel"`: Mel-spectrogram. + - `"lse"`: Log-spectral energy features used in Dixon (2005). + - `"cqt_spectral_flux"`: CQT-based spectral flux used in Nakamura et al. (2014). - For MIDI: - - `pianoroll`: Piano-roll features. Default feature type for MIDI input. - - `"pitch"`: Pitch features for MIDI input. - - `"pitchclass"`: Pitch class features for MIDI input. + - `"pitch_ioi"`: Pitch and inter-onset interval features. Default for MIDI input. + - `"pianoroll"`: Piano-roll features. + - `"pitchclass"`: Pitch class features. ## Configurations @@ -189,10 +195,8 @@ Initialization parameters for the `Matchmaker` class: - `score_file` (str): Path to the score file. - `input_type` (str): Type of input data. Options: `"audio"`, `"midi"`. -- `feature_type` (str): Type of feature to use. Options: `"chroma"`, `"mfcc"`, `"cqt"`, `"spectrogram"`, `"onset"`. -- `method` (str): Alignment method to use. Options: `"dixon"`, `"arzt"`, `"hmm"`. 
-- `sample_rate` (int): Sample rate of the input audio data. -- `frame_rate` (int): Frame rate of the input audio/MIDI data. +- `method` (str): Alignment method to use. See [Alignment Methods](#alignment-methods) for available options. +- `processor` (str): Type of feature processor to use. See [Features](#features) for available options. - `device_name_or_index` (str or int): The audio/MIDI device name or index you want to use. If `None`, the default device will be used. ## Citing Matchmaker diff --git a/matchmaker/dp/__init__.py b/matchmaker/dp/__init__.py index 3903e5f..4a063d3 100644 --- a/matchmaker/dp/__init__.py +++ b/matchmaker/dp/__init__.py @@ -1,12 +1,20 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -Top module for dynamic programming-based alignment methods -""" +Dynamic programming-based alignment methods. -from .oltw_arzt import OnlineTimeWarpingArzt -from .oltw_dixon import OnlineTimeWarpingDixon +Each algorithm has a base class and two variants: + - Frame: fixed-rate features (audio) + - Event: onset-level features (MIDI) +""" -# Alias -OLTWArzt = OnlineTimeWarpingArzt -OLTWDixon = OnlineTimeWarpingDixon +from .oltw_arzt import ( + OnlineTimeWarpingArzt, + OnlineTimeWarpingArztEvent, + OnlineTimeWarpingArztFrame, +) +from .oltw_dixon import ( + OnlineTimeWarpingDixon, + OnlineTimeWarpingDixonEvent, + OnlineTimeWarpingDixonFrame, +) diff --git a/matchmaker/dp/oltw_arzt.py b/matchmaker/dp/oltw_arzt.py index a6749cf..422692d 100644 --- a/matchmaker/dp/oltw_arzt.py +++ b/matchmaker/dp/oltw_arzt.py @@ -1,7 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -On-line Dynamic Time Warping +On-line Time Warping (OLTWArzt) + +OLTW with step-size constraint and adaptive window, based on: + Arzt & Widmer (2010) "Simple Tempo Models for Real-Time Music Tracking" + Arzt & Widmer (2015) "Real-Time Music Tracking Using Multiple Performances + as a Reference" + +Classes: + OnlineTimeWarpingArzt — base class (common properties, step-size clamp, run loop) + 
OnlineTimeWarpingArztFrame — frame-level variant for audio (Cython-accelerated) + OnlineTimeWarpingArztEvent — event-level variant for MIDI (onset-by-onset) """ import time @@ -9,80 +19,226 @@ import numpy as np import progressbar +import scipy.spatial.distance from numpy.typing import NDArray from matchmaker.base import OnlineAlignment from matchmaker.dp.dtw_loop import oltw_arzt_loop -from matchmaker.features.audio import FRAME_RATE, QUEUE_TIMEOUT +from matchmaker.features.audio import FRAME_RATE +from matchmaker.io.audio import QUEUE_TIMEOUT +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END from matchmaker.utils import ( CYTHONIZED_METRICS_W_ARGUMENTS, CYTHONIZED_METRICS_WO_ARGUMENTS, distances, ) from matchmaker.utils.distances import Metric, vdist -from matchmaker.utils.misc import ( +from matchmaker.utils.errors import ( MatchmakerInvalidOptionError, MatchmakerInvalidParameterTypeError, - RECVQueue, - set_latency_stats, ) +from matchmaker.utils.misc import set_latency_stats + +STEP_SIZE: int = 3 +START_WINDOW_SIZE: Union[float, int] = 0.1 +WINDOW_SIZE: int = 10 + -STEP_SIZE: int = 5 -WINDOW_SIZE: int = 5 -START_WINDOW_SIZE: Union[float, int] = 0.25 +# --------------------------------------------------------------------------- +# Base class +# --------------------------------------------------------------------------- class OnlineTimeWarpingArzt(OnlineAlignment): - """ - Fast On-line Time Warping + """Base class for Arzt-style OLTW with step-size constraint. + + Subclasses must implement ``step()`` and may override ``reset()``, + ``get_window()``, and ``_init_distance_func()``. Parameters ---------- reference_features : np.ndarray - A 2D array with dimensions (`n_timesteps`, `n_features`) containing the - features of the reference the input is going to be aligned to. - + Feature matrix for the reference (score) sequence. window_size : int - Size of the window for searching the optimal path in the cumulative - cost matrix. 
- + Search window size (interpretation depends on subclass). step_size : int - Size of the step + Maximum position advance per input step. + start_window_size : int + Window size during warmup. + queue : RECVQueue or None + Input queue for streaming. + state_space : np.ndarray or None + Score positions (beats) for each reference element. + """ + + def __init__( + self, + reference_features: NDArray[np.float32], + window_size=WINDOW_SIZE, + step_size: int = STEP_SIZE, + start_window_size=START_WINDOW_SIZE, + queue: Optional[RECVQueue] = None, + state_space=None, + **kwargs, + ) -> None: + super().__init__(reference_features=reference_features) + self.N_ref: int = self.reference_features.shape[0] + self.queue = queue or RECVQueue() + self.step_size: int = step_size + self.state_space = state_space + self._ref_frame_to_beat: Optional[NDArray] = kwargs.get( + "ref_frame_to_beat", None + ) + # Subclasses set self._window_size and self._start_window_size + # after calling super().__init__ + self.init_position: int = kwargs.get("current_position", 0) + self.last_queue_update = time.time() + self.latency_stats: Dict[str, float] = { + "total_latency": 0, + "total_frames": 0, + "max_latency": 0, + "min_latency": float("inf"), + } + # Legacy attributes (used by some callers) + self.state_to_ref_time_map = kwargs.get("state_to_ref_time_map", None) + self.ref_to_state_time_map = kwargs.get("ref_to_state_time_map", None) - distance_func : str, tuple (str, dict) or callable - Distance function for computing pairwise distances. + self.reset() - start_window_size: int - Size of the starting window size. 
+ def reset(self) -> None: + self.current_position: int = self.init_position + self.positions: List[int] = [] + self._warping_path: List = [] + self.global_cost_matrix: NDArray[np.float32] = np.full( + (self.N_ref + 1, 2), np.inf, dtype=np.float32 + ) + self.input_index: int = 0 + self.input_features = None - Attributes - ---------- - reference_features : np.ndarray - See description above. + @property + def current_beat(self) -> float: + if self._ref_frame_to_beat is not None: + idx = min(self.current_position, len(self._ref_frame_to_beat) - 1) + return float(self._ref_frame_to_beat[idx]) + if self.state_space is not None: + idx = min(self.current_position, len(self.state_space) - 1) + return float(self.state_space[idx]) + return float(self.current_position) - window_size : int - See description above. + @property + def warping_path(self) -> NDArray[np.float32]: + if self._warping_path: + return np.array(self._warping_path).T + return np.empty((2, 0)) - step_size : int - See description above. + @property + def window_index(self) -> int: + return self.current_position - input_features : list - List with the input features (updates every time there is a step). + def __call__(self, input: NDArray[np.float32]) -> int: + self.step(input) + return self.current_position - current_position : int - Index of the current position. + def is_still_following(self) -> bool: + return self.current_position < self.N_ref - 1 - warping_path : list - List of tuples containing the current position and the corresponding - index in the array of `reference_features`. + def get_window(self) -> Tuple[int, int]: + """Return (start, end) of the search window.""" + raise NotImplementedError - positions : list - List of the positions for each input. 
+ def step(self, input_features: NDArray[np.float32]) -> None: + """Process one input element and update position.""" + raise NotImplementedError + + def _clamp_position(self, min_index: int) -> None: + """Apply step-size constraint: no backwards, max step_size forward.""" + if self.input_index > 0: + self.current_position = int( + np.clip( + min_index, + self.current_position, + self.current_position + self.step_size, + ) + ) + else: + self.current_position = min_index + + def run(self, verbose: bool = True) -> Generator[float, None, NDArray]: + """Run alignment from queue. + + Yields + ------ + float + Current score position in beats. + + Returns + ------- + np.ndarray + Warping path (2, T). + """ + self.reset() + + if verbose: + n = len(self.state_space) if self.state_space is not None else self.N_ref + pbar = progressbar.ProgressBar( + max_value=n, + redirect_stdout=True, + redirect_stderr=True, + ) + pbar.start() + + while self.is_still_following(): + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + if item is None: + continue + features = item[0] if isinstance(item, tuple) else item + self.last_queue_update = time.time() + + # Frame subclass accumulates input_features for plotting + if hasattr(self, "_accumulate_input") and self._accumulate_input: + self.input_features = ( + np.concatenate((self.input_features, features)) + if self.input_features is not None + else features + ) + + self.step(features) + + if verbose: + pbar.update( + int(np.searchsorted(self.state_space, self.current_beat)) + if self.state_space is not None + else min(self.current_position, self.N_ref - 1) + ) + + latency = time.time() - self.last_queue_update + self.latency_stats = set_latency_stats( + latency, self.latency_stats, self.input_index + ) + yield self.current_beat + + if verbose: + pbar.finish() + + return self.warping_path + + +# --------------------------------------------------------------------------- +# Frame-level subclass (audio) +# 
--------------------------------------------------------------------------- + + +class OnlineTimeWarpingArztFrame(OnlineTimeWarpingArzt): + """Frame-level OLTW for audio input. + + Uses Cython-accelerated ``oltw_arzt_loop`` and Matchmaker distance + functions. Window size is in seconds (converted to frames via frame_rate). """ DEFAULT_DISTANCE_FUNC: str = "Manhattan" - distance_func: Callable[[NDArray[np.float32]], NDArray[np.float32]] - vdist: Callable[[NDArray[np.float32]], NDArray[np.float32]] def __init__( self, @@ -90,24 +246,38 @@ def __init__( window_size: int = WINDOW_SIZE, step_size: int = STEP_SIZE, distance_func: Union[ - str, - Callable, - Tuple[str, Dict[str, Any]], + str, Callable, Tuple[str, Dict[str, Any]] ] = DEFAULT_DISTANCE_FUNC, - start_window_size: int = START_WINDOW_SIZE, + start_window_size: Union[float, int] = START_WINDOW_SIZE, current_position: int = 0, frame_rate: int = FRAME_RATE, queue: Optional[RECVQueue] = None, - state_to_ref_time_map = None, - ref_to_state_time_map = None, - state_space = None, + state_to_ref_time_map=None, + ref_to_state_time_map=None, + state_space=None, **kwargs, ) -> None: - super().__init__(reference_features=reference_features) + super().__init__( + reference_features=reference_features, + window_size=window_size, + step_size=step_size, + start_window_size=start_window_size, + queue=queue, + state_space=state_space, + current_position=current_position, + state_to_ref_time_map=state_to_ref_time_map, + ref_to_state_time_map=ref_to_state_time_map, + **kwargs, + ) + self.frame_rate = frame_rate + self._window_size = int(np.round(window_size * self.frame_rate)) + self._start_window_size = int(np.round(start_window_size * frame_rate)) + self._accumulate_input = True - self.input_features: List[NDArray[np.float32]] = None - self.queue = queue or RECVQueue() + # Set distance function (Cython or callable) + self._init_distance_func(distance_func) + def _init_distance_func(self, distance_func): if not 
(isinstance(distance_func, (str, tuple)) or callable(distance_func)): raise MatchmakerInvalidParameterTypeError( parameter_name="distance_func", @@ -115,7 +285,6 @@ def __init__( actual_parameter_type=type(distance_func), ) - # Set local cost function if isinstance(distance_func, str): if distance_func not in CYTHONIZED_METRICS_WO_ARGUMENTS: raise MatchmakerInvalidOptionError( @@ -123,9 +292,7 @@ def __init__( valid_options=CYTHONIZED_METRICS_WO_ARGUMENTS, value=distance_func, ) - # If distance_func is a string self.distance_func = getattr(distances, distance_func)() - elif isinstance(distance_func, tuple): if distance_func[0] not in CYTHONIZED_METRICS_W_ARGUMENTS: raise MatchmakerInvalidOptionError( @@ -133,146 +300,32 @@ def __init__( valid_options=CYTHONIZED_METRICS_W_ARGUMENTS, value=distance_func[0], ) - # distance_func is a tuple with the arguments to instantiate - # the cost self.distance_func = getattr(distances, distance_func[0])( **distance_func[1] ) - elif callable(distance_func): - # If the local cost is a callable self.distance_func = distance_func - # A callable to compute the distance between the rows of matrix and a vector if isinstance(self.distance_func, Metric): self.vdist = vdist else: - # TODO: Speed this up somehow instead of list comprehension self.vdist = lambda X, y, lcf: np.array([lcf(x, y) for x in X]).astype( np.float32 ) - self.N_ref: int = self.reference_features.shape[0] - self.frame_rate = frame_rate - self.window_size: int = window_size * self.frame_rate - self.step_size: int = step_size - self.start_window_size: int = int(np.round(start_window_size * frame_rate)) - self.init_position: int = current_position - self.current_position: int = current_position - self.positions: List[int] = [] - self._warping_path: List = [] - self.global_cost_matrix: NDArray[np.float32] = ( - np.ones((reference_features.shape[0] + 1, 2), dtype=np.float32) * np.inf - ).astype(np.float32) - self.input_index: int = 0 - self.go_backwards: bool = False - 
self.update_window_index: bool = False - self.restart: bool = False - self.is_following: bool = False - self.last_queue_update = time.time() - self.latency_stats: Dict[str, float] = { - "total_latency": 0, - "total_frames": 0, - "max_latency": 0, - "min_latency": float("inf"), - } - self.state_to_ref_time_map = state_to_ref_time_map - self.ref_to_state_time_map = ref_to_state_time_map - self.state_space = state_space #if state_space != None else np.unique(self.reference_features.note_array()["onset_beat"]) - - @property - def warping_path(self) -> NDArray[np.int32]: - wp = (np.array(self._warping_path).T).astype(np.int32) - return wp - - def __call__(self, input: NDArray[np.float32]) -> int: - self.step(input) - return self.current_position - - def run(self, verbose: bool = True) -> Generator[int, None, NDArray[np.float32]]: - """Run the online alignment process. - - Parameters - ---------- - verbose : bool, optional - Whether to show progress bar, by default True - - Yields - ------ - int - Current position in the reference sequence - - Returns - ------- - NDArray[np.float32] - The warping path as a 2D array where each column contains - (reference_position, input_position) - """ - self.reset() - - if verbose: - pbar = progressbar.ProgressBar(max_value=self.N_ref, redirect_stdout=True) - - while self.is_still_following(): - features, f_time = self.queue.get(timeout=QUEUE_TIMEOUT) - self.last_queue_update = time.time() - self.input_features = ( - np.concatenate((self.input_features, features)) - if self.input_features is not None - else features - ) - self.step(features) - - if verbose: - pbar.update(int(self.current_position)) - - latency = time.time() - self.last_queue_update - self.latency_stats = set_latency_stats( - latency, self.latency_stats, self.input_index - ) - yield self.current_position - - if verbose: - pbar.finish() - - return self.warping_path - - def is_still_following(self): - # TODO: check stopping if the follower is stuck. 
- return self.current_position <= self.N_ref - 2 - - def reset(self) -> None: - self.current_position = self.init_position - self.positions = [] - self._warping_path: List = [] - self.global_cost_matrix = ( - np.ones((self.reference_features.shape[0] + 1, 2), dtype=np.float32) - * np.inf - ) - self.input_index = 0 - self.update_window_index = False - def get_window(self) -> Tuple[int, int]: - w_size = self.window_size - if self.window_index < self.start_window_size: - w_size = self.start_window_size - window_start = max(self.window_index - w_size, 0) - window_end = min(self.window_index + w_size, self.N_ref) - return window_start, window_end - - @property - def window_index(self) -> int: - return self.current_position + w = self._window_size + if self.window_index < self._start_window_size: + w = self._start_window_size + start = max(self.window_index - w, 0) + end = min(self.window_index + w, self.N_ref) + return start, end def step(self, input_features: NDArray[np.float32]) -> None: - """ - Update the current position and the warping path. - """ min_costs = np.inf min_index = max(self.window_index - self.step_size, 0) window_start, window_end = self.get_window() - # compute local cost beforehand as it is much faster (~twice as fast) window_cost = self.vdist( self.reference_features[window_start:window_end], input_features.squeeze(), @@ -289,25 +342,139 @@ def step(self, input_features: NDArray[np.float32]) -> None: min_index=min_index, ) - # adapt current_position: do not go backwards, - # but also go a maximum of N steps forward + self._clamp_position(min_index) + self._warping_path.append((self.current_beat, self.input_index)) + self.input_index += 1 + + +# --------------------------------------------------------------------------- +# Event-level subclass (MIDI) +# --------------------------------------------------------------------------- + + +class OnlineTimeWarpingArztEvent(OnlineTimeWarpingArzt): + """Event-level OLTW for MIDI input. 
+ + Each step processes one onset event. Uses scipy for distance computation + and a pure-Python rolling DP loop. Window size is in number of events. - if self.input_index == 0: - # enforce the first time step to stay at the - # initial position - self.current_position = min( # TODO: Is this necessary? - max(self.current_position, min_index), - self.current_position, + When queue provides ``(feature, timestamp)`` tuples, the warping path + records ``(score_beat, perf_time_sec)`` instead of ``(beat, input_index)``. + """ + + def __init__( + self, + reference_features: NDArray[np.float32], + window_size: int = 30, + step_size: int = 5, + start_window_size: int = 5, + distance_func: str = "cosine", + queue: Optional[RECVQueue] = None, + state_space=None, + **kwargs, + ) -> None: + super().__init__( + reference_features=reference_features, + window_size=window_size, + step_size=step_size, + start_window_size=start_window_size, + queue=queue, + state_space=state_space, + **kwargs, + ) + self._window_size = window_size + self._start_window_size = start_window_size + self._scipy_metric = distance_func + + def get_window(self) -> Tuple[int, int]: + w = self._window_size + if self.input_index < self._start_window_size: + w = self._start_window_size + start = max(self.window_index - self.step_size, 0) + end = min(start + w, self.N_ref) + return start, end + + def step( + self, input_features: NDArray[np.float32], perf_time: float = None + ) -> None: + feat = np.asarray(input_features, dtype=np.float32).squeeze() + + window_start, window_end = self.get_window() + window_cost = scipy.spatial.distance.cdist( + self.reference_features[window_start:window_end], + feat.reshape(1, -1), + metric=self._scipy_metric, + ).flatten() + + # Rolling 2-column DP (same logic as oltw_arzt_loop) + min_costs = np.inf + min_index = max(self.window_index - self.step_size, 0) + + for idx_w, score_index in enumerate(range(window_start, window_end)): + si = score_index + 1 # 1-indexed in cost matrix 
+ + if score_index == 0 and self.input_index == 0: + self.global_cost_matrix[1, 1] = window_cost[idx_w] + min_costs = window_cost[idx_w] + min_index = 0 + continue + + local_dist = window_cost[idx_w] + d1 = self.global_cost_matrix[si - 1, 1] + local_dist + d2 = self.global_cost_matrix[si, 0] + local_dist + d3 = self.global_cost_matrix[si - 1, 0] + local_dist + best = min(d1, d2, d3) + self.global_cost_matrix[si, 1] = best + + norm_cost = best / (self.input_index + score_index + 1.0) + if norm_cost < min_costs: + min_costs = norm_cost + min_index = score_index + + # Shift columns + self.global_cost_matrix[:, 0] = self.global_cost_matrix[:, 1] + self.global_cost_matrix[:, 1] = np.inf + + self._clamp_position(min_index) + t = perf_time if perf_time is not None else self.input_index + self._warping_path.append((self.current_beat, t)) + self.input_index += 1 + + def run(self, verbose: bool = True) -> Generator[float, None, NDArray]: + """Run from queue. Extracts timestamp from (feature, time) tuples.""" + self.reset() + if verbose: + pbar = progressbar.ProgressBar( + max_value=self.N_ref, + redirect_stdout=True, + redirect_stderr=True, ) - else: - self.current_position = min( - max(self.current_position, min_index), - self.current_position + self.step_size, + pbar.start() + + while self.is_still_following(): + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + if item is None: + continue + if isinstance(item, tuple): + features, perf_time = item[0], float(item[1]) + else: + features, perf_time = item, None + self.last_queue_update = time.time() + self.step(features, perf_time=perf_time) + + if verbose: + pbar.update(min(self.current_position, self.N_ref - 1)) + latency = time.time() - self.last_queue_update + self.latency_stats = set_latency_stats( + latency, self.latency_stats, self.input_index ) + yield self.current_beat - self._warping_path.append((self.current_position, self.input_index)) - # update input index - self.input_index += 1 + 
if verbose: + pbar.finish() + return self.warping_path if __name__ == "__main__": diff --git a/matchmaker/dp/oltw_dixon.py b/matchmaker/dp/oltw_dixon.py index 8c55a05..0fb1821 100644 --- a/matchmaker/dp/oltw_dixon.py +++ b/matchmaker/dp/oltw_dixon.py @@ -1,20 +1,34 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -On-line Dynamic Time Warping (Dixon 2005) +On-line Time Warping (OLTWDixon) + +OLTW with backward-forward search, based on: + Dixon (2005) "An On-Line Time Warping Algorithm for Tracking Musical + Performances" (IJCAI) + Dixon (2005) "Live Tracking of Musical Performances Using On-Line Time + Warping" (DAFx) + +Classes: + OnlineTimeWarpingDixon — base class (common properties, run loop) + OnlineTimeWarpingDixonFrame — frame-level variant for audio (backward-forward search) + OnlineTimeWarpingDixonEvent — event-level variant for MIDI (onset-by-onset) """ import time from enum import IntEnum -from typing import Dict +from typing import Dict, Generator, Optional import numpy as np import progressbar -import scipy +import scipy.spatial.distance from numpy.typing import NDArray from matchmaker.base import OnlineAlignment -from matchmaker.features.audio import FRAME_RATE, QUEUE_TIMEOUT +from matchmaker.features.audio import FRAME_RATE +from matchmaker.io.audio import QUEUE_TIMEOUT +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END from matchmaker.utils.misc import set_latency_stats @@ -25,78 +39,154 @@ class Direction(IntEnum): MAX_RUN_COUNT: int = 3 -FRAME_PER_SEG = 1 -WINDOW_SIZE = 10 # seconds +WINDOW_SIZE = 10 # seconds for frame, events for event + + +# --------------------------------------------------------------------------- +# Base class +# --------------------------------------------------------------------------- class OnlineTimeWarpingDixon(OnlineAlignment): - """ - On-line Dynamic Time Warping (Dixon) + """Base class for Dixon-style OLTW. 
+ + Subclasses must implement ``step()``, ``reset()``, ``is_still_following()``, + and the ``current_beat`` / ``warping_path`` properties. Parameters ---------- reference_features : np.ndarray - A 2D array with dimensions (`F(n_features)`, `T(n_timesteps)`) containing the - features of the reference the input is going to be aligned to. + Feature matrix for the reference (score) sequence. + queue : RECVQueue or None + Input queue for streaming. + max_run_count : int + Max consecutive same-direction advances. + distance_func : str + Distance metric. + state_space : np.ndarray or None + Score positions (beats). + """ - distance_func : str, tuple (str, dict) or callable - Distance function for computing pairwise distances. + DEFAULT_DISTANCE_FUNC: str = "euclidean" - window_size : int - Size of the window for searching the optimal path in the cumulative cost matrix. + def __init__( + self, + reference_features, + queue=None, + max_run_count=MAX_RUN_COUNT, + distance_func=DEFAULT_DISTANCE_FUNC, + state_space=None, + **kwargs, + ): + super().__init__(reference_features=reference_features) + self.queue = queue + self.N_ref = self.reference_features.shape[0] + self.max_run_count = max_run_count + self.distance_func = ( + distance_func if isinstance(distance_func, str) else "euclidean" + ) + self.state_space = state_space + self._ref_frame_to_beat = kwargs.get("ref_frame_to_beat", None) + self.state_to_ref_time_map = kwargs.get("state_to_ref_time_map", None) + self.ref_to_state_time_map = kwargs.get("ref_to_state_time_map", None) + self.current_position = 0 + self.input_index = 0 + self.last_queue_update = time.time() + self.latency_stats: Dict[str, float] = { + "total_latency": 0, + "total_frames": 0, + "max_latency": 0, + "min_latency": float("inf"), + } - max_run_count : int - Maximum consecutive advances in one direction before the other - direction is forced. Constrains the slope to [1/max_run_count, max_run_count]. 
+ def __call__(self, input_features: NDArray[np.float32]) -> int: + self.step(input_features) + return self.current_position - frame_per_seg : int - Number of frames per segment (audio chunk). + def run(self, verbose=True) -> Generator[float, None, NDArray]: + """Run alignment from queue.""" + self.reset() - frame_rate : int - Frame rate of the audio stream. + if verbose: + n = len(self.state_space) if self.state_space is not None else self.N_ref + pbar = progressbar.ProgressBar( + max_value=n, + redirect_stdout=True, + redirect_stderr=True, + ) + pbar.start() - Attributes - ---------- - warping_path : np.ndarray [shape=(2, T)] - Warping path. ``warping_path[0]`` = reference index, - ``warping_path[1]`` = input index. - """ + while self.is_still_following(): + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + if item is None: + continue + input_feature = item[0] if isinstance(item, tuple) else item + self.last_queue_update = time.time() + self.step(input_feature) - DEFAULT_DISTANCE_FUNC: str = "euclidean" - distance_func: str + if verbose: + pbar.update( + int(np.searchsorted(self.state_space, self.current_beat)) + if self.state_space is not None + else min(self.current_position, self.N_ref - 1) + ) + + latency = time.time() - self.last_queue_update + self.latency_stats = set_latency_stats( + latency, self.latency_stats, self.input_index + ) + yield self.current_beat + + if verbose: + pbar.finish() + + return self.warping_path + + +# --------------------------------------------------------------------------- +# Frame-level subclass (audio) +# --------------------------------------------------------------------------- + + +class OnlineTimeWarpingDixonFrame(OnlineTimeWarpingDixon): + """Frame-level OLTW with backward-forward search for audio input. + + Window size is in seconds (converted to frames via frame_rate). 
+ """ def __init__( self, reference_features, - queue, + queue=None, window_size=WINDOW_SIZE, - distance_func=DEFAULT_DISTANCE_FUNC, + distance_func=OnlineTimeWarpingDixon.DEFAULT_DISTANCE_FUNC, max_run_count=MAX_RUN_COUNT, - frame_per_seg=FRAME_PER_SEG, frame_rate=FRAME_RATE, - state_to_ref_time_map = None, - ref_to_state_time_map = None, - state_space = None, + state_to_ref_time_map=None, + ref_to_state_time_map=None, + state_space=None, **kwargs, ): - super().__init__(reference_features=reference_features) - self.queue = queue - self.N_ref = self.reference_features.shape[0] + super().__init__( + reference_features=reference_features, + queue=queue, + max_run_count=max_run_count, + distance_func=distance_func, + state_space=state_space, + state_to_ref_time_map=state_to_ref_time_map, + ref_to_state_time_map=ref_to_state_time_map, + **kwargs, + ) self.frame_rate = frame_rate self.w = int(window_size * self.frame_rate) - self.distance_func = distance_func.lower() - self.max_run_count = max_run_count - self.frame_per_seg = frame_per_seg - self.state_to_ref_time_map = state_to_ref_time_map - self.ref_to_state_time_map = ref_to_state_time_map - self.state_space = state_space self.reset() def reset(self): - """Reset mutable state for a new alignment run.""" self.input_features = np.empty((0, self.reference_features.shape[1])) self.acc_dist_matrix = np.full((self.w, self.w), np.inf, dtype=np.float32) - self.wp = np.array([[0, 0]]).T # [shape=(2, T)] + self.wp = np.array([[0, 0]]).T self.ref_pointer = 0 self.input_pointer = 0 self.run_count_ref = 0 @@ -106,7 +196,7 @@ def reset(self): self.current_position = 0 self.input_index = 0 self.last_queue_update = time.time() - self.latency_stats: Dict[str, float] = { + self.latency_stats = { "total_latency": 0, "total_frames": 0, "max_latency": 0, @@ -115,9 +205,21 @@ def reset(self): self._initialized = False @property - def warping_path(self) -> NDArray[np.float32]: # [shape=(2, T)] + def current_beat(self) -> float: + if 
self._ref_frame_to_beat is not None: + idx = min(self.best_ref, len(self._ref_frame_to_beat) - 1) + return float(self._ref_frame_to_beat[idx]) + return float(self.best_ref) + + @property + def warping_path(self) -> NDArray[np.float32]: return self.wp + def is_still_following(self): + return self.ref_pointer <= (self.N_ref - 1) + + # ---- internal methods ---- + def _ref_offset(self): return max(0, self.ref_pointer - self.w) @@ -131,24 +233,13 @@ def _input_win_size(self): return min(self.w, self.input_pointer) def offset(self): - """Calculate the offset for the current window position.""" return np.array([self._ref_offset(), self._input_offset()]) def _compute_cell(self, r_win, i_win): - """ - Compute DTW accumulated cost for cell (r_win, i_win) in window coordinates. - - Uses Sakoe-Chiba symmetric weighting (DAFx 2005, Eq. 3): - D[i,j] = min(D[i-1,j] + d, D[i,j-1] + d, D[i-1,j-1] + 2d) - - Matrix convention: acc_dist_matrix[ref_win, input_win] - """ abs_ref = self._ref_offset() + r_win abs_input = self._input_offset() + i_win - ref_feature = self.reference_features[abs_ref] input_feature = self.input_features[abs_input] - local_dist = scipy.spatial.distance.cdist( ref_feature.reshape(1, -1), input_feature.reshape(1, -1), @@ -166,145 +257,85 @@ def _compute_cell(self, r_win, i_win): self.acc_dist_matrix[r_win - 1, i_win] + local_dist ) else: - candidates = [ + self.acc_dist_matrix[r_win, i_win] = min( self.acc_dist_matrix[r_win - 1, i_win] + local_dist, self.acc_dist_matrix[r_win, i_win - 1] + local_dist, self.acc_dist_matrix[r_win - 1, i_win - 1] + 2 * local_dist, - ] - self.acc_dist_matrix[r_win, i_win] = min(candidates) + ) def _accept_input(self, input_features): - """Store input features and advance input pointer.""" self.input_features = np.vstack([self.input_features, input_features]) - self.input_pointer += self.frame_per_seg + self.input_pointer += 1 def _compute_input_column(self): - """Shift the window left if full, then fill the new rightmost column.""" 
if self.input_pointer > self.w: self.acc_dist_matrix[:, :-1] = self.acc_dist_matrix[:, 1:] self.acc_dist_matrix[:, -1] = np.inf - input_col = self._input_win_size() - 1 for r in range(self._ref_win_size()): self._compute_cell(r, input_col) def _advance_ref(self): - """Advance ref pointer, shift the window up if full, then fill the new bottom row.""" - self.ref_pointer += self.frame_per_seg - + self.ref_pointer += 1 if self.ref_pointer > self.w: self.acc_dist_matrix[:-1, :] = self.acc_dist_matrix[1:, :] self.acc_dist_matrix[-1, :] = np.inf - ref_row = self._ref_win_size() - 1 for c in range(self._input_win_size()): self._compute_cell(ref_row, c) def _determine_direction(self): - """Apply MaxRunCount slope constraint to the suggested direction""" - suggested_direction = self.get_expand_direction() - + suggested = self._get_expand_direction() if self.run_count_ref >= self.max_run_count: return Direction.TARGET elif self.run_count_input >= self.max_run_count: return Direction.REF - else: - return suggested_direction - - def get_expand_direction(self): - """ - Determine expansion direction (GetInc from Dixon 2005, IJCAI Fig. 1). - - Searches the L-shaped boundary of the sliding window — the last ref row - and the last input column — for the cell with minimum normalised - accumulated cost. The direction is chosen based on where that minimum - falls: - - Corner (last row AND last col) → BOTH - - In the column (not at last row) → TARGET (advance input to catch up) - - In the row (not at last col) → REF (advance ref to catch up) - - Normalization: cost / (1 + i + j), an O(1) approximation of the true - path-length divisor (DAFx 2005, Sec. 2). 
- """ + return suggested + + def _get_expand_direction(self): ref_ws = self._ref_win_size() input_ws = self._input_win_size() ref_off = self._ref_offset() input_off = self._input_offset() - - # Current row and column in window coordinates - row = ref_ws - 1 # last ref row - col = input_ws - 1 # last input column - + row = ref_ws - 1 + col = input_ws - 1 best_cost = np.inf - best_row = row - best_col = col + best_row, best_col = row, col - # Search current column (last input column) across all ref rows for r in range(ref_ws): - abs_r = ref_off + r - abs_i = input_off + col - cost = self.acc_dist_matrix[r, col] - norm_cost = cost / (1 + abs_r + abs_i) - if norm_cost < best_cost: - best_cost = norm_cost - best_row = r - best_col = col - - # Search current row (last ref row) across all input columns + cost = self.acc_dist_matrix[r, col] / (1 + ref_off + r + input_off + col) + if cost < best_cost: + best_cost = cost + best_row, best_col = r, col + for c in range(input_ws): - abs_r = ref_off + row - abs_i = input_off + c - cost = self.acc_dist_matrix[row, c] - norm_cost = cost / (1 + abs_r + abs_i) - if norm_cost < best_cost: - best_cost = norm_cost - best_row = row - best_col = c - - # Convert best window coords to absolute + cost = self.acc_dist_matrix[row, c] / (1 + ref_off + row + input_off + c) + if cost < best_cost: + best_cost = cost + best_row, best_col = row, c + self.best_ref = ref_off + best_row self.best_input = input_off + best_col - # Determine direction based on where the best point is if best_row == row and best_col == col: return Direction.BOTH elif best_row < row: - # Best is in the column (not at current row) → input needs to catch up return Direction.TARGET - else: - # Best is in the row (not at current col) → ref needs to catch up - return Direction.REF + return Direction.REF def save_history(self): - """Append current best alignment point to warping path.""" - new_point = np.array([[self.best_ref], [self.best_input]]) + new_point = 
np.array([[self.current_beat], [self.best_input]]) self.wp = np.concatenate((self.wp, new_point), axis=1) - def __call__(self, input_features: NDArray[np.float32]) -> int: - self.step(input_features) - return self.current_position - def step(self, input_features): - """Process one input frame and update alignment state. - - Handles REF-only catch-up internally so that each call - consumes exactly one input frame. - - Parameters - ---------- - input_features : NDArray[np.float32] - A single input feature frame. - """ - # First call: initialization (matches run()'s pre-loop logic) if not self._initialized: self._accept_input(input_features) - self.ref_pointer += self.frame_per_seg + self.ref_pointer += 1 self._compute_cell(0, 0) self._initialized = True self.input_index += 1 return - # Phase 1: REF-only catch-up (0 or more iterations) direction = None while self.is_still_following(): direction = self._determine_direction() @@ -319,18 +350,16 @@ def step(self, input_features): if not self.is_still_following(): return - # Phase 2: consume input (direction is TARGET or BOTH) self._accept_input(input_features) self._compute_input_column() - if direction is not Direction.TARGET: # BOTH + if direction is not Direction.TARGET: self._advance_ref() - # Update run counts if direction == Direction.TARGET: self.run_count_input += 1 self.run_count_ref = 0 - else: # BOTH + else: self.run_count_ref = 1 self.run_count_input = 0 @@ -338,48 +367,212 @@ def step(self, input_features): self.current_position = self.best_ref self.input_index += 1 - def is_still_following(self): - return self.ref_pointer <= (self.N_ref - self.frame_per_seg) - - def run(self, verbose=True): - """Run the online alignment process. 
- - Parameters - ---------- - verbose : bool, optional - Whether to show progress bar, by default True - - Yields - ------ - int - Current position in the reference sequence - - Returns - ------- - NDArray[np.float32] - The warping path as a 2D array where each column contains - (reference_position, input_position) - """ + +# --------------------------------------------------------------------------- +# Event-level subclass (MIDI) +# --------------------------------------------------------------------------- + + +class OnlineTimeWarpingDixonEvent(OnlineTimeWarpingDixon): + """Event-level OLTW for MIDI input. + + Each step processes one onset event. Uses sparse cost matrix with + path-length normalization and reference catch-up based on argmin + of normalized path cost. + + Parameters + ---------- + c : int + Search width in number of events. + """ + + def __init__( + self, + reference_features: NDArray[np.float32], + queue: Optional[RECVQueue] = None, + c: int = 30, + max_run_count: int = MAX_RUN_COUNT, + distance_func: str = "euclidean", + state_space=None, + **kwargs, + ) -> None: + super().__init__( + reference_features=reference_features, + queue=queue, + max_run_count=max_run_count, + distance_func=distance_func, + state_space=state_space, + **kwargs, + ) + self.ref = self.reference_features.astype(np.float32) + self.c = c self.reset() + def reset(self) -> None: + self._D: Dict = {} + self._L: Dict = {} + self._inputs = [] + self.t = -1 + self.j = -1 + self.run_count_input = 0 + self.run_count_ref = 0 + self.current_position = 0 + self.input_index = 0 + self._warping_path = [] + self.last_queue_update = time.time() + self.latency_stats = { + "total_latency": 0, + "total_frames": 0, + "max_latency": 0, + "min_latency": float("inf"), + } + + @property + def current_beat(self) -> float: + if self.state_space is not None: + idx = min(self.current_position, len(self.state_space) - 1) + return float(self.state_space[idx]) + return float(self.current_position) + + 
@property + def warping_path(self) -> NDArray: + return ( + np.array(self._warping_path).T if self._warping_path else np.empty((2, 0)) + ) + + def is_still_following(self) -> bool: + return self.current_position < self.N_ref - 1 + + # ---- cost matrix (sparse) ---- + + def _local_cost(self, ref_idx, input_idx): + return scipy.spatial.distance.cdist( + self.ref[ref_idx].reshape(1, -1), + self._inputs[input_idx].reshape(1, -1), + metric=self.distance_func, + )[0, 0] + + def _get_D(self, i, j): + return self._D.get((i, j), np.inf) + + def _get_L(self, i, j): + return self._L.get((i, j), 0) + + def _evaluate(self, ref_i, inp_j): + if ref_i < 0 or inp_j < 0 or ref_i >= self.N_ref: + return + d = self._local_cost(ref_i, inp_j) + if ref_i == 0 and inp_j == 0: + self._D[(ref_i, inp_j)] = d + self._L[(ref_i, inp_j)] = 1 + return + best_cost, best_len = np.inf, 0 + for prev_i, prev_j, w in [ + (ref_i - 1, inp_j, 1), + (ref_i, inp_j - 1, 1), + (ref_i - 1, inp_j - 1, 2), + ]: + prev = self._D.get((prev_i, prev_j), np.inf) + if prev < np.inf: + c = prev + w * d + if c < best_cost: + best_cost = c + best_len = self._L.get((prev_i, prev_j), 0) + 1 + if best_cost < np.inf: + self._D[(ref_i, inp_j)] = best_cost + self._L[(ref_i, inp_j)] = best_len + + def _norm_cost(self, ref_i, inp_j): + L = self._get_L(ref_i, inp_j) + return self._get_D(ref_i, inp_j) / L if L > 0 else np.inf + + def _argmin_path_cost(self): + best_cost, best_ref, best_inp = np.inf, self.j, self.t + for k in range(max(0, self.t - self.c + 1), self.t + 1): + nc = self._norm_cost(self.j, k) + if nc < best_cost: + best_cost, best_ref, best_inp = nc, self.j, k + for k in range(max(0, self.j - self.c + 1), self.j + 1): + nc = self._norm_cost(k, self.t) + if nc < best_cost: + best_cost, best_ref, best_inp = nc, k, self.t + return best_ref, best_inp + + def _compute_input_column(self): + for k in range(max(0, self.j - self.c + 1), self.j + 1): + self._evaluate(k, self.t) + + def _compute_ref_row(self): + for k in 
range(max(0, self.t - self.c + 1), self.t + 1): + self._evaluate(self.j, k) + + def step(self, input_feat: NDArray[np.float32], perf_time: float = None) -> None: + self._inputs.append(np.asarray(input_feat, dtype=np.float32)) + self.t += 1 + self._compute_input_column() + + if self.t < self.c and self.j + 1 < self.N_ref: + self.j += 1 + self._compute_ref_row() + self.run_count_input = 0 + self.run_count_ref = 0 + else: + self.run_count_input += 1 + self.run_count_ref = 0 + while self.j + 1 < self.N_ref: + best_ref, best_inp = self._argmin_path_cost() + if best_ref >= self.j and best_inp < self.t: + self.j += 1 + self._compute_ref_row() + self.run_count_ref += 1 + self.run_count_input = 0 + if self.run_count_ref >= self.max_run_count: + break + else: + break + + best_ref, _ = self._argmin_path_cost() + self.current_position = min(best_ref, self.N_ref - 1) + t = perf_time if perf_time is not None else self.input_index + self._warping_path.append((self.current_beat, t)) + self.input_index += 1 + + def run(self, verbose=True) -> Generator[float, None, NDArray]: + """Run from queue. 
Extracts timestamp from (feature, time) tuples.""" + self.reset() if verbose: - pbar = progressbar.ProgressBar(max_value=self.N_ref, redirect_stdout=True) + pbar = progressbar.ProgressBar( + max_value=self.N_ref, + redirect_stdout=True, + redirect_stderr=True, + ) + pbar.start() while self.is_still_following(): - input_feature, f_time = self.queue.get(timeout=QUEUE_TIMEOUT) + item = self.queue.get(timeout=QUEUE_TIMEOUT) + if item is STREAM_END: + break + if item is None: + continue + if isinstance(item, tuple): + features, perf_time = item[0], float(item[1]) + else: + features, perf_time = item, None self.last_queue_update = time.time() - self.step(input_feature) + self.step(features, perf_time=perf_time) if verbose: - pbar.update(int(self.current_position)) - + pbar.update(min(self.current_position, self.N_ref - 1)) latency = time.time() - self.last_queue_update self.latency_stats = set_latency_stats( latency, self.latency_stats, self.input_index ) - yield self.current_position + yield self.current_beat if verbose: pbar.finish() + return self.warping_path - return self.wp + +if __name__ == "__main__": + pass # pragma: no cover diff --git a/matchmaker/features/audio.py b/matchmaker/features/audio.py index 3a9c27b..7316a4b 100644 --- a/matchmaker/features/audio.py +++ b/matchmaker/features/audio.py @@ -9,7 +9,7 @@ import librosa import numpy as np -from matchmaker.utils.processor import Processor +from matchmaker.features.processor import Processor SAMPLE_RATE = 44100 FRAME_RATE = 30 @@ -20,7 +20,6 @@ DCT_TYPE = 2 NORM = np.inf FEATURES = "chroma" -QUEUE_TIMEOUT = 1 # Type hint for Input Audio frame. 
InputAudioSeries = np.ndarray diff --git a/matchmaker/features/midi.py b/matchmaker/features/midi.py index bd2d946..802eec1 100644 --- a/matchmaker/features/midi.py +++ b/matchmaker/features/midi.py @@ -13,13 +13,15 @@ from partitura.score import Part, Score, ScoreLike, merge_parts from partitura.utils.music import performance_from_part -from matchmaker.utils.processor import Processor +from matchmaker.features.processor import Processor from matchmaker.utils.symbolic import ( framed_midi_messages_from_performance, midi_messages_from_performance, ) from matchmaker.utils.typing import InputMIDIFrame, NDArrayFloat +CHORD_THRESHOLD: float = 0.035 # seconds; matches IOI_THRESHOLD in outer_product_hmm + class PitchProcessor(Processor): """ @@ -102,6 +104,11 @@ class PitchIOIProcessor(Processor): return_pitch_list: bool If True, it will return an array of MIDI pitch values, instead of a "piano roll" slice. + + chord_threshold : float + Maximum gap (seconds) between note-ons to be considered part of + the same chord. Notes within this gap are accumulated and flushed + together. Set to 0 to disable chord grouping. 
""" prev_time: Optional[float] @@ -111,12 +118,40 @@ def __init__( self, piano_range: bool = False, return_pitch_list: bool = False, + chord_threshold: float = CHORD_THRESHOLD, ) -> None: super().__init__() self.prev_time = None self.piano_range = piano_range self.return_pitch_list = return_pitch_list self.piano_shift = 21 if piano_range else 0 + self.chord_threshold = chord_threshold + self._pending_pitch = np.zeros(128, dtype=np.float32) + self._pending_list: List[int] = [] + self._pending_time: Optional[float] = None + + def _flush(self) -> Tuple[NDArrayFloat, float]: + if self.prev_time is None: + ioi_obs = 0.0 + else: + ioi_obs = self._pending_time - self.prev_time + self.prev_time = self._pending_time + + if self.return_pitch_list: + result = ( + np.array(self._pending_list, dtype=np.float32), + ioi_obs, + ) + else: + pitch = self._pending_pitch + if self.piano_range: + pitch = pitch[21:109] + result = (pitch.copy(), ioi_obs) + + self._pending_pitch = np.zeros(128, dtype=np.float32) + self._pending_list = [] + self._pending_time = None + return result def __call__( self, @@ -126,46 +161,37 @@ def __call__( data, f_time = frame else: data = frame - # pitch_obs = [] - pitch_obs = np.zeros( - 128, - dtype=np.float32, - ) - # TODO: Replace the for loop with list comprehension - pitch_obs_list = [] - for msg, _ in data: + result = None + for msg, m_time in data: if ( getattr(msg, "type", "other") == "note_on" and getattr(msg, "velocity", 0) > 0 ): - pitch_obs[msg.note] = 1 - pitch_obs_list.append(msg.note - self.piano_shift) + if ( + self._pending_time is not None + and (m_time - self._pending_time) > self.chord_threshold + ): + result = self._flush() - if pitch_obs.sum() > 0: - if self.prev_time is None: - # There is no IOI for the first observed note - ioi_obs = 0.0 - else: - ioi_obs = f_time - self.prev_time - self.prev_time = f_time - if self.piano_range: - pitch_obs = pitch_obs[21:109] + self._pending_pitch[msg.note] = 1 + self._pending_list.append(msg.note - 
self.piano_shift) + if self._pending_time is None: + self._pending_time = m_time - if self.return_pitch_list: - return ( - np.array( - pitch_obs_list, - dtype=np.float32, - ), - ioi_obs, - ) - return (pitch_obs, ioi_obs) - else: - return None + return result + + def flush_remaining(self) -> Optional[Tuple[NDArrayFloat, float]]: + """Flush any remaining pending notes (call at end of stream).""" + if self._pending_time is not None: + return self._flush() + return None def reset(self) -> None: - pass + self.prev_time = None + self._pending_pitch = np.zeros(128, dtype=np.float32) + self._pending_list = [] + self._pending_time = None class PianoRollProcessor(Processor): @@ -234,6 +260,84 @@ def reset(self) -> None: self.active_notes = dict() +class OnsetPianoRollProcessor(Processor): + """Binary pianoroll per chord onset, with streaming chord grouping. + + Accumulates note-on events and flushes a chord vector when the time + gap between consecutive note-ons exceeds ``chord_threshold``. + + Returns ``(pianoroll, timestamp)`` when a chord is flushed, ``None`` + while accumulating. + + Parameters + ---------- + piano_range : bool + If True, 88 keys (21-108). Otherwise 128. + chord_threshold : float + Maximum gap (seconds) between note-ons in the same chord. 
+ """ + + def __init__( + self, + piano_range: bool = True, + chord_threshold: float = CHORD_THRESHOLD, + dtype: type = np.float32, + ): + Processor.__init__(self) + self.piano_range = piano_range + self.chord_threshold = chord_threshold + self.dtype = dtype + self._dim = 88 if piano_range else 128 + self._offset = 21 if piano_range else 0 + self._pending_notes: List[int] = [] + self._pending_time: Optional[float] = None + + def _flush(self) -> Tuple[np.ndarray, float]: + vec = np.zeros(self._dim, dtype=self.dtype) + for note in self._pending_notes: + idx = note - self._offset + if 0 <= idx < self._dim: + vec[idx] = 1.0 + t = self._pending_time + self._pending_notes = [] + self._pending_time = None + return (vec, t) + + def __call__(self, frame: InputMIDIFrame) -> Optional[Tuple[np.ndarray, float]]: + if isinstance(frame, tuple): + data, f_time = frame + else: + data = frame + f_time = 0.0 + + result = None + for msg, m_time in data: + if msg.type == "note_on" and msg.velocity > 0: + # Flush previous chord if gap exceeded + if ( + self._pending_time is not None + and (m_time - self._pending_time) > self.chord_threshold + and self._pending_notes + ): + result = self._flush() + + self._pending_notes.append(msg.note) + if self._pending_time is None: + self._pending_time = m_time + + return result + + def flush_remaining(self) -> Optional[Tuple[np.ndarray, float]]: + """Flush any remaining pending notes (call at end of stream).""" + if self._pending_notes: + return self._flush() + return None + + def reset(self) -> None: + self._pending_notes = [] + self._pending_time = None + + class PitchClassPianoRollProcessor(Processor): """ A class to convert a MIDI file time slice to a piano roll representation. 
@@ -353,5 +457,88 @@ def compute_features_from_symbolic( return outputs +# --------------------------------------------------------------------------- +# Onset-level features (for event-level OLTW) +# --------------------------------------------------------------------------- + + +def onset_pianoroll( + note_array: NDArray, + onset_key: str = "onset_beat", + piano_range: bool = True, +) -> Tuple[NDArray, NDArray]: + """One binary pitch vector per unique onset. + + Parameters + ---------- + note_array : np.ndarray + Structured array with "pitch" and ``onset_key`` fields. + onset_key : str + Column name for onset times ("onset_beat" or "onset_sec"). + piano_range : bool + If True, 88 keys (MIDI 21-108). Otherwise 128. + + Returns + ------- + features : np.ndarray [N_onsets, D] + onsets : np.ndarray [N_onsets] + """ + dim = 88 if piano_range else 128 + offset = 21 if piano_range else 0 + onsets = np.unique(note_array[onset_key]) + features = np.zeros((len(onsets), dim), dtype=np.float32) + for i, t in enumerate(onsets): + for p in note_array["pitch"][note_array[onset_key] == t]: + idx = p - offset + if 0 <= idx < dim: + features[i, idx] = 1.0 + return features, onsets + + +def group_onsets( + note_array: NDArray, + threshold: float = CHORD_THRESHOLD, + piano_range: bool = True, +) -> Tuple[NDArray, NDArray]: + """Group note onsets within ``threshold`` seconds into chords. + + Parameters + ---------- + note_array : np.ndarray + Structured array with "pitch" and "onset_sec" fields. + threshold : float + Maximum time gap (seconds) to merge into one chord. + piano_range : bool + If True, 88 keys. Otherwise 128. + + Returns + ------- + features : np.ndarray [N_chords, D] + times : np.ndarray [N_chords] + Mean onset time per chord group. 
+ """ + dim = 88 if piano_range else 128 + offset = 21 if piano_range else 0 + onsets = note_array["onset_sec"] + si = np.argsort(onsets) + groups, cur = [], [si[0]] + for i in range(1, len(si)): + if onsets[si[i]] - onsets[si[i - 1]] < threshold: + cur.append(si[i]) + else: + groups.append(cur) + cur = [si[i]] + groups.append(cur) + features = np.zeros((len(groups), dim), dtype=np.float32) + times = np.zeros(len(groups)) + for i, g in enumerate(groups): + times[i] = np.mean(onsets[g]) + for idx in g: + p = note_array["pitch"][idx] - offset + if 0 <= p < dim: + features[i, p] = 1.0 + return features, times + + if __name__ == "__main__": # pragma: no cover pass diff --git a/matchmaker/utils/processor.py b/matchmaker/features/processor.py similarity index 100% rename from matchmaker/utils/processor.py rename to matchmaker/features/processor.py diff --git a/matchmaker/io/audio.py b/matchmaker/io/audio.py index 043ecf9..9fa596e 100644 --- a/matchmaker/io/audio.py +++ b/matchmaker/io/audio.py @@ -12,16 +12,22 @@ import numpy as np import pyaudio -from matchmaker.features.audio import HOP_LENGTH, SAMPLE_RATE, ChromagramProcessor +from matchmaker.features.audio import ( + HOP_LENGTH, + SAMPLE_RATE, + ChromagramProcessor, +) +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END, Stream from matchmaker.utils.audio import ( get_audio_devices, get_default_input_device_index, get_device_index_from_name, ) -from matchmaker.utils.misc import RECVQueue, set_latency_stats -from matchmaker.utils.stream import Stream +from matchmaker.utils.misc import set_latency_stats CHANNELS = 1 +QUEUE_TIMEOUT = 10 class AudioStream(Stream): @@ -53,7 +59,7 @@ def __init__( hop_length: int = HOP_LENGTH, queue: Optional[RECVQueue] = None, device_name_or_index: Optional[Union[str, int]] = None, - wait: bool = True, + wait: bool = False, target_sr: int = SAMPLE_RATE, ): if processor is None: @@ -123,9 +129,14 @@ def __init__( "min_latency": float("inf"), } 
self.input_index = 0 + self._preloaded_audio = None if self.mock: self.run = self.run_offline + # Pre-load and resample audio so the stream thread can start + # producing frames immediately (avoids queue-timeout race condition + # when librosa.load takes longer than QUEUE_TIMEOUT). + self._preload_audio() else: self.run = self.run_online @@ -159,6 +170,8 @@ def _process_frame( # initial y target_audio = np.frombuffer(data, dtype=np.float32) self._process_feature(target_audio, time_info["input_buffer_adc_time"]) + if not self.stream_start.is_set(): + self.stream_start.set() return (data, pyaudio.paContinue) @@ -225,6 +238,13 @@ def stop_listening(self) -> None: self.audio_interface.terminate() self.listen = False + def _preload_audio(self) -> None: + """Pre-load and resample audio file so run_offline can start immediately.""" + audio_y, sr = librosa.load(self.file_path, sr=None) + if sr != self.target_sr: + audio_y = librosa.resample(y=audio_y, orig_sr=sr, target_sr=self.target_sr) + self._preloaded_audio = audio_y + def run_offline(self) -> None: """Process audio file in offline mode. 
@@ -240,12 +260,17 @@ def run_offline(self) -> None:
         self.start_listening()
         self.init_time = time.time()
 
-        audio_y, sr = librosa.load(self.file_path, sr=None)
-        if sr != self.target_sr:
-            audio_y = librosa.resample(y=audio_y, orig_sr=sr, target_sr=self.target_sr)
-            sr = self.target_sr
+        if self._preloaded_audio is not None:
+            audio_y, sr = self._preloaded_audio, self.target_sr
+            self._preloaded_audio = None  # free memory
+        else:
+            audio_y, sr = librosa.load(self.file_path, sr=None)
+            if sr != self.target_sr:
+                audio_y = librosa.resample(
+                    y=audio_y, orig_sr=sr, target_sr=self.target_sr
+                )
+                sr = self.target_sr
 
-        time_interval = self.hop_length / float(sr)
         # Pad to next hop_length boundary so no trailing samples are lost
         remainder = len(audio_y) % self.hop_length
         if remainder > 0:
@@ -253,6 +278,7 @@ def run_offline(self) -> None:
                 (audio_y, np.zeros(self.hop_length - remainder, dtype=np.float32))
             )
         trimmed_audio = audio_y
+        time_interval = self.hop_length / float(sr)
 
         # Do not stop early on digital silence (all-zeros tails).
while trimmed_audio.size > 0: self.input_index += 1 @@ -260,11 +286,15 @@ def run_offline(self) -> None: target_audio = trimmed_audio[: self.hop_length] self._process_feature(target_audio, self.last_data_received) trimmed_audio = trimmed_audio[self.hop_length :] - elapsed_time = time.time() - self.last_data_received + + if not self.stream_start.is_set(): + self.stream_start.set() if self.wait: + elapsed_time = time.time() - self.last_data_received time.sleep(max(time_interval - elapsed_time, 0)) + self.queue.put(STREAM_END) self.stop_listening() def run_online(self) -> None: diff --git a/matchmaker/io/midi.py b/matchmaker/io/midi.py index cfdca59..b83b5f7 100644 --- a/matchmaker/io/midi.py +++ b/matchmaker/io/midi.py @@ -12,10 +12,10 @@ from mido.ports import BaseInput as MidiInputPort from matchmaker.features.midi import PitchIOIProcessor +from matchmaker.features.processor import Processor from matchmaker.io.mediator import CeusMediator -from matchmaker.utils.misc import RECVQueue -from matchmaker.utils.processor import Processor -from matchmaker.utils.stream import Stream +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END, Stream from matchmaker.utils.symbolic import ( Buffer, framed_midi_messages_from_performance, @@ -107,21 +107,19 @@ def __init__( self.midi_messages = [] self.polling_period = polling_period + if (polling_period is None) and (self.mock is False): self.is_windowed = False self.run = self.run_online_single self._process_frame = self._process_frame_message - elif (polling_period is None) and (self.mock is True): self.is_windowed = False self.run = self.run_offline_single self._process_frame = self._process_frame_message - elif (polling_period is not None) and (self.mock is False): self.is_windowed = True self.run = self.run_online_windowed self._process_frame = self._process_frame_window - elif (polling_period is not None) and (self.mock is True): self.is_windowed = True self.run = self.run_offline_windowed @@ 
-135,10 +133,16 @@ def _process_frame_message( **kwargs, ) -> None: output = self.processor(([(data, c_time)], c_time)) + + if output is None: + return + if self.return_midi_messages: self.queue.put(((data, c_time), output)) else: self.queue.put(output) + if not self.stream_start.is_set(): + self.stream_start.set() def _process_frame_window( self, @@ -149,11 +153,15 @@ def _process_frame_window( # the data is the Buffer instance output = self.processor((data.frame[:], data.time)) - # if output is not None: + if output is None: + return + if self.return_midi_messages: self.queue.put((data.frame, output)) else: self.queue.put(output) + if not self.stream_start.is_set(): + self.stream_start.set() def run_online_single(self): self.start_listening() @@ -219,8 +227,8 @@ def run_offline_single(self): midi_messages, message_times = midi_messages_from_performance( perf=self.file_path, ) - self.init_time = message_times.min() self.start_listening() + self.init_time = message_times.min() for msg, c_time in zip(midi_messages, message_times): self.add_midi_message( msg=msg, @@ -230,6 +238,12 @@ def run_offline_single(self): data=msg, c_time=c_time, ) + # Flush remaining chord from OnsetPianoRollProcessor + if hasattr(self.processor, "flush_remaining"): + last = self.processor.flush_remaining() + if last is not None: + self.queue.put(last) + self.queue.put(STREAM_END) self.stop_listening() def run_offline_windowed(self): @@ -247,6 +261,11 @@ def run_offline_windowed(self): self._process_frame_window( data=frame, ) + if hasattr(self.processor, "flush_remaining"): + last = self.processor.flush_remaining() + if last is not None: + self.queue.put(last) + self.queue.put(STREAM_END) @property def current_time(self) -> Optional[float]: diff --git a/matchmaker/io/queue.py b/matchmaker/io/queue.py new file mode 100644 index 0000000..4799016 --- /dev/null +++ b/matchmaker/io/queue.py @@ -0,0 +1,31 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Queue utilities for matchmaker 
streaming. +""" + +from queue import Empty, Queue +from typing import Any + + +class RECVQueue(Queue): + """Queue with a recv method (like Pipe). + + Uses Queue.get with a timeout to remain interruptable via + KeyboardInterrupt. Periodically queries with a 1s timeout as a + middle ground between busy-waiting and uninterruptable blocked waiting. + """ + + def __init__(self) -> None: + Queue.__init__(self) + + def recv(self) -> Any: + """Return and remove an item from the queue.""" + while True: + try: + return self.get(timeout=1) + except Empty: # pragma: no cover + pass + + def poll(self) -> bool: + return self.empty() diff --git a/matchmaker/utils/stream.py b/matchmaker/io/stream.py similarity index 91% rename from matchmaker/utils/stream.py rename to matchmaker/io/stream.py index ca51389..115b547 100644 --- a/matchmaker/utils/stream.py +++ b/matchmaker/io/stream.py @@ -6,13 +6,17 @@ from __future__ import annotations +import threading import time from threading import Thread from types import TracebackType from typing import TYPE_CHECKING, Any, Callable, Optional, Type, Union +STREAM_START = threading.Event # call STREAM_START() to create per-instance event +STREAM_END = object() # put into queue to signal end-of-stream + if TYPE_CHECKING: # pragma: no cover - from matchmaker.utils.processor import Processor + from matchmaker.features.processor import Processor class Stream(Thread): @@ -43,6 +47,7 @@ def __init__( self.mock = mock self.listen = False self.init_time = None + self.stream_start = STREAM_START() def start_listening(self): """ diff --git a/matchmaker/matchmaker.py b/matchmaker/matchmaker.py index 776e081..de3da68 100644 --- a/matchmaker/matchmaker.py +++ b/matchmaker/matchmaker.py @@ -1,15 +1,21 @@ import os import sys +import time from pathlib import Path from typing import Optional, Union import numpy as np import partitura from partitura.io.exportmidi import get_ppq -from partitura.score import Part, merge_parts from 
partitura.musicanalysis.performance_codec import get_time_maps_from_alignment +from partitura.score import Part, merge_parts -from matchmaker.dp import OnlineTimeWarpingArzt, OnlineTimeWarpingDixon +from matchmaker.dp import ( + OnlineTimeWarpingArztEvent, + OnlineTimeWarpingArztFrame, + OnlineTimeWarpingDixonEvent, + OnlineTimeWarpingDixonFrame, +) from matchmaker.features.audio import ( FRAME_RATE, SAMPLE_RATE, @@ -21,26 +27,20 @@ MFCCProcessor, ) from matchmaker.features.midi import ( + OnsetPianoRollProcessor, PianoRollProcessor, PitchClassPianoRollProcessor, PitchIOIProcessor, + onset_pianoroll, ) from matchmaker.io.audio import AudioStream from matchmaker.io.midi import MidiStream -from matchmaker.prob.hmm import ( - GaussianAudioPitchHMM, - GaussianAudioPitchTempoHMM, - PitchHMM, - PitchIOIHMM, -) -from matchmaker.prob.outer_product_hmm import OuterProductHMM -from matchmaker.prob.outer_product_hmm_audio import AudioOuterProductHMM +from matchmaker.prob import AudioOuterProductHMM, OuterProductHMM, PitchHMM, PitchIOIHMM from matchmaker.utils.eval import ( TOLERANCES_IN_BEATS, TOLERANCES_IN_MILLISECONDS, - get_evaluation_results, - transfer_from_perf_to_predicted_score, - transfer_from_score_to_predicted_perf, + evaluate_alignment, + transfer_positions, ) from matchmaker.utils.misc import ( adjust_tempo_for_performance_file, @@ -52,53 +52,53 @@ ) from matchmaker.utils.tempo_models import KalmanTempoModel +PathLike = Union[str, bytes, os.PathLike] sys.setrecursionlimit(10_000) -PathLike = Union[str, bytes, os.PathLike] DEFAULT_TEMPO = 120 - - -DEFAULT_DISTANCE_FUNCS = { - "arzt": OnlineTimeWarpingArzt.DEFAULT_DISTANCE_FUNC, - "dixon": OnlineTimeWarpingDixon.DEFAULT_DISTANCE_FUNC, - "hmm": None, - "outerhmm": None, - "audio_outerhmm": None, - "pthmm": None, +MIDI_FRAME_RATE = 1 # dummy value for MIDI input +DEFAULT_PROCESSOR = { + "audio": "chroma", + "midi": "pitch_ioi", } - DEFAULT_METHODS = { "audio": "arzt", "midi": "outerhmm", } - -AVAILABLE_METHODS = 
["arzt", "dixon", "hmm", "pthmm", "outerhmm", "audio_outerhmm"] +AVAILABLE_METHODS = ["arzt", "dixon", "hmm", "pthmm", "outerhmm"] +OLTW_METHODS = {"arzt", "dixon"} KWARGS = { "audio": { "dixon": { + "processor": "lse", "window_size": 10, }, "arzt": { - "window_size": 5, - "start_window_size": 0.25, - "step_size" : 5,}, - "audio_outerhmm": { + "window_size": 10, + "start_window_size": 0.1, + "step_size": 3, + }, + "outerhmm": { + "processor": "cqt_spectral_flux", "sample_rate": 16000, - "frame_rate": 50, + "frame_rate": 25, + "s_j": 0.0, }, }, "midi": { "arzt": { "processor": "pianoroll", "piano_range": True, - "window_size": 200, - "start_window_size": 200, + "frame_rate": 100, + "window_size": 2, + "start_window_size": 2, "step_size": 5, }, "dixon": { "processor": "pianoroll", "piano_range": True, - "window_size": 30, + "frame_rate": 100, + "window_size": 0.3, }, "hmm": { "processor": "pitch_ioi", @@ -131,8 +131,8 @@ class Matchmaker(object): only for offline option. For debugging or fast testing, set to False input_type : str Type of input to use: audio or midi - feature_type : str - Type of feature to use + processor : str + Type of feature processor to use method : str Score following method to use device_name_or_index : Union[str, int] @@ -151,39 +151,33 @@ def __init__( self, score_file: PathLike, performance_file: Union[PathLike, None] = None, - wait: bool = True, # only for offline option. 
For debugging or fast testing, set to False - input_type: str = "audio", # 'audio' or 'midi' - feature_type: str = None, + input_type: str = "audio", method: str = None, - distance_func: Optional[str] = None, + *, + processor: str = None, device_name_or_index: Union[str, int] = None, + tempo: Optional[float] = None, sample_rate: int = SAMPLE_RATE, frame_rate: int = FRAME_RATE, - tempo: Optional[float] = None, - kwargs=KWARGS, - unfold_score=True, auto_adjust_tempo: bool = False, + wait: bool = False, + unfold_score=True, + kwargs=KWARGS, ): self.score_file = str(score_file) self.performance_file = ( str(performance_file) if performance_file is not None else None ) - # if input_type not in ("audio", "midi"): - # raise ValueError(f"Invalid input_type {input_type}") self.input_type = input_type - self.feature_type = feature_type - self.frame_rate = frame_rate if input_type == "audio" else 1 - self.sample_rate = sample_rate - self.hop_length = sample_rate // self.frame_rate self.score_part: Optional[Part] = None - self.distance_func = distance_func self.device_name_or_index = device_name_or_index self.processor = None self.stream = None self.score_follower = None self.reference_features = None self._has_run = False + self.alignment_duration = None # validate method first if method is None: @@ -192,14 +186,14 @@ def __init__( raise ValueError(f"Invalid method. 
Available methods: {AVAILABLE_METHODS}") self.method = method - self.config = kwargs[self.input_type][self.method] + self.config = dict(kwargs.get(self.input_type, {}).get(self.method, {})) self.auto_adjust_tempo = auto_adjust_tempo - # Apply method-specific defaults from config (only if not explicitly provided by caller) - if sample_rate == SAMPLE_RATE and "sample_rate" in self.config: - self.sample_rate = self.config["sample_rate"] - if frame_rate == FRAME_RATE and "frame_rate" in self.config: - self.frame_rate = self.config["frame_rate"] + # Resolve sample_rate and frame_rate: config overrides defaults + self.sample_rate = self.config.pop("sample_rate", sample_rate) + self.frame_rate = self.config.pop( + "frame_rate", frame_rate if input_type == "audio" else MIDI_FRAME_RATE + ) self.hop_length = self.sample_rate // self.frame_rate # setup score file @@ -213,8 +207,22 @@ def __init__( score = partitura.load_score(self.score_file) if unfold_score: - score = partitura.score.unfold_part_maximal(score, ignore_leaps=False) - self.score_part = merge_parts(score.parts) + try: + # Ensure recursion limit is high enough for deepcopy of + # complex scores. External libraries (e.g. madmom) may + # lower it during processing. 
+ _prev_limit = sys.getrecursionlimit() + sys.setrecursionlimit(max(_prev_limit, 10_000)) + unfolded = partitura.score.unfold_part_maximal( + score, ignore_leaps=False + ) + self.score_part = merge_parts(unfolded.parts) + sys.setrecursionlimit(_prev_limit) + except Exception: + sys.setrecursionlimit(max(sys.getrecursionlimit(), 10_000)) + self.score_part = merge_parts(score.parts) + else: + self.score_part = merge_parts(score.parts) except Exception as e: raise ValueError(f"Invalid score file: {e}") @@ -227,49 +235,10 @@ def __init__( score_tempo = get_tempo_from_score(self.score_part, self.score_file) self.tempo = score_tempo if score_tempo is not None else DEFAULT_TEMPO - # setup feature processor - if self.feature_type is None: - if input_type == "audio": - self.feature_type = ( - "cqt_spectral_flux" if method == "audio_outerhmm" else "chroma" - ) - else: - self.feature_type = "pitch_ioi" - - if self.feature_type == "chroma": - self.processor = ChromagramProcessor( - sample_rate=self.sample_rate, - hop_length=self.hop_length, - ) - elif self.feature_type == "mfcc": - self.processor = MFCCProcessor( - sample_rate=self.sample_rate, - ) - elif self.feature_type == "cqt": - self.processor = CQTProcessor( - sample_rate=self.sample_rate, - ) - elif self.feature_type == "mel": - self.processor = MelSpectrogramProcessor( - sample_rate=self.sample_rate, - ) - elif self.feature_type == "lse": - self.processor = LogSpectralEnergyProcessor( - sample_rate=self.sample_rate, - ) - elif self.feature_type == "pitch_ioi": - self.processor = PitchIOIProcessor(piano_range=self.config["piano_range"]) - elif self.feature_type == "pitchclass": - self.processor = PitchClassPianoRollProcessor() - elif self.feature_type == "pianoroll": - self.processor = PianoRollProcessor(piano_range=self.config["piano_range"]) - elif self.feature_type == "cqt_spectral_flux": - self.processor = CQTSpectralFluxProcessor( - sample_rate=self.sample_rate, - hop_length=self.hop_length, - ) - else: - raise 
ValueError(f"Invalid feature type `{self.feature_type}`") + processor_type = processor or self.config.pop( + "processor", DEFAULT_PROCESSOR[input_type] + ) + self.processor = self._build_processor(method, processor_type) if self.performance_file is not None: if self.input_type == "audio" and not is_audio_file(self.performance_file): @@ -281,13 +250,41 @@ def __init__( f"Invalid performance file. Expected MIDI file, but got {self.performance_file}" ) - # setup distance function - if distance_func is None: - distance_func = DEFAULT_DISTANCE_FUNCS[self.method] - # setup stream device + self.stream = self._build_stream(method, wait) + self.reference_features = self.preprocess_score() + self.score_follower = self._build_score_follower(method) + + def _build_processor(self, method, processor_type): + audio_kw = dict(sample_rate=self.sample_rate, hop_length=self.hop_length) + + AUDIO_PROCESSORS = { + "chroma": lambda: ChromagramProcessor(**audio_kw), + "mfcc": lambda: MFCCProcessor(**audio_kw), + "cqt": lambda: CQTProcessor(**audio_kw), + "mel": lambda: MelSpectrogramProcessor(**audio_kw), + "lse": lambda: LogSpectralEnergyProcessor(**audio_kw), + "cqt_spectral_flux": lambda: CQTSpectralFluxProcessor(**audio_kw), + } + MIDI_PROCESSORS = { + "pitch_ioi": lambda: PitchIOIProcessor( + piano_range=self.config["piano_range"], + return_pitch_list=(method == "hmm"), + ), + "pitchclass": lambda: PitchClassPianoRollProcessor(), + "pianoroll": lambda: PianoRollProcessor( + piano_range=self.config["piano_range"], + ), + } + if processor_type in AUDIO_PROCESSORS: + return AUDIO_PROCESSORS[processor_type]() + elif processor_type in MIDI_PROCESSORS: + return MIDI_PROCESSORS[processor_type]() + raise ValueError(f"Invalid feature type '{processor_type}'") + + def _build_stream(self, method, wait): if self.input_type == "audio": - self.stream = AudioStream( + return AudioStream( processor=self.processor, device_name_or_index=self.device_name_or_index, file_path=self.performance_file, @@ 
-297,131 +294,167 @@ def __init__( hop_length=self.hop_length, ) elif self.input_type == "midi": - self.stream = MidiStream( - processor=self.processor, + if method in OLTW_METHODS: + processor = OnsetPianoRollProcessor( + piano_range=self.config.get("piano_range", True), + ) + else: + processor = self.processor + return MidiStream( + processor=processor, port=self.device_name_or_index, file_path=self.performance_file, - **({"polling_period": None} if method == "outerhmm" else {}), + polling_period=None, ) - else: - raise ValueError(f"Invalid input type {self.input_type}") + raise ValueError(f"Invalid input type '{self.input_type}'") - self.reference_features = self.preprocess_score() + def _build_score_follower(self, method): + if self.input_type == "audio": + return self._build_audio_follower(method) + elif self.input_type == "midi": + return self._build_symbolic_follower(method) + raise ValueError(f"Invalid input_type '{self.input_type}'") - if distance_func is None: - distance_func = DEFAULT_DISTANCE_FUNCS[method] + def _build_audio_follower(self, method): + ref = self.reference_features + queue = self.stream.queue + state_space = np.unique(self.score_part.note_array()["onset_beat"]) - if method == "arzt": - state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() - self.score_follower = OnlineTimeWarpingArzt( - reference_features=self.reference_features, - queue=self.stream.queue, - distance_func=distance_func, - frame_rate=self.frame_rate, - window_size=self.config["window_size"], - start_window_size=self.config["start_window_size"], - state_to_ref_time_map=state_to_ref_time_map, - ref_to_state_time_map=ref_to_state_time_map, - step_size=self.config["step_size"], - state_space=np.unique(self.score_part.note_array()["onset_beat"]) + if method in OLTW_METHODS: + self.ppart = partitura.utils.music.performance_from_part( + self.score_part, bpm=self.tempo ) - elif method == "dixon": - state_to_ref_time_map, ref_to_state_time_map = self.get_time_maps() - 
self.score_follower = OnlineTimeWarpingDixon( - reference_features=self.reference_features, - queue=self.stream.queue, - distance_func=distance_func, - frame_rate=self.frame_rate, - window_size=self.config["window_size"], - state_to_ref_time_map=state_to_ref_time_map, - ref_to_state_time_map=ref_to_state_time_map, - state_space=np.unique(self.score_part.note_array()["onset_beat"]) - ) - elif method == "hmm" and self.input_type == "midi": - self.score_follower = PitchIOIHMM( - reference_features=self.reference_features, - queue=self.stream.queue, - tempo_model=self.config["tempo_model"], - has_insertions=True, - piano_range=self.config["piano_range"], + self.ppart.sustain_pedal_threshold = 127 + try: + stm, rtm = self.get_time_maps() + except Exception: + stm, rtm = None, None + cls = ( + OnlineTimeWarpingArztFrame + if method == "arzt" + else OnlineTimeWarpingDixonFrame ) - elif method == "pthmm" and self.input_type == "audio": - self.score_follower = GaussianAudioPitchTempoHMM( - reference_features=self.reference_features, - queue=self.stream.queue, + return cls( + reference_features=ref, + queue=queue, + state_space=state_space, + frame_rate=self.frame_rate, + state_to_ref_time_map=stm, + ref_to_state_time_map=rtm, + ref_frame_to_beat=self._build_ref_frame_to_beat(), + **self.config, ) - elif method == "audio_outerhmm" and self.input_type == "audio": - self.score_follower = AudioOuterProductHMM( - reference_features=self.reference_features, - queue=self.stream.queue, + elif method == "outerhmm": + return AudioOuterProductHMM( + reference_features=ref, + queue=queue, tempo=self.tempo, - sample_rate=self.sample_rate, hop_length=self.hop_length, + **self.config, + ) + raise ValueError(f"No audio follower for method '{method}'") + + def _build_symbolic_follower(self, method): + ref = self.reference_features + queue = self.stream.queue + + if method in OLTW_METHODS: + # Convert note_array to onset pianoroll for event-level OLTW + onset_ref, state_space = 
onset_pianoroll( + ref, + onset_key="onset_beat", + piano_range=self.config.get("piano_range", True), + ) + # Filter out frame-level config keys + skip = { + "window_size", + "start_window_size", + "frame_rate", + "processor", + "piano_range", + } + config = {k: v for k, v in self.config.items() if k not in skip} + cls = ( + OnlineTimeWarpingArztEvent + if method == "arzt" + else OnlineTimeWarpingDixonEvent ) - elif method == "pthmm" and self.input_type == "midi": - self.score_follower = PitchHMM( - reference_features=self.reference_features, - queue=self.stream.queue, + return cls( + reference_features=onset_ref, + queue=queue, + state_space=state_space, + **config, + ) + elif method == "hmm": + return PitchIOIHMM( + reference_features=ref, + queue=queue, has_insertions=True, - piano_range=self.config["piano_range"], + **self.config, + ) + elif method == "pthmm": + return PitchHMM( + reference_features=ref, + queue=queue, + has_insertions=True, + **self.config, ) - elif method == "outerhmm" and self.input_type == "midi": - self.score_follower = OuterProductHMM( - reference_features=self.reference_features, - queue=self.stream.queue, + elif method == "outerhmm": + return OuterProductHMM( + reference_features=ref, + queue=queue, + **self.config, ) + raise ValueError(f"No MIDI follower for method '{method}'") + + def _wp_perf_to_seconds(self, wp_perf): + """Convert warping path performance axis to absolute seconds.""" + if self.input_type == "audio": + return wp_perf / self.frame_rate + elif self.method in OLTW_METHODS: + return wp_perf # OnsetPianoRollProcessor provides absolute timestamps else: - raise ValueError("Invalid method") + # HMM: IOI-accumulated from 0; shift by first note onset + _perf = partitura.load_performance_midi(self.performance_file) + return wp_perf + float(_perf.note_array()["onset_sec"].min()) def preprocess_score(self): - """Preprocess score to extract reference features.""" + """Extract reference features from the score.""" if 
self.auto_adjust_tempo and self.performance_file is not None: self.tempo = adjust_tempo_for_performance_file( self.score_part, self.performance_file, self.tempo ) - if self.method in {"arzt", "dixon"}: - self.ppart = partitura.utils.music.performance_from_part(self.score_part, bpm=self.tempo) - self.ppart.sustain_pedal_threshold = 127 - if self.input_type == "audio": - self.score_audio = generate_score_audio( - self.score_part, self.tempo, self.sample_rate - ).astype(np.float32) - reference_features = self.processor(self.score_audio) - self.processor.reset() - return reference_features - else: - polling_period = 0.01 - reference_features = ( - partitura.utils.music.compute_pianoroll( - note_info=self.ppart, - time_unit="sec", - time_div=int(np.round(1 / polling_period)), - binary=True, - piano_range=self.config["piano_range"], - ) - .toarray() - .T - ).astype(np.float32) - return reference_features - else: - return self.score_part.note_array() - + if self.input_type == "audio" and self.method in OLTW_METHODS: + score_audio = generate_score_audio( + self.score_part, self.tempo, self.sample_rate + ).astype(np.float32) + features = self.processor(score_audio) + self.processor.reset() + return features + + return self.score_part.note_array() + def get_time_maps(self): - alignment = [{"label" : "match", "score_id" : nid, "performance_id": nid} for nid in self.score_part.note_array()["id"]] - return get_time_maps_from_alignment(self.ppart.note_array(), self.score_part.note_array(), alignment) + sna = self.score_part.note_array() + pna = self.ppart.note_array() + note_ids = sna["id"] + # If note IDs are missing, use index-based IDs + if len(set(note_ids)) <= 1: + synth_ids = [f"n{i}" for i in range(len(sna))] + sna = sna.copy() + sna["id"] = synth_ids + pna = pna.copy() + pna["id"] = synth_ids[: len(pna)] + note_ids = synth_ids + alignment = [ + {"label": "match", "score_id": nid, "performance_id": nid} + for nid in note_ids + ] + return get_time_maps_from_alignment(pna, 
sna, alignment) def _convert_frame_to_beat(self, current_frame: int) -> float: - """ - Convert frame number to relative beat position in the score. - - Parameters - ---------- - frame_rate : int - Frame rate of the audio stream - current_frame : int - Current frame number - """ + """Convert frame number to beat position in the score.""" tick = get_ppq(self.score_part) timeline_time = (current_frame / self.frame_rate) * tick * (self.tempo / 60) beat_position = np.round( @@ -430,6 +463,13 @@ def _convert_frame_to_beat(self, current_frame: int) -> float: ) return beat_position + def _build_ref_frame_to_beat(self) -> np.ndarray: + """Precompute beat position for each reference feature frame.""" + n_ref = self.reference_features.shape[0] + return np.array( + [self._convert_frame_to_beat(i) for i in range(n_ref)], + ) + def build_score_annotations( self, level="beat", @@ -484,34 +524,6 @@ def build_score_annotations( return score_annots - def convert_timestamps_to_beats(self, timestamps): - """ - Convert an array of timestamps (in seconds) to beat positions. 
- - Parameters - ---------- - timestamps : array-like - Array of timestamps in seconds - - Returns - ------- - beats : np.ndarray - Array of beat positions corresponding to the input timestamps - """ - beats = [] - tick = get_ppq(self.score_part) - - for timestamp in timestamps: - timeline_time = timestamp * tick * (self.tempo / 60) - - beat_position = np.round( - self.score_part.beat_map(timeline_time), - decimals=2, - ) - beats.append(beat_position) - - return np.array(beats) - def get_latency_stats(self): feature_stats = self.stream.latency_stats inference_stats = self.score_follower.latency_stats @@ -533,37 +545,45 @@ def run_evaluation( self, perf_annotations: Union[PathLike, np.ndarray], level: str = "note", - tolerances: list = TOLERANCES_IN_MILLISECONDS, - musical_beat: bool = False, # beat annots are difference in some dataset + tolerances: list = None, + musical_beat: bool = False, debug: bool = False, save_dir: PathLike = None, run_name: str = None, - domain: str = "performance", # "score" or "performance" + domain: str = "score", + plot_dist_matrix: bool = True, ) -> dict: """ - Evaluate the score following process + Evaluate the score following process. + + When domain="score" (default), returns beat-based metrics as primary + and ms-based metrics under "ms" key. When domain="performance", + returns ms-based metrics only (legacy behavior). Parameters ---------- perf_annotations : PathLike or np.ndarray - Path to the performance annotations file (tab-separated), - or numpy array of annotation times in seconds. + Path to the performance annotations file or numpy array of onset times (seconds). level : str - Level of annotations to use: bar, beat or note - tolerance : list - Tolerances to use for evaluation (in milliseconds) + Annotation level: "beat" or "note" + tolerances : list or None + Tolerances for evaluation. If None, uses default for the domain. 
+ musical_beat : bool + Whether to use musical beat debug : bool - Whether to save the score and performance audio with beat annotations + Whether to save debug outputs domain : str - Evaluation domain, either "score" or "performance". - "score" domain evaluates in beat unit, "performance" domain evaluates in second unit. (Default: "performance") + "score" (default, beat-based primary) or "performance" (ms-based, legacy) Returns ------- dict - Evaluation results with mean, median, std, skewness, kurtosis, and - accuracy for each tolerance + Evaluation results. If domain="score", includes both beat and ms metrics. """ + if tolerances is None: + tolerances = ( + TOLERANCES_IN_BEATS if domain == "score" else TOLERANCES_IN_MILLISECONDS + ) if not self._has_run: raise ValueError("Must call run() before evaluation") @@ -572,87 +592,77 @@ def run_evaluation( else: perf_annots = np.loadtxt(fname=perf_annotations, delimiter="\t", usecols=0) - return_type = "seconds" if domain == "performance" else "beats" - score_annots = self.build_score_annotations(level, musical_beat, return_type) - - original_perf_annots_counts = len(perf_annots) + wp = self.score_follower.warping_path + wp_score = wp[0].astype(float) + wp_perf_sec = self._wp_perf_to_seconds(wp[1].astype(float)) - min_length = min(len(score_annots), len(perf_annots)) - score_annots = score_annots[:min_length] - perf_annots = perf_annots[:min_length] - - mode = ( - "state" - if (self.input_type == "midi" or self.method == "audio_outerhmm") - else "frame" - ) - perf_annots_predicted = transfer_from_score_to_predicted_perf( - self.score_follower.warping_path, - score_annots, - frame_rate=self.frame_rate, - mode=mode, + score_annots_beats = self.build_score_annotations( + level, musical_beat, return_type="beats" ) + min_length = min(len(score_annots_beats), len(perf_annots)) + score_annots_beats = score_annots_beats[:min_length] + perf_annots = perf_annots[:min_length] - score_annots_predicted = 
transfer_from_perf_to_predicted_score( - self.score_follower.warping_path, + eval_results = evaluate_alignment( + wp_score, + wp_perf_sec, + score_annots_beats, perf_annots, - frame_rate=self.frame_rate, - mode=mode, + beat_tolerances=tolerances if domain == "score" else TOLERANCES_IN_BEATS, + ms_tolerances=TOLERANCES_IN_MILLISECONDS, ) - score_annots = score_annots[: len(score_annots_predicted)] - if original_perf_annots_counts != len(perf_annots_predicted): - print( - f"Length of the annotation changed: {original_perf_annots_counts} -> {len(perf_annots_predicted)}" - ) + # Real-Time Factor (domain-independent) + if self.alignment_duration is not None: + finite_perf = perf_annots[np.isfinite(perf_annots)] + if len(finite_perf) > 0: + perf_duration = float(np.max(finite_perf) - np.min(finite_perf)) + if perf_duration > 0: + eval_results["rtf"] = float( + f"{self.alignment_duration / perf_duration:.4f}" + ) - # Evaluation metrics - if domain == "performance": - eval_results = get_evaluation_results( - perf_annots, - perf_annots_predicted, - total_counts=original_perf_annots_counts, - tolerances=tolerances, - ) - else: - score_annots_predicted = self.convert_timestamps_to_beats( - score_annots_predicted - ) - if tolerances == TOLERANCES_IN_MILLISECONDS: - tolerances = TOLERANCES_IN_BEATS - eval_results = get_evaluation_results( - score_annots, - score_annots_predicted, - total_counts=original_perf_annots_counts, - tolerances=tolerances, - in_seconds=False, - ) if self.input_type == "audio": latency_results = self.get_latency_stats() eval_results.update(latency_results) - # Debug: save warping path TSV, results JSON, and plots if debug and save_dir is not None: + wp_sec = np.array([wp_score, wp_perf_sec]) + sf = self.score_follower save_debug_results( - warping_path=self.score_follower.warping_path, - score_annots=score_annots, + warping_path=wp_sec, + score_annots=score_annots_beats, perf_annots=perf_annots, - perf_annots_predicted=perf_annots_predicted, + 
perf_annots_predicted=transfer_positions( + wp_sec, + score_annots_beats, + frame_rate=1, + domain="performance", + ), eval_results=eval_results, frame_rate=self.frame_rate, save_dir=save_dir, run_name=run_name or "results", - state_space=getattr(self.score_follower, "state_space", None), - ref_features=getattr(self.score_follower, "reference_features", None), - input_features=getattr(self.score_follower, "input_features", None), - distance_func=getattr(self.score_follower, "distance_func", None), + state_space=getattr(sf, "state_space", None), + ref_features=( + getattr(sf, "reference_features", None) + if plot_dist_matrix + else None + ), + input_features=( + getattr(sf, "input_features", None) if plot_dist_matrix else None + ), + distance_func=( + getattr(sf, "distance_func", None) if plot_dist_matrix else None + ), + ref_frame_to_beat=getattr(sf, "_ref_frame_to_beat", None), ) return eval_results - def run(self, verbose: bool = True, wait: bool = True): + def run(self, verbose: bool = True): """ - Run the score following process + Run the score following process. 
Yields ------ @@ -661,16 +671,15 @@ def run(self, verbose: bool = True, wait: bool = True): Returns ------- - list - Alignment results with warping path + np.ndarray + Warping path (2, T) """ with self.stream: + self.stream.stream_start.wait() + t0 = time.time() for current_position in self.score_follower.run(verbose=verbose): - if self.input_type == "audio" and self.method != "audio_outerhmm": - position_in_beat = self._convert_frame_to_beat(current_position) - yield position_in_beat - else: - yield float(self.score_follower.state_space[current_position]) + yield current_position + self.alignment_duration = time.time() - t0 self._has_run = True return self.score_follower.warping_path diff --git a/matchmaker/prob/__init__.py b/matchmaker/prob/__init__.py index c5ab843..122fa75 100644 --- a/matchmaker/prob/__init__.py +++ b/matchmaker/prob/__init__.py @@ -1,7 +1,9 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -Probabilistic methods for music alignment +Probabilistic methods for music alignment. """ -from .hmm import PitchIOIHMM +from .hmm import PitchHMM, PitchIOIHMM +from .outer_product_hmm import OuterProductHMM +from .outer_product_hmm_audio import AudioOuterProductHMM diff --git a/matchmaker/prob/hmm.py b/matchmaker/prob/hmm.py index bfbaf4c..562bf5c 100644 --- a/matchmaker/prob/hmm.py +++ b/matchmaker/prob/hmm.py @@ -22,9 +22,10 @@ from scipy.stats import gumbel_l, norm from matchmaker.base import OnlineAlignment +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END +from matchmaker.utils.errors import MatchmakerMissingParameterError from matchmaker.utils.misc import ( - MatchmakerMissingParameterError, - RECVQueue, get_window_indices, interleave_with_constant, set_latency_stats, @@ -47,6 +48,7 @@ DEFAULT_GUMBEL_AUDIO_SCALE = 0.05 QUEUE_TIMEOUT = 10 + class BaseHMM(HiddenMarkovModel): """ Base class for Hidden Markov Model alignment methods. 
@@ -109,39 +111,37 @@ def __init__( } @property - def warping_path(self) -> NDArrayInt: - return (np.array(self._warping_path).T).astype(np.int32) + def warping_path(self): + return np.array(self._warping_path).T - def __call__(self, input: NDArrayFloat) -> float: + def __call__(self, input: NDArrayFloat, *args, **kwargs) -> float: current_state = self.forward_algorithm_step( observation=input, log_probabilities=False, ) - self._warping_path.append((current_state, self.input_index)) + try: + beat = float(self.state_space[current_state]) + except (ValueError, TypeError): + beat = current_state + self._warping_path.append((beat, self.input_index)) self.input_index += 1 self.current_state = current_state return current_state - def run(self) -> Generator[int, None, NDArrayInt]: + def run(self, verbose: bool = True) -> Generator[int, None, NDArrayInt]: if self.queue is not None: - prev_state = self.current_state - same_state_counter = 0 while self.is_still_following(): - target_feature = self.queue.get() - - current_state = self(target_feature) + item = self.queue.get() + if item is STREAM_END: + break + if item is None: + continue - if current_state == prev_state: - if same_state_counter < self.patience: - same_state_counter += 1 - else: - break - else: - same_state_counter = 0 + current_state = self(item) - yield current_state + yield float(self.state_space[current_state]) return self.warping_path @@ -193,6 +193,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, has_insertions: bool = True, piano_range: bool = True, + **kwargs, ) -> None: """ Initialize the object. @@ -227,6 +228,7 @@ def __init__( reference_features=reference_features, ) self.reference_features = reference_features + self.perf_onset = None ( observation_model, transition_matrix, @@ -282,17 +284,25 @@ def __init__( ) def __call__(self, input, *args, **kwargs): - frame_index = args[0] if args else None - - pitch_obs = input - + # PitchIOIProcessor returns (pitch_array, ioi). 
Use pitch for + # observation, ioi for time tracking. + if isinstance(input, tuple): + pitch_obs, ioi_obs = input + if self.perf_onset is None: + self.perf_onset = 0 + else: + self.perf_onset += ioi_obs + else: + pitch_obs = input + current_state = self.forward_algorithm_step( observation=pitch_obs, log_probabilities=False, ) - self._warping_path.append((current_state, self.input_index)) - self.input_index = self.input_index + 1 if frame_index is None else frame_index - + beat = float(self.state_space[current_state]) + perf_time = getattr(self, "perf_onset", self.input_index) or self.input_index + self._warping_path.append((beat, perf_time)) + self.input_index += 1 self.current_state = current_state return self.current_state @@ -339,7 +349,7 @@ def _build_hmm_modules( transition_matrix = stable_transition_matrix( n_states=len(unique_onsets_s), dist=gumbel_l, - scale=1.0,#0.5, + scale=1.0, # 0.5, inserted_states=inserted_states, ) initial_probabilities = init_dist( @@ -547,7 +557,9 @@ def gumbel_transition_matrix( # TODO check works for audio (parameter) np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale ) else: - transition_matrix[i] = gumbel_l.pdf(np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale) + transition_matrix[i] = gumbel_l.pdf( + np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale + ) # Normalize transition matrix (so that it is a proper stochastic matrix): transition_matrix /= transition_matrix.sum(1, keepdims=True) @@ -555,10 +567,11 @@ def gumbel_transition_matrix( # TODO check works for audio (parameter) # Return the computed transition matrix: return transition_matrix + def stable_transition_matrix( # TODO check works for audio (parameter) n_states: int, mp_trans_state: int = 1, - dist = gumbel_l, + dist=gumbel_l, scale: float = 0.5, inserted_states: bool = False, ) -> NDArrayFloat: @@ -606,7 +619,9 @@ def stable_transition_matrix( # TODO check works for audio (parameter) np.arange(n_states), loc=i + 
mp_trans_state * 2 - 1, scale=scale ) else: - transition_matrix[i] = dist.pdf(np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale) + transition_matrix[i] = dist.pdf( + np.arange(n_states), loc=i + mp_trans_state * 2 - 1, scale=scale + ) # Normalize transition matrix (so that it is a proper stochastic matrix): transition_matrix /= transition_matrix.sum(1, keepdims=True) @@ -614,6 +629,7 @@ def stable_transition_matrix( # TODO check works for audio (parameter) # Return the computed transition matrix: return transition_matrix + def init_dist( n_states: int, dist=gumbel_l, @@ -784,6 +800,7 @@ def compute_discrete_pitch_profiles( return pitch_profiles + # Old version, to be deprecated. def compute_discrete_pitch_profiles_old( chord_pitches: NDArrayFloat, @@ -1131,7 +1148,6 @@ def __init__( self.states = np.arange(len(audio_features)) def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: - pitch_obs, tempo_est = observation if self.current_state is None: @@ -1152,7 +1168,6 @@ def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: obs_prob = pitch_prob * tempo_prob - return obs_prob @@ -1201,7 +1216,6 @@ def __init__( self.states = np.arange(len(audio_features)) def __call__(self, observation: NDArrayFloat) -> NDArrayFloat: - pitch_obs, tempo_est = observation # ioi_idx = self.current_state if self.current_state is not None else 0 @@ -1298,6 +1312,7 @@ def __init__(self, pitch_profiles, ioi_matrix, ioi_precision): ioi_prob_args=ioi_prob_args, ) + class ACCPitchIOIObservationModel(ObservationModel): """ Computes the probabilities that an observation was emitted, i.e. the @@ -1489,6 +1504,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, has_insertions: bool = False, piano_range: bool = False, + **kwargs, ) -> None: """ Initialize the object. 
@@ -1597,6 +1613,7 @@ def __init__( tempo_model=tempo_model, has_insertions=has_insertions, queue=queue, + **kwargs, ) def __call__(self, input, *args, **kwargs): @@ -1615,7 +1632,8 @@ def __call__(self, input, *args, **kwargs): ), log_probabilities=False, ) - self._warping_path.append((current_state, self.input_index)) + score_beat = float(self.state_space[current_state]) + self._warping_path.append((score_beat, self.perf_onset)) self.input_index = self.input_index + 1 if frame_index is None else frame_index if self.current_state is None: @@ -1656,17 +1674,17 @@ def _build_hmm_modules( self, piano_range: bool = False, inserted_states: bool = True, - observation_model = ACCPitchIOIObservationModel, - tempo_model = KalmanTempoModel, + observation_model=ACCPitchIOIObservationModel, + tempo_model=KalmanTempoModel, ): snote_array = self.reference_features - + unique_sonsets = np.unique(snote_array["onset_beat"]) unique_sonset_idxs = [ np.where(snote_array["onset_beat"] == ui)[0] for ui in unique_sonsets ] chord_pitches = [snote_array["pitch"][uix] for uix in unique_sonset_idxs] - + pitch_profiles = compute_discrete_pitch_profiles( chord_pitches=chord_pitches, piano_range=piano_range, @@ -1676,7 +1694,7 @@ def _build_hmm_modules( unique_onsets=unique_sonsets, inserted_states=inserted_states, ) - + observation_model = observation_model( pitch_profiles=pitch_profiles, ioi_matrix=ioi_matrix, @@ -1703,11 +1721,11 @@ def _build_hmm_modules( init_score_onset=unique_sonsets.min(), init_beat_period=60 / 100, ) - + transition_matrix = stable_transition_matrix( n_states=len(ioi_matrix[0]), dist=gumbel_l, - scale=1.0,#0.5, + scale=1.0, # 0.5, inserted_states=inserted_states, ) initial_probabilities = init_dist( @@ -1724,45 +1742,20 @@ def _build_hmm_modules( ) def run(self, verbose: bool = True): - same_state_counter = 0 - empty_counter = 0 - verbose = False - if verbose: - pbar = progressbar.ProgressBar( - maxval=self.n_states, # redirect_stdout=True - ) - pbar.start() - while 
self.is_still_following(): - prev_state = self.current_state - # TODO: check self.queue.get() format. maybe this should actually be a tuple try: - queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) - #features, f_time = queue_input - #print(f'{features=}, {f_time=}') + item = self.queue.get(timeout=QUEUE_TIMEOUT) except: break - #TODO: try MidiStream.return_midi_messages = True + if item is STREAM_END: + break + if item is None: + continue - if queue_input is not None: - #print(f'pitch_ioi: {queue_input=}') - current_state = self.__call__(queue_input) - empty_counter = 0 - if current_state == prev_state: - if same_state_counter < self.patience: - same_state_counter += 1 - else: - break - else: - same_state_counter = 0 - + current_state = self.__call__(item) - if verbose: - pbar.update(int(current_state)) - yield current_state + yield float(self.state_space[current_state]) - if verbose: - pbar.finish() return self.warping_path @@ -1938,6 +1931,8 @@ def run(self, verbose: bool = True): while self.is_still_following(): prev_state = self.current_state queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) + if queue_input is STREAM_END: + break features, f_time = queue_input self.last_queue_update = time.time() self.input_features = ( @@ -1961,7 +1956,7 @@ def run(self, verbose: bool = True): ) if verbose: pbar.update(int(current_state)) - yield current_state + yield float(self.state_space[current_state]) if verbose: pbar.finish() @@ -1987,6 +1982,7 @@ def __init__( initial_probabilities: Optional[np.ndarray] = None, state_space: Optional[NDArray] = None, patience: int = 200, + **kwargs, ) -> None: """ Initialize the object. 
@@ -2090,7 +2086,6 @@ def __init__( self.input_features = None self.distance_func = "Euclidean" - BaseHMM.__init__( self, observation_model=observation_model, @@ -2153,6 +2148,8 @@ def run(self, verbose: bool = True): while self.is_still_following(): prev_state = self.current_state queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) + if queue_input is STREAM_END: + break features, f_time = queue_input self.last_queue_update = time.time() self.input_features = ( @@ -2176,7 +2173,7 @@ def run(self, verbose: bool = True): ) if verbose: pbar.update(int(current_state)) - yield current_state + yield float(self.state_space[current_state]) if verbose: pbar.finish() diff --git a/matchmaker/prob/outer_product_hmm.py b/matchmaker/prob/outer_product_hmm.py index adb3d57..680aa70 100644 --- a/matchmaker/prob/outer_product_hmm.py +++ b/matchmaker/prob/outer_product_hmm.py @@ -4,7 +4,8 @@ from numpy.typing import NDArray from matchmaker.base import OnlineAlignment -from matchmaker.utils.misc import RECVQueue +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END try: # import the compiled function (name depends on your .pyx) @@ -13,7 +14,6 @@ viterbi_step_cy = None import numpy as np - from partitura.score import Part, Score, ScoreLike NDArrayFloat = NDArray[np.float32] @@ -216,6 +216,7 @@ def __init__( r: Optional[np.ndarray] = None, other_prob: float = 1e-6, patience: int = 10, + **kwargs, ) -> None: """ Outer-product Hidden Markov Model for score following. 
@@ -285,15 +286,18 @@ def __init__( ) self.current_state = 0 + self.input_index = 0 + self.perf_time = 0.0 self._warping_path = [] self._current_chord = np.zeros(88, dtype=int) self.patience = patience - self.state_probabilities = np.ones(self.n_states) / self.n_states + self.state_probabilities = np.zeros(self.n_states) + self.state_probabilities[0] = 1.0 self.is_first_observation = True @property - def warping_path(self) -> NDArrayInt: - return (np.array(self._warping_path).T).astype(np.int32) + def warping_path(self) -> NDArray: + return np.array(self._warping_path).T def is_still_following(self) -> bool: if self.current_state is not None: @@ -305,17 +309,22 @@ def __call__( self, input: tuple[np.ndarray, float], *args, **kwargs ) -> Optional[int]: pitch_obs, ioi = input + self.perf_time += ioi if ioi < IOI_THRESHOLD: self._current_chord = np.maximum(self._current_chord, pitch_obs) - return self.current_state + # Return None to signal "chord continuation" (no state advance); + # run() uses this to skip the patience counter. + return None else: self._current_chord = pitch_obs self.state_probabilities = self.viterbi_step( self.state_probabilities, self._current_chord ) self.current_state = np.argmax(self.state_probabilities) - self._warping_path.append(self.current_state) + score_beat = float(self.state_space[self.current_state]) + self._warping_path.append((score_beat, self.perf_time)) + self.input_index += 1 return self.current_state @@ -337,8 +346,10 @@ def compute_obs_likelihood( b[i] = likelihood of observing `observation` at state i. 
""" - b = self.b_table[:, 21:109] * observation - return b + log_b = np.log(np.maximum(self.b_table[:, 21:109], 1e-300)) # (N, 88) + log_em = log_b @ observation # (N,): log-product over active pitches + log_em -= log_em.max() # shift for numerical stability + return np.exp(log_em) # (N,) # Viterbi update def viterbi_step( @@ -392,8 +403,8 @@ def viterbi_step( local_max = val skip_contrib = self.r[i] * global_skip_max - new_probs[i] = sum( - b[i] * (skip_contrib if skip_contrib >= local_max else local_max) + new_probs[i] = b[i] * ( + skip_contrib if skip_contrib >= local_max else local_max ) if np.sum(new_probs) > 0: new_probs /= np.sum(new_probs) @@ -417,8 +428,13 @@ def run( prev_state = self.current_state queue_input = self.queue.get() + if queue_input is STREAM_END: + break if queue_input is not None: current_state = self(queue_input) + if current_state is None: + # Chord continuation: no state advance, skip patience. + continue empty_counter = 0 if current_state == prev_state: if same_state_counter < self.patience: @@ -429,10 +445,8 @@ def run( same_state_counter = 0 if verbose: - # current_state may be None (no state yet); guard the update - if current_state is not None: - pbar.update(int(current_state)) - yield current_state + pbar.update(int(current_state)) + yield float(self.state_space[current_state]) if verbose: pbar.finish() diff --git a/matchmaker/prob/outer_product_hmm_audio.py b/matchmaker/prob/outer_product_hmm_audio.py index e6dc3b9..25d12dc 100644 --- a/matchmaker/prob/outer_product_hmm_audio.py +++ b/matchmaker/prob/outer_product_hmm_audio.py @@ -8,32 +8,28 @@ from partitura.score import Part, Score, ScoreLike from matchmaker.base import OnlineAlignment -from matchmaker.features.audio import QUEUE_TIMEOUT -from matchmaker.utils.misc import RECVQueue, set_latency_stats +from matchmaker.io.audio import QUEUE_TIMEOUT +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END +from matchmaker.utils.misc import 
set_latency_stats NDArrayFloat = NDArray[np.float32] NDArrayInt = NDArray[np.int32] -DEFAULT_PITCH_ERROR_PROBS = { - "correct_pitch_prob": 0.9497, - "semi_tone_error_prob": 0.0145 / 2.0, - "whole_tone_error_prob": 0.0224 / 2.0, - "octave_error_prob": 0.0047 / 2.0, - "within_one_octave_error_prob": 0.0086 / 9.0 / 2.0, -} - -# DEFAULT_TRANSITIONS = [ -# (1, 1.0), # normal (i→i+1) -# (2, 1e-50), # deletion (i→i+2), HHMMState_simple.hpp: log10(-50) -# ] +# Nakamura et al. 2016 Section IV-B experimental parameters. +# Neighbourhood transitions a^{(nbh)}_{j,i} from nakamura_data.py: +# These are the "small transition probabilities" for the banded structure. +# Paper: a_{i,i} = 0 (self-transition handled by bottom HMM a00), +# a_{i,i+2} = 1e-50 (deletion, effectively 0). +# We use the empirical values from [13] (Nakamura JNMR 2014). DEFAULT_TRANSITIONS = [ - (-3, 0.001), - (-2, 0.001), - (-1, 0.002), - (0, 0.01342), - (1, 0.96), - (2, 0.01), - (3, 0.002), + (-3, 0.00509), + (-2, 0.00516), + (-1, 0.00886), + (0, 0.01342), # insertion (staying at same top state) + (1, 0.94531), # normal forward progression + (2, 0.00610), # deletion (skip one note) + (3, 0.00073), ] DEFAULT_D1 = 3 @@ -43,8 +39,11 @@ _FLUX_EXIT_BOOST: float = 1.0 _OTHER_PROB: float = 1e-6 -_PAUSE_ENTRY_PROB: float = 0.01 # probability of entering pause state from sound -_PAUSE_DURATION_SEC: float = 0.5 +# Paper IV-B: +# a_{0,1}^{(i)} = 1e-100 (pause entry: almost never enter pause) +# a_{1,1}^{(i)} = 0.999 (pause self-transition: once in pause, stay) +_PAUSE_ENTRY_PROB: float = 1e-100 +_PAUSE_SELF_TRANSITION: float = 0.999 _PAUSE_EMISSION_MAX: float = 1e-3 @@ -127,11 +126,13 @@ def __init__( reference_features: np.ndarray, queue: Optional[RECVQueue] = None, transitions: Optional[List[tuple[int, float]]] = None, - pitch_error_probs: Optional[dict[str, float]] = None, patience: int = 0, tempo: float = 120.0, sample_rate: int = 16000, hop_length: int = 320, + s_j: float = 1e-5, + r_i: Optional[np.ndarray] = 
None, + **kwargs, ) -> None: self.reference_features = reference_features OnlineAlignment.__init__( @@ -173,16 +174,10 @@ def __init__( self.transitions = ( transitions if transitions is not None else DEFAULT_TRANSITIONS ) - self.pitch_error_probs = ( - pitch_error_probs - if pitch_error_probs is not None - else DEFAULT_PITCH_ERROR_PROBS - ) self.other_prob = _OTHER_PROB self.sample_rate = int(sample_rate) self.hop_length = int(hop_length) self.pause_entry_prob = _PAUSE_ENTRY_PROB - self.pause_duration_sec = _PAUSE_DURATION_SEC self.pause_emission_max = _PAUSE_EMISSION_MAX # Transition setup with banded structure @@ -202,7 +197,23 @@ def __init__( row_sums = self.alpha.sum(axis=1, keepdims=True) self.alpha = self.alpha / row_sums - self.current_state = 0 + # Repeat/skip factorization (Nakamura Eq.11): + # a_{j,i} = a^{(nbh)}_{j,i} + s_j * r_i + # s_j: probability of stopping at event j before a repeat/skip + # r_i: probability of resuming at event i after a repeat/skip + self.S = np.full(self.n_states, float(s_j), dtype=float) + if r_i is not None: + self.r = np.asarray(r_i, dtype=float) + else: + self.r = np.ones(self.n_states, dtype=float) / self.n_states + + # Renormalize alpha to account for s_j mass: + # Σ_i a_{j,i} = Σ_i a^{(nbh)}_{j,i} + s_j * Σ_i r_i = 1 + # => Σ_i a^{(nbh)}_{j,i} = 1 - s_j (since Σ_i r_i = 1) + if s_j > 0: + self.alpha = self.alpha * (1.0 - self.S[:, None]) + + self.current_state_index = 0 self._warping_path = [] self._current_chord = np.zeros(88, dtype=int) self.patience = int(patience) @@ -223,13 +234,8 @@ def __init__( tempo=tempo, frame_rate=frame_rate, ) - self.a11 = float( - np.clip( - self._pause_self_transition_prob(self.pause_duration_sec, frame_rate), - 0.0, - 1.0, - ) - ) + # Paper IV-B: a_{1,1}^{(i)} = 0.999 (pause self-transition) + self.a11 = float(_PAUSE_SELF_TRANSITION) # Pause entry prob a01 (II-E) move_prob = 1.0 - self.a00 p_pause = float(np.clip(self.pause_entry_prob, 0.0, 1.0)) @@ -243,9 +249,21 @@ def __init__( 
self.e0 = np.clip(1.0 - self.a00 - self.a01, 1e-10, 1.0) self.e1 = float(np.clip(1.0 - self.a11, 1e-10, 1.0)) + # Precompute alpha diagonals and sliding window indices for vectorized forward_step + self._alpha_diags = [] + for d in range(-self.D2, self.D1 + 1): + self._alpha_diags.append(np.diagonal(self.alpha, offset=-d).copy()) + self._j_starts = np.maximum(0, np.arange(self.n_states) - self.D2) + self._j_ends = np.minimum(self.n_states, np.arange(self.n_states) + self.D1 + 1) + + @property + def warping_path(self) -> np.ndarray: + return np.array(self._warping_path).T + @property - def warping_path(self) -> NDArrayInt: - return (np.array(self._warping_path).T).astype(np.int32) + def current_position(self) -> float: + """Current score position in beats.""" + return float(self.state_space[self.current_state_index]) @staticmethod def _pause_self_transition_prob( @@ -286,8 +304,8 @@ def _compute_chord_self_transition_probs( return np.clip(1.0 - 1.0 / d_i, 1e-6, 1.0 - 1e-6) def is_still_following(self) -> bool: - if self.current_state is not None: - return self.current_state <= self.n_states - 1 + if self.current_state_index is not None: + return self.current_state_index < self.n_states - 1 return False def __call__(self, input, *args, **kwargs) -> Optional[int]: @@ -321,10 +339,12 @@ def __call__(self, input, *args, **kwargs) -> Optional[int]: top_scores = probs[0::2] + probs[1::2] new_top = int(np.argmax(top_scores)) - self.current_state = new_top - self._warping_path.append((self.current_state, self.input_index)) + self.current_state_index = new_top + self._warping_path.append( + (float(self.state_space[self.current_state_index]), self.input_index) + ) self.input_index += 1 - return self.current_state + return self.current_state_index def compute_obs_likelihood( self, @@ -390,14 +410,31 @@ def forward_step( prev_sound = np.asarray(prev_probs[0::2], dtype=float) prev_pause = np.asarray(prev_probs[1::2], dtype=float) - # Emission - emit_sound = 
self.compute_obs_likelihood(observation) - emit_pause_scalar = self._compute_pause_emission(observation) + # --- Single _preprocess_obs call + inline emission computation --- + obs = _preprocess_obs(observation) + cqt = np.maximum(obs[:88] if obs.size >= 88 else obs, 0.0) + cqt_sum = cqt.sum() + + # Sound emission (from compute_obs_likelihood) + if cqt_sum <= 0: + emit_sound = np.full(N, 1e-300, dtype=float) + else: + cqt_norm = cqt / cqt_sum + em = self.chord_harmonic_mask @ cqt_norm + emit_sound = np.maximum(np.nan_to_num(em, nan=1e-12), 1e-12) + + # Pause emission (from _compute_pause_emission) + if cqt_sum <= 0: + emit_pause_scalar = min(1.0, self.pause_emission_max) + else: + var = float(np.var(cqt / cqt_sum)) + emit_pause_scalar = min( + max(1.0 / (1.0 + 200.0 * var), 1e-300), self.pause_emission_max + ) emit_pause = np.full(N, emit_pause_scalar, dtype=float) # Spectral-flux-driven exit boost - obs_flat = _preprocess_obs(observation) - flux = float(obs_flat[88]) if obs_flat.size > 88 else 0.0 + flux = float(obs[88]) if obs.size > 88 else 0.0 f = flux / (flux + 1.0) # [0,1) boost = 1.0 + _FLUX_EXIT_BOOST * f e0 = np.clip(self.e0 * boost, 1e-10, 1.0 - self.a01 - 1e-10) @@ -406,18 +443,27 @@ def forward_step( # Exit masses from each top state j (Eq.(6)) exit_mass = prev_sound * e0 + prev_pause * self.e1 # (N,) - # Compute neigh_sum_i for each i (banded, Eq.(9)) - neigh_sum = np.zeros(N, dtype=float) - for i in range(N): - j_start = max(0, i - self.D2) - j_end = min(N, i + self.D1 + 1) - ssum = 0.0 - for j in range(j_start, j_end): - a = float(self.alpha[j, i]) - if a <= 0: - continue - ssum += exit_mass[j] * a - neigh_sum[i] = ssum + # --- Vectorized neighbourhood sum (replaces O(N*(D1+D2)) Python loop) --- + # Global skip term: Σ_j exit_mass[j] * S[j] (O(N), computed once) + global_skip_sum = float(np.dot(exit_mass, self.S)) + + # Local neighbourhood transition: sum over diagonals of alpha + local_nbh = np.zeros(N, dtype=float) + for k, d in 
enumerate(range(-self.D2, self.D1 + 1)): + diag = self._alpha_diags[k] + L = len(diag) + src = max(0, d) # source index offset in exit_mass + dst = max(0, -d) # destination index offset in local_nbh + local_nbh[dst : dst + L] += exit_mass[src : src + L] * diag + + # Local skip via cumsum sliding window: O(N) instead of O(N*D) + eS = exit_mass * self.S + cumsum_eS = np.empty(N + 1, dtype=float) + cumsum_eS[0] = 0.0 + np.cumsum(eS, out=cumsum_eS[1:]) + local_skip = cumsum_eS[self._j_ends] - cumsum_eS[self._j_starts] + + neigh_sum = local_nbh + self.r * (global_skip_sum - local_skip) # Within-top bottom transitions within_sound = prev_sound * a00 @@ -448,21 +494,27 @@ def run( same_state_counter = 0 empty_counter = 0 if verbose: - pbar = progressbar.ProgressBar(maxval=self.n_states) + pbar = progressbar.ProgressBar( + maxval=len(self.state_space), + redirect_stdout=True, + redirect_stderr=True, + ) pbar.start() while self.is_still_following(): - prev_state = self.current_state + prev_state = self.current_state_index try: queue_input = self.queue.get(timeout=QUEUE_TIMEOUT) except Empty: break + if queue_input is STREAM_END: + break self.last_queue_update = time.time() if queue_input is not None: - current_state = self(queue_input) + self(queue_input) empty_counter = 0 - if current_state == prev_state: + if self.current_state_index == prev_state: if self.patience > 0: if same_state_counter < self.patience: same_state_counter += 1 @@ -472,13 +524,12 @@ def run( same_state_counter = 0 if verbose: - if current_state is not None: - pbar.update(int(current_state) + 1) # states starts with 0 + pbar.update(self.current_state_index + 1) latency = time.time() - self.last_queue_update self.latency_stats = set_latency_stats( latency, self.latency_stats, self.input_index ) - yield current_state + yield self.current_position if verbose: pbar.finish() diff --git a/matchmaker/utils/errors.py b/matchmaker/utils/errors.py new file mode 100644 index 0000000..2cf055d --- /dev/null +++ 
b/matchmaker/utils/errors.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Custom exception classes for matchmaker. +""" + +from typing import Iterable, List, Union + + +class MatchmakerInvalidParameterTypeError(Exception): + """Error for flagging an invalid parameter type.""" + + def __init__( + self, + parameter_name: str, + required_parameter_type: Union[type, Iterable[type]], + actual_parameter_type: type, + *args, + ) -> None: + if isinstance(required_parameter_type, Iterable): + rqpt = ", ".join([f"{pt}" for pt in required_parameter_type]) + else: + rqpt = required_parameter_type + message = f"`{parameter_name}` was expected to be {rqpt}, but is {actual_parameter_type}" + super().__init__(message, *args) + + +class MatchmakerInvalidOptionError(Exception): + """Error for invalid option.""" + + def __init__(self, parameter_name, valid_options, value, *args) -> None: + rqop = ", ".join([f"{op}" for op in valid_options]) + message = f"`{parameter_name}` was expected to be in {rqop}, but is {value}" + super().__init__(message, *args) + + +class MatchmakerMissingParameterError(Exception): + """Error for flagging a missing parameter.""" + + def __init__(self, parameter_name: Union[str, List[str]], *args) -> None: + if isinstance(parameter_name, Iterable) and not isinstance(parameter_name, str): + message = ", ".join([f"`{pn}`" for pn in parameter_name]) + message = f"{message} were not given" + else: + message = f"`{parameter_name}` was not given." 
+ super().__init__(message, *args) diff --git a/matchmaker/utils/eval.py b/matchmaker/utils/eval.py index 20e5192..5d0c313 100644 --- a/matchmaker/utils/eval.py +++ b/matchmaker/utils/eval.py @@ -1,5 +1,3 @@ -from typing import TypedDict, Union - import numpy as np import scipy @@ -11,159 +9,97 @@ def transfer_positions( wp, ref_anns, frame_rate, - reverse=False, *, - mode: str = "auto", - reducer: str = "min", - state_offset: Union[int, str] = "auto", - output: str = "seconds", + domain: str = "score", + aggregation_func=None, ): """ - Transfer the positions of the reference annotations to the target annotations using the warping path. - - This function supports two common warping-path conventions: - - - **frame mode** (classic DTW-style): wp[0] and wp[1] are frame indices for reference/target features. - - **state mode** (HMM/score-state): wp[0] contains *reference state indices* and wp[1] contains *target frame indices*. + Transfer positions between score and performance using the warping path. Parameters ---------- wp : np.array with shape (2, T) - array of warping path. - warping_path[0] is the index of the reference (score) feature and warping_path[1] is the index of the target(input) feature. - ref_ann : List[float] - In **frame mode**, reference annotations in seconds. - In **state mode**, a sequence whose length equals the number of reference states (e.g., score unique_onsets); - the values are not used except for determining the number of states. + Warping path. wp[0] = score beats, wp[1] = performance frame indices. + ref_anns : array-like + Query positions (seconds for domain="score", + beats for domain="performance"). frame_rate : int - frame rate of the audio. - reverse : bool - If True, swap the direction (target -> reference). - mode : {"auto", "frame", "state"} - Warping-path convention. "auto" picks "state" when wp[0] looks like small discrete state indices. 
- reducer : {"min", "max", "median", "mean"} - In **state mode**, how to select a single representative target frame for each state when multiple wp entries - map to the same state. - state_offset : {"auto"} or int - In **state mode**, wp[0] may start at 0 or 1 (or have a leading start-state). "auto" chooses the offset that - best matches the expected number of states. - output : {"seconds", "frames"} - Return unit. "seconds" divides frames by frame_rate; "frames" returns frame indices. + Frame rate of the audio. + domain : {"score", "performance"} + Domain of the output. + "score": perf→score lookup. Given performance times (seconds), + return predicted score positions (beats). + "performance": score→perf lookup. Given score beats, return + predicted performance times (seconds). + aggregation_func : callable or None + Function to aggregate multiple values sharing the same key + (e.g., np.max, np.min, np.mean). If None, defaults to: + - domain="score": last entry in temporal order (tracker's + final decision at that frame) + - domain="performance": np.min (earliest arrival at that beat, + i.e. first-crossing rule) Returns ------- - predicted_targets : np.array with shape (T,) - Predicted target positions (seconds or frames depending on output). + predicted : np.array + Predicted positions in the target domain. """ - if output not in {"seconds", "frames"}: - raise ValueError(f"Invalid output={output!r}. Use 'seconds' or 'frames'.") - - if reverse: - x, y = wp[1], wp[0] - else: - x, y = wp[0], wp[1] - - if mode not in {"auto", "frame", "state"}: - raise ValueError(f"Invalid mode={mode!r}. Use 'auto', 'frame', or 'state'.") - - # Heuristic: state paths have small discrete indices (often << target frames), - # while frame paths typically cover most reference frames (unique count is large). 
- if mode == "auto": - x_unique = np.unique(x) - n_ref = len(ref_anns) - looks_like_state = (x_unique.size <= max(4, 2 * n_ref)) and ( - int(np.max(x)) <= max(10, 5 * n_ref) - ) - mode = "state" if looks_like_state else "frame" - - if mode == "frame": - # Causal nearest neighbor interpolation (reference seconds -> reference frames -> target frames) - ref_anns_frame = np.round(np.asarray(ref_anns) * frame_rate) - predicted_targets = np.ones(len(ref_anns_frame), dtype=float) * np.nan - - for i, r in enumerate(ref_anns_frame): - # 1) Scan all x values less than or equal to r and find the largest x value - past_indices = np.where(x <= r)[0] - if past_indices.size > 0: - # Find indices corresponding to the largest x value - max_x_val = x[past_indices[-1]] - max_x_indices = np.where(x == max_x_val)[0] - - # 2) Among all y values mapped to this x value, select the minimum y value - corresponding_y_values = y[max_x_indices] - predicted_targets[i] = float(np.min(corresponding_y_values)) - - if output == "frames": - return predicted_targets - return np.asarray(predicted_targets) / frame_rate - - # mode == "state" - # Goal: for each reference state index, select representative target frame from wp. - num_states = len(ref_anns) - predicted_frames = np.ones(num_states, dtype=float) * np.nan - - x_int = np.asarray(x, dtype=int) - y_int = np.asarray(y, dtype=int) - - if reducer not in {"min", "max", "median", "mean"}: - raise ValueError( - f"Invalid reducer={reducer!r}. Use 'min', 'max', 'median', or 'mean'." - ) - - if state_offset == "auto": - # Choose offset that maximizes overlap between expected states and observed wp state indices. 
- observed = np.unique(x_int) - candidates = [] - for off in (0, 1, int(np.min(x_int))): - if off not in candidates: - candidates.append(off) - best_off = candidates[0] - best_overlap = -1 - for off in candidates: - expected = np.arange(off, off + num_states, dtype=int) - overlap = np.intersect1d(observed, expected).size - if overlap > best_overlap: - best_overlap = overlap - best_off = off - offset = best_off + if domain not in {"score", "performance"}: + raise ValueError(f"Invalid domain={domain!r}. Use 'score' or 'performance'.") + + wp_score = wp[0].astype(float) + wp_perf = wp[1].astype(float) + queries = np.asarray(ref_anns, dtype=float) + + def _last(arr): + return arr[-1] + + if aggregation_func is None: + aggregation_func = _last if domain == "score" else np.min + + if domain == "score": + # Perf → Score: "at perf time t, what is the tracker's score position?" + # Group by perf frame, take the last entry by default (tracker's final decision). + query_frames = queries * frame_rate + + sort_idx = np.argsort(wp_perf, kind="stable") + wp_perf_sorted = wp_perf[sort_idx] + wp_score_sorted = wp_score[sort_idx] + + unique_frames, first_idx = np.unique(wp_perf_sorted, return_index=True) + reduced_scores = np.empty(len(unique_frames)) + for g in range(len(unique_frames)): + start = first_idx[g] + end = ( + first_idx[g + 1] if g + 1 < len(unique_frames) else len(wp_score_sorted) + ) + reduced_scores[g] = aggregation_func(wp_score_sorted[start:end]) + + # unique_frames is monotonic → searchsorted for last frame ≤ query + indices = np.searchsorted(unique_frames, query_frames, side="right") - 1 + predicted = np.full(len(queries), np.nan) + valid = indices >= 0 + predicted[valid] = reduced_scores[indices[valid]] + return predicted else: - offset = int(state_offset) - - for s in range(num_states): - wp_state = s + offset - idx = np.where(x_int == wp_state)[0] - if idx.size == 0: - continue - vals = y_int[idx].astype(float) - if reducer == "min": - predicted_frames[s] = 
float(np.min(vals)) - elif reducer == "max": - predicted_frames[s] = float(np.max(vals)) - elif reducer == "median": - predicted_frames[s] = float(np.median(vals)) - else: # mean - predicted_frames[s] = float(np.mean(vals)) - - if output == "frames": - return predicted_frames - return predicted_frames / frame_rate - - -def transfer_from_score_to_predicted_perf(wp, score_annots, frame_rate, mode="auto"): - predicted_perf_idx = transfer_positions( - wp, - score_annots, - frame_rate, - mode=mode, - ) - return predicted_perf_idx - - -def transfer_from_perf_to_predicted_score(wp, perf_annots, frame_rate, mode="auto"): - predicted_score_idx = transfer_positions( - wp, perf_annots, frame_rate, reverse=True, mode=mode - ) - return predicted_score_idx + # Score → Perf: "when did the tracker first reach beat b?" + # Group by score position, aggregate perf frame values per group. + sort_idx = np.argsort(wp_score, kind="stable") + wp_score_sorted = wp_score[sort_idx] + wp_perf_sorted = wp_perf[sort_idx] + + unique_beats, first_idx = np.unique(wp_score_sorted, return_index=True) + reduced_perf = np.empty(len(unique_beats)) + for g in range(len(unique_beats)): + start = first_idx[g] + end = first_idx[g + 1] if g + 1 < len(unique_beats) else len(wp_perf_sorted) + reduced_perf[g] = aggregation_func(wp_perf_sorted[start:end]) + + indices = np.searchsorted(unique_beats, queries, side="left") + predicted = np.full(len(queries), np.nan) + valid = indices < len(unique_beats) + predicted[valid] = reduced_perf[indices[valid]] + return predicted / frame_rate def get_evaluation_results( @@ -171,23 +107,25 @@ def get_evaluation_results( predicted_annots, total_counts, tolerances=TOLERANCES_IN_MILLISECONDS, - pcr_threshold=2_000, # 2 seconds in_seconds=True, ): if in_seconds: - errors_in_delay = (gt_annots - predicted_annots) * 1000 # in milliseconds + errors_in_delay = (gt_annots - predicted_annots) * 1000 else: errors_in_delay = gt_annots - predicted_annots - filtered_errors_in_delay = 
errors_in_delay[np.abs(errors_in_delay) <= pcr_threshold] - filtered_abs_errors_in_delay = np.abs(filtered_errors_in_delay) + abs_errors_in_delay = np.abs(errors_in_delay) results = { - "mean": float(f"{np.nanmean(filtered_abs_errors_in_delay):.4f}"), - "median": float(f"{np.nanmedian(filtered_abs_errors_in_delay):.4f}"), - "std": float(f"{np.nanstd(filtered_abs_errors_in_delay):.4f}"), - "skewness": float(f"{scipy.stats.skew(filtered_errors_in_delay):.4f}"), - "kurtosis": float(f"{scipy.stats.kurtosis(filtered_errors_in_delay):.4f}"), + "mean": float(f"{np.nanmean(abs_errors_in_delay):.4f}"), + "median": float(f"{np.nanmedian(abs_errors_in_delay):.4f}"), + "std": float(f"{np.nanstd(abs_errors_in_delay):.4f}"), + "skewness": float( + f"{scipy.stats.skew(errors_in_delay, nan_policy='omit'):.4f}" + ), + "kurtosis": float( + f"{scipy.stats.kurtosis(errors_in_delay, nan_policy='omit'):.4f}" + ), } if in_seconds: @@ -201,6 +139,72 @@ def get_evaluation_results( f"{np.sum(np.abs(errors_in_delay) <= tau) / total_counts:.4f}" ) - results["pcr"] = float(f"{len(filtered_errors_in_delay) / total_counts:.4f}") - results["count"] = len(filtered_abs_errors_in_delay) return results + + +def evaluate_alignment( + wp_score, + wp_perf_sec, + gt_score_beats, + gt_perf_sec, + beat_tolerances=TOLERANCES_IN_BEATS, + ms_tolerances=TOLERANCES_IN_MILLISECONDS, +): + """Evaluate alignment quality in both beat and ms domains. + + Parameters + ---------- + wp_score : np.ndarray + Warping path score axis (beats). + wp_perf_sec : np.ndarray + Warping path performance axis (seconds). + gt_score_beats : np.ndarray + Ground truth score positions (beats). + gt_perf_sec : np.ndarray + Ground truth performance times (seconds). + beat_tolerances : list + Tolerance thresholds for beat error. + ms_tolerances : list + Tolerance thresholds for ms error. 
+ + Returns + ------- + dict + {"beat": {...}, "ms": {...}} + """ + # GT interpolator: score beat → perf time + valid = np.isfinite(gt_perf_sec) & np.isfinite(gt_score_beats) + gt_interp = scipy.interpolate.interp1d( + gt_score_beats[valid], + gt_perf_sec[valid], + bounds_error=False, + fill_value=np.nan, + ) + + # Beat metrics: perf→score prediction + wp_sec = np.array([wp_score, wp_perf_sec]) + score_predicted = transfer_positions( + wp_sec, + gt_perf_sec, + frame_rate=1, + domain="score", + ) + n_valid = min(len(gt_score_beats), len(score_predicted)) + beat_results = get_evaluation_results( + gt_score_beats[:n_valid], + score_predicted[:n_valid], + total_counts=len(gt_score_beats), + tolerances=beat_tolerances, + in_seconds=False, + ) + + # Ms metrics: score→perf prediction + gt_perf_for_wp = gt_interp(wp_score) + ms_results = get_evaluation_results( + gt_perf_for_wp, + wp_perf_sec, + total_counts=len(wp_score), + tolerances=ms_tolerances, + ) + + return {"beat": beat_results, "ms": ms_results} diff --git a/matchmaker/utils/misc.py b/matchmaker/utils/misc.py index ce58269..5bb2ad2 100644 --- a/matchmaker/utils/misc.py +++ b/matchmaker/utils/misc.py @@ -9,8 +9,7 @@ import re import xml.etree.ElementTree as ET from pathlib import Path -from queue import Empty, Queue -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Dict, Optional, Union import librosa import mido @@ -146,53 +145,6 @@ def extract_tempo_marking_from_musicxml( return None -class MatchmakerInvalidParameterTypeError(Exception): - """ - Error for flagging an invalid parameter type. 
- """ - - def __init__( - self, - parameter_name: str, - required_parameter_type: Union[type, Iterable[type]], - actual_parameter_type: type, - *args, - ) -> None: - if isinstance(required_parameter_type, Iterable): - rqpt = ", ".join([f"{pt}" for pt in required_parameter_type]) - else: - rqpt = required_parameter_type - message = f"`{parameter_name}` was expected to be {rqpt}, but is {actual_parameter_type}" - - super().__init__(message, *args) - - -class MatchmakerInvalidOptionError(Exception): - """ - Error for invalid option. - """ - - def __init__(self, parameter_name, valid_options, value, *args) -> None: - rqop = ", ".join([f"{op}" for op in valid_options]) - message = f"`{parameter_name}` was expected to be in {rqop}, but is {value}" - - super().__init__(message, *args) - - -class MatchmakerMissingParameterError(Exception): - """ - Error for flagging a missing parameter - """ - - def __init__(self, parameter_name: Union[str, List[str]], *args) -> None: - if isinstance(parameter_name, Iterable) and not isinstance(parameter_name, str): - message = ", ".join([f"`{pn}`" for pn in parameter_name]) - message = f"{message} were not given" - else: - message = f"`{parameter_name}` was not given." - super().__init__(message, *args) - - def ensure_rng( seed: Union[numbers.Integral, np.random.RandomState], ) -> np.random.RandomState: @@ -224,33 +176,6 @@ def ensure_rng( ) -class RECVQueue(Queue): - """ - Queue with a recv method (like Pipe) - - This class uses python's Queue.get with a timeout makes it interruptable via KeyboardInterrupt - and even for the future where that is possibly out-dated, the interrupt can happen after each timeout - so periodically query the queue with a timeout of 1s each attempt, finding a middleground - between busy-waiting and uninterruptable blocked waiting - """ - - def __init__(self) -> None: - Queue.__init__(self) - - def recv(self) -> Any: - """ - Return and remove an item from the queue. 
- """ - while True: - try: - return self.get(timeout=1) - except Empty: # pragma: no cover - pass - - def poll(self) -> bool: - return self.empty() - - def get_window_indices(indices: np.ndarray, context: int) -> np.ndarray: # Create a range array from -context to context (inclusive) range_array = np.arange(-context, context + 1) @@ -431,7 +356,7 @@ def adjust_tempo_for_performance_file( ): """ Adjust the tempo of the score part to match the performance file. - We round up the tempo to the nearest 20 bpm to avoid too much optimization. + We round the tempo to the nearest 10 bpm to avoid too much optimization. Parameters ---------- @@ -449,9 +374,7 @@ def adjust_tempo_for_performance_file( else: target_length = librosa.get_duration(path=str(performance_file)) ratio = target_length / source_length - rounded_tempo = int( - (default_tempo / ratio + 19) // 20 * 20 - ) # round up to nearest 20 + rounded_tempo = int(round(default_tempo / ratio / 10) * 10) # round to nearest 10 print( f"default tempo: {default_tempo} (score length: {source_length}) -> adjusted_tempo: {rounded_tempo} (perf length: {target_length})" ) @@ -521,6 +444,15 @@ def save_nparray_to_csv(array: NDArray, save_path: str): writer.writerows(array) +def _beats_to_frames( + beats: np.ndarray, + ref_frame_to_beat: np.ndarray, +) -> np.ndarray: + """Convert beat positions to (float) frame indices via inverse interpolation.""" + frames = np.arange(len(ref_frame_to_beat), dtype=float) + return np.interp(beats, ref_frame_to_beat, frames) + + def plot_alignment( warping_path: np.ndarray, perf_annots: np.ndarray, @@ -533,92 +465,93 @@ def plot_alignment( ref_features: Optional[np.ndarray] = None, input_features: Optional[np.ndarray] = None, distance_func=None, + ref_frame_to_beat: Optional[np.ndarray] = None, ): - """Plot warping path, GT annotations, and predicted points in one figure. - - Layers (back to front): distance matrix → warping path → predicted → GT. 
- """ + """Plot warping path, GT annotations, and predicted points.""" save_dir.mkdir(parents=True, exist_ok=True) gt = np.asarray(perf_annots, dtype=float) pred = np.asarray(perf_annots_predicted, dtype=float) n = min(len(gt), len(pred)) gt, pred = gt[:n], pred[:n] - has_dist_matrix = ( + fig, ax = plt.subplots(figsize=(30, 30)) + + # Distance matrix background + show_dist = False + if ( ref_features is not None and input_features is not None and distance_func is not None - ) + ): + try: + if isinstance(distance_func, str): + dist = scipy.spatial.distance.cdist( + ref_features, input_features, metric=distance_func + ) + else: + dist = np.array( + [ + [distance_func(r, i) for i in input_features] + for r in ref_features + ], + dtype=np.float32, + ) + n_input = input_features.shape[0] + n_ref = ref_features.shape[0] + ax.imshow( + dist, + aspect="auto", + origin="lower", + interpolation="nearest", + extent=(0, n_input - 1, 0, n_ref - 1), + ) + show_dist = True + except Exception: + pass - fig, ax = plt.subplots(figsize=(30, 30)) + # x-axis: performance time in frames + x_gt = gt * float(frame_rate) + wp_x = warping_path[1] * float(frame_rate) - if has_dist_matrix: - # DTW mode: everything in frame space - dist = scipy.spatial.distance.cdist( - ref_features, - input_features, - metric=distance_func, - ) - ax.imshow( - dist, - aspect="auto", - origin="lower", - interpolation="nearest", - extent=(0, input_features.shape[0] - 1, 0, ref_features.shape[0] - 1), - ) - x_gt = gt * float(frame_rate) - x_pred = pred * float(frame_rate) - if score_y is not None: - y = np.asarray(score_y, dtype=float)[:n] * float(frame_rate) - else: - y = np.arange(n) - ylabel = "score (frames)" - wp_x = warping_path[1] - wp_y = warping_path[0] + # y-axis: score position (beats) + wp_in_beats = np.issubdtype(warping_path[0].dtype, np.floating) + if state_space is not None and not wp_in_beats: + wp_y = state_space[warping_path[0]] + elif show_dist and wp_in_beats and ref_frame_to_beat is not 
None: + wp_y = _beats_to_frames(warping_path[0], ref_frame_to_beat) else: - # HMM mode: x in frames, y in beats via state_space - x_gt = gt * float(frame_rate) - x_pred = pred * float(frame_rate) - if score_y is None: - y = np.arange(n) - ylabel = "annotation index" - else: - y = np.asarray(score_y, dtype=float)[:n] - ylabel = "score position (beats)" - wp_x = warping_path[1] - if state_space is not None: - wp_y = state_space[warping_path[0]] - else: - wp_y = warping_path[0] - - # 1. Warping path - if has_dist_matrix: - ax.plot( - wp_x, - wp_y, - ".", - color="white", - alpha=0.7, - markersize=15, - label="warping path", - zorder=2, - ) + wp_y = warping_path[0] + + # GT score positions (y-axis for annotation dots) + if score_y is not None: + y_gt = np.asarray(score_y, dtype=float)[:n] + if show_dist and wp_in_beats and ref_frame_to_beat is not None: + y_gt = _beats_to_frames(y_gt, ref_frame_to_beat) else: - ax.plot( - wp_x, - wp_y, - ".", - color="lime", - alpha=0.5, - markersize=15, - label="warping path", - zorder=2, - ) + y_gt = np.arange(n) - # 2. Predicted points + # Predicted score positions at GT perf times (perf→score direction) + wp_x_sorted = np.asarray(wp_x, dtype=float) + wp_y_sorted = np.asarray(wp_y, dtype=float) + if len(wp_x_sorted) > 1: + y_pred = np.interp(x_gt, wp_x_sorted, wp_y_sorted) + else: + y_pred = y_gt + + # Plot layers + ax.plot( + wp_x, + wp_y, + ".", + color="white" if show_dist else "lime", + alpha=0.7 if show_dist else 0.5, + markersize=15, + label="warping path", + zorder=2, + ) ax.scatter( - x_pred, - y, + x_gt, + y_pred, label="predicted", s=80, alpha=0.9, @@ -627,11 +560,9 @@ def plot_alignment( linewidths=0, zorder=3, ) - - # 3. 
GT annotations (front) ax.scatter( x_gt, - y, + y_gt, label="ground truth", s=120, alpha=0.9, @@ -641,8 +572,26 @@ def plot_alignment( zorder=4, ) + if show_dist: + ax.set_xlim(0, input_features.shape[0] - 1) + ax.set_ylim(0, ref_features.shape[0] - 1) + + # Beat tick labels when projected to frame space + if show_dist and wp_in_beats and ref_frame_to_beat is not None: + finite_beats = ref_frame_to_beat[np.isfinite(ref_frame_to_beat)] + beat_min, beat_max = ( + finite_beats[0], + finite_beats[-1] if len(finite_beats) > 0 else (0, 1), + ) + n_ticks = max(2, min(12, int(beat_max - beat_min) + 1)) + beat_ticks = np.unique( + np.round(np.linspace(beat_min, beat_max, n_ticks)).astype(int) + ) + ax.set_yticks(_beats_to_frames(beat_ticks.astype(float), ref_frame_to_beat)) + ax.set_yticklabels([str(b) for b in beat_ticks]) + ax.set_xlabel("performance frame") - ax.set_ylabel(ylabel) + ax.set_ylabel("score position (beats)") ax.set_title(f"[{save_dir.name}] alignment ({name})") ax.grid(True, alpha=0.2) ax.legend(loc="best") @@ -664,28 +613,29 @@ def save_debug_results( ref_features: Optional[np.ndarray] = None, input_features: Optional[np.ndarray] = None, distance_func=None, + ref_frame_to_beat: Optional[np.ndarray] = None, ): """Save debug outputs: warping path TSV, results JSON, and alignment plot.""" save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) - # 1. Warping path TSV + results JSON + # 1. Warping path TSV + results JSON + GT annotations save_nparray_to_csv(warping_path.T, (save_dir / f"wp_{run_name}.tsv").as_posix()) + gt_pairs = np.column_stack([score_annots, perf_annots]) + save_nparray_to_csv(gt_pairs, (save_dir / f"gt_{run_name}.tsv").as_posix()) import json with open(save_dir / f"{run_name}.json", "w") as f: json.dump(eval_results, f, indent=4) # 2. 
Alignment plot - if state_space is not None: - score_y = state_space - else: - sx = np.asarray(score_annots, dtype=float) - score_y = ( - sx - if sx.ndim == 1 and len(sx) == len(perf_annots) and np.all(np.diff(sx) >= 0) - else None - ) + # score_y = beat positions for each annotation (y-axis of the plot) + sx = np.asarray(score_annots, dtype=float) + score_y = ( + sx + if sx.ndim == 1 and len(sx) == len(perf_annots) and np.all(np.diff(sx) >= 0) + else None + ) plot_alignment( warping_path, perf_annots, @@ -698,4 +648,5 @@ def save_debug_results( ref_features=ref_features, input_features=input_features, distance_func=distance_func, + ref_frame_to_beat=ref_frame_to_beat, ) diff --git a/pyproject.toml b/pyproject.toml index 1dfd23a..005b31d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "pymatchmaker" version = "0.2.1" description = "A package for real-time music alignment" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache 2.0" } keywords = ["music", "alignment", "midi", "audio"] authors = [ @@ -18,6 +18,10 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Cython", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", @@ -31,10 +35,10 @@ dependencies = [ "python-rtmidi>=1.5.8", "mido>=1.3.2", "numpy>=1.26.3,<2.0", - "scipy>=1.11.4,<1.15", + "scipy>=1.11.4", "librosa>=0.10.1", "pandas>=2.0.0", - "partitura>=1.7.0", + "partitura>=1.8.0", "progressbar2>=4.2.0", "python-hiddenmarkov>=0.1.3", "pyaudio>=0.2.14", diff --git a/run_examples.py b/run_examples.py index 4b13c59..245b1b7 100644 --- a/run_examples.py +++ b/run_examples.py @@ 
-31,6 +31,12 @@ def main(): group = parser.add_mutually_exclusive_group() group.add_argument("--audio", action="store_true", help="Use audio input mode") group.add_argument("--midi", action="store_true", help="Use MIDI input mode") + parser.add_argument( + "--method", + type=str, + default=None, + help="Score following method (e.g., arzt, dixon, outerhmm, audio_outerhmm)", + ) args = parser.parse_args() input_mode = "midi" if args.midi else "audio" @@ -39,7 +45,10 @@ def main(): print(f"Running matchmaker with the score file ({SCORE_FILE.name})...") print("-" * 50) - method = "outerhmm" if input_mode == "midi" else "arzt" + if args.method is not None: + method = args.method + else: + method = "outerhmm" if input_mode == "midi" else "arzt" # Initialize matchmaker (simulation mode) try: @@ -54,7 +63,7 @@ def main(): return # Run real-time score following - for current_position in mm.run(wait=True): + for current_position in mm.run(): timestamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3] print(f"[{timestamp}] Current beat position: {current_position}") diff --git a/tests/test_io_audio.py b/tests/test_io_audio.py index a313758..2fdedb9 100644 --- a/tests/test_io_audio.py +++ b/tests/test_io_audio.py @@ -19,9 +19,9 @@ MelSpectrogramProcessor, MFCCProcessor, ) +from matchmaker.features.processor import DummyProcessor from matchmaker.io.audio import AudioStream from matchmaker.utils.audio import check_input_audio_devices, get_audio_devices -from matchmaker.utils.processor import DummyProcessor from tests.utils import generate_sine_wave HAS_AUDIO_INPUT = check_input_audio_devices() @@ -227,7 +227,11 @@ def test_offline_input(self, mock_stdout=None): print("current time", self.stream.current_time) self.stream.join() - outputs = list(self.stream.queue.queue) + from matchmaker.io.stream import STREAM_END + + outputs = [ + item for item in self.stream.queue.queue if item is not STREAM_END + ] for _, ftime in outputs: self.assertTrue(isinstance(ftime, float)) diff 
--git a/tests/test_matchmaker.py b/tests/test_matchmaker.py index 8a7d894..b032326 100644 --- a/tests/test_matchmaker.py +++ b/tests/test_matchmaker.py @@ -5,6 +5,8 @@ import warnings from pathlib import Path +import numpy as np + from matchmaker import Matchmaker from matchmaker.dp import OnlineTimeWarpingArzt from matchmaker.dp.oltw_dixon import OnlineTimeWarpingDixon @@ -14,6 +16,7 @@ from matchmaker.io.midi import MidiStream from matchmaker.prob.hmm import PitchIOIHMM from matchmaker.prob.outer_product_hmm import OuterProductHMM +from matchmaker.prob.outer_product_hmm_audio import AudioOuterProductHMM warnings.filterwarnings("ignore", module="partitura") warnings.filterwarnings("ignore", module="librosa") @@ -22,21 +25,17 @@ class TestMatchmaker(unittest.TestCase): def setUp(self): # Set up paths to test files - self.score_file = "./tests/resources/Bach-fugue_bwv_858.musicxml" - self.performance_file_audio = "./tests/resources/Bach-fugue_bwv_858.mp3" - self.performance_file_midi = "./tests/resources/Bach-fugue_bwv_858.mid" + self.score_file = "./matchmaker/assets/simple_mozart_k265_var1.musicxml" + self.performance_file_audio = "./matchmaker/assets/simple_mozart_k265_var1.mp3" + self.performance_file_midi = "./matchmaker/assets/simple_mozart_k265_var1.mid" self.performance_file_annotations = ( - "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt" + "./matchmaker/assets/simple_mozart_k265_var1_note_annotations.txt" + ) + self.performance_file_beat_annotations = ( + "./matchmaker/assets/simple_mozart_k265_var1_beat_annotations.txt" ) self.test_datasets = [ - { - "name": "bach_fugue_bwv_858", - "score": "./tests/resources/Bach-fugue_bwv_858.musicxml", - "audio": "./tests/resources/Bach-fugue_bwv_858.mp3", - "midi": "./tests/resources/Bach-fugue_bwv_858.mid", - "annotations": "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt", - }, { "name": "simple_mozart_k265_var1", "score": "./matchmaker/assets/simple_mozart_k265_var1.musicxml", @@ -44,6 +43,13 
@@ def setUp(self): "midi": "./matchmaker/assets/simple_mozart_k265_var1.mid", "annotations": "./matchmaker/assets/simple_mozart_k265_var1_note_annotations.txt", }, + { + "name": "bach_fugue_bwv_858", + "score": "./tests/resources/Bach-fugue_bwv_858.musicxml", + "audio": "./tests/resources/Bach-fugue_bwv_858.mp3", + "midi": "./tests/resources/Bach-fugue_bwv_858.mid", + "annotations": "./tests/resources/Bach-fugue_bwv_858_note_annotations.txt", + }, ] def test_matchmaker_audio_init(self): @@ -72,6 +78,7 @@ def test_matchmaker_audio_run(self): # When & Then: running the alignment process, the yielded result should be a float values for position_in_beat in mm.run(verbose=False): self.assertIsInstance(position_in_beat, float) + break def test_matchmaker_audio_run_with_result(self): # Given: a Matchmaker instance with audio input @@ -95,12 +102,11 @@ def test_matchmaker_audio_run_with_result(self): def test_matchmaker_audio_run_with_evaluation(self): for dataset in self.test_datasets: - for method in ["arzt", "dixon"]: + for method in ["arzt", "dixon", "outerhmm"]: with self.subTest(dataset=dataset["name"], method=method): mm = Matchmaker( score_file=dataset["score"], performance_file=dataset["audio"], - wait=False, input_type="audio", method=method, ) @@ -116,15 +122,15 @@ def test_matchmaker_audio_run_with_evaluation(self): current_test = f"{dataset['name']}_{method}" results = mm.run_evaluation( dataset["annotations"], - debug=True, - save_dir=Path("./tests/results"), - run_name=current_test, + debug=False, + # save_dir=Path("./tests/results"), + # run_name=current_test, ) print(f"[{current_test}] RESULTS: {json.dumps(results, indent=4)}") # Then: the results should at least be 0.5 for threshold in ["300ms", "500ms", "1000ms"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["ms"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_cqt(self): # Given: a Matchmaker instance with audio input @@ -133,8 +139,7 @@ def 
test_matchmaker_audio_run_with_evaluation_cqt(self): performance_file=self.performance_file_audio, wait=False, input_type="audio", - feature_type="cqt", - distance_func="Cosine", + processor="cqt", method="arzt", ) try: @@ -154,7 +159,7 @@ def test_matchmaker_audio_run_with_evaluation_cqt(self): # Then: the results should at least be 0.5 for threshold in ["300ms", "500ms", "1000ms"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["ms"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_in_beats(self): # Given: a Matchmaker instance with audio input @@ -172,18 +177,14 @@ def test_matchmaker_audio_run_with_evaluation_in_beats(self): mm._has_run = True results = mm.run_evaluation( - "./tests/resources/Bach-fugue_bwv_858_beat_annotations.txt", - level="beat", - debug=True, - save_dir=Path("./tests/results"), - run_name="test_matchmaker_audio_run_with_evaluation_in_beats", + self.performance_file_annotations, domain="score", ) print(f"RESULTS: {json.dumps(results, indent=4)}") # Then: the results should at least be 0.5 for threshold in ["0.3b", "0.5b", "1b"]: - self.assertGreaterEqual(results[threshold], 0.5) + self.assertGreaterEqual(results["beat"][threshold], 0.5) def test_matchmaker_audio_run_with_evaluation_before_run(self): # Given: a Matchmaker instance with audio input @@ -226,6 +227,47 @@ def test_matchmaker_audio_arzt_init(self): self.assertIsInstance(mm.stream, AudioStream) self.assertIsInstance(mm.score_follower, OnlineTimeWarpingArzt) + def test_matchmaker_audio_outerhmm_init(self): + mm = Matchmaker( + score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + method="outerhmm", + ) + + self.assertIsInstance(mm.stream, AudioStream) + self.assertIsInstance(mm.score_follower, AudioOuterProductHMM) + + def test_matchmaker_audio_outerhmm_run(self): + mm = Matchmaker( + score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + 
method="outerhmm", + ) + + for position_in_beat in mm.run(verbose=False): + self.assertIsInstance(position_in_beat, float) + break + + def test_matchmaker_audio_rtf(self): + for method in ["arzt", "dixon", "outerhmm"]: + with self.subTest(method=method): + mm = Matchmaker( + score_file=self.score_file, + performance_file=self.performance_file_audio, + input_type="audio", + method=method, + ) + list(mm.run(verbose=False)) + + results = mm.run_evaluation( + self.performance_file_annotations, + ) + self.assertIn("rtf", results) + self.assertGreater(results["rtf"], 0) + self.assertLess(results["rtf"], 1.0) + def test_matchmaker_with_frame_rate(self): # Given: a Matchmaker instance with audio input mm = Matchmaker( @@ -233,12 +275,12 @@ def test_matchmaker_with_frame_rate(self): performance_file=self.performance_file_audio, wait=False, input_type="audio", - frame_rate=100, + frame_rate=50, ) - # Then: the frame rate should be 100 - self.assertEqual(mm.frame_rate, 100) - self.assertEqual(mm.score_follower.frame_rate, 100) + # Then: the frame rate should be 50 + self.assertEqual(mm.frame_rate, 50) + self.assertEqual(mm.score_follower.frame_rate, 50) def test_matchmaker_invalid_input_type(self): # Test Matchmaker with invalid input type @@ -293,12 +335,54 @@ def test_matchmaker_midi_run(self): ) # When & Then: running the alignment process, - # the yielded result should be a float values + # the yielded result should be numeric (int state index for MIDI) for position_in_beat in mm.run(): - self.assertIsInstance(position_in_beat, float) - if position_in_beat >= 130: + self.assertIsInstance(position_in_beat, (int, float, np.integer)) + if position_in_beat >= 10: break + def test_matchmaker_midi_run_with_evaluation(self): + """Test all MIDI methods: run + evaluate on test datasets.""" + for dataset in self.test_datasets: + for method in ["hmm", "pthmm", "outerhmm", "arzt", "dixon"]: + with self.subTest(dataset=dataset["name"], method=method): + mm = Matchmaker( + 
score_file=dataset["score"], + performance_file=dataset["midi"], + input_type="midi", + method=method, + ) + + try: + positions = list(mm.run()) + except queue.Empty as e: + print(f"Error: {type(e)}, {e}") + traceback.print_exc() + mm._has_run = True + + # All methods should produce positions + self.assertGreater(len(positions), 0) + + # WP should be valid + wp = mm.score_follower.warping_path + self.assertEqual(wp.shape[0], 2) + self.assertGreater(wp.shape[1], 0) + + # Evaluate all methods + results = mm.run_evaluation( + dataset["annotations"], + debug=False, + ) + current_test = f"{dataset['name']}_{method}_midi" + print( + f"[{current_test}] beat_0.5b={results['beat']['0.5b']:.3f}, ms_300ms={results['ms']['300ms']:.3f}" + ) + + for threshold in ["0.5b", "1b"]: + self.assertGreaterEqual(results["beat"][threshold], 0.3) + for threshold in ["300ms", "500ms", "1000ms"]: + self.assertGreaterEqual(results["ms"][threshold], 0.3) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_oltw_arzt.py b/tests/test_oltw_arzt.py index 5bf6083..7633e5c 100644 --- a/tests/test_oltw_arzt.py +++ b/tests/test_oltw_arzt.py @@ -9,12 +9,12 @@ import numpy as np from scipy.spatial import distance as sp_distance -from matchmaker.dp.oltw_arzt import OnlineTimeWarpingArzt +from matchmaker.dp.oltw_arzt import OnlineTimeWarpingArztFrame as OnlineTimeWarpingArzt from matchmaker.utils import ( CYTHONIZED_METRICS_W_ARGUMENTS, CYTHONIZED_METRICS_WO_ARGUMENTS, ) -from matchmaker.utils.misc import ( +from matchmaker.utils.errors import ( MatchmakerInvalidOptionError, MatchmakerInvalidParameterTypeError, ) @@ -22,7 +22,7 @@ RNG = np.random.RandomState(1984) -SCIPY_DISTANCES = [ +_ALL_SCIPY_DISTANCES = [ "braycurtis", "canberra", "chebyshev", @@ -36,13 +36,14 @@ "dice", "hamming", "jaccard", - "kulczynski1", "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath", "yule", ] +# Filter to distances available in the installed scipy version +SCIPY_DISTANCES = [d for d in 
_ALL_SCIPY_DISTANCES if hasattr(sp_distance, d)] class TestOnlineTimeWarpingArzt(unittest.TestCase): diff --git a/tests/test_prob_hmm.py b/tests/test_prob_hmm.py index 9335866..b790820 100644 --- a/tests/test_prob_hmm.py +++ b/tests/test_prob_hmm.py @@ -197,7 +197,7 @@ def test_init(self): "./tests/resources/Bach-fugue_bwv_858_annotations.txt" ) - self.performance = process_audio_offline( - perf_info=self.performance_file_audio, + self.performance = process_midi_offline( + perf_info=self.performance_file_midi, processor=PitchProcessor(piano_range=True), ) diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py index 497c0f5..deb76ee 100644 --- a/tests/test_utils_misc.py +++ b/tests/test_utils_misc.py @@ -4,13 +4,13 @@ import numpy as np -from matchmaker.utils.misc import ( +from matchmaker.io.queue import RECVQueue +from matchmaker.utils.errors import ( MatchmakerInvalidOptionError, MatchmakerInvalidParameterTypeError, MatchmakerMissingParameterError, - RECVQueue, - ensure_rng, ) +from matchmaker.utils.misc import ensure_rng class TestMatchmakerExceptions(unittest.TestCase): diff --git a/tests/test_utils_processor.py b/tests/test_utils_processor.py index a08c727..a11b9f3 100644 --- a/tests/test_utils_processor.py +++ b/tests/test_utils_processor.py @@ -8,7 +8,7 @@ import numpy as np -from matchmaker.utils.processor import DummyProcessor, Processor, ProcessorWrapper +from matchmaker.features.processor import DummyProcessor, Processor, ProcessorWrapper RNG = np.random.RandomState(1984) diff --git a/tests/test_utils_stream.py b/tests/test_utils_stream.py index 9bdf2a7..5ebbd08 100644 --- a/tests/test_utils_stream.py +++ b/tests/test_utils_stream.py @@ -9,8 +9,8 @@ import numpy as np -from matchmaker.utils.processor import DummyProcessor -from matchmaker.utils.stream import Stream +from matchmaker.features.processor import DummyProcessor +from matchmaker.io.stream import Stream RNG = np.random.RandomState(1984) diff --git a/tests/utils.py b/tests/utils.py 
index 80f1dca..a7f4226 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -18,7 +18,8 @@ from matchmaker.io.audio import HOP_LENGTH, SAMPLE_RATE, AudioStream from matchmaker.io.midi import POLLING_PERIOD, MidiStream -from matchmaker.utils.misc import RECVQueue +from matchmaker.io.queue import RECVQueue +from matchmaker.io.stream import STREAM_END # Random number generator RNG = np.random.RandomState(1984) @@ -289,7 +290,7 @@ def process_midi_offline( ) as stream: pass - outputs = list(queue.queue) + outputs = [item for item in queue.queue if item is not STREAM_END] return outputs