Skip to content

Commit 42e3c1c

Browse files
committed
fix bug in plot_alignment
1 parent d2a6a4d commit 42e3c1c

4 files changed

Lines changed: 72 additions & 47 deletions

File tree

matchmaker/matchmaker.py

Lines changed: 47 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -551,38 +551,45 @@ def run_evaluation(
551551
self,
552552
perf_annotations: Union[PathLike, np.ndarray],
553553
level: str = "note",
554-
tolerances: list = TOLERANCES_IN_MILLISECONDS,
555-
musical_beat: bool = False, # beat annots are difference in some dataset
554+
tolerances: list = None,
555+
musical_beat: bool = False,
556556
debug: bool = False,
557557
save_dir: PathLike = None,
558558
run_name: str = None,
559-
domain: str = "performance", # "score" or "performance"
559+
domain: str = "score",
560560
plot_dist_matrix: bool = True,
561561
) -> dict:
562562
"""
563-
Evaluate the score following process
563+
Evaluate the score following process.
564+
565+
When domain="score" (default), returns beat-based metrics as primary
566+
and ms-based metrics under "ms" key. When domain="performance",
567+
returns ms-based metrics only (legacy behavior).
564568
565569
Parameters
566570
----------
567571
perf_annotations : PathLike or np.ndarray
568-
Path to the performance annotations file (tab-separated),
569-
or numpy array of annotation times in seconds.
572+
Path to the performance annotations file or numpy array of onset times (seconds).
570573
level : str
571-
Level of annotations to use: bar, beat or note
572-
tolerance : list
573-
Tolerances to use for evaluation (in milliseconds)
574+
Annotation level: "beat" or "note"
575+
tolerances : list or None
576+
Tolerances for evaluation. If None, uses default for the domain.
577+
musical_beat : bool
578+
Whether to use musical beat
574579
debug : bool
575-
Whether to save the score and performance audio with beat annotations
580+
Whether to save debug outputs
576581
domain : str
577-
Evaluation domain, either "score" or "performance".
578-
"score" domain evaluates in beat unit, "performance" domain evaluates in second unit. (Default: "performance")
582+
"score" (default, beat-based primary) or "performance" (ms-based, legacy)
579583
580584
Returns
581585
-------
582586
dict
583-
Evaluation results with mean, median, std, skewness, kurtosis, and
584-
accuracy for each tolerance
587+
Evaluation results. If domain="score", includes both beat and ms metrics.
585588
"""
589+
if tolerances is None:
590+
tolerances = (
591+
TOLERANCES_IN_BEATS if domain == "score" else TOLERANCES_IN_MILLISECONDS
592+
)
586593
if not self._has_run:
587594
raise ValueError("Must call run() before evaluation")
588595

@@ -643,26 +650,43 @@ def run_evaluation(
643650
wp_perf_sec,
644651
total_counts=len(wp_score),
645652
tolerances=tolerances,
646-
perf_times=wp_perf_sec,
647-
alignment_duration=self.alignment_duration,
648653
)
649654
else:
650-
# Score domain: compare predicted beats vs GT beats
655+
# Score domain: beat-based (primary) + ms-based (secondary)
651656
score_annots_predicted = transfer_positions(
652657
wp, perf_annots, frame_rate=self.frame_rate, domain="score"
653658
)
654659
score_annots = score_annots[: len(score_annots_predicted)]
655-
if tolerances == TOLERANCES_IN_MILLISECONDS:
656-
tolerances = TOLERANCES_IN_BEATS
657-
eval_results = get_evaluation_results(
660+
beat_tolerances = (
661+
tolerances
662+
if tolerances != TOLERANCES_IN_MILLISECONDS
663+
else TOLERANCES_IN_BEATS
664+
)
665+
beat_results = get_evaluation_results(
658666
score_annots,
659667
score_annots_predicted,
660668
total_counts=original_perf_annots_counts,
661-
tolerances=tolerances,
669+
tolerances=beat_tolerances,
662670
in_seconds=False,
663-
perf_times=perf_annots,
664-
alignment_duration=self.alignment_duration,
665671
)
672+
ms_results = get_evaluation_results(
673+
gt_perf_times,
674+
wp_perf_sec,
675+
total_counts=len(wp_score),
676+
tolerances=TOLERANCES_IN_MILLISECONDS,
677+
)
678+
eval_results = {"beat": beat_results, "ms": ms_results}
679+
680+
# Real-Time Factor (domain-independent)
681+
if self.alignment_duration is not None:
682+
finite_perf = perf_annots[np.isfinite(perf_annots)]
683+
if len(finite_perf) > 0:
684+
perf_duration = float(np.max(finite_perf) - np.min(finite_perf))
685+
if perf_duration > 0:
686+
eval_results["rtf"] = float(
687+
f"{self.alignment_duration / perf_duration:.4f}"
688+
)
689+
666690
if self.input_type == "audio":
667691
latency_results = self.get_latency_stats()
668692
eval_results.update(latency_results)

matchmaker/utils/eval.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,6 @@ def get_evaluation_results(
108108
total_counts,
109109
tolerances=TOLERANCES_IN_MILLISECONDS,
110110
in_seconds=True,
111-
perf_times=None,
112-
alignment_duration=None,
113111
):
114112
if in_seconds:
115113
errors_in_delay = (gt_annots - predicted_annots) * 1000
@@ -141,12 +139,4 @@ def get_evaluation_results(
141139
f"{np.sum(np.abs(errors_in_delay) <= tau) / total_counts:.4f}"
142140
)
143141

144-
# Real-Time Factor (wall-clock alignment_duration / performance_duration).
145-
if alignment_duration is not None and perf_times is not None:
146-
finite_perf = perf_times[np.isfinite(perf_times)]
147-
if len(finite_perf) > 0:
148-
perf_duration = float(np.max(finite_perf) - np.min(finite_perf))
149-
if perf_duration > 0:
150-
results["rtf"] = float(f"{alignment_duration / perf_duration:.4f}")
151-
152142
return results

matchmaker/utils/misc.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,6 @@ def plot_alignment(
584584

585585
# x-axis: performance time in frames
586586
x_gt = gt * float(frame_rate)
587-
x_pred = pred * float(frame_rate)
588587
wp_x = warping_path[1]
589588

590589
# y-axis: score position (beats)
@@ -596,12 +595,21 @@ def plot_alignment(
596595
else:
597596
wp_y = warping_path[0]
598597

598+
# GT score positions (y-axis for annotation dots)
599599
if score_y is not None:
600-
y = np.asarray(score_y, dtype=float)[:n]
600+
y_gt = np.asarray(score_y, dtype=float)[:n]
601601
if show_dist and wp_in_beats and ref_frame_to_beat is not None:
602-
y = _beats_to_frames(y, ref_frame_to_beat)
602+
y_gt = _beats_to_frames(y_gt, ref_frame_to_beat)
603603
else:
604-
y = np.arange(n)
604+
y_gt = np.arange(n)
605+
606+
# Predicted score positions at GT perf times (perf→score direction)
607+
wp_x_sorted = np.asarray(wp_x, dtype=float)
608+
wp_y_sorted = np.asarray(wp_y, dtype=float)
609+
if len(wp_x_sorted) > 1:
610+
y_pred = np.interp(x_gt, wp_x_sorted, wp_y_sorted)
611+
else:
612+
y_pred = y_gt
605613

606614
# Plot layers
607615
ax.plot(
@@ -615,8 +623,8 @@ def plot_alignment(
615623
zorder=2,
616624
)
617625
ax.scatter(
618-
x_pred,
619-
y,
626+
x_gt,
627+
y_pred,
620628
label="predicted",
621629
s=80,
622630
alpha=0.9,
@@ -627,7 +635,7 @@ def plot_alignment(
627635
)
628636
ax.scatter(
629637
x_gt,
630-
y,
638+
y_gt,
631639
label="ground truth",
632640
s=120,
633641
alpha=0.9,
@@ -643,11 +651,14 @@ def plot_alignment(
643651

644652
# Beat tick labels when projected to frame space
645653
if show_dist and wp_in_beats and ref_frame_to_beat is not None:
646-
beat_min, beat_max = ref_frame_to_beat[0], ref_frame_to_beat[-1]
654+
finite_beats = ref_frame_to_beat[np.isfinite(ref_frame_to_beat)]
655+
beat_min, beat_max = (
656+
finite_beats[0],
657+
finite_beats[-1] if len(finite_beats) > 0 else (0, 1),
658+
)
659+
n_ticks = max(2, min(12, int(beat_max - beat_min) + 1))
647660
beat_ticks = np.unique(
648-
np.round(
649-
np.linspace(beat_min, beat_max, min(12, int(beat_max - beat_min) + 1))
650-
).astype(int)
661+
np.round(np.linspace(beat_min, beat_max, n_ticks)).astype(int)
651662
)
652663
ax.set_yticks(_beats_to_frames(beat_ticks.astype(float), ref_frame_to_beat))
653664
ax.set_yticklabels([str(b) for b in beat_ticks])

tests/test_matchmaker.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def test_matchmaker_audio_run_with_evaluation(self):
130130

131131
# Then: the results should at least be 0.5
132132
for threshold in ["300ms", "500ms", "1000ms"]:
133-
self.assertGreaterEqual(results[threshold], 0.5)
133+
self.assertGreaterEqual(results["ms"][threshold], 0.5)
134134

135135
def test_matchmaker_audio_run_with_evaluation_cqt(self):
136136
# Given: a Matchmaker instance with audio input
@@ -159,7 +159,7 @@ def test_matchmaker_audio_run_with_evaluation_cqt(self):
159159

160160
# Then: the results should at least be 0.5
161161
for threshold in ["300ms", "500ms", "1000ms"]:
162-
self.assertGreaterEqual(results[threshold], 0.5)
162+
self.assertGreaterEqual(results["ms"][threshold], 0.5)
163163

164164
def test_matchmaker_audio_run_with_evaluation_in_beats(self):
165165
# Given: a Matchmaker instance with audio input
@@ -184,7 +184,7 @@ def test_matchmaker_audio_run_with_evaluation_in_beats(self):
184184

185185
# Then: the results should at least be 0.5
186186
for threshold in ["0.3b", "0.5b", "1b"]:
187-
self.assertGreaterEqual(results[threshold], 0.5)
187+
self.assertGreaterEqual(results["beat"][threshold], 0.5)
188188

189189
def test_matchmaker_audio_run_with_evaluation_before_run(self):
190190
# Given: a Matchmaker instance with audio input

0 commit comments

Comments (0)