@@ -464,6 +464,132 @@ def test_fail_classification_displayed(
464464 assert "FAIL" in result .output
465465
466466
467+ # --------------------------------------------------------------------------- #
468+ # Test: Delta sign correctness
469+ # --------------------------------------------------------------------------- #
470+
471+
472+ class TestDeltaSignDisplay :
473+ """Tests that delta percentages display the correct sign.
474+
475+ delta_pct is (catalog - measured) / catalog * 100, so:
476+ - positive delta = measured below catalog (slower)
477+ - negative delta = measured above catalog (faster)
478+ - zero delta = exact match
479+ """
480+
481+ @patch ("mlx_stack.core.benchmark.run_benchmark" )
482+ def test_below_catalog_shows_positive_delta (
483+ self ,
484+ mock_bench : MagicMock ,
485+ runner : CliRunner ,
486+ ) -> None :
487+ """Measured 60 vs catalog 85 → delta_pct=+29.4 → should display +29.4%."""
488+ result_data = BenchmarkResult_ (
489+ model_id = "test-model" ,
490+ quant = "int4" ,
491+ iterations = [
492+ IterationResult (
493+ prompt_tps = 80.0 , gen_tps = 60.0 ,
494+ prompt_tokens = 1000 , completion_tokens = 100 , total_time = 10.0 ,
495+ ),
496+ ],
497+ prompt_tps_mean = 80.0 ,
498+ prompt_tps_std = 0.0 ,
499+ gen_tps_mean = 60.0 ,
500+ gen_tps_std = 0.0 ,
501+ classifications = [
502+ MetricClassification (
503+ metric = "gen_tps" ,
504+ measured = 60.0 ,
505+ catalog = 85.0 ,
506+ delta_pct = 29.4 ,
507+ classification = "WARN" ,
508+ ),
509+ ],
510+ catalog_data_available = True ,
511+ )
512+ mock_bench .return_value = result_data
513+
514+ result = runner .invoke (cli , ["bench" , "fast" ])
515+ assert result .exit_code == 0
516+ assert "+29.4%" in result .output
517+
518+ @patch ("mlx_stack.core.benchmark.run_benchmark" )
519+ def test_above_catalog_shows_negative_delta (
520+ self ,
521+ mock_bench : MagicMock ,
522+ runner : CliRunner ,
523+ ) -> None :
524+ """Measured 90 vs catalog 85 → delta_pct=-5.9 → should display -5.9%."""
525+ result_data = BenchmarkResult_ (
526+ model_id = "test-model" ,
527+ quant = "int4" ,
528+ iterations = [
529+ IterationResult (
530+ prompt_tps = 100.0 , gen_tps = 90.0 ,
531+ prompt_tokens = 1000 , completion_tokens = 100 , total_time = 10.0 ,
532+ ),
533+ ],
534+ prompt_tps_mean = 100.0 ,
535+ prompt_tps_std = 0.0 ,
536+ gen_tps_mean = 90.0 ,
537+ gen_tps_std = 0.0 ,
538+ classifications = [
539+ MetricClassification (
540+ metric = "gen_tps" ,
541+ measured = 90.0 ,
542+ catalog = 85.0 ,
543+ delta_pct = - 5.9 ,
544+ classification = "PASS" ,
545+ ),
546+ ],
547+ catalog_data_available = True ,
548+ )
549+ mock_bench .return_value = result_data
550+
551+ result = runner .invoke (cli , ["bench" , "fast" ])
552+ assert result .exit_code == 0
553+ assert "-5.9%" in result .output
554+
555+ @patch ("mlx_stack.core.benchmark.run_benchmark" )
556+ def test_exact_match_shows_zero_delta (
557+ self ,
558+ mock_bench : MagicMock ,
559+ runner : CliRunner ,
560+ ) -> None :
561+ """Measured == catalog → delta_pct=0.0 → should display +0.0%."""
562+ result_data = BenchmarkResult_ (
563+ model_id = "test-model" ,
564+ quant = "int4" ,
565+ iterations = [
566+ IterationResult (
567+ prompt_tps = 155.0 , gen_tps = 85.0 ,
568+ prompt_tokens = 1000 , completion_tokens = 100 , total_time = 10.0 ,
569+ ),
570+ ],
571+ prompt_tps_mean = 155.0 ,
572+ prompt_tps_std = 0.0 ,
573+ gen_tps_mean = 85.0 ,
574+ gen_tps_std = 0.0 ,
575+ classifications = [
576+ MetricClassification (
577+ metric = "gen_tps" ,
578+ measured = 85.0 ,
579+ catalog = 85.0 ,
580+ delta_pct = 0.0 ,
581+ classification = "PASS" ,
582+ ),
583+ ],
584+ catalog_data_available = True ,
585+ )
586+ mock_bench .return_value = result_data
587+
588+ result = runner .invoke (cli , ["bench" , "fast" ])
589+ assert result .exit_code == 0
590+ assert "+0.0%" in result .output
591+
592+
467593# --------------------------------------------------------------------------- #
468594# Test: Tool-calling display variants
469595# --------------------------------------------------------------------------- #
0 commit comments