Skip to content

Commit ae7d209

Browse files
authored
Merge pull request #9 from weklund/fix/bench-delta-sign
fix: correct inverted sign on benchmark delta display
2 parents a4ad044 + 5801fec commit ae7d209

2 files changed

Lines changed: 127 additions & 5 deletions

File tree

src/mlx_stack/cli/bench.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,8 @@ def _display_results(result: BenchmarkResult_, out: Console, save: bool = False)
141141
else:
142142
result_style = "[bold red]FAIL[/bold red]"
143143

144-
# Format delta
144+
# Format delta — positive means below catalog, negative means above
145145
delta_str = f"{cls.delta_pct:+.1f}%"
146-
if cls.delta_pct > 0:
147-
delta_str = f"-{cls.delta_pct:.1f}%" # Below catalog
148-
else:
149-
delta_str = f"+{abs(cls.delta_pct):.1f}%" # Above catalog
150146

151147
metric_name = cls.metric.replace("_", " ").title().replace("Tps", "TPS")
152148

tests/unit/test_cli_bench.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,132 @@ def test_fail_classification_displayed(
464464
assert "FAIL" in result.output
465465

466466

467+
# --------------------------------------------------------------------------- #
468+
# Test: Delta sign correctness
469+
# --------------------------------------------------------------------------- #
470+
471+
472+
class TestDeltaSignDisplay:
473+
"""Tests that delta percentages display the correct sign.
474+
475+
delta_pct is (catalog - measured) / catalog * 100, so:
476+
- positive delta = measured below catalog (slower)
477+
- negative delta = measured above catalog (faster)
478+
- zero delta = exact match
479+
"""
480+
481+
@patch("mlx_stack.core.benchmark.run_benchmark")
482+
def test_below_catalog_shows_positive_delta(
483+
self,
484+
mock_bench: MagicMock,
485+
runner: CliRunner,
486+
) -> None:
487+
"""Measured 60 vs catalog 85 → delta_pct=+29.4 → should display +29.4%."""
488+
result_data = BenchmarkResult_(
489+
model_id="test-model",
490+
quant="int4",
491+
iterations=[
492+
IterationResult(
493+
prompt_tps=80.0, gen_tps=60.0,
494+
prompt_tokens=1000, completion_tokens=100, total_time=10.0,
495+
),
496+
],
497+
prompt_tps_mean=80.0,
498+
prompt_tps_std=0.0,
499+
gen_tps_mean=60.0,
500+
gen_tps_std=0.0,
501+
classifications=[
502+
MetricClassification(
503+
metric="gen_tps",
504+
measured=60.0,
505+
catalog=85.0,
506+
delta_pct=29.4,
507+
classification="WARN",
508+
),
509+
],
510+
catalog_data_available=True,
511+
)
512+
mock_bench.return_value = result_data
513+
514+
result = runner.invoke(cli, ["bench", "fast"])
515+
assert result.exit_code == 0
516+
assert "+29.4%" in result.output
517+
518+
@patch("mlx_stack.core.benchmark.run_benchmark")
519+
def test_above_catalog_shows_negative_delta(
520+
self,
521+
mock_bench: MagicMock,
522+
runner: CliRunner,
523+
) -> None:
524+
"""Measured 90 vs catalog 85 → delta_pct=-5.9 → should display -5.9%."""
525+
result_data = BenchmarkResult_(
526+
model_id="test-model",
527+
quant="int4",
528+
iterations=[
529+
IterationResult(
530+
prompt_tps=100.0, gen_tps=90.0,
531+
prompt_tokens=1000, completion_tokens=100, total_time=10.0,
532+
),
533+
],
534+
prompt_tps_mean=100.0,
535+
prompt_tps_std=0.0,
536+
gen_tps_mean=90.0,
537+
gen_tps_std=0.0,
538+
classifications=[
539+
MetricClassification(
540+
metric="gen_tps",
541+
measured=90.0,
542+
catalog=85.0,
543+
delta_pct=-5.9,
544+
classification="PASS",
545+
),
546+
],
547+
catalog_data_available=True,
548+
)
549+
mock_bench.return_value = result_data
550+
551+
result = runner.invoke(cli, ["bench", "fast"])
552+
assert result.exit_code == 0
553+
assert "-5.9%" in result.output
554+
555+
@patch("mlx_stack.core.benchmark.run_benchmark")
556+
def test_exact_match_shows_zero_delta(
557+
self,
558+
mock_bench: MagicMock,
559+
runner: CliRunner,
560+
) -> None:
561+
"""Measured == catalog → delta_pct=0.0 → should display +0.0%."""
562+
result_data = BenchmarkResult_(
563+
model_id="test-model",
564+
quant="int4",
565+
iterations=[
566+
IterationResult(
567+
prompt_tps=155.0, gen_tps=85.0,
568+
prompt_tokens=1000, completion_tokens=100, total_time=10.0,
569+
),
570+
],
571+
prompt_tps_mean=155.0,
572+
prompt_tps_std=0.0,
573+
gen_tps_mean=85.0,
574+
gen_tps_std=0.0,
575+
classifications=[
576+
MetricClassification(
577+
metric="gen_tps",
578+
measured=85.0,
579+
catalog=85.0,
580+
delta_pct=0.0,
581+
classification="PASS",
582+
),
583+
],
584+
catalog_data_available=True,
585+
)
586+
mock_bench.return_value = result_data
587+
588+
result = runner.invoke(cli, ["bench", "fast"])
589+
assert result.exit_code == 0
590+
assert "+0.0%" in result.output
591+
592+
467593
# --------------------------------------------------------------------------- #
468594
# Test: Tool-calling display variants
469595
# --------------------------------------------------------------------------- #

0 commit comments

Comments
 (0)