NousResearch · Osraka · May 17, 2026
diff --git a/README.md b/README.md
@@ -65,6 +65,11 @@ python3 run_eval.py \
 `results/<label>/report.md` is paste-ready for a PR body. Per-run JSON
 goes to `results/<label>/runs/`.
 
+When `--compare-to` is used, the harness renders deltas only when the
+current run and baseline used identical fixture and probe-bank contents.
+If either input changed, or either side has no recorded fingerprint, the
+report says so and omits the deltas instead of comparing unlike eval targets.
+
 ## What ships
 
 | Path | Purpose |

diff --git a/report.py b/report.py
@@ -61,6 +61,20 @@ def summarize_fixture_runs(
 
     fixture_name = fixture_runs[0]["fixture_name"]
     n_runs = len(fixture_runs)
+    fixture_sha256 = fixture_runs[0].get("fixture_sha256", "")
+    probe_bank_sha256 = fixture_runs[0].get("probe_bank_sha256", "")
+
+    for run in fixture_runs[1:]:
+        if run.get("fixture_sha256", "") != fixture_sha256:
+            raise ValueError(
+                f"{fixture_name}: fixture_sha256 changed across runs; "
+                "refusing to summarize incompatible payloads"
+            )
+        if run.get("probe_bank_sha256", "") != probe_bank_sha256:
+            raise ValueError(
+                f"{fixture_name}: probe_bank_sha256 changed across runs; "
+                "refusing to summarize incompatible payloads"
+            )
 
     # Per-probe-per-dimension aggregation across runs
     probe_ids = [p["id"] for p in fixture_runs[0]["probes"]]
@@ -112,9 +126,35 @@ def summarize_fixture_runs(
         "overall_median": overall_median,
         "misses": misses,
         "compression": fixture_runs[0].get("compression", {}),
+        "fixture_sha256": fixture_sha256,
+        "probe_bank_sha256": probe_bank_sha256,
     }
 
 
+def _baseline_incompatibility(
+    current: Dict[str, Any],
+    baseline: Optional[Dict[str, Any]],
+) -> Optional[str]:
+    """Return why deltas against `baseline` are unsafe, or None if comparable/absent."""
+    if baseline is None:  # no baseline entry was passed, so there is nothing to flag
+        return None
+
+    current_fixture = current.get("fixture_sha256", "")
+    current_probes = current.get("probe_bank_sha256", "")
+    baseline_fixture = baseline.get("fixture_sha256", "")
+    baseline_probes = baseline.get("probe_bank_sha256", "")
+
+    if not all((current_fixture, current_probes, baseline_fixture, baseline_probes)):
+        return "missing fixture/probe fingerprints"  # some hash is empty or absent
+    if current_fixture != baseline_fixture and current_probes != baseline_probes:
+        return "fixture and probe bank changed"  # checked first so both are named
+    if current_fixture != baseline_fixture:
+        return "fixture changed"
+    if current_probes != baseline_probes:
+        return "probe bank changed"
+    return None  # all four hashes are non-empty and match pairwise
+
+
 def render_report(
     *,
     label: str,
@@ -144,6 +184,7 @@ def render_report(
     baseline_by_name: Dict[str, Dict[str, Any]] = {}
     if baseline_summaries:
         baseline_by_name = {s["fixture_name"]: s for s in baseline_summaries}
+    incompatible_baselines: List[Dict[str, str]] = []
 
     # Main table
     header = ["Fixture"] + DIMENSIONS + ["overall"]
@@ -152,19 +193,42 @@ def render_report(
     for s in summaries:
         row = [s["fixture_name"]]
         baseline = baseline_by_name.get(s["fixture_name"])
+        incompatibility = _baseline_incompatibility(s, baseline)
+        if incompatibility and baseline is not None:
+            incompatible_baselines.append({
+                "fixture_name": s["fixture_name"],
+                "reason": incompatibility,
+            })
         for d in DIMENSIONS:
             cur = s["dimension_medians"][d]
-            if baseline and d in baseline.get("dimension_medians", {}):
+            if (
+                baseline
+                and not incompatibility
+                and d in baseline.get("dimension_medians", {})
+            ):
                 row.append(_format_delta(baseline["dimension_medians"][d], cur))
             else:
                 row.append(_format_score(cur))
-        if baseline:
+        if baseline and not incompatibility:
             row.append(_format_delta(baseline["overall_median"], s["overall_median"]))
         else:
             row.append(_format_score(s["overall_median"]))
         lines.append("| " + " | ".join(row) + " |")
     lines.append("")
 
+    if incompatible_baselines:
+        lines.append("### Baseline comparisons skipped")
+        lines.append("")
+        lines.append(
+            "Deltas were omitted for fixtures whose inputs no longer match the "
+            "baseline. Re-run the baseline against the current fixture and probe "
+            "bank before interpreting score changes."
+        )
+        lines.append("")
+        for item in incompatible_baselines:
+            lines.append(f"- `{item['fixture_name']}`: {item['reason']}")
+        lines.append("")
+
     # Compression metadata
     lines.append("### Compression summary")
     lines.append("")

diff --git a/run_eval.py b/run_eval.py
@@ -10,6 +10,7 @@
 """
 from __future__ import annotations
 
+import hashlib
 import json
 import logging
 import sys
@@ -68,6 +69,19 @@ def _load_probes(name: str) -> Dict[str, Any]:
         return json.load(fh)
 
 
+def _sha256_file(path: Path) -> str:
+    """Return the hex SHA-256 digest of the raw bytes of the file at *path*."""
+    return hashlib.sha256(path.read_bytes()).hexdigest()  # reads the whole file at once
+
+
+def _input_fingerprints(name: str) -> Dict[str, str]:
+    """Return SHA-256 hex digests of the fixture and probe-bank files for fixture *name*."""
+    return {
+        "fixture_sha256": _sha256_file(FIXTURES_DIR / f"{name}.json"),
+        "probe_bank_sha256": _sha256_file(PROBES_DIR / f"{name}.probes.json"),
+    }
+
+
 def _resolve_runtime(
     *,
     provider_override: Optional[str],
@@ -102,6 +116,7 @@ def _run_one_fixture(
 ) -> Dict[str, Any]:
     fx = _load_fixture(fixture_name)
     probes = _load_probes(fixture_name)
+    fingerprints = _input_fingerprints(fixture_name)
 
     logger.info(
         "[%s run=%d] compressing (%d messages, ctx=%d)",
@@ -196,6 +211,7 @@ def _run_one_fixture(
     return {
         "fixture_name": fixture_name,
         "run_index": run_index,
+        **fingerprints,
         "compression": {
             "pre_tokens": compression["pre_tokens"],
             "post_tokens": compression["post_tokens"],

diff --git a/tests/test_compression_eval.py b/tests/test_compression_eval.py
@@ -200,7 +200,14 @@ def test_build_judge_prompt_includes_all_score_scale_levels():
 # ---------- report.summarize_fixture_runs ----------
 
 
-def _fake_run(fixture_name: str, run_index: int, probe_scores: dict) -> dict:
+def _fake_run(
+    fixture_name: str,
+    run_index: int,
+    probe_scores: dict,
+    *,
+    fixture_sha256: str = "fixture-v1",
+    probe_bank_sha256: str = "probes-v1",
+) -> dict:
     """Build a synthetic per-run payload for summariser tests."""
     probes = []
     for pid, per_dim in probe_scores.items():
@@ -220,6 +227,8 @@ def _fake_run(fixture_name: str, run_index: int, probe_scores: dict) -> dict:
     return {
         "fixture_name": fixture_name,
         "run_index": run_index,
+        "fixture_sha256": fixture_sha256,
+        "probe_bank_sha256": probe_bank_sha256,
         "compression": {
             "pre_tokens": 10000,
             "post_tokens": 5000,
@@ -278,6 +287,20 @@ def test_summarize_medians_across_runs():
     assert s["runs"] == 3
 
 
+def test_summarize_rejects_mixed_input_fingerprints():  # runs of one fixture must share input hashes
+    runs = [
+        _fake_run("fx1", 1, {"p": _all_dims(4)}),  # uses the default "fixture-v1" / "probes-v1"
+        _fake_run(
+            "fx1",
+            2,
+            {"p": _all_dims(4)},
+            fixture_sha256="fixture-v2",  # same fixture name, different fixture hash
+        ),
+    ]
+    with pytest.raises(ValueError, match="fixture_sha256 changed"):
+        summarize_fixture_runs(runs)
+
+
 def test_summarize_empty_input():
     assert summarize_fixture_runs([]) == {}
 
@@ -322,6 +345,50 @@ def test_render_report_shows_deltas_when_baseline_provided():
     assert "Deltas shown against baseline" in md
 
 
+def test_render_report_skips_deltas_when_fixture_changed():  # changed fixture hash => no deltas
+    baseline_runs = [_fake_run("fx", 1, {"p1": _all_dims(3)})]  # default "fixture-v1" hash
+    current_runs = [
+        _fake_run(
+            "fx",
+            1,
+            {"p1": _all_dims(4)},
+            fixture_sha256="fixture-v2",  # differs from the baseline's fixture hash
+        )
+    ]
+    baseline = [summarize_fixture_runs(baseline_runs)]
+    current = [summarize_fixture_runs(current_runs)]
+    md = render_report(
+        label="test",
+        compressor_model="m",
+        judge_model="m",
+        runs_per_fixture=1,
+        summaries=current,
+        baseline_summaries=baseline,
+    )
+    assert "+1.00" not in md  # presumably how a 3 -> 4 delta would render; must be absent
+    assert "Baseline comparisons skipped" in md
+    assert "`fx`: fixture changed" in md  # only the fixture hash differs, so this reason is used
+
+
+def test_render_report_skips_deltas_when_old_baseline_lacks_fingerprints():  # legacy baseline case
+    baseline_runs = [_fake_run("fx", 1, {"p1": _all_dims(3)})]
+    current_runs = [_fake_run("fx", 1, {"p1": _all_dims(4)})]
+    baseline = [summarize_fixture_runs(baseline_runs)]
+    baseline[0]["fixture_sha256"] = ""  # blank both hashes to mimic a baseline without fingerprints
+    baseline[0]["probe_bank_sha256"] = ""
+    current = [summarize_fixture_runs(current_runs)]
+    md = render_report(
+        label="test",
+        compressor_model="m",
+        judge_model="m",
+        runs_per_fixture=1,
+        summaries=current,
+        baseline_summaries=baseline,
+    )
+    assert "+1.00" not in md  # presumably how a 3 -> 4 delta would render; must be absent
+    assert "`fx`: missing fixture/probe fingerprints" in md
+
+
 def test_render_report_lists_misses_section():
     runs = [_fake_run("fx", 1, {
         "good": _all_dims(4),