Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ python3 run_eval.py \
`results/<label>/report.md` is paste-ready for a PR body. Per-run JSON
goes to `results/<label>/runs/`.

When `--compare-to` is used, the harness only renders deltas when the
current run and baseline used identical fixture and probe-bank contents.
If either input changed, or if either side has no recorded fingerprints
(for example, a baseline produced before hashing was added), the report
calls that out and omits the deltas instead of comparing unlike eval
targets.

## What ships

| Path | Purpose |
Expand Down
68 changes: 66 additions & 2 deletions report.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ def summarize_fixture_runs(

fixture_name = fixture_runs[0]["fixture_name"]
n_runs = len(fixture_runs)
fixture_sha256 = fixture_runs[0].get("fixture_sha256", "")
probe_bank_sha256 = fixture_runs[0].get("probe_bank_sha256", "")

for run in fixture_runs[1:]:
if run.get("fixture_sha256", "") != fixture_sha256:
raise ValueError(
f"{fixture_name}: fixture_sha256 changed across runs; "
"refusing to summarize incompatible payloads"
)
if run.get("probe_bank_sha256", "") != probe_bank_sha256:
raise ValueError(
f"{fixture_name}: probe_bank_sha256 changed across runs; "
"refusing to summarize incompatible payloads"
)

# Per-probe-per-dimension aggregation across runs
probe_ids = [p["id"] for p in fixture_runs[0]["probes"]]
Expand Down Expand Up @@ -112,9 +126,35 @@ def summarize_fixture_runs(
"overall_median": overall_median,
"misses": misses,
"compression": fixture_runs[0].get("compression", {}),
"fixture_sha256": fixture_sha256,
"probe_bank_sha256": probe_bank_sha256,
}


def _baseline_incompatibility(
    current: Dict[str, Any],
    baseline: Optional[Dict[str, Any]],
) -> Optional[str]:
    """Explain why a baseline cannot safely produce deltas for one fixture.

    Args:
        current: Summary of the current run; its ``fixture_sha256`` and
            ``probe_bank_sha256`` entries are compared.
        baseline: Summary of the same fixture from the baseline, or ``None``
            when no baseline exists for it.

    Returns:
        ``None`` when there is no baseline or when both fingerprints match;
        otherwise a short human-readable reason string.
    """
    if baseline is None:
        return None

    current_fixture = current.get("fixture_sha256", "")
    current_probes = current.get("probe_bank_sha256", "")
    baseline_fixture = baseline.get("fixture_sha256", "")
    baseline_probes = baseline.get("probe_bank_sha256", "")

    # An absent or empty fingerprint on either side means equality cannot be
    # established, so this is reported separately from a detected change.
    if not all((current_fixture, current_probes, baseline_fixture, baseline_probes)):
        return "missing fixture/probe fingerprints"

    fixture_changed = current_fixture != baseline_fixture
    probes_changed = current_probes != baseline_probes
    if fixture_changed and probes_changed:
        return "fixture and probe bank changed"
    if fixture_changed:
        return "fixture changed"
    if probes_changed:
        return "probe bank changed"
    return None


def render_report(
*,
label: str,
Expand Down Expand Up @@ -144,6 +184,7 @@ def render_report(
baseline_by_name: Dict[str, Dict[str, Any]] = {}
if baseline_summaries:
baseline_by_name = {s["fixture_name"]: s for s in baseline_summaries}
incompatible_baselines: List[Dict[str, str]] = []

# Main table
header = ["Fixture"] + DIMENSIONS + ["overall"]
Expand All @@ -152,19 +193,42 @@ def render_report(
for s in summaries:
row = [s["fixture_name"]]
baseline = baseline_by_name.get(s["fixture_name"])
incompatibility = _baseline_incompatibility(s, baseline)
if incompatibility and baseline is not None:
incompatible_baselines.append({
"fixture_name": s["fixture_name"],
"reason": incompatibility,
})
for d in DIMENSIONS:
cur = s["dimension_medians"][d]
if baseline and d in baseline.get("dimension_medians", {}):
if (
baseline
and not incompatibility
and d in baseline.get("dimension_medians", {})
):
row.append(_format_delta(baseline["dimension_medians"][d], cur))
else:
row.append(_format_score(cur))
if baseline:
if baseline and not incompatibility:
row.append(_format_delta(baseline["overall_median"], s["overall_median"]))
else:
row.append(_format_score(s["overall_median"]))
lines.append("| " + " | ".join(row) + " |")
lines.append("")

if incompatible_baselines:
lines.append("### Baseline comparisons skipped")
lines.append("")
lines.append(
"Deltas were omitted for fixtures whose inputs no longer match the "
"baseline. Re-run the baseline against the current fixture and probe "
"bank before interpreting score changes."
)
lines.append("")
for item in incompatible_baselines:
lines.append(f"- `{item['fixture_name']}`: {item['reason']}")
lines.append("")

# Compression metadata
lines.append("### Compression summary")
lines.append("")
Expand Down
16 changes: 16 additions & 0 deletions run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"""
from __future__ import annotations

import hashlib
import json
import logging
import sys
Expand Down Expand Up @@ -68,6 +69,19 @@ def _load_probes(name: str) -> Dict[str, Any]:
return json.load(fh)


def _sha256_file(path: Path) -> str:
    """Return a stable content hash for one on-disk eval input file.

    The file is hashed as raw bytes and read in fixed-size chunks, so a
    large input is never held in memory in full.

    Args:
        path: File to hash.

    Returns:
        Hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _input_fingerprints(name: str) -> Dict[str, str]:
    """Hash the fixture and probe bank that define one eval target.

    Args:
        name: Fixture name; the fixture is read from
            ``FIXTURES_DIR/<name>.json`` and its probe bank from
            ``PROBES_DIR/<name>.probes.json``.

    Returns:
        Mapping with ``fixture_sha256`` and ``probe_bank_sha256`` entries.
    """
    fixture_path = FIXTURES_DIR / f"{name}.json"
    probe_bank_path = PROBES_DIR / f"{name}.probes.json"
    fixture_digest = _sha256_file(fixture_path)
    probe_bank_digest = _sha256_file(probe_bank_path)
    return {
        "fixture_sha256": fixture_digest,
        "probe_bank_sha256": probe_bank_digest,
    }


def _resolve_runtime(
*,
provider_override: Optional[str],
Expand Down Expand Up @@ -102,6 +116,7 @@ def _run_one_fixture(
) -> Dict[str, Any]:
fx = _load_fixture(fixture_name)
probes = _load_probes(fixture_name)
fingerprints = _input_fingerprints(fixture_name)

logger.info(
"[%s run=%d] compressing (%d messages, ctx=%d)",
Expand Down Expand Up @@ -196,6 +211,7 @@ def _run_one_fixture(
return {
"fixture_name": fixture_name,
"run_index": run_index,
**fingerprints,
"compression": {
"pre_tokens": compression["pre_tokens"],
"post_tokens": compression["post_tokens"],
Expand Down
69 changes: 68 additions & 1 deletion tests/test_compression_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,14 @@ def test_build_judge_prompt_includes_all_score_scale_levels():
# ---------- report.summarize_fixture_runs ----------


def _fake_run(fixture_name: str, run_index: int, probe_scores: dict) -> dict:
def _fake_run(
fixture_name: str,
run_index: int,
probe_scores: dict,
*,
fixture_sha256: str = "fixture-v1",
probe_bank_sha256: str = "probes-v1",
) -> dict:
"""Build a synthetic per-run payload for summariser tests."""
probes = []
for pid, per_dim in probe_scores.items():
Expand All @@ -220,6 +227,8 @@ def _fake_run(fixture_name: str, run_index: int, probe_scores: dict) -> dict:
return {
"fixture_name": fixture_name,
"run_index": run_index,
"fixture_sha256": fixture_sha256,
"probe_bank_sha256": probe_bank_sha256,
"compression": {
"pre_tokens": 10000,
"post_tokens": 5000,
Expand Down Expand Up @@ -278,6 +287,20 @@ def test_summarize_medians_across_runs():
assert s["runs"] == 3


def test_summarize_rejects_mixed_input_fingerprints():
    """Runs of one fixture with differing fixture hashes are not summarised."""
    original_run = _fake_run("fx1", 1, {"p": _all_dims(4)})
    changed_run = _fake_run(
        "fx1",
        2,
        {"p": _all_dims(4)},
        fixture_sha256="fixture-v2",
    )
    with pytest.raises(ValueError, match="fixture_sha256 changed"):
        summarize_fixture_runs([original_run, changed_run])


def test_summarize_empty_input():
assert summarize_fixture_runs([]) == {}

Expand Down Expand Up @@ -322,6 +345,50 @@ def test_render_report_shows_deltas_when_baseline_provided():
assert "Deltas shown against baseline" in md


def test_render_report_skips_deltas_when_fixture_changed():
    """A changed fixture hash suppresses deltas and is listed as skipped."""
    old_summary = summarize_fixture_runs(
        [_fake_run("fx", 1, {"p1": _all_dims(3)})]
    )
    new_summary = summarize_fixture_runs(
        [
            _fake_run(
                "fx",
                1,
                {"p1": _all_dims(4)},
                fixture_sha256="fixture-v2",
            )
        ]
    )
    report = render_report(
        label="test",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=[new_summary],
        baseline_summaries=[old_summary],
    )
    assert "+1.00" not in report
    assert "Baseline comparisons skipped" in report
    assert "`fx`: fixture changed" in report


def test_render_report_skips_deltas_when_old_baseline_lacks_fingerprints():
    """A baseline recorded without fingerprints must not produce deltas.

    The fingerprint keys are removed from the raw baseline run, as in a
    payload written before hashing existed, so the summariser's own
    missing-key default is exercised instead of patching the summary.
    """
    baseline_runs = [_fake_run("fx", 1, {"p1": _all_dims(3)})]
    for legacy_run in baseline_runs:
        legacy_run.pop("fixture_sha256", None)
        legacy_run.pop("probe_bank_sha256", None)
    current_runs = [_fake_run("fx", 1, {"p1": _all_dims(4)})]
    baseline = [summarize_fixture_runs(baseline_runs)]
    # The summariser falls back to empty fingerprints for legacy payloads.
    assert baseline[0]["fixture_sha256"] == ""
    assert baseline[0]["probe_bank_sha256"] == ""
    current = [summarize_fixture_runs(current_runs)]
    md = render_report(
        label="test",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=current,
        baseline_summaries=baseline,
    )
    assert "+1.00" not in md
    assert "Baseline comparisons skipped" in md
    assert "`fx`: missing fixture/probe fingerprints" in md


def test_render_report_lists_misses_section():
runs = [_fake_run("fx", 1, {
"good": _all_dims(4),
Expand Down