Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Initial release. Judge-vs-human calibration for LLM evals.

### Notes
- Metrics delegate to scipy / scikit-learn (never hand-rolled); a regression test verifies Kendall-τ parity against the author's `cross-vendor-judges` tau-b across discriminating tie/degenerate cases.
- The drift verdict's action cutoffs (κ≥0.61 / ≥0.41) are disclosed as the tool's editorial operating point, not an established standard. The Landis-Koch band citation applies to Cohen's κ only; Kendall-τ uses heuristic strength bands.
- The drift verdict's action cutoffs (κ≥0.61 / ≥0.41) are disclosed as the tool's editorial operating point, not an established standard. The Landis-Koch band citation applies to Cohen's κ only; Kendall-τ has no established band standard, so the tool reuses the same κ cutoffs for τ as a disclosed heuristic.
- Not distributed via PyPI by design; install dependencies with `pip install -r requirements.txt`.

[0.1.0]: https://github.com/Mike-E-Log/ai-eval-toolkit/releases/tag/v0.1.0
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@ The choices here are the point — each is a deliberate call about *not over-cla
no invented threshold applies there.
- **The drift verdict's action cutoffs (≥0.61 / ≥0.41) are disclosed as editorial**, not
dressed up as standard — every verdict carries an explicit caveat.
- **Kendall-τ uses heuristic strength bands, NOT Landis-Koch** — τ is rank correlation, not
κ; conflating them would be wrong, so they're kept distinct.
- **Kendall-τ has no established band standard** — τ is rank correlation, not κ, so
Landis-Koch's κ interpretation doesn't formally apply. The tool reuses the same κ
cutoffs as a *disclosed heuristic* (the τ output's `band_basis` says exactly that),
rather than inventing a second arbitrary scale or pretending a τ standard exists.
- **Metrics via scipy / scikit-learn, never hand-rolled** — auditable against reference impls.
- **The `bias` command is an open question, not a verdict** — experimental, threshold-free by design.

Expand Down
22 changes: 13 additions & 9 deletions eval_tk/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def load_labels(labels_path: str) -> tuple[list[int], list[int], int]:
return human, judge, dropped


# Landis & Koch (1977) interpretation bands for kappa.
# Landis & Koch (1977) bands. Established standard for Cohen's kappa; reused as a
# disclosed heuristic for Kendall's tau (which has no standard) — see _BAND_BASIS.
_BANDS = [
(0.81, "almost perfect"),
(0.61, "substantial"),
Expand All @@ -73,14 +74,17 @@ def kendall_tau(human: list[int], judge: list[int]) -> float:
return float(kendalltau(human, judge).correlation)


def landis_koch_band(kappa: float) -> str:
"""Map kappa to the Landis-Koch band label. Returns 'undefined' for NaN input."""
if _isnan(kappa):
def strength_band(value: float) -> str:
"""Map an agreement/correlation value to a strength label via the Landis-Koch
(1977) cutoffs. For Cohen's kappa these ARE the established standard; for
Kendall's tau there is no established band standard, so the same cutoffs are
reused only as a disclosed heuristic (band_basis says so). 'undefined' on NaN."""
if _isnan(value):
return "undefined"
if kappa < 0.0:
if value < 0.0:
return "poor"
for threshold, label in _BANDS:
if kappa >= threshold:
if value >= threshold:
return label
return "poor"

Expand Down Expand Up @@ -126,8 +130,8 @@ def demo(metric: str = "cohen_kappa") -> dict:

_BAND_BASIS = {
"cohen_kappa": "Landis-Koch 1977 (kappa)",
"kendall_tau": ("heuristic strength bands "
"(NOT Landis-Koch; tau is rank correlation, not kappa)"),
"kendall_tau": ("tool heuristic: reuses the Landis-Koch kappa cutoffs "
"(tau has no established band standard, so treat the label as rough)"),
}


Expand All @@ -138,7 +142,7 @@ def agreement(labels_path: str, metric: str = "cohen_kappa") -> dict:
"use metric='cohen_kappa' or 'kendall_tau'")
human, judge, dropped = load_labels(labels_path)
value = cohen_kappa(human, judge) if metric == "cohen_kappa" else kendall_tau(human, judge)
band = landis_koch_band(value)
band = strength_band(value)
critique = f"{dropped} row(s) dropped on inner-join" if dropped else "no rows dropped"
return {
"metric": metric,
Expand Down
16 changes: 11 additions & 5 deletions tests/test_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def test_kendall_tau_matches_scipy_to_6dp():
assert round(got, 6) == round(want, 6)


@pytest.mark.parametrize("kappa,band", [
@pytest.mark.parametrize("value,band", [
(-0.1, "poor"), (0.10, "slight"), (0.30, "fair"),
(0.50, "moderate"), (0.70, "substantial"), (0.90, "almost perfect"),
])
def test_landis_koch_bands(kappa, band):
assert calibrate.landis_koch_band(kappa) == band
def test_strength_band_cutoffs(value, band):
assert calibrate.strength_band(value) == band


def test_perfect_agreement_kappa_one():
Expand All @@ -49,15 +49,21 @@ def test_agreement_end_to_end_returns_contract(tmp_path):
assert "Landis-Koch" in out["band_basis"]


def test_kendall_tau_band_basis_disclaims_landis_koch(tmp_path):
def test_kendall_tau_band_basis_discloses_heuristic_reuse(tmp_path):
import csv
p = tmp_path / "tau.csv"
with open(p, "w", newline="") as f:
w = csv.writer(f); w.writerow(["item_id", "human_label", "judge_label"])
for i, (h, j) in enumerate([(0,0),(1,1),(2,1),(1,1),(0,0),(2,2)]):
w.writerow([f"i{i}", h, j])
out = calibrate.agreement(str(p), metric="kendall_tau")
assert "NOT Landis-Koch" in out["band_basis"]
basis = out["band_basis"].lower()
# Honest contract: tau has NO established band standard, and the tool must
# ADMIT it reuses the kappa cutoffs as a heuristic — not claim a distinct scale
# while silently using the kappa numbers (the original mislabel this pins against).
assert "heuristic" in basis
assert "kappa" in basis # admits the reuse, not a separate scale
assert "no established" in basis # does not pretend a tau standard exists
assert "operating cutoffs are this tool's recommendation" in out["drift_verdict"]


Expand Down
Loading