Mike-E-Log · Mike-E-Log · May 20, 2026 · May 20, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,7 +18,7 @@ Initial release. Judge-vs-human calibration for LLM evals.
 
 ### Notes
 - Metrics delegate to scipy / scikit-learn (never hand-rolled); a regression test verifies Kendall-τ parity against the author's `cross-vendor-judges` tau-b across discriminating tie/degenerate cases.
-- The drift verdict's action cutoffs (κ≥0.61 / ≥0.41) are disclosed as the tool's editorial operating point, not an established standard. The Landis-Koch band citation applies to Cohen's κ only; Kendall-τ uses heuristic strength bands.
+- The drift verdict's action cutoffs (κ≥0.61 / ≥0.41) are disclosed as the tool's editorial operating point, not an established standard. The Landis-Koch band citation applies to Cohen's κ only; Kendall-τ has no established band standard, so the tool reuses the same κ cutoffs for it as a disclosed heuristic.
 - Not distributed via PyPI by design; install dependencies with `pip install -r requirements.txt`.
 
 [0.1.0]: https://github.com/Mike-E-Log/ai-eval-toolkit/releases/tag/v0.1.0
diff --git a/README.md b/README.md
@@ -51,8 +51,10 @@ The choices here are the point — each is a deliberate call about *not over-cla
   no invented threshold applies there.
 - **The drift verdict's action cutoffs (≥0.61 / ≥0.41) are disclosed as editorial**, not
   dressed up as standard — every verdict carries an explicit caveat.
-- **Kendall-τ uses heuristic strength bands, NOT Landis-Koch** — τ is rank correlation, not
-  κ; conflating them would be wrong, so they're kept distinct.
+- **Kendall-τ has no established band standard** — τ is rank correlation, not κ, so
+  Landis-Koch's κ interpretation doesn't formally apply. The tool reuses the same κ
+  cutoffs as a *disclosed heuristic* (the τ output's `band_basis` says exactly that),
+  rather than inventing a second arbitrary scale or pretending a τ standard exists.
 - **Metrics via scipy / scikit-learn, never hand-rolled** — auditable against reference impls.
 - **The `bias` command is an open question, not a verdict** — experimental, threshold-free by design.
 

diff --git a/eval_tk/calibrate.py b/eval_tk/calibrate.py
@@ -49,7 +49,8 @@ def load_labels(labels_path: str) -> tuple[list[int], list[int], int]:
     return human, judge, dropped
 
 
-# Landis & Koch (1977) interpretation bands for kappa.
+# Landis & Koch (1977) bands. Established standard for Cohen's kappa; reused as a
+# disclosed heuristic for Kendall's tau (which has no standard) — see _BAND_BASIS.
 _BANDS = [
     (0.81, "almost perfect"),
     (0.61, "substantial"),
@@ -73,14 +74,17 @@ def kendall_tau(human: list[int], judge: list[int]) -> float:
         return float(kendalltau(human, judge).correlation)
 
 
-def landis_koch_band(kappa: float) -> str:
-    """Map kappa to the Landis-Koch band label. Returns 'undefined' for NaN input."""
-    if _isnan(kappa):
+def strength_band(value: float) -> str:
+    """Map an agreement/correlation value to a strength label via the Landis-Koch
+    (1977) cutoffs. For Cohen's kappa these ARE the established standard; for
+    Kendall's tau there is no established band standard, so the same cutoffs are
+    reused only as a disclosed heuristic (band_basis says so). Returns 'undefined' for NaN."""
+    if _isnan(value):
         return "undefined"
-    if kappa < 0.0:
+    if value < 0.0:
         return "poor"
     for threshold, label in _BANDS:
-        if kappa >= threshold:
+        if value >= threshold:
             return label
     return "poor"
 
@@ -126,8 +130,8 @@ def demo(metric: str = "cohen_kappa") -> dict:
 
 _BAND_BASIS = {
     "cohen_kappa": "Landis-Koch 1977 (kappa)",
-    "kendall_tau": ("heuristic strength bands "
-                    "(NOT Landis-Koch; tau is rank correlation, not kappa)"),
+    "kendall_tau": ("tool heuristic: reuses the Landis-Koch kappa cutoffs "
+                    "(tau has no established band standard, so treat the label as rough)"),
 }
 
 
@@ -138,7 +142,7 @@ def agreement(labels_path: str, metric: str = "cohen_kappa") -> dict:
                           "use metric='cohen_kappa' or 'kendall_tau'")
     human, judge, dropped = load_labels(labels_path)
     value = cohen_kappa(human, judge) if metric == "cohen_kappa" else kendall_tau(human, judge)
-    band = landis_koch_band(value)
+    band = strength_band(value)
     critique = f"{dropped} row(s) dropped on inner-join" if dropped else "no rows dropped"
     return {
         "metric": metric,

diff --git a/tests/test_parity.py b/tests/test_parity.py
@@ -21,12 +21,12 @@ def test_kendall_tau_matches_scipy_to_6dp():
     assert round(got, 6) == round(want, 6)
 
 
-@pytest.mark.parametrize("kappa,band", [
+@pytest.mark.parametrize("value,band", [
     (-0.1, "poor"), (0.10, "slight"), (0.30, "fair"),
     (0.50, "moderate"), (0.70, "substantial"), (0.90, "almost perfect"),
 ])
-def test_landis_koch_bands(kappa, band):
-    assert calibrate.landis_koch_band(kappa) == band
+def test_strength_band_cutoffs(value, band):
+    assert calibrate.strength_band(value) == band
 
 
 def test_perfect_agreement_kappa_one():
@@ -49,15 +49,21 @@ def test_agreement_end_to_end_returns_contract(tmp_path):
     assert "Landis-Koch" in out["band_basis"]
 
 
-def test_kendall_tau_band_basis_disclaims_landis_koch(tmp_path):
+def test_kendall_tau_band_basis_discloses_heuristic_reuse(tmp_path):
     import csv
     p = tmp_path / "tau.csv"
     with open(p, "w", newline="") as f:
         w = csv.writer(f); w.writerow(["item_id", "human_label", "judge_label"])
         for i, (h, j) in enumerate([(0,0),(1,1),(2,1),(1,1),(0,0),(2,2)]):
             w.writerow([f"i{i}", h, j])
     out = calibrate.agreement(str(p), metric="kendall_tau")
-    assert "NOT Landis-Koch" in out["band_basis"]
+    basis = out["band_basis"].lower()
+    # Honest contract: tau has NO established band standard, and the tool must
+    # ADMIT it reuses the kappa cutoffs as a heuristic — not claim a distinct scale
+    # while silently using the kappa numbers (the original mislabel this test guards against).
+    assert "heuristic" in basis
+    assert "kappa" in basis                       # admits the reuse, not a separate scale
+    assert "no established" in basis              # does not pretend a tau standard exists
     assert "operating cutoffs are this tool's recommendation" in out["drift_verdict"]