diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 4a2dff83..71c78f7c 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -164,6 +164,29 @@ def _knee_point_payload(knee_pick: Optional[CandidatePick]) -> dict[str, Any]:
     }
 
 
+def _deferred_knee_point_payload(
+    *, best_idx: int, val_score: float, body_chars: int,
+) -> dict[str, Any]:
+    """Payload for the val-best path that defers to GEPA's best_idx.
+
+    The val-best path skips the epsilon-band walk (regenerated calibration
+    showed it picked GEPA's default in every run across five epsilon
+    modes), so `applied` is False and `best_idx` is both the picked and
+    the default candidate. `band_roster` stays a list so calibration
+    scripts that read ``.get("band_roster", [])`` keep working.
+    """
+    return {
+        "applied": False,
+        "fallback": "gepa_default",
+        "picked_idx": best_idx,
+        "gepa_default_idx": best_idx,
+        "picked_val_score": val_score,
+        "picked_body_chars": body_chars,
+        "gepa_default_body_chars": body_chars,
+        "band_roster": [],
+    }
+
+
 def _holdout_evaluate_with_metric(module, holdout_examples, metric, lm) -> tuple[float, list[float]]:
     """Score `module` on the holdout via dspy.Evaluate.
 
@@ -992,36 +1015,57 @@ def evolve(
             elapsed = time.time() - start_time
             console.print(f"\n  {optimizer_name} optimization completed in {elapsed:.1f}s")
 
-            # GEPA's default ("best by aggregate valset score") overfits on small
-            # valsets — observed 1.000 valset / 0.78 holdout on obsidian. Knee-point
-            # picks the most parsimonious candidate within ε=1/n_val instead.
+            # The val-best path defers to GEPA's argmax (details.best_idx).
+            # Regenerated calibration showed the epsilon-band selector picked
+            # GEPA's default 10/10 across five epsilon modes; see
+            # reports/calibration_findings.md Finding 3. The --knee-point-strategy
+            # smallest path still routes through select_knee_point for users
+            # explicitly chasing compression.
             # Skipped cleanly when MIPROv2 fallback fired (no detailed_results).
             knee_pick: Optional[CandidatePick] = None
+            knee_payload: dict[str, Any] = {
+                "applied": False, "reason": "no_detailed_results",
+            }
             if hasattr(optimized_module, "detailed_results"):
                 details = optimized_module.detailed_results
-                knee_pick = select_knee_point(
-                    candidates=details.candidates,
-                    val_aggregate_scores=details.val_aggregate_scores,
-                    n_val=len(valset),
-                    static_validator=lambda txt: validator.validate_static(
-                        reassemble_skill(skill["frontmatter"], txt), "skill",
-                    ),
-                    gepa_default_idx=details.best_idx,
-                    epsilon=knee_point_epsilon,
-                    strategy=knee_point_strategy,
-                )
-                # Fresh module instead of mutating in place: avoids carrying
-                # ChainOfThought state (demos, etc.) from the GEPA-default module —
-                # we only want the picked candidate's instruction text.
-                optimized_module = SkillModule(knee_pick.skill_text)
-                console.print(
-                    f"\n[bold]Knee-point selection[/bold]: picked candidate "
-                    f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, "
-                    f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} "
-                    f"in band, {knee_pick.body_chars} chars vs GEPA default "
-                    f"{knee_pick.gepa_default_body_chars} chars; ε={knee_pick.epsilon:.3f}; "
-                    f"fallback={knee_pick.fallback})"
-                )
+                if knee_point_strategy == "smallest":
+                    knee_pick = select_knee_point(
+                        candidates=details.candidates,
+                        val_aggregate_scores=details.val_aggregate_scores,
+                        n_val=len(valset),
+                        static_validator=lambda txt: validator.validate_static(
+                            reassemble_skill(skill["frontmatter"], txt), "skill",
+                        ),
+                        gepa_default_idx=details.best_idx,
+                        epsilon=knee_point_epsilon,
+                        strategy=knee_point_strategy,
+                    )
+                    optimized_module = SkillModule(knee_pick.skill_text)
+                    knee_payload = _knee_point_payload(knee_pick)
+                    console.print(
+                        f"\n[bold]Knee-point selection[/bold]: picked candidate "
+                        f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, "
+                        f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} "
+                        f"in band, {knee_pick.body_chars} chars vs GEPA default "
+                        f"{knee_pick.gepa_default_body_chars} chars; ε={knee_pick.epsilon:.3f}; "
+                        f"fallback={knee_pick.fallback})"
+                    )
+                else:
+                    # val-best no longer walks the band on static failure;
+                    # --knee-point-strategy smallest preserves that behavior.
+                    best_text = details.candidates[details.best_idx].skill_text
+                    optimized_module = SkillModule(best_text)
+                    knee_payload = _deferred_knee_point_payload(
+                        best_idx=details.best_idx,
+                        val_score=float(details.val_aggregate_scores[details.best_idx]),
+                        body_chars=len(best_text),
+                    )
+                    console.print(
+                        f"\n[bold]Candidate selection[/bold]: GEPA val-argmax "
+                        f"(candidate {details.best_idx}, val="
+                        f"{details.val_aggregate_scores[details.best_idx]:.3f}, "
+                        f"{len(best_text)} chars)"
+                    )
 
             evolved_body = optimized_module.skill_text
             evolved_full = reassemble_skill(skill["frontmatter"], evolved_body)
@@ -1049,7 +1093,7 @@ def evolve(
                     "decision_signal": "synthetic",
                     "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed],
                     "messages": [c.message for c in static_constraints if not c.passed],
-                    "knee_point": _knee_point_payload(knee_pick),
+                    "knee_point": knee_payload,
                     "dataset": _dataset_payload(dataset),
                     "run_inputs": build_run_inputs(
                         config=config,
@@ -1148,7 +1192,7 @@ def evolve(
                         "baseline_chars": baseline_chars,
                         "evolved_chars": evolved_chars,
                         "growth_pct": growth_pct,
-                        "knee_point": _knee_point_payload(knee_pick),
+                        "knee_point": knee_payload,
                         "dataset": _dataset_payload(dataset),
                         "run_inputs": run_inputs,
                     })
@@ -1191,7 +1235,7 @@ def evolve(
                         "baseline_chars": baseline_chars,
                         "evolved_chars": evolved_chars,
                         "growth_pct": growth_pct,
-                        "knee_point": _knee_point_payload(knee_pick),
+                        "knee_point": knee_payload,
                         "dataset": _dataset_payload(dataset),
                         "run_inputs": run_inputs,
                     })
@@ -1333,7 +1377,7 @@ def evolve(
                 "win_loss": _compute_win_loss(baseline_per_example, evolved_per_example),
                 "failed_constraints": [c.constraint_name for c in growth_constraints if not c.passed],
                 "messages": [c.message for c in growth_constraints if not c.passed],
-                "knee_point": _knee_point_payload(knee_pick),
+                "knee_point": knee_payload,
                 "dataset": _dataset_payload(dataset),
                 "run_inputs": run_inputs,
             }
@@ -1610,19 +1654,19 @@ def evolve(
     "--knee-point-epsilon",
     default=None,
     type=float,
-    help="Advanced: ε tolerance for knee-point Pareto selection. Default = "
-    "1/n_val (one valset example's worth of disagreement). Override only when "
-    "you have a calibrated reason — random tightening narrows the band and "
-    "biases selection back toward the GEPA default.",
+    help="Advanced: ε tolerance for the knee-point band. Only used by "
+    "--knee-point-strategy=smallest; the default val-best path defers to "
+    "GEPA's val-argmax and ignores ε. Default = 1/n_val (one valset "
+    "example's worth of disagreement).",
 )
 @click.option(
     "--knee-point-strategy",
     default="val-best",
     type=click.Choice(["val-best", "smallest"]),
-    help="Within the ε-band, which candidate to pick. val-best (default): "
-    "highest val score wins, smallest body as tiebreak. smallest: greedy "
-    "parsimony — picks the smallest body regardless of val cost; "
-    "available for users explicitly chasing compression.",
+    help="How to pick the deployed candidate from GEPA's output. val-best "
+    "(default): defer to GEPA's val-argmax (best_idx) — does not walk an "
+    "ε-band. smallest: walk the ε-band and pick the smallest body, "
+    "accepting val cost for compression.",
 )
 @click.option(
     "--bap-safety-margin",
diff --git a/evolution/skills/knee_point.py b/evolution/skills/knee_point.py
index f5cb09a5..51340c8c 100644
--- a/evolution/skills/knee_point.py
+++ b/evolution/skills/knee_point.py
@@ -13,8 +13,6 @@
 
 from __future__ import annotations
 
-import math
-import random
 from dataclasses import dataclass
 from typing import Any, Callable, Optional, Protocol
 
@@ -24,56 +22,6 @@ class _SupportsSkillText(Protocol):
     def skill_text(self) -> str: ...
 
 
-def _estimate_val_noise(
-    val_subscores: list[list[float]],
-    best_idx: int,
-    *,
-    n_bootstrap: int = 1000,
-    confidence: float = 0.90,
-    seed: int = 0,
-) -> float:
-    """Estimate the noise floor on val scores via paired bootstrap.
-
-    Returns the half-width of the ``confidence``-level CI on the mean
-    pairwise diff between the best candidate and each competitor. Used as
-    the knee-point ε so the band reflects the empirical resolution of
-    valset scoring rather than the geometric 1/n_val floor, which sits
-    an order of magnitude below the actual paired noise at typical
-    n_val (8–50).
-
-    Single-candidate fallback: with no competitor to pair against, returns
-    ``0.5 / sqrt(n_val)`` — the worst-case binomial SE at p=0.5.
-    """
-    if len(val_subscores) < 2:
-        return 0.5 / math.sqrt(len(val_subscores[best_idx]))
-
-    best = val_subscores[best_idx]
-    diffs: list[float] = []
-    for k, other in enumerate(val_subscores):
-        if k == best_idx:
-            continue
-        covered = min(len(best), len(other))
-        diffs.extend(best[i] - other[i] for i in range(covered))
-
-    if not diffs or all(d == 0.0 for d in diffs):
-        return 0.0
-
-    rng = random.Random(seed)
-    n = len(diffs)
-    boot_means: list[float] = []
-    for _ in range(n_bootstrap):
-        sample_sum = 0.0
-        for _ in range(n):
-            sample_sum += diffs[rng.randrange(n)]
-        boot_means.append(sample_sum / n)
-
-    boot_means.sort()
-    tail = (1.0 - confidence) / 2.0
-    lower = boot_means[int(tail * n_bootstrap)]
-    upper = boot_means[min(int((1.0 - tail) * n_bootstrap), n_bootstrap - 1)]
-    return (upper - lower) / 2.0
-
-
 @dataclass(frozen=True)
 class CandidatePick:
     """A selected candidate plus the diagnostics needed to debug the choice.
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 40759bde..7b62edcb 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -69,7 +69,6 @@
 )
 from evolution.core.run_inputs import build_run_inputs
 from evolution.core.stats import paired_bootstrap
-from evolution.skills.knee_point import CandidatePick, select_knee_point
 from evolution.tools.session_mining import (
     HermesToolImporter,
     build_tool_dataset_from_sessions,
@@ -187,21 +186,24 @@ def _compute_win_loss(
     }
 
 
-def _knee_point_payload(knee_pick: Optional[CandidatePick]) -> dict[str, Any]:
-    if knee_pick is None:
-        return {"applied": False, "reason": "no_detailed_results"}
+def _deferred_knee_point_payload(
+    *, best_idx: int, val_score: float, body_chars: int,
+) -> dict[str, Any]:
+    """Payload for the val-best path that defers to GEPA's best_idx.
+
+    Mirrors evolve_skill's deferred payload: no band walk runs, so
+    `applied` is False and `best_idx` is both picked and default index.
+    `band_roster` stays a list so ``.get("band_roster", [])`` readers work.
+    """
     return {
-        "applied": True,
-        "fallback": knee_pick.fallback,
-        "epsilon": knee_pick.epsilon,
-        "band_size": knee_pick.band_size,
-        "picked_idx": knee_pick.picked_idx,
-        "picked_val_score": knee_pick.val_score,
-        "picked_val_rank_in_band": knee_pick.val_rank_in_band,
-        "picked_body_chars": knee_pick.body_chars,
-        "gepa_default_idx": knee_pick.gepa_default_idx,
-        "gepa_default_body_chars": knee_pick.gepa_default_body_chars,
-        "band_roster": knee_pick.band_roster,
+        "applied": False,
+        "fallback": "gepa_default",
+        "picked_idx": best_idx,
+        "gepa_default_idx": best_idx,
+        "picked_val_score": val_score,
+        "picked_body_chars": body_chars,
+        "gepa_default_body_chars": body_chars,
+        "band_roster": [],
     }
 
 
@@ -751,30 +753,33 @@ def evolve(
             elapsed = time.time() - start_time
             console.print(f"\n  GEPA optimization completed in {elapsed:.1f}s")
 
-            knee_pick: Optional[CandidatePick] = None
+            # Defer to GEPA's val-argmax (details.best_idx). Regenerated
+            # calibration showed the epsilon-band selector picked GEPA's
+            # default 10/10 across five epsilon modes; see
+            # reports/calibration_findings.md Finding 3.
+            knee_payload: dict[str, Any] = {
+                "applied": False, "reason": "no_detailed_results",
+            }
             if hasattr(optimized_module, "detailed_results"):
                 details = optimized_module.detailed_results
-                knee_pick = select_knee_point(
-                    candidates=details.candidates,
-                    val_aggregate_scores=details.val_aggregate_scores,
-                    n_val=len(valset),
-                    static_validator=lambda txt: validator.validate_static(txt, "tool_description"),
-                    gepa_default_idx=details.best_idx,
-                    text_extractor=lambda c: _candidate_description(c, tool_name),
+                evolved_description = _candidate_description(
+                    details.candidates[details.best_idx], tool_name,
                 )
-                evolved_description = _candidate_description(knee_pick.module, tool_name)
                 optimized_module = ToolModule(
                     target_tool_name=tool_name,
                     manifest=manifest,
                     target_description=evolved_description,
                 )
+                knee_payload = _deferred_knee_point_payload(
+                    best_idx=details.best_idx,
+                    val_score=float(details.val_aggregate_scores[details.best_idx]),
+                    body_chars=len(evolved_description),
+                )
                 console.print(
-                    f"\n[bold]Knee-point selection[/bold]: picked candidate "
-                    f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, "
-                    f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} in band, "
-                    f"{knee_pick.body_chars} chars vs GEPA default "
-                    f"{knee_pick.gepa_default_body_chars}; ε={knee_pick.epsilon:.3f}; "
-                    f"fallback={knee_pick.fallback})"
+                    f"\n[bold]Candidate selection[/bold]: GEPA val-argmax "
+                    f"(candidate {details.best_idx}, val="
+                    f"{details.val_aggregate_scores[details.best_idx]:.3f}, "
+                    f"{len(evolved_description)} chars)"
                 )
             else:
                 evolved_description = optimized_module.description_text
@@ -817,7 +822,7 @@ def evolve(
                     "decision_signal": "synthetic",
                     "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed],
                     "messages": [c.message for c in static_constraints if not c.passed],
-                    "knee_point": _knee_point_payload(knee_pick),
+                    "knee_point": knee_payload,
                     "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
                     "run_inputs": run_inputs,
                     **tool_payload_fields,
@@ -895,7 +900,7 @@ def evolve(
                         "baseline_chars": baseline_chars,
                         "evolved_chars": evolved_chars,
                         "growth_pct": growth_pct,
-                        "knee_point": _knee_point_payload(knee_pick),
+                        "knee_point": knee_payload,
                         "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
                         "run_inputs": run_inputs,
                         **tool_payload_fields,
@@ -940,7 +945,7 @@ def evolve(
                         "baseline_chars": baseline_chars,
                         "evolved_chars": evolved_chars,
                         "growth_pct": growth_pct,
-                        "knee_point": _knee_point_payload(knee_pick),
+                        "knee_point": knee_payload,
                         "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
                         "run_inputs": run_inputs,
                         **tool_payload_fields,
@@ -1067,7 +1072,7 @@ def evolve(
                 "win_loss": _compute_win_loss(baseline_per_example, evolved_per_example),
                 "failed_constraints": [c.constraint_name for c in growth_constraints if not c.passed],
                 "messages": [c.message for c in growth_constraints if not c.passed],
-                "knee_point": _knee_point_payload(knee_pick),
+                "knee_point": knee_payload,
                 "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
                 "run_inputs": run_inputs,
                 **tool_payload_fields,
diff --git a/reports/calibration_findings.md b/reports/calibration_findings.md
index da028236..cf7ec97d 100644
--- a/reports/calibration_findings.md
+++ b/reports/calibration_findings.md
@@ -41,6 +41,25 @@ The four campaign studies produced robust observations even though the (free, sl
 
 These are starting points for a future campaign with a richer corpus, not current defaults.
 
+### 2026-05-24 update — knee-point ε confirmed as a no-op on val-best
+
+A targeted regeneration replayed the four-skill calibration corpus
+(nano-pdf, apple-notes, polymarket, huggingface-hub) at N\*=250,
+ratio\*=0.65 across 10 runs and five ε modes:
+
+| Mode | Mean transfer error | Deploy rate |
+|---|---|---|
+| 1.0 / n\_val (status quo) | 0.0466 | 70% |
+| 0.5 / n\_val | 0.0466 | 70% |
+| 2.0 / n\_val | 0.0466 | 70% |
+| 3.0 / n\_val | 0.0466 | 70% |
+| noise-estimated (paired-bootstrap) | 0.0466 | 70% |
+
+All five ε modes picked the same candidate (GEPA's default) in 10/10
+runs, so ε has no effect on the val-best path. The ε-band selector was
+therefore dropped from that path; `--knee-point-strategy smallest` is
+preserved for users who explicitly want compression.
+
 ## Finding 4 — Non-inferiority gate at tolerance 0.05 strictly improves on `no_regression`
 
 A post-hoc gate-rule replay (script: `scripts/analysis/option1_replay_gate_rule.py` on the campaign's archive branch) tested whether the **non-inferiority** rule (`bootstrap.lower_bound ≥ -tolerance`) better matches the campaign's compression-bias behavior than the current `no_regression_only` rule (`bootstrap.mean ≥ 0`). Sweep across the 17 instrumented runs:
diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py
index b285c9d7..146179b6 100644
--- a/tests/skills/test_evolve_skill_cl_aware_gate.py
+++ b/tests/skills/test_evolve_skill_cl_aware_gate.py
@@ -35,7 +35,6 @@
 from evolution.core.dataset_builder import EvalDataset, EvalExample
 from evolution.core.saturation_check import SaturationReport
 from evolution.skills.evolve_skill import evolve
-from evolution.skills.knee_point import CandidatePick
 from evolution.validation.report import (
     PhaseResult,
     TaskResult,
@@ -154,31 +153,6 @@ def _phase(tasks: list[TaskResult]) -> PhaseResult:
     )
 
 
-def _make_knee_pick(evolved_body: str) -> CandidatePick:
-    """Build a CandidatePick that select_knee_point would return.
-
-    ``skill_text`` IS the evolved body (no frontmatter). evolve_skill.py
-    then reassembles the full file via reassemble_skill(frontmatter, body)
-    for the static checks, but force_run() is called with the body alone.
-    """
-    fake_module = MagicMock()
-    fake_module.skill_text = evolved_body
-    return CandidatePick(
-        module=fake_module,
-        skill_text=evolved_body,
-        body_chars=len(evolved_body),
-        val_score=0.8,
-        val_rank_in_band=1,
-        band_size=1,
-        epsilon=0.1,
-        fallback="knee",
-        picked_idx=0,
-        gepa_default_idx=0,
-        gepa_default_body_chars=len(evolved_body),
-        band_roster=[],
-    )
-
-
 def _make_fake_gepa(evolved_body: str):
     """Build a fake dspy.GEPA whose ``compile()`` returns a module with
     the detailed_results shape the knee-point path expects."""
@@ -228,7 +202,6 @@ def _patch_stack(
     """
     fake_builder = MagicMock()
     fake_builder.generate.return_value = _fake_skill_dataset()
-    knee_pick = _make_knee_pick(evolved_body)
     evolved_per = [holdout_evolved_mean] * holdout_n
 
     def _maybe_build(**kwargs):
@@ -259,10 +232,6 @@ def _maybe_build(**kwargs):
             "evolution.skills.evolve_skill.dspy.GEPA",
             new=_make_fake_gepa(evolved_body),
         ))
-        stack.enter_context(patch(
-            "evolution.skills.evolve_skill.select_knee_point",
-            return_value=knee_pick,
-        ))
         stack.enter_context(patch(
             "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
             return_value=(holdout_evolved_mean, evolved_per),
diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
index 67131357..781719ab 100644
--- a/tests/skills/test_evolve_skill_saturation_preflight.py
+++ b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -72,22 +72,26 @@ def test_healthy_band_does_not_prompt(self, skill_dir):
         past the abort branch.
         """
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         healthy = SaturationReport(
             band="healthy", holdout_score=0.5, holdout_n=10,
             holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
         )
-        fake_module = MagicMock()
-        fake_module.skill_text = "evolved skill text"
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved skill text", body_chars=18,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=18, band_roster=[],
-        )
         fake_builder = MagicMock()
         fake_builder.generate.return_value = _fake_skill_dataset()
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float
+        # and details.candidates[best_idx].skill_text resolves to a string.
         gepa_mock = MagicMock()
+        fake_candidate = MagicMock()
+        fake_candidate.skill_text = "evolved skill text"
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[fake_candidate],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
+        )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         with patch(
             "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
         ), patch(
@@ -97,8 +101,6 @@ def test_healthy_band_does_not_prompt(self, skill_dir):
         ), patch(
             "evolution.skills.evolve_skill.interactive_confirm"
         ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch(
-            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
-        ), patch(
             "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
         ) as mock_holdout_eval:
             mock_holdout_eval.return_value = (0.6, [0.6] * 10)
@@ -189,22 +191,26 @@ def test_force_saturation_check_overrides_abort(self, skill_dir):
         actually overrode the abort.
         """
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         saturated = SaturationReport(
             band="no_headroom", holdout_score=0.99, holdout_n=50,
             holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
         )
-        fake_module = MagicMock()
-        fake_module.skill_text = "evolved skill text"
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved skill text", body_chars=18,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=18, band_roster=[],
-        )
         fake_builder = MagicMock()
         fake_builder.generate.return_value = _fake_skill_dataset()
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float
+        # and details.candidates[best_idx].skill_text resolves to a string.
         gepa_mock = MagicMock()
+        fake_candidate = MagicMock()
+        fake_candidate.skill_text = "evolved skill text"
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[fake_candidate],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
+        )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         with patch(
             "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
         ), patch(
@@ -216,8 +222,6 @@ def test_force_saturation_check_overrides_abort(self, skill_dir):
         ), patch(
             "evolution.skills.evolve_skill.interactive_confirm"
         ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch(
-            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
-        ), patch(
             "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
         ) as mock_holdout_eval:
             mock_holdout_eval.return_value = (0.6, [0.6] * 10)
@@ -236,23 +240,24 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
         module should NOT be re-scored on the holdout after GEPA finishes.
         This is the 'net cost ~zero' contract."""
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         from unittest.mock import MagicMock
 
         healthy = SaturationReport(
             band="healthy", holdout_score=0.6, holdout_n=10,
             holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
         )
-        # Fake knee-point result so execution reaches the holdout site.
-        # skill_text must be a non-empty string so SkillModule can be built.
-        fake_module = MagicMock()
-        fake_module.skill_text = "evolved skill text"
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved skill text", body_chars=18,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=18, band_roster=[],
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.candidates[best_idx].skill_text returns a real string.
+        gepa_mock = MagicMock()
+        fake_candidate = SimpleNamespace(skill_text="evolved skill text")
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[fake_candidate],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
         )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         fake_builder = MagicMock()
         fake_builder.generate.return_value = _fake_skill_dataset()
         with patch(
@@ -261,9 +266,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
             "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.skills.evolve_skill._preflight_lm_credentials"
-        ), patch("evolution.skills.evolve_skill.dspy.GEPA"), patch(
-            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
-        ), patch(
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch(
             "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
         ) as mock_holdout_eval:
             mock_holdout_eval.return_value = (0.6, [0.6] * 10)
@@ -293,7 +296,7 @@ def test_flag_passes_through_to_dspy_gepa(self, skill_dir):
         CLI with --gepa-minibatch-size 7. Assert the constructed instance
         carries the value on the documented attribute."""
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         captured: dict = {}
         original_init = __import__("dspy").GEPA.__init__
 
@@ -305,13 +308,17 @@ def recording_init(self, *args, **kwargs):
             band="healthy", holdout_score=0.6, holdout_n=10,
             holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
         )
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float
+        # and details.candidates[best_idx].skill_text resolves to a string.
+        fake_candidate = MagicMock()
+        fake_candidate.skill_text = "evolved skill text"
         fake_module = MagicMock()
         fake_module.skill_text = "evolved skill text"
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved skill text", body_chars=18,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=18, band_roster=[],
+        fake_module.detailed_results = SimpleNamespace(
+            candidates=[fake_candidate],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
         )
         fake_builder = MagicMock()
         fake_builder.generate.return_value = _fake_skill_dataset()
@@ -323,8 +330,6 @@ def recording_init(self, *args, **kwargs):
             "evolution.skills.evolve_skill._preflight_lm_credentials"
         ), patch("evolution.skills.evolve_skill.dspy.GEPA.__init__", recording_init), patch(
             "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module
-        ), patch(
-            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
         ), patch(
             "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
             return_value=(0.6, [0.6] * 10),
diff --git a/tests/skills/test_knee_point_noise_estimation.py b/tests/skills/test_knee_point_noise_estimation.py
deleted file mode 100644
index fa2c6ca7..00000000
--- a/tests/skills/test_knee_point_noise_estimation.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""Tests for noise-estimated knee-point ε via paired bootstrap.
-
-Pure-Python, no LM. Synthetic val_subscores matrices exercise the helper's
-degenerate paths (saturation, single candidate, all-zero diffs, partial
-coverage) and pin its order-of-magnitude behavior against the analytical
-binomial SE for a Bernoulli front.
-"""
-
-from __future__ import annotations
-
-import math
-import random
-
-import pytest
-
-from evolution.skills.knee_point import _estimate_val_noise
-
-
-class TestEstimateValNoise:
-    def test_estimate_val_noise_returns_zero_on_saturated_matrix(self):
-        # Every candidate scores 1.0 everywhere → diff vector is all zeros →
-        # bootstrap CI collapses to [0, 0]. No useful signal, no band.
-        val_subscores = [[1.0] * 50 for _ in range(5)]
-        eps = _estimate_val_noise(val_subscores, best_idx=0)
-        assert eps == 0.0
-
-    def test_estimate_val_noise_matches_analytical_se_on_bernoulli_p_half(self):
-        # Independent Bernoulli(0.5) draws for best vs one competitor. The
-        # paired diff has Var(X-Y) = 2·p(1-p) = 0.5 at p=0.5, so the SE of
-        # the mean diff at n=50 is √(0.5/50) = 0.1. A 90% normal CI half-
-        # width is ~1.645·SE ≈ 0.165. The helper's bootstrap CI half-width
-        # should land in this neighborhood; a wide tolerance catches sign
-        # errors and axis mistakes without overfitting to RNG quirks.
-        rng = random.Random(123)
-        n = 50
-        best_scores = [float(rng.random() < 0.5) for _ in range(n)]
-        other_scores = [float(rng.random() < 0.5) for _ in range(n)]
-        val_subscores = [best_scores, other_scores]
-
-        eps = _estimate_val_noise(val_subscores, best_idx=0)
-
-        paired_se = math.sqrt(2.0 * 0.5 * 0.5 / n)
-        analytical_ci_half = 1.645 * paired_se  # ≈ 0.165
-        assert eps == pytest.approx(analytical_ci_half, rel=0.4, abs=0.05)
-
-    def test_estimate_val_noise_widens_with_higher_variance(self):
-        # Low-variance: diffs cluster tight (~0.01 spread).
-        # High-variance: diffs span ±0.5. Bootstrap CI half-width must
-        # be strictly larger on the high-variance matrix.
-        n = 40
-        best_low = [0.5] * n
-        other_low = [0.5 + (0.01 if i % 2 == 0 else -0.01) for i in range(n)]
-        low_var = [best_low, other_low]
-
-        best_high = [0.5] * n
-        other_high = [0.5 + (0.5 if i % 2 == 0 else -0.5) for i in range(n)]
-        high_var = [best_high, other_high]
-
-        eps_low = _estimate_val_noise(low_var, best_idx=0)
-        eps_high = _estimate_val_noise(high_var, best_idx=0)
-
-        assert eps_high > eps_low
-
-    def test_estimate_val_noise_falls_back_on_single_candidate(self):
-        # Only one candidate → no paired diffs possible. Degenerate path
-        # returns the binomial-SE-ish floor 0.5 / √n_val.
-        n_val = 64
-        val_subscores = [[0.7] * n_val]
-        eps = _estimate_val_noise(val_subscores, best_idx=0)
-        assert eps == pytest.approx(0.5 / math.sqrt(n_val))
-        assert eps == pytest.approx(0.0625)
-
-    def test_estimate_val_noise_is_deterministic_with_seed(self):
-        rng = random.Random(42)
-        n = 30
-        best_scores = [float(rng.random() < 0.6) for _ in range(n)]
-        other_scores = [float(rng.random() < 0.4) for _ in range(n)]
-        val_subscores = [best_scores, other_scores]
-
-        eps_a = _estimate_val_noise(val_subscores, best_idx=0)
-        eps_b = _estimate_val_noise(val_subscores, best_idx=0)
-        assert eps_a == eps_b
-
-    def test_estimate_val_noise_handles_partial_coverage(self):
-        # Coverage policy under test: align by position; aggregate only over
-        # indices present in both best and competitor (i.e., the first
-        # min(len(best), len(k)) positions). This matches how DSPy stores
-        # val_subscores positionally per-example; positions beyond the
-        # shorter list are treated as un-evaluated, not as zeros.
-        n_best = 50
-        n_other = 30
-        rng = random.Random(7)
-        best_scores = [float(rng.random() < 0.5) for _ in range(n_best)]
-        other_scores = [float(rng.random() < 0.5) for _ in range(n_other)]
-        val_subscores = [best_scores, other_scores]
-
-        eps = _estimate_val_noise(val_subscores, best_idx=0)
-        # No crash; non-negative; finite.
-        assert eps >= 0.0
-        assert math.isfinite(eps)
diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
index 20db7b3d..b237a3ff 100644
--- a/tests/tools/test_evolve_tool_cl_aware_gate.py
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -25,7 +25,6 @@
 
 from evolution.core.dataset_builder import EvalExample
 from evolution.core.saturation_check import SaturationReport
-from evolution.skills.knee_point import CandidatePick
 from evolution.tools.evolve_tool import evolve
 from evolution.validation.report import (
     PhaseResult,
@@ -122,25 +121,6 @@ def _phase(tasks: list[TaskResult]) -> PhaseResult:
     )
 
 
-def _make_knee_pick(evolved_description: str) -> CandidatePick:
-    """Build a CandidatePick that select_knee_point would return."""
-    fake_module = MagicMock()
-    return CandidatePick(
-        module=fake_module,
-        skill_text=evolved_description,
-        body_chars=len(evolved_description),
-        val_score=0.8,
-        val_rank_in_band=1,
-        band_size=1,
-        epsilon=0.1,
-        fallback="knee",
-        picked_idx=0,
-        gepa_default_idx=0,
-        gepa_default_body_chars=len(evolved_description),
-        band_roster=[],
-    )
-
-
 def _make_fake_gepa(evolved_description: str):
     """Build a fake dspy.GEPA whose ``compile()`` returns a module with
     the detailed_results shape the knee-point path expects."""
@@ -195,7 +175,6 @@ def _patch_stack(
     """
     fake_builder = MagicMock()
     fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
-    knee_pick = _make_knee_pick(evolved_description)
     evolved_per = [holdout_evolved_mean] * holdout_n
 
     def _maybe_build(**kwargs):
@@ -226,10 +205,6 @@ def _maybe_build(**kwargs):
             "evolution.tools.evolve_tool.dspy.GEPA",
             new=_make_fake_gepa(evolved_description),
         ))
-        stack.enter_context(patch(
-            "evolution.tools.evolve_tool.select_knee_point",
-            return_value=knee_pick,
-        ))
         stack.enter_context(patch(
             "evolution.tools.evolve_tool._candidate_description",
             return_value=evolved_description,
diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py
index fce76cd5..becb07c2 100644
--- a/tests/tools/test_evolve_tool_saturation_preflight.py
+++ b/tests/tools/test_evolve_tool_saturation_preflight.py
@@ -78,21 +78,23 @@ def test_healthy_band_does_not_prompt(self, manifest_dir):
         proves the run actually proceeded past the abort branch.
         """
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         healthy = SaturationReport(
             band="healthy", holdout_score=0.5, holdout_n=10,
             holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
         )
-        fake_module = MagicMock()
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved desc", body_chars=12,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=12, band_roster=[],
-        )
         fake_builder = MagicMock()
         fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float.
         gepa_mock = MagicMock()
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[MagicMock()],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
+        )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         with patch(
             "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
         ), patch(
@@ -102,8 +104,6 @@ def test_healthy_band_does_not_prompt(self, manifest_dir):
         ), patch(
             "evolution.tools.evolve_tool.interactive_confirm"
         ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch(
-            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
-        ), patch(
             "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
         ), patch(
             "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
@@ -200,22 +200,24 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir):
         overrode the abort.
         """
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         saturated = SaturationReport(
             band="no_headroom", holdout_score=0.99, holdout_n=50,
             holdout_per_example=[1.0] * 50,
             suggestions=["x"], thresholds={},
         )
-        fake_module = MagicMock()
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved desc", body_chars=12,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=12, band_roster=[],
-        )
         fake_builder = MagicMock()
         fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float.
         gepa_mock = MagicMock()
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[MagicMock()],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
+        )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         with patch(
             "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
         ), patch(
@@ -227,8 +229,6 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir):
         ), patch(
             "evolution.tools.evolve_tool.interactive_confirm"
         ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch(
-            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
-        ), patch(
             "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
         ), patch(
             "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
@@ -249,7 +249,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
         module should NOT be re-scored on the holdout after GEPA finishes.
         This is the 'net cost ~zero' contract."""
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         from unittest.mock import MagicMock
 
         # Healthy report so preflight passes without prompting; preflight
@@ -258,14 +258,16 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
             band="healthy", holdout_score=0.6, holdout_n=10,
             holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
         )
-        # Fake knee-point result so execution reaches the holdout site.
-        fake_module = MagicMock()
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved desc", body_chars=12,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=12, band_roster=[],
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float.
+        gepa_mock = MagicMock()
+        fake_optimized = MagicMock()
+        fake_optimized.detailed_results = SimpleNamespace(
+            candidates=[MagicMock()],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
         )
+        gepa_mock.return_value.compile.return_value = fake_optimized
         fake_builder = MagicMock()
         fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
         with patch(
@@ -274,9 +276,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
             "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.tools.evolve_tool._preflight_lm_credentials"
-        ), patch("evolution.tools.evolve_tool.dspy.GEPA"), patch(
-            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
-        ), patch(
+        ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch(
             "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
         ), patch(
             "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
@@ -309,7 +309,7 @@ def test_flag_passes_through_to_dspy_gepa(self, manifest_dir):
         carries the value on the documented attribute. Catches future
         DSPy refactors that rename reflection_minibatch_size."""
         from evolution.core.saturation_check import SaturationReport
-        from evolution.skills.knee_point import CandidatePick
+        from types import SimpleNamespace
         captured: dict = {}
         original_init = __import__("dspy").GEPA.__init__
 
@@ -321,12 +321,13 @@ def recording_init(self, *args, **kwargs):
             band="healthy", holdout_score=0.6, holdout_n=10,
             holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
         )
+        # Shape the fake GEPA's compile() output so the val-best path's
+        # details.val_aggregate_scores[best_idx] resolves to a real float.
         fake_module = MagicMock()
-        knee_pick = CandidatePick(
-            module=fake_module, skill_text="evolved desc", body_chars=12,
-            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
-            fallback="knee", picked_idx=0, gepa_default_idx=0,
-            gepa_default_body_chars=12, band_roster=[],
+        fake_module.detailed_results = SimpleNamespace(
+            candidates=[MagicMock()],
+            val_aggregate_scores=[1.0],
+            best_idx=0,
         )
         fake_builder = MagicMock()
         fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
@@ -338,8 +339,6 @@ def recording_init(self, *args, **kwargs):
             "evolution.tools.evolve_tool._preflight_lm_credentials"
         ), patch("evolution.tools.evolve_tool.dspy.GEPA.__init__", recording_init), patch(
             "evolution.tools.evolve_tool.dspy.GEPA.compile", return_value=fake_module
-        ), patch(
-            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
         ), patch(
             "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
         ), patch(