diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 4a2dff83..71c78f7c 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -164,6 +164,29 @@ def _knee_point_payload(knee_pick: Optional[CandidatePick]) -> dict[str, Any]: } +def _deferred_knee_point_payload( + *, best_idx: int, val_score: float, body_chars: int, +) -> dict[str, Any]: + """Payload for the val-best path that defers to GEPA's best_idx. + + Regenerated calibration showed the epsilon-band selector picked + GEPA's default in every run across five epsilon modes; the val-best + short-circuit skips the band walk entirely. `band_roster` stays a + list so downstream calibration scripts that access it via + ``.get("band_roster", [])`` keep working. + """ + return { + "applied": False, + "fallback": "gepa_default", + "picked_idx": best_idx, + "gepa_default_idx": best_idx, + "picked_val_score": val_score, + "picked_body_chars": body_chars, + "gepa_default_body_chars": body_chars, + "band_roster": [], + } + + def _holdout_evaluate_with_metric(module, holdout_examples, metric, lm) -> tuple[float, list[float]]: """Score `module` on the holdout via dspy.Evaluate. @@ -992,36 +1015,57 @@ def evolve( elapsed = time.time() - start_time console.print(f"\n {optimizer_name} optimization completed in {elapsed:.1f}s") - # GEPA's default ("best by aggregate valset score") overfits on small - # valsets — observed 1.000 valset / 0.78 holdout on obsidian. Knee-point - # picks the most parsimonious candidate within ε=1/n_val instead. + # The val-best path defers to GEPA's argmax (details.best_idx). + # Regenerated calibration showed the epsilon-band selector picked + # GEPA's default 10/10 across five epsilon modes; see + # reports/calibration_findings.md Finding 3. The --knee-point-strategy + # smallest path still routes through select_knee_point for users + # explicitly chasing compression. # Skipped cleanly when MIPROv2 fallback fired (no detailed_results). 
knee_pick: Optional[CandidatePick] = None + knee_payload: dict[str, Any] = { + "applied": False, "reason": "no_detailed_results", + } if hasattr(optimized_module, "detailed_results"): details = optimized_module.detailed_results - knee_pick = select_knee_point( - candidates=details.candidates, - val_aggregate_scores=details.val_aggregate_scores, - n_val=len(valset), - static_validator=lambda txt: validator.validate_static( - reassemble_skill(skill["frontmatter"], txt), "skill", - ), - gepa_default_idx=details.best_idx, - epsilon=knee_point_epsilon, - strategy=knee_point_strategy, - ) - # Fresh module instead of mutating in place: avoids carrying - # ChainOfThought state (demos, etc.) from the GEPA-default module — - # we only want the picked candidate's instruction text. - optimized_module = SkillModule(knee_pick.skill_text) - console.print( - f"\n[bold]Knee-point selection[/bold]: picked candidate " - f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, " - f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} " - f"in band, {knee_pick.body_chars} chars vs GEPA default " - f"{knee_pick.gepa_default_body_chars} chars; ε={knee_pick.epsilon:.3f}; " - f"fallback={knee_pick.fallback})" - ) + if knee_point_strategy == "smallest": + knee_pick = select_knee_point( + candidates=details.candidates, + val_aggregate_scores=details.val_aggregate_scores, + n_val=len(valset), + static_validator=lambda txt: validator.validate_static( + reassemble_skill(skill["frontmatter"], txt), "skill", + ), + gepa_default_idx=details.best_idx, + epsilon=knee_point_epsilon, + strategy=knee_point_strategy, + ) + optimized_module = SkillModule(knee_pick.skill_text) + knee_payload = _knee_point_payload(knee_pick) + console.print( + f"\n[bold]Knee-point selection[/bold]: picked candidate " + f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, " + f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} " + f"in band, {knee_pick.body_chars} chars vs GEPA default " + 
f"{knee_pick.gepa_default_body_chars} chars; ε={knee_pick.epsilon:.3f}; " + f"fallback={knee_pick.fallback})" + ) + else: + # val-best no longer walks the band on static failure; + # --knee-point-strategy smallest preserves that behavior. + best_text = details.candidates[details.best_idx].skill_text + optimized_module = SkillModule(best_text) + knee_payload = _deferred_knee_point_payload( + best_idx=details.best_idx, + val_score=float(details.val_aggregate_scores[details.best_idx]), + body_chars=len(best_text), + ) + console.print( + f"\n[bold]Candidate selection[/bold]: GEPA val-argmax " + f"(candidate {details.best_idx}, val=" + f"{details.val_aggregate_scores[details.best_idx]:.3f}, " + f"{len(best_text)} chars)" + ) evolved_body = optimized_module.skill_text evolved_full = reassemble_skill(skill["frontmatter"], evolved_body) @@ -1049,7 +1093,7 @@ def evolve( "decision_signal": "synthetic", "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed], "messages": [c.message for c in static_constraints if not c.passed], - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset), "run_inputs": build_run_inputs( config=config, @@ -1148,7 +1192,7 @@ def evolve( "baseline_chars": baseline_chars, "evolved_chars": evolved_chars, "growth_pct": growth_pct, - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset), "run_inputs": run_inputs, }) @@ -1191,7 +1235,7 @@ def evolve( "baseline_chars": baseline_chars, "evolved_chars": evolved_chars, "growth_pct": growth_pct, - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset), "run_inputs": run_inputs, }) @@ -1333,7 +1377,7 @@ def evolve( "win_loss": _compute_win_loss(baseline_per_example, evolved_per_example), "failed_constraints": [c.constraint_name for c in growth_constraints if not c.passed], "messages": [c.message for c in 
growth_constraints if not c.passed], - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset), "run_inputs": run_inputs, } @@ -1610,19 +1654,19 @@ def evolve( "--knee-point-epsilon", default=None, type=float, - help="Advanced: ε tolerance for knee-point Pareto selection. Default = " - "1/n_val (one valset example's worth of disagreement). Override only when " - "you have a calibrated reason — random tightening narrows the band and " - "biases selection back toward the GEPA default.", + help="Advanced: ε tolerance for the knee-point band. Only used by " + "--knee-point-strategy=smallest; the default val-best path defers to " + "GEPA's val-argmax and ignores ε. Default = 1/n_val (one valset " + "example's worth of disagreement).", ) @click.option( "--knee-point-strategy", default="val-best", type=click.Choice(["val-best", "smallest"]), - help="Within the ε-band, which candidate to pick. val-best (default): " - "highest val score wins, smallest body as tiebreak. smallest: greedy " - "parsimony — picks the smallest body regardless of val cost; " - "available for users explicitly chasing compression.", + help="How to pick the deployed candidate from GEPA's output. val-best " + "(default): defer to GEPA's val-argmax (best_idx) — does not walk an " + "ε-band. smallest: walk the ε-band and pick the smallest body, " + "accepting val cost for compression.", ) @click.option( "--bap-safety-margin", diff --git a/evolution/skills/knee_point.py b/evolution/skills/knee_point.py index f5cb09a5..51340c8c 100644 --- a/evolution/skills/knee_point.py +++ b/evolution/skills/knee_point.py @@ -13,8 +13,6 @@ from __future__ import annotations -import math -import random from dataclasses import dataclass from typing import Any, Callable, Optional, Protocol @@ -24,56 +22,6 @@ class _SupportsSkillText(Protocol): def skill_text(self) -> str: ... 
-def _estimate_val_noise( - val_subscores: list[list[float]], - best_idx: int, - *, - n_bootstrap: int = 1000, - confidence: float = 0.90, - seed: int = 0, -) -> float: - """Estimate the noise floor on val scores via paired bootstrap. - - Returns the half-width of the ``confidence``-level CI on the mean - pairwise diff between the best candidate and each competitor. Used as - the knee-point ε so the band reflects the empirical resolution of - valset scoring rather than the geometric 1/n_val floor, which sits - an order of magnitude below the actual paired noise at typical - n_val (8–50). - - Single-candidate fallback: with no competitor to pair against, returns - ``0.5 / sqrt(n_val)`` — the worst-case binomial SE at p=0.5. - """ - if len(val_subscores) < 2: - return 0.5 / math.sqrt(len(val_subscores[best_idx])) - - best = val_subscores[best_idx] - diffs: list[float] = [] - for k, other in enumerate(val_subscores): - if k == best_idx: - continue - covered = min(len(best), len(other)) - diffs.extend(best[i] - other[i] for i in range(covered)) - - if not diffs or all(d == 0.0 for d in diffs): - return 0.0 - - rng = random.Random(seed) - n = len(diffs) - boot_means: list[float] = [] - for _ in range(n_bootstrap): - sample_sum = 0.0 - for _ in range(n): - sample_sum += diffs[rng.randrange(n)] - boot_means.append(sample_sum / n) - - boot_means.sort() - tail = (1.0 - confidence) / 2.0 - lower = boot_means[int(tail * n_bootstrap)] - upper = boot_means[min(int((1.0 - tail) * n_bootstrap), n_bootstrap - 1)] - return (upper - lower) / 2.0 - - @dataclass(frozen=True) class CandidatePick: """A selected candidate plus the diagnostics needed to debug the choice. 
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 40759bde..7b62edcb 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -69,7 +69,6 @@ ) from evolution.core.run_inputs import build_run_inputs from evolution.core.stats import paired_bootstrap -from evolution.skills.knee_point import CandidatePick, select_knee_point from evolution.tools.session_mining import ( HermesToolImporter, build_tool_dataset_from_sessions, @@ -187,21 +186,24 @@ def _compute_win_loss( } -def _knee_point_payload(knee_pick: Optional[CandidatePick]) -> dict[str, Any]: - if knee_pick is None: - return {"applied": False, "reason": "no_detailed_results"} +def _deferred_knee_point_payload( + *, best_idx: int, val_score: float, body_chars: int, +) -> dict[str, Any]: + """Payload for the val-best path that defers to GEPA's best_idx. + + Mirrors evolve_skill's deferred payload. `band_roster` stays a list so + downstream calibration scripts that access it via + ``.get("band_roster", [])`` keep working. + """ return { - "applied": True, - "fallback": knee_pick.fallback, - "epsilon": knee_pick.epsilon, - "band_size": knee_pick.band_size, - "picked_idx": knee_pick.picked_idx, - "picked_val_score": knee_pick.val_score, - "picked_val_rank_in_band": knee_pick.val_rank_in_band, - "picked_body_chars": knee_pick.body_chars, - "gepa_default_idx": knee_pick.gepa_default_idx, - "gepa_default_body_chars": knee_pick.gepa_default_body_chars, - "band_roster": knee_pick.band_roster, + "applied": False, + "fallback": "gepa_default", + "picked_idx": best_idx, + "gepa_default_idx": best_idx, + "picked_val_score": val_score, + "picked_body_chars": body_chars, + "gepa_default_body_chars": body_chars, + "band_roster": [], } @@ -751,30 +753,33 @@ def evolve( elapsed = time.time() - start_time console.print(f"\n GEPA optimization completed in {elapsed:.1f}s") - knee_pick: Optional[CandidatePick] = None + # Defer to GEPA's val-argmax (details.best_idx). 
Regenerated + # calibration showed the epsilon-band selector picked GEPA's + # default 10/10 across five epsilon modes; see + # reports/calibration_findings.md Finding 3. + knee_payload: dict[str, Any] = { + "applied": False, "reason": "no_detailed_results", + } if hasattr(optimized_module, "detailed_results"): details = optimized_module.detailed_results - knee_pick = select_knee_point( - candidates=details.candidates, - val_aggregate_scores=details.val_aggregate_scores, - n_val=len(valset), - static_validator=lambda txt: validator.validate_static(txt, "tool_description"), - gepa_default_idx=details.best_idx, - text_extractor=lambda c: _candidate_description(c, tool_name), + evolved_description = _candidate_description( + details.candidates[details.best_idx], tool_name, ) - evolved_description = _candidate_description(knee_pick.module, tool_name) optimized_module = ToolModule( target_tool_name=tool_name, manifest=manifest, target_description=evolved_description, ) + knee_payload = _deferred_knee_point_payload( + best_idx=details.best_idx, + val_score=float(details.val_aggregate_scores[details.best_idx]), + body_chars=len(evolved_description), + ) console.print( - f"\n[bold]Knee-point selection[/bold]: picked candidate " - f"{knee_pick.picked_idx} (val={knee_pick.val_score:.3f}, " - f"rank {knee_pick.val_rank_in_band} of {knee_pick.band_size} in band, " - f"{knee_pick.body_chars} chars vs GEPA default " - f"{knee_pick.gepa_default_body_chars}; ε={knee_pick.epsilon:.3f}; " - f"fallback={knee_pick.fallback})" + f"\n[bold]Candidate selection[/bold]: GEPA val-argmax " + f"(candidate {details.best_idx}, val=" + f"{details.val_aggregate_scores[details.best_idx]:.3f}, " + f"{len(evolved_description)} chars)" ) else: evolved_description = optimized_module.description_text @@ -817,7 +822,7 @@ def evolve( "decision_signal": "synthetic", "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed], "messages": [c.message for c in static_constraints if 
not c.passed], - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), "run_inputs": run_inputs, **tool_payload_fields, @@ -895,7 +900,7 @@ def evolve( "baseline_chars": baseline_chars, "evolved_chars": evolved_chars, "growth_pct": growth_pct, - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), "run_inputs": run_inputs, **tool_payload_fields, @@ -940,7 +945,7 @@ def evolve( "baseline_chars": baseline_chars, "evolved_chars": evolved_chars, "growth_pct": growth_pct, - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), "run_inputs": run_inputs, **tool_payload_fields, @@ -1067,7 +1072,7 @@ def evolve( "win_loss": _compute_win_loss(baseline_per_example, evolved_per_example), "failed_constraints": [c.constraint_name for c in growth_constraints if not c.passed], "messages": [c.message for c in growth_constraints if not c.passed], - "knee_point": _knee_point_payload(knee_pick), + "knee_point": knee_payload, "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), "run_inputs": run_inputs, **tool_payload_fields, diff --git a/reports/calibration_findings.md b/reports/calibration_findings.md index da028236..cf7ec97d 100644 --- a/reports/calibration_findings.md +++ b/reports/calibration_findings.md @@ -41,6 +41,25 @@ The four campaign studies produced robust observations even though the (free, sl These are starting points for a future campaign with a richer corpus, not current defaults. 
+### 2026-05-24 update — knee-point ε confirmed as a no-op on val-best + +A targeted regeneration replayed the four-skill calibration corpus +(nano-pdf, apple-notes, polymarket, huggingface-hub) at N\*=250, +ratio\*=0.65 across 10 runs and five ε modes: + +| Mode | Mean transfer error | Deploy rate | +|---|---|---| +| 1.0 / n\_val (status quo) | 0.0466 | 70% | +| 0.5 / n\_val | 0.0466 | 70% | +| 2.0 / n\_val | 0.0466 | 70% | +| 3.0 / n\_val | 0.0466 | 70% | +| noise-estimated (paired-bootstrap) | 0.0466 | 70% | + +All five ε modes produced identical results in 10/10 runs of this replay, +so ε has no effect on the val-best outcome. The ε-band selector was +therefore dropped from the val-best path; `--knee-point-strategy smallest` +is preserved for users who explicitly prefer compression. + ## Finding 4 — Non-inferiority gate at tolerance 0.05 strictly improves on `no_regression` A post-hoc gate-rule replay (script: `scripts/analysis/option1_replay_gate_rule.py` on the campaign's archive branch) tested whether the **non-inferiority** rule (`bootstrap.lower_bound ≥ -tolerance`) better matches the campaign's compression-bias behavior than the current `no_regression_only` rule (`bootstrap.mean ≥ 0`). Sweep across the 17 instrumented runs: diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py index b285c9d7..146179b6 100644 --- a/tests/skills/test_evolve_skill_cl_aware_gate.py +++ b/tests/skills/test_evolve_skill_cl_aware_gate.py @@ -35,7 +35,6 @@ from evolution.core.dataset_builder import EvalDataset, EvalExample from evolution.core.saturation_check import SaturationReport from evolution.skills.evolve_skill import evolve -from evolution.skills.knee_point import CandidatePick from evolution.validation.report import ( PhaseResult, TaskResult, @@ -154,31 +153,6 @@ def _phase(tasks: list[TaskResult]) -> PhaseResult: ) -def _make_knee_pick(evolved_body: str) -> CandidatePick: - """Build a CandidatePick that select_knee_point would return. 
- - ``skill_text`` IS the evolved body (no frontmatter). evolve_skill.py - then reassembles the full file via reassemble_skill(frontmatter, body) - for the static checks, but force_run() is called with the body alone. - """ - fake_module = MagicMock() - fake_module.skill_text = evolved_body - return CandidatePick( - module=fake_module, - skill_text=evolved_body, - body_chars=len(evolved_body), - val_score=0.8, - val_rank_in_band=1, - band_size=1, - epsilon=0.1, - fallback="knee", - picked_idx=0, - gepa_default_idx=0, - gepa_default_body_chars=len(evolved_body), - band_roster=[], - ) - - def _make_fake_gepa(evolved_body: str): """Build a fake dspy.GEPA whose ``compile()`` returns a module with the detailed_results shape the knee-point path expects.""" @@ -228,7 +202,6 @@ def _patch_stack( """ fake_builder = MagicMock() fake_builder.generate.return_value = _fake_skill_dataset() - knee_pick = _make_knee_pick(evolved_body) evolved_per = [holdout_evolved_mean] * holdout_n def _maybe_build(**kwargs): @@ -259,10 +232,6 @@ def _maybe_build(**kwargs): "evolution.skills.evolve_skill.dspy.GEPA", new=_make_fake_gepa(evolved_body), )) - stack.enter_context(patch( - "evolution.skills.evolve_skill.select_knee_point", - return_value=knee_pick, - )) stack.enter_context(patch( "evolution.skills.evolve_skill._holdout_evaluate_with_metric", return_value=(holdout_evolved_mean, evolved_per), diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py index 67131357..781719ab 100644 --- a/tests/skills/test_evolve_skill_saturation_preflight.py +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -72,22 +72,26 @@ def test_healthy_band_does_not_prompt(self, skill_dir): past the abort branch. 
""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace healthy = SaturationReport( band="healthy", holdout_score=0.5, holdout_n=10, holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, ) - fake_module = MagicMock() - fake_module.skill_text = "evolved skill text" - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved skill text", body_chars=18, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=18, band_roster=[], - ) fake_builder = MagicMock() fake_builder.generate.return_value = _fake_skill_dataset() + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float + # and details.candidates[best_idx].skill_text resolves to a string. gepa_mock = MagicMock() + fake_candidate = MagicMock() + fake_candidate.skill_text = "evolved skill text" + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[fake_candidate], + val_aggregate_scores=[1.0], + best_idx=0, + ) + gepa_mock.return_value.compile.return_value = fake_optimized with patch( "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder ), patch( @@ -97,8 +101,6 @@ def test_healthy_band_does_not_prompt(self, skill_dir): ), patch( "evolution.skills.evolve_skill.interactive_confirm" ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch( - "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick - ), patch( "evolution.skills.evolve_skill._holdout_evaluate_with_metric" ) as mock_holdout_eval: mock_holdout_eval.return_value = (0.6, [0.6] * 10) @@ -189,22 +191,26 @@ def test_force_saturation_check_overrides_abort(self, skill_dir): actually overrode the abort. 
""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace saturated = SaturationReport( band="no_headroom", holdout_score=0.99, holdout_n=50, holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, ) - fake_module = MagicMock() - fake_module.skill_text = "evolved skill text" - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved skill text", body_chars=18, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=18, band_roster=[], - ) fake_builder = MagicMock() fake_builder.generate.return_value = _fake_skill_dataset() + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float + # and details.candidates[best_idx].skill_text resolves to a string. gepa_mock = MagicMock() + fake_candidate = MagicMock() + fake_candidate.skill_text = "evolved skill text" + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[fake_candidate], + val_aggregate_scores=[1.0], + best_idx=0, + ) + gepa_mock.return_value.compile.return_value = fake_optimized with patch( "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder ), patch( @@ -216,8 +222,6 @@ def test_force_saturation_check_overrides_abort(self, skill_dir): ), patch( "evolution.skills.evolve_skill.interactive_confirm" ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch( - "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick - ), patch( "evolution.skills.evolve_skill._holdout_evaluate_with_metric" ) as mock_holdout_eval: mock_holdout_eval.return_value = (0.6, [0.6] * 10) @@ -236,23 +240,24 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): module should NOT be re-scored on the holdout after GEPA finishes. 
This is the 'net cost ~zero' contract.""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace from unittest.mock import MagicMock healthy = SaturationReport( band="healthy", holdout_score=0.6, holdout_n=10, holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, ) - # Fake knee-point result so execution reaches the holdout site. - # skill_text must be a non-empty string so SkillModule can be built. - fake_module = MagicMock() - fake_module.skill_text = "evolved skill text" - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved skill text", body_chars=18, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=18, band_roster=[], + # Shape the fake GEPA's compile() output so the val-best path's + # details.candidates[best_idx].skill_text returns a real string. + gepa_mock = MagicMock() + fake_candidate = SimpleNamespace(skill_text="evolved skill text") + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[fake_candidate], + val_aggregate_scores=[1.0], + best_idx=0, ) + gepa_mock.return_value.compile.return_value = fake_optimized fake_builder = MagicMock() fake_builder.generate.return_value = _fake_skill_dataset() with patch( @@ -261,9 +266,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy ), patch( "evolution.skills.evolve_skill._preflight_lm_credentials" - ), patch("evolution.skills.evolve_skill.dspy.GEPA"), patch( - "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick - ), patch( + ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch( "evolution.skills.evolve_skill._holdout_evaluate_with_metric" ) as mock_holdout_eval: mock_holdout_eval.return_value = (0.6, [0.6] * 10) @@ 
-293,7 +296,7 @@ def test_flag_passes_through_to_dspy_gepa(self, skill_dir): CLI with --gepa-minibatch-size 7. Assert the constructed instance carries the value on the documented attribute.""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace captured: dict = {} original_init = __import__("dspy").GEPA.__init__ @@ -305,13 +308,17 @@ def recording_init(self, *args, **kwargs): band="healthy", holdout_score=0.6, holdout_n=10, holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, ) + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float + # and details.candidates[best_idx].skill_text resolves to a string. + fake_candidate = MagicMock() + fake_candidate.skill_text = "evolved skill text" fake_module = MagicMock() fake_module.skill_text = "evolved skill text" - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved skill text", body_chars=18, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=18, band_roster=[], + fake_module.detailed_results = SimpleNamespace( + candidates=[fake_candidate], + val_aggregate_scores=[1.0], + best_idx=0, ) fake_builder = MagicMock() fake_builder.generate.return_value = _fake_skill_dataset() @@ -323,8 +330,6 @@ def recording_init(self, *args, **kwargs): "evolution.skills.evolve_skill._preflight_lm_credentials" ), patch("evolution.skills.evolve_skill.dspy.GEPA.__init__", recording_init), patch( "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module - ), patch( - "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick ), patch( "evolution.skills.evolve_skill._holdout_evaluate_with_metric", return_value=(0.6, [0.6] * 10), diff --git a/tests/skills/test_knee_point_noise_estimation.py 
b/tests/skills/test_knee_point_noise_estimation.py deleted file mode 100644 index fa2c6ca7..00000000 --- a/tests/skills/test_knee_point_noise_estimation.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Tests for noise-estimated knee-point ε via paired bootstrap. - -Pure-Python, no LM. Synthetic val_subscores matrices exercise the helper's -degenerate paths (saturation, single candidate, all-zero diffs, partial -coverage) and pin its order-of-magnitude behavior against the analytical -binomial SE for a Bernoulli front. -""" - -from __future__ import annotations - -import math -import random - -import pytest - -from evolution.skills.knee_point import _estimate_val_noise - - -class TestEstimateValNoise: - def test_estimate_val_noise_returns_zero_on_saturated_matrix(self): - # Every candidate scores 1.0 everywhere → diff vector is all zeros → - # bootstrap CI collapses to [0, 0]. No useful signal, no band. - val_subscores = [[1.0] * 50 for _ in range(5)] - eps = _estimate_val_noise(val_subscores, best_idx=0) - assert eps == 0.0 - - def test_estimate_val_noise_matches_analytical_se_on_bernoulli_p_half(self): - # Independent Bernoulli(0.5) draws for best vs one competitor. The - # paired diff has Var(X-Y) = 2·p(1-p) = 0.5 at p=0.5, so the SE of - # the mean diff at n=50 is √(0.5/50) = 0.1. A 90% normal CI half- - # width is ~1.645·SE ≈ 0.165. The helper's bootstrap CI half-width - # should land in this neighborhood; a wide tolerance catches sign - # errors and axis mistakes without overfitting to RNG quirks. 
- rng = random.Random(123) - n = 50 - best_scores = [float(rng.random() < 0.5) for _ in range(n)] - other_scores = [float(rng.random() < 0.5) for _ in range(n)] - val_subscores = [best_scores, other_scores] - - eps = _estimate_val_noise(val_subscores, best_idx=0) - - paired_se = math.sqrt(2.0 * 0.5 * 0.5 / n) - analytical_ci_half = 1.645 * paired_se # ≈ 0.165 - assert eps == pytest.approx(analytical_ci_half, rel=0.4, abs=0.05) - - def test_estimate_val_noise_widens_with_higher_variance(self): - # Low-variance: diffs cluster tight (~0.01 spread). - # High-variance: diffs span ±0.5. Bootstrap CI half-width must - # be strictly larger on the high-variance matrix. - n = 40 - best_low = [0.5] * n - other_low = [0.5 + (0.01 if i % 2 == 0 else -0.01) for i in range(n)] - low_var = [best_low, other_low] - - best_high = [0.5] * n - other_high = [0.5 + (0.5 if i % 2 == 0 else -0.5) for i in range(n)] - high_var = [best_high, other_high] - - eps_low = _estimate_val_noise(low_var, best_idx=0) - eps_high = _estimate_val_noise(high_var, best_idx=0) - - assert eps_high > eps_low - - def test_estimate_val_noise_falls_back_on_single_candidate(self): - # Only one candidate → no paired diffs possible. Degenerate path - # returns the binomial-SE-ish floor 0.5 / √n_val. 
- n_val = 64 - val_subscores = [[0.7] * n_val] - eps = _estimate_val_noise(val_subscores, best_idx=0) - assert eps == pytest.approx(0.5 / math.sqrt(n_val)) - assert eps == pytest.approx(0.0625) - - def test_estimate_val_noise_is_deterministic_with_seed(self): - rng = random.Random(42) - n = 30 - best_scores = [float(rng.random() < 0.6) for _ in range(n)] - other_scores = [float(rng.random() < 0.4) for _ in range(n)] - val_subscores = [best_scores, other_scores] - - eps_a = _estimate_val_noise(val_subscores, best_idx=0) - eps_b = _estimate_val_noise(val_subscores, best_idx=0) - assert eps_a == eps_b - - def test_estimate_val_noise_handles_partial_coverage(self): - # Coverage policy under test: align by position; aggregate only over - # indices present in both best and competitor (i.e., the first - # min(len(best), len(k)) positions). This matches how DSPy stores - # val_subscores positionally per-example; positions beyond the - # shorter list are treated as un-evaluated, not as zeros. - n_best = 50 - n_other = 30 - rng = random.Random(7) - best_scores = [float(rng.random() < 0.5) for _ in range(n_best)] - other_scores = [float(rng.random() < 0.5) for _ in range(n_other)] - val_subscores = [best_scores, other_scores] - - eps = _estimate_val_noise(val_subscores, best_idx=0) - # No crash; non-negative; finite. 
- assert eps >= 0.0 - assert math.isfinite(eps) diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py index 20db7b3d..b237a3ff 100644 --- a/tests/tools/test_evolve_tool_cl_aware_gate.py +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -25,7 +25,6 @@ from evolution.core.dataset_builder import EvalExample from evolution.core.saturation_check import SaturationReport -from evolution.skills.knee_point import CandidatePick from evolution.tools.evolve_tool import evolve from evolution.validation.report import ( PhaseResult, @@ -122,25 +121,6 @@ def _phase(tasks: list[TaskResult]) -> PhaseResult: ) -def _make_knee_pick(evolved_description: str) -> CandidatePick: - """Build a CandidatePick that select_knee_point would return.""" - fake_module = MagicMock() - return CandidatePick( - module=fake_module, - skill_text=evolved_description, - body_chars=len(evolved_description), - val_score=0.8, - val_rank_in_band=1, - band_size=1, - epsilon=0.1, - fallback="knee", - picked_idx=0, - gepa_default_idx=0, - gepa_default_body_chars=len(evolved_description), - band_roster=[], - ) - - def _make_fake_gepa(evolved_description: str): """Build a fake dspy.GEPA whose ``compile()`` returns a module with the detailed_results shape the knee-point path expects.""" @@ -195,7 +175,6 @@ def _patch_stack( """ fake_builder = MagicMock() fake_builder.generate_tool_selection.return_value = _fake_tool_examples() - knee_pick = _make_knee_pick(evolved_description) evolved_per = [holdout_evolved_mean] * holdout_n def _maybe_build(**kwargs): @@ -226,10 +205,6 @@ def _maybe_build(**kwargs): "evolution.tools.evolve_tool.dspy.GEPA", new=_make_fake_gepa(evolved_description), )) - stack.enter_context(patch( - "evolution.tools.evolve_tool.select_knee_point", - return_value=knee_pick, - )) stack.enter_context(patch( "evolution.tools.evolve_tool._candidate_description", return_value=evolved_description, diff --git 
a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py index fce76cd5..becb07c2 100644 --- a/tests/tools/test_evolve_tool_saturation_preflight.py +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -78,21 +78,23 @@ def test_healthy_band_does_not_prompt(self, manifest_dir): proves the run actually proceeded past the abort branch. """ from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace healthy = SaturationReport( band="healthy", holdout_score=0.5, holdout_n=10, holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, ) - fake_module = MagicMock() - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved desc", body_chars=12, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=12, band_roster=[], - ) fake_builder = MagicMock() fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float. 
gepa_mock = MagicMock() + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[MagicMock()], + val_aggregate_scores=[1.0], + best_idx=0, + ) + gepa_mock.return_value.compile.return_value = fake_optimized with patch( "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder ), patch( @@ -102,8 +104,6 @@ def test_healthy_band_does_not_prompt(self, manifest_dir): ), patch( "evolution.tools.evolve_tool.interactive_confirm" ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch( - "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick - ), patch( "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" ), patch( "evolution.tools.evolve_tool._holdout_evaluate_with_metric" @@ -200,22 +200,24 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir): overrode the abort. """ from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace saturated = SaturationReport( band="no_headroom", holdout_score=0.99, holdout_n=50, holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, ) - fake_module = MagicMock() - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved desc", body_chars=12, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=12, band_roster=[], - ) fake_builder = MagicMock() fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float. 
gepa_mock = MagicMock() + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[MagicMock()], + val_aggregate_scores=[1.0], + best_idx=0, + ) + gepa_mock.return_value.compile.return_value = fake_optimized with patch( "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder ), patch( @@ -227,8 +229,6 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir): ), patch( "evolution.tools.evolve_tool.interactive_confirm" ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch( - "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick - ), patch( "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" ), patch( "evolution.tools.evolve_tool._holdout_evaluate_with_metric" @@ -249,7 +249,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): module should NOT be re-scored on the holdout after GEPA finishes. This is the 'net cost ~zero' contract.""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace from unittest.mock import MagicMock # Healthy report so preflight passes without prompting; preflight @@ -258,14 +258,16 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): band="healthy", holdout_score=0.6, holdout_n=10, holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, ) - # Fake knee-point result so execution reaches the holdout site. - fake_module = MagicMock() - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved desc", body_chars=12, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=12, band_roster=[], + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float. 
+ gepa_mock = MagicMock() + fake_optimized = MagicMock() + fake_optimized.detailed_results = SimpleNamespace( + candidates=[MagicMock()], + val_aggregate_scores=[1.0], + best_idx=0, ) + gepa_mock.return_value.compile.return_value = fake_optimized fake_builder = MagicMock() fake_builder.generate_tool_selection.return_value = _fake_tool_examples() with patch( @@ -274,9 +276,7 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy ), patch( "evolution.tools.evolve_tool._preflight_lm_credentials" - ), patch("evolution.tools.evolve_tool.dspy.GEPA"), patch( - "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick - ), patch( + ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch( "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" ), patch( "evolution.tools.evolve_tool._holdout_evaluate_with_metric" @@ -309,7 +309,7 @@ def test_flag_passes_through_to_dspy_gepa(self, manifest_dir): carries the value on the documented attribute. Catches future DSPy refactors that rename reflection_minibatch_size.""" from evolution.core.saturation_check import SaturationReport - from evolution.skills.knee_point import CandidatePick + from types import SimpleNamespace captured: dict = {} original_init = __import__("dspy").GEPA.__init__ @@ -321,12 +321,13 @@ def recording_init(self, *args, **kwargs): band="healthy", holdout_score=0.6, holdout_n=10, holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, ) + # Shape the fake GEPA's compile() output so the val-best path's + # details.val_aggregate_scores[best_idx] resolves to a real float. 
fake_module = MagicMock() - knee_pick = CandidatePick( - module=fake_module, skill_text="evolved desc", body_chars=12, - val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, - fallback="knee", picked_idx=0, gepa_default_idx=0, - gepa_default_body_chars=12, band_roster=[], + fake_module.detailed_results = SimpleNamespace( + candidates=[MagicMock()], + val_aggregate_scores=[1.0], + best_idx=0, ) fake_builder = MagicMock() fake_builder.generate_tool_selection.return_value = _fake_tool_examples() @@ -338,8 +339,6 @@ def recording_init(self, *args, **kwargs): "evolution.tools.evolve_tool._preflight_lm_credentials" ), patch("evolution.tools.evolve_tool.dspy.GEPA.__init__", recording_init), patch( "evolution.tools.evolve_tool.dspy.GEPA.compile", return_value=fake_module - ), patch( - "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick ), patch( "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" ), patch(