From ae2e7250038baebe1be1ded92e7c6e0eac81acdb Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 10:57:34 -0600 Subject: [PATCH 1/8] feat(core): saturation pre-flight helper + force_run on closed-loop cache Adds evolution/core/saturation_check.py mirroring auth_check's shape: pure saturation_preflight() returns a SaturationReport classifying the baseline into healthy / no_headroom / weak_signal / uniform_failure. Call sites in evolve_skill / evolve_tool will render a Rich panel and decide whether to prompt or default-deny (next two commits). Also adds ClosedLoopFeedbackCache.force_run: bypasses should_run() and propagates validator exceptions (unlike get_or_run which swallows the expected ones to keep GEPA going). Preflight needs to fire the validator once at startup, before any judge scores have been recorded, which is when get_or_run would return None in sampled mode. Pure helpers; no wiring yet. Wiring lands in feat(skills) and feat(tools) follow-ups. --- evolution/core/closed_loop_feedback.py | 26 ++ evolution/core/saturation_check.py | 253 +++++++++++++++++++ tests/core/test_closed_loop_feedback.py | 59 +++++ tests/core/test_saturation_check.py | 317 ++++++++++++++++++++++++ 4 files changed, 655 insertions(+) create mode 100644 evolution/core/saturation_check.py create mode 100644 tests/core/test_saturation_check.py diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py index 1cd40b44..ea8dd6b0 100644 --- a/evolution/core/closed_loop_feedback.py +++ b/evolution/core/closed_loop_feedback.py @@ -179,6 +179,32 @@ def get_or_run(self, candidate_text: str) -> Optional[ValidationReport]: self._iters_since_last_run = 0 return report + def force_run(self, candidate_text: str) -> ValidationReport: + """Run the validator now, bypassing the saturation gate. + + Use at preflight or anywhere a baseline probe is needed. + Result is cached for downstream ``get_or_run`` hits on the same + text. 
Propagates validator exceptions (unlike ``get_or_run``, + which swallows the expected ones to keep GEPA going) — preflight + callers want to know the probe failed. + """ + key = self._key(candidate_text) + with self._lock: + cached = self._cache.get(key) + if cached is not None: + return cached + self._artifact_writer(candidate_text, self._evolved_path) + inputs = ValidationInputs( + tool_name=self._artifact_name, + suite=self._suite, + baseline_artifact=self._baseline_path, + evolved_artifact=self._evolved_path, + ) + report = self._validator.validate(inputs) + self._cache[key] = report + self._iters_since_last_run = 0 + return report + def get_task_verdict( self, candidate_text: str, task_id: str ) -> Optional[TaskResult]: diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py new file mode 100644 index 00000000..6fcfa468 --- /dev/null +++ b/evolution/core/saturation_check.py @@ -0,0 +1,253 @@ +"""Saturation pre-flight: detect doomed evolve_* runs before GEPA spends budget. + +Mirrors the shape of evolution.core.auth_check: a pure helper that +returns a structured report. Call sites in evolve_skill / evolve_tool +render a Rich panel and decide whether to prompt or default-deny. 
+ +See docs/superpowers/specs/2026-05-21-path-f-saturation-preflight-design.md +""" + +from __future__ import annotations + +import sys +from dataclasses import dataclass, field +from typing import Literal, Optional, TypeAlias + +import dspy +from rich.console import Console +from rich.panel import Panel +from rich.text import Text + +SaturationBand: TypeAlias = Literal[ + "healthy", "no_headroom", "weak_signal", "uniform_failure" +] + +DEFAULT_THRESHOLDS: dict[str, float] = { + "no_headroom_synthetic": 0.99, + "weak_signal_synthetic": 0.95, + "no_headroom_closed_loop": 0.95, + "uniform_failure_closed_loop": 0.15, +} + + +@dataclass +class SaturationReport: + band: SaturationBand + holdout_score: float + holdout_n: int + holdout_per_example: list[float] + closed_loop_score: Optional[float] = None + closed_loop_n: Optional[int] = None + closed_loop_per_example: Optional[list[float]] = None + suggestions: list[str] = field(default_factory=list) + thresholds: dict[str, float] = field(default_factory=dict) + + +def _classify_band( + *, + holdout_score: float, + closed_loop_score: Optional[float], + thresholds: dict[str, float], +) -> tuple[SaturationBand, list[str]]: + """Categorize a (synthetic, closed-loop) score pair into a band. + + Returns (band, suggestions_to_show_user). 
+ """ + no_head_syn = thresholds["no_headroom_synthetic"] + weak_syn = thresholds["weak_signal_synthetic"] + no_head_cl = thresholds["no_headroom_closed_loop"] + uniform_cl = thresholds["uniform_failure_closed_loop"] + + if closed_loop_score is not None and closed_loop_score <= uniform_cl: + return "uniform_failure", [ + "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.", + "Try a stronger --closed-loop-agent-model.", + "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.", + ] + + if holdout_score >= no_head_syn and ( + closed_loop_score is None or closed_loop_score >= no_head_cl + ): + return "no_headroom", [ + "Baseline already saturates the eval. No measurable headroom to evolve into.", + "Try a harder closed-loop suite, or pick a different optimization target.", + "Sanity check: is the synthetic generator producing trivially-correct tasks?", + ] + + if ( + holdout_score >= weak_syn + and closed_loop_score is not None + and uniform_cl < closed_loop_score < no_head_cl + ): + return "weak_signal", [ + "Judge saturating but closed-loop has signal; GEPA's small-minibatch acceptance will struggle.", + "Expect many proposals rejected — bump --iterations above 5.", + "Larger minibatch (Path E follow-up) would help once landed.", + ] + + return "healthy", [] + + +def _score_baseline_on_holdout( + *, + baseline_module, + holdout_examples: list, + metric, + lm, +) -> tuple[float, list[float]]: + """Run dspy.Evaluate on the baseline, return (mean, per_example_scores). + + Carved out as its own helper so tests can patch it without touching DSPy + plumbing. Shape matches _holdout_evaluate_with_metric in evolve_*.py. 
+ """ + def two_arg_metric(example, prediction, *_args, **_kwargs): + result = metric(example, prediction) + return float(getattr(result, "score", result)) + + evaluator = dspy.Evaluate( + devset=holdout_examples, + metric=two_arg_metric, + num_threads=4, + provide_traceback=True, + max_errors=len(holdout_examples) * 100, + ) + with dspy.context(lm=lm): + result = evaluator(baseline_module) + mean = float(result.score) / 100.0 + per_example = [float(s) for _, _, s in result.results] + return mean, per_example + + +def saturation_preflight( + *, + baseline_module, + holdout_examples: list, + metric, + lm, + closed_loop_cache=None, + baseline_artifact_text: Optional[str] = None, + thresholds: Optional[dict[str, float]] = None, +) -> SaturationReport: + """Score baseline on holdout (and closed-loop suite if cache provided), + classify into a band, return a report. Not pure: the cache probe may run the validator. + + Call sites are responsible for rendering panels, prompting, and exiting. + """ + if not holdout_examples: + raise ValueError("holdout_examples is empty; nothing to score") + thresholds = thresholds if thresholds is not None else dict(DEFAULT_THRESHOLDS) + + holdout_mean, holdout_per_example = _score_baseline_on_holdout( + baseline_module=baseline_module, + holdout_examples=holdout_examples, + metric=metric, + lm=lm, + ) + + closed_loop_mean: Optional[float] = None + closed_loop_n: Optional[int] = None + closed_loop_per_example: Optional[list[float]] = None + if closed_loop_cache is not None: + if baseline_artifact_text is None: + raise ValueError( + "baseline_artifact_text is required when closed_loop_cache is provided" + ) + report = closed_loop_cache.force_run(baseline_artifact_text) + per_example = [1.0 if t.passed else 0.0 for t in report.evolved.tasks] + closed_loop_per_example = per_example + closed_loop_n = len(per_example) + closed_loop_mean = sum(per_example) / len(per_example) if per_example else 0.0 + + band, suggestions = _classify_band( + holdout_score=holdout_mean, + 
closed_loop_score=closed_loop_mean, + thresholds=thresholds, + ) + + return SaturationReport( + band=band, + holdout_score=holdout_mean, + holdout_n=len(holdout_per_example), + holdout_per_example=holdout_per_example, + closed_loop_score=closed_loop_mean, + closed_loop_n=closed_loop_n, + closed_loop_per_example=closed_loop_per_example, + suggestions=suggestions, + thresholds=dict(thresholds), + ) + + +_BAND_TITLES: dict[SaturationBand, str] = { + "healthy": "Saturation check passed", + "no_headroom": "No measurable headroom", + "weak_signal": "Weak signal — expect a hard run", + "uniform_failure": "Uniform failure — validator too weak", +} + +_BAND_STYLES: dict[SaturationBand, str] = { + "healthy": "green", + "no_headroom": "yellow", + "weak_signal": "yellow", + "uniform_failure": "yellow", +} + + +def render_saturation_panel( + report: SaturationReport, *, console: Optional[Console] = None, +) -> None: + """Print a Rich panel to ``console`` (or default stdout) summarizing the report. + + Healthy band: one-line acknowledgement. Warn bands: full panel with + scores + band-specific suggestions. 
+ """ + if console is None: + console = Console() + + if report.band == "healthy": + console.print( + f"[dim]Saturation check passed (holdout={report.holdout_score:.3f}" + + ( + f", closed-loop={report.closed_loop_score:.3f}" + if report.closed_loop_score is not None + else "" + ) + + ").[/dim]" + ) + return + + body = Text() + body.append(f"Band: {report.band}\n", style="bold") + body.append(f"Holdout (synthetic): {report.holdout_score:.3f} over {report.holdout_n} examples\n") + if report.closed_loop_score is not None: + body.append( + f"Closed-loop (behavioral): {report.closed_loop_score:.3f} over {report.closed_loop_n} tasks\n" + ) + body.append("\nSuggestions:\n", style="bold") + for s in report.suggestions: + body.append(f" • {s}\n") + + console.print( + Panel( + body, + title=_BAND_TITLES[report.band], + border_style=_BAND_STYLES[report.band], + ) + ) + + +def is_non_interactive() -> bool: + """True when stdin isn't a TTY. Used by call sites to decide between + prompting for y/N and printing the override-flag hint.""" + return not sys.stdin.isatty() + + +def interactive_confirm(prompt: str = "Continue anyway? [y/N] ") -> bool: + """Read one line from stdin; return True only for {y, yes} case-insensitive. + + Ctrl-C (KeyboardInterrupt) or EOF (EOFError) → False (treat like 'n', no traceback noise). 
+ """ + try: + answer = input(prompt) + except (KeyboardInterrupt, EOFError): + return False + return answer.strip().lower() in {"y", "yes"} diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py index 291a0e14..038d665b 100644 --- a/tests/core/test_closed_loop_feedback.py +++ b/tests/core/test_closed_loop_feedback.py @@ -533,3 +533,62 @@ def test_write_text_artifact_helper_writes_plain_text(self, tmp_path): path = tmp_path / "out.md" write_text_artifact("hello world\n", path) assert path.read_text() == "hello world\n" + + +class TestForceRun: + """`force_run` bypasses should_run() and propagates errors (unlike + get_or_run which swallows expected validator errors).""" + + def test_force_run_fires_in_sampled_mode_before_any_judge_scores(self, tmp_path): + """In default sampled mode with empty judge history, should_run() + returns False — but force_run runs the validator anyway.""" + suite = _build_suite(tmp_path) + report = _build_report() + validator = MagicMock() + validator.validate.return_value = report + cache = ClosedLoopFeedbackCache( + validator=validator, + suite=suite, + artifact_name="write_file", + baseline_artifact_text="baseline desc", + gate_mode="sampled", + ) + assert cache.should_run() is False + + result = cache.force_run("candidate desc") + + assert result is report + validator.validate.assert_called_once() + + def test_force_run_uses_cache_on_repeat_calls(self, tmp_path): + """Second call with same candidate_text returns the cached report + without re-running the validator.""" + suite = _build_suite(tmp_path) + report = _build_report() + validator = MagicMock() + validator.validate.return_value = report + cache = ClosedLoopFeedbackCache( + validator=validator, suite=suite, artifact_name="t", + baseline_artifact_text="b", gate_mode="sampled", + ) + + first = cache.force_run("cand") + second = cache.force_run("cand") + + assert first is second + assert validator.validate.call_count == 1 + + def 
test_force_run_propagates_validator_errors(self, tmp_path): + """force_run propagates ConcurrentRunError (unlike get_or_run, + which swallows it and returns None to keep GEPA going). Preflight + callers want to know the probe failed.""" + suite = _build_suite(tmp_path) + validator = MagicMock() + validator.validate.side_effect = ConcurrentRunError("locked") + cache = ClosedLoopFeedbackCache( + validator=validator, suite=suite, artifact_name="t", + baseline_artifact_text="b", gate_mode="sampled", + ) + + with pytest.raises(ConcurrentRunError): + cache.force_run("cand") diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py new file mode 100644 index 00000000..0429c3dd --- /dev/null +++ b/tests/core/test_saturation_check.py @@ -0,0 +1,317 @@ +"""Tests for evolution.core.saturation_check. + +All tests use hand-built scores or mock the LM/validator — zero real +LM spend. Pattern mirrors tests/core/test_closed_loop_feedback.py. +""" + +from __future__ import annotations + +import pytest +from unittest.mock import MagicMock, patch + +from evolution.core.saturation_check import ( + DEFAULT_THRESHOLDS, + SaturationReport, + _classify_band, + saturation_preflight, +) + + +class TestClassifyBand: + def test_healthy_when_synthetic_below_weak_threshold(self): + band, _ = _classify_band( + holdout_score=0.85, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "healthy" + + def test_no_headroom_synthetic_only(self): + band, suggestions = _classify_band( + holdout_score=0.99, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "no_headroom" + assert any("harder" in s.lower() or "different target" in s.lower() for s in suggestions) + + def test_no_headroom_with_closed_loop_also_saturated(self): + band, _ = _classify_band( + holdout_score=0.99, closed_loop_score=0.98, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "no_headroom" + + def test_weak_signal_when_closed_loop_in_middle_band(self): + 
band, suggestions = _classify_band( + holdout_score=0.97, closed_loop_score=0.60, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "weak_signal" + assert any("minibatch" in s.lower() or "iterations" in s.lower() for s in suggestions) + + def test_uniform_failure_when_closed_loop_below_threshold(self): + band, suggestions = _classify_band( + holdout_score=0.98, closed_loop_score=0.10, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "uniform_failure" + assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions) + + def test_boundary_exactly_at_no_headroom_synthetic_triggers(self): + """0.99 exactly should trigger no_headroom (>= comparison).""" + band, _ = _classify_band( + holdout_score=0.99, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "no_headroom" + + def test_boundary_just_below_no_headroom_does_not_trigger(self): + band, _ = _classify_band( + holdout_score=0.989, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "healthy" + + def test_custom_thresholds_propagate(self): + custom = {**DEFAULT_THRESHOLDS, "no_headroom_synthetic": 0.80} + band, _ = _classify_band( + holdout_score=0.85, closed_loop_score=None, thresholds=custom, + ) + assert band == "no_headroom" + + +class TestSaturationPreflightNoClosedLoop: + def test_returns_healthy_when_baseline_below_threshold(self): + baseline_module = MagicMock() + holdout_examples = [MagicMock() for _ in range(5)] + metric = MagicMock() + lm = MagicMock() + + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(0.60, [0.6, 0.6, 0.6, 0.6, 0.6]), + ): + report = saturation_preflight( + baseline_module=baseline_module, + holdout_examples=holdout_examples, + metric=metric, + lm=lm, + ) + + assert report.band == "healthy" + assert report.holdout_score == 0.60 + assert report.holdout_n == 5 + assert report.holdout_per_example == [0.6, 0.6, 0.6, 0.6, 0.6] + assert report.closed_loop_score is None + + def 
test_returns_no_headroom_when_baseline_at_ceiling(self): + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(1.0, [1.0] * 5), + ): + report = saturation_preflight( + baseline_module=MagicMock(), + holdout_examples=[MagicMock() for _ in range(5)], + metric=MagicMock(), + lm=MagicMock(), + ) + + assert report.band == "no_headroom" + assert len(report.suggestions) >= 1 + + def test_raises_on_empty_holdout(self): + with pytest.raises(ValueError, match="holdout_examples"): + saturation_preflight( + baseline_module=MagicMock(), + holdout_examples=[], + metric=MagicMock(), + lm=MagicMock(), + ) + + +class TestSaturationPreflightWithClosedLoop: + def _make_validation_report(self, *, n_pass: int, n_fail: int): + """Build a minimal real ValidationReport whose evolved phase has the + requested pass/fail counts. Uses real dataclasses (not MagicMock) so + a future field rename breaks the test loudly.""" + from evolution.validation.report import ( + PhaseResult, TaskResult, ValidationReport, WinLoss, + ) + passed_tasks = [ + TaskResult( + task_id=f"p{i}", passed=True, abstained=False, + tool_calls_seq=[], duration_seconds=0.0, + ) + for i in range(n_pass) + ] + failed_tasks = [ + TaskResult( + task_id=f"f{i}", passed=False, abstained=False, + tool_calls_seq=[], duration_seconds=0.0, + ) + for i in range(n_fail) + ] + tasks = passed_tasks + failed_tasks + total = n_pass + n_fail + phase = PhaseResult( + pass_rate=n_pass / max(1, total), + n_passed=n_pass, + n_failed=n_fail, + n_abstained=0, + tasks=tasks, + ) + delta = WinLoss(n_wins=0, n_losses=0, n_ties=total, pass_rate_change=0.0) + return ValidationReport( + schema_version="1", + tool="t", + task_suite_path="suite.jsonl", + task_suite_sha256="x" * 64, + baseline=phase, + evolved=phase, + delta=delta, + decision="pass", + decision_reasons=[], + ) + + def test_closed_loop_score_lands_in_report(self): + cache = MagicMock() + cache.force_run.return_value = 
self._make_validation_report(n_pass=3, n_fail=4) + + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(0.99, [1.0] * 5), + ): + report = saturation_preflight( + baseline_module=MagicMock(), + holdout_examples=[MagicMock() for _ in range(5)], + metric=MagicMock(), + lm=MagicMock(), + closed_loop_cache=cache, + baseline_artifact_text="baseline desc", + ) + + cache.force_run.assert_called_once_with("baseline desc") + assert report.closed_loop_n == 7 + assert report.closed_loop_score == pytest.approx(3 / 7) + assert report.closed_loop_per_example == [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0] + + def test_uniform_failure_band_triggers(self): + cache = MagicMock() + cache.force_run.return_value = self._make_validation_report(n_pass=0, n_fail=7) + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(0.99, [1.0] * 5), + ): + report = saturation_preflight( + baseline_module=MagicMock(), + holdout_examples=[MagicMock() for _ in range(5)], + metric=MagicMock(), + lm=MagicMock(), + closed_loop_cache=cache, + baseline_artifact_text="b", + ) + assert report.band == "uniform_failure" + + def test_weak_signal_band_triggers(self): + cache = MagicMock() + cache.force_run.return_value = self._make_validation_report(n_pass=4, n_fail=3) + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(0.97, [1.0] * 5), + ): + report = saturation_preflight( + baseline_module=MagicMock(), + holdout_examples=[MagicMock() for _ in range(5)], + metric=MagicMock(), + lm=MagicMock(), + closed_loop_cache=cache, + baseline_artifact_text="b", + ) + assert report.band == "weak_signal" + + def test_missing_baseline_text_raises(self): + cache = MagicMock() + with patch( + "evolution.core.saturation_check._score_baseline_on_holdout", + return_value=(0.5, [0.5]), + ): + with pytest.raises(ValueError, match="baseline_artifact_text"): + saturation_preflight( + baseline_module=MagicMock(), + 
holdout_examples=[MagicMock()], + metric=MagicMock(), lm=MagicMock(), + closed_loop_cache=cache, + baseline_artifact_text=None, + ) + + +class TestRenderPanel: + def _render_to_string(self, report: SaturationReport) -> str: + from io import StringIO + from rich.console import Console + from evolution.core.saturation_check import render_saturation_panel + + buf = StringIO() + console = Console(file=buf, width=100, color_system=None, force_terminal=False) + render_saturation_panel(report, console=console) + return buf.getvalue() + + def test_no_headroom_panel_includes_band_name_and_suggestion(self): + report = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, + suggestions=["Try a harder closed-loop suite", "Pick a different target"], + thresholds=DEFAULT_THRESHOLDS, + ) + out = self._render_to_string(report) + assert "no_headroom" in out.lower() or "no headroom" in out.lower() + assert "harder closed-loop suite" in out + assert "0.99" in out + + def test_weak_signal_panel_shows_closed_loop_score(self): + report = SaturationReport( + band="weak_signal", holdout_score=0.97, holdout_n=50, + holdout_per_example=[1.0] * 50, + closed_loop_score=0.60, closed_loop_n=7, closed_loop_per_example=[], + suggestions=["Bump iterations"], thresholds=DEFAULT_THRESHOLDS, + ) + out = self._render_to_string(report) + assert "0.60" in out or "60" in out + assert "Bump iterations" in out + + def test_healthy_panel_is_terse(self): + """healthy band should be one-line / minimal — most of the panel + machinery is for the warn bands. 
This test just verifies it doesn't + blow up.""" + report = SaturationReport( + band="healthy", holdout_score=0.60, holdout_n=50, + holdout_per_example=[0.6] * 50, + suggestions=[], thresholds=DEFAULT_THRESHOLDS, + ) + out = self._render_to_string(report) + assert "healthy" in out.lower() or "passed" in out.lower() + + +class TestIsNonInteractive: + def test_returns_true_when_stdin_not_tty(self, monkeypatch): + monkeypatch.setattr("sys.stdin.isatty", lambda: False) + from evolution.core.saturation_check import is_non_interactive + assert is_non_interactive() is True + + def test_returns_false_when_stdin_is_tty(self, monkeypatch): + monkeypatch.setattr("sys.stdin.isatty", lambda: True) + from evolution.core.saturation_check import is_non_interactive + assert is_non_interactive() is False + + +class TestInteractiveConfirm: + @pytest.mark.parametrize("answer", ["y", "Y", "yes", "YES", "Yes"]) + def test_returns_true_for_yes_variants(self, monkeypatch, answer): + monkeypatch.setattr("builtins.input", lambda _prompt="": answer) + from evolution.core.saturation_check import interactive_confirm + assert interactive_confirm() is True + + @pytest.mark.parametrize("answer", ["n", "no", "", "anything else", "ynope"]) + def test_returns_false_for_everything_else(self, monkeypatch, answer): + monkeypatch.setattr("builtins.input", lambda _prompt="": answer) + from evolution.core.saturation_check import interactive_confirm + assert interactive_confirm() is False + + def test_returns_false_on_keyboard_interrupt(self, monkeypatch): + def _raise(_prompt=""): + raise KeyboardInterrupt() + monkeypatch.setattr("builtins.input", _raise) + from evolution.core.saturation_check import interactive_confirm + assert interactive_confirm() is False From d6e6c834aebce1f26fedc3885cde65216cf90aff Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 11:06:37 -0600 Subject: [PATCH 2/8] feat(tools): wire saturation pre-flight into evolve_tool After the synthetic dataset builds and the 
baseline module / metric / closed_loop_cache are constructed (and before GEPA setup), the framework now runs the saturation preflight from feat(core). Two new flags: --no-saturation-check (skip entirely) and --force-saturation-check (run + render but bypass the abort/prompt). Default UX in interactive contexts is warn+confirm; in non-interactive contexts (no TTY on stdin), non-healthy bands exit cleanly with a "use --force-saturation-check" hint. The baseline holdout per-example scores from the preflight are stashed and reused at the post-GEPA holdout-comparison call site, so the baseline isn't re-scored at run end. Net cost: ~zero. Closes the "doomed runs spend GEPA budget before any signal" gap documented in reports/pareto_frontier_feasibility.md spike #2. --- evolution/tools/evolve_tool.py | 73 ++++++++++- .../test_evolve_tool_saturation_preflight.py | 123 ++++++++++++++++++ 2 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 tests/tools/test_evolve_tool_saturation_preflight.py diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 04b8fd90..23e44b01 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -24,6 +24,12 @@ from evolution.core.config import EvolutionConfig from evolution.core.auth_check import preflight as _preflight_lm_credentials +from evolution.core.saturation_check import ( + saturation_preflight, + render_saturation_panel, + interactive_confirm, + is_non_interactive, +) from evolution.core.cost_advisor import ( find_cheaper_alternative as _find_cheaper_alternative, render_suggestion_panel as _render_cost_suggestion_panel, @@ -368,6 +374,8 @@ def evolve( closed_loop_task_timeout_seconds: Optional[int] = None, skip_preflight: bool = False, skip_cost_suggest: bool = False, + skip_saturation_check: bool = False, + force_saturation_check: bool = False, ) -> dict[str, Any]: """Evolve one tool description inside a manifest. 
@@ -647,6 +655,36 @@ def evolve( if closed_loop_in_valset: valset = valset + behavioral_examples + cached_baseline_holdout_per_example = None + if not skip_saturation_check: + holdout_examples_for_preflight = _build_examples( + dataset.holdout, for_module=True + ) + sat_report = saturation_preflight( + baseline_module=baseline_module, + holdout_examples=holdout_examples_for_preflight, + metric=metric, + lm=lm, + closed_loop_cache=closed_loop_cache, + baseline_artifact_text=baseline_description, + ) + if sat_report.band != "healthy": + render_saturation_panel(sat_report, console=console) + if not force_saturation_check: + if is_non_interactive(): + console.print( + "[yellow]Non-interactive context; refusing to " + "proceed. Pass --force-saturation-check to " + "override.[/yellow]" + ) + sys.exit(0) + if not interactive_confirm(): + console.print("[yellow]Aborted by user.[/yellow]") + sys.exit(0) + else: + render_saturation_panel(sat_report, console=console) + cached_baseline_holdout_per_example = sat_report.holdout_per_example + console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n") start_time = time.time() @@ -762,9 +800,13 @@ def evolve( f"\n[bold]Evaluating on holdout set ({len(dataset.holdout)} examples)[/bold]" ) holdout_examples = _build_examples(dataset.holdout, for_module=True) - avg_baseline, baseline_per_example = _holdout_evaluate_with_metric( - baseline_module, holdout_examples, metric, lm, - ) + if cached_baseline_holdout_per_example is not None: + baseline_per_example = cached_baseline_holdout_per_example + avg_baseline = sum(baseline_per_example) / len(baseline_per_example) + else: + avg_baseline, baseline_per_example = _holdout_evaluate_with_metric( + baseline_module, holdout_examples, metric, lm, + ) avg_evolved, evolved_per_example = _holdout_evaluate_with_metric( optimized_module, holdout_examples, metric, lm, ) @@ -1187,6 +1229,27 @@ def evolve( "and prints a Rich panel with a paste-ready --eval-model 
flag. " "Pass this to suppress the panel.", ) +@click.option( + "--no-saturation-check", + "skip_saturation_check", + is_flag=True, + default=False, + help="Skip the saturation pre-flight. By default, the framework " + "scores the baseline on the holdout (and the closed-loop suite, " + "if --closed-loop-during-evolution is set) BEFORE GEPA starts " + "and refuses to spend on a saturated target. Pass this to skip " + "(useful when you've already validated headroom externally).", +) +@click.option( + "--force-saturation-check", + "force_saturation_check", + is_flag=True, + default=False, + help="Run the saturation pre-flight, render the panel, but proceed " + "regardless of band. Required to override a non-healthy verdict " + "in non-interactive contexts (no TTY). Without this in such a " + "context, the framework exits cleanly without spending GEPA budget.", +) @click.option( "--closed-loop-in-valset/--no-closed-loop-in-valset", "closed_loop_in_valset", @@ -1238,6 +1301,8 @@ def main( benchmark_timeout_seconds: int, skip_preflight: bool, skip_cost_suggest: bool, + skip_saturation_check: bool, + force_saturation_check: bool, closed_loop_suite_path: Optional[Path], closed_loop_hermes_repo: Optional[Path], closed_loop_saturation_threshold: float, @@ -1288,6 +1353,8 @@ def main( closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds, skip_preflight=skip_preflight, skip_cost_suggest=skip_cost_suggest, + skip_saturation_check=skip_saturation_check, + force_saturation_check=force_saturation_check, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback — diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py new file mode 100644 index 00000000..ec0b8705 --- /dev/null +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -0,0 +1,123 @@ +"""Integration tests for the saturation pre-flight wiring in evolve_tool. 
+ +Mocks the LM and the dataset builder so each test runs in ≤2s — +zero real LM spend. Mirrors tests/tools/test_evolve_tool_closed_loop.py. +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from click.testing import CliRunner + +from evolution.tools.evolve_tool import main as evolve_tool_main + + +def _minimal_manifest_dir(tmp_path: Path) -> Path: + """Write a one-tool _SCHEMA file so the manifest loads.""" + tools_dir = tmp_path / "tools" + tools_dir.mkdir() + (tools_dir / "__init__.py").write_text("") + (tools_dir / "my_tools.py").write_text( + 'WRITE_FILE_SCHEMA = {\n' + ' "name": "write_file",\n' + ' "description": "Write to a file.",\n' + ' "input_schema": {"type": "object", "properties": {}},\n' + '}\n' + ) + return tools_dir + + +@pytest.fixture +def manifest_dir(tmp_path): + return _minimal_manifest_dir(tmp_path) + + +class TestSaturationPreflightCLI: + def test_no_saturation_check_flag_skips_helper(self, manifest_dir): + """--no-saturation-check skips the preflight helper entirely.""" + with patch( + "evolution.tools.evolve_tool.saturation_preflight" + ) as mock_preflight, patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch("evolution.tools.evolve_tool.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-saturation-check", "--no-preflight"], + ) + mock_preflight.assert_not_called() + + def test_healthy_band_does_not_prompt(self, manifest_dir): + """When preflight returns healthy, no panel, no prompt; GEPA proceeds.""" + from evolution.core.saturation_check import SaturationReport + healthy = SaturationReport( + band="healthy", holdout_score=0.5, holdout_n=10, + holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, + ) + with patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy + ), patch( + 
"evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch( + "evolution.tools.evolve_tool.interactive_confirm" + ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight"], + ) + mock_confirm.assert_not_called() + + def test_saturated_band_non_interactive_aborts(self, manifest_dir): + """no_headroom band in non-interactive context exits cleanly without GEPA.""" + from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, + suggestions=["Try a harder suite"], thresholds={}, + ) + gepa_mock = MagicMock() + with patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch( + "evolution.tools.evolve_tool.is_non_interactive", return_value=True + ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight"], + ) + gepa_mock.assert_not_called() + assert "force-saturation-check" in result.output + + def test_force_saturation_check_overrides_abort(self, manifest_dir): + """--force-saturation-check renders panel but lets GEPA run.""" + from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, + suggestions=["x"], thresholds={}, + ) + with patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch( + "evolution.tools.evolve_tool.interactive_confirm" + ) as mock_confirm, 
patch("evolution.tools.evolve_tool.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--force-saturation-check", "--no-preflight"], + ) + # confirm is bypassed when --force-saturation-check is set + mock_confirm.assert_not_called() From 6a2546d7812695d63bf435f5637aae38e8d6b647 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 11:23:30 -0600 Subject: [PATCH 3/8] feat(skills): wire saturation pre-flight into evolve_skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symmetric to the evolve_tool wiring from the previous commit. After the synthetic dataset builds and baseline_module / metric / closed_loop_cache are constructed (and before GEPA setup), the framework runs saturation_preflight; non-healthy bands prompt (interactive) or default-deny (non-interactive) with a --force-saturation-check override. Baseline holdout per-example scores from the preflight are reused at the post-GEPA holdout-comparison call site to keep net cost ~zero. The per-candidate _holdout_evaluate_with_metric inside the knee-point loop is deliberately untouched — only the final baseline-vs-evolved comparison reuses the cached scores. Completes Path F across both pipelines. 
--- evolution/skills/evolve_skill.py | 70 +++++++++++- .../test_evolve_skill_saturation_preflight.py | 107 ++++++++++++++++++ 2 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 tests/skills/test_evolve_skill_saturation_preflight.py diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 2dbdfd05..9d0986be 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -24,6 +24,12 @@ from evolution.core.config import EvolutionConfig from evolution.core.auth_check import preflight as _preflight_lm_credentials +from evolution.core.saturation_check import ( + saturation_preflight, + render_saturation_panel, + interactive_confirm, + is_non_interactive, +) from evolution.core.cost_advisor import ( find_cheaper_alternative as _find_cheaper_alternative, render_suggestion_panel as _render_cost_suggestion_panel, @@ -598,6 +604,8 @@ def evolve( benchmark_timeout_seconds: int = 600, skip_preflight: bool = False, skip_cost_suggest: bool = False, + skip_saturation_check: bool = False, + force_saturation_check: bool = False, closed_loop_suite_path: Optional[Path] = None, closed_loop_saturation_threshold: float = 0.95, closed_loop_min_iters: int = 3, @@ -879,6 +887,34 @@ def evolve( if closed_loop_in_valset: valset = valset + behavioral_examples + cached_baseline_holdout_per_example = None + if not skip_saturation_check: + holdout_examples_for_preflight = dataset.to_dspy_examples("holdout") + sat_report = saturation_preflight( + baseline_module=baseline_module, + holdout_examples=holdout_examples_for_preflight, + metric=metric, + lm=lm, + closed_loop_cache=closed_loop_cache, + baseline_artifact_text=skill["body"], + ) + if sat_report.band != "healthy": + render_saturation_panel(sat_report, console=console) + if not force_saturation_check: + if is_non_interactive(): + console.print( + "[yellow]Non-interactive context; refusing to " + "proceed. 
Pass --force-saturation-check to " + "override.[/yellow]" + ) + sys.exit(0) + if not interactive_confirm(): + console.print("[yellow]Aborted by user.[/yellow]") + sys.exit(0) + else: + render_saturation_panel(sat_report, console=console) + cached_baseline_holdout_per_example = sat_report.holdout_per_example + console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n") start_time = time.time() @@ -1004,9 +1040,13 @@ def evolve( ) holdout_examples = dataset.to_dspy_examples("holdout") - avg_baseline, baseline_per_example = _holdout_evaluate_with_metric( - baseline_module, holdout_examples, metric, lm, - ) + if cached_baseline_holdout_per_example is not None: + baseline_per_example = cached_baseline_holdout_per_example + avg_baseline = sum(baseline_per_example) / len(baseline_per_example) + else: + avg_baseline, baseline_per_example = _holdout_evaluate_with_metric( + baseline_module, holdout_examples, metric, lm, + ) avg_evolved, evolved_per_example = _holdout_evaluate_with_metric( optimized_module, holdout_examples, metric, lm, ) @@ -1501,6 +1541,26 @@ def evolve( "and prints a Rich panel with a paste-ready --eval-model flag. " "Pass this to suppress the panel.", ) +@click.option( + "--no-saturation-check", + "skip_saturation_check", + is_flag=True, + default=False, + help="Skip the saturation pre-flight. By default, the framework " + "scores the baseline on the holdout (and the closed-loop suite, " + "if --closed-loop-during-evolution is set) BEFORE GEPA starts " + "and refuses to spend on a saturated target. Pass this to skip " + "(useful when you've already validated headroom externally).", +) +@click.option( + "--force-saturation-check", + "force_saturation_check", + is_flag=True, + default=False, + help="Run the saturation pre-flight, render the panel, but proceed " + "regardless of band. 
Required to override a non-healthy verdict " + "in non-interactive contexts (no TTY).", +) @click.option( "--closed-loop-during-evolution", "closed_loop_suite_path", @@ -1592,6 +1652,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti benchmark_cmd, benchmark_timeout_seconds, skip_preflight, skip_cost_suggest, + skip_saturation_check, + force_saturation_check, closed_loop_suite_path, closed_loop_saturation_threshold, closed_loop_min_iters, @@ -1637,6 +1699,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti benchmark_timeout_seconds=benchmark_timeout_seconds, skip_preflight=skip_preflight, skip_cost_suggest=skip_cost_suggest, + skip_saturation_check=skip_saturation_check, + force_saturation_check=force_saturation_check, closed_loop_suite_path=closed_loop_suite_path, closed_loop_saturation_threshold=closed_loop_saturation_threshold, closed_loop_min_iters=closed_loop_min_iters, diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py new file mode 100644 index 00000000..1b866d4f --- /dev/null +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -0,0 +1,107 @@ +"""Integration tests for saturation pre-flight wiring in evolve_skill. + +Symmetric to tests/tools/test_evolve_tool_saturation_preflight.py. 
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from click.testing import CliRunner + +from evolution.skills.evolve_skill import main as evolve_skill_main + + +@pytest.fixture +def skill_dir(tmp_path): + """Write a minimal SKILL.md so skill discovery succeeds.""" + skills_root = tmp_path / "skills" + skill_path = skills_root / "demo-skill" + skill_path.mkdir(parents=True) + (skill_path / "SKILL.md").write_text( + "---\nname: demo-skill\ndescription: a test skill\n---\n\nDo X.\n" + ) + return skills_root + + +class TestSaturationPreflightCLI: + def test_no_saturation_check_flag_skips_helper(self, skill_dir): + with patch( + "evolution.skills.evolve_skill.saturation_preflight" + ) as mock_preflight, patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch("evolution.skills.evolve_skill.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-saturation-check", "--no-preflight"], + ) + mock_preflight.assert_not_called() + + def test_healthy_band_does_not_prompt(self, skill_dir): + from evolution.core.saturation_check import SaturationReport + healthy = SaturationReport( + band="healthy", holdout_score=0.5, holdout_n=10, + holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, + ) + with patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch( + "evolution.skills.evolve_skill.interactive_confirm" + ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight"], + ) + mock_confirm.assert_not_called() + + def test_saturated_band_non_interactive_aborts(self, skill_dir): 
+ from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, + ) + gepa_mock = MagicMock() + with patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch( + "evolution.skills.evolve_skill.is_non_interactive", return_value=True + ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight"], + ) + gepa_mock.assert_not_called() + assert "force-saturation-check" in result.output + + def test_force_saturation_check_overrides_abort(self, skill_dir): + from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, + ) + with patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch( + "evolution.skills.evolve_skill.interactive_confirm" + ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"): + runner = CliRunner() + runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--force-saturation-check", "--no-preflight"], + ) + mock_confirm.assert_not_called() From 360b51b5c79fa6dfb432ecbbf8122d0765f002a6 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 11:45:51 -0600 Subject: [PATCH 4/8] fix(core): preserve first-fire guarantee in force_run; add cache-reuse tests Two follow-ups from the final code review of the Path F branch: 1. 
ClosedLoopFeedbackCache.force_run was resetting _iters_since_last_run to 0, eating the "allow first fire" allowance that __init__ sets up (= min_iters). In sampled gate_mode this delayed the first GEPA-time closed-loop fire by min_iters iterations. Now force_run preserves the allowance so subsequent get_or_run calls fire as originally designed. Tests confirm should_run() returns True after a force_run followed by the first record_judge_score call (in sampled mode should_run() stays False while judge history is empty). 2. Added integration tests for both evolve_tool and evolve_skill that verify the cache-reuse mechanism: when the saturation preflight runs and populates the cached baseline holdout scores, the post-GEPA evaluation site reuses them instead of re-running the baseline eval. This locks in the "net cost ~zero" correctness claim. --- evolution/core/closed_loop_feedback.py | 2 +- tests/core/test_closed_loop_feedback.py | 39 +++++++++++++++ .../test_evolve_skill_saturation_preflight.py | 47 ++++++++++++++++++ .../test_evolve_tool_saturation_preflight.py | 49 +++++++++++++++++++ 4 files changed, 136 insertions(+), 1 deletion(-) diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py index ea8dd6b0..94e858cd 100644 --- a/evolution/core/closed_loop_feedback.py +++ b/evolution/core/closed_loop_feedback.py @@ -202,7 +202,7 @@ def force_run(self, candidate_text: str) -> ValidationReport: ) report = self._validator.validate(inputs) self._cache[key] = report - self._iters_since_last_run = 0 + self._iters_since_last_run = self.min_iters return report def get_task_verdict( diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py index 038d665b..c6e3b77f 100644 --- a/tests/core/test_closed_loop_feedback.py +++ b/tests/core/test_closed_loop_feedback.py @@ -592,3 +592,42 @@ def test_force_run_propagates_validator_errors(self, tmp_path): with pytest.raises(ConcurrentRunError): cache.force_run("cand") + + def test_force_run_preserves_first_fire_for_subsequent_get_or_run(self,
tmp_path): + """force_run at preflight must not eat the first-fire allowance. + + The init contract is _iters_since_last_run = min_iters so that the + first GEPA-time record_judge_score call pushes the counter above the + periodic floor and the immediately following get_or_run fires. + force_run must restore that same value so the guarantee holds even + when preflight ran before GEPA started. + + In sampled mode, should_run() returns False when judge_history is + empty (there's an early-return guard). The allowance only takes effect + after the first record_judge_score — at that point _iters_since_last_run + must be >= min_iters. When force_run incorrectly reset to 0, one + record_judge_score call would leave _iters_since_last_run = 1 < min_iters, + and should_run() would return False, delaying the first GEPA fire.""" + suite = _build_suite(tmp_path) + report = _build_report() + validator = MagicMock() + validator.validate.return_value = report + cache = ClosedLoopFeedbackCache( + validator=validator, suite=suite, artifact_name="t", + baseline_artifact_text="b", gate_mode="sampled", + min_iters=3, + ) + + # Preflight fires once (simulates saturation_preflight at init time) + cache.force_run("baseline") + + # Simulate the first GEPA metric call recording a judge score. + # After this, _iters_since_last_run must be >= min_iters so + # should_run() returns True (periodic floor is met). + cache.record_judge_score(0.7) # non-saturating score + + assert cache.should_run() is True, ( + "After force_run + one record_judge_score, should_run() must be True " + "(_iters_since_last_run should be min_iters+1 >= min_iters). " + "force_run reset to 0 would leave it at 1 < 3 (min_iters), causing False." 
+ ) diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py index 1b866d4f..62421e39 100644 --- a/tests/skills/test_evolve_skill_saturation_preflight.py +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -105,3 +105,50 @@ def test_force_saturation_check_overrides_abort(self, skill_dir): "--iterations", "1", "--force-saturation-check", "--no-preflight"], ) mock_confirm.assert_not_called() + + def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): + """When the saturation preflight runs, the cached baseline holdout + scores must be reused at the post-GEPA evaluation site — the baseline + module should NOT be re-scored on the holdout after GEPA finishes. + This is the 'net cost ~zero' contract.""" + from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick + from unittest.mock import MagicMock + + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + # Fake knee-point result so execution reaches the holdout site. + # skill_text must be a non-empty string so SkillModule can be built. 
+ fake_module = MagicMock() + fake_module.skill_text = "evolved skill text" + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved skill text", body_chars=18, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=18, band_roster=[], + ) + with patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch("evolution.skills.evolve_skill.dspy.GEPA"), patch( + "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick + ), patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) + runner = CliRunner() + runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight"], + ) + # With preflight populating the cache, baseline should NOT be + # re-evaluated post-GEPA. Only evolved should be evaluated, so + # _holdout_evaluate_with_metric is called exactly once. 
+ assert mock_holdout_eval.call_count == 1, ( + f"Expected baseline holdout to be reused from preflight cache " + f"(1 call for evolved only), got {mock_holdout_eval.call_count}" + ) diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py index ec0b8705..9eeb215b 100644 --- a/tests/tools/test_evolve_tool_saturation_preflight.py +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -121,3 +121,52 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir): ) # confirm is bypassed when --force-saturation-check is set mock_confirm.assert_not_called() + + def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): + """When the saturation preflight runs, the cached baseline holdout + scores must be reused at the post-GEPA evaluation site — the baseline + module should NOT be re-scored on the holdout after GEPA finishes. + This is the 'net cost ~zero' contract.""" + from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick + from unittest.mock import MagicMock + + # Healthy report so preflight passes without prompting; preflight + # still populates holdout_per_example which gets reused. + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + # Fake knee-point result so execution reaches the holdout site. 
+ fake_module = MagicMock() + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved desc", body_chars=12, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=12, band_roster=[], + ) + with patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch("evolution.tools.evolve_tool.dspy.GEPA"), patch( + "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick + ), patch( + "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" + ), patch( + "evolution.tools.evolve_tool._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) + runner = CliRunner() + runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight"], + ) + # With preflight populating the cache, baseline should NOT be + # re-evaluated post-GEPA. Only evolved should be evaluated, so + # _holdout_evaluate_with_metric is called exactly once. + assert mock_holdout_eval.call_count == 1, ( + f"Expected baseline holdout to be reused from preflight cache " + f"(1 call for evolved only), got {mock_holdout_eval.call_count}" + ) From 1ce0456b875ad7300afdac681a655c66671c9c8a Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 18:27:24 -0600 Subject: [PATCH 5/8] fix(core): widen no_headroom band to catch CL-saturated + synthetic-close case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default thresholds shipped in feat(core) were too strict for the case Path F was built to catch. 
Spike #1 in the feasibility report documented synthetic=0.987 + closed-loop=1.0 for the saturated write_file baseline — GEPA can't improve on that, but the strict AND (synthetic ≥ 0.99) gate let it through as healthy. The realtime smoke from the merge-readiness check confirmed: preflight ran, both scores looked right, classifier returned healthy, GEPA burned 155 no-op iterations. Refined no_headroom logic: - (synthetic ≥ 0.99 AND no CL signal) — unchanged, judge alone pegged - (CL ≥ 0.95 AND synthetic ≥ weak_syn=0.95) — NEW, both signals effectively pegged The synthetic_close gate on the new clause keeps (synthetic=0.5, CL=1.0) classified as healthy — that scenario means there's real judge signal to optimize over (or the eval is misconfigured) and should not auto-abort. Two new tests pin both the smoke case and the edge case. --- evolution/core/saturation_check.py | 20 ++++++++++++++++++-- tests/core/test_saturation_check.py | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py index 6fcfa468..df32e43f 100644 --- a/evolution/core/saturation_check.py +++ b/evolution/core/saturation_check.py @@ -65,8 +65,24 @@ def _classify_band( "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.", ] - if holdout_score >= no_head_syn and ( - closed_loop_score is None or closed_loop_score >= no_head_cl + synthetic_saturated = holdout_score >= no_head_syn + synthetic_close = holdout_score >= weak_syn + cl_saturated = ( + closed_loop_score is not None and closed_loop_score >= no_head_cl + ) + no_cl_signal = closed_loop_score is None + + # no_headroom triggers when: + # - synthetic alone is saturated and there's no closed-loop signal + # (only signal available is judge, and it's pegged), OR + # - closed-loop is saturated AND synthetic is close enough (≥ weak + # threshold) that the judge isn't producing a useful gradient either. 
+ # CL-saturated alone with a low synthetic (< weak_syn) does NOT trigger: + # there's real judge signal to optimize over even when behavioral is + # pegged, and that scenario usually means a misconfigured eval rather + # than true saturation. + if (synthetic_saturated and no_cl_signal) or ( + cl_saturated and synthetic_close ): return "no_headroom", [ "Baseline already saturates the eval. No measurable headroom to evolve into.", diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py index 0429c3dd..da337007 100644 --- a/tests/core/test_saturation_check.py +++ b/tests/core/test_saturation_check.py @@ -71,6 +71,27 @@ def test_custom_thresholds_propagate(self): ) assert band == "no_headroom" + def test_no_headroom_when_cl_saturated_and_synthetic_close(self): + """The smoke case: synthetic 0.987 (below strict no_head_syn=0.99 + but above weak_syn=0.95), closed-loop 1.0. Both signals + effectively pegged → no_headroom should trigger so the user + doesn't burn GEPA budget on a hopeless run.""" + band, _ = _classify_band( + holdout_score=0.987, closed_loop_score=1.0, + thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "no_headroom" + + def test_healthy_when_cl_saturated_but_synthetic_low(self): + """Edge case: behavioral suite pegged at 1.0 but synthetic at 0.5 + means there's real judge signal to optimize over (or the eval is + misconfigured). 
Don't auto-abort — proceed and let GEPA try.""" + band, _ = _classify_band( + holdout_score=0.5, closed_loop_score=1.0, + thresholds=DEFAULT_THRESHOLDS, + ) + assert band == "healthy" + class TestSaturationPreflightNoClosedLoop: def test_returns_healthy_when_baseline_below_threshold(self): From d4058bd02f288a151b4ad484776e1c1c992b5f84 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 18:45:47 -0600 Subject: [PATCH 6/8] docs: bring project docs in line with Path F (saturation pre-flight) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates across the docs/ knowledge base, AGENTS.md, README.md, and PLAN.md to reflect the new saturation pre-flight feature: - architecture.md: top-level flow now shows the pre-flight + abort path; new design pattern #10 separates the pre-flight (a "should we even start" decision) from the deploy gate (a "did we improve" decision). - components.md: new saturation_check.py section documenting the band classifier logic + public surface; force_run added to the ClosedLoopFeedbackCache surface. - data_models.md: new SaturationReport dataclass entry. - workflows.md: Workflow 1 gets a Phase B.5 mermaid for the pre-flight; Phase D's holdout step shows the cache-reuse branch. - interfaces.md: --no-saturation-check + --force-saturation-check added to both skill and tool flag tables. - index.md: new routing entry, new cross-cutting topic, refreshed test count (681 → 1076), maintenance-note entry for the default thresholds (likely to be calibrated). - codebase_info.md: saturation_check.py added to layout + LOC table; test count refreshed. - framework_advantages.md: new "Saturation pre-flight that refuses to spend budget on hopeless runs" section, positioned as a framework advantage over raw GEPA. - AGENTS.md: 5-line run summary updated; component map adds saturation_check.py; planned/deferred section gets a Path D/E/C entry pointing at the feasibility report. 
- README.md: new "Saturation pre-flight" section in the Safety knobs area with example panel output. - PLAN.md: deviation #8 gets a follow-up paragraph noting that Path F addresses the user-visible symptom but not the underlying acceptance-gate mechanism. No source files touched. --- AGENTS.md | 6 +++-- PLAN.md | 2 ++ README.md | 21 ++++++++++++++++ docs/architecture.md | 14 ++++++++--- docs/codebase_info.md | 6 +++-- docs/components.md | 26 +++++++++++++++++++ docs/data_models.md | 20 +++++++++++++++ docs/framework_advantages.md | 6 +++++ docs/index.md | 7 ++++-- docs/interfaces.md | 4 +++ docs/workflows.md | 49 +++++++++++++++++++++++++++++++++--- 11 files changed, 149 insertions(+), 12 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index da162ee9..ff85cf60 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -74,9 +74,9 @@ The `evolution//` directories form **a clean layering**: `evolution/core/` 1. CLI resolves `--skill ` to a `SKILL.md` via the `SkillSource` walk. 2. Eval dataset is built (synthetic LM gen / golden file / sessiondb mining). -3. Skill body wrapped as `dspy.Module`; GEPA optimizes it with `BudgetAwareProposer` injecting a char budget into the reflection prompt. +3. Skill body wrapped as `dspy.Module`. **Saturation pre-flight** (`evolution/core/saturation_check.py`) scores the baseline on the holdout + closed-loop suite, classifies into one of four bands, and aborts (or prompts) on non-`healthy` bands — `--no-saturation-check` to skip, `--force-saturation-check` to override the default-deny in non-interactive contexts. Then GEPA optimizes the candidate with `BudgetAwareProposer` injecting a char budget into the reflection prompt. 4. Knee-point Pareto selection walks the candidates within ε of the best valset score in `--knee-point-strategy` order. Default `val-best`: highest val first, smallest body as tiebreak. `smallest` (greedy parsimony) is available via the flag for users explicitly chasing compression. -5. 
Static constraints + paired-bootstrap growth-quality gate decide deploy vs. reject; both outcomes write `gate_decision.json`. The default rule is `no_regression` (`mean >= 0`); `--quality-gate non-inferiority` switches to `lower_bound > -inferiority_tolerance` (recommended for compression-focused runs at small N where the bootstrap CI swamps tiny effects). +5. Static constraints + paired-bootstrap growth-quality gate decide deploy vs. reject; both outcomes write `gate_decision.json`. The default rule is `no_regression` (`mean >= 0`); `--quality-gate non-inferiority` switches to `lower_bound > -inferiority_tolerance` (recommended for compression-focused runs at small N where the bootstrap CI swamps tiny effects). The post-GEPA holdout eval reuses the baseline scores from the pre-flight, so net cost stays ~zero when the pre-flight ran. ## What lives where @@ -101,6 +101,7 @@ The `evolution//` directories form **a clean layering**: `evolution/core/` | Tool-flavored judge + tool metric | `evolution/tools/tool_judge.py` | | Behavioral `dspy.Example` builder for closed-loop trainset | `evolution/core/behavioral_example.py` | | Closed-loop verdict cache + deterministic feedback rendering | `evolution/core/closed_loop_feedback.py` | +| Saturation pre-flight (band classifier + Rich panel + interactive confirm) | `evolution/core/saturation_check.py` | | Deploy gate (static + growth-quality) | `evolution/core/constraints.py` | | Preset table + gate-decision persistence (shared by skill/tool) | `evolution/core/quality_gate.py` | | Paired-bootstrap CI | `evolution/core/stats.py` | @@ -268,6 +269,7 @@ Open questions deferred to future PRs (per `PLAN.md` deviation notes): - GEPA Pareto-frontier checkpointing (so a `TimeoutError` mid-run doesn't lose all candidates) - Skill-size-based reflection-LM timeout scaling - BCa bootstrap upgrade once N≥20 routinely +- **GEPA acceptance-gate work** (deviation #8 follow-up): the saturation pre-flight (`evolution/core/saturation_check.py`) 
addresses the user-visible symptom on saturated baselines (abort before GEPA spends budget). The underlying mechanism gap — stochastic small-minibatch `sum()` acceptance discarding per-instance signal — is tracked as Path D/E/C in `reports/pareto_frontier_feasibility.md` and remains future work (likely an upstream DSPy or GEPA PR). ## When to consult which doc diff --git a/PLAN.md b/PLAN.md index a7b12172..5c587f64 100644 --- a/PLAN.md +++ b/PLAN.md @@ -460,6 +460,8 @@ These descriptions are sent with every API call as part of the tool schema — e 7. **N=2 saturated baselines.** Weak-target hunt ran `evolve_tool` against `write_file` (98.8–99.2% holdout, 3 seeds, 1×/3× iter) and `search_files` (98.6% holdout). Both runs produced evolved descriptions byte-identical to the baseline — the knee-point picker correctly reverts to the unchanged baseline when GEPA's variants tie. The framework's tool-description pipeline is regression-catching, not improvement-finding, on these hand-tuned descriptions. 8. **Closed-loop signal can flow into reflection but doesn't change selection on saturated baselines.** The `--closed-loop-during-evolution` flag plumbs `ValidationReport`s into the GEPA reflection LM's feedback channel via the existing 5-arg metric protocol, opt-in, saturation-gated. Verified end-to-end on `write_file`: closed-loop fired (file mutated + restored), the reflection LM saw the verdict, GEPA still selected the baseline byte-for-byte. The bottleneck sits upstream of reflection — GEPA's `sum(judge_scores)` acceptance rule ties when every candidate hits 1.0 on a saturated minibatch. Extending the Pareto frontier into behavioral space (closed-loop tasks as additional training-set instances with their own per-instance scores so a candidate can stay on the frontier by winning behavioral tasks) is the structural direction that would address this; the cache + renderer added here are the natural building blocks for that work. 
+ **Follow-up — Path F (saturation pre-flight) addresses the user-visible symptom, not the underlying mechanism.** A separate investigation (`reports/pareto_frontier_feasibility.md`, two spike runs) confirmed the deviation's diagnosis and reframed it: the bottleneck isn't frontier shape, it's GEPA's stochastic small-minibatch `sum()` acceptance gate discarding per-instance signal before it can move selection. Path F (`evolution/core/saturation_check.py`) ships the user-visible fix — detect the saturated case before GEPA starts, render a panel explaining why no improvement is possible, default-deny in non-interactive contexts. This prevents the wasted-budget UX without solving the mechanism gap. The mechanism-side fix (Pareto-dominance acceptance, larger minibatch, or stratified sampling) is tracked as "Path D/E/C" in the feasibility report and remains future work. + ### Phase 3: System Prompt Evolution **Goal:** Optimize the sections of the system prompt that guide agent behavior. diff --git a/README.md b/README.md index f63fd5fa..c5558f3b 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,27 @@ uv run python -m evolution.tools.evolve_tool --tool X --manifest Y \ Env vars: `EVOLVED_PATH`, `BASELINE_PATH`, `RUN_DIR`, `TARGET_NAME`, `ARTIFACT_TYPE`. The hook runs under `/bin/sh -c` — interactive aliases are not available; invoke binaries by full name. Trust boundary: the command string is yours, do not pass strings you didn't write yourself. +### Saturation pre-flight (don't burn GEPA budget on hopeless runs) + +By default, every `evolve_skill` / `evolve_tool` run does a pre-flight: score the baseline on the holdout (and the closed-loop suite, if `--closed-loop-during-evolution` is set), classify into one of four bands (`healthy` / `no_headroom` / `weak_signal` / `uniform_failure`), and refuse to spend GEPA budget on a baseline that's already saturated. 
+ +``` +Saturation check: holdout=0.987 (50 ex), closed-loop=1.000 (7 tasks) +╭─── No measurable headroom ───────────╮ +│ Band: no_headroom │ +│ • Baseline already saturates the eval│ +│ • Try a harder closed-loop suite │ +│ • Sanity check: synthetic generator? │ +╰──────────────────────────────────────╯ +Non-interactive context; refusing to proceed. +Pass --force-saturation-check to override. +``` + +In interactive contexts, non-`healthy` bands prompt for confirmation (`Continue anyway? [y/N]`). In non-interactive contexts (no TTY on stdin — CI, background jobs, cron), the framework default-denies and exits cleanly with the override hint. Net cost is ~zero: the probe's holdout scores are reused at the post-GEPA evaluation site, so the baseline isn't re-scored at run end. + +- `--no-saturation-check` skips the probe entirely (useful when you've already validated headroom externally) +- `--force-saturation-check` runs the probe + renders the panel but proceeds regardless of band + ### Closed-loop validation (real agent on real tasks) The framework's deploy gate scores evolved artifacts against an LM-judge on a synthetic eval set. That's a closed loop: an LM scoring another LM's output on tasks a third LM made up. To break the loop, point a real agent at a small task suite with the baseline and evolved artifacts and see whether real agent behavior actually shifted: diff --git a/docs/architecture.md b/docs/architecture.md index 9806c77c..7e2f14d5 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -13,12 +13,17 @@ flowchart LR A[CLI
--skill X] --> B[Resolve SKILL.md
SkillSource] B --> C[Build eval dataset
synthetic / golden / sessiondb] C --> D[Wrap as
SkillModule dspy.Module] - D --> E[GEPA optimizer
+ BudgetAwareProposer] + D --> SAT[Saturation pre-flight
baseline holdout + closed-loop probe] + SAT --> SATB{band ==
healthy?} + SATB -- no --> SATA[Rich panel + prompt
or default-deny] + SATA -- abort --> Z[sys.exit 0] + SATA -- proceed --> E + SATB -- yes --> E[GEPA optimizer
+ BudgetAwareProposer] E --> F[Knee-point
Pareto selection] F --> G[Static
constraints] G --> H{pass?} H -- no --> I[Write evolved_FAILED.md
+ gate_decision.json] - H -- yes --> J[Holdout eval
dspy.Evaluate × 2] + H -- yes --> J[Holdout eval
dspy.Evaluate × 1 evolved
baseline reused from SAT] J --> K[Paired bootstrap
per-example deltas] K --> L[Growth-with-quality
gate] L --> M{deploy?} @@ -166,7 +171,10 @@ When growth is below the free threshold, the gate degrades to "no-regression onl ### 9. Cost-ceiling kill switch `LMTimingCallback` also drives a per-run `CostLedger` that accumulates per-call cost from litellm's `_hidden_params`. `--max-total-cost-usd ` arms the ledger; once the accumulated cost crosses `N`, the next LM call raises `CostCeilingExceeded` from `LMTimingCallback.on_lm_start`. The orchestrator catches this at the top level and writes a `decision="aborted"` `gate_decision.json` with `cost_at_abort_usd` + `cost_ceiling_usd` + `cost_summary`. Worst-case overshoot is one LM call past the ceiling. -### 10. Closed-loop validation as a separate surface +### 10. Saturation pre-flight as a separate concern from the gate +`evolution/core/saturation_check.py` runs BEFORE GEPA setup: scores the baseline on the holdout (and the closed-loop suite when configured), classifies into four bands (`healthy` / `no_headroom` / `weak_signal` / `uniform_failure`), and renders a Rich panel. Non-healthy bands prompt for confirmation in interactive contexts; default-deny in non-interactive contexts (no TTY) with a `--force-saturation-check` override. Skippable with `--no-saturation-check`. The probe's `holdout_per_example` is stashed and reused at the post-GEPA holdout site so net cost stays ~zero. Mirrors the `evolution/core/auth_check.py` pattern: pure helper returns a structured `SaturationReport`; rendering + exit handled by the call site. This is independent of the deploy gate (which runs AFTER GEPA on the evolved artifact) — the pre-flight is a "should we even start" decision; the gate is a "did we improve" decision. + +### 11. Closed-loop validation as a separate surface `evolution/validation/` runs a real agent (`hermes -z`) through a JSONL task suite with baseline vs evolved artifacts spliced into the live install. 
Available three ways: - **Post-gate veto** (`--benchmark-cmd "python -m evolution.validation.closed_loop ..."`) — runs after the deploy gate passes; nonzero exit flips the decision to reject with `reason="benchmark_failed"`. - **Reflection feedback** (`--closed-loop-during-evolution --closed-loop-mode feedback`) — `ClosedLoopFeedbackCache` runs the validator during the GEPA loop, saturation-gated, and the verdict is rendered into the reflection LM's input via the metric's `dspy.Prediction.feedback` string. Score channel untouched. diff --git a/docs/codebase_info.md b/docs/codebase_info.md index 83da616a..b6e787d8 100644 --- a/docs/codebase_info.md +++ b/docs/codebase_info.md @@ -50,6 +50,7 @@ evolution/ │ ├── fitness.py # LLMJudge + GEPA-shaped metric + behavioral score helper │ ├── lm_timing_callback.py # LM-call observability + cost ledger + cost-ceiling kill switch │ ├── quality_gate.py # preset table + write_gate_decision (shared by skill/tool pipelines) +│ ├── saturation_check.py # pre-flight: classify baseline into healthy/no_headroom/weak_signal/uniform_failure + Rich panel + abort │ ├── skill_sources.py # SkillSource protocol + 3 implementations │ └── stats.py # paired_bootstrap CI ├── skills/ # Tier 1: skill-file evolution @@ -90,7 +91,8 @@ evolution/ | `evolution/core/fitness.py` | ~380 | LLMJudge + skill/tool fitness metrics + behavioral score helper | | `evolution/core/constraints.py` | ~320 | static + growth-with-quality + size constraints | | `evolution/skills/budget_aware_proposer.py` | ~300 | char-budget reflection prompt | -| `evolution/core/closed_loop_feedback.py` | ~295 | cache + saturation gate + deterministic feedback block | +| `evolution/core/closed_loop_feedback.py` | ~320 | cache + saturation gate + deterministic feedback block + `force_run` (bypasses gate for pre-flight) | +| `evolution/core/saturation_check.py` | ~255 | pre-flight: band classifier + `SaturationReport` + Rich panel + interactive confirm | | `evolution/tools/tool_judge.py` 
| ~230 | tool-flavored judge + GEPA-shaped metric with behavioral branch | | `evolution/validation/validator.py` | ~220 | mutate + restore live agent file with flock + checksum drift check | | `evolution/validation/report.py` | ~225 | ValidationReport JSON + Rich rendering + two-condition decision | @@ -109,7 +111,7 @@ evolution/ | `evolution/core/behavioral_example.py` | ~35 | builder for behavioral dspy.Examples | | **Total** | **~9,000** | excludes empty `__init__.py` shims | -Test suite: 37 test files under `tests/core/`, `tests/skills/`, `tests/tools/`, `tests/validation/`. **681 tests** collected. +Test suite: 55 test files under `tests/core/`, `tests/skills/`, `tests/tools/`, `tests/validation/`. **1076 tests** collected. ## Runtime dependencies diff --git a/docs/components.md b/docs/components.md index 734dc179..88211422 100644 --- a/docs/components.md +++ b/docs/components.md @@ -163,6 +163,7 @@ Score is **never** modified by `pred_trace` enrichment — GEPA enforces score e - `.should_run() -> bool` — the gate. `gate_mode="sampled"` (default, opportunistic feedback-only use): fire when `min(recent_window) >= saturation_threshold` OR `iters_since_last_run >= min_iters`. `gate_mode="always"` (selection-affecting trainset use): always open — every novel candidate must score every time. - `.get_or_run(candidate_text) -> Optional[ValidationReport]` — cache key is `sha256(candidate + suite.sha256)`. Cache hit returns cached report; miss writes the candidate's description into a tmp JSON manifest and calls `validator.validate()`. Validator failures (`ConcurrentRunError`, `StaleBackupError`, `ChecksumDriftError`) log `WARNING` and return `None` — closed-loop failure must never take the GEPA run down. - `.get_task_verdict(candidate_text, task_id) -> Optional[TaskResult]` — calls `get_or_run` and indexes `report.evolved.tasks` by `task_id`. Returns `None` if the gate is closed or the validator raised a swallowed error or the task isn't present. 
+- `.force_run(candidate_text) -> ValidationReport` — same shape as `get_or_run` but bypasses `should_run()` and propagates validator exceptions (instead of logging + returning `None`). Used by the saturation pre-flight (`evolution/core/saturation_check.py`) to fire the validator on the baseline once before any judge scores have been recorded — in default `gate_mode="sampled"`, `should_run()` returns `False` until either a judge score saturates or the periodic floor elapses, so `get_or_run` would silently no-op at preflight time. Preserves the "next `get_or_run` is allowed to fire immediately" guarantee by resetting `_iters_since_last_run` to `min_iters` (the same value `__init__` uses), so the saturation gate's first-fire allowance for downstream callers is intact. - `render_feedback_block(report: ValidationReport) -> str` — module-level function. Renders the cached report as a deterministic `[CLOSED_LOOP]` block (or `[CLOSED_LOOP-NOISY]` when `|Δpass_rate| < 0.15`) with decision, decision_reasons, win/loss/tie counts, and per-task diffs for tasks whose verdict changed. Determinism is required because GEPA hashes reflective-dataset entries for caching. **Two use modes**, both wired through `evolve_tool` CLI flags: @@ -170,6 +171,31 @@ Score is **never** modified by `pred_trace` enrichment — GEPA enforces score e 1. **Feedback enricher** (`--closed-loop-mode feedback`, default): the metric's `_augment_feedback_with_closed_loop` helper calls `get_or_run` on the candidate currently under reflection, then appends the rendered block to the metric's `dspy.Prediction.feedback`. Saturation-gated so it only fires when the judge has converged. Score is unchanged. 2. **Trainset score channel** (`--closed-loop-mode trainset`): `build_behavioral_examples(suite)` injects per-task `dspy.Example`s into the trainset. The metric's behavioral branch calls `get_task_verdict` on each behavioral example and returns the binary verdict as score. 
Behavioral wins contribute to `sum(minibatch_scores)`, breaking judge ties at acceptance. +## evolution/core/saturation_check.py — pre-flight that detects doomed runs + +**Owns:** the pre-GEPA probe that scores the baseline on the holdout (and the closed-loop suite, if configured), classifies the result into one of four bands, and lets the call site decide whether to prompt for confirmation or default-deny. Independent of any GEPA-side change; mirrors the shape of `evolution/core/auth_check.py` (pure helper returns a structured report; rendering + exit handled by the call site). + +**Public surface:** + +- `SaturationBand: Literal["healthy", "no_headroom", "weak_signal", "uniform_failure"]` — the four-band classification. +- `DEFAULT_THRESHOLDS: dict[str, float]` — `no_headroom_synthetic=0.99`, `weak_signal_synthetic=0.95`, `no_headroom_closed_loop=0.95`, `uniform_failure_closed_loop=0.15`. +- `SaturationReport` dataclass — the contract between the helper and the call site. Carries the band, holdout score + per-example list (reused downstream for cache reuse), the closed-loop score + per-example list when present, the band-specific suggestion strings, and the thresholds that produced the band. +- `saturation_preflight(baseline_module, holdout_examples, metric, lm, closed_loop_cache=None, baseline_artifact_text=None, thresholds=None) -> SaturationReport` — pure function. Scores baseline via `_score_baseline_on_holdout` (a thin wrapper around `dspy.Evaluate` carved out so tests can patch the DSPy boundary), then fires `closed_loop_cache.force_run(baseline_artifact_text)` when the cache is provided. Raises `ValueError` on empty `holdout_examples` before any LM call. +- `render_saturation_panel(report, console=None) -> None` — emits a one-line dim acknowledgement for the `healthy` band, or a Rich `Panel` (yellow border) with band, score lines, and bulleted suggestions for the warn bands. +- `interactive_confirm(prompt="Continue anyway? 
[y/N] ") -> bool` — reads stdin; returns `True` only for `{y, yes}` case-insensitive. Catches `KeyboardInterrupt` and `EOFError`, returning `False` (treats as "n", no traceback noise). +- `is_non_interactive() -> bool` — `not sys.stdin.isatty()`. Call sites use it to decide between prompting and printing the override hint. + +**Band classifier logic** (`_classify_band`, in priority order): + +1. **`uniform_failure`** if `closed_loop_score is not None AND closed_loop_score <= 0.15` — validator agent too weak to use the artifact at all; signal isn't discriminating. +2. **`no_headroom`** if either: + - `holdout_score >= 0.99 AND closed_loop_score is None` — only signal available is the judge, and it's pegged, OR + - `closed_loop_score >= 0.95 AND holdout_score >= 0.95` — both signals effectively saturated. The `holdout_score >= 0.95` gate on this clause keeps `(synthetic=0.5, CL=1.0)` classified as `healthy` (there's real judge headroom even with behavioral pegged; usually means misconfigured eval rather than true saturation). +3. **`weak_signal`** if `holdout_score >= 0.95 AND 0.15 < closed_loop_score < 0.95` — judge saturating but closed-loop discriminates; GEPA's small-minibatch acceptance will struggle (per the deviation #8 finding); expect many proposals rejected. +4. **`healthy`** otherwise — no panel, just a one-line dim log. + +**Call-site integration:** both `evolve_skill.py` and `evolve_tool.py` invoke the helper after the dataset is built and `baseline_module`/`metric`/`closed_loop_cache` are constructed but before GEPA setup. The `holdout_per_example` list from the report is stashed and reused at the post-GEPA `_holdout_evaluate_with_metric` site — so the baseline isn't re-scored at run end. Net cost: ~zero (the probe is the holdout eval shifted earlier). See `--no-saturation-check` / `--force-saturation-check` in `interfaces.md`. + ## evolution/core/constraints.py — deploy gate **Owns:** all constraint checks and the deploy gate's two-stage decision. 
diff --git a/docs/data_models.md b/docs/data_models.md index f7001626..a41a97b2 100644 --- a/docs/data_models.md +++ b/docs/data_models.md @@ -285,6 +285,26 @@ Both must hold to return `"pass"`; else `"regression"`. The 2:1 win-loss ratio i `ValidationReport.to_dict()` round-trips to `validation_report.json` written under `output/validation///`. +## SaturationReport (`evolution/core/saturation_check.py`) + +In-memory only. Built by `saturation_preflight(...)` before GEPA setup, consumed by the call site in `evolve_skill` / `evolve_tool` to decide whether to abort or proceed. Not currently serialized to disk — the `holdout_per_example` list flows directly into the post-GEPA `_holdout_evaluate_with_metric` baseline-cache reuse path. + +```python +@dataclass +class SaturationReport: + band: SaturationBand # "healthy" | "no_headroom" | "weak_signal" | "uniform_failure" + holdout_score: float # baseline mean on holdout + holdout_n: int # number of holdout examples scored + holdout_per_example: list[float] # per-example scores (reused at post-GEPA evaluation) + closed_loop_score: Optional[float] = None # None when no --closed-loop-during-evolution suite + closed_loop_n: Optional[int] = None # number of behavioral tasks scored + closed_loop_per_example: Optional[list[float]] = None + suggestions: list[str] = field(default_factory=list) # band-specific user-facing strings + thresholds: dict[str, float] = field(default_factory=dict) # snapshot of values that produced the band +``` + +`SaturationBand` is a `Literal` of four strings. `DEFAULT_THRESHOLDS` ships as `{no_headroom_synthetic: 0.99, weak_signal_synthetic: 0.95, no_headroom_closed_loop: 0.95, uniform_failure_closed_loop: 0.15}`. See `components.md`'s `saturation_check.py` section for the classifier logic. 
+ ## Evolved manifest output JSON `output/tools///evolved_manifest.json` (deploy) and `evolved_FAILED.json` (reject) have the same shape as the input MCP-shape manifest: diff --git a/docs/framework_advantages.md b/docs/framework_advantages.md index 42cd36bb..a10c338b 100644 --- a/docs/framework_advantages.md +++ b/docs/framework_advantages.md @@ -44,6 +44,12 @@ The fitness function is a composite LLM-as-judge metric: separate scores for cor Files: `evolution/skills/budget_aware_proposer.py`, `evolution/core/fitness.py`. +### Saturation pre-flight that refuses to spend budget on hopeless runs + +GEPA will happily burn an hour optimizing a target that has no measurable headroom — every reflective mutation gets rejected because the minibatch ties at 100%, and you end up with the baseline byte-for-byte plus a bill. The framework's pre-flight (`evolution/core/saturation_check.py`) catches this BEFORE GEPA starts: scores the baseline on the holdout (and the closed-loop suite, if configured), classifies into `healthy` / `no_headroom` / `weak_signal` / `uniform_failure`, and either prompts the user (interactive) or default-denies with a `--force-saturation-check` override (non-interactive). Net cost is ~zero — the probe's holdout scores are reused at the post-GEPA evaluation site. When the run does proceed, the user has band-specific suggestions for the warn cases (try a stronger validator model, try a harder suite, increase iterations). Raw `dspy.GEPA` has no equivalent. + +Files: `evolution/core/saturation_check.py`. + ## Telemetry as a first-class feature Every run writes `gate_decision.json` (schema_version `"4"`) capturing the deploy decision, the paired-bootstrap statistics, the static-constraint results, the knee-point band roster, and an explicit comparison against the candidate stock GEPA would have picked. 
Combined with `metrics.json` (deploy summary) and `run.log` (every LM call timing), this means a deploy decision is auditable post-hoc and the system can be re-calibrated on accumulated runs. Most upstream users won't realize they're missing this until they need to debug a bad ship. diff --git a/docs/index.md b/docs/index.md index 22ba38d2..06b5c3aa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ This directory is a structured documentation set for **`agent-self-evolution`** **Start here every time.** This file is the entry point — it describes which documents to consult for which kinds of question. Load it into context first; the other docs are loaded on demand. -The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and architecturally dense — most of the substance is in *why* things are shaped a certain way, not *what* they are. The docs prioritize that "why." +The codebase is mid-sized (~9K LOC of source + 55 test files / ~1076 tests) and architecturally dense — most of the substance is in *why* things are shaped a certain way, not *what* they are. The docs prioritize that "why." 
### Question routing table @@ -30,6 +30,7 @@ The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and a | **How does closed-loop signal reach GEPA during evolution** | `components.md` (closed_loop_feedback.py, behavioral_example.py) → `architecture.md` (closed-loop feedback patterns) → `workflows.md` (Workflow 11) | | **What does `--max-total-cost-usd` actually do on abort** | `data_models.md` (cost-ceiling-abort variant of gate_decision.json) → `components.md` (lm_timing_callback.py) | | **What does `--benchmark-cmd` do** | `interfaces.md` (CLI: benchmark-cmd) → `data_models.md` (benchmark block) | +| **Why did the run abort before GEPA started / what's the saturation panel** | `components.md` (saturation_check.py) → `architecture.md` (pattern 10) → `workflows.md` (Workflow 1 Phase B.5) → `data_models.md` (SaturationReport) | | **What's tested vs. not** | `interfaces.md` (test surfaces locked by tests) → `workflows.md` (Workflow 8) | | **What dependencies are pinned and why** | `dependencies.md` | | **What's planned but not built** | `codebase_info.md` (implementation status table) → `PLAN.md` | @@ -70,6 +71,7 @@ The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and a - **The deploy gate decision** spans `architecture.md` (statistical substrate), `components.md` (`constraints.py`), `data_models.md` (`gate_decision.json` schema), and `workflows.md` (Workflow 1 Phase D, Workflow 2). Read together when debugging a deploy decision. - **LM observability** lives in `components.md` (`lm_timing_callback.py`), `interfaces.md` (litellm integration), and `dependencies.md` (litellm pinning rationale). - **Skill discovery** is in `components.md` (`skill_sources.py`), `interfaces.md` (SkillSource Protocol), and `codebase_info.md` (priority order). 
+- **Saturation pre-flight** is in `components.md` (`saturation_check.py`), `architecture.md` (pattern 10), `workflows.md` (Workflow 1 Phase B.5), `data_models.md` (`SaturationReport`), and `interfaces.md` (CLI flags `--no-saturation-check` / `--force-saturation-check`). Read together when debugging a "why did the run abort before GEPA" or "why was the panel suggested" question. ## Maintenance notes @@ -78,8 +80,9 @@ The fast-moving parts to verify against source when consulting these docs: - `EvolutionConfig` defaults (especially `eval_dataset_size`, `growth_*`, `bootstrap_*`) - `gate_decision.json` schema_version (currently `"4"`) - LM model defaults in `evolve_skill.py` / `evolve_tool.py` CLI options -- Test count (currently ~680) +- Test count (currently ~1076) - LM `request_timeout` / `num_retries` — may be tuned further - Closed-loop CLI flags on `evolve_tool` (`--closed-loop-during-evolution`, `--closed-loop-mode`, …) +- Saturation pre-flight default thresholds (`evolution/core/saturation_check.py:DEFAULT_THRESHOLDS`) — likely to be calibrated as more real-world bands are observed When updating: edit the relevant file, then check whether the "Question routing table" above still points to the right place. The index file is loaded into AI-assistant context every conversation, so small accuracy improvements here pay off broadly. diff --git a/docs/interfaces.md b/docs/interfaces.md index 917d2f2b..20ad548a 100644 --- a/docs/interfaces.md +++ b/docs/interfaces.md @@ -72,6 +72,8 @@ Both delivery flags are no-ops on a reject decision and emit a one-line stderr n | `--benchmark-cmd ""` | off | Deploy-gate hook: shell command run AFTER the framework's own deploy gate passes; nonzero exit flips the decision to `reject` with `reason="benchmark_failed"`. Receives `EVOLVED_PATH`, `BASELINE_PATH`, `RUN_DIR`, `TARGET_NAME`, `ARTIFACT_TYPE` via env. Runs under `/bin/sh -c`; aliases and shell functions from your interactive shell are not available. 
Trust boundary: the command string is yours; do not pass strings you didn't write. Adds a `benchmark` block to `gate_decision.json`. | | `--benchmark-timeout-seconds INT` | `600` | Wall-clock cap for the `--benchmark-cmd` hook. Timeout treated as a benchmark fail with `reason="timeout"`. | | `--closed-loop-during-evolution ` | off | Wired symmetrically with `evolve_tool` for CLI consistency. Skill-side closed-loop validation requires a `SkillFileInstaller` that doesn't exist yet, so setting this flag raises with a clear error. | +| `--no-saturation-check` | off | Skip the saturation pre-flight (`evolution/core/saturation_check.py`). By default, the framework scores the baseline on the holdout (and the closed-loop suite, if `--closed-loop-during-evolution` is set) BEFORE GEPA starts; non-`healthy` bands prompt for confirmation (interactive) or default-deny (non-interactive) with a `--force-saturation-check` override. Pass `--no-saturation-check` to skip the probe entirely. | +| `--force-saturation-check` | off | Run the saturation pre-flight, render the panel, but proceed regardless of band. Required to override a non-`healthy` verdict in non-interactive contexts (no TTY on stdin). Without this in such a context, the framework exits cleanly without spending GEPA budget. | ### Exit conditions - `sys.exit(1)` if skill not found across all `SkillSource`s — prints available skills per source. @@ -112,6 +114,8 @@ Evolves one tool's top-level `description` field inside an MCP-shape manifest. T | `--closed-loop-saturation-threshold FLOAT` | `0.95` | Min judge score over the recent window for the saturation gate to open. Only consumed in `feedback` mode (`trainset` / `both` use `gate_mode="always"`). | | `--closed-loop-min-iters INT` | `3` | Periodic-fire floor: fire closed-loop at least every N reflective iterations even when the judge isn't saturating. `feedback` mode only. 
| | `--closed-loop-window-size INT` | `8` | Number of recent judge scores the saturation gate inspects. `feedback` mode only. | +| `--no-saturation-check` | off | Skip the saturation pre-flight (`evolution/core/saturation_check.py`). By default, the framework scores the baseline on the holdout (and the closed-loop suite, if configured) BEFORE GEPA starts; non-`healthy` bands prompt for confirmation (interactive) or default-deny (non-interactive) with a `--force-saturation-check` override. Pass `--no-saturation-check` to skip the probe entirely. | +| `--force-saturation-check` | off | Run the saturation pre-flight, render the panel, but proceed regardless of band. Required to override a non-`healthy` verdict in non-interactive contexts (no TTY on stdin). | `main()` rejects `--closed-loop-during-evolution` without `--closed-loop-hermes-repo`, and rejects `--closed-loop-mode != feedback` without `--closed-loop-during-evolution`. Local imports keep the validation stack out of cold-path runs. diff --git a/docs/workflows.md b/docs/workflows.md index 0ddd4451..eb80148a 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -69,6 +69,45 @@ sequenceDiagram Baseline static checks here are **warn-only** — they never block the run. The metric is built once so DSPy's LM cache lines up across GEPA per-iteration scoring and the holdout eval in Phase D. 
+### Phase B.5 — Saturation pre-flight (default on; abort before GEPA spends budget) + +```mermaid +sequenceDiagram + autonumber + participant CLI as evolve_skill + participant Sat as saturation_preflight + participant Eval as dspy.Evaluate + participant CLC as ClosedLoopFeedbackCache + participant Panel as render_saturation_panel + participant U as User + + CLI->>Sat: saturation_preflight(baseline, holdout, metric, lm, cl_cache?, baseline_text) + Sat->>Eval: evaluate(baseline_module, holdout) + Eval-->>Sat: avg_baseline, baseline_per_example + opt --closed-loop-during-evolution is set + Sat->>CLC: force_run(baseline_text) + CLC-->>Sat: ValidationReport (bypasses should_run) + end + Sat->>Sat: _classify_band(holdout, closed_loop?, DEFAULT_THRESHOLDS) + Sat-->>CLI: SaturationReport(band, holdout_per_example, suggestions, ...) + + alt band == "healthy" + CLI->>Panel: one-line dim acknowledgement + else non-healthy + CLI->>Panel: render Rich panel (band + scores + suggestions) + alt --force-saturation-check + Note over CLI: proceed regardless + else interactive + CLI->>U: "Continue anyway? [y/N]" + U-->>CLI: y → proceed | n → sys.exit(0) + else non-interactive + CLI->>CLI: print "Use --force-saturation-check to override"; sys.exit(0) + end + end +``` + +Skippable with `--no-saturation-check`. The probe's `baseline_per_example` is stashed and reused at Phase D's holdout comparison (the baseline isn't re-scored at run end), so net cost is ~zero when the run proceeds. On an abort, GEPA never starts — the user is left with a clear panel explaining why and what to try next. See `components.md` (`saturation_check.py`) for the four-band classifier and `data_models.md` (`SaturationReport`) for the report shape. 
+ ### Phase C — Optimize: GEPA loop, then knee-point pick ```mermaid @@ -122,8 +161,12 @@ sequenceDiagram CLI->>Val: validate_static(evolved_full, "skill") Val-->>CLI: pass - CLI->>Eval: evaluate(baseline_module, holdout) - Eval-->>CLI: avg_baseline, baseline_per_example + alt Phase B.5 cached baseline_per_example + Note over CLI,Eval: skip baseline call; reuse from saturation_preflight + else fresh + CLI->>Eval: evaluate(baseline_module, holdout) + Eval-->>CLI: avg_baseline, baseline_per_example + end CLI->>Eval: evaluate(optimized_module, holdout) Eval-->>CLI: avg_evolved, evolved_per_example @@ -137,7 +180,7 @@ sequenceDiagram CLI-->>U: ✓ Evolution improved skill by +0.054 (+6.1%) ``` -Holdout costs ≈ 2 × |holdout| judge calls (baseline + evolved). The bootstrap runs on the per-example improvement vector; `validate_growth_with_quality` then applies the curve `required(growth) = max(0, slope * (growth - free))` and only deploys if both `mean ≥ required` and `lower_bound > 0`. +Holdout costs ≈ 1 × |holdout| judge calls when the saturation pre-flight ran (the baseline scores are reused from `SaturationReport.holdout_per_example`); 2 × |holdout| when `--no-saturation-check` is set. The bootstrap runs on the per-example improvement vector; `validate_growth_with_quality` then applies the curve `required(growth) = max(0, slope * (growth - free))` and only deploys if both `mean ≥ required` and `lower_bound > 0`. 
## Workflow 2: Evolve a skill (rejected on quality gate) From 3b515b525632e481c71f93bd2af25e070b80ce38 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 19:58:52 -0600 Subject: [PATCH 7/8] fix(tests): mock SyntheticDatasetBuilder in saturation-preflight integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two of the new integration tests reached the real synthetic dataset generator before the (mocked) saturation_preflight, so CI runs with a fake OPENAI_API_KEY died on AuthError before the code under test ever executed: - test_saturated_band_non_interactive_aborts (both pipelines) - test_cache_reuse_skips_baseline_re_eval_after_gepa (both pipelines) Add a SyntheticDatasetBuilder mock that returns a small list/EvalDataset of fake EvalExamples (no LM calls). Skill-side fake dataset is sized to 50 examples (30/10/10) so the holdout ≥ EvolutionConfig.min_holdout_size guard doesn't trip before reaching the preflight. Verified locally by running the test files under env -i ... OPENAI_API_KEY=sk-fake-test-key uv run pytest ... to match the CI environment — all 10 saturation-preflight tests pass, full suite still at 1076. The other 3 tests in each file (test_no_saturation_check_flag_skips_helper, test_healthy_band_does_not_prompt, test_force_saturation_check_overrides_abort) "pass" in CI for the wrong reason — their assertions are satisfied even when the run dies on AuthError before reaching the wiring under test. Worth tightening in a follow-up; not blocking this fix. 
--- .../test_evolve_skill_saturation_preflight.py | 28 +++++++++++++++++++ .../test_evolve_tool_saturation_preflight.py | 23 +++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py index 62421e39..8aee4359 100644 --- a/tests/skills/test_evolve_skill_saturation_preflight.py +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -26,6 +26,26 @@ def skill_dir(tmp_path): return skills_root +def _fake_skill_dataset(n: int = 50): + """Build a real-shaped EvalDataset with n fake examples (no LM calls). + + Used by tests that need to flow through evolve() up to the saturation + preflight wiring; replaces SyntheticDatasetBuilder.generate so CI runs + with a fake OPENAI_API_KEY don't die on AuthError before reaching the + code under test. Default n=50 gives 30/10/10 splits — the holdout + must be ≥ EvolutionConfig.min_holdout_size (default 10) or evolve() + aborts before the preflight wiring. 
+ """ + from evolution.core.dataset_builder import EvalDataset, EvalExample + examples = [ + EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}") + for i in range(n) + ] + return EvalDataset( + train=examples[:30], val=examples[30:40], holdout=examples[40:50], + ) + + class TestSaturationPreflightCLI: def test_no_saturation_check_flag_skips_helper(self, skill_dir): with patch( @@ -69,7 +89,11 @@ def test_saturated_band_non_interactive_aborts(self, skill_dir): holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, ) gepa_mock = MagicMock() + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated ), patch( "evolution.skills.evolve_skill._preflight_lm_credentials" @@ -129,7 +153,11 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): fallback="knee", picked_idx=0, gepa_default_idx=0, gepa_default_body_chars=18, band_roster=[], ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy ), patch( "evolution.skills.evolve_skill._preflight_lm_credentials" diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py index 9eeb215b..f73f4acb 100644 --- a/tests/tools/test_evolve_tool_saturation_preflight.py +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -35,6 +35,21 @@ def manifest_dir(tmp_path): return _minimal_manifest_dir(tmp_path) +def _fake_tool_examples(n: int = 30): + """Build n fake EvalExamples without calling an LM. 
+ + Used by tests that need to flow through evolve() up to the saturation + preflight wiring; replaces SyntheticDatasetBuilder.generate_tool_selection + so CI runs with a fake OPENAI_API_KEY don't die on AuthError before + reaching the code under test. + """ + from evolution.core.dataset_builder import EvalExample + return [ + EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}") + for i in range(n) + ] + + class TestSaturationPreflightCLI: def test_no_saturation_check_flag_skips_helper(self, manifest_dir): """--no-saturation-check skips the preflight helper entirely.""" @@ -82,7 +97,11 @@ def test_saturated_band_non_interactive_aborts(self, manifest_dir): suggestions=["Try a harder suite"], thresholds={}, ) gepa_mock = MagicMock() + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated ), patch( "evolution.tools.evolve_tool._preflight_lm_credentials" @@ -145,7 +164,11 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): fallback="knee", picked_idx=0, gepa_default_idx=0, gepa_default_body_chars=12, band_roster=[], ) + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy ), patch( "evolution.tools.evolve_tool._preflight_lm_credentials" From c931bf20acd6f616676e93b622f77afce9b97cf3 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Thu, 21 May 2026 20:38:12 -0600 Subject: [PATCH 8/8] review followups: exit code, tempdir cleanup, docstrings, test tightening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses six items from the PR 
review: 1. Non-interactive deny now exits 3 (was 0). A scheduled / CI / cron wrapper couldn't previously distinguish "refused to run because no TTY" from "ran cleanly". Interactive user-said-no still exits 0 (success-by-intent). The integration test asserts the new exit code. 2. ClosedLoopFeedbackCache registers weakref.finalize for its tmp dir so SystemExit (the saturation abort path) triggers cleanup instead of leaking dirs into /tmp for the OS reaper to handle 3+ days later. Updated the class docstring to match. 3. saturation_preflight's docstring no longer claims "Pure: no side effects" — it runs an LM eval, may run a validator subprocess, and mutates the cache. The actual property is "doesn't render, prompt, or exit" — call sites own those — and the docstring now says exactly that. 4. force_run's docstring spells out the _iters_since_last_run = min_iters contract (preserving the first-fire allowance for downstream get_or_run callers). Inline comment on the __init__ assignment anchors the invariant in both places so a future "cleanup" can't silently regress the fix to 0. 5. interactive_confirm's docstring acknowledges the EOFError branch the code already catches (not just KeyboardInterrupt). 6. Tightened 2 vacuous CLI tests that previously passed even when production was mutated to ignore the flags they claimed to test: test_force_saturation_check_overrides_abort and test_healthy_band_does_not_prompt now assert GEPA was actually instantiated. Both add the SyntheticDatasetBuilder / select_knee_point / _holdout_evaluate_with_metric mock chain so the run flows through the production code instead of dying on AuthError at dataset gen. Added a new test_user_declines_at_prompt_aborts in both pipelines covering the previously-untested "Aborted by user." branch. 77 saturation-related tests pass, full suite at 1078 (was 1076). 
--- evolution/core/closed_loop_feedback.py | 31 ++++- evolution/core/saturation_check.py | 11 +- evolution/skills/evolve_skill.py | 7 +- evolution/tools/evolve_tool.py | 7 +- .../test_evolve_skill_saturation_preflight.py | 104 +++++++++++++++- .../test_evolve_tool_saturation_preflight.py | 112 +++++++++++++++++- 6 files changed, 257 insertions(+), 15 deletions(-) diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py index 94e858cd..0cee09bb 100644 --- a/evolution/core/closed_loop_feedback.py +++ b/evolution/core/closed_loop_feedback.py @@ -24,8 +24,10 @@ import hashlib import json import logging +import shutil import tempfile import threading +import weakref from pathlib import Path from typing import Callable, Literal, Optional @@ -62,8 +64,11 @@ class ClosedLoopFeedbackCache: """Run-bounded cache of closed-loop verdicts keyed by candidate text. One instance per ``evolve_tool`` / ``evolve_skill`` invocation. The - tmp dir lives for the cache's lifetime; the OS reclaims it at process - exit (no explicit cleanup). + tmp dir lives for the cache's lifetime and is cleaned up via + ``weakref.finalize`` when the cache is garbage-collected — including + on ``SystemExit`` from the saturation pre-flight's abort path, which + would otherwise leak the dir until the OS's /tmp reaper ran (3+ + days on macOS, weekly on most Linux servers). The cache is shared across metric calls within a run, including across DSPy's parallel ``Evaluate`` workers. The threading lock prevents @@ -112,13 +117,26 @@ def __init__( ) self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_")) + # Clean up the tmp dir when the cache is garbage-collected. This + # fires on normal completion AND on SystemExit (e.g. the saturation + # pre-flight's non-interactive abort), where atexit-only cleanup + # would leak the dir for days. 
+ self._cleanup_finalizer = weakref.finalize( + self, shutil.rmtree, self._tmp_dir, ignore_errors=True + ) self._baseline_path = self._tmp_dir / f"baseline{artifact_suffix}" self._evolved_path = self._tmp_dir / f"evolved{artifact_suffix}" self._artifact_writer(baseline_artifact_text, self._baseline_path) self._cache: dict[str, ValidationReport] = {} self._judge_history: list[float] = [] - self._iters_since_last_run = self.min_iters # allow first fire + # First-fire allowance: starts at min_iters so the first + # record_judge_score → should_run cycle satisfies the periodic + # floor (iters_since_last_run >= min_iters) and fires immediately + # in sampled gate_mode even before any judge saturation. force_run + # restores this same value rather than 0 to preserve the allowance + # for downstream get_or_run callers; see force_run's docstring. + self._iters_since_last_run = self.min_iters self._lock = threading.Lock() def record_judge_score(self, score: float) -> None: @@ -187,6 +205,13 @@ def force_run(self, candidate_text: str) -> ValidationReport: text. Propagates validator exceptions (unlike ``get_or_run``, which swallows the expected ones to keep GEPA going) — preflight callers want to know the probe failed. + + Sets ``_iters_since_last_run = self.min_iters`` (not 0 like + ``get_or_run``'s post-run reset) so the first GEPA-time + ``record_judge_score`` + ``should_run`` cycle after preflight + still satisfies the periodic floor and fires immediately, + preserving the first-fire allowance ``__init__`` sets up. A + regression test pins this invariant; do not change to 0. 
""" key = self._key(candidate_text) with self._lock: diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py index df32e43f..16b65a5d 100644 --- a/evolution/core/saturation_check.py +++ b/evolution/core/saturation_check.py @@ -145,9 +145,12 @@ def saturation_preflight( thresholds: Optional[dict[str, float]] = None, ) -> SaturationReport: """Score baseline on holdout (and closed-loop suite if cache provided), - classify into a band, return a report. Pure: no side effects. + classify into a band, return a report. - Call sites are responsible for rendering panels, prompting, and exiting. + Does the work — LM eval of the baseline, optional closed-loop validator + fire via ``force_run``, cache mutation, possibly a subprocess. The + "purity" we care about is at a higher layer: this function doesn't + render panels, prompt for confirmation, or exit. Call sites own those. """ if not holdout_examples: raise ValueError("holdout_examples is empty; nothing to score") @@ -260,7 +263,9 @@ def is_non_interactive() -> bool: def interactive_confirm(prompt: str = "Continue anyway? [y/N] ") -> bool: """Read one line from stdin; return True only for {y, yes} case-insensitive. - Ctrl-C / KeyboardInterrupt → False (treat like 'n', no traceback noise). + Ctrl-C and stdin EOF both → False (treat like 'n', no traceback noise). + The EOF branch matters in practice when stdin is piped from ``/dev/null`` + or a closed pipe. """ try: answer = input(prompt) diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 9d0986be..bfeb6cc1 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -907,7 +907,12 @@ def evolve( "proceed. Pass --force-saturation-check to " "override.[/yellow]" ) - sys.exit(0) + # Exit code 3 distinguishes "refused to run for + # lack of a TTY to confirm against" from clean + # success (0) or hard user errors (1). 
Lets a + # wrapping CI / cron / scheduled runner detect + # silent denial. + sys.exit(3) if not interactive_confirm(): console.print("[yellow]Aborted by user.[/yellow]") sys.exit(0) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 23e44b01..cd9b5028 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -677,7 +677,12 @@ def evolve( "proceed. Pass --force-saturation-check to " "override.[/yellow]" ) - sys.exit(0) + # Exit code 3 distinguishes "refused to run for + # lack of a TTY to confirm against" from clean + # success (0) or hard user errors (1). Lets a + # wrapping CI / cron / scheduled runner detect + # silent denial. + sys.exit(3) if not interactive_confirm(): console.print("[yellow]Aborted by user.[/yellow]") sys.exit(0) diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py index 8aee4359..252b2d8d 100644 --- a/tests/skills/test_evolve_skill_saturation_preflight.py +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -62,18 +62,46 @@ def test_no_saturation_check_flag_skips_helper(self, skill_dir): mock_preflight.assert_not_called() def test_healthy_band_does_not_prompt(self, skill_dir): + """When preflight returns healthy: no prompt AND GEPA actually runs. + + Asserting only ``mock_confirm.assert_not_called()`` is vacuous — + a future boolean inversion in the call site would still pass that + assertion because CliRunner's non-TTY stdin hits the + ``is_non_interactive`` short-circuit before reaching the confirm. + Asserting GEPA was instantiated proves the run actually proceeded + past the abort branch. 
+ """ from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick healthy = SaturationReport( band="healthy", holdout_score=0.5, holdout_n=10, holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, ) + fake_module = MagicMock() + fake_module.skill_text = "evolved skill text" + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved skill text", body_chars=18, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=18, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + gepa_mock = MagicMock() with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy ), patch( "evolution.skills.evolve_skill._preflight_lm_credentials" ), patch( "evolution.skills.evolve_skill.interactive_confirm" - ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"): + ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch( + "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick + ), patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) runner = CliRunner() runner.invoke( evolve_skill_main, @@ -81,6 +109,7 @@ def test_healthy_band_does_not_prompt(self, skill_dir): "--iterations", "1", "--no-preflight"], ) mock_confirm.assert_not_called() + gepa_mock.assert_called_once() def test_saturated_band_non_interactive_aborts(self, skill_dir): from evolution.core.saturation_check import SaturationReport @@ -108,20 +137,90 @@ def test_saturated_band_non_interactive_aborts(self, skill_dir): ) gepa_mock.assert_not_called() assert "force-saturation-check" in result.output + assert result.exit_code == 3, ( + 
f"Non-interactive deny should exit 3 (distinct from clean " + f"success=0 / user errors=1), got {result.exit_code}" + ) + + def test_user_declines_at_prompt_aborts(self, skill_dir): + """Interactive context, non-healthy band, user types 'n': prints + 'Aborted by user.', exits 0, no GEPA. Covers the + ``if not interactive_confirm(): sys.exit(0)`` branch that has + no other end-to-end coverage.""" + from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, + ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + gepa_mock = MagicMock() + with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch( + "evolution.skills.evolve_skill.is_non_interactive", return_value=False + ), patch( + "evolution.skills.evolve_skill.interactive_confirm", return_value=False + ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight"], + ) + gepa_mock.assert_not_called() + assert "Aborted by user" in result.output + assert result.exit_code == 0, ( + f"Interactive user-said-no abort should exit 0, got {result.exit_code}" + ) def test_force_saturation_check_overrides_abort(self, skill_dir): + """--force-saturation-check on a saturated baseline in a + non-interactive context: panel renders, confirm is bypassed, AND + GEPA actually runs. 
+ + Asserting only ``mock_confirm.assert_not_called()`` would be + vacuous (the non-TTY guard exits before reaching confirm anyway); + the GEPA-was-instantiated assertion proves the force flag + actually overrode the abort. + """ from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick saturated = SaturationReport( band="no_headroom", holdout_score=0.99, holdout_n=50, holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, ) + fake_module = MagicMock() + fake_module.skill_text = "evolved skill text" + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved skill text", body_chars=18, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=18, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + gepa_mock = MagicMock() with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated ), patch( "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch( + "evolution.skills.evolve_skill.is_non_interactive", return_value=True ), patch( "evolution.skills.evolve_skill.interactive_confirm" - ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"): + ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch( + "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick + ), patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) runner = CliRunner() runner.invoke( evolve_skill_main, @@ -129,6 +228,7 @@ def test_force_saturation_check_overrides_abort(self, skill_dir): "--iterations", "1", "--force-saturation-check", "--no-preflight"], ) mock_confirm.assert_not_called() + 
gepa_mock.assert_called_once() def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): """When the saturation preflight runs, the cached baseline holdout diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py index f73f4acb..62a68425 100644 --- a/tests/tools/test_evolve_tool_saturation_preflight.py +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -67,19 +67,48 @@ def test_no_saturation_check_flag_skips_helper(self, manifest_dir): mock_preflight.assert_not_called() def test_healthy_band_does_not_prompt(self, manifest_dir): - """When preflight returns healthy, no panel, no prompt; GEPA proceeds.""" + """When preflight returns healthy: no prompt AND GEPA actually runs. + + Asserting only ``mock_confirm.assert_not_called()`` is vacuous — + a future boolean inversion (e.g. the call site flipping to ``if + sat_report.band == "healthy":``) would still pass that assertion + because CliRunner's non-TTY stdin would hit the + ``is_non_interactive`` short-circuit and ``sys.exit(3)`` before + reaching ``interactive_confirm``. Asserting GEPA was instantiated + proves the run actually proceeded past the abort branch. 
+ """ from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick healthy = SaturationReport( band="healthy", holdout_score=0.5, holdout_n=10, holdout_per_example=[0.5] * 10, suggestions=[], thresholds={}, ) + fake_module = MagicMock() + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved desc", body_chars=12, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=12, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + gepa_mock = MagicMock() with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy ), patch( "evolution.tools.evolve_tool._preflight_lm_credentials" ), patch( "evolution.tools.evolve_tool.interactive_confirm" - ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"): + ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch( + "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick + ), patch( + "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" + ), patch( + "evolution.tools.evolve_tool._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) runner = CliRunner() runner.invoke( evolve_tool_main, @@ -87,6 +116,7 @@ def test_healthy_band_does_not_prompt(self, manifest_dir): "--iterations", "1", "--no-preflight"], ) mock_confirm.assert_not_called() + gepa_mock.assert_called_once() def test_saturated_band_non_interactive_aborts(self, manifest_dir): """no_headroom band in non-interactive context exits cleanly without GEPA.""" @@ -116,30 +146,102 @@ def test_saturated_band_non_interactive_aborts(self, manifest_dir): ) gepa_mock.assert_not_called() assert 
"force-saturation-check" in result.output + assert result.exit_code == 3, ( + f"Non-interactive deny should exit 3 (distinct from clean " + f"success=0 / user errors=1), got {result.exit_code}" + ) + + def test_user_declines_at_prompt_aborts(self, manifest_dir): + """Interactive context, non-healthy band, user types 'n': prints + 'Aborted by user.', exits 0, no GEPA. Covers the + ``if not interactive_confirm(): sys.exit(0)`` branch that has + no other end-to-end coverage.""" + from evolution.core.saturation_check import SaturationReport + saturated = SaturationReport( + band="no_headroom", holdout_score=0.99, holdout_n=50, + holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, + ) + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + gepa_mock = MagicMock() + with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch( + "evolution.tools.evolve_tool.is_non_interactive", return_value=False + ), patch( + "evolution.tools.evolve_tool.interactive_confirm", return_value=False + ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight"], + ) + gepa_mock.assert_not_called() + assert "Aborted by user" in result.output + assert result.exit_code == 0, ( + f"Interactive user-said-no abort should exit 0, got {result.exit_code}" + ) def test_force_saturation_check_overrides_abort(self, manifest_dir): - """--force-saturation-check renders panel but lets GEPA run.""" + """--force-saturation-check on a saturated baseline in a + non-interactive context: panel renders, confirm is bypassed, AND + GEPA actually runs. 
+ + Asserting only ``mock_confirm.assert_not_called()`` is vacuous + on its own: an inverted force-flag check would still pass that + assertion because the non-TTY ``is_non_interactive`` branch + ``sys.exit(3)``s before reaching ``interactive_confirm``. The + GEPA-was-instantiated assertion proves the force flag actually + overrode the abort. + """ from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick saturated = SaturationReport( band="no_headroom", holdout_score=0.99, holdout_n=50, holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={}, ) + fake_module = MagicMock() + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved desc", body_chars=12, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=12, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + gepa_mock = MagicMock() with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated ), patch( "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch( + "evolution.tools.evolve_tool.is_non_interactive", return_value=True ), patch( "evolution.tools.evolve_tool.interactive_confirm" - ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"): + ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch( + "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick + ), patch( + "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" + ), patch( + "evolution.tools.evolve_tool._holdout_evaluate_with_metric" + ) as mock_holdout_eval: + mock_holdout_eval.return_value = (0.6, [0.6] * 10) runner = CliRunner() runner.invoke( evolve_tool_main, ["--tool", "write_file", "--manifest", 
str(manifest_dir), "--iterations", "1", "--force-saturation-check", "--no-preflight"], ) - # confirm is bypassed when --force-saturation-check is set mock_confirm.assert_not_called() + gepa_mock.assert_called_once() def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): """When the saturation preflight runs, the cached baseline holdout