From ae2e7250038baebe1be1ded92e7c6e0eac81acdb Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 10:57:34 -0600
Subject: [PATCH 1/8] feat(core): saturation pre-flight helper + force_run on
 closed-loop cache

Adds evolution/core/saturation_check.py mirroring auth_check's shape:
pure saturation_preflight() returns a SaturationReport classifying the
baseline into healthy / no_headroom / weak_signal / uniform_failure.
Call sites in evolve_skill / evolve_tool will render a Rich panel and
decide whether to prompt or default-deny (next two commits).

Also adds ClosedLoopFeedbackCache.force_run: bypasses should_run()
and propagates validator exceptions (unlike get_or_run which swallows
the expected ones to keep GEPA going). Preflight needs to fire the
validator once at startup, before any judge scores have been recorded,
which is when get_or_run would return None in sampled mode.

Pure helpers; no wiring yet. Wiring lands in feat(skills) and
feat(tools) follow-ups.
---
 evolution/core/closed_loop_feedback.py  |  26 ++
 evolution/core/saturation_check.py      | 253 +++++++++++++++++++
 tests/core/test_closed_loop_feedback.py |  59 +++++
 tests/core/test_saturation_check.py     | 317 ++++++++++++++++++++++++
 4 files changed, 655 insertions(+)
 create mode 100644 evolution/core/saturation_check.py
 create mode 100644 tests/core/test_saturation_check.py

diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py
index 1cd40b44..ea8dd6b0 100644
--- a/evolution/core/closed_loop_feedback.py
+++ b/evolution/core/closed_loop_feedback.py
@@ -179,6 +179,32 @@ def get_or_run(self, candidate_text: str) -> Optional[ValidationReport]:
             self._iters_since_last_run = 0
             return report
 
+    def force_run(self, candidate_text: str) -> ValidationReport:
+        """Run the validator now, bypassing the ``should_run()`` saturation gate.
+
+        Use at preflight or anywhere a baseline probe is needed.
+        Result is cached for downstream ``get_or_run`` hits on the same
+        text. Propagates validator exceptions (unlike ``get_or_run``,
+        which swallows the expected ones to keep GEPA going) — preflight
+        callers want to know the probe failed.
+        """
+        key = self._key(candidate_text)
+        with self._lock:  # held for the whole validator call, not just the cache lookup
+            cached = self._cache.get(key)
+            if cached is not None:  # same text already validated: reuse the report, skip the validator
+                return cached
+            self._artifact_writer(candidate_text, self._evolved_path)  # write the candidate to the evolved-artifact path
+            inputs = ValidationInputs(
+                tool_name=self._artifact_name,
+                suite=self._suite,
+                baseline_artifact=self._baseline_path,
+                evolved_artifact=self._evolved_path,
+            )
+            report = self._validator.validate(inputs)  # no try/except: errors reach the caller and nothing is cached
+            self._cache[key] = report
+            self._iters_since_last_run = 0  # same counter reset get_or_run performs after a run
+            return report
+
     def get_task_verdict(
         self, candidate_text: str, task_id: str
     ) -> Optional[TaskResult]:
diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
new file mode 100644
index 00000000..6fcfa468
--- /dev/null
+++ b/evolution/core/saturation_check.py
@@ -0,0 +1,253 @@
+"""Saturation pre-flight: detect doomed evolve_* runs before GEPA spends budget.
+
+Mirrors the shape of evolution.core.auth_check: a pure helper that
+returns a structured report. Call sites in evolve_skill / evolve_tool
+render a Rich panel and decide whether to prompt or default-deny.
+
+See docs/superpowers/specs/2026-05-21-path-f-saturation-preflight-design.md
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass, field
+from typing import Literal, Optional, TypeAlias
+
+import dspy
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
+
+SaturationBand: TypeAlias = Literal[
+    "healthy", "no_headroom", "weak_signal", "uniform_failure"
+]
+
+DEFAULT_THRESHOLDS: dict[str, float] = {
+    "no_headroom_synthetic": 0.99,
+    "weak_signal_synthetic": 0.95,
+    "no_headroom_closed_loop": 0.95,
+    "uniform_failure_closed_loop": 0.15,
+}
+
+
+@dataclass
+class SaturationReport:  # structured result returned by saturation_preflight()
+    band: SaturationBand  # verdict chosen by _classify_band
+    holdout_score: float  # mean baseline score on the holdout examples
+    holdout_n: int  # saturation_preflight sets this to len(holdout_per_example)
+    holdout_per_example: list[float]  # one baseline score per holdout example
+    closed_loop_score: Optional[float] = None  # fraction of closed-loop tasks passed; None when no cache was given
+    closed_loop_n: Optional[int] = None  # number of closed-loop tasks scored; None when no cache was given
+    closed_loop_per_example: Optional[list[float]] = None  # 1.0 per passed task, 0.0 per failed task
+    suggestions: list[str] = field(default_factory=list)  # band-specific advice; empty for "healthy"
+    thresholds: dict[str, float] = field(default_factory=dict)  # thresholds the band was classified against
+
+
+def _classify_band(
+    *,
+    holdout_score: float,
+    closed_loop_score: Optional[float],
+    thresholds: dict[str, float],
+) -> tuple[SaturationBand, list[str]]:
+    """Map a (synthetic holdout, closed-loop) score pair onto a saturation band.
+
+    Bands are tested in order: uniform_failure, no_headroom, weak_signal; any
+    other pair is "healthy". Returns ``(band, suggestions)``, where the
+    suggestion list is empty for "healthy".
+    """
+    syn_ceiling = thresholds["no_headroom_synthetic"]
+    syn_weak = thresholds["weak_signal_synthetic"]
+    cl_ceiling = thresholds["no_headroom_closed_loop"]
+    cl_floor = thresholds["uniform_failure_closed_loop"]
+    has_cl = closed_loop_score is not None
+
+    # The closed-loop floor is tested first, whatever the synthetic score is.
+    if has_cl and closed_loop_score <= cl_floor:
+        return "uniform_failure", [
+            "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.",
+            "Try a stronger --closed-loop-agent-model.",
+            "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.",
+        ]
+
+    # With no closed-loop score, a saturated synthetic score is enough on its own.
+    if holdout_score >= syn_ceiling and (not has_cl or closed_loop_score >= cl_ceiling):
+        return "no_headroom", [
+            "Baseline already saturates the eval. No measurable headroom to evolve into.",
+            "Try a harder closed-loop suite, or pick a different optimization target.",
+            "Sanity check: is the synthetic generator producing trivially-correct tasks?",
+        ]
+
+    # Needs a closed-loop score strictly between the floor and the ceiling.
+    if has_cl and holdout_score >= syn_weak and cl_floor < closed_loop_score < cl_ceiling:
+        return "weak_signal", [
+            "Judge saturating but closed-loop has signal; GEPA's small-minibatch acceptance will struggle.",
+            "Expect many proposals rejected — bump --iterations above 5.",
+            "Larger minibatch (Path E follow-up) would help once landed.",
+        ]
+
+    return "healthy", []
+
+
+def _score_baseline_on_holdout(
+    *,
+    baseline_module,
+    holdout_examples: list,
+    metric,
+    lm,
+) -> tuple[float, list[float]]:
+    """Run dspy.Evaluate on the baseline, return (mean, per_example_scores).
+
+    Carved out as its own helper so tests can patch it without touching DSPy
+    plumbing. Shape matches _holdout_evaluate_with_metric in evolve_*.py.
+    """
+    def two_arg_metric(example, prediction, *_args, **_kwargs):  # any extra evaluator arguments are ignored
+        result = metric(example, prediction)
+        return float(getattr(result, "score", result))  # metric may return a number or an object with .score
+
+    evaluator = dspy.Evaluate(
+        devset=holdout_examples,
+        metric=two_arg_metric,
+        num_threads=4,
+        provide_traceback=True,
+        max_errors=len(holdout_examples) * 100,  # presumably high enough that per-example errors never abort — confirm
+    )
+    with dspy.context(lm=lm):  # evaluate with the supplied lm
+        result = evaluator(baseline_module)
+    mean = float(result.score) / 100.0  # assumes dspy reports the aggregate on a 0-100 scale — TODO confirm
+    per_example = [float(s) for _, _, s in result.results]  # assumes 3-tuples with the score last — TODO confirm
+    return mean, per_example
+
+
+def saturation_preflight(
+    *,
+    baseline_module,
+    holdout_examples: list,
+    metric,
+    lm,
+    closed_loop_cache=None,
+    baseline_artifact_text: Optional[str] = None,
+    thresholds: Optional[dict[str, float]] = None,
+) -> SaturationReport:
+    """Score the baseline on holdout (and the closed-loop suite when a cache is
+    given), classify the scores into a band, and return a SaturationReport.
+
+    Runs ``lm`` and ``closed_loop_cache.force_run`` but never renders, prompts, or
+    exits. ``thresholds`` overrides DEFAULT_THRESHOLDS key by key. Raises ValueError
+    (before scoring) on empty holdout or a cache without baseline_artifact_text."""
+    if not holdout_examples:
+        raise ValueError("holdout_examples is empty; nothing to score")
+    if closed_loop_cache is not None and baseline_artifact_text is None:
+        raise ValueError(
+            "baseline_artifact_text is required when closed_loop_cache is provided"
+        )
+    thresholds = {**DEFAULT_THRESHOLDS, **(thresholds or {})}  # partial overrides keep the other defaults
+
+    holdout_mean, holdout_per_example = _score_baseline_on_holdout(
+        baseline_module=baseline_module,
+        holdout_examples=holdout_examples,
+        metric=metric,
+        lm=lm,
+    )
+
+    closed_loop_mean: Optional[float] = None
+    closed_loop_n: Optional[int] = None
+    closed_loop_per_example: Optional[list[float]] = None
+    if closed_loop_cache is not None:
+        report = closed_loop_cache.force_run(baseline_artifact_text)
+        passes = [1.0 if t.passed else 0.0 for t in report.evolved.tasks]
+        closed_loop_per_example, closed_loop_n = passes, len(passes)
+        closed_loop_mean = sum(passes) / len(passes) if passes else 0.0  # empty suite scores 0.0, not None
+
+    band, suggestions = _classify_band(
+        holdout_score=holdout_mean,
+        closed_loop_score=closed_loop_mean,
+        thresholds=thresholds,
+    )
+
+    return SaturationReport(
+        band=band,
+        holdout_score=holdout_mean,
+        holdout_n=len(holdout_per_example),
+        holdout_per_example=holdout_per_example,
+        closed_loop_score=closed_loop_mean,
+        closed_loop_n=closed_loop_n,
+        closed_loop_per_example=closed_loop_per_example,
+        suggestions=suggestions,
+        thresholds=dict(thresholds),
+    )
+
+
+_BAND_TITLES: dict[SaturationBand, str] = {
+    "healthy": "Saturation check passed",
+    "no_headroom": "No measurable headroom",
+    "weak_signal": "Weak signal — expect a hard run",
+    "uniform_failure": "Uniform failure — validator too weak",
+}
+
+_BAND_STYLES: dict[SaturationBand, str] = {
+    "healthy": "green",
+    "no_headroom": "yellow",
+    "weak_signal": "yellow",
+    "uniform_failure": "yellow",
+}
+
+
+def render_saturation_panel(
+    report: SaturationReport, *, console: Optional[Console] = None,
+) -> None:
+    """Write a summary of ``report`` to ``console`` (a fresh ``Console()`` if None).
+
+    The healthy band gets a single dim line. Every other band gets a bordered
+    panel with the band name, the holdout score, the closed-loop score when one
+    was recorded, and the report's suggestions; title and border colour are
+    looked up by band.
+    """
+    out = Console() if console is None else console
+
+    # Healthy: one dim line, no panel.
+    if report.band == "healthy":
+        scores = f"holdout={report.holdout_score:.3f}"
+        if report.closed_loop_score is not None:
+            scores += f", closed-loop={report.closed_loop_score:.3f}"
+        out.print(f"[dim]Saturation check passed ({scores}).[/dim]")
+        return
+
+    # Any other band: scores first, then the suggestions as bullets.
+    summary = Text()
+    summary.append(f"Band: {report.band}\n", style="bold")
+    summary.append(
+        f"Holdout (synthetic): {report.holdout_score:.3f} over {report.holdout_n} examples\n"
+    )
+    if report.closed_loop_score is not None:
+        summary.append(
+            f"Closed-loop (behavioral): {report.closed_loop_score:.3f} over {report.closed_loop_n} tasks\n"
+        )
+    summary.append("\nSuggestions:\n", style="bold")
+    for suggestion in report.suggestions:
+        summary.append(f"  • {suggestion}\n")
+
+    # Title text and border colour both come from the per-band tables.
+    panel = Panel(
+        summary,
+        title=_BAND_TITLES[report.band],
+        border_style=_BAND_STYLES[report.band],
+    )
+    out.print(panel)
+
+
+def is_non_interactive() -> bool:
+    """True when stdin is absent (``sys.stdin is None``) or isn't a TTY. Used by call
+    sites to decide between prompting for y/N and printing the override-flag hint."""
+    return sys.stdin is None or not sys.stdin.isatty()
+
+
+def interactive_confirm(prompt: str = "Continue anyway? [y/N] ") -> bool:
+    """Prompt on stdin; return True only for y / yes (case-insensitive).
+
+    Ctrl-C and end-of-input both count as "no" (False), with no traceback.
+    """
+    try:
+        normalized = input(prompt).strip().lower()
+    except (KeyboardInterrupt, EOFError):
+        return False
+    return normalized in ("y", "yes")
diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py
index 291a0e14..038d665b 100644
--- a/tests/core/test_closed_loop_feedback.py
+++ b/tests/core/test_closed_loop_feedback.py
@@ -533,3 +533,62 @@ def test_write_text_artifact_helper_writes_plain_text(self, tmp_path):
         path = tmp_path / "out.md"
         write_text_artifact("hello world\n", path)
         assert path.read_text() == "hello world\n"
+
+
+class TestForceRun:
+    """`force_run` bypasses should_run() and propagates errors (unlike
+    get_or_run which swallows expected validator errors)."""
+
+    def test_force_run_fires_in_sampled_mode_before_any_judge_scores(self, tmp_path):
+        """In default sampled mode with empty judge history, should_run()
+        returns False — but force_run runs the validator anyway."""
+        suite = _build_suite(tmp_path)
+        report = _build_report()
+        validator = MagicMock()
+        validator.validate.return_value = report
+        cache = ClosedLoopFeedbackCache(
+            validator=validator,
+            suite=suite,
+            artifact_name="write_file",
+            baseline_artifact_text="baseline desc",
+            gate_mode="sampled",
+        )
+        assert cache.should_run() is False
+
+        result = cache.force_run("candidate desc")
+
+        assert result is report
+        validator.validate.assert_called_once()
+
+    def test_force_run_uses_cache_on_repeat_calls(self, tmp_path):
+        """Second call with same candidate_text returns the cached report
+        without re-running the validator."""
+        suite = _build_suite(tmp_path)
+        report = _build_report()
+        validator = MagicMock()
+        validator.validate.return_value = report
+        cache = ClosedLoopFeedbackCache(
+            validator=validator, suite=suite, artifact_name="t",
+            baseline_artifact_text="b", gate_mode="sampled",
+        )
+
+        first = cache.force_run("cand")
+        second = cache.force_run("cand")
+
+        assert first is second
+        assert validator.validate.call_count == 1
+
+    def test_force_run_propagates_validator_errors(self, tmp_path):
+        """force_run propagates ConcurrentRunError (unlike get_or_run,
+        which swallows it and returns None to keep GEPA going). Preflight
+        callers want to know the probe failed."""
+        suite = _build_suite(tmp_path)
+        validator = MagicMock()
+        validator.validate.side_effect = ConcurrentRunError("locked")
+        cache = ClosedLoopFeedbackCache(
+            validator=validator, suite=suite, artifact_name="t",
+            baseline_artifact_text="b", gate_mode="sampled",
+        )
+
+        with pytest.raises(ConcurrentRunError):
+            cache.force_run("cand")
diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py
new file mode 100644
index 00000000..0429c3dd
--- /dev/null
+++ b/tests/core/test_saturation_check.py
@@ -0,0 +1,317 @@
+"""Tests for evolution.core.saturation_check.
+
+All tests use hand-built scores or mock the LM/validator — zero real
+LM spend. Pattern mirrors tests/core/test_closed_loop_feedback.py.
+"""
+
+from __future__ import annotations
+
+import pytest
+from unittest.mock import MagicMock, patch
+
+from evolution.core.saturation_check import (
+    DEFAULT_THRESHOLDS,
+    SaturationReport,
+    _classify_band,
+    saturation_preflight,
+)
+
+
+class TestClassifyBand:
+    def test_healthy_when_synthetic_below_weak_threshold(self):
+        band, _ = _classify_band(
+            holdout_score=0.85, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "healthy"
+
+    def test_no_headroom_synthetic_only(self):
+        band, suggestions = _classify_band(
+            holdout_score=0.99, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "no_headroom"
+        assert any("harder" in s.lower() or "different target" in s.lower() for s in suggestions)
+
+    def test_no_headroom_with_closed_loop_also_saturated(self):
+        band, _ = _classify_band(
+            holdout_score=0.99, closed_loop_score=0.98, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "no_headroom"
+
+    def test_weak_signal_when_closed_loop_in_middle_band(self):
+        band, suggestions = _classify_band(
+            holdout_score=0.97, closed_loop_score=0.60, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "weak_signal"
+        assert any("minibatch" in s.lower() or "iterations" in s.lower() for s in suggestions)
+
+    def test_uniform_failure_when_closed_loop_below_threshold(self):
+        band, suggestions = _classify_band(
+            holdout_score=0.98, closed_loop_score=0.10, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "uniform_failure"
+        assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions)
+
+    def test_boundary_exactly_at_no_headroom_synthetic_triggers(self):
+        """0.99 exactly should trigger no_headroom (>= comparison)."""
+        band, _ = _classify_band(
+            holdout_score=0.99, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "no_headroom"
+
+    def test_boundary_just_below_no_headroom_does_not_trigger(self):
+        band, _ = _classify_band(
+            holdout_score=0.989, closed_loop_score=None, thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "healthy"
+
+    def test_custom_thresholds_propagate(self):
+        custom = {**DEFAULT_THRESHOLDS, "no_headroom_synthetic": 0.80}
+        band, _ = _classify_band(
+            holdout_score=0.85, closed_loop_score=None, thresholds=custom,
+        )
+        assert band == "no_headroom"
+
+
+class TestSaturationPreflightNoClosedLoop:
+    def test_returns_healthy_when_baseline_below_threshold(self):
+        baseline_module = MagicMock()
+        holdout_examples = [MagicMock() for _ in range(5)]
+        metric = MagicMock()
+        lm = MagicMock()
+
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(0.60, [0.6, 0.6, 0.6, 0.6, 0.6]),
+        ):
+            report = saturation_preflight(
+                baseline_module=baseline_module,
+                holdout_examples=holdout_examples,
+                metric=metric,
+                lm=lm,
+            )
+
+        assert report.band == "healthy"
+        assert report.holdout_score == 0.60
+        assert report.holdout_n == 5
+        assert report.holdout_per_example == [0.6, 0.6, 0.6, 0.6, 0.6]
+        assert report.closed_loop_score is None
+
+    def test_returns_no_headroom_when_baseline_at_ceiling(self):
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(1.0, [1.0] * 5),
+        ):
+            report = saturation_preflight(
+                baseline_module=MagicMock(),
+                holdout_examples=[MagicMock() for _ in range(5)],
+                metric=MagicMock(),
+                lm=MagicMock(),
+            )
+
+        assert report.band == "no_headroom"
+        assert len(report.suggestions) >= 1
+
+    def test_raises_on_empty_holdout(self):
+        with pytest.raises(ValueError, match="holdout_examples"):
+            saturation_preflight(
+                baseline_module=MagicMock(),
+                holdout_examples=[],
+                metric=MagicMock(),
+                lm=MagicMock(),
+            )
+
+
+class TestSaturationPreflightWithClosedLoop:
+    def _make_validation_report(self, *, n_pass: int, n_fail: int):
+        """Build a minimal real ValidationReport whose evolved phase has the
+        requested pass/fail counts. Uses real dataclasses (not MagicMock) so
+        a future field rename breaks the test loudly."""
+        from evolution.validation.report import (
+            PhaseResult, TaskResult, ValidationReport, WinLoss,
+        )
+        passed_tasks = [
+            TaskResult(
+                task_id=f"p{i}", passed=True, abstained=False,
+                tool_calls_seq=[], duration_seconds=0.0,
+            )
+            for i in range(n_pass)
+        ]
+        failed_tasks = [
+            TaskResult(
+                task_id=f"f{i}", passed=False, abstained=False,
+                tool_calls_seq=[], duration_seconds=0.0,
+            )
+            for i in range(n_fail)
+        ]
+        tasks = passed_tasks + failed_tasks
+        total = n_pass + n_fail
+        phase = PhaseResult(
+            pass_rate=n_pass / max(1, total),
+            n_passed=n_pass,
+            n_failed=n_fail,
+            n_abstained=0,
+            tasks=tasks,
+        )
+        delta = WinLoss(n_wins=0, n_losses=0, n_ties=total, pass_rate_change=0.0)
+        return ValidationReport(
+            schema_version="1",
+            tool="t",
+            task_suite_path="suite.jsonl",
+            task_suite_sha256="x" * 64,
+            baseline=phase,
+            evolved=phase,
+            delta=delta,
+            decision="pass",
+            decision_reasons=[],
+        )
+
+    def test_closed_loop_score_lands_in_report(self):
+        cache = MagicMock()
+        cache.force_run.return_value = self._make_validation_report(n_pass=3, n_fail=4)
+
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(0.99, [1.0] * 5),
+        ):
+            report = saturation_preflight(
+                baseline_module=MagicMock(),
+                holdout_examples=[MagicMock() for _ in range(5)],
+                metric=MagicMock(),
+                lm=MagicMock(),
+                closed_loop_cache=cache,
+                baseline_artifact_text="baseline desc",
+            )
+
+        cache.force_run.assert_called_once_with("baseline desc")
+        assert report.closed_loop_n == 7
+        assert report.closed_loop_score == pytest.approx(3 / 7)
+        assert report.closed_loop_per_example == [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
+
+    def test_uniform_failure_band_triggers(self):
+        cache = MagicMock()
+        cache.force_run.return_value = self._make_validation_report(n_pass=0, n_fail=7)
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(0.99, [1.0] * 5),
+        ):
+            report = saturation_preflight(
+                baseline_module=MagicMock(),
+                holdout_examples=[MagicMock() for _ in range(5)],
+                metric=MagicMock(),
+                lm=MagicMock(),
+                closed_loop_cache=cache,
+                baseline_artifact_text="b",
+            )
+        assert report.band == "uniform_failure"
+
+    def test_weak_signal_band_triggers(self):
+        cache = MagicMock()
+        cache.force_run.return_value = self._make_validation_report(n_pass=4, n_fail=3)
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(0.97, [1.0] * 5),
+        ):
+            report = saturation_preflight(
+                baseline_module=MagicMock(),
+                holdout_examples=[MagicMock() for _ in range(5)],
+                metric=MagicMock(),
+                lm=MagicMock(),
+                closed_loop_cache=cache,
+                baseline_artifact_text="b",
+            )
+        assert report.band == "weak_signal"
+
+    def test_missing_baseline_text_raises(self):
+        cache = MagicMock()
+        with patch(
+            "evolution.core.saturation_check._score_baseline_on_holdout",
+            return_value=(0.5, [0.5]),
+        ):
+            with pytest.raises(ValueError, match="baseline_artifact_text"):
+                saturation_preflight(
+                    baseline_module=MagicMock(),
+                    holdout_examples=[MagicMock()],
+                    metric=MagicMock(), lm=MagicMock(),
+                    closed_loop_cache=cache,
+                    baseline_artifact_text=None,
+                )
+
+
+class TestRenderPanel:
+    def _render_to_string(self, report: SaturationReport) -> str:
+        from io import StringIO
+        from rich.console import Console
+        from evolution.core.saturation_check import render_saturation_panel
+
+        buf = StringIO()
+        console = Console(file=buf, width=100, color_system=None, force_terminal=False)
+        render_saturation_panel(report, console=console)
+        return buf.getvalue()
+
+    def test_no_headroom_panel_includes_band_name_and_suggestion(self):
+        report = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50,
+            suggestions=["Try a harder closed-loop suite", "Pick a different target"],
+            thresholds=DEFAULT_THRESHOLDS,
+        )
+        out = self._render_to_string(report)
+        assert "no_headroom" in out.lower() or "no headroom" in out.lower()
+        assert "harder closed-loop suite" in out
+        assert "0.99" in out
+
+    def test_weak_signal_panel_shows_closed_loop_score(self):
+        report = SaturationReport(
+            band="weak_signal", holdout_score=0.97, holdout_n=50,
+            holdout_per_example=[1.0] * 50,
+            closed_loop_score=0.60, closed_loop_n=7, closed_loop_per_example=[],
+            suggestions=["Bump iterations"], thresholds=DEFAULT_THRESHOLDS,
+        )
+        out = self._render_to_string(report)
+        assert "0.60" in out or "60" in out
+        assert "Bump iterations" in out
+
+    def test_healthy_panel_is_terse(self):
+        """healthy band should be one-line / minimal — most of the panel
+        machinery is for the warn bands. This test just verifies it doesn't
+        blow up."""
+        report = SaturationReport(
+            band="healthy", holdout_score=0.60, holdout_n=50,
+            holdout_per_example=[0.6] * 50,
+            suggestions=[], thresholds=DEFAULT_THRESHOLDS,
+        )
+        out = self._render_to_string(report)
+        assert "healthy" in out.lower() or "passed" in out.lower()
+
+
+class TestIsNonInteractive:
+    def test_returns_true_when_stdin_not_tty(self, monkeypatch):
+        monkeypatch.setattr("sys.stdin.isatty", lambda: False)
+        from evolution.core.saturation_check import is_non_interactive
+        assert is_non_interactive() is True
+
+    def test_returns_false_when_stdin_is_tty(self, monkeypatch):
+        monkeypatch.setattr("sys.stdin.isatty", lambda: True)
+        from evolution.core.saturation_check import is_non_interactive
+        assert is_non_interactive() is False
+
+
+class TestInteractiveConfirm:
+    @pytest.mark.parametrize("answer", ["y", "Y", "yes", "YES", "Yes"])
+    def test_returns_true_for_yes_variants(self, monkeypatch, answer):
+        monkeypatch.setattr("builtins.input", lambda _prompt="": answer)
+        from evolution.core.saturation_check import interactive_confirm
+        assert interactive_confirm() is True
+
+    @pytest.mark.parametrize("answer", ["n", "no", "", "anything else", "ynope"])
+    def test_returns_false_for_everything_else(self, monkeypatch, answer):
+        monkeypatch.setattr("builtins.input", lambda _prompt="": answer)
+        from evolution.core.saturation_check import interactive_confirm
+        assert interactive_confirm() is False
+
+    def test_returns_false_on_keyboard_interrupt(self, monkeypatch):
+        def _raise(_prompt=""):
+            raise KeyboardInterrupt()
+        monkeypatch.setattr("builtins.input", _raise)
+        from evolution.core.saturation_check import interactive_confirm
+        assert interactive_confirm() is False

From d6e6c834aebce1f26fedc3885cde65216cf90aff Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 11:06:37 -0600
Subject: [PATCH 2/8] feat(tools): wire saturation pre-flight into evolve_tool

After the synthetic dataset builds and the baseline module / metric /
closed_loop_cache are constructed (and before GEPA setup), the
framework now runs the saturation preflight from feat(core). Two new
flags: --no-saturation-check (skip entirely) and
--force-saturation-check (run + render but bypass the
abort/prompt). Default UX in interactive contexts is warn+confirm;
in non-interactive contexts (no TTY on stdin), non-healthy bands
exit cleanly with a "use --force-saturation-check" hint.

The baseline holdout per-example scores from the preflight are
stashed and reused at the post-GEPA holdout-comparison call site, so
the baseline isn't re-scored at run end. Net cost: ~zero.

Closes the "doomed runs spend GEPA budget before any signal" gap
documented in reports/pareto_frontier_feasibility.md spike #2.
---
 evolution/tools/evolve_tool.py                |  73 ++++++++++-
 .../test_evolve_tool_saturation_preflight.py  | 123 ++++++++++++++++++
 2 files changed, 193 insertions(+), 3 deletions(-)
 create mode 100644 tests/tools/test_evolve_tool_saturation_preflight.py

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 04b8fd90..23e44b01 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -24,6 +24,12 @@
 
 from evolution.core.config import EvolutionConfig
 from evolution.core.auth_check import preflight as _preflight_lm_credentials
+from evolution.core.saturation_check import (
+    saturation_preflight,
+    render_saturation_panel,
+    interactive_confirm,
+    is_non_interactive,
+)
 from evolution.core.cost_advisor import (
     find_cheaper_alternative as _find_cheaper_alternative,
     render_suggestion_panel as _render_cost_suggestion_panel,
@@ -368,6 +374,8 @@ def evolve(
     closed_loop_task_timeout_seconds: Optional[int] = None,
     skip_preflight: bool = False,
     skip_cost_suggest: bool = False,
+    skip_saturation_check: bool = False,
+    force_saturation_check: bool = False,
 ) -> dict[str, Any]:
     """Evolve one tool description inside a manifest.
 
@@ -647,6 +655,36 @@ def evolve(
                 if closed_loop_in_valset:
                     valset = valset + behavioral_examples
 
+            cached_baseline_holdout_per_example = None
+            if not skip_saturation_check:
+                holdout_examples_for_preflight = _build_examples(
+                    dataset.holdout, for_module=True
+                )
+                sat_report = saturation_preflight(
+                    baseline_module=baseline_module,
+                    holdout_examples=holdout_examples_for_preflight,
+                    metric=metric,
+                    lm=lm,
+                    closed_loop_cache=closed_loop_cache,
+                    baseline_artifact_text=baseline_description,
+                )
+                if sat_report.band != "healthy":
+                    render_saturation_panel(sat_report, console=console)
+                    if not force_saturation_check:
+                        if is_non_interactive():
+                            console.print(
+                                "[yellow]Non-interactive context; refusing to "
+                                "proceed. Pass --force-saturation-check to "
+                                "override.[/yellow]"
+                            )
+                            sys.exit(0)
+                        if not interactive_confirm():
+                            console.print("[yellow]Aborted by user.[/yellow]")
+                            sys.exit(0)
+                else:
+                    render_saturation_panel(sat_report, console=console)
+                cached_baseline_holdout_per_example = sat_report.holdout_per_example
+
             console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n")
             start_time = time.time()
 
@@ -762,9 +800,13 @@ def evolve(
                 f"\n[bold]Evaluating on holdout set ({len(dataset.holdout)} examples)[/bold]"
             )
             holdout_examples = _build_examples(dataset.holdout, for_module=True)
-            avg_baseline, baseline_per_example = _holdout_evaluate_with_metric(
-                baseline_module, holdout_examples, metric, lm,
-            )
+            if cached_baseline_holdout_per_example is not None:
+                baseline_per_example = cached_baseline_holdout_per_example
+                avg_baseline = sum(baseline_per_example) / len(baseline_per_example)
+            else:
+                avg_baseline, baseline_per_example = _holdout_evaluate_with_metric(
+                    baseline_module, holdout_examples, metric, lm,
+                )
             avg_evolved, evolved_per_example = _holdout_evaluate_with_metric(
                 optimized_module, holdout_examples, metric, lm,
             )
@@ -1187,6 +1229,27 @@ def evolve(
          "and prints a Rich panel with a paste-ready --eval-model flag. "
          "Pass this to suppress the panel.",
 )
+@click.option(
+    "--no-saturation-check",
+    "skip_saturation_check",
+    is_flag=True,
+    default=False,
+    help="Skip the saturation pre-flight. By default, the framework "
+         "scores the baseline on the holdout (and the closed-loop suite, "
+         "if --closed-loop-during-evolution is set) BEFORE GEPA starts "
+         "and refuses to spend on a saturated target. Pass this to skip "
+         "(useful when you've already validated headroom externally).",
+)
+@click.option(
+    "--force-saturation-check",
+    "force_saturation_check",
+    is_flag=True,
+    default=False,
+    help="Run the saturation pre-flight, render the panel, but proceed "
+         "regardless of band. Required to override a non-healthy verdict "
+         "in non-interactive contexts (no TTY). Without this in such a "
+         "context, the framework exits cleanly without spending GEPA budget.",
+)
 @click.option(
     "--closed-loop-in-valset/--no-closed-loop-in-valset",
     "closed_loop_in_valset",
@@ -1238,6 +1301,8 @@ def main(
     benchmark_timeout_seconds: int,
     skip_preflight: bool,
     skip_cost_suggest: bool,
+    skip_saturation_check: bool,
+    force_saturation_check: bool,
     closed_loop_suite_path: Optional[Path],
     closed_loop_hermes_repo: Optional[Path],
     closed_loop_saturation_threshold: float,
@@ -1288,6 +1353,8 @@ def main(
             closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
             skip_preflight=skip_preflight,
             skip_cost_suggest=skip_cost_suggest,
+            skip_saturation_check=skip_saturation_check,
+            force_saturation_check=force_saturation_check,
         )
     except HermesProviderError as exc:
         # Render a clean error panel instead of dumping a Python traceback —
diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py
new file mode 100644
index 00000000..ec0b8705
--- /dev/null
+++ b/tests/tools/test_evolve_tool_saturation_preflight.py
@@ -0,0 +1,123 @@
+"""Integration tests for the saturation pre-flight wiring in evolve_tool.
+
+Mocks the LM and the dataset builder so each test runs in ≤2s —
+zero real LM spend. Mirrors tests/tools/test_evolve_tool_closed_loop.py.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from evolution.tools.evolve_tool import main as evolve_tool_main
+
+
+def _minimal_manifest_dir(tmp_path: Path) -> Path:
+    """Write a one-tool _SCHEMA file so the manifest loads."""
+    tools_dir = tmp_path / "tools"
+    tools_dir.mkdir()
+    (tools_dir / "__init__.py").write_text("")
+    (tools_dir / "my_tools.py").write_text(
+        'WRITE_FILE_SCHEMA = {\n'
+        '    "name": "write_file",\n'
+        '    "description": "Write to a file.",\n'
+        '    "input_schema": {"type": "object", "properties": {}},\n'
+        '}\n'
+    )
+    return tools_dir
+
+
+@pytest.fixture
+def manifest_dir(tmp_path):
+    return _minimal_manifest_dir(tmp_path)
+
+
+class TestSaturationPreflightCLI:
+    def test_no_saturation_check_flag_skips_helper(self, manifest_dir):
+        """--no-saturation-check skips the preflight helper entirely."""
+        with patch(
+            "evolution.tools.evolve_tool.saturation_preflight"
+        ) as mock_preflight, patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch("evolution.tools.evolve_tool.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--no-saturation-check", "--no-preflight"],
+            )
+            mock_preflight.assert_not_called()
+
+    def test_healthy_band_does_not_prompt(self, manifest_dir):
+        """When preflight returns healthy, the panel still renders but nothing prompts."""
+        from evolution.core.saturation_check import SaturationReport
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.5, holdout_n=10,
+            holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
+        )
+        with patch(
+            "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch(
+            "evolution.tools.evolve_tool.interactive_confirm"
+        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            mock_confirm.assert_not_called()
+
+    def test_saturated_band_non_interactive_aborts(self, manifest_dir):
+        """no_headroom band in non-interactive context exits cleanly without GEPA."""
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50,
+            suggestions=["Try a harder suite"], thresholds={},
+        )
+        gepa_mock = MagicMock()
+        with patch(
+            "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch(
+            "evolution.tools.evolve_tool.is_non_interactive", return_value=True
+        ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            gepa_mock.assert_not_called()
+            assert "force-saturation-check" in result.output
+
+    def test_force_saturation_check_overrides_abort(self, manifest_dir):
+        """--force-saturation-check renders panel but lets GEPA run."""
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50,
+            suggestions=["x"], thresholds={},
+        )
+        with patch(
+            "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch(
+            "evolution.tools.evolve_tool.interactive_confirm"
+        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--force-saturation-check", "--no-preflight"],
+            )
+            # confirm is bypassed when --force-saturation-check is set
+            mock_confirm.assert_not_called()

From 6a2546d7812695d63bf435f5637aae38e8d6b647 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 11:23:30 -0600
Subject: [PATCH 3/8] feat(skills): wire saturation pre-flight into
 evolve_skill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symmetric to the evolve_tool wiring from the previous commit.
After the synthetic dataset builds and baseline_module / metric /
closed_loop_cache are constructed (and before GEPA setup), the
framework runs saturation_preflight; non-healthy bands prompt
(interactive) or default-deny (non-interactive) with a
--force-saturation-check override. Baseline holdout per-example
scores from the preflight are reused at the post-GEPA
holdout-comparison call site to keep net cost ~zero.

The per-candidate _holdout_evaluate_with_metric inside the
knee-point loop is deliberately untouched — only the final
baseline-vs-evolved comparison reuses the cached scores.

Completes Path F across both pipelines.
---
 evolution/skills/evolve_skill.py              |  70 +++++++++++-
 .../test_evolve_skill_saturation_preflight.py | 107 ++++++++++++++++++
 2 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100644 tests/skills/test_evolve_skill_saturation_preflight.py

diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 2dbdfd05..9d0986be 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -24,6 +24,12 @@
 
 from evolution.core.config import EvolutionConfig
 from evolution.core.auth_check import preflight as _preflight_lm_credentials
+from evolution.core.saturation_check import (
+    saturation_preflight,
+    render_saturation_panel,
+    interactive_confirm,
+    is_non_interactive,
+)
 from evolution.core.cost_advisor import (
     find_cheaper_alternative as _find_cheaper_alternative,
     render_suggestion_panel as _render_cost_suggestion_panel,
@@ -598,6 +604,8 @@ def evolve(
     benchmark_timeout_seconds: int = 600,
     skip_preflight: bool = False,
     skip_cost_suggest: bool = False,
+    skip_saturation_check: bool = False,
+    force_saturation_check: bool = False,
     closed_loop_suite_path: Optional[Path] = None,
     closed_loop_saturation_threshold: float = 0.95,
     closed_loop_min_iters: int = 3,
@@ -879,6 +887,34 @@ def evolve(
                 if closed_loop_in_valset:
                     valset = valset + behavioral_examples
 
+            cached_baseline_holdout_per_example = None
+            if not skip_saturation_check:
+                holdout_examples_for_preflight = dataset.to_dspy_examples("holdout")
+                sat_report = saturation_preflight(
+                    baseline_module=baseline_module,
+                    holdout_examples=holdout_examples_for_preflight,
+                    metric=metric,
+                    lm=lm,
+                    closed_loop_cache=closed_loop_cache,
+                    baseline_artifact_text=skill["body"],
+                )
+                if sat_report.band != "healthy":
+                    render_saturation_panel(sat_report, console=console)
+                    if not force_saturation_check:
+                        if is_non_interactive():
+                            console.print(
+                                "[yellow]Non-interactive context; refusing to "
+                                "proceed. Pass --force-saturation-check to "
+                                "override.[/yellow]"
+                            )
+                            sys.exit(0)
+                        if not interactive_confirm():
+                            console.print("[yellow]Aborted by user.[/yellow]")
+                            sys.exit(0)
+                else:
+                    render_saturation_panel(sat_report, console=console)
+                cached_baseline_holdout_per_example = sat_report.holdout_per_example
+
             console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n")
 
             start_time = time.time()
@@ -1004,9 +1040,13 @@ def evolve(
             )
 
             holdout_examples = dataset.to_dspy_examples("holdout")
-            avg_baseline, baseline_per_example = _holdout_evaluate_with_metric(
-                baseline_module, holdout_examples, metric, lm,
-            )
+            if cached_baseline_holdout_per_example is not None:
+                baseline_per_example = cached_baseline_holdout_per_example
+                avg_baseline = sum(baseline_per_example) / len(baseline_per_example)
+            else:
+                avg_baseline, baseline_per_example = _holdout_evaluate_with_metric(
+                    baseline_module, holdout_examples, metric, lm,
+                )
             avg_evolved, evolved_per_example = _holdout_evaluate_with_metric(
                 optimized_module, holdout_examples, metric, lm,
             )
@@ -1501,6 +1541,26 @@ def evolve(
          "and prints a Rich panel with a paste-ready --eval-model flag. "
          "Pass this to suppress the panel.",
 )
+@click.option(
+    "--no-saturation-check",
+    "skip_saturation_check",
+    is_flag=True,
+    default=False,
+    help="Skip the saturation pre-flight. By default, the framework "
+         "scores the baseline on the holdout (and the closed-loop suite, "
+         "if --closed-loop-during-evolution is set) BEFORE GEPA starts "
+         "and refuses to spend on a saturated target. Pass this to skip "
+         "(useful when you've already validated headroom externally).",
+)
+@click.option(
+    "--force-saturation-check",
+    "force_saturation_check",
+    is_flag=True,
+    default=False,
+    help="Run the saturation pre-flight, render the panel, but proceed "
+         "regardless of band. Required to override a non-healthy verdict "
+         "in non-interactive contexts (no TTY).",
+)
 @click.option(
     "--closed-loop-during-evolution",
     "closed_loop_suite_path",
@@ -1592,6 +1652,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
          benchmark_cmd, benchmark_timeout_seconds,
          skip_preflight,
          skip_cost_suggest,
+         skip_saturation_check,
+         force_saturation_check,
          closed_loop_suite_path,
          closed_loop_saturation_threshold,
          closed_loop_min_iters,
@@ -1637,6 +1699,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
             benchmark_timeout_seconds=benchmark_timeout_seconds,
             skip_preflight=skip_preflight,
             skip_cost_suggest=skip_cost_suggest,
+            skip_saturation_check=skip_saturation_check,
+            force_saturation_check=force_saturation_check,
             closed_loop_suite_path=closed_loop_suite_path,
             closed_loop_saturation_threshold=closed_loop_saturation_threshold,
             closed_loop_min_iters=closed_loop_min_iters,
diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
new file mode 100644
index 00000000..1b866d4f
--- /dev/null
+++ b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -0,0 +1,107 @@
+"""Integration tests for saturation pre-flight wiring in evolve_skill.
+
+Symmetric to tests/tools/test_evolve_tool_saturation_preflight.py.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from evolution.skills.evolve_skill import main as evolve_skill_main
+
+
+@pytest.fixture
+def skill_dir(tmp_path):
+    """Write a minimal SKILL.md so skill discovery succeeds."""
+    skills_root = tmp_path / "skills"
+    skill_path = skills_root / "demo-skill"
+    skill_path.mkdir(parents=True)
+    (skill_path / "SKILL.md").write_text(
+        "---\nname: demo-skill\ndescription: a test skill\n---\n\nDo X.\n"
+    )
+    return skills_root
+
+
+class TestSaturationPreflightCLI:
+    def test_no_saturation_check_flag_skips_helper(self, skill_dir):
+        with patch(
+            "evolution.skills.evolve_skill.saturation_preflight"
+        ) as mock_preflight, patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-saturation-check", "--no-preflight"],
+            )
+            mock_preflight.assert_not_called()
+
+    def test_healthy_band_does_not_prompt(self, skill_dir):
+        from evolution.core.saturation_check import SaturationReport
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.5, holdout_n=10,
+            holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
+        )
+        with patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch(
+            "evolution.skills.evolve_skill.interactive_confirm"
+        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            mock_confirm.assert_not_called()
+
+    def test_saturated_band_non_interactive_aborts(self, skill_dir):
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
+        )
+        gepa_mock = MagicMock()
+        with patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch(
+            "evolution.skills.evolve_skill.is_non_interactive", return_value=True
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            gepa_mock.assert_not_called()
+            assert "force-saturation-check" in result.output
+
+    def test_force_saturation_check_overrides_abort(self, skill_dir):
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
+        )
+        with patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch(
+            "evolution.skills.evolve_skill.interactive_confirm"
+        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"):
+            runner = CliRunner()
+            runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--force-saturation-check", "--no-preflight"],
+            )
+            mock_confirm.assert_not_called()

From 360b51b5c79fa6dfb432ecbbf8122d0765f002a6 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 11:45:51 -0600
Subject: [PATCH 4/8] fix(core): preserve first-fire guarantee in force_run;
 add cache-reuse tests

Two follow-ups from the final code review of the Path F branch:

1. ClosedLoopFeedbackCache.force_run was resetting _iters_since_last_run
   to 0, eating the "allow first fire" allowance that __init__ sets up
   (= min_iters). In sampled gate_mode this delayed the first GEPA-time
   closed-loop fire by min_iters iterations. Now force_run preserves
   the allowance so subsequent get_or_run calls fire as originally
   designed. Tests confirm should_run() still returns True after a
   force_run followed by the first record_judge_score call (in sampled
   mode should_run() stays False while judge history is empty).

2. Added integration tests for both evolve_tool and evolve_skill that
   verify the cache-reuse mechanism: when the saturation preflight runs
   and populates the cached baseline holdout scores, the post-GEPA
   evaluation site reuses them instead of re-running the baseline
   eval. This locks in the "net cost ~zero" correctness claim.
---
 evolution/core/closed_loop_feedback.py        |  2 +-
 tests/core/test_closed_loop_feedback.py       | 39 +++++++++++++++
 .../test_evolve_skill_saturation_preflight.py | 47 ++++++++++++++++++
 .../test_evolve_tool_saturation_preflight.py  | 49 +++++++++++++++++++
 4 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py
index ea8dd6b0..94e858cd 100644
--- a/evolution/core/closed_loop_feedback.py
+++ b/evolution/core/closed_loop_feedback.py
@@ -202,7 +202,7 @@ def force_run(self, candidate_text: str) -> ValidationReport:
             )
             report = self._validator.validate(inputs)
             self._cache[key] = report
-            self._iters_since_last_run = 0
+            self._iters_since_last_run = self.min_iters
             return report
 
     def get_task_verdict(
diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py
index 038d665b..c6e3b77f 100644
--- a/tests/core/test_closed_loop_feedback.py
+++ b/tests/core/test_closed_loop_feedback.py
@@ -592,3 +592,42 @@ def test_force_run_propagates_validator_errors(self, tmp_path):
 
         with pytest.raises(ConcurrentRunError):
             cache.force_run("cand")
+
+    def test_force_run_preserves_first_fire_for_subsequent_get_or_run(self, tmp_path):
+        """force_run at preflight must not eat the first-fire allowance.
+
+        The init contract is _iters_since_last_run = min_iters so that the
+        first GEPA-time record_judge_score call pushes the counter above the
+        periodic floor and the immediately following get_or_run fires.
+        force_run must restore that same value so the guarantee holds even
+        when preflight ran before GEPA started.
+
+        In sampled mode, should_run() returns False when judge_history is
+        empty (there's an early-return guard). The allowance only takes effect
+        after the first record_judge_score — at that point _iters_since_last_run
+        must be >= min_iters.  When force_run incorrectly reset to 0, one
+        record_judge_score call would leave _iters_since_last_run = 1 < min_iters,
+        and should_run() would return False, delaying the first GEPA fire."""
+        suite = _build_suite(tmp_path)
+        report = _build_report()
+        validator = MagicMock()
+        validator.validate.return_value = report
+        cache = ClosedLoopFeedbackCache(
+            validator=validator, suite=suite, artifact_name="t",
+            baseline_artifact_text="b", gate_mode="sampled",
+            min_iters=3,
+        )
+
+        # Preflight fires once (simulates saturation_preflight at init time)
+        cache.force_run("baseline")
+
+        # Simulate the first GEPA metric call recording a judge score.
+        # After this, _iters_since_last_run must be >= min_iters so
+        # should_run() returns True (periodic floor is met).
+        cache.record_judge_score(0.7)  # non-saturating score
+
+        assert cache.should_run() is True, (
+            "After force_run + one record_judge_score, should_run() must be True "
+            "(_iters_since_last_run should be min_iters+1 >= min_iters). "
+            "force_run reset to 0 would leave it at 1 < 3 (min_iters), causing False."
+        )
diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
index 1b866d4f..62421e39 100644
--- a/tests/skills/test_evolve_skill_saturation_preflight.py
+++ b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -105,3 +105,50 @@ def test_force_saturation_check_overrides_abort(self, skill_dir):
                  "--iterations", "1", "--force-saturation-check", "--no-preflight"],
             )
             mock_confirm.assert_not_called()
+
+    def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
+        """When the saturation preflight runs, the cached baseline holdout
+        scores must be reused at the post-GEPA evaluation site — the baseline
+        module should NOT be re-scored on the holdout after GEPA finishes.
+        This is the 'net cost ~zero' contract."""
+        from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
+        from unittest.mock import MagicMock
+
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.6, holdout_n=10,
+            holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
+        )
+        # Fake knee-point result so execution reaches the holdout site.
+        # skill_text must be a non-empty string so SkillModule can be built.
+        fake_module = MagicMock()
+        fake_module.skill_text = "evolved skill text"
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved skill text", body_chars=18,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=18, band_roster=[],
+        )
+        with patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA"), patch(
+            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
+            runner = CliRunner()
+            runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            # With preflight populating the cache, baseline should NOT be
+            # re-evaluated post-GEPA. Only evolved should be evaluated, so
+            # _holdout_evaluate_with_metric is called exactly once.
+            assert mock_holdout_eval.call_count == 1, (
+                f"Expected baseline holdout to be reused from preflight cache "
+                f"(1 call for evolved only), got {mock_holdout_eval.call_count}"
+            )
diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py
index ec0b8705..9eeb215b 100644
--- a/tests/tools/test_evolve_tool_saturation_preflight.py
+++ b/tests/tools/test_evolve_tool_saturation_preflight.py
@@ -121,3 +121,52 @@ def test_force_saturation_check_overrides_abort(self, manifest_dir):
             )
             # confirm is bypassed when --force-saturation-check is set
             mock_confirm.assert_not_called()
+
+    def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
+        """When the saturation preflight runs, the cached baseline holdout
+        scores must be reused at the post-GEPA evaluation site — the baseline
+        module should NOT be re-scored on the holdout after GEPA finishes.
+        This is the 'net cost ~zero' contract."""
+        from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
+        from unittest.mock import MagicMock
+
+        # Healthy report so preflight passes without prompting; preflight
+        # still populates holdout_per_example which gets reused.
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.6, holdout_n=10,
+            holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
+        )
+        # Fake knee-point result so execution reaches the holdout site.
+        fake_module = MagicMock()
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved desc", body_chars=12,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=12, band_roster=[],
+        )
+        with patch(
+            "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch("evolution.tools.evolve_tool.dspy.GEPA"), patch(
+            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
+        ), patch(
+            "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
+            runner = CliRunner()
+            runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            # With preflight populating the cache, baseline should NOT be
+            # re-evaluated post-GEPA. Only evolved should be evaluated, so
+            # _holdout_evaluate_with_metric is called exactly once.
+            assert mock_holdout_eval.call_count == 1, (
+                f"Expected baseline holdout to be reused from preflight cache "
+                f"(1 call for evolved only), got {mock_holdout_eval.call_count}"
+            )

From 1ce0456b875ad7300afdac681a655c66671c9c8a Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 18:27:24 -0600
Subject: [PATCH 5/8] fix(core): widen no_headroom band to catch CL-saturated +
 synthetic-close case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default thresholds shipped in feat(core) were too strict for the
case Path F was built to catch. Spike #1 in the feasibility report
documented synthetic=0.987 + closed-loop=1.0 for the saturated
write_file baseline — GEPA can't improve on that, but the strict AND
(synthetic ≥ 0.99) gate let it through as healthy. The realtime
smoke from the merge-readiness check confirmed: preflight ran, both
scores looked right, classifier returned healthy, GEPA burned 155
no-op iterations.

Refined no_headroom logic:
- (synthetic ≥ 0.99 AND no CL signal) — unchanged, judge alone pegged
- (CL ≥ 0.95 AND synthetic ≥ weak_syn=0.95) — NEW, both signals
  effectively pegged

The synthetic_close gate on the new clause keeps (synthetic=0.5,
CL=1.0) classified as healthy — that scenario means there's real
judge signal to optimize over (or the eval is misconfigured) and
should not auto-abort.

Two new tests pin both the smoke case and the edge case.
---
 evolution/core/saturation_check.py  | 20 ++++++++++++++++++--
 tests/core/test_saturation_check.py | 21 +++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
index 6fcfa468..df32e43f 100644
--- a/evolution/core/saturation_check.py
+++ b/evolution/core/saturation_check.py
@@ -65,8 +65,24 @@ def _classify_band(
             "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.",
         ]
 
-    if holdout_score >= no_head_syn and (
-        closed_loop_score is None or closed_loop_score >= no_head_cl
+    synthetic_saturated = holdout_score >= no_head_syn
+    synthetic_close = holdout_score >= weak_syn
+    cl_saturated = (
+        closed_loop_score is not None and closed_loop_score >= no_head_cl
+    )
+    no_cl_signal = closed_loop_score is None
+
+    # no_headroom triggers when:
+    #   - synthetic alone is saturated and there's no closed-loop signal
+    #     (only signal available is judge, and it's pegged), OR
+    #   - closed-loop is saturated AND synthetic is close enough (≥ weak
+    #     threshold) that the judge isn't producing a useful gradient either.
+    # CL-saturated alone with a low synthetic (< weak_syn) does NOT trigger:
+    # there's real judge signal to optimize over even when behavioral is
+    # pegged, and that scenario usually means a misconfigured eval rather
+    # than true saturation.
+    if (synthetic_saturated and no_cl_signal) or (
+        cl_saturated and synthetic_close
     ):
         return "no_headroom", [
             "Baseline already saturates the eval. No measurable headroom to evolve into.",
diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py
index 0429c3dd..da337007 100644
--- a/tests/core/test_saturation_check.py
+++ b/tests/core/test_saturation_check.py
@@ -71,6 +71,27 @@ def test_custom_thresholds_propagate(self):
         )
         assert band == "no_headroom"
 
+    def test_no_headroom_when_cl_saturated_and_synthetic_close(self):
+        """The smoke case: synthetic 0.987 (below strict no_head_syn=0.99
+        but above weak_syn=0.95), closed-loop 1.0. Both signals
+        effectively pegged → no_headroom should trigger so the user
+        doesn't burn GEPA budget on a hopeless run."""
+        band, _ = _classify_band(
+            holdout_score=0.987, closed_loop_score=1.0,
+            thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "no_headroom"
+
+    def test_healthy_when_cl_saturated_but_synthetic_low(self):
+        """Edge case: behavioral suite pegged at 1.0 but synthetic at 0.5
+        means there's real judge signal to optimize over (or the eval is
+        misconfigured). Don't auto-abort — proceed and let GEPA try."""
+        band, _ = _classify_band(
+            holdout_score=0.5, closed_loop_score=1.0,
+            thresholds=DEFAULT_THRESHOLDS,
+        )
+        assert band == "healthy"
+
 
 class TestSaturationPreflightNoClosedLoop:
     def test_returns_healthy_when_baseline_below_threshold(self):

From d4058bd02f288a151b4ad484776e1c1c992b5f84 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 18:45:47 -0600
Subject: [PATCH 6/8] docs: bring project docs in line with Path F (saturation
 pre-flight)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates across the docs/ knowledge base, AGENTS.md, README.md, and
PLAN.md to reflect the new saturation pre-flight feature:

- architecture.md: top-level flow now shows the pre-flight + abort
  path; new design pattern #10 separates the pre-flight (a "should we
  even start" decision) from the deploy gate (a "did we improve"
  decision).
- components.md: new saturation_check.py section documenting the
  band classifier logic + public surface; force_run added to the
  ClosedLoopFeedbackCache surface.
- data_models.md: new SaturationReport dataclass entry.
- workflows.md: Workflow 1 gets a Phase B.5 mermaid for the
  pre-flight; Phase D's holdout step shows the cache-reuse branch.
- interfaces.md: --no-saturation-check + --force-saturation-check
  added to both skill and tool flag tables.
- index.md: new routing entry, new cross-cutting topic, refreshed
  test count (681 → 1076), maintenance-note entry for the default
  thresholds (likely to be calibrated).
- codebase_info.md: saturation_check.py added to layout + LOC table;
  test count refreshed.
- framework_advantages.md: new "Saturation pre-flight that refuses
  to spend budget on hopeless runs" section, positioned as a
  framework advantage over raw GEPA.
- AGENTS.md: 5-line run summary updated; component map adds
  saturation_check.py; planned/deferred section gets a Path D/E/C
  entry pointing at the feasibility report.
- README.md: new "Saturation pre-flight" section in the Safety knobs
  area with example panel output.
- PLAN.md: deviation #8 gets a follow-up paragraph noting that
  Path F addresses the user-visible symptom but not the underlying
  acceptance-gate mechanism.

No source files touched.
---
 AGENTS.md                    |  6 +++--
 PLAN.md                      |  2 ++
 README.md                    | 21 ++++++++++++++++
 docs/architecture.md         | 14 ++++++++---
 docs/codebase_info.md        |  6 +++--
 docs/components.md           | 26 +++++++++++++++++++
 docs/data_models.md          | 20 +++++++++++++++
 docs/framework_advantages.md |  6 +++++
 docs/index.md                |  7 ++++--
 docs/interfaces.md           |  4 +++
 docs/workflows.md            | 49 +++++++++++++++++++++++++++++++++---
 11 files changed, 149 insertions(+), 12 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index da162ee9..ff85cf60 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -74,9 +74,9 @@ The `evolution/<tier>/` directories form **a clean layering**: `evolution/core/`
 
 1. CLI resolves `--skill <name>` to a `SKILL.md` via the `SkillSource` walk.
 2. Eval dataset is built (synthetic LM gen / golden file / sessiondb mining).
-3. Skill body wrapped as `dspy.Module`; GEPA optimizes it with `BudgetAwareProposer` injecting a char budget into the reflection prompt.
+3. Skill body wrapped as `dspy.Module`. **Saturation pre-flight** (`evolution/core/saturation_check.py`) scores the baseline on the holdout + closed-loop suite, classifies into one of four bands, and aborts (or prompts) on non-`healthy` bands — `--no-saturation-check` to skip, `--force-saturation-check` to override the default-deny in non-interactive contexts. Then GEPA optimizes the candidate with `BudgetAwareProposer` injecting a char budget into the reflection prompt.
 4. Knee-point Pareto selection walks the candidates within ε of the best valset score in `--knee-point-strategy` order. Default `val-best`: highest val first, smallest body as tiebreak. `smallest` (greedy parsimony) is available via the flag for users explicitly chasing compression.
-5. Static constraints + paired-bootstrap growth-quality gate decide deploy vs. reject; both outcomes write `gate_decision.json`. The default rule is `no_regression` (`mean >= 0`); `--quality-gate non-inferiority` switches to `lower_bound > -inferiority_tolerance` (recommended for compression-focused runs at small N where the bootstrap CI swamps tiny effects).
+5. Static constraints + paired-bootstrap growth-quality gate decide deploy vs. reject; both outcomes write `gate_decision.json`. The default rule is `no_regression` (`mean >= 0`); `--quality-gate non-inferiority` switches to `lower_bound > -inferiority_tolerance` (recommended for compression-focused runs at small N where the bootstrap CI swamps tiny effects). When the pre-flight ran, the post-GEPA holdout eval reuses its baseline scores instead of re-scoring the baseline, so the pre-flight adds ~zero net cost.
 
 ## What lives where
 
@@ -101,6 +101,7 @@ The `evolution/<tier>/` directories form **a clean layering**: `evolution/core/`
 | Tool-flavored judge + tool metric | `evolution/tools/tool_judge.py` |
 | Behavioral `dspy.Example` builder for closed-loop trainset | `evolution/core/behavioral_example.py` |
 | Closed-loop verdict cache + deterministic feedback rendering | `evolution/core/closed_loop_feedback.py` |
+| Saturation pre-flight (band classifier + Rich panel + interactive confirm) | `evolution/core/saturation_check.py` |
 | Deploy gate (static + growth-quality) | `evolution/core/constraints.py` |
 | Preset table + gate-decision persistence (shared by skill/tool) | `evolution/core/quality_gate.py` |
 | Paired-bootstrap CI | `evolution/core/stats.py` |
@@ -268,6 +269,7 @@ Open questions deferred to future PRs (per `PLAN.md` deviation notes):
 - GEPA Pareto-frontier checkpointing (so a `TimeoutError` mid-run doesn't lose all candidates)
 - Skill-size-based reflection-LM timeout scaling
 - BCa bootstrap upgrade once N≥20 routinely
+- **GEPA acceptance-gate work** (deviation #8 follow-up): the saturation pre-flight (`evolution/core/saturation_check.py`) addresses the user-visible symptom on saturated baselines (abort before GEPA spends budget). The underlying mechanism gap — stochastic small-minibatch `sum()` acceptance discarding per-instance signal — is tracked as Path D/E/C in `reports/pareto_frontier_feasibility.md` and remains future work (likely an upstream DSPy or GEPA PR).
 
 ## When to consult which doc
 
diff --git a/PLAN.md b/PLAN.md
index a7b12172..5c587f64 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -460,6 +460,8 @@ These descriptions are sent with every API call as part of the tool schema — e
 7. **N=2 saturated baselines.** Weak-target hunt ran `evolve_tool` against `write_file` (98.8–99.2% holdout, 3 seeds, 1×/3× iter) and `search_files` (98.6% holdout). Both runs produced evolved descriptions byte-identical to the baseline — the knee-point picker correctly reverts to the unchanged baseline when GEPA's variants tie. The framework's tool-description pipeline is regression-catching, not improvement-finding, on these hand-tuned descriptions.
 8. **Closed-loop signal can flow into reflection but doesn't change selection on saturated baselines.** The `--closed-loop-during-evolution` flag plumbs `ValidationReport`s into the GEPA reflection LM's feedback channel via the existing 5-arg metric protocol, opt-in, saturation-gated. Verified end-to-end on `write_file`: closed-loop fired (file mutated + restored), the reflection LM saw the verdict, GEPA still selected the baseline byte-for-byte. The bottleneck sits upstream of reflection — GEPA's `sum(judge_scores)` acceptance rule ties when every candidate hits 1.0 on a saturated minibatch. Extending the Pareto frontier into behavioral space (closed-loop tasks as additional training-set instances with their own per-instance scores so a candidate can stay on the frontier by winning behavioral tasks) is the structural direction that would address this; the cache + renderer added here are the natural building blocks for that work.
 
+   **Follow-up — Path F (saturation pre-flight) addresses the user-visible symptom, not the underlying mechanism.** A separate investigation (`reports/pareto_frontier_feasibility.md`, two spike runs) confirmed the deviation's diagnosis and reframed it: the bottleneck isn't frontier shape, it's GEPA's stochastic small-minibatch `sum()` acceptance gate discarding per-instance signal before it can move selection. Path F (`evolution/core/saturation_check.py`) ships the user-visible fix — detect the saturated case before GEPA starts, render a panel explaining why no improvement is possible, default-deny in non-interactive contexts. This prevents the wasted-budget UX without solving the mechanism gap. The mechanism-side fix (Pareto-dominance acceptance, larger minibatch, or stratified sampling) is tracked as "Path D/E/C" in the feasibility report and remains future work.
+
 ### Phase 3: System Prompt Evolution
 
 **Goal:** Optimize the sections of the system prompt that guide agent behavior.
diff --git a/README.md b/README.md
index f63fd5fa..c5558f3b 100644
--- a/README.md
+++ b/README.md
@@ -245,6 +245,27 @@ uv run python -m evolution.tools.evolve_tool --tool X --manifest Y \
 
 Env vars: `EVOLVED_PATH`, `BASELINE_PATH`, `RUN_DIR`, `TARGET_NAME`, `ARTIFACT_TYPE`. The hook runs under `/bin/sh -c` — interactive aliases are not available; invoke binaries by full name. Trust boundary: the command string is yours, do not pass strings you didn't write yourself.
 
+### Saturation pre-flight (don't burn GEPA budget on hopeless runs)
+
+By default, every `evolve_skill` / `evolve_tool` run does a pre-flight: score the baseline on the holdout (and the closed-loop suite, if `--closed-loop-during-evolution` is set), classify into one of four bands (`healthy` / `no_headroom` / `weak_signal` / `uniform_failure`), and refuse to spend GEPA budget on a baseline that's already saturated.
+
+```
+Saturation check: holdout=0.987 (50 ex), closed-loop=1.000 (7 tasks)
+╭─── No measurable headroom ───────────╮
+│ Band: no_headroom                    │
+│ • Baseline already saturates the eval│
+│ • Try a harder closed-loop suite     │
+│ • Sanity check: synthetic generator? │
+╰──────────────────────────────────────╯
+Non-interactive context; refusing to proceed.
+Pass --force-saturation-check to override.
+```
+
+In interactive contexts, non-`healthy` bands prompt for confirmation (`Continue anyway? [y/N]`). In non-interactive contexts (no TTY on stdin — CI, background jobs, cron), the framework default-denies and exits cleanly with the override hint. Net cost is ~zero: the probe's holdout scores are reused at the post-GEPA evaluation site, so the baseline isn't re-scored at run end.
+
+- `--no-saturation-check` skips the probe entirely (useful when you've already validated headroom externally)
+- `--force-saturation-check` runs the probe + renders the panel but proceeds regardless of band
+
 ### Closed-loop validation (real agent on real tasks)
 
 The framework's deploy gate scores evolved artifacts against an LM-judge on a synthetic eval set. That's a closed loop: an LM scoring another LM's output on tasks a third LM made up. To break the loop, point a real agent at a small task suite with the baseline and evolved artifacts and see whether real agent behavior actually shifted:
diff --git a/docs/architecture.md b/docs/architecture.md
index 9806c77c..7e2f14d5 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -13,12 +13,17 @@ flowchart LR
     A[CLI<br/>--skill X] --> B[Resolve SKILL.md<br/>SkillSource]
     B --> C[Build eval dataset<br/>synthetic / golden / sessiondb]
     C --> D[Wrap as<br/>SkillModule dspy.Module]
-    D --> E[GEPA optimizer<br/>+ BudgetAwareProposer]
+    D --> SAT[Saturation pre-flight<br/>baseline holdout + closed-loop probe]
+    SAT --> SATB{band ==<br/>healthy?}
+    SATB -- no --> SATA[Rich panel + prompt<br/>or default-deny]
+    SATA -- abort --> Z[sys.exit 0]
+    SATA -- proceed --> E
+    SATB -- yes --> E[GEPA optimizer<br/>+ BudgetAwareProposer]
     E --> F[Knee-point<br/>Pareto selection]
     F --> G[Static<br/>constraints]
     G --> H{pass?}
     H -- no --> I[Write evolved_FAILED.md<br/>+ gate_decision.json]
-    H -- yes --> J[Holdout eval<br/>dspy.Evaluate × 2]
+    H -- yes --> J[Holdout eval<br/>dspy.Evaluate × 1 evolved<br/>baseline reused from SAT]
     J --> K[Paired bootstrap<br/>per-example deltas]
     K --> L[Growth-with-quality<br/>gate]
     L --> M{deploy?}
@@ -166,7 +171,10 @@ When growth is below the free threshold, the gate degrades to "no-regression onl
 ### 9. Cost-ceiling kill switch
 `LMTimingCallback` also drives a per-run `CostLedger` that accumulates per-call cost from litellm's `_hidden_params`. `--max-total-cost-usd <N>` arms the ledger; once the accumulated cost crosses `N`, the next LM call raises `CostCeilingExceeded` from `LMTimingCallback.on_lm_start`. The orchestrator catches this at the top level and writes a `decision="aborted"` `gate_decision.json` with `cost_at_abort_usd` + `cost_ceiling_usd` + `cost_summary`. Worst-case overshoot is one LM call past the ceiling.
 
-### 10. Closed-loop validation as a separate surface
+### 10. Saturation pre-flight as a separate concern from the gate
+`evolution/core/saturation_check.py` runs BEFORE GEPA setup: scores the baseline on the holdout (and the closed-loop suite when configured), classifies into four bands (`healthy` / `no_headroom` / `weak_signal` / `uniform_failure`), and renders a Rich panel. Non-healthy bands prompt for confirmation in interactive contexts; default-deny in non-interactive contexts (no TTY) with a `--force-saturation-check` override. Skippable with `--no-saturation-check`. The probe's `holdout_per_example` is stashed and reused at the post-GEPA holdout site so net cost stays ~zero. Mirrors the `evolution/core/auth_check.py` pattern: pure helper returns a structured `SaturationReport`; rendering + exit handled by the call site. This is independent of the deploy gate (which runs AFTER GEPA on the evolved artifact) — the pre-flight is a "should we even start" decision; the gate is a "did we improve" decision.
+
+### 11. Closed-loop validation as a separate surface
 `evolution/validation/` runs a real agent (`hermes -z`) through a JSONL task suite with baseline vs evolved artifacts spliced into the live install. Available three ways:
 - **Post-gate veto** (`--benchmark-cmd "python -m evolution.validation.closed_loop ..."`) — runs after the deploy gate passes; nonzero exit flips the decision to reject with `reason="benchmark_failed"`.
 - **Reflection feedback** (`--closed-loop-during-evolution <suite.jsonl> --closed-loop-mode feedback`) — `ClosedLoopFeedbackCache` runs the validator during the GEPA loop, saturation-gated, and the verdict is rendered into the reflection LM's input via the metric's `dspy.Prediction.feedback` string. Score channel untouched.
diff --git a/docs/codebase_info.md b/docs/codebase_info.md
index 83da616a..b6e787d8 100644
--- a/docs/codebase_info.md
+++ b/docs/codebase_info.md
@@ -50,6 +50,7 @@ evolution/
 │   ├── fitness.py                       # LLMJudge + GEPA-shaped metric + behavioral score helper
 │   ├── lm_timing_callback.py            # LM-call observability + cost ledger + cost-ceiling kill switch
 │   ├── quality_gate.py                  # preset table + write_gate_decision (shared by skill/tool pipelines)
+│   ├── saturation_check.py              # pre-flight: classify baseline into healthy/no_headroom/weak_signal/uniform_failure + Rich panel + abort
 │   ├── skill_sources.py                 # SkillSource protocol + 3 implementations
 │   └── stats.py                         # paired_bootstrap CI
 ├── skills/                              # Tier 1: skill-file evolution
@@ -90,7 +91,8 @@ evolution/
 | `evolution/core/fitness.py` | ~380 | LLMJudge + skill/tool fitness metrics + behavioral score helper |
 | `evolution/core/constraints.py` | ~320 | static + growth-with-quality + size constraints |
 | `evolution/skills/budget_aware_proposer.py` | ~300 | char-budget reflection prompt |
-| `evolution/core/closed_loop_feedback.py` | ~295 | cache + saturation gate + deterministic feedback block |
+| `evolution/core/closed_loop_feedback.py` | ~320 | cache + saturation gate + deterministic feedback block + `force_run` (bypasses gate for pre-flight) |
+| `evolution/core/saturation_check.py` | ~255 | pre-flight: band classifier + `SaturationReport` + Rich panel + interactive confirm |
 | `evolution/tools/tool_judge.py` | ~230 | tool-flavored judge + GEPA-shaped metric with behavioral branch |
 | `evolution/validation/validator.py` | ~220 | mutate + restore live agent file with flock + checksum drift check |
 | `evolution/validation/report.py` | ~225 | ValidationReport JSON + Rich rendering + two-condition decision |
@@ -109,7 +111,7 @@ evolution/
 | `evolution/core/behavioral_example.py` | ~35 | builder for behavioral dspy.Examples |
 | **Total** | **~9,000** | excludes empty `__init__.py` shims |
 
-Test suite: 37 test files under `tests/core/`, `tests/skills/`, `tests/tools/`, `tests/validation/`. **681 tests** collected.
+Test suite: 55 test files under `tests/core/`, `tests/skills/`, `tests/tools/`, `tests/validation/`. **1076 tests** collected.
 
 ## Runtime dependencies
 
diff --git a/docs/components.md b/docs/components.md
index 734dc179..88211422 100644
--- a/docs/components.md
+++ b/docs/components.md
@@ -163,6 +163,7 @@ Score is **never** modified by `pred_trace` enrichment — GEPA enforces score e
 - `.should_run() -> bool` — the gate. `gate_mode="sampled"` (default, opportunistic feedback-only use): fire when `min(recent_window) >= saturation_threshold` OR `iters_since_last_run >= min_iters`. `gate_mode="always"` (selection-affecting trainset use): always open — every novel candidate must score every time.
 - `.get_or_run(candidate_text) -> Optional[ValidationReport]` — cache key is `sha256(candidate + suite.sha256)`. Cache hit returns cached report; miss writes the candidate's description into a tmp JSON manifest and calls `validator.validate()`. Validator failures (`ConcurrentRunError`, `StaleBackupError`, `ChecksumDriftError`) log `WARNING` and return `None` — closed-loop failure must never take the GEPA run down.
 - `.get_task_verdict(candidate_text, task_id) -> Optional[TaskResult]` — calls `get_or_run` and indexes `report.evolved.tasks` by `task_id`. Returns `None` if the gate is closed or the validator raised a swallowed error or the task isn't present.
+- `.force_run(candidate_text) -> ValidationReport` — same shape as `get_or_run` but bypasses `should_run()` and propagates validator exceptions (instead of logging + returning `None`). Used by the saturation pre-flight (`evolution/core/saturation_check.py`) to fire the validator on the baseline once before any judge scores have been recorded — in default `gate_mode="sampled"`, `should_run()` returns `False` until either a judge score saturates or the periodic floor elapses, so `get_or_run` would silently no-op at preflight time. Preserves the "next `get_or_run` is allowed to fire immediately" guarantee by resetting `_iters_since_last_run` to `min_iters` (the same value `__init__` uses), so the saturation gate's first-fire allowance for downstream callers is intact.
 - `render_feedback_block(report: ValidationReport) -> str` — module-level function. Renders the cached report as a deterministic `[CLOSED_LOOP]` block (or `[CLOSED_LOOP-NOISY]` when `|Δpass_rate| < 0.15`) with decision, decision_reasons, win/loss/tie counts, and per-task diffs for tasks whose verdict changed. Determinism is required because GEPA hashes reflective-dataset entries for caching.
 
 **Two use modes**, both wired through `evolve_tool` CLI flags:
@@ -170,6 +171,31 @@ Score is **never** modified by `pred_trace` enrichment — GEPA enforces score e
 1. **Feedback enricher** (`--closed-loop-mode feedback`, default): the metric's `_augment_feedback_with_closed_loop` helper calls `get_or_run` on the candidate currently under reflection, then appends the rendered block to the metric's `dspy.Prediction.feedback`. Saturation-gated so it only fires when the judge has converged. Score is unchanged.
 2. **Trainset score channel** (`--closed-loop-mode trainset`): `build_behavioral_examples(suite)` injects per-task `dspy.Example`s into the trainset. The metric's behavioral branch calls `get_task_verdict` on each behavioral example and returns the binary verdict as score. Behavioral wins contribute to `sum(minibatch_scores)`, breaking judge ties at acceptance.
 
+## evolution/core/saturation_check.py — pre-flight that detects doomed runs
+
+**Owns:** the pre-GEPA probe that scores the baseline on the holdout (and the closed-loop suite, if configured), classifies the result into one of four bands, and lets the call site decide whether to prompt for confirmation or default-deny. Independent of any GEPA-side change; mirrors the shape of `evolution/core/auth_check.py` (pure helper returns a structured report; rendering + exit handled by the call site).
+
+**Public surface:**
+
+- `SaturationBand: Literal["healthy", "no_headroom", "weak_signal", "uniform_failure"]` — the four-band classification.
+- `DEFAULT_THRESHOLDS: dict[str, float]` — `no_headroom_synthetic=0.99`, `weak_signal_synthetic=0.95`, `no_headroom_closed_loop=0.95`, `uniform_failure_closed_loop=0.15`.
+- `SaturationReport` dataclass — the contract between the helper and the call site. Carries the band, holdout score + per-example list (reused downstream for cache reuse), the closed-loop score + per-example list when present, the band-specific suggestion strings, and the thresholds that produced the band.
+- `saturation_preflight(baseline_module, holdout_examples, metric, lm, closed_loop_cache=None, baseline_artifact_text=None, thresholds=None) -> SaturationReport` — pure function. Scores baseline via `_score_baseline_on_holdout` (a thin wrapper around `dspy.Evaluate` carved out so tests can patch the DSPy boundary), then fires `closed_loop_cache.force_run(baseline_artifact_text)` when the cache is provided. Raises `ValueError` on empty `holdout_examples` before any LM call.
+- `render_saturation_panel(report, console=None) -> None` — emits a one-line dim acknowledgement for the `healthy` band, or a Rich `Panel` (yellow border) with band, score lines, and bulleted suggestions for the warn bands.
+- `interactive_confirm(prompt="Continue anyway? [y/N] ") -> bool` — reads stdin; returns `True` only for `{y, yes}` case-insensitive. Catches `KeyboardInterrupt` and `EOFError`, returning `False` (treats as "n", no traceback noise).
+- `is_non_interactive() -> bool` — `not sys.stdin.isatty()`. Call sites use it to decide between prompting and printing the override hint.
+
+**Band classifier logic** (`_classify_band`, in priority order):
+
+1. **`uniform_failure`** if `closed_loop_score is not None AND closed_loop_score <= 0.15` — validator agent too weak to use the artifact at all; signal isn't discriminating.
+2. **`no_headroom`** if either:
+   - `holdout_score >= 0.99 AND closed_loop_score is None` — only signal available is the judge, and it's pegged, OR
+   - `closed_loop_score >= 0.95 AND holdout_score >= 0.95` — both signals effectively saturated. The `holdout_score >= 0.95` gate on this clause keeps `(synthetic=0.5, CL=1.0)` classified as `healthy` (there's real judge headroom even with behavioral pegged; usually means misconfigured eval rather than true saturation).
+3. **`weak_signal`** if `holdout_score >= 0.95 AND 0.15 < closed_loop_score < 0.95` — judge saturating but closed-loop discriminates; GEPA's small-minibatch acceptance will struggle (per the deviation #8 finding); expect many proposals rejected.
+4. **`healthy`** otherwise — no panel, just a one-line dim log.
+
+**Call-site integration:** both `evolve_skill.py` and `evolve_tool.py` invoke the helper after the dataset is built and `baseline_module`/`metric`/`closed_loop_cache` are constructed but before GEPA setup. The `holdout_per_example` list from the report is stashed and reused at the post-GEPA `_holdout_evaluate_with_metric` site — so the baseline isn't re-scored at run end. Net cost: ~zero (the probe is the holdout eval shifted earlier). See `--no-saturation-check` / `--force-saturation-check` in `interfaces.md`.
+
 ## evolution/core/constraints.py — deploy gate
 
 **Owns:** all constraint checks and the deploy gate's two-stage decision.
diff --git a/docs/data_models.md b/docs/data_models.md
index f7001626..a41a97b2 100644
--- a/docs/data_models.md
+++ b/docs/data_models.md
@@ -285,6 +285,26 @@ Both must hold to return `"pass"`; else `"regression"`. The 2:1 win-loss ratio i
 
 `ValidationReport.to_dict()` round-trips to `validation_report.json` written under `output/validation/<tool>/<timestamp>/`.
 
+## SaturationReport (`evolution/core/saturation_check.py`)
+
+In-memory only. Built by `saturation_preflight(...)` before GEPA setup, consumed by the call site in `evolve_skill` / `evolve_tool` to decide whether to abort or proceed. Not currently serialized to disk — the `holdout_per_example` list flows directly into the post-GEPA `_holdout_evaluate_with_metric` baseline-cache reuse path.
+
+```python
+@dataclass
+class SaturationReport:
+    band: SaturationBand                          # "healthy" | "no_headroom" | "weak_signal" | "uniform_failure"
+    holdout_score: float                          # baseline mean on holdout
+    holdout_n: int                                # number of holdout examples scored
+    holdout_per_example: list[float]              # per-example scores (reused at post-GEPA evaluation)
+    closed_loop_score: Optional[float] = None     # None when no --closed-loop-during-evolution suite
+    closed_loop_n: Optional[int] = None           # number of behavioral tasks scored
+    closed_loop_per_example: Optional[list[float]] = None
+    suggestions: list[str] = field(default_factory=list)   # band-specific user-facing strings
+    thresholds: dict[str, float] = field(default_factory=dict)   # snapshot of values that produced the band
+```
+
+`SaturationBand` is a `Literal` of four strings. `DEFAULT_THRESHOLDS` ships as `{no_headroom_synthetic: 0.99, weak_signal_synthetic: 0.95, no_headroom_closed_loop: 0.95, uniform_failure_closed_loop: 0.15}`. See `components.md`'s `saturation_check.py` section for the classifier logic.
+
 ## Evolved manifest output JSON
 
 `output/tools/<tool>/<timestamp>/evolved_manifest.json` (deploy) and `evolved_FAILED.json` (reject) have the same shape as the input MCP-shape manifest:
diff --git a/docs/framework_advantages.md b/docs/framework_advantages.md
index 42cd36bb..a10c338b 100644
--- a/docs/framework_advantages.md
+++ b/docs/framework_advantages.md
@@ -44,6 +44,12 @@ The fitness function is a composite LLM-as-judge metric: separate scores for cor
 
 Files: `evolution/skills/budget_aware_proposer.py`, `evolution/core/fitness.py`.
 
+### Saturation pre-flight that refuses to spend budget on hopeless runs
+
+GEPA will happily burn an hour optimizing a target that has no measurable headroom — every reflective mutation gets rejected because the minibatch ties at 100%, and you end up with the baseline byte-for-byte plus a bill. The framework's pre-flight (`evolution/core/saturation_check.py`) catches this BEFORE GEPA starts: scores the baseline on the holdout (and the closed-loop suite, if configured), classifies into `healthy` / `no_headroom` / `weak_signal` / `uniform_failure`, and either prompts the user (interactive) or default-denies with a `--force-saturation-check` override (non-interactive). Net cost is ~zero — the probe's holdout scores are reused at the post-GEPA evaluation site. When the run does proceed, the user has band-specific suggestions for the warn cases (try a stronger validator model, try a harder suite, increase iterations). Raw `dspy.GEPA` has no equivalent.
+
+Files: `evolution/core/saturation_check.py`.
+
 ## Telemetry as a first-class feature
 
 Every run writes `gate_decision.json` (schema_version `"4"`) capturing the deploy decision, the paired-bootstrap statistics, the static-constraint results, the knee-point band roster, and an explicit comparison against the candidate stock GEPA would have picked. Combined with `metrics.json` (deploy summary) and `run.log` (every LM call timing), this means a deploy decision is auditable post-hoc and the system can be re-calibrated on accumulated runs. Most upstream users won't realize they're missing this until they need to debug a bad ship.
diff --git a/docs/index.md b/docs/index.md
index 22ba38d2..06b5c3aa 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -6,7 +6,7 @@ This directory is a structured documentation set for **`agent-self-evolution`**
 
 **Start here every time.** This file is the entry point — it describes which documents to consult for which kinds of question. Load it into context first; the other docs are loaded on demand.
 
-The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and architecturally dense — most of the substance is in *why* things are shaped a certain way, not *what* they are. The docs prioritize that "why."
+The codebase is mid-sized (~9K LOC of source + 55 test files / 1076 tests) and architecturally dense — most of the substance is in *why* things are shaped a certain way, not *what* they are. The docs prioritize that "why."
 
 ### Question routing table
 
@@ -30,6 +30,7 @@ The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and a
 | **How does closed-loop signal reach GEPA during evolution** | `components.md` (closed_loop_feedback.py, behavioral_example.py) → `architecture.md` (closed-loop feedback patterns) → `workflows.md` (Workflow 11) |
 | **What does `--max-total-cost-usd` actually do on abort** | `data_models.md` (cost-ceiling-abort variant of gate_decision.json) → `components.md` (lm_timing_callback.py) |
 | **What does `--benchmark-cmd` do** | `interfaces.md` (CLI: benchmark-cmd) → `data_models.md` (benchmark block) |
+| **Why did the run abort before GEPA started / what's the saturation panel** | `components.md` (saturation_check.py) → `architecture.md` (pattern 10) → `workflows.md` (Workflow 1 Phase B.5) → `data_models.md` (SaturationReport) |
 | **What's tested vs. not** | `interfaces.md` (test surfaces locked by tests) → `workflows.md` (Workflow 8) |
 | **What dependencies are pinned and why** | `dependencies.md` |
 | **What's planned but not built** | `codebase_info.md` (implementation status table) → `PLAN.md` |
@@ -70,6 +71,7 @@ The codebase is mid-sized (~9K LOC of source + 37 test files / ~680 tests) and a
 - **The deploy gate decision** spans `architecture.md` (statistical substrate), `components.md` (`constraints.py`), `data_models.md` (`gate_decision.json` schema), and `workflows.md` (Workflow 1 Phase D, Workflow 2). Read together when debugging a deploy decision.
 - **LM observability** lives in `components.md` (`lm_timing_callback.py`), `interfaces.md` (litellm integration), and `dependencies.md` (litellm pinning rationale).
 - **Skill discovery** is in `components.md` (`skill_sources.py`), `interfaces.md` (SkillSource Protocol), and `codebase_info.md` (priority order).
+- **Saturation pre-flight** is in `components.md` (`saturation_check.py`), `architecture.md` (decision 10), `workflows.md` (Workflow 1 Phase B.5), `data_models.md` (`SaturationReport`), and `interfaces.md` (CLI flags `--no-saturation-check` / `--force-saturation-check`). Read together when debugging a "why did the run abort before GEPA" or "why was the panel suggested" question.
 
 ## Maintenance notes
 
@@ -78,8 +80,9 @@ The fast-moving parts to verify against source when consulting these docs:
 - `EvolutionConfig` defaults (especially `eval_dataset_size`, `growth_*`, `bootstrap_*`)
 - `gate_decision.json` schema_version (currently `"4"`)
 - LM model defaults in `evolve_skill.py` / `evolve_tool.py` CLI options
-- Test count (currently ~680)
+- Test count (currently ~1076)
 - LM `request_timeout` / `num_retries` — may be tuned further
 - Closed-loop CLI flags on `evolve_tool` (`--closed-loop-during-evolution`, `--closed-loop-mode`, …)
+- Saturation pre-flight default thresholds (`evolution/core/saturation_check.py:DEFAULT_THRESHOLDS`) — likely to be calibrated as more real-world bands are observed
 
 When updating: edit the relevant file, then check whether the "Question routing table" above still points to the right place. The index file is loaded into AI-assistant context every conversation, so small accuracy improvements here pay off broadly.
diff --git a/docs/interfaces.md b/docs/interfaces.md
index 917d2f2b..20ad548a 100644
--- a/docs/interfaces.md
+++ b/docs/interfaces.md
@@ -72,6 +72,8 @@ Both delivery flags are no-ops on a reject decision and emit a one-line stderr n
 | `--benchmark-cmd "<shell command>"` | off | Deploy-gate hook: shell command run AFTER the framework's own deploy gate passes; nonzero exit flips the decision to `reject` with `reason="benchmark_failed"`. Receives `EVOLVED_PATH`, `BASELINE_PATH`, `RUN_DIR`, `TARGET_NAME`, `ARTIFACT_TYPE` via env. Runs under `/bin/sh -c`; aliases and shell functions from your interactive shell are not available. Trust boundary: the command string is yours; do not pass strings you didn't write. Adds a `benchmark` block to `gate_decision.json`. |
 | `--benchmark-timeout-seconds INT` | `600` | Wall-clock cap for the `--benchmark-cmd` hook. Timeout treated as a benchmark fail with `reason="timeout"`. |
 | `--closed-loop-during-evolution <suite.jsonl>` | off | Wired symmetrically with `evolve_tool` for CLI consistency. Skill-side closed-loop validation requires a `SkillFileInstaller` that doesn't exist yet, so setting this flag raises with a clear error. |
+| `--no-saturation-check` | off | Skip the saturation pre-flight (`evolution/core/saturation_check.py`). By default, the framework scores the baseline on the holdout (and the closed-loop suite, if `--closed-loop-during-evolution` is set) BEFORE GEPA starts; non-`healthy` bands prompt for confirmation (interactive) or default-deny (non-interactive) with a `--force-saturation-check` override. Pass `--no-saturation-check` to skip the probe entirely. |
+| `--force-saturation-check` | off | Run the saturation pre-flight, render the panel, but proceed regardless of band. Required to override a non-`healthy` verdict in non-interactive contexts (no TTY on stdin). Without this in such a context, the framework exits with code 3 (distinct from clean success, so CI / cron wrappers can detect the refusal) without spending GEPA budget. |
 
 ### Exit conditions
 - `sys.exit(1)` if skill not found across all `SkillSource`s — prints available skills per source.
@@ -112,6 +114,8 @@ Evolves one tool's top-level `description` field inside an MCP-shape manifest. T
 | `--closed-loop-saturation-threshold FLOAT` | `0.95` | Min judge score over the recent window for the saturation gate to open. Only consumed in `feedback` mode (`trainset` / `both` use `gate_mode="always"`). |
 | `--closed-loop-min-iters INT` | `3` | Periodic-fire floor: fire closed-loop at least every N reflective iterations even when the judge isn't saturating. `feedback` mode only. |
 | `--closed-loop-window-size INT` | `8` | Number of recent judge scores the saturation gate inspects. `feedback` mode only. |
+| `--no-saturation-check` | off | Skip the saturation pre-flight (`evolution/core/saturation_check.py`). By default, the framework scores the baseline on the holdout (and the closed-loop suite, if configured) BEFORE GEPA starts; non-`healthy` bands prompt for confirmation (interactive) or default-deny (non-interactive) with a `--force-saturation-check` override. Pass `--no-saturation-check` to skip the probe entirely. |
+| `--force-saturation-check` | off | Run the saturation pre-flight, render the panel, but proceed regardless of band. Required to override a non-`healthy` verdict in non-interactive contexts (no TTY on stdin). |
 
 `main()` rejects `--closed-loop-during-evolution` without `--closed-loop-hermes-repo`, and rejects `--closed-loop-mode != feedback` without `--closed-loop-during-evolution`. Local imports keep the validation stack out of cold-path runs.
 
diff --git a/docs/workflows.md b/docs/workflows.md
index 0ddd4451..eb80148a 100644
--- a/docs/workflows.md
+++ b/docs/workflows.md
@@ -69,6 +69,45 @@ sequenceDiagram
 
 Baseline static checks here are **warn-only** — they never block the run. The metric is built once so DSPy's LM cache lines up across GEPA per-iteration scoring and the holdout eval in Phase D.
 
+### Phase B.5 — Saturation pre-flight (default on; abort before GEPA spends budget)
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CLI as evolve_skill
+    participant Sat as saturation_preflight
+    participant Eval as dspy.Evaluate
+    participant CLC as ClosedLoopFeedbackCache
+    participant Panel as render_saturation_panel
+    participant U as User
+
+    CLI->>Sat: saturation_preflight(baseline, holdout, metric, lm, cl_cache?, baseline_text)
+    Sat->>Eval: evaluate(baseline_module, holdout)
+    Eval-->>Sat: avg_baseline, baseline_per_example
+    opt --closed-loop-during-evolution is set
+        Sat->>CLC: force_run(baseline_text)
+        CLC-->>Sat: ValidationReport (bypasses should_run)
+    end
+    Sat->>Sat: _classify_band(holdout, closed_loop?, DEFAULT_THRESHOLDS)
+    Sat-->>CLI: SaturationReport(band, holdout_per_example, suggestions, ...)
+
+    alt band == "healthy"
+        CLI->>Panel: one-line dim acknowledgement
+    else non-healthy
+        CLI->>Panel: render Rich panel (band + scores + suggestions)
+        alt --force-saturation-check
+            Note over CLI: proceed regardless
+        else interactive
+            CLI->>U: "Continue anyway? [y/N]"
+            U-->>CLI: y → proceed | n → sys.exit(0)
+        else non-interactive
+            CLI->>CLI: print "Use --force-saturation-check to override"; sys.exit(3)
+        end
+    end
+```
+
+Skippable with `--no-saturation-check`. The probe's `baseline_per_example` is stashed and reused at Phase D's holdout comparison (the baseline isn't re-scored at run end), so net cost is ~zero when the run proceeds. On an abort, GEPA never starts — the user is left with a clear panel explaining why and what to try next. See `components.md` (`saturation_check.py`) for the four-band classifier and `data_models.md` (`SaturationReport`) for the report shape.
+
 ### Phase C — Optimize: GEPA loop, then knee-point pick
 
 ```mermaid
@@ -122,8 +161,12 @@ sequenceDiagram
     CLI->>Val: validate_static(evolved_full, "skill")
     Val-->>CLI: pass
 
-    CLI->>Eval: evaluate(baseline_module, holdout)
-    Eval-->>CLI: avg_baseline, baseline_per_example
+    alt Phase B.5 cached baseline_per_example
+        Note over CLI,Eval: skip baseline call; reuse from saturation_preflight
+    else fresh
+        CLI->>Eval: evaluate(baseline_module, holdout)
+        Eval-->>CLI: avg_baseline, baseline_per_example
+    end
     CLI->>Eval: evaluate(optimized_module, holdout)
     Eval-->>CLI: avg_evolved, evolved_per_example
 
@@ -137,7 +180,7 @@ sequenceDiagram
     CLI-->>U: ✓ Evolution improved skill by +0.054 (+6.1%)
 ```
 
-Holdout costs ≈ 2 × |holdout| judge calls (baseline + evolved). The bootstrap runs on the per-example improvement vector; `validate_growth_with_quality` then applies the curve `required(growth) = max(0, slope * (growth - free))` and only deploys if both `mean ≥ required` and `lower_bound > 0`.
+Holdout costs ≈ 1 × |holdout| judge calls when the saturation pre-flight ran (the baseline scores are reused from `SaturationReport.holdout_per_example`); 2 × |holdout| when `--no-saturation-check` is set. The bootstrap runs on the per-example improvement vector; `validate_growth_with_quality` then applies the curve `required(growth) = max(0, slope * (growth - free))` and only deploys if both `mean ≥ required` and `lower_bound > 0`.
 
 ## Workflow 2: Evolve a skill (rejected on quality gate)
 

From 3b515b525632e481c71f93bd2af25e070b80ce38 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 19:58:52 -0600
Subject: [PATCH 7/8] fix(tests): mock SyntheticDatasetBuilder in
 saturation-preflight integration tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two of the new integration tests reached the real synthetic dataset
generator before the (mocked) saturation_preflight, so CI runs with a
fake OPENAI_API_KEY died on AuthError before the code under test ever
executed:

- test_saturated_band_non_interactive_aborts (both pipelines)
- test_cache_reuse_skips_baseline_re_eval_after_gepa (both pipelines)

Add a SyntheticDatasetBuilder mock that returns a small list/EvalDataset
of fake EvalExamples (no LM calls). Skill-side fake dataset is sized to
50 examples (30/10/10) so the holdout ≥ EvolutionConfig.min_holdout_size
guard doesn't trip before reaching the preflight.

Verified locally by running the test files under
  env -i ... OPENAI_API_KEY=sk-fake-test-key uv run pytest ...
to match the CI environment — all 10 saturation-preflight tests pass,
full suite still at 1076.

The other 3 tests in each file (test_no_saturation_check_flag_skips_helper,
test_healthy_band_does_not_prompt, test_force_saturation_check_overrides_abort)
"pass" in CI for the wrong reason — their assertions are satisfied even
when the run dies on AuthError before reaching the wiring under test.
Worth tightening in a follow-up; not blocking this fix.
---
 .../test_evolve_skill_saturation_preflight.py | 28 +++++++++++++++++++
 .../test_evolve_tool_saturation_preflight.py  | 23 +++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
index 62421e39..8aee4359 100644
--- a/tests/skills/test_evolve_skill_saturation_preflight.py
+++ b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -26,6 +26,26 @@ def skill_dir(tmp_path):
     return skills_root
 
 
+def _fake_skill_dataset(n: int = 50):
+    """Build a real-shaped EvalDataset with n fake examples (no LM calls).
+
+    Used by tests that need to flow through evolve() up to the saturation
+    preflight wiring; replaces SyntheticDatasetBuilder.generate so CI runs
+    with a fake OPENAI_API_KEY don't die on AuthError before reaching the
+    code under test. Default n=50 gives 30/10/10 splits — the holdout
+    must be ≥ EvolutionConfig.min_holdout_size (default 10) or evolve()
+    aborts before the preflight wiring.
+    """
+    from evolution.core.dataset_builder import EvalDataset, EvalExample
+    examples = [
+        EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}")
+        for i in range(n)
+    ]
+    return EvalDataset(
+        train=examples[:30], val=examples[30:40], holdout=examples[40:50],
+    )
+
+
 class TestSaturationPreflightCLI:
     def test_no_saturation_check_flag_skips_helper(self, skill_dir):
         with patch(
@@ -69,7 +89,11 @@ def test_saturated_band_non_interactive_aborts(self, skill_dir):
             holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
         )
         gepa_mock = MagicMock()
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
         with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated
         ), patch(
             "evolution.skills.evolve_skill._preflight_lm_credentials"
@@ -129,7 +153,11 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
             fallback="knee", picked_idx=0, gepa_default_idx=0,
             gepa_default_body_chars=18, band_roster=[],
         )
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
         with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.skills.evolve_skill._preflight_lm_credentials"
diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py
index 9eeb215b..f73f4acb 100644
--- a/tests/tools/test_evolve_tool_saturation_preflight.py
+++ b/tests/tools/test_evolve_tool_saturation_preflight.py
@@ -35,6 +35,21 @@ def manifest_dir(tmp_path):
     return _minimal_manifest_dir(tmp_path)
 
 
+def _fake_tool_examples(n: int = 30):
+    """Build n fake EvalExamples without calling an LM.
+
+    Used by tests that need to flow through evolve() up to the saturation
+    preflight wiring; replaces SyntheticDatasetBuilder.generate_tool_selection
+    so CI runs with a fake OPENAI_API_KEY don't die on AuthError before
+    reaching the code under test.
+    """
+    from evolution.core.dataset_builder import EvalExample
+    return [
+        EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}")
+        for i in range(n)
+    ]
+
+
 class TestSaturationPreflightCLI:
     def test_no_saturation_check_flag_skips_helper(self, manifest_dir):
         """--no-saturation-check skips the preflight helper entirely."""
@@ -82,7 +97,11 @@ def test_saturated_band_non_interactive_aborts(self, manifest_dir):
             suggestions=["Try a harder suite"], thresholds={},
         )
         gepa_mock = MagicMock()
+        fake_builder = MagicMock()
+        fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
         with patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated
         ), patch(
             "evolution.tools.evolve_tool._preflight_lm_credentials"
@@ -145,7 +164,11 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
             fallback="knee", picked_idx=0, gepa_default_idx=0,
             gepa_default_body_chars=12, band_roster=[],
         )
+        fake_builder = MagicMock()
+        fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
         with patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.tools.evolve_tool._preflight_lm_credentials"

From c931bf20acd6f616676e93b622f77afce9b97cf3 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Thu, 21 May 2026 20:38:12 -0600
Subject: [PATCH 8/8] review followups: exit code, tempdir cleanup, docstrings,
 test tightening
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses six items from the PR review:

1. Non-interactive deny now exits 3 (was 0). A scheduled / CI / cron
   wrapper couldn't previously distinguish "refused to run because no
   TTY" from "ran cleanly". Interactive user-said-no still exits 0
   (success-by-intent). The integration test asserts the new code.

2. ClosedLoopFeedbackCache registers weakref.finalize for its tmp dir
   so SystemExit (the saturation abort path) triggers cleanup instead
   of leaking dirs into /tmp for the OS reaper to handle 3+ days later.
   Updated the class docstring to match.

3. saturation_preflight's docstring no longer claims "Pure: no side
   effects" — it has LM eval, may run a validator subprocess, mutates
   the cache. The actual property is "doesn't render, prompt, or exit"
   — call sites own those — and the docstring now says exactly that.

4. force_run's docstring spells out the _iters_since_last_run = min_iters
   contract (preserving the first-fire allowance for downstream
   get_or_run callers). Inline comment on the __init__ assignment
   anchors the invariant in both places so a future "cleanup" can't
   silently regress the fix to 0.

5. interactive_confirm's docstring acknowledges the EOFError branch
   the code already catches (not just KeyboardInterrupt).

6. Tightened 2 vacuous CLI tests that previously passed even when production
   was mutated to ignore the flags they claimed to test:
   test_force_saturation_check_overrides_abort and
   test_healthy_band_does_not_prompt now assert GEPA was actually
   instantiated. Both add the SyntheticDatasetBuilder / select_knee_point
   / _holdout_evaluate_with_metric mock chain so the run flows through
   the production code instead of dying on AuthError at dataset gen.
   Added a new test_user_declines_at_prompt_aborts in both pipelines
   covering the previously-untested "Aborted by user." branch.

77 saturation-related tests pass, full suite at 1078 (was 1076).
---
 evolution/core/closed_loop_feedback.py        |  31 ++++-
 evolution/core/saturation_check.py            |  11 +-
 evolution/skills/evolve_skill.py              |   7 +-
 evolution/tools/evolve_tool.py                |   7 +-
 .../test_evolve_skill_saturation_preflight.py | 104 +++++++++++++++-
 .../test_evolve_tool_saturation_preflight.py  | 112 +++++++++++++++++-
 6 files changed, 257 insertions(+), 15 deletions(-)

diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py
index 94e858cd..0cee09bb 100644
--- a/evolution/core/closed_loop_feedback.py
+++ b/evolution/core/closed_loop_feedback.py
@@ -24,8 +24,10 @@
 import hashlib
 import json
 import logging
+import shutil
 import tempfile
 import threading
+import weakref
 from pathlib import Path
 from typing import Callable, Literal, Optional
 
@@ -62,8 +64,11 @@ class ClosedLoopFeedbackCache:
     """Run-bounded cache of closed-loop verdicts keyed by candidate text.
 
     One instance per ``evolve_tool`` / ``evolve_skill`` invocation. The
-    tmp dir lives for the cache's lifetime; the OS reclaims it at process
-    exit (no explicit cleanup).
+    tmp dir lives for the cache's lifetime and is cleaned up via
+    ``weakref.finalize`` when the cache is garbage-collected or, at the
+    latest, at interpreter exit — including on ``SystemExit`` from the
+    saturation pre-flight's abort path, which would otherwise leak the dir
+    until the OS's /tmp reaper ran (3+ days on macOS, weekly on most Linux servers).
 
     The cache is shared across metric calls within a run, including across
     DSPy's parallel ``Evaluate`` workers. The threading lock prevents
@@ -112,13 +117,26 @@ def __init__(
         )
 
         self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_"))
+        # Clean up the tmp dir when the cache is garbage-collected or, if it
+        # is still alive then, at interpreter exit (weakref.finalize runs
+        # live finalizers at exit by default). This covers normal completion
+        # AND SystemExit (e.g. the saturation pre-flight's non-interactive abort).
+        self._cleanup_finalizer = weakref.finalize(
+            self, shutil.rmtree, self._tmp_dir, ignore_errors=True
+        )
         self._baseline_path = self._tmp_dir / f"baseline{artifact_suffix}"
         self._evolved_path = self._tmp_dir / f"evolved{artifact_suffix}"
         self._artifact_writer(baseline_artifact_text, self._baseline_path)
 
         self._cache: dict[str, ValidationReport] = {}
         self._judge_history: list[float] = []
-        self._iters_since_last_run = self.min_iters  # allow first fire
+        # First-fire allowance: starts at min_iters so the first
+        # record_judge_score → should_run cycle satisfies the periodic
+        # floor (iters_since_last_run >= min_iters) and fires immediately
+        # in sampled gate_mode even before any judge saturation. force_run
+        # restores this same value rather than 0 to preserve the allowance
+        # for downstream get_or_run callers; see force_run's docstring.
+        self._iters_since_last_run = self.min_iters
         self._lock = threading.Lock()
 
     def record_judge_score(self, score: float) -> None:
@@ -187,6 +205,13 @@ def force_run(self, candidate_text: str) -> ValidationReport:
         text. Propagates validator exceptions (unlike ``get_or_run``,
         which swallows the expected ones to keep GEPA going) — preflight
         callers want to know the probe failed.
+
+        Sets ``_iters_since_last_run = self.min_iters`` (not 0 like
+        ``get_or_run``'s post-run reset) so the first GEPA-time
+        ``record_judge_score`` + ``should_run`` cycle after preflight
+        still satisfies the periodic floor and fires immediately,
+        preserving the first-fire allowance ``__init__`` sets up. A
+        regression test pins this invariant; do not change to 0.
         """
         key = self._key(candidate_text)
         with self._lock:
diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
index df32e43f..16b65a5d 100644
--- a/evolution/core/saturation_check.py
+++ b/evolution/core/saturation_check.py
@@ -145,9 +145,12 @@ def saturation_preflight(
     thresholds: Optional[dict[str, float]] = None,
 ) -> SaturationReport:
     """Score baseline on holdout (and closed-loop suite if cache provided),
-    classify into a band, return a report. Pure: no side effects.
+    classify into a band, return a report.
 
-    Call sites are responsible for rendering panels, prompting, and exiting.
+    Does the work — LM eval of the baseline, optional closed-loop validator
+    fire via ``force_run``, cache mutation, possibly a subprocess. The
+    "purity" we care about is at a higher layer: this function doesn't
+    render panels, prompt for confirmation, or exit. Call sites own those.
     """
     if not holdout_examples:
         raise ValueError("holdout_examples is empty; nothing to score")
@@ -260,7 +263,9 @@ def is_non_interactive() -> bool:
 def interactive_confirm(prompt: str = "Continue anyway? [y/N] ") -> bool:
     """Read one line from stdin; return True only for {y, yes} case-insensitive.
 
-    Ctrl-C / KeyboardInterrupt → False (treat like 'n', no traceback noise).
+    Ctrl-C and stdin EOF both → False (treat like 'n', no traceback noise).
+    The EOF branch matters in practice when stdin is piped from ``/dev/null``
+    or a closed pipe.
     """
     try:
         answer = input(prompt)
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 9d0986be..bfeb6cc1 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -907,7 +907,12 @@ def evolve(
                                 "proceed. Pass --force-saturation-check to "
                                 "override.[/yellow]"
                             )
-                            sys.exit(0)
+                            # Exit code 3 distinguishes "refused to run for
+                            # lack of a TTY to confirm against" from clean
+                            # success (0) or hard user errors (1). Lets a
+                            # wrapping CI / cron / scheduled runner detect
+                            # silent denial.
+                            sys.exit(3)
                         if not interactive_confirm():
                             console.print("[yellow]Aborted by user.[/yellow]")
                             sys.exit(0)
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 23e44b01..cd9b5028 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -677,7 +677,12 @@ def evolve(
                                 "proceed. Pass --force-saturation-check to "
                                 "override.[/yellow]"
                             )
-                            sys.exit(0)
+                            # Exit code 3 distinguishes "refused to run for
+                            # lack of a TTY to confirm against" from clean
+                            # success (0) or hard user errors (1). Lets a
+                            # wrapping CI / cron / scheduled runner detect
+                            # silent denial.
+                            sys.exit(3)
                         if not interactive_confirm():
                             console.print("[yellow]Aborted by user.[/yellow]")
                             sys.exit(0)
diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
index 8aee4359..252b2d8d 100644
--- a/tests/skills/test_evolve_skill_saturation_preflight.py
+++ b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -62,18 +62,46 @@ def test_no_saturation_check_flag_skips_helper(self, skill_dir):
             mock_preflight.assert_not_called()
 
     def test_healthy_band_does_not_prompt(self, skill_dir):
+        """When preflight returns healthy: no prompt AND GEPA actually runs.
+
+        Asserting only ``mock_confirm.assert_not_called()`` is vacuous —
+        a future boolean inversion in the call site would still pass that
+        assertion because CliRunner's non-TTY stdin hits the
+        ``is_non_interactive`` short-circuit before reaching the confirm.
+        Asserting GEPA was instantiated proves the run actually proceeded
+        past the abort branch.
+        """
         from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
         healthy = SaturationReport(
             band="healthy", holdout_score=0.5, holdout_n=10,
             holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
         )
+        fake_module = MagicMock()
+        fake_module.skill_text = "evolved skill text"
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved skill text", body_chars=18,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=18, band_roster=[],
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
+        gepa_mock = MagicMock()
         with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.skills.evolve_skill._preflight_lm_credentials"
         ), patch(
             "evolution.skills.evolve_skill.interactive_confirm"
-        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"):
+        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch(
+            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
             runner = CliRunner()
             runner.invoke(
                 evolve_skill_main,
@@ -81,6 +109,7 @@ def test_healthy_band_does_not_prompt(self, skill_dir):
                  "--iterations", "1", "--no-preflight"],
             )
             mock_confirm.assert_not_called()
+            gepa_mock.assert_called_once()
 
     def test_saturated_band_non_interactive_aborts(self, skill_dir):
         from evolution.core.saturation_check import SaturationReport
@@ -108,20 +137,90 @@ def test_saturated_band_non_interactive_aborts(self, skill_dir):
             )
             gepa_mock.assert_not_called()
             assert "force-saturation-check" in result.output
+            assert result.exit_code == 3, (
+                f"Non-interactive deny should exit 3 (distinct from clean "
+                f"success=0 / user errors=1), got {result.exit_code}"
+            )
+
+    def test_user_declines_at_prompt_aborts(self, skill_dir):
+        """Interactive context, non-healthy band, user types 'n': prints
+        'Aborted by user.', exits 0, no GEPA. Covers the
+        ``if not interactive_confirm(): sys.exit(0)`` branch that has
+        no other end-to-end coverage."""
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
+        gepa_mock = MagicMock()
+        with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch(
+            "evolution.skills.evolve_skill.is_non_interactive", return_value=False
+        ), patch(
+            "evolution.skills.evolve_skill.interactive_confirm", return_value=False
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            gepa_mock.assert_not_called()
+            assert "Aborted by user" in result.output
+            assert result.exit_code == 0, (
+                f"Interactive user-said-no abort should exit 0, got {result.exit_code}"
+            )
 
     def test_force_saturation_check_overrides_abort(self, skill_dir):
+        """--force-saturation-check on a saturated baseline in a
+        non-interactive context: panel renders, confirm is bypassed, AND
+        GEPA actually runs.
+
+        Asserting only ``mock_confirm.assert_not_called()`` would be
+        vacuous (the non-TTY guard exits before reaching confirm anyway);
+        the GEPA-was-instantiated assertion proves the force flag
+        actually overrode the abort.
+        """
         from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
         saturated = SaturationReport(
             band="no_headroom", holdout_score=0.99, holdout_n=50,
             holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
         )
+        fake_module = MagicMock()
+        fake_module.skill_text = "evolved skill text"
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved skill text", body_chars=18,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=18, band_roster=[],
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
+        gepa_mock = MagicMock()
         with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.skills.evolve_skill.saturation_preflight", return_value=saturated
         ), patch(
             "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch(
+            "evolution.skills.evolve_skill.is_non_interactive", return_value=True
         ), patch(
             "evolution.skills.evolve_skill.interactive_confirm"
-        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA"):
+        ) as mock_confirm, patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock), patch(
+            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.skills.evolve_skill._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
             runner = CliRunner()
             runner.invoke(
                 evolve_skill_main,
@@ -129,6 +228,7 @@ def test_force_saturation_check_overrides_abort(self, skill_dir):
                  "--iterations", "1", "--force-saturation-check", "--no-preflight"],
             )
             mock_confirm.assert_not_called()
+            gepa_mock.assert_called_once()
 
     def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
         """When the saturation preflight runs, the cached baseline holdout
diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py
index f73f4acb..62a68425 100644
--- a/tests/tools/test_evolve_tool_saturation_preflight.py
+++ b/tests/tools/test_evolve_tool_saturation_preflight.py
@@ -67,19 +67,48 @@ def test_no_saturation_check_flag_skips_helper(self, manifest_dir):
             mock_preflight.assert_not_called()
 
     def test_healthy_band_does_not_prompt(self, manifest_dir):
-        """When preflight returns healthy, no panel, no prompt; GEPA proceeds."""
+        """When preflight returns healthy: no prompt AND GEPA actually runs.
+
+        Asserting only ``mock_confirm.assert_not_called()`` is vacuous —
+        a future boolean inversion (e.g. the call site flipping to ``if
+        sat_report.band == "healthy":``) would still pass that assertion
+        because CliRunner's non-TTY stdin would hit the
+        ``is_non_interactive`` short-circuit and ``sys.exit(3)`` before
+        reaching ``interactive_confirm``. Asserting GEPA was instantiated
+        proves the run actually proceeded past the abort branch.
+        """
         from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
         healthy = SaturationReport(
             band="healthy", holdout_score=0.5, holdout_n=10,
             holdout_per_example=[0.5] * 10, suggestions=[], thresholds={},
         )
+        fake_module = MagicMock()
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved desc", body_chars=12,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=12, band_roster=[],
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+        gepa_mock = MagicMock()
         with patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy
         ), patch(
             "evolution.tools.evolve_tool._preflight_lm_credentials"
         ), patch(
             "evolution.tools.evolve_tool.interactive_confirm"
-        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"):
+        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch(
+            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
+        ), patch(
+            "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
             runner = CliRunner()
             runner.invoke(
                 evolve_tool_main,
@@ -87,6 +116,7 @@ def test_healthy_band_does_not_prompt(self, manifest_dir):
                  "--iterations", "1", "--no-preflight"],
             )
             mock_confirm.assert_not_called()
+            gepa_mock.assert_called_once()
 
     def test_saturated_band_non_interactive_aborts(self, manifest_dir):
-        """no_headroom band in non-interactive context exits cleanly without GEPA."""
+        """no_headroom band in non-interactive context exits with code 3, without GEPA."""
@@ -116,30 +146,102 @@ def test_saturated_band_non_interactive_aborts(self, manifest_dir):
             )
             gepa_mock.assert_not_called()
             assert "force-saturation-check" in result.output
+            assert result.exit_code == 3, (
+                f"Non-interactive deny should exit 3 (distinct from clean "
+                f"success=0 / user errors=1), got {result.exit_code}"
+            )
+
+    def test_user_declines_at_prompt_aborts(self, manifest_dir):
+        """Interactive context, non-healthy band, user types 'n': prints
+        'Aborted by user.', exits 0, no GEPA. Covers the
+        ``if not interactive_confirm(): sys.exit(0)`` branch that has
+        no other end-to-end coverage."""
+        from evolution.core.saturation_check import SaturationReport
+        saturated = SaturationReport(
+            band="no_headroom", holdout_score=0.99, holdout_n=50,
+            holdout_per_example=[1.0] * 50, suggestions=["x"], thresholds={},
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+        gepa_mock = MagicMock()
+        with patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
+            "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated
+        ), patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch(
+            "evolution.tools.evolve_tool.is_non_interactive", return_value=False
+        ), patch(
+            "evolution.tools.evolve_tool.interactive_confirm", return_value=False
+        ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_tool_main,
+                ["--tool", "write_file", "--manifest", str(manifest_dir),
+                 "--iterations", "1", "--no-preflight"],
+            )
+            gepa_mock.assert_not_called()
+            assert "Aborted by user" in result.output
+            assert result.exit_code == 0, (
+                f"Interactive user-said-no abort should exit 0, got {result.exit_code}"
+            )
 
     def test_force_saturation_check_overrides_abort(self, manifest_dir):
-        """--force-saturation-check renders panel but lets GEPA run."""
+        """--force-saturation-check on a saturated baseline in a
+        non-interactive context: panel renders, confirm is bypassed, AND
+        GEPA actually runs.
+
+        Asserting only ``mock_confirm.assert_not_called()`` is vacuous
+        on its own: an inverted force-flag check would still pass that
+        assertion because the non-TTY ``is_non_interactive`` branch calls
+        ``sys.exit(3)`` before reaching ``interactive_confirm``. The
+        GEPA-was-instantiated assertion proves the force flag actually
+        overrode the abort.
+        """
         from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
         saturated = SaturationReport(
             band="no_headroom", holdout_score=0.99, holdout_n=50,
             holdout_per_example=[1.0] * 50,
             suggestions=["x"], thresholds={},
         )
+        fake_module = MagicMock()
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved desc", body_chars=12,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=12, band_roster=[],
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+        gepa_mock = MagicMock()
         with patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
             "evolution.tools.evolve_tool.saturation_preflight", return_value=saturated
         ), patch(
             "evolution.tools.evolve_tool._preflight_lm_credentials"
+        ), patch(
+            "evolution.tools.evolve_tool.is_non_interactive", return_value=True
         ), patch(
             "evolution.tools.evolve_tool.interactive_confirm"
-        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA"):
+        ) as mock_confirm, patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock), patch(
+            "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc"
+        ), patch(
+            "evolution.tools.evolve_tool._holdout_evaluate_with_metric"
+        ) as mock_holdout_eval:
+            mock_holdout_eval.return_value = (0.6, [0.6] * 10)
             runner = CliRunner()
             runner.invoke(
                 evolve_tool_main,
                 ["--tool", "write_file", "--manifest", str(manifest_dir),
                  "--iterations", "1", "--force-saturation-check", "--no-preflight"],
             )
-            # confirm is bypassed when --force-saturation-check is set
             mock_confirm.assert_not_called()
+            gepa_mock.assert_called_once()
 
     def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir):
         """When the saturation preflight runs, the cached baseline holdout