diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py index 90acdb5..a8eaec8 100644 --- a/evolution/core/saturation_check.py +++ b/evolution/core/saturation_check.py @@ -60,9 +60,9 @@ def _classify_band( if closed_loop_score is not None and closed_loop_score <= uniform_cl: return "uniform_failure", [ - "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.", - "Try a stronger --closed-loop-agent-model.", - "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.", + "Baseline scored 0 on every behavioral task — GEPA has nothing to optimize for.", + "First check the validator actually ran: look in run.log for a 'Stripped LiteLLM provider prefix' line confirming --closed-loop-agent-model routed correctly, and for a non-zero number of subprocess LM calls.", + "If the validator did run: try a stronger --closed-loop-agent-model, or harden the suite tasks so failure modes are interesting rather than 'model can't execute the task.'", ] synthetic_saturated = holdout_score >= no_head_syn @@ -200,7 +200,7 @@ def saturation_preflight( "healthy": "Saturation check passed", "no_headroom": "No measurable headroom", "weak_signal": "Weak signal — expect a hard run", - "uniform_failure": "Uniform failure — validator too weak", + "uniform_failure": "Uniform failure — closed-loop scored zero on every task", } _BAND_STYLES: dict[SaturationBand, str] = { diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py index da33700..4dda5a0 100644 --- a/tests/core/test_saturation_check.py +++ b/tests/core/test_saturation_check.py @@ -50,6 +50,15 @@ def test_uniform_failure_when_closed_loop_below_threshold(self): ) assert band == "uniform_failure" assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions) + # The "first check the validator actually ran" hint guards against + # the historical silent-failure: hermes -m treated litellm-formatted + # model strings as openrouter routing, broke auth, returned 0-turn + # sessions, and the framework reported it as "validator too weak." + # The hint points users at the run.log line that confirms routing. + assert any( + "stripped litellm" in s.lower() or "run.log" in s.lower() or "routed correctly" in s.lower() + for s in suggestions + ) def test_boundary_exactly_at_no_headroom_synthetic_triggers(self): """0.99 exactly should trigger no_headroom (>= comparison)."""