From d8552085ea170f63bacc027c442f8e7210bfa3e1 Mon Sep 17 00:00:00 2001
From: Justin Ramos
Date: Fri, 22 May 2026 19:36:36 -0600
Subject: [PATCH] chore: tighten uniform_failure suggestions after prefix-routing fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "validator appears too weak" suggestion was actively misleading
historically: hermes -m treated LiteLLM provider prefixes as openrouter
routing, breaking auth and returning 0-turn sessions that the framework
counted as task failures. Users (and reviewers) followed the suggestion
to bump model strength when the actual fix was routing.

Now that the routing bug is fixed (#66), the residual uniform_failure
cases are more likely to be misconfiguration than capability. The
suggestion list now opens with the plain observation (baseline scored 0
on every behavioral task), then leads the actionable advice with "first
check the validator actually ran" and points users at the run.log line
that confirms routing; the "try a stronger model" advice comes last.

Panel title softened from "validator too weak" to "closed-loop scored
zero on every task" — observation, not diagnosis.
--- evolution/core/saturation_check.py | 8 ++++---- tests/core/test_saturation_check.py | 9 +++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py index 90acdb5..a8eaec8 100644 --- a/evolution/core/saturation_check.py +++ b/evolution/core/saturation_check.py @@ -60,9 +60,9 @@ def _classify_band( if closed_loop_score is not None and closed_loop_score <= uniform_cl: return "uniform_failure", [ - "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.", - "Try a stronger --closed-loop-agent-model.", - "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.", + "Baseline scored 0 on every behavioral task — GEPA has nothing to optimize for.", + "First check the validator actually ran: look in run.log for a 'Stripped LiteLLM provider prefix' line confirming --closed-loop-agent-model routed correctly, and for a non-zero number of subprocess LM calls.", + "If the validator did run: try a stronger --closed-loop-agent-model, or harden the suite tasks so failure modes are interesting rather than 'model can't execute the task.'", ] synthetic_saturated = holdout_score >= no_head_syn @@ -200,7 +200,7 @@ def saturation_preflight( "healthy": "Saturation check passed", "no_headroom": "No measurable headroom", "weak_signal": "Weak signal — expect a hard run", - "uniform_failure": "Uniform failure — validator too weak", + "uniform_failure": "Uniform failure — closed-loop scored zero on every task", } _BAND_STYLES: dict[SaturationBand, str] = { diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py index da33700..4dda5a0 100644 --- a/tests/core/test_saturation_check.py +++ b/tests/core/test_saturation_check.py @@ -50,6 +50,15 @@ def test_uniform_failure_when_closed_loop_below_threshold(self): ) assert band == "uniform_failure" assert any("validator" in s.lower() or "stronger" in s.lower() for s 
in suggestions) + # The "first check the validator actually ran" hint guards against + # the historical silent-failure: hermes -m treated litellm-formatted + # model strings as openrouter routing, broke auth, returned 0-turn + # sessions, and the framework reported it as "validator too weak." + # The hint points users at the run.log line that confirms routing. + assert any( + "stripped litellm" in s.lower() or "run.log" in s.lower() or "routed correctly" in s.lower() + for s in suggestions + ) def test_boundary_exactly_at_no_headroom_synthetic_triggers(self): """0.99 exactly should trigger no_headroom (>= comparison)."""