From d8552085ea170f63bacc027c442f8e7210bfa3e1 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Fri, 22 May 2026 19:36:36 -0600
Subject: [PATCH] chore: tighten uniform_failure suggestions after
 prefix-routing fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "validator appears too weak" suggestion has historically been
actively misleading: hermes -m treated LiteLLM provider prefixes as
openrouter routing, which broke auth and returned 0-turn sessions that
the framework counted as task failures. Users (and reviewers) followed
the suggestion and bumped model strength when the actual fix was
routing.

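Now that the routing bug is fixed (#66), the residual uniform_failure
cases are more likely to be misconfiguration than capability. Open the
suggestion list with the plain observation (baseline scored 0 on every
behavioral task), follow it with "first check the validator actually
ran", and point users at the run.log line that confirms routing. The
"try a stronger model / harden the suite" advice is kept, but only as
the last step once the validator is known to have run.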

Panel title softened from "Uniform failure — validator too weak" to
"Uniform failure — closed-loop scored zero on every task": an
observation, not a diagnosis.
---
 evolution/core/saturation_check.py  | 8 ++++----
 tests/core/test_saturation_check.py | 9 +++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
index 90acdb5..a8eaec8 100644
--- a/evolution/core/saturation_check.py
+++ b/evolution/core/saturation_check.py
@@ -60,9 +60,9 @@ def _classify_band(
 
     if closed_loop_score is not None and closed_loop_score <= uniform_cl:
         return "uniform_failure", [
-            "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.",
-            "Try a stronger --closed-loop-agent-model.",
-            "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.",
+            "Baseline scored 0 on every behavioral task — GEPA has nothing to optimize for.",
+            "First check the validator actually ran: look in run.log for a 'Stripped LiteLLM provider prefix' line confirming --closed-loop-agent-model routed correctly, and for a non-zero number of subprocess LM calls.",
+            "If the validator did run: try a stronger --closed-loop-agent-model, or harden the suite tasks so failure modes are interesting rather than 'model can't execute the task.'",
         ]
 
     synthetic_saturated = holdout_score >= no_head_syn
@@ -200,7 +200,7 @@ def saturation_preflight(
     "healthy": "Saturation check passed",
     "no_headroom": "No measurable headroom",
     "weak_signal": "Weak signal — expect a hard run",
-    "uniform_failure": "Uniform failure — validator too weak",
+    "uniform_failure": "Uniform failure — closed-loop scored zero on every task",
 }
 
 _BAND_STYLES: dict[SaturationBand, str] = {
diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py
index da33700..4dda5a0 100644
--- a/tests/core/test_saturation_check.py
+++ b/tests/core/test_saturation_check.py
@@ -50,6 +50,15 @@ def test_uniform_failure_when_closed_loop_below_threshold(self):
         )
         assert band == "uniform_failure"
         assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions)
+        # The "first check the validator actually ran" hint guards against
+        # the historical silent-failure: hermes -m treated litellm-formatted
+        # model strings as openrouter routing, broke auth, returned 0-turn
+        # sessions, and the framework reported it as "validator too weak."
+        # The hint points users at the run.log line that confirms routing.
+        assert any(
+            "stripped litellm" in s.lower() or "run.log" in s.lower() or "routed correctly" in s.lower()
+            for s in suggestions
+        )
 
     def test_boundary_exactly_at_no_headroom_synthetic_triggers(self):
         """0.99 exactly should trigger no_headroom (>= comparison)."""