From 6e65a94f20c11acd05ec251ba3214eb998046077 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Fri, 22 May 2026 19:12:30 -0600 Subject: [PATCH] fix: strip LiteLLM provider prefix before hermes -m MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closed-loop validation has been silently scoring 0/N for any user who passed a LiteLLM-formatted model string (e.g. `openai/gpt-4o-mini`) to `--closed-loop-agent-model`. The hermes `-m` flag interprets `<provider>/<model>` as openrouter-style routing, which switches the subprocess base_url to openrouter.ai. An OpenAI key in the user's hermes config isn't valid for openrouter, so the agent loop dies with no turn and the framework reports it as `uniform_failure` ("validator too weak"), hiding the real cause. Strip known LiteLLM provider prefixes in HermesAgentRunner.__init__ so users get the behavior they expect from the model string they use everywhere else in the framework. Unknown prefixes pass through, so openrouter-style routing through an unrecognized vendor still works. Verified end-to-end: same probe that previously reported 0/7 (with the bug) now reports 7/7 with `gpt-5.4-mini` and 6/7 with `gpt-5-mini` — the validator was working all along, just routed wrong. --- evolution/validation/hermes_runner.py | 47 +++++++++++++++++- tests/validation/test_hermes_runner.py | 66 ++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index 4a4dacd1..f93e118c 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -32,6 +32,36 @@ _FINAL_TEXT_TAIL_BYTES = 4096 DEFAULT_TASK_TIMEOUT_SECONDS = 120 +# Known LiteLLM provider prefixes the rest of the framework uses +# (DSPy / litellm convention: ``<provider>/<model>``). 
The hermes -m flag +# interprets the same shape as openrouter-style routing — passing +# ``openai/gpt-4o-mini`` silently switches base_url to openrouter.ai and +# breaks auth for direct-provider configs, producing a 0-turn session that +# the saturation pre-flight misreports as "validator too weak". We strip +# these prefixes at the hermes boundary so users get the behavior they +# expect when they pass the same model string they use elsewhere. +_LITELLM_PROVIDER_PREFIXES = ( + "openai/", + "anthropic/", + "azure/", + "gemini/", + "cohere/", + "bedrock/", + "mistral/", +) + + +def _strip_litellm_provider_prefix(model: str) -> str: + """Strip a known LiteLLM provider prefix from a model name. + + Returns ``model`` unchanged when no recognized prefix is present, so + openrouter-style routing through an unrecognized vendor still works. + """ + for prefix in _LITELLM_PROVIDER_PREFIXES: + if model.startswith(prefix): + return model[len(prefix):] + return model + class HermesAgentRunner: """Invoke ``hermes -z`` and parse the resulting session JSON. @@ -63,7 +93,22 @@ def __init__( # against a deliberately weaker agent model than the user's # daily-driver default — saturation on capable models hides # behavioral signal that a weaker model would expose. - self.model = model + # + # Normalize LiteLLM-style provider prefixes (``openai/``, etc.) + # before storing: hermes -m treats ``<provider>/<model>`` as + # openrouter routing which silently switches the base_url. See + # ``_strip_litellm_provider_prefix`` for the full rationale. 
+ if model is not None: + normalized = _strip_litellm_provider_prefix(model) + if normalized != model: + logger.info( + "Stripped LiteLLM provider prefix from hermes -m model: " + "%r → %r (avoids accidental openrouter routing)", + model, normalized, + ) + self.model = normalized + else: + self.model = None def run(self, ctx: TaskRunContext) -> AgentRunResult: message = ctx.user_message diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index c56faa9e..d970c29e 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -19,10 +19,43 @@ from evolution.validation.agent_runner import TaskRunContext from evolution.validation.hermes_runner import ( HermesAgentRunner, + _strip_litellm_provider_prefix, parse_session_result, ) +class TestStripLitellmProviderPrefix: + """The hermes -m flag interprets '<provider>/<model>' as openrouter-style + routing. Direct-provider users naturally pass litellm-formatted names + like 'openai/gpt-4o-mini' from elsewhere in the framework; this helper + normalizes them back to bare model names that hermes -m accepts.""" + + def test_strips_openai_prefix(self): + assert _strip_litellm_provider_prefix("openai/gpt-4o-mini") == "gpt-4o-mini" + + def test_strips_anthropic_prefix(self): + assert _strip_litellm_provider_prefix("anthropic/claude-opus-4-7") == "claude-opus-4-7" + + def test_strips_azure_prefix(self): + assert _strip_litellm_provider_prefix("azure/gpt-4") == "gpt-4" + + def test_strips_gemini_prefix(self): + assert _strip_litellm_provider_prefix("gemini/gemini-1.5-pro") == "gemini-1.5-pro" + + def test_leaves_bare_model_unchanged(self): + assert _strip_litellm_provider_prefix("gpt-4o-mini") == "gpt-4o-mini" + + def test_leaves_unknown_prefix_unchanged(self): + # Unknown providers pass through so openrouter-style routing through + # an unrecognized vendor still works. 
+ assert _strip_litellm_provider_prefix("custom-vendor/model-x") == "custom-vendor/model-x" + + def test_only_strips_first_prefix(self): + # Defensive: nested prefixes (e.g. openrouter passthrough) shouldn't be + # double-stripped — keep the second segment intact. + assert _strip_litellm_provider_prefix("openai/foo/bar") == "foo/bar" + + def _write_session(path: Path, messages: list[dict], **extra) -> None: payload = { "session_id": "test", @@ -346,6 +379,39 @@ def _fake_run(*args, **kwargs): assert "-m" not in captured["args"] assert captured["args"] == ["hermes", "-z", "hello"] + def test_litellm_prefix_stripped_before_minus_m(self, fixture_dir, tmp_path): + """Regression: hermes -m treats '<provider>/<model>' as openrouter-style + routing (silently switches base_url to openrouter.ai and breaks auth + for direct-provider configs). Users naturally pass litellm-formatted + names like 'openai/gpt-4o-mini' from elsewhere in the framework, so + the runner must strip known litellm provider prefixes before -m to + avoid a silent 0-turn 'agent never ran' failure that misreports as + 'validator too weak' at the saturation pre-flight.""" + runner = HermesAgentRunner( + user_config_path=tmp_path / "nonexistent", + model="openai/gpt-4o-mini", + ) + captured: dict = {} + + def _fake_run(*args, **kwargs): + captured["args"] = args[0] if args else kwargs.get("args") + sandbox = Path(kwargs["env"]["HERMES_HOME"]) + (sandbox / "sessions").mkdir(exist_ok=True) + _write_session( + sandbox / "sessions" / "session_test.json", + [{"role": "assistant", "content": "ok"}], + ) + return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): + runner.run(TaskRunContext( + user_message="debug", + fixture_dir=fixture_dir, + )) + + assert captured["args"][:4] == ["hermes", "-m", "gpt-4o-mini", "-z"] + assert "openai/" not in captured["args"][2] + def test_skills_src_none_means_no_skills_dir_created(self, 
fixture_dir, tmp_path): """Tool-side runs (no skills_src) must not create an empty skills/ directory in the sandbox — keeps the legacy code path bit-for-bit."""