pinchbench · olearycrew · May 14, 2026 · May 13, 2026
diff --git a/README.md b/README.md
@@ -116,6 +116,9 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session.
 # Direct API via OpenRouter
 ./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5
 
+# Direct API via Kilo Gateway
+./scripts/run.sh --model openai/gpt-4o --judge kilo/anthropic/claude-sonnet-4-5
+
 # Direct API via Anthropic
 ./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250514
 
@@ -126,7 +129,7 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session.
 ./scripts/run.sh --model openai/gpt-4o --judge claude
 ```
 
-Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.
+Required env vars: `OPENROUTER_API_KEY`, `KILO_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.
 
 ## Contributing Tasks
 
@@ -164,4 +167,3 @@ MIT — see [LICENSE](LICENSE) for details.
 ---
 
 _Claw-some AI agent testing_ 🦞
-
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -239,8 +239,9 @@ def _parse_args() -> argparse.Namespace:
         default=None,
         help=(
             "Judge model or backend. Default (unset): OpenClaw agent session with "
-            "openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API "
-            "directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)"
+            "openrouter/anthropic/claude-haiku-4.5. Set to a model ID to call its API "
+            "directly (e.g. kilo/anthropic/claude-sonnet-4-5, openai/gpt-4o, "
+            "anthropic/claude-sonnet-4-5-20250514, claude)"
         ),
     )
     parser.add_argument(

diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
@@ -1185,6 +1185,7 @@ def call_judge_api(
 
     Dispatches based on model prefix:
       - openrouter/* -> OpenRouter chat completions API
+      - kilo/*       -> Kilo Gateway chat completions API
       - anthropic/*  -> Anthropic Messages API
       - openai/*     -> OpenAI chat completions API
       - claude       -> headless Claude CLI (claude -p)
@@ -1193,6 +1194,8 @@ def call_judge_api(
     """
     if model == "claude" or model.startswith("claude:"):
         return _judge_via_claude_cli(prompt, model, timeout_seconds)
+    if model.startswith("kilo/"):
+        return _judge_via_kilo(prompt, model, timeout_seconds)
     if model.startswith("anthropic/"):
         return _judge_via_anthropic(prompt, model, timeout_seconds)
     if model.startswith("openai/"):
@@ -1265,6 +1268,20 @@ def _judge_via_openrouter(prompt: str, model: str, timeout_seconds: float) -> Di
     )
 
 
+def _judge_via_kilo(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:  # judge backend for "kilo/*" model IDs
+    api_key = os.environ.get("KILO_API_KEY")
+    if not api_key:  # unset or empty key: report an error result instead of raising
+        return {"status": "error", "text": "", "error": "KILO_API_KEY not set"}
+    bare_model = model.removeprefix("kilo/")  # drop the "kilo/" routing prefix; the rest is sent as the model ID
+    return _judge_via_openai_compat(  # delegate the request itself to the shared helper
+        prompt,
+        bare_model,
+        "https://api.kilo.ai/api/gateway/chat/completions",
+        api_key,
+        timeout_seconds,
+    )
+
+
 def _judge_via_openai(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:

diff --git a/tests/test_lib_agent_judge.py b/tests/test_lib_agent_judge.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+import json
+import os
+import sys
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+
+ROOT = Path(__file__).resolve().parents[1]  # directory one level above the one holding this test file
+SCRIPTS_DIR = ROOT / "scripts"
+if str(SCRIPTS_DIR) not in sys.path:  # avoid inserting a duplicate entry
+    sys.path.insert(0, str(SCRIPTS_DIR))  # front of sys.path so "lib_agent" below resolves from scripts/
+
+from lib_agent import call_judge_api  # noqa: E402
+
+
+class _FakeResponse:  # stand-in for the object the patched urlopen returns: a context manager with read()
+    def __enter__(self) -> "_FakeResponse":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        return None  # falsy return: exceptions raised inside the with-block are not suppressed
+
+    def read(self) -> bytes:  # canned JSON body with a single choice, UTF-8 encoded
+        return json.dumps(
+            {"choices": [{"message": {"content": '{"total": 1.0}'}}]}
+        ).encode("utf-8")
+
+
+class KiloJudgeTests(unittest.TestCase):  # tests for how call_judge_api handles "kilo/*" model IDs
+    def test_call_judge_api_kilo_requires_kilo_api_key(self) -> None:
+        with patch.dict(os.environ, {}, clear=True):  # empty environment, so KILO_API_KEY is absent
+            result = call_judge_api(
+                prompt="grade this",
+                model="kilo/anthropic/claude-sonnet-4-5",
+            )
+
+        self.assertEqual(result["status"], "error")
+        self.assertEqual(result["text"], "")
+        self.assertEqual(result["error"], "KILO_API_KEY not set")
+
+    def test_call_judge_api_kilo_posts_to_gateway_with_bare_model(self) -> None:
+        captured_request = None
+
+        def fake_urlopen(req, timeout):  # replaces urlopen: records the request, returns a canned response
+            nonlocal captured_request
+            captured_request = req
+            self.assertEqual(timeout, 12.5)  # timeout_seconds must reach urlopen unchanged
+            return _FakeResponse()
+
+        with patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True), patch(
+            "lib_agent.request.urlopen", side_effect=fake_urlopen
+        ):
+            result = call_judge_api(
+                prompt="grade this",
+                model="kilo/anthropic/claude-sonnet-4-5",
+                timeout_seconds=12.5,
+            )
+
+        self.assertEqual(result["status"], "success")
+        self.assertEqual(result["text"], '{"total": 1.0}')  # the message content from _FakeResponse.read()
+        self.assertIsNotNone(captured_request)  # proves fake_urlopen was actually invoked
+        self.assertEqual(
+            captured_request.full_url,
+            "https://api.kilo.ai/api/gateway/chat/completions",
+        )
+        self.assertEqual(captured_request.get_method(), "POST")
+        self.assertEqual(captured_request.headers["Authorization"], "Bearer test-key")
+        self.assertEqual(captured_request.headers["Content-type"], "application/json")  # NOTE(review): key casing assumes req is a urllib.request.Request, which capitalize()s header names — confirm
+
+        payload = json.loads(captured_request.data.decode("utf-8"))
+        self.assertEqual(payload["model"], "anthropic/claude-sonnet-4-5")  # "kilo/" prefix stripped before sending
+        self.assertEqual(payload["temperature"], 0.0)
+        self.assertEqual(payload["max_completion_tokens"], 2048)
+        self.assertEqual(payload["messages"][0]["role"], "system")
+        self.assertEqual(payload["messages"][1], {"role": "user", "content": "grade this"})
+
+    def test_call_judge_api_kilo_dispatch_does_not_fall_back_to_openrouter(self) -> None:
+        with patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True), patch(
+            "lib_agent._judge_via_openai_compat",
+            return_value={"status": "success", "text": "ok"},
+        ) as compat:  # stub the shared helper so its call arguments can be inspected
+            result = call_judge_api(
+                prompt="grade this",
+                model="kilo/openai/gpt-4o",
+                timeout_seconds=30,
+            )
+
+        self.assertEqual(result, {"status": "success", "text": "ok"})  # the stub's return value is passed through as-is
+        compat.assert_called_once_with(  # exact positional call: prompt, bare model, gateway URL, key, timeout
+            "grade this",
+            "openai/gpt-4o",
+            "https://api.kilo.ai/api/gateway/chat/completions",
+            "test-key",
+            30,
+        )
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()