diff --git a/README.md b/README.md index 66d8a979..6c005bce 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,9 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session. # Direct API via OpenRouter ./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5 +# Direct API via Kilo Gateway +./scripts/run.sh --model openai/gpt-4o --judge kilo/anthropic/claude-sonnet-4-5 + # Direct API via Anthropic ./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250514 @@ -126,7 +129,7 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session. ./scripts/run.sh --model openai/gpt-4o --judge claude ``` -Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix. +Required env vars: `OPENROUTER_API_KEY`, `KILO_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix. ## Contributing Tasks @@ -164,4 +167,3 @@ MIT — see [LICENSE](LICENSE) for details. --- _Claw-some AI agent testing_ 🦞 - diff --git a/scripts/benchmark.py b/scripts/benchmark.py index ee392b6d..2cfbe786 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -239,8 +239,9 @@ def _parse_args() -> argparse.Namespace: default=None, help=( "Judge model or backend. Default (unset): OpenClaw agent session with " - "openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API " - "directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)" + "openrouter/anthropic/claude-haiku-4.5. Set to a model ID to call its API " + "directly (e.g. kilo/anthropic/claude-sonnet-4-5, openai/gpt-4o, " + "anthropic/claude-sonnet-4-5-20250514, claude)" ), ) parser.add_argument( diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 62070298..ec2d3c48 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -1185,6 +1185,7 @@ def call_judge_api( Dispatches based on model prefix: - openrouter/* -> OpenRouter chat completions API + - kilo/* -> Kilo Gateway chat completions API - anthropic/* -> Anthropic Messages API - openai/* -> OpenAI chat completions API - claude -> headless Claude CLI (claude -p) @@ -1193,6 +1194,8 @@ def call_judge_api( """ if model == "claude" or model.startswith("claude:"): return _judge_via_claude_cli(prompt, model, timeout_seconds) + if model.startswith("kilo/"): + return _judge_via_kilo(prompt, model, timeout_seconds) if model.startswith("anthropic/"): return _judge_via_anthropic(prompt, model, timeout_seconds) if model.startswith("openai/"): @@ -1265,6 +1268,20 @@ def _judge_via_openrouter(prompt: str, model: str, timeout_seconds: float) -> Di ) +def _judge_via_kilo(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: + api_key = os.environ.get("KILO_API_KEY") + if not api_key: + return {"status": "error", "text": "", "error": "KILO_API_KEY not set"} + bare_model = model.removeprefix("kilo/") + return _judge_via_openai_compat( + prompt, + bare_model, + "https://api.kilo.ai/api/gateway/chat/completions", + api_key, + timeout_seconds, + ) + + def _judge_via_openai(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: api_key = os.environ.get("OPENAI_API_KEY") if not api_key: diff --git a/tests/test_lib_agent_judge.py b/tests/test_lib_agent_judge.py new file mode 100644 index 00000000..42e65c7f --- /dev/null +++ b/tests/test_lib_agent_judge.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import json +import os +import sys +import unittest +from pathlib import Path +from unittest.mock import patch + + +ROOT = Path(__file__).resolve().parents[1] +SCRIPTS_DIR = ROOT / "scripts" +if str(SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPTS_DIR)) + +from lib_agent import call_judge_api # noqa: E402 + + +class _FakeResponse: + def __enter__(self) -> "_FakeResponse": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + return None + + def read(self) -> bytes: + return json.dumps( + {"choices": [{"message": {"content": '{"total": 1.0}'}}]} + ).encode("utf-8") + + +class KiloJudgeTests(unittest.TestCase): + def test_call_judge_api_kilo_requires_kilo_api_key(self) -> None: + with patch.dict(os.environ, {}, clear=True): + result = call_judge_api( + prompt="grade this", + model="kilo/anthropic/claude-sonnet-4-5", + ) + + self.assertEqual(result["status"], "error") + self.assertEqual(result["text"], "") + self.assertEqual(result["error"], "KILO_API_KEY not set") + + def test_call_judge_api_kilo_posts_to_gateway_with_bare_model(self) -> None: + captured_request = None + + def fake_urlopen(req, timeout): + nonlocal captured_request + captured_request = req + self.assertEqual(timeout, 12.5) + return _FakeResponse() + + with patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True), patch( + "lib_agent.request.urlopen", side_effect=fake_urlopen + ): + result = call_judge_api( + prompt="grade this", + model="kilo/anthropic/claude-sonnet-4-5", + timeout_seconds=12.5, + ) + + self.assertEqual(result["status"], "success") + self.assertEqual(result["text"], '{"total": 1.0}') + self.assertIsNotNone(captured_request) + self.assertEqual( + captured_request.full_url, + "https://api.kilo.ai/api/gateway/chat/completions", + ) + self.assertEqual(captured_request.get_method(), "POST") + self.assertEqual(captured_request.headers["Authorization"], "Bearer test-key") + self.assertEqual(captured_request.headers["Content-type"], "application/json") + + payload = json.loads(captured_request.data.decode("utf-8")) + self.assertEqual(payload["model"], "anthropic/claude-sonnet-4-5") + self.assertEqual(payload["temperature"], 0.0) + self.assertEqual(payload["max_completion_tokens"], 2048) + self.assertEqual(payload["messages"][0]["role"], "system") + self.assertEqual(payload["messages"][1], {"role": "user", "content": "grade this"}) + + def test_call_judge_api_kilo_dispatch_does_not_fall_back_to_openrouter(self) -> None: + with patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True), patch( + "lib_agent._judge_via_openai_compat", + return_value={"status": "success", "text": "ok"}, + ) as compat: + result = call_judge_api( + prompt="grade this", + model="kilo/openai/gpt-4o", + timeout_seconds=30, + ) + + self.assertEqual(result, {"status": "success", "text": "ok"}) + compat.assert_called_once_with( + "grade this", + "openai/gpt-4o", + "https://api.kilo.ai/api/gateway/chat/completions", + "test-key", + 30, + ) + + +if __name__ == "__main__": + unittest.main()