Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session.
# Direct API via OpenRouter
./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5

# Direct API via Kilo Gateway
./scripts/run.sh --model openai/gpt-4o --judge kilo/anthropic/claude-sonnet-4-5

# Direct API via Anthropic
./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250514

Expand All @@ -126,7 +129,7 @@ By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session.
./scripts/run.sh --model openai/gpt-4o --judge claude
```

Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.
Required env vars: `OPENROUTER_API_KEY`, `KILO_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.

## Contributing Tasks

Expand Down Expand Up @@ -164,4 +167,3 @@ MIT — see [LICENSE](LICENSE) for details.
---

_Claw-some AI agent testing_ 🦞

5 changes: 3 additions & 2 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,9 @@ def _parse_args() -> argparse.Namespace:
default=None,
help=(
"Judge model or backend. Default (unset): OpenClaw agent session with "
"openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API "
"directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)"
"openrouter/anthropic/claude-haiku-4.5. Set to a model ID to call its API "
"directly (e.g. kilo/anthropic/claude-sonnet-4-5, openai/gpt-4o, "
"anthropic/claude-sonnet-4-5-20250514, claude)"
),
)
parser.add_argument(
Expand Down
17 changes: 17 additions & 0 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,6 +1185,7 @@ def call_judge_api(

Dispatches based on model prefix:
- openrouter/* -> OpenRouter chat completions API
- kilo/* -> Kilo Gateway chat completions API
- anthropic/* -> Anthropic Messages API
- openai/* -> OpenAI chat completions API
- claude -> headless Claude CLI (claude -p)
Expand All @@ -1193,6 +1194,8 @@ def call_judge_api(
"""
if model == "claude" or model.startswith("claude:"):
return _judge_via_claude_cli(prompt, model, timeout_seconds)
if model.startswith("kilo/"):
return _judge_via_kilo(prompt, model, timeout_seconds)
if model.startswith("anthropic/"):
return _judge_via_anthropic(prompt, model, timeout_seconds)
if model.startswith("openai/"):
Expand Down Expand Up @@ -1265,6 +1268,20 @@ def _judge_via_openrouter(prompt: str, model: str, timeout_seconds: float) -> Di
)


def _judge_via_kilo(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
    """Send a judge prompt to the Kilo Gateway chat completions endpoint.

    Args:
        prompt: Judge prompt text, forwarded unchanged.
        model: Judge model ID carrying the ``kilo/`` routing prefix, e.g.
            ``kilo/anthropic/claude-sonnet-4-5``.
        timeout_seconds: Timeout forwarded to ``_judge_via_openai_compat``.

    Returns:
        Whatever ``_judge_via_openai_compat`` returns, or an error dict with
        ``status``/``text``/``error`` keys when ``KILO_API_KEY`` is unset or
        empty, or when nothing follows the ``kilo/`` prefix.
    """
    api_key = os.environ.get("KILO_API_KEY")
    if not api_key:
        return {"status": "error", "text": "", "error": "KILO_API_KEY not set"}

    # The prefix only selects this backend; the request carries the model ID
    # without it.
    bare_model = model.removeprefix("kilo/")
    if not bare_model:
        # A bare "kilo/" leaves no model to request; fail locally instead of
        # sending an empty model name (presumably the gateway would reject it).
        return {
            "status": "error",
            "text": "",
            "error": "kilo/ judge model is missing a model name",
        }

    return _judge_via_openai_compat(
        prompt,
        bare_model,
        "https://api.kilo.ai/api/gateway/chat/completions",
        api_key,
        timeout_seconds,
    )


def _judge_via_openai(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
Expand Down
102 changes: 102 additions & 0 deletions tests/test_lib_agent_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from __future__ import annotations

import json
import os
import sys
import unittest
from pathlib import Path
from unittest.mock import patch


# Put the repository's scripts/ directory at the front of sys.path (only once)
# so the ``lib_agent`` import that follows can be resolved.
ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = ROOT / "scripts"
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from lib_agent import call_judge_api # noqa: E402


class _FakeResponse:
    """Minimal stand-in for the object returned by a patched ``urlopen``.

    Supports only what these tests exercise: use as a context manager, and a
    ``read()`` that returns a JSON chat-completions body as UTF-8 bytes.

    Args:
        content: Text placed in ``choices[0].message.content`` of the body.
            The default is the payload the existing tests assert on.
    """

    def __init__(self, content: str = '{"total": 1.0}') -> None:
        self._content = content

    def __enter__(self) -> "_FakeResponse":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Returning None lets an exception raised inside the ``with`` body
        # propagate instead of being suppressed.
        return None

    def read(self) -> bytes:
        """Return the canned response body encoded as UTF-8 JSON."""
        body = {"choices": [{"message": {"content": self._content}}]}
        return json.dumps(body).encode("utf-8")


class KiloJudgeTests(unittest.TestCase):
    """Tests for the ``kilo/`` model prefix handled by ``call_judge_api``."""

    def test_call_judge_api_kilo_requires_kilo_api_key(self) -> None:
        """With an empty environment the call returns the missing-key error."""
        with patch.dict(os.environ, {}, clear=True):
            result = call_judge_api(
                prompt="grade this",
                model="kilo/anthropic/claude-sonnet-4-5",
            )

        self.assertEqual(result["status"], "error")
        self.assertEqual(result["text"], "")
        self.assertEqual(result["error"], "KILO_API_KEY not set")

    def test_call_judge_api_kilo_posts_to_gateway_with_bare_model(self) -> None:
        """The request goes to the Kilo URL with the prefix stripped from the model."""
        calls = []

        def fake_urlopen(req, timeout):
            # Record only. Assertions run after the call so that a failure
            # cannot be caught by exception handling in the code under test
            # (NOTE(review): lib_agent's handling is not visible here).
            calls.append((req, timeout))
            return _FakeResponse()

        env = patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True)
        urlopen = patch("lib_agent.request.urlopen", side_effect=fake_urlopen)
        with env, urlopen:
            result = call_judge_api(
                prompt="grade this",
                model="kilo/anthropic/claude-sonnet-4-5",
                timeout_seconds=12.5,
            )

        self.assertEqual(result["status"], "success")
        self.assertEqual(result["text"], '{"total": 1.0}')

        self.assertTrue(calls, "urlopen was never called")
        for _, seen_timeout in calls:
            self.assertEqual(seen_timeout, 12.5)

        captured_request = calls[-1][0]
        self.assertEqual(
            captured_request.full_url,
            "https://api.kilo.ai/api/gateway/chat/completions",
        )
        self.assertEqual(captured_request.get_method(), "POST")
        # urllib's Request stores header names capitalized ("Content-type").
        self.assertEqual(captured_request.headers["Authorization"], "Bearer test-key")
        self.assertEqual(captured_request.headers["Content-type"], "application/json")

        payload = json.loads(captured_request.data.decode("utf-8"))
        self.assertEqual(payload["model"], "anthropic/claude-sonnet-4-5")
        self.assertEqual(payload["temperature"], 0.0)
        self.assertEqual(payload["max_completion_tokens"], 2048)
        self.assertEqual(payload["messages"][0]["role"], "system")
        self.assertEqual(payload["messages"][1], {"role": "user", "content": "grade this"})

    def test_call_judge_api_kilo_dispatch_does_not_fall_back_to_openrouter(self) -> None:
        """A ``kilo/openai/...`` model reaches the compat helper with the Kilo URL and key."""
        env = patch.dict(os.environ, {"KILO_API_KEY": "test-key"}, clear=True)
        compat_patch = patch(
            "lib_agent._judge_via_openai_compat",
            return_value={"status": "success", "text": "ok"},
        )
        with env, compat_patch as compat:
            result = call_judge_api(
                prompt="grade this",
                model="kilo/openai/gpt-4o",
                timeout_seconds=30,
            )

        self.assertEqual(result, {"status": "success", "text": "ok"})
        compat.assert_called_once_with(
            "grade this",
            "openai/gpt-4o",
            "https://api.kilo.ai/api/gateway/chat/completions",
            "test-key",
            30,
        )


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading