Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,9 @@ skill-validator score evaluate --provider claude-cli <path>
|---|---|---|---|
| `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic |
| `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. |
| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) |
| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) \* |

\* **Accuracy note:** The `claude-cli` provider shells out to the `claude` CLI, which loads local context (CLAUDE.md files, project memory, rules) into each scoring call. This extra context may influence scores, making them less reproducible across environments than scores from the API-based providers. For the most consistent results, use the `anthropic` or `openai` providers with an API key.

Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection:

Expand Down
4 changes: 3 additions & 1 deletion examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ improve the skill content before requesting a human review.
- OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a
placeholder) and provide the `--base-url` when prompted.
- Claude CLI: No API key needed — uses the locally authenticated `claude`
binary (e.g. via a company or team subscription).
binary (e.g. via a company or team subscription). Note: scores may be less
consistent than with API-based providers because the CLI loads local context
(CLAUDE.md, memory) into each call.
4. Add `.score_cache/` to your `.gitignore`. LLM scoring caches results inside
each skill directory, and these should not be committed.
5. Ask your agent to review a skill. The skill stores configuration in
Expand Down
6 changes: 6 additions & 0 deletions examples/review-skill/references/llm-scoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ installation instructions.
The default model is `sonnet`. The user can specify a different model with the
`--model` flag (e.g. `--model opus`).

**Accuracy note:** The Claude CLI loads local context (CLAUDE.md files, project
memory, rules) into each scoring call. This extra context may influence scores,
making them less reproducible across environments than scores from the
API-based providers. For the most consistent results, use the `anthropic` or
`openai` providers with an API key.

### OpenAI-compatible provider

This uses the OpenAI provider with a custom `--base-url`. It supports any
Expand Down
15 changes: 14 additions & 1 deletion judge/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ import (
// that a hanging upstream doesn't block the caller indefinitely.
var defaultHTTPClient = &http.Client{Timeout: 30 * time.Second}

// lookPath resolves an executable name to a path and defaults to
// exec.LookPath. NewClient calls it to check that the "claude" binary is
// available before it builds a claude-cli client. It is a package-level
// variable rather than a direct call so tests can substitute a stub when the
// real binary is not installed.
var lookPath = exec.LookPath

// LLMClient is the interface for making LLM API calls.
type LLMClient interface {
// Complete sends a system prompt and user content to the LLM and returns the text response.
Expand Down Expand Up @@ -52,6 +56,9 @@ func NewClient(opts ClientOptions) (LLMClient, error) {

switch strings.ToLower(opts.Provider) {
case "claude-cli":
if _, err := lookPath("claude"); err != nil {
return nil, fmt.Errorf("claude-cli provider requires the \"claude\" binary: %w", err)
}
model := opts.Model
if model == "" {
model = "sonnet"
Expand Down Expand Up @@ -302,7 +309,8 @@ type claudeCLIClient struct {
func (c *claudeCLIClient) Provider() string { return "claude-cli" }
func (c *claudeCLIClient) ModelName() string { return c.model }

func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
// buildArgs returns the CLI arguments for a claude invocation.
func (c *claudeCLIClient) buildArgs(systemPrompt, userContent string) []string {
args := []string{
"-p",
"--output-format", "text",
Expand All @@ -312,6 +320,11 @@ func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userConten
args = append(args, "--system-prompt", systemPrompt)
}
args = append(args, userContent)
return args
}

func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
args := c.buildArgs(systemPrompt, userContent)

cmd := exec.CommandContext(ctx, "claude", args...)
var stdout, stderr bytes.Buffer
Expand Down
61 changes: 61 additions & 0 deletions judge/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,28 @@ package judge

import (
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
)

// stubLookPath swaps the package-level lookPath for a fake and registers a
// cleanup that puts the previous value back when the test finishes.
//
// When found is true the fake reports every file as living under /usr/bin;
// otherwise it returns a "not found" error for every lookup.
func stubLookPath(t *testing.T, found bool) {
	t.Helper()

	saved := lookPath
	t.Cleanup(func() { lookPath = saved })

	lookPath = func(name string) (string, error) {
		if !found {
			return "", fmt.Errorf("not found: %s", name)
		}
		return "/usr/bin/" + name, nil
	}
}

func TestClaudeCLIClientDefaults(t *testing.T) {
stubLookPath(t, true)
client, err := NewClient(ClientOptions{Provider: "claude-cli"})
if err != nil {
t.Fatalf("NewClient: %v", err)
Expand All @@ -21,6 +37,7 @@ func TestClaudeCLIClientDefaults(t *testing.T) {
}

func TestClaudeCLIClientCustomModel(t *testing.T) {
stubLookPath(t, true)
client, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"})
if err != nil {
t.Fatalf("NewClient: %v", err)
Expand All @@ -31,6 +48,8 @@ func TestClaudeCLIClientCustomModel(t *testing.T) {
}

func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
stubLookPath(t, true)

// claude-cli should not require an API key
_, err := NewClient(ClientOptions{Provider: "claude-cli"})
if err != nil {
Expand All @@ -44,6 +63,48 @@ func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
}
}

// TestClaudeCLIMissingBinary verifies that NewClient rejects the claude-cli
// provider with a descriptive error when the claude binary lookup fails.
func TestClaudeCLIMissingBinary(t *testing.T) {
	stubLookPath(t, false)

	// Substring the error must carry so users can tell which provider failed.
	const wantSubstr = "claude-cli provider requires"

	_, err := NewClient(ClientOptions{Provider: "claude-cli"})
	switch {
	case err == nil:
		t.Fatal("expected error when claude binary is not found")
	case !strings.Contains(err.Error(), wantSubstr):
		t.Errorf("unexpected error message: %s", err.Error())
	}
}

// TestClaudeCLIBuildArgs checks the argument list that claudeCLIClient.buildArgs
// produces, both with and without a system prompt.
func TestClaudeCLIBuildArgs(t *testing.T) {
	client := &claudeCLIClient{model: "sonnet"}

	t.Run("with system prompt", func(t *testing.T) {
		expected := []string{
			"-p",
			"--output-format", "text",
			"--model", "sonnet",
			"--system-prompt", "you are a judge",
			"score this",
		}
		got := client.buildArgs("you are a judge", "score this")

		// Bail out on a length mismatch so the element comparison below
		// cannot index out of range.
		if len(got) != len(expected) {
			t.Fatalf("got %d args, want %d: %v", len(got), len(expected), got)
		}
		for idx, w := range expected {
			if got[idx] != w {
				t.Errorf("args[%d] = %q, want %q", idx, got[idx], w)
			}
		}
	})

	t.Run("without system prompt", func(t *testing.T) {
		got := client.buildArgs("", "score this")

		for i := 0; i < len(got); i++ {
			if got[i] == "--system-prompt" {
				t.Error("--system-prompt should not be present when system prompt is empty")
			}
		}

		// The user content is expected to be the final positional argument.
		if last := got[len(got)-1]; last != "score this" {
			t.Errorf("last arg = %q, want %q", last, "score this")
		}
	})
}

func TestUseMaxCompletionTokens(t *testing.T) {
tests := []struct {
model string
Expand Down
4 changes: 2 additions & 2 deletions judge/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ func ExampleNewClient() {
// Provider: anthropic, Model: claude-sonnet-4-5-20250929
}

// ExampleNewClient_claudeCLI demonstrates creating a claude-cli client.
// It omits the "Output:" comment so that go test compiles the example but
// does not run it, because running it requires the claude binary.
func ExampleNewClient_claudeCLI() {
client, err := judge.NewClient(judge.ClientOptions{
Provider: "claude-cli",
Expand All @@ -33,8 +35,6 @@ func ExampleNewClient_claudeCLI() {
}

fmt.Printf("Provider: %s, Model: %s\n", client.Provider(), client.ModelName())
// Output:
// Provider: claude-cli, Model: sonnet
}

func ExampleNewClient_openai() {
Expand Down
Loading