diff --git a/README.md b/README.md index 5164c24..94ad4ab 100644 --- a/README.md +++ b/README.md @@ -335,7 +335,9 @@ skill-validator score evaluate --provider claude-cli |---|---|---|---| | `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic | | `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. | -| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) | +| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) \* | + +\* **Accuracy note:** The `claude-cli` provider shells out to the `claude` CLI, which loads local context (CLAUDE.md files, project memory, rules) into each scoring call. This extra context may influence scores, making them less reproducible across environments than scores from the API-based providers. For the most consistent results, use the `anthropic` or `openai` providers with an API key. Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection: diff --git a/examples/README.md b/examples/README.md index bf94a69..5918151 100644 --- a/examples/README.md +++ b/examples/README.md @@ -41,7 +41,9 @@ improve the skill content before requesting a human review. - OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a placeholder) and provide the `--base-url` when prompted. - Claude CLI: No API key needed — uses the locally authenticated `claude` - binary (e.g. via a company or team subscription). + binary (e.g. via a company or team subscription). Note: scores may be less + consistent than with API-based providers because the CLI loads local context + (CLAUDE.md, memory) into each call. 4. Add `.score_cache/` to your `.gitignore`. 
LLM scoring caches results inside each skill directory, and these should not be committed. 5. Ask your agent to review a skill. The skill stores configuration in diff --git a/examples/review-skill/references/llm-scoring.md b/examples/review-skill/references/llm-scoring.md index 093815b..80daa12 100644 --- a/examples/review-skill/references/llm-scoring.md +++ b/examples/review-skill/references/llm-scoring.md @@ -60,6 +60,12 @@ installation instructions. The default model is `sonnet`. The user can specify a different model with the `--model` flag (e.g. `--model opus`). +**Accuracy note:** The Claude CLI loads local context (CLAUDE.md files, project +memory, rules) into each scoring call. This extra context may influence scores, +making them less reproducible across environments than scores from the API-based +providers. For the most consistent results, use the `anthropic` or `openai` +providers with an API key. + ### OpenAI-compatible provider This uses the OpenAI provider with a custom `--base-url`. It supports any diff --git a/judge/client.go b/judge/client.go index 1ba5798..d4261ef 100644 --- a/judge/client.go +++ b/judge/client.go @@ -16,6 +16,10 @@ import ( // that a hanging upstream doesn't block the caller indefinitely. var defaultHTTPClient = &http.Client{Timeout: 30 * time.Second} +// lookPath is used to locate the claude binary. It is a variable so tests +// can substitute a stub when the real binary is not installed. +var lookPath = exec.LookPath + // LLMClient is the interface for making LLM API calls. type LLMClient interface { // Complete sends a system prompt and user content to the LLM and returns the text response. 
@@ -52,6 +56,9 @@ func NewClient(opts ClientOptions) (LLMClient, error) { switch strings.ToLower(opts.Provider) { case "claude-cli": + if _, err := lookPath("claude"); err != nil { + return nil, fmt.Errorf("claude-cli provider requires the \"claude\" binary: %w", err) + } model := opts.Model if model == "" { model = "sonnet" @@ -302,7 +309,8 @@ type claudeCLIClient struct { func (c *claudeCLIClient) Provider() string { return "claude-cli" } func (c *claudeCLIClient) ModelName() string { return c.model } -func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) { +// buildArgs returns the CLI arguments for a claude invocation. +func (c *claudeCLIClient) buildArgs(systemPrompt, userContent string) []string { args := []string{ "-p", "--output-format", "text", @@ -312,6 +320,11 @@ func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userConten args = append(args, "--system-prompt", systemPrompt) } args = append(args, userContent) + return args +} + +func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) { + args := c.buildArgs(systemPrompt, userContent) cmd := exec.CommandContext(ctx, "claude", args...) var stdout, stderr bytes.Buffer diff --git a/judge/client_test.go b/judge/client_test.go index 4979d08..e25c835 100644 --- a/judge/client_test.go +++ b/judge/client_test.go @@ -2,12 +2,28 @@ package judge import ( "encoding/json" + "fmt" "net/http" "net/http/httptest" + "strings" "testing" ) +// stubLookPath replaces the lookPath variable for the duration of a test, +// restoring the original when the test completes. 
+func stubLookPath(t *testing.T, found bool) { + t.Helper() + orig := lookPath + t.Cleanup(func() { lookPath = orig }) + if found { + lookPath = func(file string) (string, error) { return "/usr/bin/" + file, nil } + } else { + lookPath = func(file string) (string, error) { return "", fmt.Errorf("not found: %s", file) } + } +} + func TestClaudeCLIClientDefaults(t *testing.T) { + stubLookPath(t, true) client, err := NewClient(ClientOptions{Provider: "claude-cli"}) if err != nil { t.Fatalf("NewClient: %v", err) @@ -21,6 +37,7 @@ func TestClaudeCLIClientDefaults(t *testing.T) { } func TestClaudeCLIClientCustomModel(t *testing.T) { + stubLookPath(t, true) client, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"}) if err != nil { t.Fatalf("NewClient: %v", err) @@ -31,6 +48,8 @@ func TestClaudeCLIClientCustomModel(t *testing.T) { } func TestClaudeCLINoAPIKeyRequired(t *testing.T) { + stubLookPath(t, true) + // claude-cli should not require an API key _, err := NewClient(ClientOptions{Provider: "claude-cli"}) if err != nil { @@ -44,6 +63,48 @@ func TestClaudeCLINoAPIKeyRequired(t *testing.T) { } } +func TestClaudeCLIMissingBinary(t *testing.T) { + stubLookPath(t, false) + + _, err := NewClient(ClientOptions{Provider: "claude-cli"}) + if err == nil { + t.Fatal("expected error when claude binary is not found") + } + if got := err.Error(); !strings.Contains(got, "claude-cli provider requires") { + t.Errorf("unexpected error message: %s", got) + } +} + +func TestClaudeCLIBuildArgs(t *testing.T) { + c := &claudeCLIClient{model: "sonnet"} + + t.Run("with system prompt", func(t *testing.T) { + args := c.buildArgs("you are a judge", "score this") + want := []string{"-p", "--output-format", "text", "--model", "sonnet", "--system-prompt", "you are a judge", "score this"} + if len(args) != len(want) { + t.Fatalf("got %d args, want %d: %v", len(args), len(want), args) + } + for i := range want { + if args[i] != want[i] { + t.Errorf("args[%d] = %q, want %q", i, 
args[i], want[i]) + } + } + }) + + t.Run("without system prompt", func(t *testing.T) { + args := c.buildArgs("", "score this") + for _, a := range args { + if a == "--system-prompt" { + t.Error("--system-prompt should not be present when system prompt is empty") + } + } + // Last arg should be the user content + if args[len(args)-1] != "score this" { + t.Errorf("last arg = %q, want %q", args[len(args)-1], "score this") + } + }) +} + func TestUseMaxCompletionTokens(t *testing.T) { tests := []struct { model string diff --git a/judge/example_test.go b/judge/example_test.go index 4342449..67bf1e4 100644 --- a/judge/example_test.go +++ b/judge/example_test.go @@ -23,6 +23,8 @@ func ExampleNewClient() { // Provider: anthropic, Model: claude-sonnet-4-5-20250929 } +// ExampleNewClient_claudeCLI demonstrates creating a claude-cli client. +// It has no Output comment, so go test compiles it but does not run it; running it requires the claude binary. func ExampleNewClient_claudeCLI() { client, err := judge.NewClient(judge.ClientOptions{ Provider: "claude-cli", @@ -33,8 +35,6 @@ func ExampleNewClient_claudeCLI() { } fmt.Printf("Provider: %s, Model: %s\n", client.Provider(), client.ModelName()) - // Output: - // Provider: claude-cli, Model: sonnet } func ExampleNewClient_openai() {