agent-ecosystem · dacharyc · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/README.md b/README.md
@@ -335,7 +335,9 @@ skill-validator score evaluate --provider claude-cli <path>
 |---|---|---|---|
 | `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic |
 | `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. |
-| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) |
+| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) \* |
+
+\* **Accuracy note:** The `claude-cli` provider shells out to the `claude` CLI, which loads local context (CLAUDE.md files, project memory, rules) into each scoring call. This extra context may influence scores, making them less reproducible across environments than scores from the API-based providers. For the most consistent results, use the `anthropic` or `openai` providers with an API key.
 
 Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection:
 

diff --git a/examples/README.md b/examples/README.md
@@ -41,7 +41,9 @@ improve the skill content before requesting a human review.
    - OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a
      placeholder) and provide the `--base-url` when prompted.
    - Claude CLI: No API key needed — uses the locally authenticated `claude`
-     binary (e.g. via a company or team subscription).
+     binary (e.g. via a company or team subscription). Note: scores may be less
+     consistent than with API-based providers because the CLI loads local
+     context (CLAUDE.md files, project memory, rules) into each call.
 4. Add `.score_cache/` to your `.gitignore`. LLM scoring caches results inside
    each skill directory, and these should not be committed.
 5. Ask your agent to review a skill. The skill stores configuration in

diff --git a/examples/review-skill/references/llm-scoring.md b/examples/review-skill/references/llm-scoring.md
@@ -60,6 +60,12 @@ installation instructions.
 The default model is `sonnet`. The user can specify a different model with the
 `--model` flag (e.g. `--model opus`).
 
+**Accuracy note:** The Claude CLI loads local context (CLAUDE.md files, project
+memory, rules) into each scoring call. This extra context may influence scores,
+making them less reproducible across environments than scores from the
+API-based providers. For the most consistent results, use the `anthropic`
+or `openai` providers with an API key.
+
 ### OpenAI-compatible provider
 
 This uses the OpenAI provider with a custom `--base-url`. It supports any

diff --git a/judge/client.go b/judge/client.go
@@ -16,6 +16,10 @@ import (
 // that a hanging upstream doesn't block the caller indefinitely.
 var defaultHTTPClient = &http.Client{Timeout: 30 * time.Second}
 
+// lookPath is used to locate the claude binary. It is a variable so tests
+// can substitute a stub when the real binary is not installed.
+var lookPath = exec.LookPath
+
 // LLMClient is the interface for making LLM API calls.
 type LLMClient interface {
 	// Complete sends a system prompt and user content to the LLM and returns the text response.
@@ -52,6 +56,9 @@ func NewClient(opts ClientOptions) (LLMClient, error) {
 
 	switch strings.ToLower(opts.Provider) {
 	case "claude-cli":
+		if _, err := lookPath("claude"); err != nil {
+			return nil, fmt.Errorf("claude-cli provider requires the \"claude\" binary: %w", err)
+		}
 		model := opts.Model
 		if model == "" {
 			model = "sonnet"
@@ -302,7 +309,8 @@ type claudeCLIClient struct {
 func (c *claudeCLIClient) Provider() string  { return "claude-cli" }
 func (c *claudeCLIClient) ModelName() string { return c.model }
 
-func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
+// buildArgs returns the CLI arguments for a claude invocation.
+func (c *claudeCLIClient) buildArgs(systemPrompt, userContent string) []string {
 	args := []string{
 		"-p",
 		"--output-format", "text",
@@ -312,6 +320,11 @@ func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userConten
 		args = append(args, "--system-prompt", systemPrompt)
 	}
 	args = append(args, userContent)
+	return args
+}
+
+func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
+	args := c.buildArgs(systemPrompt, userContent)
 
 	cmd := exec.CommandContext(ctx, "claude", args...)
 	var stdout, stderr bytes.Buffer

diff --git a/judge/client_test.go b/judge/client_test.go
@@ -2,12 +2,28 @@ package judge
 
 import (
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 )
 
+// stubLookPath points the package-level lookPath at a fake for the duration of
+// a test: found=true resolves any name under /usr/bin, found=false always fails.
+func stubLookPath(t *testing.T, found bool) {
+	t.Helper()
+	orig := lookPath
+	t.Cleanup(func() { lookPath = orig }) // put the original lookup back when the test ends
+	if found {
+		lookPath = func(file string) (string, error) { return "/usr/bin/" + file, nil }
+	} else {
+		lookPath = func(file string) (string, error) { return "", fmt.Errorf("not found: %s", file) }
+	}
+}
+
 func TestClaudeCLIClientDefaults(t *testing.T) {
+	stubLookPath(t, true)
 	client, err := NewClient(ClientOptions{Provider: "claude-cli"})
 	if err != nil {
 		t.Fatalf("NewClient: %v", err)
@@ -21,6 +37,7 @@ func TestClaudeCLIClientDefaults(t *testing.T) {
 }
 
 func TestClaudeCLIClientCustomModel(t *testing.T) {
+	stubLookPath(t, true)
 	client, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"})
 	if err != nil {
 		t.Fatalf("NewClient: %v", err)
@@ -31,6 +48,8 @@ func TestClaudeCLIClientCustomModel(t *testing.T) {
 }
 
 func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
+	stubLookPath(t, true)
+
 	// claude-cli should not require an API key
 	_, err := NewClient(ClientOptions{Provider: "claude-cli"})
 	if err != nil {
@@ -44,6 +63,48 @@ func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
 	}
 }
 
+func TestClaudeCLIMissingBinary(t *testing.T) {
+	stubLookPath(t, false) // make the lookup for "claude" fail
+
+	_, err := NewClient(ClientOptions{Provider: "claude-cli"})
+	if err == nil {
+		t.Fatal("expected error when claude binary is not found")
+	}
+	if got := err.Error(); !strings.Contains(got, "claude-cli provider requires") { // match the fixed message prefix only
+		t.Errorf("unexpected error message: %s", got)
+	}
+}
+
+func TestClaudeCLIBuildArgs(t *testing.T) {
+	c := &claudeCLIClient{model: "sonnet"} // built directly, so NewClient and its lookPath check are not involved
+
+	t.Run("with system prompt", func(t *testing.T) {
+		args := c.buildArgs("you are a judge", "score this")
+		want := []string{"-p", "--output-format", "text", "--model", "sonnet", "--system-prompt", "you are a judge", "score this"}
+		if len(args) != len(want) { // stop here so the indexed comparison below cannot go out of range
+			t.Fatalf("got %d args, want %d: %v", len(args), len(want), args)
+		}
+		for i := range want {
+			if args[i] != want[i] {
+				t.Errorf("args[%d] = %q, want %q", i, args[i], want[i])
+			}
+		}
+	})
+
+	t.Run("without system prompt", func(t *testing.T) {
+		args := c.buildArgs("", "score this")
+		for _, a := range args {
+			if a == "--system-prompt" {
+				t.Error("--system-prompt should not be present when system prompt is empty")
+			}
+		}
+		// The user content must be the final argument.
+		if args[len(args)-1] != "score this" {
+			t.Errorf("last arg = %q, want %q", args[len(args)-1], "score this")
+		}
+	})
+}
+
 func TestUseMaxCompletionTokens(t *testing.T) {
 	tests := []struct {
 		model string

diff --git a/judge/example_test.go b/judge/example_test.go
@@ -23,6 +23,8 @@ func ExampleNewClient() {
 	// Provider: anthropic, Model: claude-sonnet-4-5-20250929
 }
 
+// ExampleNewClient_claudeCLI demonstrates creating a claude-cli client.
+// It has no Output comment, so go test only compiles it: running needs the claude binary.
 func ExampleNewClient_claudeCLI() {
 	client, err := judge.NewClient(judge.ClientOptions{
 		Provider: "claude-cli",
@@ -33,8 +35,6 @@ func ExampleNewClient_claudeCLI() {
 	}
 
 	fmt.Printf("Provider: %s, Model: %s\n", client.Provider(), client.ModelName())
-	// Output:
-	// Provider: claude-cli, Model: sonnet
 }
 
 func ExampleNewClient_openai() {