From f4fa293db41a8a8aa76bb2c0ed308206b8c7ed84 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Mon, 23 Mar 2026 12:42:26 +0100 Subject: [PATCH] feat: add claude-cli provider for LLM scoring without API keys Adds a new "claude-cli" provider that shells out to the locally installed `claude` binary. This enables LLM scoring for users who are already authenticated via the CLI (e.g. company or team subscriptions) without requiring an explicit API key. --- README.md | 10 ++-- cmd/score_evaluate.go | 22 +++++--- examples/README.md | 6 ++- examples/review-skill/SKILL.md | 22 ++++---- .../references/install-skill-validator.md | 1 + .../review-skill/references/llm-scoring.md | 42 ++++++++++++++- judge/client.go | 51 +++++++++++++++++-- judge/client_test.go | 37 ++++++++++++++ judge/example_test.go | 14 +++++ 9 files changed, 178 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index c1739e0..5164c24 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ API documentation and runnable examples are on [pkg.go.dev](https://pkg.go.dev/g #### Custom LLM providers -The built-in clients cover Anthropic and OpenAI-compatible APIs. For other providers, implement the `judge.LLMClient` interface: +The built-in clients cover Anthropic, OpenAI-compatible APIs, and the Claude CLI. For other providers, implement the `judge.LLMClient` interface: ```go type LLMClient interface { @@ -324,14 +324,18 @@ skill-validator score evaluate --skill-only skill-validator score evaluate --refs-only skill-validator score evaluate --display files skill-validator score evaluate path/to/references/api-guide.md + +# Or use the Claude CLI (no API key needed if already authenticated) +skill-validator score evaluate --provider claude-cli ``` -**Provider support**: Requires an API key via environment variable. 
Use `--provider` to select the backend: +**Provider support**: Use `--provider` to select the backend: | Provider | Env var | Default model | Covers | |---|---|---|---| | `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic | | `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. | +| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) | Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection: @@ -633,7 +637,7 @@ If no `SKILL.md` is found at the root or in any immediate subdirectory, the vali The [`examples/`](examples/) directory contains ready-to-use workflows that extend skill-validator: -- **[review-skill](examples/review-skill/)** — An Agent Skill that walks a coding agent through a full skill review (structural validation, content checks, LLM scoring with Anthropic or OpenAI). Copy it into your agent's skill directory to iterate on skills during local development before requesting a human review. +- **[review-skill](examples/review-skill/)** — An Agent Skill that walks a coding agent through a full skill review (structural validation, content checks, LLM scoring with Anthropic, OpenAI, or Claude CLI). Copy it into your agent's skill directory to iterate on skills during local development before requesting a human review. - **[ci](examples/ci/)** — A GitHub Actions workflow and companion script that validate changed skills on every pull request. Copy into your repo's `.github/` directory to enforce a minimum quality bar before merging. See the [examples README](examples/README.md) for setup instructions. 
diff --git a/cmd/score_evaluate.go b/cmd/score_evaluate.go index ff5466a..1c1c5f8 100644 --- a/cmd/score_evaluate.go +++ b/cmd/score_evaluate.go @@ -39,14 +39,18 @@ The path can be: Requires an API key via environment variable: ANTHROPIC_API_KEY (for --provider anthropic, the default) - OPENAI_API_KEY (for --provider openai)`, + OPENAI_API_KEY (for --provider openai) + +The claude-cli provider uses the locally installed "claude" CLI and does not +require an API key. This is useful when the CLI is already authenticated +(e.g. via a company or team subscription).`, Args: cobra.ExactArgs(1), RunE: runScoreEvaluate, } func init() { - scoreEvaluateCmd.Flags().StringVar(&evalProvider, "provider", "anthropic", "LLM provider: anthropic or openai") - scoreEvaluateCmd.Flags().StringVar(&evalModel, "model", "", "model name (default: claude-sonnet-4-5-20250929 for anthropic, gpt-5.2 for openai)") + scoreEvaluateCmd.Flags().StringVar(&evalProvider, "provider", "anthropic", "LLM provider: anthropic, openai, or claude-cli") + scoreEvaluateCmd.Flags().StringVar(&evalModel, "model", "", "model name (default: claude-sonnet-4-5-20250929 for anthropic, gpt-5.2 for openai, sonnet for claude-cli)") scoreEvaluateCmd.Flags().StringVar(&evalBaseURL, "base-url", "", "API base URL (for openai-compatible endpoints)") scoreEvaluateCmd.Flags().BoolVar(&evalRescore, "rescore", false, "re-score and overwrite cached results") scoreEvaluateCmd.Flags().BoolVar(&evalSkillOnly, "skill-only", false, "score only SKILL.md, skip reference files") @@ -74,10 +78,14 @@ func runScoreEvaluate(cmd *cobra.Command, args []string) error { return fmt.Errorf("--max-tokens-style must be \"auto\", \"max_tokens\", or \"max_completion_tokens\"") } - // Resolve API key - apiKey, err := resolveAPIKey(evalProvider) - if err != nil { - return err + // Resolve API key (not needed for claude-cli) + var apiKey string + if strings.ToLower(evalProvider) != "claude-cli" { + var err error + apiKey, err = 
resolveAPIKey(evalProvider) + if err != nil { + return err + } } client, err := judge.NewClient(judge.ClientOptions{ diff --git a/examples/README.md b/examples/README.md index e5108e5..bf94a69 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,8 +22,8 @@ improve the skill content before requesting a human review. 1. Checks prerequisites (skill-validator binary, API keys) 2. Runs `skill-validator check` for structural validation 3. Reviews content for examples, edge cases, and scope-gating -4. Optionally scores the skill with an LLM judge (Anthropic, OpenAI, or any - OpenAI-compatible endpoint) +4. Optionally scores the skill with an LLM judge (Anthropic, OpenAI, any + OpenAI-compatible endpoint, or the Claude CLI) 5. Supports cross-model comparison to validate scores across model families 6. Presents a summary with prioritized action items and a publish recommendation @@ -40,6 +40,8 @@ improve the skill content before requesting a human review. - OpenAI: `export OPENAI_API_KEY=sk-...` - OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a placeholder) and provide the `--base-url` when prompted. + - Claude CLI: No API key needed — uses the locally authenticated `claude` + binary (e.g. via a company or team subscription). 4. Add `.score_cache/` to your `.gitignore`. LLM scoring caches results inside each skill directory, and these should not be committed. 5. Ask your agent to review a skill. The skill stores configuration in diff --git a/examples/review-skill/SKILL.md b/examples/review-skill/SKILL.md index 94f7dfe..f066c38 100644 --- a/examples/review-skill/SKILL.md +++ b/examples/review-skill/SKILL.md @@ -6,7 +6,7 @@ description: >- structural issues, scores the skill with an LLM judge, and interprets results to advise authors on what to address. Use when a user wants to review, validate, or quality-check an Agent Skill. -compatibility: Requires skill-validator CLI. 
LLM scoring requires an Anthropic or OpenAI API key, OR can be skipped for structural-only review. +compatibility: Requires skill-validator CLI. LLM scoring requires an Anthropic or OpenAI API key, the Claude CLI, OR can be skipped for structural-only review. metadata: author: agent-ecosystem version: "1.0" @@ -41,16 +41,17 @@ Options 2-3: continue below. **If no state file exists**, or the user chose to re-check/change, ask: -> LLM scoring uses an Anthropic or OpenAI-compatible API. Without an API key, -> we run structural validation only. +> LLM scoring uses an Anthropic or OpenAI-compatible API, or the Claude CLI. +> Without an API key or CLI, we run structural validation only. > > 1. **Anthropic** — use Claude via the Anthropic API (requires `ANTHROPIC_API_KEY`) > 2. **OpenAI** — use GPT via the OpenAI API (requires `OPENAI_API_KEY`) > 3. **OpenAI-compatible** — use a custom endpoint (Ollama, Groq, Azure, Together, etc.) -> 4. **Skip LLM scoring** — structural validation only +> 4. **Claude CLI** — use the locally authenticated `claude` binary (no API key needed) +> 5. **Skip LLM scoring** — structural validation only -Options 1-3: set `LLM_SCORING=true` and record the provider choice. -Option 4: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2. +Options 1-4: set `LLM_SCORING=true` and record the provider choice. +Option 5: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2. **If the user chose option 1 or 2**, ask about cross-model comparison: @@ -63,8 +64,9 @@ Option 4: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2. Option 1: set `CROSS_MODEL=true`. Option 2: set `CROSS_MODEL=false`. -Do not offer cross-model comparison for option 3 (OpenAI-compatible), since the -second provider would need a standard Anthropic or OpenAI key. +Do not offer cross-model comparison for option 3 (OpenAI-compatible) or option 4 +(Claude CLI), since the second provider would need a standard Anthropic or +OpenAI key. 
After Step 1a, follow [references/llm-scoring.md](references/llm-scoring.md) for API key checks @@ -84,7 +86,7 @@ follow [references/install-skill-validator.md](references/install-skill-validato Do NOT proceed until this succeeds. -If `LLM_SCORING=true`, complete the API key checks in +If `LLM_SCORING=true`, complete the provider checks in [references/llm-scoring.md](references/llm-scoring.md) before continuing. ### Save state after prerequisites pass @@ -97,7 +99,7 @@ mkdir -p ~/.config/skill-validator cat > ~/.config/skill-validator/review-state.yaml << 'EOF' prereqs_passed: true llm_scoring: -provider: +provider: model: base_url: cross_model: diff --git a/examples/review-skill/references/install-skill-validator.md b/examples/review-skill/references/install-skill-validator.md index cf94545..73de0eb 100644 --- a/examples/review-skill/references/install-skill-validator.md +++ b/examples/review-skill/references/install-skill-validator.md @@ -40,3 +40,4 @@ LLM scoring requires one of: - **Anthropic API key** — set `ANTHROPIC_API_KEY` environment variable - **OpenAI API key** — set `OPENAI_API_KEY` environment variable - **OpenAI-compatible endpoint** — set `OPENAI_API_KEY` and provide a `--base-url` +- **Claude CLI** — use `--provider claude-cli` (no API key needed; uses the locally authenticated `claude` binary) diff --git a/examples/review-skill/references/llm-scoring.md b/examples/review-skill/references/llm-scoring.md index c4f8844..093815b 100644 --- a/examples/review-skill/references/llm-scoring.md +++ b/examples/review-skill/references/llm-scoring.md @@ -3,7 +3,7 @@ Provider-specific prerequisites and LLM scoring steps. Only follow this if the user selected an LLM provider in Step 0. -## API Key Prerequisites +## Provider Prerequisites Complete after Step 1a (binary check) passes. @@ -42,6 +42,24 @@ The default model is `gpt-5.2`. The user can specify a different model with the `--model` flag. 
For applications where a frontier model is more appropriate, the user can specify `--model gpt-5.4`, but this will increase scoring cost. 
+### Claude CLI provider
+
+The Claude CLI provider shells out to the locally installed `claude` binary.
+No API key is needed — it uses the CLI's existing authentication (e.g. a
+company or team subscription).
+
+Verify the CLI is available:
+
+```bash
+claude --version
+```
+
+If not found, see https://docs.anthropic.com/en/docs/claude-code for
+installation instructions.
+
+The default model is `sonnet`. The user can specify a different model with the
+`--model` flag (e.g. `--model opus`).
+
 ### OpenAI-compatible provider
 
 This uses the OpenAI provider with a custom `--base-url`. It supports any
@@ -132,6 +150,28 @@ skill-validator score evaluate --provider openai --full-content --display 
 
 Add `--model <model>` if the user specified a model other than gpt-5.2.
 
+### Claude CLI
+
+Check for cached scores:
+
+```bash
+skill-validator score report -o json 2>/dev/null
+```
+
+If scored output exists, use `--rescore` to generate fresh scores:
+
+```bash
+skill-validator score evaluate --provider claude-cli --full-content --display files -o json --rescore
+```
+
+If no cached scores exist, run without `--rescore`:
+
+```bash
+skill-validator score evaluate --provider claude-cli --full-content --display files -o json
+```
+
+Add `--model <model>` if the user specified a model other than sonnet.
+
 ### OpenAI-compatible
 
 ```bash
type ClientOptions struct { - Provider string // "anthropic" or "openai" - APIKey string // Required + Provider string // "anthropic", "openai", or "claude-cli" + APIKey string // Required for anthropic and openai; unused for claude-cli BaseURL string // Optional; defaults per provider Model string // Optional; defaults per provider MaxTokensStyle string // "auto", "max_tokens", or "max_completion_tokens" @@ -38,8 +39,9 @@ type ClientOptions struct { // NewClient creates an LLMClient for the given options. // If Model is empty, a default is chosen per provider. // For the openai provider, BaseURL defaults to "https://api.openai.com/v1" if empty. +// The claude-cli provider shells out to the "claude" CLI and does not require an API key. func NewClient(opts ClientOptions) (LLMClient, error) { - if opts.APIKey == "" { + if strings.ToLower(opts.Provider) != "claude-cli" && opts.APIKey == "" { return nil, fmt.Errorf("API key is required") } @@ -49,6 +51,12 @@ func NewClient(opts ClientOptions) (LLMClient, error) { } switch strings.ToLower(opts.Provider) { + case "claude-cli": + model := opts.Model + if model == "" { + model = "sonnet" + } + return &claudeCLIClient{model: model}, nil case "anthropic": model := opts.Model if model == "" { @@ -71,7 +79,7 @@ func NewClient(opts ClientOptions) (LLMClient, error) { baseURL = strings.TrimRight(baseURL, "/") return &openaiClient{apiKey: opts.APIKey, baseURL: baseURL, model: model, maxTokensStyle: opts.MaxTokensStyle, maxTokens: maxResp}, nil default: - return nil, fmt.Errorf("unsupported provider %q (use \"anthropic\" or \"openai\")", opts.Provider) + return nil, fmt.Errorf("unsupported provider %q (use \"anthropic\", \"openai\", or \"claude-cli\")", opts.Provider) } } @@ -281,3 +289,38 @@ func (c *openaiClient) Complete(ctx context.Context, systemPrompt, userContent s return result.Choices[0].Message.Content, nil } + +// --- Claude CLI client --- + +// claudeCLIClient invokes the "claude" CLI for completions. 
+// This is useful when the CLI is already authenticated (e.g. via a company +// subscription) and no explicit API key is needed. +type claudeCLIClient struct { + model string +} + +func (c *claudeCLIClient) Provider() string { return "claude-cli" } +func (c *claudeCLIClient) ModelName() string { return c.model } + +func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) { + args := []string{ + "-p", + "--output-format", "text", + "--model", c.model, + } + if systemPrompt != "" { + args = append(args, "--system-prompt", systemPrompt) + } + args = append(args, userContent) + + cmd := exec.CommandContext(ctx, "claude", args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("claude CLI failed: %w: %s", err, stderr.String()) + } + + return strings.TrimSpace(stdout.String()), nil +} diff --git a/judge/client_test.go b/judge/client_test.go index 202064f..4979d08 100644 --- a/judge/client_test.go +++ b/judge/client_test.go @@ -7,6 +7,43 @@ import ( "testing" ) +func TestClaudeCLIClientDefaults(t *testing.T) { + client, err := NewClient(ClientOptions{Provider: "claude-cli"}) + if err != nil { + t.Fatalf("NewClient: %v", err) + } + if client.Provider() != "claude-cli" { + t.Errorf("Provider() = %q, want %q", client.Provider(), "claude-cli") + } + if client.ModelName() != "sonnet" { + t.Errorf("ModelName() = %q, want %q", client.ModelName(), "sonnet") + } +} + +func TestClaudeCLIClientCustomModel(t *testing.T) { + client, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"}) + if err != nil { + t.Fatalf("NewClient: %v", err) + } + if client.ModelName() != "opus" { + t.Errorf("ModelName() = %q, want %q", client.ModelName(), "opus") + } +} + +func TestClaudeCLINoAPIKeyRequired(t *testing.T) { + // claude-cli should not require an API key + _, err := NewClient(ClientOptions{Provider: "claude-cli"}) + if err != nil { + 
t.Fatalf("expected no error without API key for claude-cli, got: %v", err) + } + + // Other providers still require it + _, err = NewClient(ClientOptions{Provider: "anthropic"}) + if err == nil { + t.Fatal("expected error without API key for anthropic") + } +} + func TestUseMaxCompletionTokens(t *testing.T) { tests := []struct { model string diff --git a/judge/example_test.go b/judge/example_test.go index 7c94dcf..4342449 100644 --- a/judge/example_test.go +++ b/judge/example_test.go @@ -23,6 +23,20 @@ func ExampleNewClient() { // Provider: anthropic, Model: claude-sonnet-4-5-20250929 } +func ExampleNewClient_claudeCLI() { + client, err := judge.NewClient(judge.ClientOptions{ + Provider: "claude-cli", + // Model defaults to "sonnet"; no API key needed + }) + if err != nil { + panic(err) + } + + fmt.Printf("Provider: %s, Model: %s\n", client.Provider(), client.ModelName()) + // Output: + // Provider: claude-cli, Model: sonnet +} + func ExampleNewClient_openai() { client, err := judge.NewClient(judge.ClientOptions{ Provider: "openai",