Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ API documentation and runnable examples are on [pkg.go.dev](https://pkg.go.dev/g

#### Custom LLM providers

The built-in clients cover Anthropic and OpenAI-compatible APIs. For other providers, implement the `judge.LLMClient` interface:
The built-in clients cover Anthropic, OpenAI-compatible APIs, and the Claude CLI. For other providers, implement the `judge.LLMClient` interface:

```go
type LLMClient interface {
Expand Down Expand Up @@ -324,14 +324,18 @@ skill-validator score evaluate --skill-only <path>
skill-validator score evaluate --refs-only <path>
skill-validator score evaluate --display files <path>
skill-validator score evaluate path/to/references/api-guide.md

# Or use the Claude CLI (no API key needed if already authenticated)
skill-validator score evaluate --provider claude-cli <path>
```

**Provider support**: Requires an API key via environment variable. Use `--provider` to select the backend:
**Provider support**: Use `--provider` to select the backend:

| Provider | Env var | Default model | Covers |
|---|---|---|---|
| `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic |
| `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. |
| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) |

Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection:

Expand Down Expand Up @@ -633,7 +637,7 @@ If no `SKILL.md` is found at the root or in any immediate subdirectory, the vali

The [`examples/`](examples/) directory contains ready-to-use workflows that extend skill-validator:

- **[review-skill](examples/review-skill/)** — An Agent Skill that walks a coding agent through a full skill review (structural validation, content checks, LLM scoring with Anthropic or OpenAI). Copy it into your agent's skill directory to iterate on skills during local development before requesting a human review.
- **[review-skill](examples/review-skill/)** — An Agent Skill that walks a coding agent through a full skill review (structural validation, content checks, LLM scoring with Anthropic, OpenAI, or Claude CLI). Copy it into your agent's skill directory to iterate on skills during local development before requesting a human review.
- **[ci](examples/ci/)** — A GitHub Actions workflow and companion script that validate changed skills on every pull request. Copy into your repo's `.github/` directory to enforce a minimum quality bar before merging.

See the [examples README](examples/README.md) for setup instructions.
Expand Down
22 changes: 15 additions & 7 deletions cmd/score_evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ The path can be:

Requires an API key via environment variable:
ANTHROPIC_API_KEY (for --provider anthropic, the default)
OPENAI_API_KEY (for --provider openai)`,
OPENAI_API_KEY (for --provider openai)

The claude-cli provider uses the locally installed "claude" CLI and does not
require an API key. This is useful when the CLI is already authenticated
(e.g. via a company or team subscription).`,
Args: cobra.ExactArgs(1),
RunE: runScoreEvaluate,
}

func init() {
scoreEvaluateCmd.Flags().StringVar(&evalProvider, "provider", "anthropic", "LLM provider: anthropic or openai")
scoreEvaluateCmd.Flags().StringVar(&evalModel, "model", "", "model name (default: claude-sonnet-4-5-20250929 for anthropic, gpt-5.2 for openai)")
scoreEvaluateCmd.Flags().StringVar(&evalProvider, "provider", "anthropic", "LLM provider: anthropic, openai, or claude-cli")
scoreEvaluateCmd.Flags().StringVar(&evalModel, "model", "", "model name (default: claude-sonnet-4-5-20250929 for anthropic, gpt-5.2 for openai, sonnet for claude-cli)")
scoreEvaluateCmd.Flags().StringVar(&evalBaseURL, "base-url", "", "API base URL (for openai-compatible endpoints)")
scoreEvaluateCmd.Flags().BoolVar(&evalRescore, "rescore", false, "re-score and overwrite cached results")
scoreEvaluateCmd.Flags().BoolVar(&evalSkillOnly, "skill-only", false, "score only SKILL.md, skip reference files")
Expand Down Expand Up @@ -74,10 +78,14 @@ func runScoreEvaluate(cmd *cobra.Command, args []string) error {
return fmt.Errorf("--max-tokens-style must be \"auto\", \"max_tokens\", or \"max_completion_tokens\"")
}

// Resolve API key
apiKey, err := resolveAPIKey(evalProvider)
if err != nil {
return err
// Resolve API key (not needed for claude-cli)
var apiKey string
if strings.ToLower(evalProvider) != "claude-cli" {
var err error
apiKey, err = resolveAPIKey(evalProvider)
if err != nil {
return err
}
}

client, err := judge.NewClient(judge.ClientOptions{
Expand Down
6 changes: 4 additions & 2 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ improve the skill content before requesting a human review.
1. Checks prerequisites (skill-validator binary, API keys)
2. Runs `skill-validator check` for structural validation
3. Reviews content for examples, edge cases, and scope-gating
4. Optionally scores the skill with an LLM judge (Anthropic, OpenAI, or any
OpenAI-compatible endpoint)
4. Optionally scores the skill with an LLM judge (Anthropic, OpenAI, any
OpenAI-compatible endpoint, or the Claude CLI)
5. Supports cross-model comparison to validate scores across model families
6. Presents a summary with prioritized action items and a publish recommendation

Expand All @@ -40,6 +40,8 @@ improve the skill content before requesting a human review.
- OpenAI: `export OPENAI_API_KEY=sk-...`
- OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a
placeholder) and provide the `--base-url` when prompted.
- Claude CLI: No API key needed — uses the locally authenticated `claude`
binary (e.g. via a company or team subscription).
4. Add `.score_cache/` to your `.gitignore`. LLM scoring caches results inside
each skill directory, and these should not be committed.
5. Ask your agent to review a skill. The skill stores configuration in
Expand Down
22 changes: 12 additions & 10 deletions examples/review-skill/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description: >-
structural issues, scores the skill with an LLM judge, and interprets results
to advise authors on what to address. Use when a user wants to review,
validate, or quality-check an Agent Skill.
compatibility: Requires skill-validator CLI. LLM scoring requires an Anthropic or OpenAI API key, OR can be skipped for structural-only review.
compatibility: Requires skill-validator CLI. LLM scoring requires an Anthropic or OpenAI API key, the Claude CLI, OR can be skipped for structural-only review.
metadata:
author: agent-ecosystem
version: "1.0"
Expand Down Expand Up @@ -41,16 +41,17 @@ Options 2-3: continue below.

**If no state file exists**, or the user chose to re-check/change, ask:

> LLM scoring uses an Anthropic or OpenAI-compatible API. Without an API key,
> we run structural validation only.
> LLM scoring uses an Anthropic or OpenAI-compatible API, or the Claude CLI.
> Without an API key or CLI, we run structural validation only.
>
> 1. **Anthropic** — use Claude via the Anthropic API (requires `ANTHROPIC_API_KEY`)
> 2. **OpenAI** — use GPT via the OpenAI API (requires `OPENAI_API_KEY`)
> 3. **OpenAI-compatible** — use a custom endpoint (Ollama, Groq, Azure, Together, etc.)
> 4. **Skip LLM scoring** — structural validation only
> 4. **Claude CLI** — use the locally authenticated `claude` binary (no API key needed)
> 5. **Skip LLM scoring** — structural validation only

Options 1-3: set `LLM_SCORING=true` and record the provider choice.
Option 4: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2.
Options 1-4: set `LLM_SCORING=true` and record the provider choice.
Option 5: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2.

**If the user chose option 1 or 2**, ask about cross-model comparison:

Expand All @@ -63,8 +64,9 @@ Option 4: set `LLM_SCORING=false`. Run Step 1a only, then jump to Step 2.

Option 1: set `CROSS_MODEL=true`. Option 2: set `CROSS_MODEL=false`.

Do not offer cross-model comparison for option 3 (OpenAI-compatible), since the
second provider would need a standard Anthropic or OpenAI key.
Do not offer cross-model comparison for option 3 (OpenAI-compatible) or option 4
(Claude CLI), since the second provider would need a standard Anthropic or
OpenAI key.

After Step 1a, follow
[references/llm-scoring.md](references/llm-scoring.md) for API key checks
Expand All @@ -84,7 +86,7 @@ follow [references/install-skill-validator.md](references/install-skill-validato

Do NOT proceed until this succeeds.

If `LLM_SCORING=true`, complete the API key checks in
If `LLM_SCORING=true`, complete the provider checks in
[references/llm-scoring.md](references/llm-scoring.md) before continuing.

### Save state after prerequisites pass
Expand All @@ -97,7 +99,7 @@ mkdir -p ~/.config/skill-validator
cat > ~/.config/skill-validator/review-state.yaml << 'EOF'
prereqs_passed: true
llm_scoring: <true or false>
provider: <anthropic, openai, or openai-compatible>
provider: <anthropic, openai, openai-compatible, or claude-cli>
model: <model name if specified, or "default">
base_url: <custom base URL if openai-compatible, or omit>
cross_model: <true or false>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,4 @@ LLM scoring requires one of:
- **Anthropic API key** — set `ANTHROPIC_API_KEY` environment variable
- **OpenAI API key** — set `OPENAI_API_KEY` environment variable
- **OpenAI-compatible endpoint** — set `OPENAI_API_KEY` and provide a `--base-url`
- **Claude CLI** — use `--provider claude-cli` (no API key needed; uses the locally authenticated `claude` binary)
42 changes: 41 additions & 1 deletion examples/review-skill/references/llm-scoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Provider-specific prerequisites and LLM scoring steps. Only follow this if the
user selected an LLM provider in Step 0.

## API Key Prerequisites
## Provider Prerequisites

Complete after Step 1a (binary check) passes.

Expand Down Expand Up @@ -42,6 +42,24 @@ The default model is `gpt-5.2`. The user can specify a different model with the
`--model` flag. For applications where a frontier model is more appropriate,
the user can specify `--model gpt-5.4`, but this will increase scoring cost.

### Claude CLI provider

The Claude CLI provider shells out to the locally installed `claude` binary.
No API key is needed — it uses the CLI's existing authentication (e.g. a
company or team subscription).

Verify the CLI is available:

```bash
claude --version
```

If not found, see https://docs.anthropic.com/en/docs/claude-code for
installation instructions.

The default model is `sonnet`. The user can specify a different model with the
`--model` flag (e.g. `--model opus`).

### OpenAI-compatible provider

This uses the OpenAI provider with a custom `--base-url`. It supports any
Expand Down Expand Up @@ -132,6 +150,28 @@ skill-validator score evaluate <path> --provider openai --full-content --display

Add `--model <name>` if the user specified a model other than gpt-5.2.

### Claude CLI

Check for cached scores:

```bash
skill-validator score report <path> -o json 2>/dev/null
```

If scored output exists, use `--rescore` to generate fresh scores:

```bash
skill-validator score evaluate <path> --provider claude-cli --full-content --display files -o json --rescore
```

If no cached scores exist, run without `--rescore`:

```bash
skill-validator score evaluate <path> --provider claude-cli --full-content --display files -o json
```

Add `--model <name>` if the user specified a model other than sonnet.

### OpenAI-compatible

```bash
Expand Down
51 changes: 47 additions & 4 deletions judge/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"io"
"net/http"
"os/exec"
"strings"
"time"
)
Expand All @@ -27,8 +28,8 @@ type LLMClient interface {

// ClientOptions holds configuration for creating an LLM client.
type ClientOptions struct {
Provider string // "anthropic" or "openai"
APIKey string // Required
Provider string // "anthropic", "openai", or "claude-cli"
APIKey string // Required for anthropic and openai; unused for claude-cli
BaseURL string // Optional; defaults per provider
Model string // Optional; defaults per provider
MaxTokensStyle string // "auto", "max_tokens", or "max_completion_tokens"
Expand All @@ -38,8 +39,9 @@ type ClientOptions struct {
// NewClient creates an LLMClient for the given options.
// If Model is empty, a default is chosen per provider.
// For the openai provider, BaseURL defaults to "https://api.openai.com/v1" if empty.
// The claude-cli provider shells out to the "claude" CLI and does not require an API key.
func NewClient(opts ClientOptions) (LLMClient, error) {
if opts.APIKey == "" {
if strings.ToLower(opts.Provider) != "claude-cli" && opts.APIKey == "" {
return nil, fmt.Errorf("API key is required")
}

Expand All @@ -49,6 +51,12 @@ func NewClient(opts ClientOptions) (LLMClient, error) {
}

switch strings.ToLower(opts.Provider) {
case "claude-cli":
model := opts.Model
if model == "" {
model = "sonnet"
}
return &claudeCLIClient{model: model}, nil
case "anthropic":
model := opts.Model
if model == "" {
Expand All @@ -71,7 +79,7 @@ func NewClient(opts ClientOptions) (LLMClient, error) {
baseURL = strings.TrimRight(baseURL, "/")
return &openaiClient{apiKey: opts.APIKey, baseURL: baseURL, model: model, maxTokensStyle: opts.MaxTokensStyle, maxTokens: maxResp}, nil
default:
return nil, fmt.Errorf("unsupported provider %q (use \"anthropic\" or \"openai\")", opts.Provider)
return nil, fmt.Errorf("unsupported provider %q (use \"anthropic\", \"openai\", or \"claude-cli\")", opts.Provider)
}
}

Expand Down Expand Up @@ -281,3 +289,38 @@ func (c *openaiClient) Complete(ctx context.Context, systemPrompt, userContent s

return result.Choices[0].Message.Content, nil
}

// --- Claude CLI client ---

// claudeCLIClient invokes the "claude" CLI for completions.
// This is useful when the CLI is already authenticated (e.g. via a company
// subscription) and no explicit API key is needed.
type claudeCLIClient struct {
	model string // model alias passed to the CLI via --model (e.g. "sonnet")
}

// Provider returns the provider identifier, "claude-cli".
func (c *claudeCLIClient) Provider() string { return "claude-cli" }

// ModelName returns the configured model alias.
func (c *claudeCLIClient) ModelName() string { return c.model }

// Complete runs a single non-interactive completion through the locally
// installed "claude" binary and returns its trimmed stdout.
//
// The prompt is passed as the final positional argument; the optional system
// prompt is forwarded via --system-prompt. On failure, stderr output is
// appended to the wrapped error to aid debugging. Cancellation of ctx
// terminates the child process (via exec.CommandContext).
func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
	cliArgs := []string{"-p", "--output-format", "text", "--model", c.model}
	if systemPrompt != "" {
		cliArgs = append(cliArgs, "--system-prompt", systemPrompt)
	}
	cliArgs = append(cliArgs, userContent)

	var out, errOut bytes.Buffer
	cmd := exec.CommandContext(ctx, "claude", cliArgs...)
	cmd.Stdout = &out
	cmd.Stderr = &errOut

	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("claude CLI failed: %w: %s", err, errOut.String())
	}

	return strings.TrimSpace(out.String()), nil
}
37 changes: 37 additions & 0 deletions judge/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,43 @@ import (
"testing"
)

// TestClaudeCLIClientDefaults verifies that the claude-cli provider reports
// its name correctly and falls back to the "sonnet" model when none is given.
func TestClaudeCLIClientDefaults(t *testing.T) {
	c, err := NewClient(ClientOptions{Provider: "claude-cli"})
	if err != nil {
		t.Fatalf("NewClient: %v", err)
	}
	if got := c.Provider(); got != "claude-cli" {
		t.Errorf("Provider() = %q, want %q", got, "claude-cli")
	}
	if got := c.ModelName(); got != "sonnet" {
		t.Errorf("ModelName() = %q, want %q", got, "sonnet")
	}
}

// TestClaudeCLIClientCustomModel verifies that an explicit Model option
// overrides the claude-cli default.
func TestClaudeCLIClientCustomModel(t *testing.T) {
	c, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"})
	if err != nil {
		t.Fatalf("NewClient: %v", err)
	}
	if got := c.ModelName(); got != "opus" {
		t.Errorf("ModelName() = %q, want %q", got, "opus")
	}
}

// TestClaudeCLINoAPIKeyRequired verifies that claude-cli is exempt from the
// API-key requirement that still applies to the other providers.
func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
	// claude-cli authenticates through the local CLI, so no key is needed.
	if _, err := NewClient(ClientOptions{Provider: "claude-cli"}); err != nil {
		t.Fatalf("expected no error without API key for claude-cli, got: %v", err)
	}

	// Other providers still require it.
	if _, err := NewClient(ClientOptions{Provider: "anthropic"}); err == nil {
		t.Fatal("expected error without API key for anthropic")
	}
}

func TestUseMaxCompletionTokens(t *testing.T) {
tests := []struct {
model string
Expand Down
14 changes: 14 additions & 0 deletions judge/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ func ExampleNewClient() {
// Provider: anthropic, Model: claude-sonnet-4-5-20250929
}

// ExampleNewClient_claudeCLI constructs a claude-cli client, showing that no
// API key is required and that the model defaults to "sonnet".
func ExampleNewClient_claudeCLI() {
	cli, err := judge.NewClient(judge.ClientOptions{
		Provider: "claude-cli",
		// Model defaults to "sonnet"; no API key needed
	})
	if err != nil {
		panic(err)
	}

	fmt.Printf("Provider: %s, Model: %s\n", cli.Provider(), cli.ModelName())
	// Output:
	// Provider: claude-cli, Model: sonnet
}

func ExampleNewClient_openai() {
client, err := judge.NewClient(judge.ClientOptions{
Provider: "openai",
Expand Down
Loading