sonesuke · sonesuke · Mar 3, 2026 · Mar 2, 2026
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -0,0 +1,22 @@
+{
+  "title": "arXiv CLI",
+  "id": "com.sonesuke.arxiv-cli",
+  "description": "CLI tool for searching and fetching papers from arXiv with Cypher query support",
+  "icon": "https://github.com/sonesuke.png",
+  "author": {
+    "name": "sonesuke",
+    "contact": "https://github.com/sonesuke"
+  },
+  "license": "MIT",
+  "categories": ["Developer Tools", "Research"],
+  "tags": ["arxiv", "research", "papers", "academic", "search"],
+  "readme": "https://github.com/sonesuke/arxiv-cli/blob/main/README.md",
+  "homepage": "https://github.com/sonesuke/arxiv-cli",
+  "repository": "https://github.com/sonesuke/arxiv-cli",
+  "references": [
+    {
+      "type": "github",
+      "url": "https://github.com/sonesuke/arxiv-cli"
+    }
+  ]
+}
diff --git a/AGENTS.md b/AGENTS.md
@@ -49,3 +49,62 @@ mise.toml               # Task definitions (fmt, clippy, test, pre-commit)
 | `mise run test` | Run tests with `cargo test` |
 | `mise run pre-commit` | Run all of the above |
 | `mise run coverage` | Measure code coverage (including subprocesses) |
+
+## Skill-Bench Testing Framework
+
+Located in `agents/skill-bench/`, this framework tests the Claude Code Plugin skills.
+
+### Structure
+
+```
+agents/skill-bench/
+  runner.sh           # Test runner
+  cases/              # Test case definitions (TOML format)
+    arxiv-search/
+      triggering.toml
+      functional.toml
+      functional-with-limit.toml
+    arxiv-fetch/
+      triggering.toml
+      functional.toml
+  tools/              # Check scripts
+    check-mcp-loaded.sh
+    check-mcp-success.sh
+    check-skill-invoked.sh
+    check-skill-loaded.sh
+    check-param.sh
+    check-workspace.sh
+```
+
+### Test Cases
+
+Each test case is defined in TOML format:
+
+```toml
+description = "Test description"
+check = "check-script-name.sh"
+
+[test_prompt]
+text = "The prompt that should trigger the skill"
+
+[[tool_calls]]
+name = "tool_name"
+arguments = { param = "value" }
+```
+
+### Running Tests
+
+```bash
+# Run all tests
+cd agents/skill-bench
+./runner.sh
+
+# Run specific skill tests
+./runner.sh "arxiv-search"
+./runner.sh "arxiv-fetch"
+
+# Run multiple trials
+./runner.sh "*" trials=3
+```
+
+**Note:** Test prompts must be in English to ensure consistent skill triggering.
diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ An AI-ready search and fetch tool for arXiv papers, designed for both humans and
 - **Headless mode** by default; use `--head` to show the browser.
 - **Model Context Protocol (MCP)** support to integrate with AI agents.
 - **Cypher query support**: Query search results with Cypher (graph query language).
+- **Claude Code Plugin**: Skills for searching and fetching papers directly from Claude.
 - **Robust formatting**: Uses structured JSON for easy machine consumption.
 
 ## Installation
@@ -128,6 +129,37 @@ Add this to your `claude_desktop_config.json`:
 }
 ```
 
+## Claude Code Plugin
+
+The arxiv-cli Claude Code Plugin provides skills for searching and fetching papers directly from Claude.
+
+### Available Skills
+
+| Skill | Description |
+|-------|-------------|
+| `arxiv-search` | Search arXiv for papers matching a query |
+| `arxiv-fetch` | Fetch details of a specific paper by arXiv ID |
+
+### Usage
+
+```
+arxiv-search "LLM" 10
+arxiv-fetch "2301.00001"
+```
+
+### Plugin Structure
+
+```
+.claude-plugin/
+  marketplace.json          # Marketplace configuration
+claude-plugin/
+  .claude-plugin/
+    plugin.json             # Plugin definition with MCP servers
+  skills/
+    arxiv-search/SKILL.md   # Skill definition
+    arxiv-fetch/SKILL.md    # Skill definition
+```
+
 ## CLI Usage
 
 ### CLI Commands

diff --git a/agents/skill-bench/.gitignore b/agents/skill-bench/.gitignore
@@ -0,0 +1,3 @@
+# Test results
+*.log
+results/
diff --git a/agents/skill-bench/cases/arxiv-fetch/functional.toml b/agents/skill-bench/cases/arxiv-fetch/functional.toml
@@ -0,0 +1,9 @@
+description = "Test basic arxiv-fetch functionality"
+check = "check-mcp-success.sh"
+
+[test_prompt]
+text = "Fetch the paper with arXiv ID 2301.00001"
+
+[[tool_calls]]
+name = "fetch_paper"
+arguments = { id = "2301.00001" }
diff --git a/agents/skill-bench/cases/arxiv-fetch/triggering.toml b/agents/skill-bench/cases/arxiv-fetch/triggering.toml
@@ -0,0 +1,9 @@
+description = "Verify arxiv-fetch skill is triggered when fetching a paper"
+check = "check-skill-invoked.sh"
+
+[test_prompt]
+text = "Use arxiv-fetch to get paper 2301.00001"
+
+[[tool_calls]]
+name = "arxiv-fetch"
+arguments = { arxiv_id = "2301.00001" }
diff --git a/agents/skill-bench/cases/arxiv-search/functional-with-limit.toml b/agents/skill-bench/cases/arxiv-search/functional-with-limit.toml
@@ -0,0 +1,9 @@
+description = "Test arxiv-search with custom limit parameter"
+check = "check-mcp-success.sh"
+
+[test_prompt]
+text = "Use arxiv-search to find 20 papers about machine learning"
+
+[[tool_calls]]
+name = "search_papers"
+arguments = { query = "machine learning", limit = 20 }
diff --git a/agents/skill-bench/cases/arxiv-search/functional.toml b/agents/skill-bench/cases/arxiv-search/functional.toml
@@ -0,0 +1,9 @@
+description = "Test basic arxiv-search functionality with query and limit"
+check = "check-mcp-success.sh"
+
+[test_prompt]
+text = "Search arXiv for papers about quantum computing, limit to 5 results"
+
+[[tool_calls]]
+name = "search_papers"
+arguments = { query = "quantum computing", limit = 5 }
diff --git a/agents/skill-bench/cases/arxiv-search/triggering.toml b/agents/skill-bench/cases/arxiv-search/triggering.toml
@@ -0,0 +1,9 @@
+description = "Verify arxiv-search skill is triggered when searching for papers"
+check = "check-skill-invoked.sh"
+
+[test_prompt]
+text = "Use arxiv-search to find papers about LLM"
+
+[[tool_calls]]
+name = "arxiv-search"
+arguments = { query = "LLM" }
diff --git a/agents/skill-bench/runner.sh b/agents/skill-bench/runner.sh
@@ -0,0 +1,199 @@
+#!/usr/bin/env bash
+# Skill-Bench Test Runner
+# Executes test cases and evaluates results
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CASES_DIR="$SCRIPT_DIR/cases"
+TOOLS_DIR="$SCRIPT_DIR/tools"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Test results
+PASSED=0
+FAILED=0
+SKIPPED=0
+
+# Print command-line help for this runner to stdout
+usage() {
+    printf '%s\n' "Usage: $0 [<case-pattern>] [trials=<n>]" \
+        "" \
+        "Arguments:" \
+        "  case-pattern  - Glob pattern for test cases (default: \"*\")" \
+        "  trials=n      - Number of trials to run (default: 1)" \
+        "" \
+        "Examples:" \
+        "  $0                           # Run all test cases once" \
+        "  $0 \"arxiv-search\"           # Run arxiv-search test cases" \
+        "  $0 \"*\" trials=3             # Run all test cases 3 times"
+}
+
+# Parse arguments
+CASE_PATTERN="*"
+TRIALS=1
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        *=*)
+            if [[ $1 == trials=* ]]; then
+                TRIALS="${1#trials=}"
+            else
+                echo "Unknown parameter: $1" >&2
+                usage
+                exit 1
+            fi
+            ;;
+        -*)
+            echo "Unknown option: $1" >&2
+            usage
+            exit 1
+            ;;
+        *)
+            CASE_PATTERN="$1"
+            ;;
+    esac
+    shift
+done
+
+# Load test case from TOML file
+load_case() {
+    local case_file="$1"
+    bash -c '
+import toml
+import sys
+data = toml.load(sys.argv[1])
+print("test_prompt=" + data.get("test_prompt", ""))
+print("tool_calls=" + str(len(data.get("tool_calls", []))))
+print("check=" + data.get("check", ""))
+print("description=" + data.get("description", ""))
+for i, tc in enumerate(data.get("tool_calls", [])):
+    print("tool_" + str(i) + "_name=" + tc.get("name", ""))
+    print("tool_" + str(i) + "_arguments=" + str(tc.get("arguments", {})))
+' python3 "$case_file"
+}
+
+# Print the value stored under key $2 in the key=value text held by the variable named $1
+get_value() {
+    local -n case_data=$1
+    grep "^$2=" <<<"${case_data}" | cut -d'=' -f2-
+}
+
+# Run one trial of a test case: load it, run its check script, and record pass/fail
+# Arguments: $1 - path to the case TOML file; $2 - trial number (display only)
+# Globals: reads TOOLS_DIR and TRIALS, increments PASSED or FAILED. Returns 0 on pass, 1 on fail.
+run_trial() {
+    local case_file="$1"
+    local trial_num="$2"
+
+    # Load test case; a case that cannot be loaded counts as a failed trial
+    local loaded_data
+    if ! loaded_data=$(load_case "$case_file"); then
+        echo -e "${RED}FAIL: Could not load test case: $case_file${NC}"
+        FAILED=$((FAILED + 1))
+        return 1
+    fi
+
+    local i test_prompt tool_calls_count check_script description
+    test_prompt=$(get_value loaded_data "test_prompt")
+    tool_calls_count=$(get_value loaded_data "tool_calls")
+    check_script=$(get_value loaded_data "check")
+    description=$(get_value loaded_data "description")
+
+    # Parse tool calls
+    local -a tool_names=() tool_args=()
+    for ((i = 0; i < tool_calls_count; i++)); do
+        tool_names[i]=$(get_value loaded_data "tool_${i}_name")
+        tool_args[i]=$(get_value loaded_data "tool_${i}_arguments")
+    done
+
+    local case_name
+    case_name=$(basename "$(dirname "$case_file")")
+
+    echo -e "\n${YELLOW}Running: $case_name${NC}"
+    echo "Description: $description"
+    echo "Trial: $trial_num/$TRIALS"
+    echo "Test prompt: $test_prompt"
+
+    # Execute check script
+    local check_script_path="$TOOLS_DIR/$check_script"
+    if [[ ! -f "$check_script_path" ]]; then
+        echo -e "${RED}FAIL: Check script not found: $check_script${NC}"
+        FAILED=$((FAILED + 1))  # not ((FAILED++)): that returns status 1 when FAILED is 0 and trips set -e
+        return 1
+    fi
+
+    # Run check with test prompt and expected tool calls; `|| ...=$?` records a failure instead of aborting under set -e
+    local check_output check_exit_code=0
+    check_output=$("$check_script_path" "$test_prompt" "${tool_names[@]}" "${tool_args[@]}" 2>&1) || check_exit_code=$?
+
+    if [[ $check_exit_code -eq 0 ]]; then
+        echo -e "${GREEN}PASS${NC}"
+        PASSED=$((PASSED + 1))
+        return 0
+    fi
+    echo -e "${RED}FAIL${NC}"
+    echo "$check_output"
+    FAILED=$((FAILED + 1))
+    return 1
+}
+
+# Run every trial (1..TRIALS) of one test case; $1 is the path to its TOML file
+run_case() {
+    local case_file="$1"
+    local trial
+    for ((trial = 1; trial <= TRIALS; trial++)); do
+        run_trial "$case_file" "$trial" || true  # run_trial counts its own failures; do not let set -e abort the run
+    done
+}
+
+# Print, one per line, every *.toml under CASES_DIR whose path matches */CASE_PATTERN/*
+find_cases() {
+    find "$CASES_DIR" -path "*/$CASE_PATTERN/*" -name "*.toml"
+}
+
+# Main
+echo "======================================"
+echo "Skill-Bench Test Runner"
+echo "======================================"
+echo "Case pattern: $CASE_PATTERN"
+echo "Trials: $TRIALS"
+echo ""
+
+# Find and run test cases
+local cases
+cases=()
+while IFS= read -r -d '' case; do
+    cases+=("$case")
+done < <(find "$CASES_DIR" -name "*.toml" -path "*/$CASE_PATTERN/*" -print0)
+
+if [[ ${#cases[@]} -eq 0 ]]; then
+    echo "No test cases found matching pattern: $CASE_PATTERN"
+    exit 1
+fi
+
+for case in "${cases[@]}"; do
+    run_case "$case"
+done
+
+# Summary
+echo ""
+echo "======================================"
+echo "Summary"
+echo "======================================"
+echo "Passed: $PASSED"
+echo "Failed: $FAILED"
+echo "Skipped: $SKIPPED"
+echo ""
+
+if [[ $FAILED -gt 0 ]]; then
+    echo -e "${RED}Some tests failed${NC}"
+    exit 1
+else
+    echo -e "${GREEN}All tests passed${NC}"
+    exit 0
+fi
diff --git a/agents/skill-bench/tools/check-mcp-loaded.sh b/agents/skill-bench/tools/check-mcp-loaded.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Check if MCP server is loaded
+
+set -euo pipefail
+
+TEST_PROMPT="$1"
+shift
+
+echo "Checking MCP server loaded..."
+
+# This would check if the MCP server is properly loaded
+# For now, we assume it's always loaded in the test environment
+echo "MCP server check: OK"