diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 00000000..34393dae
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Git pre-push hook — runs skill eval for changed skills (non-blocking)
+#
+# Install shared hooks from the repo root: make setup-githooks
+# Enable eval on push: export RUN_EVAL=1
+#
+# The eval is started in the background and this hook always exits 0, so a
+# failed lookup below degrades or skips the eval instead of rejecting the
+# push. That is why there is deliberately no `set -e` here.
+set -uo pipefail
+
+if [[ "${RUN_EVAL:-0}" == "1" ]]; then
+  REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null)" || REPO_ROOT=""
+  EVAL_SHA="$(git rev-parse HEAD 2>/dev/null)" || EVAL_SHA=""
+  EVAL_SCRIPT="${REPO_ROOT}/scripts/eval-skill.sh"
+
+  if [[ -z "${REPO_ROOT}" || -z "${EVAL_SHA}" || ! -x "${EVAL_SCRIPT}" ]]; then
+    echo "Skill eval skipped: ${EVAL_SCRIPT} is not runnable here" >&2
+    exit 0
+  fi
+
+  # Extract owner/repo from origin remote (SSH or HTTPS). Left empty when
+  # origin is missing or is not a github.com URL; eval-skill.sh then skips
+  # GitHub reporting.
+  EVAL_REPO="$(git remote get-url origin 2>/dev/null \
+    | sed -E -e 's#^(https?://github\.com/|ssh://git@github\.com/|git@github\.com:)##' \
+      -e 's#\.git$##')" || EVAL_REPO=""
+  [[ "${EVAL_REPO}" =~ ^[^/:@]+/[^/:@]+$ ]] || EVAL_REPO=""
+
+  # mktemp rather than a fixed name: no predictable path in a shared /tmp and
+  # no clobbering the log of an earlier push of the same commit.
+  EVAL_LOG="$(mktemp "${TMPDIR:-/tmp}/skill-eval-${EVAL_SHA:0:8}.XXXXXX")" || {
+    echo "Skill eval skipped: cannot create a log file" >&2
+    exit 0
+  }
+
+  echo "Skill eval will run in the background (log: ${EVAL_LOG})"
+
+  # stdin is detached so the job cannot read the ref list git feeds this hook.
+  nohup env \
+    EVAL_SHA="${EVAL_SHA}" \
+    EVAL_REPO="${EVAL_REPO}" \
+    "${EVAL_SCRIPT}" \
+    </dev/null >"${EVAL_LOG}" 2>&1 &
+  disown 2>/dev/null || true
+fi
+exit 0
diff --git a/.gitignore b/.gitignore
index 1524e53b..309454d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .reports
+eval/runs/
 .mcp.json
 !plugins/**/.mcp.json
 .DS_Store
diff --git a/README.md b/README.md
index 8c97ddc0..e817fcab 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,10 @@ cd edge-tooling
 make setup-githooks
 ```
 
-`make setup-githooks` installs a pre-commit hook that runs markdownlint on staged `.md` files.
+`make setup-githooks` installs shared git hooks:
+
+- **pre-commit** — runs markdownlint on staged `.md` files
+- **pre-push** — optionally evaluates changed skills via the agent-eval-harness (`RUN_EVAL=1 git push`)
 
 Prerequisites: Node.js (for markdownlint), Python 3, Bash.
 
diff --git a/scripts/eval-skill.sh b/scripts/eval-skill.sh
new file mode 100755
index 00000000..02ecf782
--- /dev/null
+++ b/scripts/eval-skill.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+#
+# Evaluate every skill whose SKILL.md the current user changed on this branch.
+#
+# For each changed skill this drives three `claude -p` slash commands
+# (/eval-analyze, /eval-dataset, /eval-run), prints a summary and, when
+# EVAL_SHA and EVAL_REPO are set, reports the outcome to GitHub through `gh`.
+#
+# Environment:
+#   EVAL_MODEL           model passed to /eval-run (default: claude-sonnet-4-6)
+#   BASE_BRANCH          ref the branch is compared against (default: origin/main)
+#   AGENT_EVAL_RUNS_DIR  directory searched for analysis.md (default: eval/runs)
+#   EVAL_SHA, EVAL_REPO  commit and owner/repo to report to (set by the pre-push
+#                        hook); GitHub reporting is off unless both are set
+set -euo pipefail
+
+EVAL_MODEL="${EVAL_MODEL:-claude-sonnet-4-6}"
+BASE_BRANCH="${BASE_BRANCH:-origin/main}"
+EVAL_WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/skill-eval-XXXXXX")"
+EVAL_RUNS_DIR="${AGENT_EVAL_RUNS_DIR:-eval/runs}"
+
+# Hidden marker that tags the PR comment owned by this script.
+REPORT_MARKER="<!-- skill-eval-report -->"
+
+# 0 while a "pending" commit status is open and still needs a final state.
+EVAL_STATUS_POSTED=1
+
+# EXIT handler: closes a still-open "pending" status as "error", then removes
+# the scratch directory.
+on_exit() {
+  if [[ "${EVAL_STATUS_POSTED}" -eq 0 ]]; then
+    post_status "error" "Eval crashed unexpectedly"
+  fi
+  rm -rf -- "${EVAL_WORKDIR:?}"
+}
+trap on_exit EXIT
+
+# ── Commit status helper ───────────────────────────────────
+# Posts GitHub commit status when EVAL_SHA and EVAL_REPO are
+# set (by the pre-push hook). No-op on manual invocation.
+#   $1  state          $2  description
+post_status() {
+  local state="$1"
+  local description="$2"
+
+  [[ -z "${EVAL_SHA:-}" || -z "${EVAL_REPO:-}" ]] && return 0
+  command -v gh >/dev/null 2>&1 || return 0
+
+  # Best effort: an API failure must not fail the eval. The response body is
+  # discarded so it does not end up in the middle of the report.
+  gh api "repos/${EVAL_REPO}/statuses/${EVAL_SHA}" \
+    -f state="${state}" \
+    -f context="skill-eval" \
+    -f description="${description}" \
+    >/dev/null 2>&1 || true
+}
+
+# ── PR comment helper ──────────────────────────────────────
+# Posts eval report as a PR comment on the current branch's PR.
+# The comment carries REPORT_MARKER; a later run updates that comment in place
+# rather than using `gh pr comment --edit-last`, which edits the user's most
+# recent comment on the PR whatever it contains.
+#   $1  path of the report file
+post_pr_comment() {
+  local report_file="$1"
+
+  [[ -z "${EVAL_SHA:-}" || -z "${EVAL_REPO:-}" ]] && return 0
+  [[ ! -f "${report_file}" ]] && return 0
+  command -v gh >/dev/null 2>&1 || return 0
+
+  local body
+  body="${REPORT_MARKER}
+## Skill Eval Report
+
+$(cat "${report_file}")"
+
+  # URL of the newest marked comment written by the authenticated user, if any.
+  local comment_url
+  comment_url="$(gh pr view --json comments --jq \
+    "[.comments[] | select(.viewerDidAuthor and (.body | startswith(\"${REPORT_MARKER}\"))) | .url] | last // empty" \
+    2>/dev/null)" || comment_url=""
+
+  # A comment URL ends in <owner>/<repo>/pull/<n>#issuecomment-<id>.
+  local url_re='^https://[^/]+/([^/]+/[^/]+)/pull/[0-9]+#issuecomment-([0-9]+)$'
+  if [[ "${comment_url}" =~ ${url_re} ]]; then
+    gh api -X PATCH \
+      "repos/${BASH_REMATCH[1]}/issues/comments/${BASH_REMATCH[2]}" \
+      -f body="${body}" >/dev/null 2>&1 && return 0
+  fi
+
+  gh pr comment --body "${body}" 2>/dev/null || true
+}
+
+echo "╔══════════════════════════════════════════════╗"
+echo "║ Skill Eval — scanning for changed skills ║"
+echo "╚══════════════════════════════════════════════╝"
+echo ""
+
+# ── Step 1: find changed SKILL.md files ─────────────────────
+# Scope to the current user's commits only — other people's merged
+# work on the branch should not trigger eval.
+if ! git rev-parse --verify --quiet "${BASE_BRANCH}^{commit}" >/dev/null; then
+  echo "Base ref '${BASE_BRANCH}' not found; fetch it or set BASE_BRANCH." >&2
+  exit 0
+fi
+
+GIT_USER="$(git config user.name || true)"
+if [[ -z "${GIT_USER}" ]]; then
+  echo "git user.name is not set; cannot select your commits." >&2
+  exit 1
+fi
+
+# --fixed-strings: the name is matched literally, not as a regex. Paths that no
+# longer exist in HEAD (deleted or renamed skills) are dropped, since there is
+# nothing left to evaluate.
+CHANGED_SKILLS=()
+while IFS= read -r changed_path; do
+  git cat-file -e "HEAD:${changed_path}" 2>/dev/null || continue
+  CHANGED_SKILLS+=("${changed_path}")
+done < <(
+  git log --fixed-strings --author="${GIT_USER}" --pretty=format: --name-only \
+    "${BASE_BRANCH}"..HEAD 2>/dev/null \
+    | grep '/skills/.*SKILL\.md$' \
+    | sort -u \
+    || true
+)
+
+if [[ ${#CHANGED_SKILLS[@]} -eq 0 ]]; then
+  echo "No skill files changed, nothing to evaluate."
+  exit 0
+fi
+
+echo "Changed skills:"
+printf ' • %s\n' "${CHANGED_SKILLS[@]}"
+echo ""
+
+# From here on an unexpected exit is reported as "error" by on_exit.
+EVAL_STATUS_POSTED=0
+post_status "pending" "Evaluating ${#CHANGED_SKILLS[@]} skill(s)..."
+
+# ── Step 2: for each changed skill, analyze → generate → run ─
+TOTAL=0
+PASSED=0
+FAILED=0
+REPORT_FILE="${EVAL_WORKDIR}/summary.txt"
+LAST_ANALYSIS=""
+
+for skill_file in "${CHANGED_SKILLS[@]}"; do
+  TOTAL=$((TOTAL + 1))
+
+  # Extract plugin and skill name from path
+  # e.g. plugins/two-node/skills/cluster-diagnostic/SKILL.md
+  #   plugin    = second path component       (two-node)
+  #   skill_dir = directory holding SKILL.md  (cluster-diagnostic)
+  plugin="${skill_file#*/}"
+  plugin="${plugin%%/*}"
+  skill_dir="${skill_file%/*}"
+  skill_dir="${skill_dir##*/}"
+  skill_name="${plugin}:${skill_dir}"
+
+  config="${EVAL_WORKDIR}/${plugin}-${skill_dir}.yaml"
+  cases_dir="${EVAL_WORKDIR}/${plugin}-${skill_dir}/cases"
+  mkdir -p "${cases_dir}"
+
+  echo "════════════════════════════════════════════"
+  echo "Evaluating: ${skill_name}"
+  echo " Skill file: ${skill_file}"
+  echo " Temp config: ${config}"
+  echo "════════════════════════════════════════════"
+
+  # Step 2a: Analyze — generate eval config with judges
+  echo "[1/3] Analyzing skill..."
+  if ! claude -p "/eval-analyze --skill ${skill_name} --config ${config}" 2>&1; then
+    echo " ⚠ eval-analyze failed for ${skill_name}, skipping."
+    FAILED=$((FAILED + 1))
+    echo "SKIP ${skill_name} (eval-analyze failed)" >> "${REPORT_FILE}"
+    continue
+  fi
+
+  # Step 2b: Generate — create test scenarios
+  echo "[2/3] Generating test scenarios..."
+  if ! claude -p "/eval-dataset --config ${config}" 2>&1; then
+    echo " ⚠ eval-dataset failed for ${skill_name}, skipping."
+    FAILED=$((FAILED + 1))
+    echo "SKIP ${skill_name} (eval-dataset failed)" >> "${REPORT_FILE}"
+    continue
+  fi
+
+  # Step 2c: Run — execute eval and score
+  echo "[3/3] Running evaluation..."
+  if claude -p "/eval-run --model ${EVAL_MODEL} --config ${config}" 2>&1; then
+    PASSED=$((PASSED + 1))
+    echo "PASS ${skill_name}" >> "${REPORT_FILE}"
+  else
+    FAILED=$((FAILED + 1))
+    echo "FAIL ${skill_name}" >> "${REPORT_FILE}"
+  fi
+
+  # Find the most recent analysis.md for this run. `|| true`: a missing runs
+  # directory or config makes find fail, and under `set -e` with pipefail that
+  # would abort the whole script. "Most recent" relies on run paths sorting
+  # chronologically — TODO confirm against the harness's directory naming.
+  latest_run="$(find "${EVAL_RUNS_DIR}" -name "analysis.md" -newer "${config}" 2>/dev/null \
+    | sort | tail -n 1 || true)"
+  if [[ -n "${latest_run}" ]]; then
+    LAST_ANALYSIS="${latest_run}"
+  fi
+
+  echo ""
+done
+
+# ── Step 3: print summary report ────────────────────────────
+echo ""
+echo "╔══════════════════════════════════════════════╗"
+echo "║ Skill Eval Report ║"
+echo "╠══════════════════════════════════════════════╣"
+printf "║ Skills evaluated: %-24s ║\n" "${TOTAL}"
+printf "║ Passed: %-24s ║\n" "${PASSED}"
+printf "║ Failed/Skipped: %-24s ║\n" "${FAILED}"
+echo "╠══════════════════════════════════════════════╣"
+
+if [[ -f "${REPORT_FILE}" ]]; then
+  while IFS= read -r line; do
+    printf "║ %-43s ║\n" "${line}"
+  done < "${REPORT_FILE}"
+fi
+echo "╚══════════════════════════════════════════════╝"
+
+# ── Step 4: post results to GitHub ──────────────────────────
+if [[ "${FAILED}" -gt 0 ]]; then
+  post_status "failure" "${PASSED}/${TOTAL} skills passed, ${FAILED} failed"
+else
+  post_status "success" "${PASSED}/${TOTAL} skills passed"
+fi
+# The final state is posted; a later failure must not overwrite it with "error".
+EVAL_STATUS_POSTED=1
+
+if [[ -n "${LAST_ANALYSIS}" ]]; then
+  post_pr_comment "${LAST_ANALYSIS}"
+fi
+
+# Always exit 0 — non-blocking
+exit 0