diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4620cc05..5dfbb6eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -265,6 +265,9 @@ jobs: - name: Run Cowork plugin drift tests run: ./tests/test-cowork-drift.sh + - name: Run skill graduation tests + run: ./tests/test-skill-graduations.sh + # Clean up old bot comments on PR push (keeps PRs tidy) # Also runs on workflow_dispatch (no-op) so branch protection doesn't block auto-merge. cleanup-old-comments: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e390551c..205c899a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -56,6 +56,7 @@ Thank you for your interest in improving the SDLC Wizard! ./tests/test-agents-md-interop.sh && \ ./tests/test-self-pr-review-skip.sh && \ ./tests/test-cowork-drift.sh && \ + ./tests/test-skill-graduations.sh && \ ./tests/e2e/run-simulation.sh && \ ./tests/e2e/test-deterministic-checks.sh && \ ./tests/e2e/test-scenario-rotation.sh && \ @@ -205,6 +206,7 @@ python3 -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))" ./tests/test-degradation-detection.sh ./tests/test-local-shepherd.sh ./tests/test-cowork-drift.sh +./tests/test-skill-graduations.sh ./tests/e2e/run-simulation.sh ./tests/e2e/test-deterministic-checks.sh ./tests/e2e/test-scenario-rotation.sh diff --git a/cowork/skills/sdlc/SKILL.md b/cowork/skills/sdlc/SKILL.md index 711b079b..936c5c16 100644 --- a/cowork/skills/sdlc/SKILL.md +++ b/cowork/skills/sdlc/SKILL.md @@ -102,7 +102,11 @@ State your confidence before presenting an approach: | FAILED 2x | Something's wrong | Codex for fresh perspective; if still stuck, STOP | **`/effort max` now** | | CONFUSED | Can't diagnose | Codex; if still confused, STOP and describe | **`/effort max` now** | -**Dynamic effort bumping is NOT optional.** "Consider max effort" is the same as "ignore this." Bump BEFORE the next attempt, not after a third failure. 
+**Effort bumping is NOT optional.** Bump BEFORE the next attempt, not after a third failure. + +**Confidence ramp:** Opus researches → Fable batch review → 95% list → /goal TDD → Codex check. + +**Advisor:** `advisor()` before plans; if down, spawn Fable subagent. ## Plan Mode @@ -132,7 +136,7 @@ The loop goes back to PLANNING, not TDD RED. Run `/code-review`; issues at confi ## Cross-Model Review (REQUIRED for High-Stakes) -**When to run:** high-stakes changes (auth, payments, data), releases/publishes, complex refactors. **When to skip (log justification):** trivial, hotfixes, risk < review cost. **Prerequisites:** Codex CLI (`npm i -g @openai/codex`) + OpenAI API key. **Reviewer at flagship tier (#233):** even on `opusplan` (Sonnet driver), reviewer runs `gpt-5.5` xhigh — adversarial diversity is the point. +**When to run:** high-stakes changes (auth, payments, data), releases/publishes, complex refactors. **When to skip (log justification):** trivial, hotfixes, risk < review cost. **Prerequisites:** Codex CLI + OpenAI API key. **Reviewer:** `gpt-5.5` xhigh — adversarial diversity. PROTOCOL is universal across domains; only `review_instructions` and `verification_checklist` change. diff --git a/skills/sdlc/SKILL.md b/skills/sdlc/SKILL.md index 711b079b..936c5c16 100644 --- a/skills/sdlc/SKILL.md +++ b/skills/sdlc/SKILL.md @@ -102,7 +102,11 @@ State your confidence before presenting an approach: | FAILED 2x | Something's wrong | Codex for fresh perspective; if still stuck, STOP | **`/effort max` now** | | CONFUSED | Can't diagnose | Codex; if still confused, STOP and describe | **`/effort max` now** | -**Dynamic effort bumping is NOT optional.** "Consider max effort" is the same as "ignore this." Bump BEFORE the next attempt, not after a third failure. +**Effort bumping is NOT optional.** Bump BEFORE the next attempt, not after a third failure. + +**Confidence ramp:** Opus researches → Fable batch review → 95% list → /goal TDD → Codex check. 
#!/bin/bash
#
# Skill graduation tests.
#
# Verifies that skills/sdlc/SKILL.md documents the confidence ramp workflow
# and the advisor fallback, and that the file stays within its size budget.
#
# Usage:       ./tests/test-skill-graduations.sh
# Exit status: 0 when every check passes, 1 otherwise (including when the
#              skill file cannot be found).

# pipefail is deliberately left off: Test 3 uses a pipeline that ends in
# `grep -q`, which may stop reading early, and only that final grep's status
# should decide the result.
set -eu

PASS=0
FAIL=0
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
SKILL="$PROJECT_ROOT/skills/sdlc/SKILL.md"

# Print a passing result and bump the PASS counter.
# Arguments: $1 - description of the check
pass() { echo " PASS: $1"; PASS=$((PASS + 1)); }

# Print a failing result and bump the FAIL counter.
# Arguments: $1 - description of the check
fail() { echo " FAIL: $1"; FAIL=$((FAIL + 1)); }

# Succeeds only when a single "confidence ramp" line mentions both /goal and
# Codex. Filtering the same stream three times keeps the check on one line,
# instead of letting the two keywords match on different lines.
# Arguments: $1 - path of the file to inspect
ramp_line_has_goal_and_codex() {
  grep -i "confidence ramp" "$1" | grep -F -- "/goal" | grep -q "Codex"
}

# Fail fast with a clear message instead of letting every grep complain and
# `set -e` abort at the size check without printing a summary.
if [[ ! -f "$SKILL" ]]; then
  echo "ERROR: skill file not found: $SKILL" >&2
  exit 1
fi

echo "=== Skill Graduation Tests ==="
echo ""

# --- Confidence Ramp Pattern ---
echo "--- Confidence Ramp Pattern ---"

# Test 1: SKILL.md mentions the confidence ramp workflow
if grep -qi "confidence ramp" "$SKILL"; then
  pass "SKILL.md documents confidence ramp pattern"
else
  fail "SKILL.md missing confidence ramp pattern"
fi

# Test 2: Mentions Fable batch review as part of the ramp
# (-E: alternation is only defined for extended regular expressions)
if grep -qE "batch.*review|batch.*consult" "$SKILL"; then
  pass "confidence ramp includes batch review step"
else
  fail "confidence ramp missing batch review step"
fi

# Test 3: Confidence ramp line includes /goal and Codex check
if ramp_line_has_goal_and_codex "$SKILL"; then
  pass "confidence ramp includes /goal + Codex check"
else
  fail "confidence ramp missing /goal or Codex check on the ramp line"
fi

echo ""
echo "--- Advisor Auto-Fallback ---"

# Test 4: SKILL.md documents advisor fallback
if grep -qiE "advisor.*if down|advisor.*fallback|advisor.*unavailable|fallback.*advisor" "$SKILL"; then
  pass "SKILL.md documents advisor fallback"
else
  fail "SKILL.md missing advisor fallback"
fi

# Test 5: Fallback spawns Fable subagent
if grep -qE "Fable.*subagent|subagent.*Fable|spawn.*Fable|Fable.*fallback" "$SKILL"; then
  pass "advisor fallback uses Fable subagent"
else
  fail "advisor fallback missing Fable subagent instruction"
fi

echo ""
echo "--- Budget Check ---"

# Test 6: SKILL.md stays under 20K chars
# `wc -c` counts bytes, so multi-byte characters count more than once and the
# check is at least as strict as a true character count.
chars=$(wc -c < "$SKILL")
if [[ "$chars" -le 20000 ]]; then
  pass "SKILL.md is under 20K chars ($chars)"
else
  fail "SKILL.md exceeds 20K chars ($chars)"
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
if [[ "$FAIL" -eq 0 ]]; then
  exit 0
fi
exit 1