diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a7ff0d91..bc7755404 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +## [0.11.16.0] - 2026-03-24 — Ship With Teeth + +`/ship` and `/review` now actually enforce the quality gates they've been talking about. Coverage audit becomes a real gate (not just a diagram), plan completion gets verified against the diff, and verification steps from your plan run automatically. + +### Added + +- **Test coverage gate in /ship.** AI-assessed coverage below 60% is a hard stop. 60-79% gets a prompt. 80%+ passes. Thresholds are configurable per-project via `## Test Coverage` in CLAUDE.md. +- **Coverage warning in /review.** Low coverage is now flagged prominently before you reach the /ship gate, so you can write tests early. +- **Plan completion audit.** /ship reads your plan file, extracts every actionable item, cross-references against the diff, and shows you a DONE/NOT DONE/PARTIAL/CHANGED checklist. Missing items are a shipping blocker (with override). +- **Plan-aware scope drift detection.** /review's scope drift check now reads the plan file too — not just TODOS.md and PR description. +- **Auto-verification via /qa-only.** /ship reads your plan's verification section and runs /qa-only inline to test it — if a dev server is running on localhost. No server, no problem — it skips gracefully. +- **Shared plan file discovery.** Conversation context first, content-based grep fallback second. Used by plan completion, plan review reports, and verification. +- **Ship metrics logging.** Coverage %, plan completion ratio, and verification results are logged to review JSONL for /retro to track trends. +- **Plan completion in /retro.** Weekly retros now show plan completion rates across shipped branches. + ## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex ### Added diff --git a/VERSION b/VERSION index 446cced35..e36c939ee 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.15.0 +0.11.16.0 diff --git a/retro/SKILL.md b/retro/SKILL.md index 141605541..c8ca3d862 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -676,6 +676,28 @@ Narrative covering: - If prior retro exists and has `test_health`: show delta "Test count: {last} → {now} (+{delta})" - If test ratio < 20%: flag as growth area — "100% test coverage is the goal. Tests make vibe coding safe." +### Plan Completion +Check review JSONL logs for plan completion data from /ship runs this period: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" +cat ~/.gstack/projects/$SLUG/*-reviews.jsonl 2>/dev/null | grep '"skill":"ship"' | grep '"plan_items_total"' || echo "NO_PLAN_DATA" +``` + +If plan completion data exists within the retro time window: +- Count branches shipped with plans (entries that have `plan_items_total` > 0) +- Compute average completion: sum of `plan_items_done` / sum of `plan_items_total` +- Identify most-skipped item category if data supports it + +Output: +``` +Plan Completion This Period: + {N} branches shipped with plans + Average completion: {X}% ({done}/{total} items) +``` + +If no plan data exists, skip this section silently. + ### Focus & Highlights (from Step 8) - Focus score with interpretation diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index 57a3759a7..dae967ef4 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -460,6 +460,28 @@ Narrative covering: - If prior retro exists and has `test_health`: show delta "Test count: {last} → {now} (+{delta})" - If test ratio < 20%: flag as growth area — "100% test coverage is the goal. Tests make vibe coding safe." +### Plan Completion +Check review JSONL logs for plan completion data from /ship runs this period: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" +cat ~/.gstack/projects/$SLUG/*-reviews.jsonl 2>/dev/null | grep '"skill":"ship"' | grep '"plan_items_total"' || echo "NO_PLAN_DATA" +``` + +If plan completion data exists within the retro time window: +- Count branches shipped with plans (entries that have `plan_items_total` > 0) +- Compute average completion: sum of `plan_items_done` / sum of `plan_items_total` +- Identify most-skipped item category if data supports it + +Output: +``` +Plan Completion This Period: + {N} branches shipped with plans + Average completion: {X}% ({done}/{total} items) +``` + +If no plan data exists, skip this section silently. + ### Focus & Highlights (from Step 8) - Focus score with interpretation diff --git a/review/SKILL.md b/review/SKILL.md index 912e1f3ec..c957b60e5 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -297,7 +297,120 @@ Before reviewing code quality, check: **did they build what was requested — no **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR. 2. Identify the **stated intent** — what was this branch supposed to accomplish? 3. Run `git diff origin/...HEAD --stat` and compare the files changed against the stated intent. -4. Evaluate with skepticism: + +### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like `~/.claude/plans/*.md` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +```bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +``` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping." + +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** `- [ ] ...` or `- [x] ...` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (`## Context`, `## Background`, `## Problem`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (`## GSTACK REVIEW REPORT`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS + +### Cross-Reference Against Diff + +Run `git diff origin/...HEAD` and `git log origin/..HEAD --oneline` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed. + +### Output Format + +``` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +``` + +### Integration with Scope Drift Detection + +The plan completion results augment the existing Scope Drift Detection. If a plan file is found: + +- **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report. +- **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection. + +This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior). + +Update the scope drift output to include plan file context: + +``` +Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING] +Intent: +Plan: +Delivered: <1-line summary of what the diff actually does> +Plan items: N DONE, M PARTIAL, K NOT DONE +[If NOT DONE: list each missing item] +[If scope creep: list each out-of-scope change not in the plan] +``` + +**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only). + +4. Evaluate with skepticism (incorporating plan completion results if available): **SCOPE CREEP detection:** - Files changed that are unrelated to the stated intent @@ -606,6 +719,21 @@ If no test framework detected → include gaps as INFORMATIONAL findings only, n **Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit." +### Coverage Warning + +After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a `## Test Coverage` section with a `Minimum:` field. If not found, use default: 60%. + +If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings: + +``` +⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested. +Consider writing tests before running /ship. +``` + +This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate. + +If coverage percentage cannot be determined, skip the warning silently. + This step subsumes the "Test Gaps" category from Pass 2 — do not duplicate findings between the checklist Test Gaps item and this coverage diagram. Include any coverage gaps alongside the findings from Step 4 and Step 4.5. They follow the same Fix-First flow — gaps are INFORMATIONAL findings. --- diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 712b91a90..bb9a3bc73 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -46,7 +46,10 @@ Before reviewing code quality, check: **did they build what was requested — no **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR. 2. Identify the **stated intent** — what was this branch supposed to accomplish? 3. Run `git diff origin/...HEAD --stat` and compare the files changed against the stated intent. -4. Evaluate with skepticism: + +{{PLAN_COMPLETION_AUDIT_REVIEW}} + +4. Evaluate with skepticism (incorporating plan completion results if available): **SCOPE CREEP detection:** - Files changed that are unrelated to the stated intent diff --git a/scripts/resolvers/index.ts b/scripts/resolvers/index.ts index 95c6ea082..9e9b9596f 100644 --- a/scripts/resolvers/index.ts +++ b/scripts/resolvers/index.ts @@ -11,7 +11,7 @@ import { generateTestFailureTriage } from './preamble'; import { generateCommandReference, generateSnapshotFlags, generateBrowseSetup } from './browse'; import { generateDesignMethodology, generateDesignHardRules, generateDesignOutsideVoices, generateDesignReviewLite, generateDesignSketch } from './design'; import { generateTestBootstrap, generateTestCoverageAuditPlan, generateTestCoverageAuditShip, generateTestCoverageAuditReview } from './testing'; -import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview } from './review'; +import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview, generatePlanCompletionAuditShip, generatePlanCompletionAuditReview, generatePlanVerificationExec } from './review'; import { generateSlugEval, generateSlugSetup, generateBaseBranchDetect, generateDeployBootstrap, generateQAMethodology } from './utility'; export const RESOLVERS: Record string> = { @@ -41,4 +41,7 @@ export const RESOLVERS: Record string> = { ADVERSARIAL_STEP: generateAdversarialStep, DEPLOY_BOOTSTRAP: generateDeployBootstrap, CODEX_PLAN_REVIEW: generateCodexPlanReview, + PLAN_COMPLETION_AUDIT_SHIP: generatePlanCompletionAuditShip, + PLAN_COMPLETION_AUDIT_REVIEW: generatePlanCompletionAuditReview, + PLAN_VERIFICATION_EXEC: generatePlanVerificationExec, }; diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 2f355ef68..1831e098b 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -592,3 +592,236 @@ SOURCE = "codex" if Codex ran, "claude" if subagent ran. ---`; } + +// ─── Plan File Discovery (shared helper) ────────────────────────────── + +function generatePlanFileDiscovery(): string { + return `### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like \`~/.claude/plans/*.md\` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +\`\`\`bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +\`\`\` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping."`; +} + +// ─── Plan Completion Audit ──────────────────────────────────────────── + +type PlanCompletionMode = 'ship' | 'review'; + +function generatePlanCompletionAuditInner(mode: PlanCompletionMode): string { + const sections: string[] = []; + + // ── Plan file discovery (shared) ── + sections.push(generatePlanFileDiscovery()); + + // ── Item extraction ── + sections.push(` +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** \`- [ ] ...\` or \`- [x] ...\` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (\`## Context\`, \`## Background\`, \`## Problem\`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (\`## GSTACK REVIEW REPORT\`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS`); + + // ── Cross-reference against diff ── + sections.push(` +### Cross-Reference Against Diff + +Run \`git diff origin/...HEAD\` and \`git log origin/..HEAD --oneline\` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed.`); + + // ── Output format ── + sections.push(` +### Output Format + +\`\`\` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +\`\`\``); + + // ── Gate logic (mode-specific) ── + if (mode === 'ship') { + sections.push(` +### Gate Logic + +After producing the completion checklist: + +- **All DONE or CHANGED:** Pass. "Plan completion: PASS — all items addressed." Continue. +- **Only PARTIAL items (no NOT DONE):** Continue with a note in the PR body. Not blocking. +- **Any NOT DONE items:** Use AskUserQuestion: + - Show the completion checklist above + - "{N} items from the plan are NOT DONE. These were part of the original plan but are missing from the implementation." + - RECOMMENDATION: depends on item count and severity. If 1-2 minor items (docs, config), recommend B. If core functionality is missing, recommend A. + - Options: + A) Stop — implement the missing items before shipping + B) Ship anyway — defer these to a follow-up (will create P1 TODOs in Step 5.5) + C) These items were intentionally dropped — remove from scope + - If A: STOP. List the missing items for the user to implement. + - If B: Continue. For each NOT DONE item, create a P1 TODO in Step 5.5 with "Deferred from plan: {plan file path}". + - If C: Continue. Note in PR body: "Plan items intentionally dropped: {list}." + +**No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." + +**Include in PR body (Step 8):** Add a \`## Plan Completion\` section with the checklist summary.`); + } else { + // review mode + sections.push(` +### Integration with Scope Drift Detection + +The plan completion results augment the existing Scope Drift Detection. If a plan file is found: + +- **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report. +- **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection. + +This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior). + +Update the scope drift output to include plan file context: + +\`\`\` +Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING] +Intent: +Plan: +Delivered: <1-line summary of what the diff actually does> +Plan items: N DONE, M PARTIAL, K NOT DONE +[If NOT DONE: list each missing item] +[If scope creep: list each out-of-scope change not in the plan] +\`\`\` + +**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only).`); + } + + return sections.join('\n'); +} + +export function generatePlanCompletionAuditShip(_ctx: TemplateContext): string { + return generatePlanCompletionAuditInner('ship'); +} + +export function generatePlanCompletionAuditReview(_ctx: TemplateContext): string { + return generatePlanCompletionAuditInner('review'); +} + +// ─── Plan Verification Execution ────────────────────────────────────── + +export function generatePlanVerificationExec(_ctx: TemplateContext): string { + return `## Step 3.47: Plan Verification + +Automatically verify the plan's testing/verification steps using the \`/qa-only\` skill. + +### 1. Check for verification section + +Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: \`## Verification\`, \`## Test plan\`, \`## Testing\`, \`## How to test\`, \`## Manual testing\`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). + +**If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." +**If no plan file was found in Step 3.45:** Skip (already handled). + +### 2. Check for running dev server + +Before invoking browse-based verification, check if a dev server is reachable: + +\`\`\`bash +curl -s -o /dev/null -w '%{http_code}' http://localhost:3000 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:8080 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:5173 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:4000 2>/dev/null || echo "NO_SERVER" +\`\`\` + +**If NO_SERVER:** Skip with "No dev server detected — skipping plan verification. Run /qa separately after deploying." + +### 3. Invoke /qa-only inline + +Read the \`/qa-only\` skill from disk: + +\`\`\`bash +cat \${CLAUDE_SKILL_DIR}/../qa-only/SKILL.md +\`\`\` + +**If unreadable:** Skip with "Could not load /qa-only — skipping plan verification." + +Follow the /qa-only workflow with these modifications: +- **Skip the preamble** (already handled by /ship) +- **Use the plan's verification section as the primary test input** — treat each verification item as a test case +- **Use the detected dev server URL** as the base URL +- **Skip the fix loop** — this is report-only verification during /ship +- **Cap at the verification items from the plan** — do not expand into general site QA + +### 4. Gate logic + +- **All verification items PASS:** Continue silently. "Plan verification: PASS." +- **Any FAIL:** Use AskUserQuestion: + - Show the failures with screenshot evidence + - RECOMMENDATION: Choose A if failures indicate broken functionality. Choose B if cosmetic only. + - Options: + A) Fix the failures before shipping (recommended for functional issues) + B) Ship anyway — known issues (acceptable for cosmetic issues) +- **No verification section / no server / unreadable skill:** Skip (non-blocking). + +### 5. Include in PR body + +Add a \`## Verification Results\` section to the PR body (Step 8): +- If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) +- If skipped: reason for skipping (no plan, no server, no verification section)`; +} diff --git a/scripts/resolvers/testing.ts b/scripts/resolvers/testing.ts index 4ede82708..fde799dc0 100644 --- a/scripts/resolvers/testing.ts +++ b/scripts/resolvers/testing.ts @@ -454,7 +454,40 @@ find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec \`\`\` For PR body: \`Tests: {before} → {after} (+{delta} new)\` -Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\``); +Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\` + +**7. Coverage gate:** + +Before proceeding, check CLAUDE.md for a \`## Test Coverage\` section with \`Minimum:\` and \`Target:\` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%. + +Using the coverage percentage from the diagram in substep 4 (the \`COVERAGE: X/Y (Z%)\` line): + +- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue. +- **>= minimum, < target:** Use AskUserQuestion: + - "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%." + - RECOMMENDATION: Choose A because untested code paths are where production bugs hide. + - Options: + A) Generate more tests for remaining gaps (recommended) + B) Ship anyway — I accept the coverage risk + C) These paths don't need tests — mark as intentionally uncovered + - If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total. + - If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk." + - If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered." + +- **< minimum:** Use AskUserQuestion: + - "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%." + - RECOMMENDATION: Choose A because less than {minimum}% means more code is untested than tested. + - Options: + A) Generate tests for remaining gaps (recommended) + B) Override — ship with low coverage (I understand the risk) + - If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again. + - If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%." + +**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block. + +**Test-only diffs:** Skip the gate (same as the existing fast-path). + +**100% coverage:** "Coverage gate: PASS (100%)." Continue.`); // ── Test plan artifact (ship mode) ── sections.push(` @@ -504,7 +537,22 @@ If test framework is detected and gaps were identified: If no test framework detected → include gaps as INFORMATIONAL findings only, no generation. -**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit."`); +**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit." + +### Coverage Warning + +After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a \`## Test Coverage\` section with a \`Minimum:\` field. If not found, use default: 60%. + +If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings: + +\`\`\` +⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested. +Consider writing tests before running /ship. +\`\`\` + +This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate. + +If coverage percentage cannot be determined, skip the warning silently.`); } return sections.join('\n'); diff --git a/ship/SKILL.md b/ship/SKILL.md index 16d0e4b37..07d37eadf 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -283,6 +283,9 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Pre-landing review finds ASK items that need user judgment - MINOR or MAJOR version bump needed (ask — see Step 4) - Greptile review comments that need user decision (complex fixes, false positives) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) +- Plan items NOT DONE with no user override (see Step 3.45) +- Plan verification failures (see Step 3.47) - TODOS.md missing and user wants to create one (ask — see Step 5.5) - TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) @@ -294,7 +297,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Multi-file changesets (auto-split into bisectable commits) - TODOS.md completed-item detection (auto-mark) - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) -- Test coverage gaps (auto-generate and commit, or flag in PR body) +- Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) --- @@ -953,6 +956,39 @@ find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec For PR body: `Tests: {before} → {after} (+{delta} new)` Coverage line: `Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.` +**7. Coverage gate:** + +Before proceeding, check CLAUDE.md for a `## Test Coverage` section with `Minimum:` and `Target:` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%. + +Using the coverage percentage from the diagram in substep 4 (the `COVERAGE: X/Y (Z%)` line): + +- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue. +- **>= minimum, < target:** Use AskUserQuestion: + - "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%." + - RECOMMENDATION: Choose A because untested code paths are where production bugs hide. + - Options: + A) Generate more tests for remaining gaps (recommended) + B) Ship anyway — I accept the coverage risk + C) These paths don't need tests — mark as intentionally uncovered + - If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total. + - If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk." + - If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered." + +- **< minimum:** Use AskUserQuestion: + - "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%." + - RECOMMENDATION: Choose A because less than {minimum}% means more code is untested than tested. + - Options: + A) Generate tests for remaining gaps (recommended) + B) Override — ship with low coverage (I understand the risk) + - If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again. + - If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%." + +**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block. + +**Test-only diffs:** Skip the gate (same as the existing fast-path). + +**100% coverage:** "Coverage gate: PASS (100%)." Continue. + ### Test Plan Artifact After producing the coverage diagram, write a test plan artifact so `/qa` and `/qa-only` can consume it: @@ -986,6 +1022,181 @@ Repo: {owner/repo} --- +## Step 3.45: Plan Completion Audit + +### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like `~/.claude/plans/*.md` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +```bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +``` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping." + +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** `- [ ] ...` or `- [x] ...` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (`## Context`, `## Background`, `## Problem`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (`## GSTACK REVIEW REPORT`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS + +### Cross-Reference Against Diff + +Run `git diff origin/...HEAD` and `git log origin/..HEAD --oneline` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed. + +### Output Format + +``` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +``` + +### Gate Logic + +After producing the completion checklist: + +- **All DONE or CHANGED:** Pass. "Plan completion: PASS — all items addressed." Continue. +- **Only PARTIAL items (no NOT DONE):** Continue with a note in the PR body. Not blocking. +- **Any NOT DONE items:** Use AskUserQuestion: + - Show the completion checklist above + - "{N} items from the plan are NOT DONE. These were part of the original plan but are missing from the implementation." + - RECOMMENDATION: depends on item count and severity. If 1-2 minor items (docs, config), recommend B. If core functionality is missing, recommend A. + - Options: + A) Stop — implement the missing items before shipping + B) Ship anyway — defer these to a follow-up (will create P1 TODOs in Step 5.5) + C) These items were intentionally dropped — remove from scope + - If A: STOP. List the missing items for the user to implement. + - If B: Continue. For each NOT DONE item, create a P1 TODO in Step 5.5 with "Deferred from plan: {plan file path}". + - If C: Continue. Note in PR body: "Plan items intentionally dropped: {list}." + +**No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." + +**Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. + +--- + +## Step 3.47: Plan Verification + +Automatically verify the plan's testing/verification steps using the `/qa-only` skill. + +### 1. Check for verification section + +Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). + +**If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." +**If no plan file was found in Step 3.45:** Skip (already handled). + +### 2. Check for running dev server + +Before invoking browse-based verification, check if a dev server is reachable: + +```bash +curl -s -o /dev/null -w '%{http_code}' http://localhost:3000 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:8080 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:5173 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:4000 2>/dev/null || echo "NO_SERVER" +``` + +**If NO_SERVER:** Skip with "No dev server detected — skipping plan verification. Run /qa separately after deploying." + +### 3. Invoke /qa-only inline + +Read the `/qa-only` skill from disk: + +```bash +cat ${CLAUDE_SKILL_DIR}/../qa-only/SKILL.md +``` + +**If unreadable:** Skip with "Could not load /qa-only — skipping plan verification." + +Follow the /qa-only workflow with these modifications: +- **Skip the preamble** (already handled by /ship) +- **Use the plan's verification section as the primary test input** — treat each verification item as a test case +- **Use the detected dev server URL** as the base URL +- **Skip the fix loop** — this is report-only verification during /ship +- **Cap at the verification items from the plan** — do not expand into general site QA + +### 4. Gate logic + +- **All verification items PASS:** Continue silently. "Plan verification: PASS." +- **Any FAIL:** Use AskUserQuestion: + - Show the failures with screenshot evidence + - RECOMMENDATION: Choose A if failures indicate broken functionality. Choose B if cosmetic only. + - Options: + A) Fix the failures before shipping (recommended for functional issues) + B) Ship anyway — known issues (acceptable for cosmetic issues) +- **No verification section / no server / unreadable skill:** Skip (non-blocking). + +### 5. Include in PR body + +Add a `## Verification Results` section to the PR body (Step 8): +- If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) +- If skipped: reason for skipping (no plan, no server, no verification section) + +--- + ## Step 3.5: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1448,6 +1659,16 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' +## Plan Completion + + + + +## Verification Results + + + + ## TODOS @@ -1488,6 +1709,32 @@ doc updates — the user runs `/ship` and documentation stays current without a --- +## Step 8.75: Persist ship metrics + +Log coverage and plan completion data so `/retro` can track trends: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +``` + +Append to `~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl`: + +```bash +echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage_pct":COVERAGE_PCT,"plan_items_total":PLAN_TOTAL,"plan_items_done":PLAN_DONE,"verification_result":"VERIFY_RESULT","version":"VERSION","branch":"BRANCH"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute from earlier steps: +- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **VERSION**: from the VERSION file +- **BRANCH**: current branch name + +This step is automatic — never skip it, never ask for confirmation. + +--- + ## Important Rules - **Never skip tests.** If tests fail, stop. diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index ce859cf37..7f82c64dd 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -32,6 +32,9 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Pre-landing review finds ASK items that need user judgment - MINOR or MAJOR version bump needed (ask — see Step 4) - Greptile review comments that need user decision (complex fixes, false positives) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) +- Plan items NOT DONE with no user override (see Step 3.45) +- Plan verification failures (see Step 3.47) - TODOS.md missing and user wants to create one (ask — see Step 5.5) - TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) @@ -43,7 +46,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Multi-file changesets (auto-split into bisectable commits) - TODOS.md completed-item detection (auto-mark) - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) -- Test coverage gaps (auto-generate and commit, or flag in PR body) +- Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) --- @@ -225,6 +228,16 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- +## Step 3.45: Plan Completion Audit + +{{PLAN_COMPLETION_AUDIT_SHIP}} + +--- + +{{PLAN_VERIFICATION_EXEC}} + +--- + ## Step 3.5: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -500,6 +513,16 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' +## Plan Completion + + + + +## Verification Results + + + + ## TODOS @@ -540,6 +563,32 @@ doc updates — the user runs `/ship` and documentation stays current without a --- +## Step 8.75: Persist ship metrics + +Log coverage and plan completion data so `/retro` can track trends: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +``` + +Append to `~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl`: + +```bash +echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage_pct":COVERAGE_PCT,"plan_items_total":PLAN_TOTAL,"plan_items_done":PLAN_DONE,"verification_result":"VERIFY_RESULT","version":"VERSION","branch":"BRANCH"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute from earlier steps: +- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **VERSION**: from the VERSION file +- **BRANCH**: current branch name + +This step is automatic — never skip it, never ask for confirmation. + +--- + ## Important Rules - **Never skip tests.** If tests fail, stop. diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index d0da767a8..d8a071a1f 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -677,6 +677,168 @@ describe('PLAN_FILE_REVIEW_REPORT resolver', () => { }); }); +// --- {{PLAN_COMPLETION_AUDIT}} resolver tests --- + +describe('PLAN_COMPLETION_AUDIT placeholders', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains plan completion audit step', () => { + expect(shipSkill).toContain('Plan Completion Audit'); + expect(shipSkill).toContain('Step 3.45'); + }); + + test('review SKILL.md contains plan completion in scope drift', () => { + expect(reviewSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Actionable Item Extraction'); + expect(reviewSkill).toContain('Integration with Scope Drift Detection'); + }); + + test('both modes share plan file discovery methodology', () => { + expect(shipSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Plan File Discovery'); + // Both should have conversation context first + expect(shipSkill).toContain('Conversation context (primary)'); + expect(reviewSkill).toContain('Conversation context (primary)'); + // Both should have grep fallback + expect(shipSkill).toContain('Content-based search (fallback)'); + expect(reviewSkill).toContain('Content-based search (fallback)'); + }); + + test('ship mode has gate logic for NOT DONE items', () => { + expect(shipSkill).toContain('NOT DONE'); + expect(shipSkill).toContain('Stop — implement the missing items'); + expect(shipSkill).toContain('Ship anyway — defer'); + expect(shipSkill).toContain('intentionally dropped'); + }); + + test('review mode is INFORMATIONAL only', () => { + expect(reviewSkill).toContain('INFORMATIONAL'); + expect(reviewSkill).toContain('MISSING REQUIREMENTS'); + expect(reviewSkill).toContain('SCOPE CREEP'); + }); + + test('item extraction has 50-item cap', () => { + expect(shipSkill).toContain('at most 50 items'); + }); + + test('uses file-level traceability (not commit-level)', () => { + expect(shipSkill).toContain('Cite the specific file'); + expect(shipSkill).not.toContain('commit-level traceability'); + }); +}); + +// --- {{PLAN_VERIFICATION_EXEC}} resolver tests --- + +describe('PLAN_VERIFICATION_EXEC placeholder', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains plan verification step', () => { + expect(shipSkill).toContain('Step 3.47'); + expect(shipSkill).toContain('Plan Verification'); + }); + + test('references /qa-only invocation', () => { + expect(shipSkill).toContain('qa-only/SKILL.md'); + expect(shipSkill).toContain('qa-only'); + }); + + test('contains localhost reachability check', () => { + expect(shipSkill).toContain('localhost:3000'); + expect(shipSkill).toContain('NO_SERVER'); + }); + + test('skips gracefully when no verification section', () => { + expect(shipSkill).toContain('No verification steps found in plan'); + }); + + test('skips gracefully when no dev server', () => { + expect(shipSkill).toContain('No dev server detected'); + }); +}); + +// --- Coverage gate tests --- + +describe('Coverage gate in ship', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains coverage gate with thresholds', () => { + expect(shipSkill).toContain('Coverage gate'); + expect(shipSkill).toContain('>= target'); + expect(shipSkill).toContain('< minimum'); + }); + + test('ship SKILL.md supports configurable thresholds via CLAUDE.md', () => { + expect(shipSkill).toContain('## Test Coverage'); + expect(shipSkill).toContain('Minimum:'); + expect(shipSkill).toContain('Target:'); + }); + + test('coverage gate skips on parse failure (not block)', () => { + expect(shipSkill).toContain('could not determine percentage — skipping'); + }); + + test('review SKILL.md contains coverage WARNING', () => { + expect(reviewSkill).toContain('COVERAGE WARNING'); + expect(reviewSkill).toContain('Consider writing tests before running /ship'); + }); + + test('review coverage warning is INFORMATIONAL', () => { + expect(reviewSkill).toContain('INFORMATIONAL'); + }); +}); + +// --- Ship metrics logging --- + +describe('Ship metrics logging', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains metrics persistence step', () => { + expect(shipSkill).toContain('Step 8.75'); + expect(shipSkill).toContain('coverage_pct'); + expect(shipSkill).toContain('plan_items_total'); + expect(shipSkill).toContain('plan_items_done'); + expect(shipSkill).toContain('verification_result'); + }); +}); + +// --- Plan file discovery shared helper --- + +describe('Plan file discovery shared helper', () => { + // The shared helper should appear in ship (via PLAN_COMPLETION_AUDIT_SHIP) + // and in review (via PLAN_COMPLETION_AUDIT_REVIEW) + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('plan file discovery appears in both ship and review', () => { + expect(shipSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Plan File Discovery'); + }); + + test('both include conversation context first', () => { + expect(shipSkill).toContain('Conversation context (primary)'); + expect(reviewSkill).toContain('Conversation context (primary)'); + }); + + test('both include content-based fallback', () => { + expect(shipSkill).toContain('Content-based search (fallback)'); + expect(reviewSkill).toContain('Content-based search (fallback)'); + }); +}); + +// --- Retro plan completion --- + +describe('Retro plan completion section', () => { + const retroSkill = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8'); + + test('retro SKILL.md contains plan completion section', () => { + expect(retroSkill).toContain('### Plan Completion'); + expect(retroSkill).toContain('plan_items_total'); + expect(retroSkill).toContain('Plan Completion This Period'); + }); +}); + // --- Plan status footer in preamble --- describe('Plan status footer in preamble', () => { diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 931bcda80..a9b769e7a 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -110,12 +110,17 @@ export const E2E_TOUCHFILES: Record = { 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'], - // Coverage audit (shared fixture) + triage + // Coverage audit (shared fixture) + triage + gates 'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'], 'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'], 'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'], 'ship-triage': ['ship/**', 'bin/gstack-repo-mode'], + // Plan completion audit + verification + 'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'], + 'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'], + 'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'], + // Design 'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts'], 'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],