From 723228fc4e8ed584bb3c7bd5c8706b114844818a Mon Sep 17 00:00:00 2001 From: James Grugett Date: Wed, 8 Apr 2026 18:54:14 -0700 Subject: [PATCH 1/9] Add bunfig: preloads .env vars --- bunfig.toml | 4 ++++ docs/testing.md | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 bunfig.toml diff --git a/bunfig.toml b/bunfig.toml new file mode 100644 index 0000000..1d64e9c --- /dev/null +++ b/bunfig.toml @@ -0,0 +1,4 @@ +preload = ["./src/load-env.ts"] + +[test] +preload = ["./src/load-env.ts"] diff --git a/docs/testing.md b/docs/testing.md index fa730de..08a7744 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -17,6 +17,8 @@ bun run test:e2e # E2E tests only Fresh workspaces (e.g., carved eval repos) may not have dependencies installed. Always run `bun install` or `bash setup.sh` before expecting `bun run typecheck` or `bun test` to succeed. A task is not complete until both commands pass after dependencies are installed. +Local developer credentials can live in `.env.local`. `bunfig.toml` preloads `src/load-env.ts`, so direct Bun invocations such as `bun test src/__tests__/docs-writer.e2e.test.ts` and `bun run src/run-evalbuff.ts ...` automatically read `.env.local` first, then `.env`, without requiring wrapper scripts. + ## Test File Layout - Unit tests: `src/__tests__/.test.ts` @@ -120,7 +122,7 @@ const SKIP = !process.env.OPENAI_API_KEY || !(process.env.CLAUDE_CODE_KEY || pro it.skipIf(SKIP)('full pipeline', async () => { ... }) ``` -Build a temp git repo with at least 2 distinct feature areas and lightweight repo-local tests. Assert the real artifact structure from `docs/run-artifacts.md`: `plan.json`, `features.json`, `round-0/`, subsequent rounds, `baseline-rejudge-loop-N/`, loop artifacts, `summary.json`, `report.md`, and `git worktree list` cleanup. Any change to artifact persistence contracts must be verified by updating these assertions. +Build a temp git repo with at least 2 distinct feature areas and lightweight repo-local tests. 
Assert the real artifact structure from `docs/run-artifacts.md`: `plan.json`, `features.json`, `round-0/`, subsequent rounds, `baseline-rejudge-loop-N/`, loop artifacts including `doc-gates-loop-N.json`, `summary.json`, `report.md`, and `git worktree list` cleanup. Any change to artifact persistence contracts must be verified by updating these assertions. ## Preserving Failed E2E Runs From 63926e0ab44fa029460a9c8c6501b4dd93c70abd Mon Sep 17 00:00:00 2001 From: James Grugett Date: Wed, 8 Apr 2026 18:57:21 -0700 Subject: [PATCH 2/9] Gate docs changes per task --- docs/architecture.md | 33 +- docs/cli.md | 8 +- docs/run-artifacts.md | 21 +- src/__tests__/docs-writer.e2e.test.ts | 237 +++++++++ src/__tests__/docs-writer.test.ts | 89 ++++ src/__tests__/eval-runner.test.ts | 98 +++- src/__tests__/load-env.test.ts | 35 ++ src/__tests__/run-evalbuff.e2e.test.ts | 3 + src/__tests__/run-evalbuff.test.ts | 231 +++++++++ src/cli.ts | 10 +- src/docs-writer.ts | 466 ++++++++++++++--- src/eval-runner.ts | 75 ++- src/load-env.ts | 80 +++ src/report.ts | 107 +++- src/run-evalbuff.ts | 664 +++++++++++++++++++++---- src/tui/app.tsx | 2 +- src/tui/data.ts | 2 + src/tui/main.tsx | 4 +- 18 files changed, 1944 insertions(+), 221 deletions(-) create mode 100644 src/__tests__/docs-writer.e2e.test.ts create mode 100644 src/__tests__/docs-writer.test.ts create mode 100644 src/__tests__/load-env.test.ts create mode 100644 src/__tests__/run-evalbuff.test.ts create mode 100644 src/load-env.ts diff --git a/docs/architecture.md b/docs/architecture.md index 79504a7..08c647e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -2,13 +2,13 @@ ## Pipeline Overview -Evalbuff follows a plan → carve → evaluate → refactor loop: +Evalbuff follows a plan → carve → baseline → gated-improvement loop: 1. **Plan** — `planFeatures()` in `src/carve-features.ts` uses a Codex agent to scan the target repo and identify 15–25 discrete features that can be cleanly removed. 2. 
**Carve** — `carveFeature()` creates an isolated git worktree, runs a Codex agent to remove the feature, and captures the resulting diff and file operations. -3. **Evaluate** — `runAgentOnCarve()` in `src/eval-runner.ts` clones the repo, applies the carve, copies current docs, runs a coding agent to rebuild the feature, then hands the result to `judgeTaskResult()` in `src/judge.ts`. -4. **Write docs** — `runDocsWriterAgent()` in `src/docs-writer.ts` collects judge suggestions and runs a Claude agent in a temp clone to edit `docs/`, `AGENTS.md`, and `CLAUDE.md`. -5. **Repeat** — Steps 3–4 loop N times. Each loop also re-judges the baseline diffs with current docs to separate judge recalibration from real agent improvement. +3. **Baseline** — `runAgentOnCarve()` in `src/eval-runner.ts` clones the repo, applies the carve, copies current docs, runs a coding agent to rebuild the feature, then hands the result to `judgeTaskResult()` in `src/judge.ts`. +4. **Gate docs changes** — during each improvement loop, every feature is re-run sequentially. The judge and coding agent both suggest independent docs changes. `planDocsChangesForTask()` in `src/docs-writer.ts` reads the docs once, rejects overfit/low-value suggestions, and creates one independent committed docs candidate per surviving suggestion. Evalbuff then materializes each candidate patch onto the current docs state, re-judges the originating task, and optionally re-runs the coding agent before accepting it. +5. **Repeat** — Step 4 loops N times. Each completed loop also re-judges the baseline diffs with the final loop docs to separate judge recalibration from real agent improvement. 
## Key Modules @@ -19,7 +19,7 @@ Evalbuff follows a plan → carve → evaluate → refactor loop: | `src/eval-helpers.ts` | Git/docs utilities — carve ops, docs sync, diff capture, ground-truth computation | | `src/carve-features.ts` | Feature identification and extraction via Codex agents in git worktrees | | `src/judge.ts` | Codex-based reviewer that scores agent output with E2E testing | -| `src/docs-writer.ts` | Holistic docs editing agent + judge suggestion collector | +| `src/docs-writer.ts` | Coding-agent suggestion parsing + per-task docs-change planning/materialization | | `src/perfect-feature.ts` | Single-feature iterative optimizer (rebuild → judge → diagnose → update docs) | | `src/report.ts` | Persists round results and generates `summary.json` + `report.md` | | `src/trace-compressor.ts` | Extracts large tool outputs from traces into content-addressed sidecar files | @@ -36,14 +36,13 @@ Target repo ↓ carveFeature() → CarvedFeature[] ↓ [saved as features.json] ↓ - ↓ For each round: - ↓ runAgentOnCarve() → TaskResult (per feature, in parallel) + ↓ Baseline round: + ↓ runAgentOnCarve() → TaskResult (per feature, sequentially) ↓ saveRoundResults() → round-N/ directory ↓ ↓ For each improvement loop: - ↓ collectDocSuggestions() → text - ↓ runDocsWriterAgent() → edits docs in target repo ↓ runEvalRound() → new scores + ↓ gateDocsChangesForTask() → per-feature accepted/rejected doc candidates ↓ runBaselineRejudgeRound() → re-scored baseline ↓ ↓ saveSummary() → summary.json + report.md @@ -67,14 +66,14 @@ Most workflows (eval, docs writer, judging) operate in temporary clones, not the ### Docs Refactor Pattern -`runDocsWriterAgent()` in `src/docs-writer.ts` builds a holistic prompt, not a task-specific checklist. The prompt tells the agent to: -1. Read all current docs (`docs/`, `AGENTS.md`, `CLAUDE.md`). -2. Generalize judge feedback into reusable project patterns — avoid feature-specific examples. -3. 
Verify every referenced symbol/path with grep before documenting it. -4. Restrict `AGENTS.md` changes to doc-index maintenance or factual corrections. -5. Sync docs back only after a successful run. +`planDocsChangesForTask()` in `src/docs-writer.ts` does one planning pass per feature. The prompt tells the agent to: +1. Read all current docs (`docs/`, `AGENTS.md`, `CLAUDE.md`) once. +2. Reject suggestions that are overfit, already covered, low-priority, or not grounded in the current code. +3. For each surviving suggestion, create one independent docs-only commit on its own branch from the same baseline docs commit. +4. Reset back to the baseline docs commit before preparing the next candidate so branches stay independent. +5. Write a manifest explaining which suggestions were accepted or rejected and why. -When building similar doc-editing agents, follow the same holistic approach: read first, generalize, verify, then write. +When building similar doc-editing agents, favor one read-heavy planning pass that emits independently replayable doc changes instead of rereading the full docs corpus for every candidate. ## Orchestration Patterns @@ -96,7 +95,7 @@ When modifying the orchestration (new `EvalbuffOptions` fields, new phases, new ## Concurrency -Eval rounds use bounded concurrency: `opts.parallelism` workers pull from a shared queue. Each worker runs a full clone → carve → agent → judge cycle independently. +Carving still uses bounded concurrency (`opts.parallelism` workers pull from a shared queue), but eval rounds are intentionally sequential. That ordering matters because accepted docs changes from one feature should affect the very next feature in the same loop. 
## Events and TUI diff --git a/docs/cli.md b/docs/cli.md index 37055d5..04967b8 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -3,18 +3,18 @@ ## Main Pipeline ```bash -bun run src/run-evalbuff.ts \ + bun run src/run-evalbuff.ts \ --repo /path/to/repo \ [--n 20] \ - [--parallelism 10] \ - [--loops 3] \ + [--parallelism 1] \ + [--loops 1] \ [--init-command "npm install"] \ [--coding-model sonnet] \ [--docs-model opus] \ [--cached-features /path/to/features.json] ``` -All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. +All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. Improvement loops now run features sequentially and gate docs changes one candidate at a time; `--parallelism` still applies to carving/setup concurrency, not the per-loop feature order. 
## Perfect Feature (Single-Feature Optimizer) diff --git a/docs/run-artifacts.md b/docs/run-artifacts.md index 7040563..cfd0c1d 100644 --- a/docs/run-artifacts.md +++ b/docs/run-artifacts.md @@ -16,6 +16,7 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ │ ├── trace.txt.sidecars/ # Extracted large payloads + manifest.json │ ├── diff.txt # Agent's unified diff │ ├── judging.json # Full JudgingResult from the judge agent +│ ├── agent-suggestions.json # Coding agent doc/project suggestions │ └── score.txt # Single number (overallScore) │ ├── round-1/ # Loop 1 re-eval (same structure as round-0) @@ -27,9 +28,10 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ │ ├── judging.json # Re-judged result (trace/diff not re-persisted) │ └── score.txt │ -├── judge-suggestions-loop-1.txt # Raw judge suggestions fed to docs writer -├── docs-diff-loop-1.txt # Before/after diff of docs for loop 1 -├── docs-state-loop-1.json # Snapshot of all docs after loop 1 +├── judge-suggestions-loop-1.txt # Human-readable summary of accepted/rejected/overfit-skipped doc candidates +├── doc-gates-loop-1.json # Detailed per-candidate gate results for loop 1, including overfit and low-priority rejections +├── docs-diff-loop-1.txt # Before/after diff of docs for loop 1 +├── docs-state-loop-1.json # Snapshot of all docs after loop 1 │ ├── summary.json # EvalSummary — the top-level run summary └── report.md # Human-readable markdown report @@ -43,6 +45,8 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ - `rounds[]` — `{ round, avgScore, scores: Record, totalCost }` - `totalCost`, `scoreProgression: number[]` - `baselineRejudgeProgression?: number[]` +- `consideredDocChangesByLoop?: number[]` +- `acceptedDocChangesByLoop?: number[]` **`round-N/summary.json`** (per-round): - `round`, `avgScore`, `totalCost` @@ -56,9 +60,10 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ ## Loop Artifact Timing -Loop artifacts (`judge-suggestions-loop-N.txt`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json`) are written at the 
**log-dir root** during the docs-writer step, **before** the corresponding `round-N/` directory is created by `saveRoundResults()`. This means: +Loop artifacts (`judge-suggestions-loop-N.txt`, `doc-gates-loop-N.json`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json`) are written at the **log-dir root** after the sequential doc-gating pass, **before** the corresponding `round-N/` directory is created by `saveRoundResults()`. This means: -- `judge-suggestions-loop-N.txt` is written only when suggestions exist. +- `judge-suggestions-loop-N.txt` should exist for every completed loop, even if it is empty. +- `doc-gates-loop-N.json` contains every considered docs candidate for the loop, including accepted/rejected status, overfit/low-priority filtering, and rejudge/rerun scores when applicable. - `docs-diff-loop-N.txt` must always exist after the docs-writer step — empty string when nothing changed. - `docs-state-loop-N.json` must always exist — contains the `getDocsSnapshot(repoPath)` result after refactoring. @@ -77,7 +82,7 @@ The TUI data loader in `src/tui/data.ts` exports `loadLogDir(logDir)` returning - `report.md` → `string` (empty string if missing; powers summary/report screens) - `round-N/` directories → per-round feature data (scanned sequentially from 0, stops at first gap) - `baseline-rejudge-loop-N/` directories → re-judged baseline data -- Root-level loop artifacts: `judge-suggestions-loop-N.txt`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json` +- Root-level loop artifacts: `judge-suggestions-loop-N.txt`, `doc-gates-loop-N.json`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json` Per-feature task data comes from child directories (`round-N//score.txt`, `judging.json`, `diff.txt`, `trace.txt`) — not reconstructed from `summary.json`. Missing singular artifacts return `null`, missing collections return `[]`. Loaders must prefer per-feature files over round summaries so partial runs render progressively. 
@@ -101,7 +106,7 @@ Live/watch UIs must refresh on **any** progressive artifact change, not only whe - `score.txt` or `judging.json` is written inside an existing round - `round-N/summary.json` is written - Top-level `summary.json` is written -- Root-level loop artifacts appear (before their corresponding round directory) +- Root-level loop artifacts appear (before their corresponding round directory), including `doc-gates-loop-N.json` Cumulative metrics like `totalCost` from `round_complete` events are **run totals**, never per-round deltas. The UI must never let displayed cumulative values decrease between rounds. @@ -123,4 +128,4 @@ Both values must appear as visible labeled lines in the report. Per-round detail **Trace compression**: `saveRoundResults()` writes raw `trace.txt` first, then kicks off async `compressAndSave(tracePath, trace)` so `trace.txt.compressed` and `trace.txt.sidecars/manifest.json` appear later without blocking the round. Compression failures emit a warning but must not fail the round. -Always-present sections: Overview table, Score Trajectory, Scores by Round, per-round feature detail (score breakdown, analysis, strengths, weaknesses, E2E tests, docs read, doc suggestions, cost). Optional sections: Baseline Rejudge Trajectory, Baseline Scored by Each Loop's Docs, Judge Suggestions Applied, Docs Changes (per loop), and Final Documentation State. +Always-present sections: Overview table, Score Trajectory, Scores by Round, per-round feature detail (score breakdown, analysis, strengths, weaknesses, E2E tests, docs read, doc suggestions, cost). Optional sections: Baseline Rejudge Trajectory, Baseline Scored by Each Loop's Docs, Doc Gate Summary, Per-Candidate Doc Gates, Docs Changes (per loop), and Final Documentation State. 
diff --git a/src/__tests__/docs-writer.e2e.test.ts b/src/__tests__/docs-writer.e2e.test.ts new file mode 100644 index 0000000..961270c --- /dev/null +++ b/src/__tests__/docs-writer.e2e.test.ts @@ -0,0 +1,237 @@ +/** + * E2E test for the docs writer. + * + * Creates a small repo with docs and config-driven CLI code, then runs the + * real docs writer agent to plan independent documentation changes from a set + * of suggestions. Verifies that broadly useful guidance is accepted as a + * reusable patch and that an obviously task-specific suggestion is rejected. + * + * Requires CLAUDE_CODE_KEY or ANTHROPIC_API_KEY. + * + * Run: bun test src/__tests__/docs-writer.e2e.test.ts + */ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { describe, expect, it } from 'bun:test' + +import { + cleanupDraftedDocsChange, + cleanupPlannedDocsTaskResult, + materializeDocsChangeFromPatch, + planDocsChangesForTask, +} from '../docs-writer' + +import type { IndependentSuggestion } from '../docs-writer' + +const SKIP = !(process.env.CLAUDE_CODE_KEY || process.env.ANTHROPIC_API_KEY) + +const GIT_ENV = { + ...process.env, + GIT_AUTHOR_NAME: 'test', + GIT_AUTHOR_EMAIL: 'test@test.com', + GIT_COMMITTER_NAME: 'test', + GIT_COMMITTER_EMAIL: 'test@test.com', +} + +const FILES: Record = { + 'package.json': JSON.stringify( + { + name: 'evalbuff-docs-writer-test-project', + version: '1.0.0', + type: 'module', + }, + null, + 2, + ), + 'tsconfig.json': JSON.stringify( + { + compilerOptions: { + target: 'ES2020', + module: 'ESNext', + moduleResolution: 'bundler', + strict: true, + noEmit: true, + }, + include: ['src'], + }, + null, + 2, + ), + 'AGENTS.md': `# Test Repo + +## Docs Index + +- docs/architecture.md +- docs/testing.md +`, + 'docs/architecture.md': `# Architecture + +This repo exposes a small CLI. +Configuration is read from source files under src/. +`, + 'docs/testing.md': `# Testing + +Run \`bun test\` for unit tests. 
+`, + 'src/config.ts': `export interface AppConfig { + mode: 'dev' | 'test' + cacheDir: string +} + +export function loadConfig( + env: Record = process.env, +): AppConfig { + return { + mode: env.APP_MODE === 'test' ? 'test' : 'dev', + cacheDir: env.APP_CACHE_DIR || '.cache/app', + } +} +`, + 'src/cli.ts': `import { loadConfig } from './config' + +export function runCli( + args: string[], + env: Record = process.env, +): string { + const config = loadConfig(env) + if (args.includes('--print-config')) { + return \`\${config.mode}:\${config.cacheDir}\` + } + return 'ok' +} +`, + 'src/cli.test.ts': `import { runCli } from './cli' + +function assert(cond: boolean, msg: string) { + if (!cond) throw new Error(msg) +} + +assert(runCli(['--print-config'], { + APP_MODE: 'test', + APP_CACHE_DIR: '/tmp/evalbuff-docs-writer-test', +}) === 'test:/tmp/evalbuff-docs-writer-test', 'print-config uses env-driven config') +`, +} + +function createTestRepo(): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-writer-e2e-')) + execSync('git init', { cwd: dir, stdio: 'ignore', env: GIT_ENV }) + + for (const [relativePath, content] of Object.entries(FILES)) { + const fullPath = path.join(dir, relativePath) + fs.mkdirSync(path.dirname(fullPath), { recursive: true }) + fs.writeFileSync(fullPath, content) + } + + execSync('git add -A && git commit -m "Initial commit"', { + cwd: dir, + stdio: 'ignore', + env: GIT_ENV, + }) + + return dir +} + +describe('docs writer e2e', () => { + it.skipIf(SKIP)( + 'plans reusable docs changes and rejects an overfit suggestion with the real agent', + async () => { + const repoDir = createTestRepo() + let completed = false + let plannedResult: Awaited> | null = null + let materializedDraft: ReturnType | null = null + + try { + console.log(`Docs writer test repo: ${repoDir}`) + + const suggestions: IndependentSuggestion[] = [ + { + source: 'judge', + priority: 85, + text: 'Update docs/testing.md to explain that tests and CLI flows 
touching src/config.ts should set APP_MODE=test and APP_CACHE_DIR explicitly, because loadConfig() reads both values from environment variables and src/cli.ts uses that config at runtime. This is a reusable setup rule for any config-sensitive task.', + }, + { + source: 'agent', + priority: 80, + text: 'Create a docs file that explains how to rebuild the --print-config branch in src/cli.ts line-by-line, including the exact output formatting and argument order for this one task.', + }, + ] + + plannedResult = await planDocsChangesForTask(repoDir, suggestions, 'sonnet') + expect(plannedResult).toBeDefined() + if (!plannedResult) { + throw new Error('docs writer did not return a plan') + } + + expect(plannedResult.candidates.length).toBeGreaterThanOrEqual(2) + + const accepted = plannedResult.candidates.find((candidate) => + candidate.text.includes('APP_MODE=test and APP_CACHE_DIR explicitly'), + ) + expect(accepted).toBeDefined() + if (!accepted) { + throw new Error('missing accepted candidate') + } + + expect(accepted.accepted).toBe(true) + expect(accepted.overfit).toBe(false) + expect(accepted.branchName).toBeString() + expect(accepted.commitSha).toBeString() + expect(accepted.patchText).toContain('docs/') + expect(accepted.diffText).toContain('APP_MODE') + + materializedDraft = materializeDocsChangeFromPatch(repoDir, accepted.patchText || '') + expect(materializedDraft).toBeDefined() + if (!materializedDraft) { + throw new Error('failed to materialize accepted docs patch') + } + expect(materializedDraft.diffText).toContain('APP_CACHE_DIR') + + const rejected = plannedResult.candidates.find((candidate) => + candidate.text.includes('line-by-line'), + ) + expect(rejected).toBeDefined() + if (!rejected) { + throw new Error('missing rejected candidate') + } + + expect(rejected.accepted).toBe(false) + expect(rejected.overfit || rejected.reason.toLowerCase().includes('overfit')).toBe(true) + expect(rejected.branchName).toBeUndefined() + 
expect(rejected.commitSha).toBeUndefined() + + const status = execSync('git status --short', { + cwd: repoDir, + encoding: 'utf-8', + }).trim() + expect(status).toBe('') + + const worktrees = execSync('git worktree list', { + cwd: repoDir, + encoding: 'utf-8', + }).trim().split('\n') + expect(worktrees.length).toBe(1) + + completed = true + } finally { + if (completed) { + if (materializedDraft) cleanupDraftedDocsChange(materializedDraft) + if (plannedResult) cleanupPlannedDocsTaskResult(plannedResult) + fs.rmSync(repoDir, { recursive: true, force: true }) + } else { + console.log(`Preserving docs writer test repo for debugging: ${repoDir}`) + if (plannedResult) { + console.log(`Preserving docs writer temp clone: ${plannedResult.tempDir}`) + } + if (materializedDraft) { + console.log(`Preserving materialized docs clone: ${materializedDraft.tempDir}`) + } + } + } + }, + 30 * 60_000, + ) +}) diff --git a/src/__tests__/docs-writer.test.ts b/src/__tests__/docs-writer.test.ts new file mode 100644 index 0000000..fadacc3 --- /dev/null +++ b/src/__tests__/docs-writer.test.ts @@ -0,0 +1,89 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, describe, expect, it } from 'bun:test' + +import { + CODING_AGENT_SUGGESTIONS_FILE, + collectTaskDocSuggestions, + filterDocSuggestionsForPlanning, + readCodingAgentSuggestions, +} from '../docs-writer' + +describe('docs-writer helpers', () => { + const tempPaths: string[] = [] + + afterEach(() => { + for (const tempPath of tempPaths.splice(0)) { + try { + fs.rmSync(tempPath, { recursive: true, force: true }) + } catch { + // ignore cleanup failures + } + } + }) + + it('merges judge and coding-agent doc suggestions by text', () => { + const merged = collectTaskDocSuggestions({ + featureId: 'feature-a', + prompt: 'restore feature a', + score: 5, + diff: '', + trace: '', + judging: { + analysis: '', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 5, + codeQualityScore: 
5, + e2eScore: 5, + overallScore: 5, + docSuggestions: [ + { text: 'Document the setup script', priority: 60 }, + { text: 'Describe the test harness', priority: 40 }, + ], + projectSuggestions: [], + }, + costEstimate: 0, + docsRead: [], + agentDocSuggestions: [ + { text: 'Document the setup script', priority: 85 }, + ], + agentProjectSuggestions: [], + }) + + expect(merged).toEqual([ + { text: 'Document the setup script', priority: 85, source: 'judge+agent' }, + { text: 'Describe the test harness', priority: 40, source: 'judge' }, + ]) + }) + + it('reads coding-agent suggestions defensively', () => { + const repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-writer-test-')) + tempPaths.push(repoDir) + + fs.writeFileSync( + path.join(repoDir, CODING_AGENT_SUGGESTIONS_FILE), + JSON.stringify({ + docSuggestions: [{ text: 'Add docs', priority: 70 }], + projectSuggestions: [{ text: 'Add tests', priority: 55 }], + }), + ) + + expect(readCodingAgentSuggestions(repoDir)).toEqual({ + docSuggestions: [{ text: 'Add docs', priority: 70 }], + projectSuggestions: [{ text: 'Add tests', priority: 55 }], + }) + }) + + it('filters low-priority doc suggestions before docs planning', () => { + expect(filterDocSuggestionsForPlanning([ + { text: 'Keep me', priority: 70, source: 'judge' }, + { text: 'Drop me', priority: 25, source: 'agent' }, + ])).toEqual([ + { text: 'Keep me', priority: 70, source: 'judge' }, + ]) + }) +}) diff --git a/src/__tests__/eval-runner.test.ts b/src/__tests__/eval-runner.test.ts index d8a61c5..4f01c64 100644 --- a/src/__tests__/eval-runner.test.ts +++ b/src/__tests__/eval-runner.test.ts @@ -1,10 +1,27 @@ -import { describe, expect, it } from 'bun:test' +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, describe, expect, it } from 'bun:test' import { runAgentOnCarve } from '../eval-runner' import type { CarvedFeature } from '../carve-features' 
describe('runAgentOnCarve', () => { + const tempPaths: string[] = [] + + afterEach(() => { + for (const tempPath of tempPaths.splice(0)) { + try { + fs.rmSync(tempPath, { recursive: true, force: true }) + } catch { + // ignore cleanup failures + } + } + }) + it('returns score -1 for infrastructure failures before the agent ever runs', async () => { const feature: CarvedFeature = { id: 'broken-repo', @@ -31,4 +48,83 @@ describe('runAgentOnCarve', () => { expect(result.trace).toContain('Agent error:') expect(result.diff).toBe('') }) + + it('preserves the runner diff when the agent commits changes during the run', async () => { + const repoPath = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-eval-runner-test-')) + tempPaths.push(repoPath) + + fs.mkdirSync(path.join(repoPath, 'docs'), { recursive: true }) + fs.mkdirSync(path.join(repoPath, 'src'), { recursive: true }) + fs.writeFileSync(path.join(repoPath, 'AGENTS.md'), '# Test repo\n') + fs.writeFileSync(path.join(repoPath, 'docs', 'guide.md'), '# Guide\n') + fs.writeFileSync(path.join(repoPath, 'src', 'feature.ts'), 'export const carved = false\n') + + execSync('git init', { cwd: repoPath, stdio: 'ignore' }) + execSync('git config user.name "Evalbuff Tests"', { cwd: repoPath, stdio: 'ignore' }) + execSync('git config user.email "evalbuff@example.com"', { cwd: repoPath, stdio: 'ignore' }) + execSync('git add -A', { cwd: repoPath, stdio: 'ignore' }) + execSync('git commit -m "initial"', { cwd: repoPath, stdio: 'ignore' }) + + let judgedDiff = '' + const expectedDiff = 'diff --git a/src/feature.ts b/src/feature.ts\n+restored\n' + + const feature: CarvedFeature = { + id: 'committed-feature', + prompt: 'Restore the feature', + description: 'A feature used to verify diff preservation', + complexity: 'small', + originalFiles: {}, + operations: [], + diff: expectedDiff, + } + + const result = await runAgentOnCarve( + { + idx: 0, + total: 1, + repoPath, + feature, + model: 'sonnet', + groundTruthDiff: expectedDiff, + 
docsSourcePath: repoPath, + }, + { + createRunner: (repoDir) => ({ + run: async () => { + fs.writeFileSync(path.join(repoDir, 'src', 'feature.ts'), 'export const carved = true\n') + execSync('git add -A', { cwd: repoDir, stdio: 'ignore' }) + execSync('git commit -m "agent change"', { cwd: repoDir, stdio: 'ignore' }) + + return { + steps: [], + totalCostUsd: 1.25, + diff: expectedDiff, + } + }, + }), + buildCodingAgentPrompt: (prompt) => prompt, + judgeTaskResult: async ({ agentDiff }) => { + judgedDiff = agentDiff + return { + analysis: 'ok', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 7, + codeQualityScore: 7, + e2eScore: 7, + overallScore: 7, + } + }, + readCodingAgentSuggestions: () => ({ + docSuggestions: [], + projectSuggestions: [], + }), + }, + ) + + expect(result.diff).toBe(expectedDiff) + expect(judgedDiff).toBe(expectedDiff) + expect(result.score).toBe(7) + }) }) diff --git a/src/__tests__/load-env.test.ts b/src/__tests__/load-env.test.ts new file mode 100644 index 0000000..2e05e80 --- /dev/null +++ b/src/__tests__/load-env.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, it } from 'bun:test' + +import { parseEnvFile } from '../load-env' + +describe('parseEnvFile', () => { + it('parses dotenv-style assignments and ignores comments', () => { + expect(parseEnvFile(` +# comment +OPENAI_API_KEY=abc +export CLAUDE_CODE_KEY="def" +EMPTY= +QUOTED='ghi' +INVALID LINE +`)).toEqual([ + ['OPENAI_API_KEY', 'abc'], + ['CLAUDE_CODE_KEY', 'def'], + ['EMPTY', ''], + ['QUOTED', 'ghi'], + ]) + }) + + it('strips inline comments without breaking quoted hashes', () => { + expect(parseEnvFile(` +OPENAI_API_KEY=sk-live # local key +export CLAUDE_CODE_KEY="anthropic-key" # note +URL_WITH_HASH=https://example.com/#fragment +QUOTED_HASH="#still-a-value" +`)).toEqual([ + ['OPENAI_API_KEY', 'sk-live'], + ['CLAUDE_CODE_KEY', 'anthropic-key'], + ['URL_WITH_HASH', 'https://example.com/#fragment'], + ['QUOTED_HASH', '#still-a-value'], + ]) + }) +}) 
diff --git a/src/__tests__/run-evalbuff.e2e.test.ts b/src/__tests__/run-evalbuff.e2e.test.ts index 80ea190..71f711b 100644 --- a/src/__tests__/run-evalbuff.e2e.test.ts +++ b/src/__tests__/run-evalbuff.e2e.test.ts @@ -403,6 +403,9 @@ describe('Evalbuff pipeline e2e', () => { const docsStatePath = path.join(logDir, 'docs-state-loop-1.json') expect(fs.existsSync(docsStatePath)).toBe(true) + const docGatesPath = path.join(logDir, 'doc-gates-loop-1.json') + expect(fs.existsSync(docGatesPath)).toBe(true) + // --- Verify overall summary --- const summaryPath = path.join(logDir, 'summary.json') expect(fs.existsSync(summaryPath)).toBe(true) diff --git a/src/__tests__/run-evalbuff.test.ts b/src/__tests__/run-evalbuff.test.ts new file mode 100644 index 0000000..e014dd7 --- /dev/null +++ b/src/__tests__/run-evalbuff.test.ts @@ -0,0 +1,231 @@ +import { afterEach, describe, expect, it } from 'bun:test' + +import { evaluateDocChangeGate, gateDocsChangesForTask, runEvalRound } from '../run-evalbuff' +import { events } from '../tui/events' + +import type { CarvedFeature } from '../carve-features' +import type { TaskResult } from '../eval-runner' + +function createTaskResult(overrides: Partial = {}): TaskResult { + return { + featureId: 'feature-a', + prompt: 'Restore feature a', + score: 6, + diff: 'diff --git a/file.ts b/file.ts\n', + trace: '', + judging: { + analysis: 'ok', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 6, + codeQualityScore: 6, + e2eScore: 6, + overallScore: 6, + docSuggestions: [], + projectSuggestions: [], + }, + costEstimate: 0, + docsRead: [], + agentDocSuggestions: [], + agentProjectSuggestions: [], + ...overrides, + } +} + +function createFeature(): CarvedFeature { + return { + id: 'feature-a', + prompt: 'Restore feature a', + description: 'Test feature', + complexity: 'small', + originalFiles: {}, + operations: [], + diff: 'diff --git a/file.ts b/file.ts\n', + } +} + +describe('evaluateDocChangeGate', () => { + 
afterEach(() => { + events.close() + events.clearBuffer() + }) + + it('fast-accepts when the rejudge score drops by at least twice the normal threshold', () => { + const result = evaluateDocChangeGate({ + baseScore: 6, + rejudgeScore: 5, + }) + + expect(result.accepted).toBe(true) + expect(result.fastAccepted).toBe(true) + expect(result.status).toBe('accepted_fast_rejudge') + expect(result.gateDelta).toBeCloseTo(1, 6) + expect(result.reason).toBe('Accepted without rerun because rejudge dropped by 1.0.') + }) + + it('accepts when rerun minus rejudge clears the threshold', () => { + const result = evaluateDocChangeGate({ + baseScore: 6, + rejudgeScore: 5.8, + rerunScore: 6.4, + }) + + expect(result.accepted).toBe(true) + expect(result.fastAccepted).toBe(false) + expect(result.status).toBe('accepted') + expect(result.gateDelta).toBeCloseTo(0.6, 6) + expect(result.reason).toBe('Accepted because rerun minus rejudge was 0.6.') + }) + + it('rejects when rerun minus rejudge stays below the threshold', () => { + const result = evaluateDocChangeGate({ + baseScore: 6, + rejudgeScore: 5.8, + rerunScore: 6.1, + }) + + expect(result.accepted).toBe(false) + expect(result.fastAccepted).toBe(false) + expect(result.status).toBe('rejected') + expect(result.gateDelta).toBeCloseTo(0.3, 6) + expect(result.reason).toBe('Rejected because rerun minus rejudge was 0.3.') + }) + + it('adds validation rerun cost into round totals', async () => { + const feature = createFeature() + + const round = await runEvalRound( + [feature], + new Map([[feature.id, feature.diff]]), + { + repoPath: '/tmp/repo', + n: 1, + parallelism: 1, + loops: 1, + codingModel: 'sonnet', + docsModel: 'opus', + }, + 1, + undefined, + async () => 3, + { + runAgentOnCarve: async () => createTaskResult({ costEstimate: 2 }), + events, + startSpinner: () => {}, + updateSpinner: () => {}, + stopSpinner: () => {}, + printRoundScores: () => {}, + }, + ) + + expect(round.tasks[0]?.costEstimate).toBe(5) + 
expect(round.totalCost).toBe(5) + }) + + it('returns docs gate rerun cost and restores the evaluating phase after docs gating', async () => { + const feature = createFeature() + const task = createTaskResult({ + agentDocSuggestions: [{ text: 'Document the rerun gate', priority: 80 }], + }) + + const gated = await gateDocsChangesForTask( + { + feature, + task, + opts: { + repoPath: '/tmp/repo', + n: 1, + parallelism: 1, + loops: 1, + codingModel: 'sonnet', + docsModel: 'opus', + }, + groundTruthDiffs: new Map([[feature.id, feature.diff]]), + loop: 1, + }, + { + collectTaskDocSuggestions: (inputTask) => inputTask.agentDocSuggestions.map((suggestion) => ({ + ...suggestion, + source: 'agent' as const, + })), + filterDocSuggestionsForPlanning: (suggestions) => suggestions, + planDocsChangesForTask: async () => ({ + tempDir: '/tmp/docs-plan', + repoDir: '/tmp/docs-plan/repo', + baseCommit: 'base', + candidates: [ + { + accepted: true, + source: 'agent' as const, + priority: 80, + text: 'Document the rerun gate', + reason: 'Useful guidance', + overfit: false, + patchText: 'patch', + diffText: '--- a/docs/guide.md\n+++ b/docs/guide.md\n', + }, + ], + }), + materializeDocsChangeFromPatch: () => ({ + tempDir: '/tmp/draft-docs', + repoDir: '/tmp/draft-docs', + before: {}, + after: {}, + diffText: '--- a/docs/guide.md\n+++ b/docs/guide.md\n', + }), + cleanupDraftedDocsChange: () => {}, + acceptDraftedDocsChange: () => [], + cleanupPlannedDocsTaskResult: () => {}, + rejudgeTaskWithCurrentDocs: async () => ({ + analysis: 'rejudge', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 5.8, + codeQualityScore: 5.8, + e2eScore: 5.8, + overallScore: 5.8, + }), + runAgentOnCarve: async () => createTaskResult({ + score: 6.4, + costEstimate: 3, + judging: { + analysis: 'rerun', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 6.4, + codeQualityScore: 6.4, + e2eScore: 6.4, + overallScore: 6.4, + docSuggestions: [], + 
projectSuggestions: [], + }, + }), + events, + }, + ) + + expect(gated.validationCost).toBe(3) + expect(gated.result.candidates[0]?.rerunScore).toBe(6.4) + + const phaseChanges: Array<{ phase: string; round?: number; loop?: number }> = [] + events.replay(({ event }) => { + if (event.type === 'phase_change') { + phaseChanges.push({ phase: event.phase, round: event.round, loop: event.loop }) + } + }) + + expect(phaseChanges).toContainEqual({ + phase: 'docs_writer', + round: 1, + loop: 1, + }) + expect(phaseChanges).toContainEqual({ + phase: 'evaluating', + round: 1, + loop: 1, + }) + }) +}) diff --git a/src/cli.ts b/src/cli.ts index e916f74..b5269c2 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -3,7 +3,7 @@ * Evalbuff CLI entry point. * * Usage: - * evalbuff --repo /path/to/repo [--n 20] [--parallelism 10] [--loops 3] + * evalbuff --repo /path/to/repo [--n 20] [--parallelism 1] [--loops 1] * [--init-command "npm install"] [--coding-model sonnet] [--docs-model opus] * [--cached-features /path/to/features.json] */ @@ -27,8 +27,8 @@ if (args.length === 0 || args.includes('--help') || args.includes('-h')) { Options: --repo Path to the target repository (required) --n Number of features to evaluate (default: 20) - --parallelism Max parallel agent runs (default: 10) - --loops Number of optimization loops (default: 3) + --parallelism Max concurrent carve/setup jobs (default: 1) + --loops Number of optimization loops (default: 1) --init-command Command to run before each agent run --coding-model Model for coding agent (default: sonnet) --docs-model Model for docs agent (default: opus) @@ -48,8 +48,8 @@ const hasArg = (name: string): boolean => args.includes(`--${name}`) const repoPath = getArg('repo') const n = parseInt(getArg('n', '20')) -const parallelism = parseInt(getArg('parallelism', '10')) -const loops = parseInt(getArg('loops', '3')) +const parallelism = parseInt(getArg('parallelism', '1')) +const loops = parseInt(getArg('loops', '1')) const initCommand = 
hasArg('init-command') ? getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') diff --git a/src/docs-writer.ts b/src/docs-writer.ts index e8ae999..536d56a 100644 --- a/src/docs-writer.ts +++ b/src/docs-writer.ts @@ -1,23 +1,182 @@ -import { execSync } from 'child_process' +import { execFileSync, execSync } from 'child_process' import fs from 'fs' import os from 'os' import path from 'path' +import { z } from 'zod/v4' + import { ClaudeRunner } from './runners/claude' -import { syncDocsIntoRepo } from './eval-helpers' +import { computeDocsDiffText, copyDocsIntoRepo, ensureGitIdentity, getDocsSnapshot, syncDocsIntoRepo } from './eval-helpers' +import { SuggestionSchema } from './judge' import type { TaskResult } from './eval-runner' +import type { Suggestion } from './judge' + +export type SuggestionSource = 'judge' | 'agent' | 'judge+agent' + +export interface IndependentSuggestion extends Suggestion { + source: SuggestionSource +} + +export interface CodingAgentSuggestions { + docSuggestions: Suggestion[] + projectSuggestions: Suggestion[] +} + +export interface DraftedDocsChange { + tempDir: string + repoDir: string + before: Record + after: Record + diffText: string +} + +export interface PlannedDocsChange { + text: string + priority: number + source: SuggestionSource + accepted: boolean + reason: string + overfit: boolean + branchName?: string + commitSha?: string + patchText?: string + diffText?: string +} + +export interface PlannedDocsTaskResult { + tempDir: string + repoDir: string + baseCommit: string + candidates: PlannedDocsChange[] +} + +export const CODING_AGENT_SUGGESTIONS_FILE = 'evalbuff-coding-suggestions.json' +const DOCS_WRITER_PLAN_FILE = 'evalbuff-doc-changes-plan.json' +export const DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR = 40 +const DOCS_WRITER_FAILURE_PREFIX = 'evalbuff-docs-writer-failure-' + +const CodingAgentSuggestionsSchema = z.object({ + docSuggestions: 
z.array(SuggestionSchema).default([]), + projectSuggestions: z.array(SuggestionSchema).default([]), +}) + +const DocsWriterPlanEntrySchema = z.object({ + text: z.string(), + priority: z.number().min(0).max(100), + source: z.enum(['judge', 'agent', 'judge+agent']), + accepted: z.boolean(), + reason: z.string(), + overfit: z.boolean().default(false), + branchName: z.string().optional(), + commitSha: z.string().optional(), +}) + +const DocsWriterPlanSchema = z.object({ + candidates: z.array(DocsWriterPlanEntrySchema).default([]), +}) + +function mergeSuggestions( + entries: Array<{ source: 'judge' | 'agent'; suggestion: Suggestion }>, +): IndependentSuggestion[] { + const merged = new Map() + + for (const entry of entries) { + const key = entry.suggestion.text.trim().toLowerCase() + const existing = merged.get(key) + if (!existing) { + merged.set(key, { + ...entry.suggestion, + source: entry.source, + }) + continue + } + + const nextSource: SuggestionSource = + existing.source === entry.source ? existing.source : 'judge+agent' + + merged.set(key, { + text: existing.text, + priority: Math.max(existing.priority, entry.suggestion.priority), + source: nextSource, + }) + } + + return [...merged.values()].sort((a, b) => b.priority - a.priority) +} + +export function buildCodingAgentPrompt(taskPrompt: string): string { + return `${taskPrompt} -export function collectDocSuggestions(tasks: TaskResult[]): string { +After you finish the coding task, write JSON to \`${CODING_AGENT_SUGGESTIONS_FILE}\` in the repo root with this exact shape: + +\`\`\`json +{ + "docSuggestions": [ + { "text": "one independent docs change", "priority": 70 } + ], + "projectSuggestions": [ + { "text": "one independent project change", "priority": 55 } + ] +} +\`\`\` + +Rules for the suggestions file: +- Each entry must be an independent suggestion that can be implemented on its own. 
+- \`docSuggestions\` must focus on general documentation changes that would help future coding agents or reviewers succeed on similar tasks. +- \`projectSuggestions\` must describe project changes (source, tests, infra, cleanup), not docs changes. +- Use priorities from 0-100. +- If you have no suggestions for a category, write an empty array for it. +- Write the file as your last action.` +} + +export function readCodingAgentSuggestions(repoDir: string): CodingAgentSuggestions { + const resultPath = path.join(repoDir, CODING_AGENT_SUGGESTIONS_FILE) + try { + if (!fs.existsSync(resultPath)) { + return { docSuggestions: [], projectSuggestions: [] } + } + const raw = JSON.parse(fs.readFileSync(resultPath, 'utf-8')) + const parsed = CodingAgentSuggestionsSchema.safeParse(raw) + if (!parsed.success) { + return { docSuggestions: [], projectSuggestions: [] } + } + return parsed.data + } catch { + return { docSuggestions: [], projectSuggestions: [] } + } +} + +export function collectTaskDocSuggestions(task: TaskResult): IndependentSuggestion[] { + return mergeSuggestions([ + ...(task.judging.docSuggestions || []).map((suggestion) => ({ + source: 'judge' as const, + suggestion, + })), + ...task.agentDocSuggestions.map((suggestion) => ({ + source: 'agent' as const, + suggestion, + })), + ]) +} + +export function filterDocSuggestionsForPlanning( + suggestions: IndependentSuggestion[], + minPriority: number = DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR, +): IndependentSuggestion[] { + return suggestions.filter((suggestion) => suggestion.priority >= minPriority) +} + +export function renderDocSuggestions(tasks: TaskResult[]): string { const sections: string[] = [] for (const task of tasks) { - const suggestions = task.judging.docSuggestions + const suggestions = collectTaskDocSuggestions(task) if (!suggestions || suggestions.length === 0) continue sections.push( `### ${task.featureId} (score: ${task.score.toFixed(1)}/10)\n` + - suggestions.map((s) => `- [priority ${s.priority}] 
${s.text}`).join('\n'), + suggestions.map((s) => `- [${s.source}] [priority ${s.priority}] ${s.text}`).join('\n'), ) } @@ -28,103 +187,228 @@ export function collectProjectSuggestions(tasks: TaskResult[]): string { const sections: string[] = [] for (const task of tasks) { - const suggestions = task.judging.projectSuggestions + const suggestions = mergeSuggestions([ + ...((task.judging.projectSuggestions || []).map((suggestion) => ({ + source: 'judge' as const, + suggestion, + }))), + ...task.agentProjectSuggestions.map((suggestion) => ({ + source: 'agent' as const, + suggestion, + })), + ]) if (!suggestions || suggestions.length === 0) continue sections.push( `### ${task.featureId} (score: ${task.score.toFixed(1)}/10)\n` + - suggestions.map((s) => `- [priority ${s.priority}] ${s.text}`).join('\n'), + suggestions.map((s) => `- [${s.source}] [priority ${s.priority}] ${s.text}`).join('\n'), ) } return sections.join('\n\n') } -export async function runDocsWriterAgent( +export async function planDocsChangesForTask( repoPath: string, - judgeSuggestions: string, + suggestions: IndependentSuggestion[], model: string, -): Promise { + minPriority: number = DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR, +): Promise { const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-')) const repoDir = path.join(tempDir, 'repo') + let prompt = '' + let runnerResult: Awaited> | null = null + let baseCommit = '' + let lastError: unknown = null - const prompt = `Read ALL existing documentation (docs/, AGENTS.md, CLAUDE.md), consider the judge suggestions below, and make the documentation as useful as possible for coding agents. - -## Goal - -The purpose of these docs is to help a coding agent successfully build NEW features it has never seen before, AND to help reviewers verify that changes actually work. 
The docs should teach the agent how the project works — its architecture, patterns, conventions, and rules — so it can confidently build anything, not just reconstruct specific existing features. They should also document testing strategies, verification processes, and end-to-end testing approaches that help reviewers evaluate changes beyond just reading the diff. - -## Judge Suggestions - -Multiple judge agents reviewed coding agent attempts and identified documentation gaps. Here are their suggestions, each tagged with a priority score (0-100). Higher priority means more impactful. When the same suggestion appears multiple times across features, that's a signal it deserves higher effective priority. + function preserveFailure(reason: string): void { + try { + const failureDir = fs.mkdtempSync(path.join(os.tmpdir(), DOCS_WRITER_FAILURE_PREFIX)) + fs.writeFileSync(path.join(failureDir, 'reason.txt'), reason) + fs.writeFileSync(path.join(failureDir, 'prompt.txt'), prompt) + if (lastError) { + const errorText = lastError instanceof Error + ? `${lastError.name}: ${lastError.message}\n${lastError.stack || ''}`.trim() + : String(lastError) + fs.writeFileSync(path.join(failureDir, 'error.txt'), errorText + '\n') + } + if (runnerResult) { + fs.writeFileSync( + path.join(failureDir, 'trace.txt'), + runnerResult.steps.map((step) => JSON.stringify(step)).join('\n'), + ) + fs.writeFileSync(path.join(failureDir, 'diff.txt'), runnerResult.diff) + } + if (fs.existsSync(repoDir)) { + fs.renameSync(tempDir, path.join(failureDir, 'workspace')) + } + console.error(`Preserved docs-writer failure bundle at ${failureDir}`) + } catch { + try { + cleanupPlannedDocsTaskResult({ tempDir, repoDir, baseCommit, candidates: [] }) + } catch { + // ignore cleanup failures + } + } + } -**Focus on suggestions with priority 40+. Ignore suggestions with priority below 20 unless they appear multiple times.** Low-priority suggestions are minor nice-to-haves that aren't worth the docs clutter. 
+ try { + execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' }) + const headSha = execSync('git rev-parse HEAD', { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' }) + ensureGitIdentity(repoDir) + copyDocsIntoRepo(repoPath, repoDir) + baseCommit = execSync('git rev-parse HEAD', { + cwd: repoDir, + encoding: 'utf-8', + }).trim() -${judgeSuggestions || '(No suggestions were made)'} + prompt = `Read ALL existing documentation (docs/, AGENTS.md, CLAUDE.md) once, then plan and implement a set of independent candidate documentation changes. -## What to do +## Candidate Suggestions -1. **Extract general patterns** — each judge suggestion reflects a specific failure, but your job is to identify the underlying pattern or convention that would prevent a whole class of similar failures. Ask: "What general rule would help an agent get this right for ANY feature?" Some suggestions are about testing/verification strategies for reviewers — treat those as equally important and document them in the appropriate docs (e.g., docs/testing.md or similar). -2. **Do NOT reference specific features** — never mention a specific feature, component, or endpoint by name as an example of what to build. Instead, document the pattern it follows. For example, instead of "the UserProfile component fetches data in useEffect", write "components in this project fetch data using useEffect on mount, following the pattern in src/hooks/". -3. **Document architecture and data flow** — describe how the project is structured, how data flows through it, and where new code should be placed. These are the things an agent building something new needs most. -4. **Edit existing docs** — when a suggestion maps to an existing doc, make fine-grained edits rather than rewriting from scratch. -5. **Create new docs** — when a suggestion identifies a missing pattern or convention, create a concise new doc for it. -6. 
**Merge overlapping docs** — if multiple suggestions or existing docs cover similar topics, combine them. -7. **Remove redundancy** — consolidate duplicate advice. Dense, actionable information beats verbose explanations. -8. **Fix contradictions** — if docs disagree, pick the correct advice and remove the wrong one. -9. **Prune stale docs** — remove docs that reference files/patterns that no longer exist in the codebase. +${suggestions.length > 0 + ? suggestions.map((suggestion, index) => ( + `${index + 1}. [${suggestion.source}] [priority ${suggestion.priority}] ${suggestion.text}` + )).join('\n') + : '(No suggestions were provided)'} -Rules: -- ONLY modify files in docs/, AGENTS.md, or CLAUDE.md. Do NOT modify source code. -- **Do NOT edit AGENTS.md beyond adding new docs to its index.** The only allowed changes to AGENTS.md are: (a) adding/removing entries in the doc index when you create or delete files under docs/, and (b) correcting existing information that is factually wrong. Do NOT add new paragraphs, prose, sections, or explanatory text above or below existing content. Put all new guidance in docs/ files and link to them from the index. -- It's OK to delete doc files that are redundant or low-value. -- The goal is a minimal, high-signal set of docs that a coding agent can use to build ANY feature, including ones that don't exist yet. -- Less is more — 5 great docs are better than 15 mediocre ones. -- Document patterns, conventions, and architectural rules — not specific feature implementations. -- Be specific about file paths, directory structure, and conventions — but generic about what gets built. +## Required filtering -## Docs Must Match Source Code +You must reject a suggestion instead of editing docs when ANY of the following is true: +- It is overfit to just the current task and would not help future unrelated tasks. +- It mainly documents a task-specific fix rather than a reusable project pattern. +- It is already covered by the current docs. 
+- It is too low priority to justify docs churn. Treat priorities below ${minPriority} as low priority unless the suggestion is clearly critical anyway. +- It would require documenting nonexistent or aspirational behavior. -Docs that describe nonexistent code are WORSE than no docs at all — they actively mislead coding agents and cause them to fail. +## Implementation workflow -Before writing any doc that references a helper, function, type, or script: -1. **grep for the exact symbol name** to confirm it exists. If it doesn't exist, DO NOT document it. -2. **Never document aspirational/future behavior.** Only document what the code does RIGHT NOW. -3. **If a judge suggestion references a helper that doesn't exist**, document the PATTERN the agent should follow instead — not a fictional API. +1. Read the current docs first. +2. Immediately create \`${DOCS_WRITER_PLAN_FILE}\` in the repo root with one entry per suggestion. Start with every entry marked \`accepted: false\` and a placeholder \`reason\`. Update this file as you make decisions. Do not wait until the end to create it. +3. Evaluate every suggestion and decide whether it should be accepted. +4. For each accepted suggestion: + - Run \`git checkout --quiet ${baseCommit}\` + - Run \`git checkout -B evalbuff-doc-change-N\` + - Implement exactly one independent docs change. + - Keep it general, reusable, and not overfit. + - Run \`git add docs AGENTS.md CLAUDE.md\` + - Run \`git commit -m "evalbuff: doc change N"\` + - Record the branch name and commit SHA in \`${DOCS_WRITER_PLAN_FILE}\` + - Run \`git checkout --quiet ${baseCommit}\` before moving to the next suggestion so branches stay independent. +5. For each rejected suggestion, make no docs changes and record the rejection reason in \`${DOCS_WRITER_PLAN_FILE}\`. +6. Before finishing, ensure HEAD is back at \`${baseCommit}\`. 
-Wrong: "Use \`captureGitDiff()\` from src/eval-helpers.ts to capture diffs" (if it doesn't exist) -Right: "Diff capture should use an explicit base SHA recorded before the agent runs" (describes the pattern) +## Required output shape -## Final Step: Spawn a Critique Sub-Agent +\`\`\`json +{ + "candidates": [ + { + "text": "original suggestion text", + "priority": 70, + "source": "judge", + "accepted": true, + "reason": "Why this is broadly useful and not overfit", + "overfit": false, + "branchName": "evalbuff-doc-change-1", + "commitSha": "abc123" + }, + { + "text": "another suggestion", + "priority": 20, + "source": "agent", + "accepted": false, + "reason": "Rejected because this is overfit to one task", + "overfit": true + } + ] +} +\`\`\` -Before you finish, you MUST spawn a critique sub-agent via the Task tool (subagent_type: "general-purpose") to review the docs you just wrote or modified. Then apply every valid fix it identifies. +Rules: +- ONLY modify docs/, AGENTS.md, or CLAUDE.md. +- Do NOT modify source code. +- Every accepted branch must stand on its own when diffed against \`${baseCommit}\`. +- Keep AGENTS.md changes limited to doc-index maintenance or factual corrections. +- Verify referenced helpers, scripts, file paths, and symbols against the codebase before documenting them. +- Do not document aspirational behavior. +- If all suggestions are rejected, still write the JSON file with every rejection recorded. +- The \`reason\` field must explicitly say why a rejected suggestion is overfit or low value when that applies.` -Use this exact prompt for the sub-agent: + const runner = new ClaudeRunner(repoDir, {}, model, 'high') + runnerResult = await runner.run(prompt) ---- -You are a documentation critic. Review every file under docs/, plus AGENTS.md and CLAUDE.md, and report violations of the rules below. 
For each violation, give the file path, the offending text or line range, and a concrete fix (exact replacement text, the section to remove, or the split to perform). + const planPath = path.join(repoDir, DOCS_WRITER_PLAN_FILE) + if (!fs.existsSync(planPath)) { + preserveFailure('Missing evalbuff-doc-changes-plan.json after docs-writer run') + return null + } -Rules (enforce strictly): + const raw = JSON.parse(fs.readFileSync(planPath, 'utf-8')) + const parsed = DocsWriterPlanSchema.safeParse(raw) + if (!parsed.success) { + preserveFailure('Invalid evalbuff-doc-changes-plan.json shape') + return null + } -1. **No overfitting to a single task.** Docs must describe general patterns, conventions, and architecture that apply to building ANY feature — not one specific task. Flag: - - Feature-specific function, type, component, endpoint, table, or CLI-subcommand names that only matter for one task and are not shared infrastructure. - - Examples phrased around one feature ("the UserProfile component fetches data via useEffect") instead of the general pattern ("components in src/components/ fetch data in useEffect on mount"). - - Any symbol reference that does not represent a shared utility, pattern, or architectural boundary used by multiple features. - The fix is to rewrite the passage as a general rule about the pattern, directory, or convention — or delete it if it does not generalize. + const candidates: PlannedDocsChange[] = [] + for (const entry of parsed.data.candidates) { + const planned: PlannedDocsChange = { + ...entry, + } -2. **No code excerpts unless documenting a common utility or shared pattern.** A code block is only allowed when it shows: - - The signature or usage of a shared helper multiple features rely on, OR - - A canonical pattern every agent should copy (error handling, a standard import shape, etc.). - Flag any code block that shows task-specific implementation details. 
The fix is to delete the block or replace it with a one-line prose description of the pattern. + if (entry.accepted && entry.branchName) { + try { + const patchText = execFileSync( + 'git', + ['diff', '--binary', `${baseCommit}..${entry.branchName}`, '--', 'docs', 'AGENTS.md', 'CLAUDE.md'], + { cwd: repoDir, encoding: 'utf-8' }, + ) + execFileSync('git', ['checkout', '--quiet', entry.branchName], { cwd: repoDir, stdio: 'ignore' }) + const before = getDocsSnapshot(repoPath) + const after = getDocsSnapshot(repoDir) + const diffText = computeDocsDiffText(before, after) + execFileSync('git', ['checkout', '--quiet', baseCommit], { cwd: repoDir, stdio: 'ignore' }) + planned.patchText = patchText + planned.diffText = diffText + } catch { + planned.accepted = false + planned.reason = `Rejected because the committed docs change could not be extracted: ${planned.reason}` + planned.overfit = planned.overfit || false + delete planned.branchName + delete planned.commitSha + } + } -3. **Individual markdown files must stay focused and reasonably short.** If any single file exceeds roughly 300 lines, OR covers multiple unrelated topics, recommend splitting it into smaller topic-scoped files and specify the split (new filenames + which sections move where). Prefer many small focused docs over one large doc. + candidates.push(planned) + } -4. **Docs must match source code.** Before flagging a missing symbol, grep the repo to confirm it does not exist. Flag references to helpers, functions, types, files, or scripts that are not present in the code. + return { tempDir, repoDir, baseCommit, candidates } + } catch (error) { + lastError = error + preserveFailure('Unhandled exception while planning docs changes') + return null + } +} -Return a numbered list of violations with fixes. If a file is clean, say so. Do not edit any files yourself — only report. 
---- +export function cleanupPlannedDocsTaskResult(result: PlannedDocsTaskResult): void { + try { + fs.rmSync(result.tempDir, { recursive: true, force: true }) + } catch { + // ignore cleanup failures + } +} -After the sub-agent returns, apply every valid fix it identified by editing the doc files directly. If it recommended splitting a long doc, perform the split. Re-read each affected file after fixing to confirm the result. Only then finish.` +export function materializeDocsChangeFromPatch( + repoPath: string, + patchText: string, +): DraftedDocsChange | null { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-materialized-')) + const repoDir = path.join(tempDir, 'repo') try { execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' }) @@ -133,20 +417,50 @@ After the sub-agent returns, apply every valid fix it identified by editing the encoding: 'utf-8', }).trim() execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' }) + ensureGitIdentity(repoDir) + copyDocsIntoRepo(repoPath, repoDir) - syncDocsIntoRepo(repoPath, repoDir) + const before = getDocsSnapshot(repoDir) + const patchPath = path.join(tempDir, 'docs-change.patch') + fs.writeFileSync(patchPath, patchText.endsWith('\n') ? 
patchText : patchText + '\n') - const runner = new ClaudeRunner(repoDir, {}, model, 'high') - await runner.run(prompt) - syncDocsIntoRepo(repoDir, repoPath) - } catch { - // Failure is handled by the caller via missing docs changes - } finally { try { - fs.rmSync(tempDir, { recursive: true, force: true }) + execFileSync('git', ['apply', '--whitespace=nowarn', '--allow-empty', patchPath], { + cwd: repoDir, + stdio: 'ignore', + }) } catch { - // ignore cleanup failures + execFileSync('git', ['apply', '--3way', '--whitespace=nowarn', patchPath], { + cwd: repoDir, + stdio: 'ignore', + }) } + + const after = getDocsSnapshot(repoDir) + const diffText = computeDocsDiffText(before, after) + return { tempDir, repoDir, before, after, diffText } + } catch { + cleanupDraftedDocsChange({ tempDir, repoDir, before: {}, after: {}, diffText: '' }) + return null + } +} + +export function acceptDraftedDocsChange( + repoPath: string, + draft: DraftedDocsChange, +): string[] { + try { + return syncDocsIntoRepo(draft.repoDir, repoPath) + } finally { + cleanupDraftedDocsChange(draft) + } +} + +export function cleanupDraftedDocsChange(draft: DraftedDocsChange): void { + try { + fs.rmSync(draft.tempDir, { recursive: true, force: true }) + } catch { + // ignore cleanup failures } } diff --git a/src/eval-runner.ts b/src/eval-runner.ts index c4b195d..e80f94e 100644 --- a/src/eval-runner.ts +++ b/src/eval-runner.ts @@ -4,11 +4,12 @@ import os from 'os' import path from 'path' import { ClaudeRunner } from './runners/claude' +import { buildCodingAgentPrompt, CODING_AGENT_SUGGESTIONS_FILE, readCodingAgentSuggestions } from './docs-writer' import { judgeTaskResult } from './judge' import { applyCarveOperations, copyDocsIntoRepo, ensureGitIdentity, extractDocsRead } from './eval-helpers' import type { CarvedFeature } from './carve-features' -import type { JudgingResult } from './judge' +import type { JudgingResult, Suggestion } from './judge' import type { RunnerResult } from './runners/runner' 
import { execFileSync } from 'child_process' @@ -22,6 +23,22 @@ export interface TaskResult { judging: JudgingResult costEstimate: number docsRead: string[] + agentDocSuggestions: Suggestion[] + agentProjectSuggestions: Suggestion[] +} + +type RunAgentOnCarveDeps = { + createRunner: (repoDir: string, model: string) => { run: (prompt: string) => Promise } + buildCodingAgentPrompt: typeof buildCodingAgentPrompt + judgeTaskResult: typeof judgeTaskResult + readCodingAgentSuggestions: typeof readCodingAgentSuggestions +} + +const defaultRunAgentOnCarveDeps: RunAgentOnCarveDeps = { + createRunner: (repoDir, model) => new ClaudeRunner(repoDir, {}, model, 'medium'), + buildCodingAgentPrompt, + judgeTaskResult, + readCodingAgentSuggestions, } export async function runAgentOnCarve(opts: { @@ -33,7 +50,7 @@ export async function runAgentOnCarve(opts: { model: string groundTruthDiff: string docsSourcePath: string -}): Promise { +}, deps: RunAgentOnCarveDeps = defaultRunAgentOnCarveDeps): Promise { const { idx, total, repoPath, feature, initCommand, model, groundTruthDiff, docsSourcePath } = opts const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-eval-')) @@ -66,15 +83,25 @@ export async function runAgentOnCarve(opts: { } } - const runner = new ClaudeRunner(repoDir, {}, model, 'medium') + const runner = deps.createRunner(repoDir, model) let result: RunnerResult try { - result = await runner.run(feature.prompt) + result = await runner.run(deps.buildCodingAgentPrompt(feature.prompt)) } catch (runError) { return createInfrastructureFailureResult(feature, runError) } + const agentSuggestions = deps.readCodingAgentSuggestions(repoDir) + try { + fs.rmSync(path.join(repoDir, CODING_AGENT_SUGGESTIONS_FILE), { force: true }) + } catch { + // Ignore cleanup failures + } + // Preserve the runner's diff, which may already be captured relative to + // the pre-run base SHA and can include committed agent changes. 
+ const diff = result.diff + // Raw JSONL trace — compression happens later when the trace is saved // to disk by saveRoundResults() in report.ts via compressAndSave(). const agentTrace = result.steps.map((step) => JSON.stringify(step)).join('\n') @@ -83,9 +110,9 @@ export async function runAgentOnCarve(opts: { let judging: JudgingResult try { judging = await Promise.race([ - judgeTaskResult({ + deps.judgeTaskResult({ taskPrompt: feature.prompt, - agentDiff: result.diff, + agentDiff: diff, groundTruthDiff, repoDir: repoDir, }), @@ -111,11 +138,13 @@ export async function runAgentOnCarve(opts: { featureId: feature.id, prompt: feature.prompt, score: judging.overallScore, - diff: result.diff, + diff, trace: agentTrace, judging, costEstimate: result.totalCostUsd, docsRead: extractDocsRead(result.steps), + agentDocSuggestions: agentSuggestions.docSuggestions, + agentProjectSuggestions: agentSuggestions.projectSuggestions, } } catch (error) { return createInfrastructureFailureResult(feature, error) @@ -128,7 +157,7 @@ export async function runAgentOnCarve(opts: { } /** - * Re-judge a baseline task using the current docs in docsSourcePath. + * Re-judge a task using the current docs in docsSourcePath. * * Recreates the exact repo state the original judge saw (carved repo + agent's * baseline diff applied), but with whatever docs currently live in @@ -136,17 +165,17 @@ export async function runAgentOnCarve(opts: { * judge itself scores differently once given better docs, independent of any * agent behavior change. 
*/ -export async function rejudgeBaselineWithCurrentDocs(opts: { +export async function rejudgeTaskWithCurrentDocs(opts: { idx: number total: number repoPath: string feature: CarvedFeature - baselineDiff: string + agentDiff: string groundTruthDiff: string initCommand?: string docsSourcePath: string }): Promise { - const { idx, total, repoPath, feature, baselineDiff, groundTruthDiff, initCommand, docsSourcePath } = opts + const { idx, total, repoPath, feature, agentDiff, groundTruthDiff, initCommand, docsSourcePath } = opts const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-rejudge-')) const repoDir = path.join(tempDir, 'repo') @@ -167,9 +196,9 @@ export async function rejudgeBaselineWithCurrentDocs(opts: { copyDocsIntoRepo(docsSourcePath, repoDir) // Apply the baseline agent's diff to reproduce the state the judge saw - if (baselineDiff.trim()) { + if (agentDiff.trim()) { const patchPath = path.join(tempDir, 'baseline.patch') - fs.writeFileSync(patchPath, baselineDiff.endsWith('\n') ? baselineDiff : baselineDiff + '\n') + fs.writeFileSync(patchPath, agentDiff.endsWith('\n') ? 
agentDiff : agentDiff + '\n') try { execFileSync('git', ['apply', '--whitespace=nowarn', '--allow-empty', patchPath], { cwd: repoDir, @@ -197,7 +226,7 @@ export async function rejudgeBaselineWithCurrentDocs(opts: { return await Promise.race([ judgeTaskResult({ taskPrompt: feature.prompt, - agentDiff: baselineDiff, + agentDiff, groundTruthDiff, repoDir, }), @@ -212,6 +241,22 @@ export async function rejudgeBaselineWithCurrentDocs(opts: { } } +export async function rejudgeBaselineWithCurrentDocs(opts: { + idx: number + total: number + repoPath: string + feature: CarvedFeature + baselineDiff: string + groundTruthDiff: string + initCommand?: string + docsSourcePath: string +}): Promise { + return rejudgeTaskWithCurrentDocs({ + ...opts, + agentDiff: opts.baselineDiff, + }) +} + function createInfrastructureFailureResult( feature: CarvedFeature, error: unknown, @@ -235,5 +280,7 @@ function createInfrastructureFailureResult( }, costEstimate: 0, docsRead: [], + agentDocSuggestions: [], + agentProjectSuggestions: [], } } diff --git a/src/load-env.ts b/src/load-env.ts new file mode 100644 index 0000000..7072c79 --- /dev/null +++ b/src/load-env.ts @@ -0,0 +1,80 @@ +import fs from 'fs' +import path from 'path' + +const ENV_FILES = ['.env.local', '.env'] + +function stripInlineComment(rawValue: string): string { + let quote: '"' | "'" | null = null + let escaped = false + + for (let i = 0; i < rawValue.length; i++) { + const ch = rawValue[i] + + if (escaped) { + escaped = false + continue + } + + if (ch === '\\') { + escaped = true + continue + } + + if (quote) { + if (ch === quote) quote = null + continue + } + + if (ch === '"' || ch === "'") { + quote = ch + continue + } + + if (ch === '#' && i > 0 && /\s/.test(rawValue[i - 1])) { + return rawValue.slice(0, i).trimEnd() + } + } + + return rawValue.trim() +} + +export function parseEnvFile(content: string): Array<[string, string]> { + const entries: Array<[string, string]> = [] + + for (const rawLine of content.split(/\r?\n/)) 
{ + const line = rawLine.trim() + if (!line || line.startsWith('#')) continue + + const match = line.match(/^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/) + if (!match) continue + + const [, key, rawValue] = match + let value = stripInlineComment(rawValue.trim()) + + if ( + (value.startsWith('"') && value.endsWith('"')) || + (value.startsWith("'") && value.endsWith("'")) + ) { + value = value.slice(1, -1) + } + + entries.push([key, value]) + } + + return entries +} + +function loadEnvFile(filePath: string): void { + if (!fs.existsSync(filePath)) return + + const content = fs.readFileSync(filePath, 'utf-8') + for (const [key, value] of parseEnvFile(content)) { + if (process.env[key] === undefined) { + process.env[key] = value + } + } +} + +for (const fileName of ENV_FILES) { + loadEnvFile(path.join(process.cwd(), fileName)) +} diff --git a/src/report.ts b/src/report.ts index 9a7730e..1bafa1a 100644 --- a/src/report.ts +++ b/src/report.ts @@ -5,6 +5,7 @@ import { compressAndSave } from './trace-compressor' import type { TaskResult } from './eval-runner' import type { JudgingResult } from './judge' +import type { SuggestionSource } from './docs-writer' export interface RoundResult { round: number @@ -13,6 +14,43 @@ export interface RoundResult { totalCost: number } +export interface DocChangeGateCandidateResult { + source: SuggestionSource + priority: number + text: string + accepted: boolean + fastAccepted: boolean + status: + | 'accepted' + | 'accepted_fast_rejudge' + | 'rejected' + | 'rejected_overfit' + | 'rejected_no_change' + | 'rejected_writer_failed' + | 'rejected_rejudge_failed' + | 'rejected_rerun_failed' + | 'skipped_low_priority' + reason: string + baseScore: number + rejudgeScore?: number + rerunScore?: number + gateDelta?: number + docsDiff: string +} + +export interface FeatureDocGateResult { + featureId: string + baseScore: number + candidates: DocChangeGateCandidateResult[] +} + +export interface LoopDocGateResult { + loop: number + threshold: 
number + fastAcceptThreshold: number + features: FeatureDocGateResult[] +} + export interface EvalSummary { repoPath: string startTime: string @@ -31,6 +69,8 @@ export interface EvalSummary { // scoreProgression[0]. A flat or rising line here with a rising agent line // suggests the docs are improving the agent beyond just judge recalibration. baselineRejudgeProgression?: number[] + consideredDocChangesByLoop?: number[] + acceptedDocChangesByLoop?: number[] projectPrompts?: string[] } @@ -65,6 +105,16 @@ export function saveBaselineRejudgeResults(logDir: string, roundResult: RoundRes fs.writeFileSync(path.join(roundDir, 'summary.json'), JSON.stringify(summary, null, 2)) } +export function saveLoopDocGateResults( + logDir: string, + loopResult: LoopDocGateResult, +): void { + fs.writeFileSync( + path.join(logDir, `doc-gates-loop-${loopResult.loop}.json`), + JSON.stringify(loopResult, null, 2), + ) +} + export function saveRoundResults(logDir: string, roundResult: RoundResult): void { const roundDir = path.join(logDir, `round-${roundResult.round}`) fs.mkdirSync(roundDir, { recursive: true }) @@ -85,6 +135,13 @@ export function saveRoundResults(logDir: string, roundResult: RoundResult): void fs.writeFileSync(path.join(taskDir, 'diff.txt'), task.diff) fs.writeFileSync(path.join(taskDir, 'judging.json'), JSON.stringify(task.judging, null, 2)) + fs.writeFileSync( + path.join(taskDir, 'agent-suggestions.json'), + JSON.stringify({ + docSuggestions: task.agentDocSuggestions, + projectSuggestions: task.agentProjectSuggestions, + }, null, 2), + ) fs.writeFileSync(path.join(taskDir, 'score.txt'), task.score.toString()) } @@ -116,6 +173,7 @@ export function saveSummary( roundResults: RoundResult[], opts: EvalOptions, baselineRejudgeResults: RoundResult[] = [], + loopDocGateResults: LoopDocGateResult[] = [], projectPrompts: string[] = [], ): void { fs.writeFileSync(path.join(logDir, 'summary.json'), JSON.stringify(summary, null, 2)) @@ -139,6 +197,7 @@ export function 
saveSummary( `| **Improvement loops** | ${opts.loops} |`, `| **Coding model** | ${opts.codingModel} |`, `| **Docs model** | ${opts.docsModel} |`, + `| **Doc gate threshold** | ${loopDocGateResults[0]?.threshold?.toFixed(1) ?? 'n/a'} |`, `| **Total cost** | $${summary.totalCost.toFixed(2)} |`, '', ) @@ -237,6 +296,16 @@ export function saveSummary( push('') } + if (summary.acceptedDocChangesByLoop && summary.acceptedDocChangesByLoop.length > 0) { + push('## Doc Change Gating', '') + for (let i = 0; i < summary.acceptedDocChangesByLoop.length; i++) { + const accepted = summary.acceptedDocChangesByLoop[i] + const considered = summary.consideredDocChangesByLoop?.[i] ?? accepted + push(`- Loop ${i + 1}: accepted ${accepted}/${considered} candidate doc changes`) + } + push('') + } + // --- Per-round detail --- for (const round of roundResults) { const roundLabel = round.round === 0 ? 'Baseline' : `Loop ${round.round}` @@ -295,6 +364,12 @@ export function saveSummary( push('') } + if (task.agentDocSuggestions.length > 0) { + push('**Coding agent doc suggestions:**') + for (const s of task.agentDocSuggestions) push(`- [P${s.priority}] ${s.text}`) + push('') + } + // Project suggestions const projSuggestions = task.judging.projectSuggestions if (projSuggestions && projSuggestions.length > 0) { @@ -303,16 +378,22 @@ export function saveSummary( push('') } + if (task.agentProjectSuggestions.length > 0) { + push('**Coding agent project suggestions:**') + for (const s of task.agentProjectSuggestions) push(`- [P${s.priority}] ${s.text}`) + push('') + } + push(`**Cost:** $${task.costEstimate.toFixed(2)}`, '') } - // Judge suggestions file for non-baseline rounds + // Doc gate summary for non-baseline rounds if (round.round > 0) { const suggestionsFile = path.join(logDir, `judge-suggestions-loop-${round.round}.txt`) if (fs.existsSync(suggestionsFile)) { const suggestionsText = fs.readFileSync(suggestionsFile, 'utf-8') if (suggestionsText.trim()) { - push(`### Judge Suggestions 
Applied (Loop ${round.round})`, '') + push(`### Doc Gate Summary (Loop ${round.round})`, '') push('```') push(suggestionsText) push('```', '') @@ -335,6 +416,28 @@ export function saveSummary( } } + if (loopDocGateResults.length > 0) { + push('## Per-Candidate Doc Gates', '') + for (const loopResult of loopDocGateResults) { + push(`### Loop ${loopResult.loop}`, '') + for (const feature of loopResult.features) { + if (feature.candidates.length === 0) continue + push(`#### ${feature.featureId}`, '') + for (const candidate of feature.candidates) { + const scores = [ + `base ${candidate.baseScore.toFixed(1)}`, + candidate.rejudgeScore !== undefined ? `rejudge ${candidate.rejudgeScore.toFixed(1)}` : null, + candidate.rerunScore !== undefined ? `rerun ${candidate.rerunScore.toFixed(1)}` : null, + ].filter(Boolean).join(' -> ') + push(`- [${candidate.accepted ? 'accepted' : 'rejected'}] [${candidate.source}] [P${candidate.priority}] ${candidate.text}`) + push(` ${scores}${candidate.gateDelta !== undefined ? ` | gate ${candidate.gateDelta >= 0 ? '+' : ''}${candidate.gateDelta.toFixed(1)}` : ''}`) + push(` ${candidate.reason}`) + } + push('') + } + } + } + // --- Project improvement prompts --- if (projectPrompts.length > 0) { push('## Project Improvement Prompts', '') diff --git a/src/run-evalbuff.ts b/src/run-evalbuff.ts index c31994a..9705250 100644 --- a/src/run-evalbuff.ts +++ b/src/run-evalbuff.ts @@ -4,116 +4,208 @@ * Pipeline: * 1. Plan features to carve (GPT-5.4 via Codex SDK) * 2. Carve a random subset of n features - * 3. Baseline: rebuild each in parallel (Claude Code), judge (Codex), get scores + doc suggestions + * 3. Baseline: rebuild each feature sequentially (Claude Code), judge (Codex), get scores + suggestions * 4. Loop N times: - * a. Docs refactor agent reads judge suggestions and edits all docs holistically - * b. Re-eval: rebuild in parallel, judge, get new scores + doc suggestions + * a. Re-evaluate each feature sequentially + * b. 
Draft each suggested docs change independently + * c. Gate each docs change on the feature that inspired it before accepting it * * Usage: - * bun run src/run-evalbuff.ts --repo /path/to/repo [--n 5] [--parallelism 10] [--loops 3] [--init-command "npm install"] + * bun run src/run-evalbuff.ts --repo /path/to/repo [--n 5] [--parallelism 1] [--loops 1] [--init-command "npm install"] */ import fs from 'fs' import os from 'os' import path from 'path' import { planFeatures, carveFeature } from './carve-features' -import { collectDocSuggestions, collectProjectSuggestions, runDocsWriterAgent, runPromptWriterAgent } from './docs-writer' +import { + acceptDraftedDocsChange, + cleanupDraftedDocsChange, + cleanupPlannedDocsTaskResult, + collectProjectSuggestions, + collectTaskDocSuggestions, + DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR, + filterDocSuggestionsForPlanning, + materializeDocsChangeFromPatch, + planDocsChangesForTask, + runPromptWriterAgent, +} from './docs-writer' import { selectRandom, getGroundTruthDiff, getDocsSnapshot, computeDocsDiffText } from './eval-helpers' -import { runAgentOnCarve, rejudgeBaselineWithCurrentDocs } from './eval-runner' +import { runAgentOnCarve, rejudgeBaselineWithCurrentDocs, rejudgeTaskWithCurrentDocs } from './eval-runner' import { startSpinner, updateSpinner, stopSpinner, printHeader, printRoundScores, printBaselineRejudge, printScoreTable, printProjectPrompts, printFinalSummary, } from './log' -import { saveRoundResults, saveBaselineRejudgeResults, saveSummary } from './report' +import { saveRoundResults, saveBaselineRejudgeResults, saveLoopDocGateResults, saveSummary } from './report' import { events } from './tui/events' import type { CarvedFeature } from './carve-features' import type { TaskResult } from './eval-runner' -import type { RoundResult, EvalSummary } from './report' +import type { + RoundResult, + EvalSummary, + DocChangeGateCandidateResult, + FeatureDocGateResult, + LoopDocGateResult, +} from './report' // --- Types --- 
export interface EvalbuffOptions { repoPath: string n: number // number of features to randomly select - parallelism: number // parallel agent runs per eval round - loops: number // number of improvement loops (default 3) + parallelism: number // retained for carving/setup concurrency; eval loops run sequentially + loops: number // number of improvement loops (default 1) initCommand?: string codingModel: string // model for coding agents (default: sonnet) docsModel: string // model for docs agents (default: opus) cachedFeatures?: string // path to a features.json from a previous run } +const DOC_CHANGE_ACCEPTANCE_THRESHOLD = 0.5 +const DOC_CHANGE_FAST_ACCEPT_THRESHOLD = DOC_CHANGE_ACCEPTANCE_THRESHOLD * 2 + +export function evaluateDocChangeGate(args: { + baseScore: number + rejudgeScore: number + rerunScore?: number + threshold?: number + fastAcceptThreshold?: number +}): { + accepted: boolean + fastAccepted: boolean + status: 'accepted' | 'accepted_fast_rejudge' | 'rejected' + gateDelta: number + reason: string +} { + const threshold = args.threshold ?? DOC_CHANGE_ACCEPTANCE_THRESHOLD + const fastAcceptThreshold = args.fastAcceptThreshold ?? DOC_CHANGE_FAST_ACCEPT_THRESHOLD + const rejudgeDrop = args.baseScore - args.rejudgeScore + + if (rejudgeDrop >= fastAcceptThreshold) { + return { + accepted: true, + fastAccepted: true, + status: 'accepted_fast_rejudge', + gateDelta: rejudgeDrop, + reason: `Accepted without rerun because rejudge dropped by ${rejudgeDrop.toFixed(1)}.`, + } + } + + const gateDelta = (args.rerunScore ?? Number.NEGATIVE_INFINITY) - args.rejudgeScore + if ((args.rerunScore ?? 
Number.NEGATIVE_INFINITY) - args.rejudgeScore >= threshold) { + return { + accepted: true, + fastAccepted: false, + status: 'accepted', + gateDelta, + reason: `Accepted because rerun minus rejudge was ${gateDelta.toFixed(1)}.`, + } + } + + return { + accepted: false, + fastAccepted: false, + status: 'rejected', + gateDelta, + reason: `Rejected because rerun minus rejudge was ${gateDelta.toFixed(1)}.`, + } +} + // --- Eval round --- -async function runEvalRound( +type EvalRoundDeps = { + runAgentOnCarve: typeof runAgentOnCarve + events: typeof events + startSpinner: typeof startSpinner + updateSpinner: typeof updateSpinner + stopSpinner: typeof stopSpinner + printRoundScores: typeof printRoundScores +} + +const defaultEvalRoundDeps: EvalRoundDeps = { + runAgentOnCarve, + events, + startSpinner, + updateSpinner, + stopSpinner, + printRoundScores, +} + +export async function runEvalRound( features: CarvedFeature[], groundTruthDiffs: Map, opts: EvalbuffOptions, round: number, baselineAvg?: number, + afterTask?: (args: { + feature: CarvedFeature + task: TaskResult + index: number + }) => Promise, + deps: EvalRoundDeps = defaultEvalRoundDeps, ): Promise { const label = round === 0 ? 
'Baseline' : `Round ${round}` - let completed = 0 - - startSpinner(`${label}: evaluating 0/${features.length} features...`) - - // Run features with bounded concurrency const results: TaskResult[] = [] - const queue = features.map((feature, i) => ({ feature, i })) - let next = 0 - async function worker(): Promise { - while (next < queue.length) { - const { feature, i } = queue[next++] - try { - events.send({ type: 'feature_status', featureId: feature.id, status: 'agent_running' }) - const result = await runAgentOnCarve({ - idx: i, - total: features.length, - repoPath: opts.repoPath, - feature, - initCommand: opts.initCommand, - model: opts.codingModel, - groundTruthDiff: groundTruthDiffs.get(feature.id) || '', - docsSourcePath: opts.repoPath, - }) - results[i] = result - events.send({ type: 'feature_status', featureId: feature.id, status: 'scored', score: result.score, cost: result.costEstimate }) - } catch (error) { - const msg = error instanceof Error ? error.message : String(error) - results[i] = { - featureId: feature.id, - prompt: feature.prompt, - score: -1, - diff: '', - trace: `Agent error: ${msg}`, - judging: { - analysis: `Agent failed: ${msg.slice(0, 500)}`, - strengths: [], - weaknesses: ['Agent failed due to infrastructure error'], - e2eTestsPerformed: [], - completionScore: -1, - codeQualityScore: -1, - e2eScore: -1, - overallScore: -1, - }, - costEstimate: 0, - docsRead: [], + deps.startSpinner(`${label}: evaluating 0/${features.length} features...`) + + for (let i = 0; i < features.length; i++) { + const feature = features[i] + try { + deps.events.send({ type: 'feature_status', featureId: feature.id, status: 'agent_running' }) + const result = await deps.runAgentOnCarve({ + idx: i, + total: features.length, + repoPath: opts.repoPath, + feature, + initCommand: opts.initCommand, + model: opts.codingModel, + groundTruthDiff: groundTruthDiffs.get(feature.id) || '', + docsSourcePath: opts.repoPath, + }) + let additionalCost = 0 + if (afterTask) { + try { 
+ additionalCost = (await afterTask({ feature, task: result, index: i })) ?? 0 + } catch (afterTaskError) { + const msg = afterTaskError instanceof Error ? afterTaskError.message : String(afterTaskError) + deps.events.log(`Docs gating failed for ${feature.id}: ${msg}`, 'error') } - events.send({ type: 'feature_status', featureId: feature.id, status: 'eval_failed', detail: msg.slice(0, 200) }) } - completed++ - updateSpinner(`${label}: ${completed}/${features.length} features evaluated`) + result.costEstimate += additionalCost + results[i] = result + deps.events.send({ type: 'feature_status', featureId: feature.id, status: 'scored', score: result.score, cost: result.costEstimate }) + } catch (error) { + const msg = error instanceof Error ? error.message : String(error) + results[i] = { + featureId: feature.id, + prompt: feature.prompt, + score: -1, + diff: '', + trace: `Agent error: ${msg}`, + judging: { + analysis: `Agent failed: ${msg.slice(0, 500)}`, + strengths: [], + weaknesses: ['Agent failed due to infrastructure error'], + e2eTestsPerformed: [], + completionScore: -1, + codeQualityScore: -1, + e2eScore: -1, + overallScore: -1, + }, + costEstimate: 0, + docsRead: [], + agentDocSuggestions: [], + agentProjectSuggestions: [], + } + deps.events.send({ type: 'feature_status', featureId: feature.id, status: 'eval_failed', detail: msg.slice(0, 200) }) } + deps.updateSpinner(`${label}: ${i + 1}/${features.length} features evaluated`) } - await Promise.all( - Array.from({ length: Math.min(opts.parallelism, features.length) }, () => worker()), - ) - - stopSpinner() + deps.stopSpinner() const valid = results.filter((r) => r.score >= 0) const avgScore = valid.length > 0 @@ -121,9 +213,9 @@ async function runEvalRound( : 0 const totalCost = results.reduce((a, r) => a + r.costEstimate, 0) - printRoundScores(label, results, avgScore, totalCost, baselineAvg) + deps.printRoundScores(label, results, avgScore, totalCost, baselineAvg) - events.send({ + deps.events.send({ type: 
'round_complete', round, avgScore, @@ -223,6 +315,383 @@ async function runBaselineRejudgeRound( return { round: loop, tasks: results, avgScore, totalCost: 0 } } +function renderLoopDocGateSummary(loopResult: LoopDocGateResult): string { + const lines: string[] = [] + + for (const feature of loopResult.features) { + if (feature.candidates.length === 0) continue + lines.push(`### ${feature.featureId} (base score: ${feature.baseScore.toFixed(1)}/10)`) + for (const candidate of feature.candidates) { + const scores = [ + `base ${candidate.baseScore.toFixed(1)}`, + candidate.rejudgeScore !== undefined ? `rejudge ${candidate.rejudgeScore.toFixed(1)}` : null, + candidate.rerunScore !== undefined ? `rerun ${candidate.rerunScore.toFixed(1)}` : null, + ].filter(Boolean).join(' -> ') + const gateDelta = candidate.gateDelta !== undefined + ? ` gate ${candidate.gateDelta >= 0 ? '+' : ''}${candidate.gateDelta.toFixed(1)}` + : '' + lines.push( + `- [${candidate.status}] [${candidate.source}] [priority ${candidate.priority}] ${candidate.text}`, + ) + lines.push(` ${scores}${gateDelta}`) + lines.push(` ${candidate.reason}`) + } + lines.push('') + } + + return lines.join('\n').trim() +} + +function countLoopDocChanges(loopResult: LoopDocGateResult): { + considered: number + accepted: number +} { + let considered = 0 + let accepted = 0 + for (const feature of loopResult.features) { + for (const candidate of feature.candidates) { + considered++ + if (candidate.accepted) accepted++ + } + } + return { considered, accepted } +} + +type GateDocsChangesDeps = { + collectTaskDocSuggestions: typeof collectTaskDocSuggestions + filterDocSuggestionsForPlanning: typeof filterDocSuggestionsForPlanning + planDocsChangesForTask: typeof planDocsChangesForTask + materializeDocsChangeFromPatch: typeof materializeDocsChangeFromPatch + cleanupDraftedDocsChange: typeof cleanupDraftedDocsChange + acceptDraftedDocsChange: typeof acceptDraftedDocsChange + cleanupPlannedDocsTaskResult: typeof 
cleanupPlannedDocsTaskResult + rejudgeTaskWithCurrentDocs: typeof rejudgeTaskWithCurrentDocs + runAgentOnCarve: typeof runAgentOnCarve + events: typeof events +} + +const defaultGateDocsChangesDeps: GateDocsChangesDeps = { + collectTaskDocSuggestions, + filterDocSuggestionsForPlanning, + planDocsChangesForTask, + materializeDocsChangeFromPatch, + cleanupDraftedDocsChange, + acceptDraftedDocsChange, + cleanupPlannedDocsTaskResult, + rejudgeTaskWithCurrentDocs, + runAgentOnCarve, + events, +} + +export async function gateDocsChangesForTask(args: { + feature: CarvedFeature + task: TaskResult + opts: EvalbuffOptions + groundTruthDiffs: Map + loop: number +}, deps: GateDocsChangesDeps = defaultGateDocsChangesDeps): Promise<{ + result: FeatureDocGateResult + validationCost: number +}> { + const allSuggestions = deps.collectTaskDocSuggestions(args.task) + const suggestions = deps.filterDocSuggestionsForPlanning(allSuggestions) + const gatedCandidates: DocChangeGateCandidateResult[] = [] + let validationCost = 0 + let enteredDocsWriterPhase = false + + if (allSuggestions.length === 0 || args.task.score < 0) { + return { + result: { + featureId: args.feature.id, + baseScore: args.task.score, + candidates: [], + }, + validationCost, + } + } + + deps.events.send({ + type: 'phase_change', + phase: 'docs_writer', + round: args.loop, + loop: args.loop, + detail: `${args.feature.id}: ${allSuggestions.length} candidate doc changes`, + }) + deps.events.send({ + type: 'docs_writer', + action: 'start', + loop: args.loop, + suggestionCount: allSuggestions.length, + }) + enteredDocsWriterPhase = true + let plan: Awaited> | null = null + try { + for (const skipped of allSuggestions.filter((suggestion) => suggestion.priority < DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR)) { + gatedCandidates.push({ + source: skipped.source, + priority: skipped.priority, + text: skipped.text, + accepted: false, + fastAccepted: false, + status: 'skipped_low_priority', + reason: `Skipped before docs writing 
because priority ${skipped.priority} is below ${DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR}.`, + baseScore: args.task.score, + docsDiff: '', + }) + } + + if (suggestions.length === 0) { + return { + result: { + featureId: args.feature.id, + baseScore: args.task.score, + candidates: gatedCandidates, + }, + validationCost, + } + } + + plan = await deps.planDocsChangesForTask( + args.opts.repoPath, + suggestions, + args.opts.docsModel, + DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR, + ) + if (!plan) { + for (const suggestion of suggestions) { + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_writer_failed', + reason: 'Docs writer failed to produce a candidate plan.', + baseScore: args.task.score, + docsDiff: '', + }) + } + return { + result: { + featureId: args.feature.id, + baseScore: args.task.score, + candidates: gatedCandidates, + }, + validationCost, + } + } + + for (const suggestion of plan.candidates) { + if (!suggestion.accepted) { + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: suggestion.overfit ? 
'rejected_overfit' : 'rejected', + reason: suggestion.reason, + baseScore: args.task.score, + docsDiff: suggestion.diffText || '', + }) + continue + } + + if (!suggestion.patchText) { + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_no_change', + reason: `Rejected because the planned docs change had no reusable patch: ${suggestion.reason}`, + baseScore: args.task.score, + docsDiff: suggestion.diffText || '', + }) + continue + } + + const draft = deps.materializeDocsChangeFromPatch(args.opts.repoPath, suggestion.patchText) + if (!draft) { + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_writer_failed', + reason: `Failed to materialize docs change: ${suggestion.reason}`, + baseScore: args.task.score, + docsDiff: suggestion.diffText || '', + }) + continue + } + + if (!draft.diffText.trim()) { + deps.cleanupDraftedDocsChange(draft) + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_no_change', + reason: 'The planned docs change produced no effective diff when applied to the current docs.', + baseScore: args.task.score, + docsDiff: suggestion.diffText || '', + }) + continue + } + + let rejudgeScore: number | undefined + try { + const rejudged = await deps.rejudgeTaskWithCurrentDocs({ + idx: 0, + total: 1, + repoPath: args.opts.repoPath, + feature: args.feature, + agentDiff: args.task.diff, + groundTruthDiff: args.groundTruthDiffs.get(args.feature.id) || '', + initCommand: args.opts.initCommand, + docsSourcePath: draft.repoDir, + }) + rejudgeScore = rejudged.overallScore + } catch (error) { + deps.cleanupDraftedDocsChange(draft) + const msg = error instanceof Error ? 
error.message : String(error) + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_rejudge_failed', + reason: `Rejudge failed: ${msg.slice(0, 200)}`, + baseScore: args.task.score, + docsDiff: draft.diffText, + }) + continue + } + + if (rejudgeScore === undefined) { + deps.cleanupDraftedDocsChange(draft) + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: 'rejected_rejudge_failed', + reason: 'Rejudge did not produce a score.', + baseScore: args.task.score, + docsDiff: draft.diffText, + }) + continue + } + + const fastDecision = evaluateDocChangeGate({ + baseScore: args.task.score, + rejudgeScore, + }) + if (fastDecision.accepted && fastDecision.fastAccepted) { + deps.acceptDraftedDocsChange(args.opts.repoPath, draft) + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: fastDecision.accepted, + fastAccepted: fastDecision.fastAccepted, + status: fastDecision.status, + reason: `${suggestion.reason} ${fastDecision.reason}`.trim(), + baseScore: args.task.score, + rejudgeScore, + gateDelta: fastDecision.gateDelta, + docsDiff: draft.diffText, + }) + continue + } + + const rerunTask = await deps.runAgentOnCarve({ + idx: 0, + total: 1, + repoPath: args.opts.repoPath, + feature: args.feature, + initCommand: args.opts.initCommand, + model: args.opts.codingModel, + groundTruthDiff: args.groundTruthDiffs.get(args.feature.id) || '', + docsSourcePath: draft.repoDir, + }) + validationCost += rerunTask.costEstimate + + const decision = evaluateDocChangeGate({ + baseScore: args.task.score, + rejudgeScore, + rerunScore: rerunTask.score, + }) + if (rerunTask.score >= 0 && decision.accepted) { + deps.acceptDraftedDocsChange(args.opts.repoPath, draft) + gatedCandidates.push({ + source: 
suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: decision.accepted, + fastAccepted: decision.fastAccepted, + status: decision.status, + reason: `${suggestion.reason} ${decision.reason}`.trim(), + baseScore: args.task.score, + rejudgeScore, + rerunScore: rerunTask.score, + gateDelta: decision.gateDelta, + docsDiff: draft.diffText, + }) + continue + } + + deps.cleanupDraftedDocsChange(draft) + gatedCandidates.push({ + source: suggestion.source, + priority: suggestion.priority, + text: suggestion.text, + accepted: false, + fastAccepted: false, + status: rerunTask.score < 0 ? 'rejected_rerun_failed' : 'rejected', + reason: rerunTask.score < 0 + ? `Rejected because the validation rerun failed. ${suggestion.reason}`.trim() + : `${suggestion.reason} ${decision.reason}`.trim(), + baseScore: args.task.score, + rejudgeScore, + rerunScore: rerunTask.score, + gateDelta: decision.gateDelta, + docsDiff: draft.diffText, + }) + } + + return { + result: { + featureId: args.feature.id, + baseScore: args.task.score, + candidates: gatedCandidates, + }, + validationCost, + } + } finally { + if (plan) { + deps.cleanupPlannedDocsTaskResult(plan) + } + if (enteredDocsWriterPhase) { + deps.events.send({ type: 'docs_writer', action: 'complete', loop: args.loop }) + deps.events.send({ + type: 'phase_change', + phase: 'evaluating', + round: args.loop, + loop: args.loop, + detail: 'Re-eval with updated docs', + }) + } + } +} + // --- Main orchestrator --- export async function runEvalbuff(opts: EvalbuffOptions): Promise { @@ -335,7 +804,7 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { let totalCost = baseline.totalCost const roundResults: RoundResult[] = [baseline] const baselineRejudgeResults: RoundResult[] = [] - let previousResults = baseline + const loopDocGateResults: LoopDocGateResult[] = [] const allProjectSuggestionSections: string[] = [] // Collect project suggestions from baseline @@ -346,40 +815,51 @@ export async 
function runEvalbuff(opts: EvalbuffOptions): Promise { for (let loop = 1; loop <= opts.loops; loop++) { console.log(`\n\x1b[1mLoop ${loop}/${opts.loops}\x1b[0m`) - // Docs writer - const validTasks = previousResults.tasks.filter((t) => t.score >= 0) - const judgeSuggestions = collectDocSuggestions(validTasks) - const suggestionCount = judgeSuggestions.split('\n').filter(l => l.startsWith('-')).length - - events.send({ type: 'phase_change', phase: 'docs_writer', loop }) - events.send({ type: 'docs_writer', action: 'start', loop, suggestionCount }) - const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) - fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${loop}.txt`), judgeSuggestions) + events.send({ type: 'phase_change', phase: 'evaluating', round: loop, loop, detail: 'Re-eval with updated docs' }) + const featureGateResults: FeatureDocGateResult[] = [] + const results = await runEvalRound( + features, + groundTruthDiffs, + opts, + loop, + baseline.avgScore, + async ({ feature, task }) => { + const gated = await gateDocsChangesForTask({ + feature, + task, + opts, + groundTruthDiffs, + loop, + }) + featureGateResults.push(gated.result) + return gated.validationCost + }, + ) - startSpinner(`Docs writer: processing ${suggestionCount} suggestions...`) - await runDocsWriterAgent(opts.repoPath, judgeSuggestions, opts.docsModel) - events.send({ type: 'docs_writer', action: 'complete', loop }) - stopSpinner(` Docs writer: applied ${suggestionCount} suggestions`) + totalCost += results.totalCost + roundResults.push(results) + + const loopDocGateResult: LoopDocGateResult = { + loop, + threshold: DOC_CHANGE_ACCEPTANCE_THRESHOLD, + fastAcceptThreshold: DOC_CHANGE_FAST_ACCEPT_THRESHOLD, + features: featureGateResults, + } + loopDocGateResults.push(loopDocGateResult) - // Save docs state and diff const docsAfterRefactor = getDocsSnapshot(opts.repoPath) const docsDiffText = computeDocsDiffText(docsSnapshotBefore, docsAfterRefactor) + const loopSummaryText = 
renderLoopDocGateSummary(loopDocGateResult) + fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${loop}.txt`), loopSummaryText) fs.writeFileSync(path.join(logDir, `docs-diff-loop-${loop}.txt`), docsDiffText) fs.writeFileSync( path.join(logDir, `docs-state-loop-${loop}.json`), JSON.stringify(docsAfterRefactor, null, 2), ) - - // Re-eval with updated docs - events.send({ type: 'phase_change', phase: 'evaluating', round: loop, loop, detail: 'Re-eval with updated docs' }) - const results = await runEvalRound(features, groundTruthDiffs, opts, loop, baseline.avgScore) + saveLoopDocGateResults(logDir, loopDocGateResult) saveRoundResults(logDir, results) - totalCost += results.totalCost - roundResults.push(results) - previousResults = results - // Re-judge baseline const rejudged = await runBaselineRejudgeRound(baseline, features, groundTruthDiffs, opts, loop) saveBaselineRejudgeResults(logDir, rejudged) @@ -420,10 +900,12 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { totalCost, scoreProgression: roundResults.map((r) => r.avgScore), baselineRejudgeProgression: baselineRejudgeResults.map((r) => r.avgScore), + consideredDocChangesByLoop: loopDocGateResults.map((result) => countLoopDocChanges(result).considered), + acceptedDocChangesByLoop: loopDocGateResults.map((result) => countLoopDocChanges(result).accepted), projectPrompts, } - saveSummary(logDir, summary, roundResults, opts, baselineRejudgeResults, projectPrompts) + saveSummary(logDir, summary, roundResults, opts, baselineRejudgeResults, loopDocGateResults, projectPrompts) events.send({ type: 'run_complete', @@ -468,8 +950,8 @@ if (import.meta.main) { const repoPath = getArg('repo') const n = parseInt(getArg('n', '20')) - const parallelism = parseInt(getArg('parallelism', '10')) - const loops = parseInt(getArg('loops', '3')) + const parallelism = parseInt(getArg('parallelism', '1')) + const loops = parseInt(getArg('loops', '1')) const initCommand = hasArg('init-command') ? 
getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') diff --git a/src/tui/app.tsx b/src/tui/app.tsx index 85b6a06..a63999c 100644 --- a/src/tui/app.tsx +++ b/src/tui/app.tsx @@ -178,7 +178,7 @@ function DashboardView({ state, cursor, onSelect }: { onSelect: (id: string) => void }) { const pLabel = phaseLabel(state.phase, state.round, state.loop) - const pct = Math.round(phaseProgress(state.phase, state.round, state.loops || 3) * 100) + const pct = Math.round(phaseProgress(state.phase, state.round, state.loops || 1) * 100) const repoName = state.repoPath.split('/').pop() || state.repoPath return ( diff --git a/src/tui/data.ts b/src/tui/data.ts index 5267d2e..a3f0d5c 100644 --- a/src/tui/data.ts +++ b/src/tui/data.ts @@ -69,6 +69,7 @@ export interface LoopData { loop: number judgeSuggestions: string docsDiff: string + docGates: unknown | null } export interface RunSummary { @@ -165,6 +166,7 @@ export function loadLogDir(logDir: string): LogDirData { loop: l, judgeSuggestions: readTextSafe(suggestionsPath), docsDiff: readTextSafe(path.join(logDir, `docs-diff-loop-${l}.txt`)), + docGates: readJsonSafe(path.join(logDir, `doc-gates-loop-${l}.json`)), }) } diff --git a/src/tui/main.tsx b/src/tui/main.tsx index 90e0fa9..068008b 100644 --- a/src/tui/main.tsx +++ b/src/tui/main.tsx @@ -369,8 +369,8 @@ async function main() { const repoPath = getArg('repo') const n = parseInt(getArg('n', '5')) - const parallelism = parseInt(getArg('parallelism', '10')) - const loops = parseInt(getArg('loops', '3')) + const parallelism = parseInt(getArg('parallelism', '1')) + const loops = parseInt(getArg('loops', '1')) const initCommand = hasArg('init-command') ? 
getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') From 99cd6b5b3419950984f2850750e405eb6ab81a5c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Wed, 8 Apr 2026 19:39:55 -0700 Subject: [PATCH 3/9] Remove parallelism & loops params --- docs/architecture.md | 10 +- docs/cli.md | 6 +- docs/run-artifacts.md | 2 +- src/__tests__/run-evalbuff.e2e.test.ts | 4 +- src/__tests__/run-evalbuff.test.ts | 4 - src/cli.ts | 8 +- src/log.ts | 3 +- src/report.ts | 5 +- src/run-evalbuff.ts | 226 +++++++++++-------------- src/test-repo-utils.ts | 2 +- src/tui/app.tsx | 25 ++- src/tui/events.ts | 2 - src/tui/main.tsx | 42 +---- 13 files changed, 135 insertions(+), 204 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 08c647e..e82eb97 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -2,13 +2,13 @@ ## Pipeline Overview -Evalbuff follows a plan → carve → baseline → gated-improvement loop: +Evalbuff follows a plan → carve → baseline → gated improvement round: 1. **Plan** — `planFeatures()` in `src/carve-features.ts` uses a Codex agent to scan the target repo and identify 15–25 discrete features that can be cleanly removed. 2. **Carve** — `carveFeature()` creates an isolated git worktree, runs a Codex agent to remove the feature, and captures the resulting diff and file operations. 3. **Baseline** — `runAgentOnCarve()` in `src/eval-runner.ts` clones the repo, applies the carve, copies current docs, runs a coding agent to rebuild the feature, then hands the result to `judgeTaskResult()` in `src/judge.ts`. -4. **Gate docs changes** — during each improvement loop, every feature is re-run sequentially. The judge and coding agent both suggest independent docs changes. `planDocsChangesForTask()` in `src/docs-writer.ts` reads the docs once, rejects overfit/low-value suggestions, and creates one independent committed docs candidate per surviving suggestion. 
Evalbuff then materializes each candidate patch onto the current docs state, re-judges the originating task, and optionally re-runs the coding agent before accepting it. -5. **Repeat** — Step 4 loops N times. Each completed loop also re-judges the baseline diffs with the final loop docs to separate judge recalibration from real agent improvement. +4. **Gate docs changes** — during the improvement round, every feature is re-run sequentially. The judge and coding agent both suggest independent docs changes. `planDocsChangesForTask()` in `src/docs-writer.ts` reads the docs once, rejects overfit/low-value suggestions, and creates one independent committed docs candidate per surviving suggestion. Evalbuff then materializes each candidate patch onto the current docs state, re-judges the originating task, and optionally re-runs the coding agent before accepting it. +5. **Baseline rejudge** — after the improvement round, Evalbuff re-judges the baseline diffs with the updated docs to separate judge recalibration from real agent improvement. ## Key Modules @@ -40,7 +40,7 @@ Target repo ↓ runAgentOnCarve() → TaskResult (per feature, sequentially) ↓ saveRoundResults() → round-N/ directory ↓ - ↓ For each improvement loop: + ↓ Improvement round: ↓ runEvalRound() → new scores ↓ gateDocsChangesForTask() → per-feature accepted/rejected doc candidates ↓ runBaselineRejudgeRound() → re-scored baseline @@ -95,7 +95,7 @@ When modifying the orchestration (new `EvalbuffOptions` fields, new phases, new ## Concurrency -Carving still uses bounded concurrency (`opts.parallelism` workers pull from a shared queue), but eval rounds are intentionally sequential. That ordering matters because accepted docs changes from one feature should affect the very next feature in the same loop. +Carving uses a fixed internal worker pool in `src/run-evalbuff.ts` to speed up feature extraction, while baseline evaluation, docs gating, and baseline rejudging stay sequential. 
The sequential improvement round still matters because accepted docs changes from one feature should affect the very next feature. ## Events and TUI diff --git a/docs/cli.md b/docs/cli.md index 04967b8..033b573 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -6,15 +6,13 @@ bun run src/run-evalbuff.ts \ --repo /path/to/repo \ [--n 20] \ - [--parallelism 1] \ - [--loops 1] \ [--init-command "npm install"] \ [--coding-model sonnet] \ [--docs-model opus] \ [--cached-features /path/to/features.json] ``` -All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. Improvement loops now run features sequentially and gate docs changes one candidate at a time; `--parallelism` still applies to carving/setup concurrency, not the per-loop feature order. +All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. Evalbuff now always runs a single sequential improvement round after baseline, and carve concurrency is an internal fixed constant rather than a public flag. ## Perfect Feature (Single-Feature Optimizer) @@ -65,7 +63,7 @@ For any new CLI command: 2. **Validate required flags** and print helpful error messages for missing ones. Exit early with usage text rather than failing deep in the pipeline. 3. **Add a `scripts` entry** in `package.json`. 4. **Keep the CLI contract consistent** between the file header usage comment, the flag parser, the options type, and the `package.json` script entry. -5. **Log non-default options** in startup output when they affect behavior (e.g., model overrides, parallelism). +5. **Log non-default options** in startup output when they affect behavior (e.g., model overrides). 6. 
**Thread every flag** through the options type into the runtime path — never parse a flag and ignore it. ### New Command Checklist diff --git a/docs/run-artifacts.md b/docs/run-artifacts.md index cfd0c1d..cf232e4 100644 --- a/docs/run-artifacts.md +++ b/docs/run-artifacts.md @@ -118,7 +118,7 @@ Valid run directories match `evalbuff-run-YYYY-MM-DDTHH-MM-SS`. Scratch director `saveSummary()` in `src/report.ts` writes **both** `summary.json` and `report.md`. Both must be generated together. -The report overview must include: repo path, start time, end time, duration, features carved, improvement loop count, coding model, docs model, and total cost. The "Scores by Round" table must include one column per round plus Average and Cost rows. +The report overview must include: repo path, start time, end time, duration, features carved, improvement round count, coding model, docs model, and total cost. The "Scores by Round" table must include one column per round plus Average and Cost rows. When baseline rejudging is enabled, include both the baseline rejudge trajectory and explicit derived metrics: - **Judge recalibration** = `baselineRejudgeProgression[last] - scoreProgression[0]` — measures how much the judge's scoring changed due to updated docs alone. 
diff --git a/src/__tests__/run-evalbuff.e2e.test.ts b/src/__tests__/run-evalbuff.e2e.test.ts index 71f711b..2c8116a 100644 --- a/src/__tests__/run-evalbuff.e2e.test.ts +++ b/src/__tests__/run-evalbuff.e2e.test.ts @@ -303,12 +303,10 @@ describe('Evalbuff pipeline e2e', () => { fs.readdirSync(tmpDir).filter((entry) => entry.startsWith('evalbuff-run-')), ) - // Use minimal settings: 2 features, 1 parallel run, 1 improvement loop + // Use minimal settings: 2 features and the default single improvement round await runEvalbuff({ repoPath: repoDir, n: 2, - parallelism: 1, - loops: 1, codingModel: 'sonnet', docsModel: 'sonnet', // use sonnet for speed in tests }) diff --git a/src/__tests__/run-evalbuff.test.ts b/src/__tests__/run-evalbuff.test.ts index e014dd7..8f7f3d7 100644 --- a/src/__tests__/run-evalbuff.test.ts +++ b/src/__tests__/run-evalbuff.test.ts @@ -101,8 +101,6 @@ describe('evaluateDocChangeGate', () => { { repoPath: '/tmp/repo', n: 1, - parallelism: 1, - loops: 1, codingModel: 'sonnet', docsModel: 'opus', }, @@ -136,8 +134,6 @@ describe('evaluateDocChangeGate', () => { opts: { repoPath: '/tmp/repo', n: 1, - parallelism: 1, - loops: 1, codingModel: 'sonnet', docsModel: 'opus', }, diff --git a/src/cli.ts b/src/cli.ts index b5269c2..92d87e7 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -3,7 +3,7 @@ * Evalbuff CLI entry point. 
* * Usage: - * evalbuff --repo /path/to/repo [--n 20] [--parallelism 1] [--loops 1] + * evalbuff --repo /path/to/repo [--n 20] * [--init-command "npm install"] [--coding-model sonnet] [--docs-model opus] * [--cached-features /path/to/features.json] */ @@ -27,8 +27,6 @@ if (args.length === 0 || args.includes('--help') || args.includes('-h')) { Options: --repo Path to the target repository (required) --n Number of features to evaluate (default: 20) - --parallelism Max concurrent carve/setup jobs (default: 1) - --loops Number of optimization loops (default: 1) --init-command Command to run before each agent run --coding-model Model for coding agent (default: sonnet) --docs-model Model for docs agent (default: opus) @@ -48,8 +46,6 @@ const hasArg = (name: string): boolean => args.includes(`--${name}`) const repoPath = getArg('repo') const n = parseInt(getArg('n', '20')) -const parallelism = parseInt(getArg('parallelism', '1')) -const loops = parseInt(getArg('loops', '1')) const initCommand = hasArg('init-command') ? getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') @@ -58,8 +54,6 @@ const cachedFeatures = hasArg('cached-features') ? 
getArg('cached-features') : u runEvalbuff({ repoPath, n, - parallelism, - loops, initCommand, codingModel, docsModel, diff --git a/src/log.ts b/src/log.ts index d77dcb8..600b931 100644 --- a/src/log.ts +++ b/src/log.ts @@ -67,14 +67,13 @@ function scoreDelta(before: number, after: number): string { export function printHeader(opts: { repoPath: string n: number - loops: number codingModel: string docsModel: string logDir: string }): void { console.log(`\n\x1b[1mEvalbuff Run\x1b[0m`) console.log(` Repo: ${opts.repoPath}`) - console.log(` Features: ${opts.n} | Loops: ${opts.loops} | Models: ${opts.codingModel}/${opts.docsModel}`) + console.log(` Features: ${opts.n} | Models: ${opts.codingModel}/${opts.docsModel}`) console.log(` Logs: ${opts.logDir}`) } diff --git a/src/report.ts b/src/report.ts index 1bafa1a..25683a3 100644 --- a/src/report.ts +++ b/src/report.ts @@ -75,7 +75,6 @@ export interface EvalSummary { } export interface EvalOptions { - loops: number codingModel: string docsModel: string } @@ -194,7 +193,7 @@ export function saveSummary( `| **End** | ${summary.endTime} |`, `| **Duration** | ${formatDuration(summary.startTime, summary.endTime)} |`, `| **Features carved** | ${summary.featuresCarved} |`, - `| **Improvement loops** | ${opts.loops} |`, + `| **Improvement rounds** | ${Math.max(roundResults.length - 1, 0)} |`, `| **Coding model** | ${opts.codingModel} |`, `| **Docs model** | ${opts.docsModel} |`, `| **Doc gate threshold** | ${loopDocGateResults[0]?.threshold?.toFixed(1) ?? 
'n/a'} |`, @@ -449,7 +448,7 @@ export function saveSummary( } // --- Final docs state --- - const lastLoop = opts.loops + const lastLoop = Math.max(roundResults.length - 1, 0) const finalDocsFile = path.join(logDir, `docs-state-loop-${lastLoop}.json`) if (fs.existsSync(finalDocsFile)) { const finalDocs: Record = JSON.parse(fs.readFileSync(finalDocsFile, 'utf-8')) diff --git a/src/run-evalbuff.ts b/src/run-evalbuff.ts index 9705250..4ef9d75 100644 --- a/src/run-evalbuff.ts +++ b/src/run-evalbuff.ts @@ -5,13 +5,13 @@ * 1. Plan features to carve (GPT-5.4 via Codex SDK) * 2. Carve a random subset of n features * 3. Baseline: rebuild each feature sequentially (Claude Code), judge (Codex), get scores + suggestions - * 4. Loop N times: + * 4. Improvement round: * a. Re-evaluate each feature sequentially * b. Draft each suggested docs change independently * c. Gate each docs change on the feature that inspired it before accepting it * * Usage: - * bun run src/run-evalbuff.ts --repo /path/to/repo [--n 5] [--parallelism 1] [--loops 1] [--init-command "npm install"] + * bun run src/run-evalbuff.ts --repo /path/to/repo [--n 5] [--init-command "npm install"] */ import fs from 'fs' import os from 'os' @@ -55,8 +55,6 @@ import type { export interface EvalbuffOptions { repoPath: string n: number // number of features to randomly select - parallelism: number // retained for carving/setup concurrency; eval loops run sequentially - loops: number // number of improvement loops (default 1) initCommand?: string codingModel: string // model for coding agents (default: sonnet) docsModel: string // model for docs agents (default: opus) @@ -65,6 +63,7 @@ export interface EvalbuffOptions { const DOC_CHANGE_ACCEPTANCE_THRESHOLD = 0.5 const DOC_CHANGE_FAST_ACCEPT_THRESHOLD = DOC_CHANGE_ACCEPTANCE_THRESHOLD * 2 +const CARVE_PARALLELISM = 10 export function evaluateDocChangeGate(args: { baseScore: number @@ -230,8 +229,8 @@ export async function runEvalRound( // // Re-runs the judge on the 
baseline's stored diffs/traces after docs have been // updated. The agent's work is fixed — only the docs given to the judge change. -// This lets us see whether score changes over loops reflect real agent -// improvement or merely judge recalibration from better docs. +// This lets us see whether the improvement round changed real agent behavior +// or merely judge calibration from better docs. async function runBaselineRejudgeRound( baseline: RoundResult, @@ -242,66 +241,57 @@ async function runBaselineRejudgeRound( ): Promise { let completed = 0 startSpinner(`Baseline rejudge: 0/${baseline.tasks.length} re-scored...`) - const featureById = new Map(features.map(f => [f.id, f])) const results: TaskResult[] = [] - const queue = baseline.tasks.map((baselineTask, i) => ({ baselineTask, i })) - let next = 0 - - async function worker(): Promise { - while (next < queue.length) { - const { baselineTask, i } = queue[next++] - const feature = featureById.get(baselineTask.featureId) - - if (!feature || baselineTask.score < 0) { - results[i] = baselineTask - completed++ - updateSpinner(`Baseline rejudge: ${completed}/${queue.length} re-scored`) - continue - } + for (let i = 0; i < baseline.tasks.length; i++) { + const baselineTask = baseline.tasks[i] + const feature = featureById.get(baselineTask.featureId) - try { - const judging = await rejudgeBaselineWithCurrentDocs({ - idx: i, - total: queue.length, - repoPath: opts.repoPath, - feature, - baselineDiff: baselineTask.diff, - groundTruthDiff: groundTruthDiffs.get(feature.id) || '', - initCommand: opts.initCommand, - docsSourcePath: opts.repoPath, - }) - results[i] = { - ...baselineTask, - score: judging.overallScore, - judging, - costEstimate: 0, - } - } catch (error) { - const msg = error instanceof Error ? 
error.message : String(error) - results[i] = { - ...baselineTask, - score: -1, - judging: { - analysis: `Rejudge failed: ${msg.slice(0, 500)}`, - strengths: [], - weaknesses: ['Rejudge failed'], - e2eTestsPerformed: [], - completionScore: -1, - codeQualityScore: -1, - e2eScore: -1, - overallScore: -1, - }, - } - } + if (!feature || baselineTask.score < 0) { + results[i] = baselineTask completed++ - updateSpinner(`Baseline rejudge: ${completed}/${queue.length} re-scored`) + updateSpinner(`Baseline rejudge: ${completed}/${baseline.tasks.length} re-scored`) + continue } - } - await Promise.all( - Array.from({ length: Math.min(opts.parallelism, queue.length) }, () => worker()), - ) + try { + const judging = await rejudgeBaselineWithCurrentDocs({ + idx: i, + total: baseline.tasks.length, + repoPath: opts.repoPath, + feature, + baselineDiff: baselineTask.diff, + groundTruthDiff: groundTruthDiffs.get(feature.id) || '', + initCommand: opts.initCommand, + docsSourcePath: opts.repoPath, + }) + results[i] = { + ...baselineTask, + score: judging.overallScore, + judging, + costEstimate: 0, + } + } catch (error) { + const msg = error instanceof Error ? 
error.message : String(error) + results[i] = { + ...baselineTask, + score: -1, + judging: { + analysis: `Rejudge failed: ${msg.slice(0, 500)}`, + strengths: [], + weaknesses: ['Rejudge failed'], + e2eTestsPerformed: [], + completionScore: -1, + codeQualityScore: -1, + e2eScore: -1, + overallScore: -1, + }, + } + } + + completed++ + updateSpinner(`Baseline rejudge: ${completed}/${baseline.tasks.length} re-scored`) + } stopSpinner() @@ -704,8 +694,6 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { type: 'run_start', repoPath: opts.repoPath, n: opts.n, - loops: opts.loops, - parallelism: opts.parallelism, codingModel: opts.codingModel, docsModel: opts.docsModel, logDir, @@ -714,7 +702,6 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { printHeader({ repoPath: opts.repoPath, n: opts.n, - loops: opts.loops, codingModel: opts.codingModel, docsModel: opts.docsModel, logDir, @@ -747,10 +734,10 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { { const carveQueue = [...selected] let carveNext = 0 - let carveCompleted = 0 const carveResults: (CarvedFeature | null)[] = new Array(carveQueue.length).fill(null) startSpinner(`Carving 0/${carveQueue.length} features...`) + let carveCompleted = 0 async function carveWorker(): Promise { while (carveNext < carveQueue.length) { @@ -773,7 +760,7 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { } await Promise.all( - Array.from({ length: Math.min(opts.parallelism, carveQueue.length) }, () => carveWorker()), + Array.from({ length: Math.min(CARVE_PARALLELISM, carveQueue.length) }, () => carveWorker()), ) for (const result of carveResults) { if (result) features.push(result) @@ -811,64 +798,61 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { const baselineProjectSuggestions = collectProjectSuggestions(baseline.tasks.filter(t => t.score >= 0)) if (baselineProjectSuggestions) allProjectSuggestionSections.push(`## Baseline 
Round\n\n${baselineProjectSuggestions}`) - // --- Improvement loops --- - for (let loop = 1; loop <= opts.loops; loop++) { - console.log(`\n\x1b[1mLoop ${loop}/${opts.loops}\x1b[0m`) - - const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) - events.send({ type: 'phase_change', phase: 'evaluating', round: loop, loop, detail: 'Re-eval with updated docs' }) - const featureGateResults: FeatureDocGateResult[] = [] - const results = await runEvalRound( - features, - groundTruthDiffs, - opts, - loop, - baseline.avgScore, - async ({ feature, task }) => { - const gated = await gateDocsChangesForTask({ - feature, - task, - opts, - groundTruthDiffs, - loop, - }) - featureGateResults.push(gated.result) - return gated.validationCost - }, - ) + // --- Improvement round --- + const improvementRound = 1 + console.log(`\n\x1b[1mImprovement Round\x1b[0m`) + + const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) + events.send({ type: 'phase_change', phase: 'evaluating', round: improvementRound, loop: improvementRound, detail: 'Re-eval with updated docs' }) + const featureGateResults: FeatureDocGateResult[] = [] + const results = await runEvalRound( + features, + groundTruthDiffs, + opts, + improvementRound, + baseline.avgScore, + async ({ feature, task }) => { + const gated = await gateDocsChangesForTask({ + feature, + task, + opts, + groundTruthDiffs, + loop: improvementRound, + }) + featureGateResults.push(gated.result) + return gated.validationCost + }, + ) - totalCost += results.totalCost - roundResults.push(results) + totalCost += results.totalCost + roundResults.push(results) - const loopDocGateResult: LoopDocGateResult = { - loop, - threshold: DOC_CHANGE_ACCEPTANCE_THRESHOLD, - fastAcceptThreshold: DOC_CHANGE_FAST_ACCEPT_THRESHOLD, - features: featureGateResults, - } - loopDocGateResults.push(loopDocGateResult) - - const docsAfterRefactor = getDocsSnapshot(opts.repoPath) - const docsDiffText = computeDocsDiffText(docsSnapshotBefore, docsAfterRefactor) - const 
loopSummaryText = renderLoopDocGateSummary(loopDocGateResult) - fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${loop}.txt`), loopSummaryText) - fs.writeFileSync(path.join(logDir, `docs-diff-loop-${loop}.txt`), docsDiffText) - fs.writeFileSync( - path.join(logDir, `docs-state-loop-${loop}.json`), - JSON.stringify(docsAfterRefactor, null, 2), - ) - saveLoopDocGateResults(logDir, loopDocGateResult) - saveRoundResults(logDir, results) + const loopDocGateResult: LoopDocGateResult = { + loop: improvementRound, + threshold: DOC_CHANGE_ACCEPTANCE_THRESHOLD, + fastAcceptThreshold: DOC_CHANGE_FAST_ACCEPT_THRESHOLD, + features: featureGateResults, + } + loopDocGateResults.push(loopDocGateResult) + + const docsAfterRefactor = getDocsSnapshot(opts.repoPath) + const docsDiffText = computeDocsDiffText(docsSnapshotBefore, docsAfterRefactor) + const loopSummaryText = renderLoopDocGateSummary(loopDocGateResult) + fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${improvementRound}.txt`), loopSummaryText) + fs.writeFileSync(path.join(logDir, `docs-diff-loop-${improvementRound}.txt`), docsDiffText) + fs.writeFileSync( + path.join(logDir, `docs-state-loop-${improvementRound}.json`), + JSON.stringify(docsAfterRefactor, null, 2), + ) + saveLoopDocGateResults(logDir, loopDocGateResult) + saveRoundResults(logDir, results) - // Re-judge baseline - const rejudged = await runBaselineRejudgeRound(baseline, features, groundTruthDiffs, opts, loop) - saveBaselineRejudgeResults(logDir, rejudged) - baselineRejudgeResults.push(rejudged) + const rejudged = await runBaselineRejudgeRound(baseline, features, groundTruthDiffs, opts, improvementRound) + saveBaselineRejudgeResults(logDir, rejudged) + baselineRejudgeResults.push(rejudged) - // Collect project suggestions - const loopProjectSuggestions = collectProjectSuggestions(results.tasks.filter(t => t.score >= 0)) - if (loopProjectSuggestions) allProjectSuggestionSections.push(`## Loop ${loop}\n\n${loopProjectSuggestions}`) - } + 
const loopProjectSuggestions = collectProjectSuggestions(results.tasks.filter(t => t.score >= 0)) + if (loopProjectSuggestions) allProjectSuggestionSections.push(`## Improvement Round\n\n${loopProjectSuggestions}`) // --- Generate project improvement prompts --- let projectPrompts: string[] = [] @@ -950,8 +934,6 @@ if (import.meta.main) { const repoPath = getArg('repo') const n = parseInt(getArg('n', '20')) - const parallelism = parseInt(getArg('parallelism', '1')) - const loops = parseInt(getArg('loops', '1')) const initCommand = hasArg('init-command') ? getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') @@ -960,8 +942,6 @@ if (import.meta.main) { runEvalbuff({ repoPath, n, - parallelism, - loops, initCommand, codingModel, docsModel, diff --git a/src/test-repo-utils.ts b/src/test-repo-utils.ts index 1d02ddc..1773a00 100644 --- a/src/test-repo-utils.ts +++ b/src/test-repo-utils.ts @@ -11,7 +11,7 @@ import { getErrorObject } from './vendor/error' * * When localRepoPath is provided, uses a local clone (near-instant via hardlinks) * instead of a remote clone (5-30s per clone). This is the single biggest - * speedup in evalbuff — with parallelism=5, saves 10-30 remote clones per commit. + * speedup in evalbuff because the pipeline creates many short-lived clones. 
*/ export const withTestRepo = async ( repoConfig: { diff --git a/src/tui/app.tsx b/src/tui/app.tsx index a63999c..72ccd62 100644 --- a/src/tui/app.tsx +++ b/src/tui/app.tsx @@ -44,7 +44,6 @@ interface RunState { elapsed: string done: boolean n: number - loops: number codingModel: string docsModel: string } @@ -76,7 +75,7 @@ function initialState(): RunState { phaseDetail: 'Initializing...', features: new Map(), featureOrder: [], roundScores: new Map(), totalCost: 0, scoreProgression: [], logs: [], startTime: Date.now(), elapsed: '00:00', done: false, - n: 0, loops: 0, codingModel: '', docsModel: '', + n: 0, codingModel: '', docsModel: '', } } @@ -106,20 +105,19 @@ function phaseLabel(phase: Phase, round: number, loop: number): string { case 'planning': return 'Planning Features' case 'carving': return 'Carving Features' case 'evaluating': - return round === 0 ? 'Baseline Eval (Round 0)' : `Re-eval (Loop ${loop}, Round ${round})` - case 'docs_writer': return `Docs Writer (Loop ${loop})` + return round === 0 ? 'Baseline Eval (Round 0)' : `Improvement Eval (Round ${round})` + case 'docs_writer': return `Docs Writer (Round ${loop})` case 'complete': return 'Complete' } } -function phaseProgress(phase: Phase, round: number, loops: number): number { +function phaseProgress(phase: Phase, round: number): number { if (phase === 'planning') return 0.05 if (phase === 'carving') return 0.10 if (phase === 'complete') return 1.0 if (round === 0) return 0.25 - const loopWeight = 0.65 / (loops || 1) - const loopProgress = (round - 1) * loopWeight + (phase === 'docs_writer' ? 
0 : loopWeight * 0.5) - return 0.35 + loopProgress + if (phase === 'docs_writer') return 0.55 + return 0.80 } function scoreColor(score: number): string { @@ -178,7 +176,7 @@ function DashboardView({ state, cursor, onSelect }: { onSelect: (id: string) => void }) { const pLabel = phaseLabel(state.phase, state.round, state.loop) - const pct = Math.round(phaseProgress(state.phase, state.round, state.loops || 1) * 100) + const pct = Math.round(phaseProgress(state.phase, state.round) * 100) const repoName = state.repoPath.split('/').pop() || state.repoPath return ( @@ -189,7 +187,7 @@ function DashboardView({ state, cursor, onSelect }: { EVALBUFF {' '}{repoName} - {' '}n={state.n} loops={state.loops} {state.codingModel}/{state.docsModel} + {' '}n={state.n} {state.codingModel}/{state.docsModel} {state.elapsed} @@ -528,7 +526,7 @@ function RoundDetailView({ round, state, logData, cursor, onBack }: { {'< '} Round {round} - {round === 0 ? ' (Baseline)' : ` (Loop ${round})`} + {round === 0 ? ' (Baseline)' : ' (Improvement)'} Avg: @@ -598,7 +596,7 @@ function RoundDetailView({ round, state, logData, cursor, onBack }: { Baseline round — no docs changes applied. This is the first evaluation round. Agents are tested against the current docs. - After this round, judge suggestions will be collected and docs will be updated. + After this round, evalbuff runs one gated improvement round with updated docs. 
)} @@ -668,7 +666,7 @@ function SummaryView({ state, logData, onBack }: { --- Config --- Repo: {state.repoPath} Features: {state.featureOrder.length} - Loops: {state.loops} + Rounds: {state.scoreProgression.length} Coding: {state.codingModel} Docs: {state.docsModel} Total cost: ${state.totalCost.toFixed(2)} @@ -985,7 +983,6 @@ export function App({ startView, onLoadRun }: { startView?: View['type']; onLoad next.repoPath = event.repoPath next.logDir = event.logDir next.n = event.n - next.loops = event.loops next.codingModel = event.codingModel next.docsModel = event.docsModel next.startTime = Date.now() diff --git a/src/tui/events.ts b/src/tui/events.ts index d0113b4..7f3e050 100644 --- a/src/tui/events.ts +++ b/src/tui/events.ts @@ -29,8 +29,6 @@ export interface RunStartEvent { type: 'run_start' repoPath: string n: number - loops: number - parallelism: number codingModel: string docsModel: string logDir: string diff --git a/src/tui/main.tsx b/src/tui/main.tsx index 068008b..6bdeb6e 100644 --- a/src/tui/main.tsx +++ b/src/tui/main.tsx @@ -71,8 +71,9 @@ function augmentFromFilesystem(logDir: string, seenEventTypes: Set) { type: 'run_start', repoPath: summary?.repoPath || logDir, n: summary?.featuresCarved || 0, - loops: (summary?.rounds?.length ?? 
1) - 1, - parallelism: 0, codingModel: '?', docsModel: '?', logDir, + codingModel: '?', + docsModel: '?', + logDir, }) } @@ -227,8 +228,6 @@ async function runDemo() { type: 'run_start', repoPath: '/Users/demo/my-project', n: 20, - loops: 2, - parallelism: 3, codingModel: 'sonnet', docsModel: 'opus', logDir: '/tmp/evalbuff-demo', @@ -292,37 +291,12 @@ async function runDemo() { const avgLoop1 = Object.values(loop1Scores).reduce((a, b) => a + b, 0) / featureIds.length events.send({ type: 'round_complete', round: 1, avgScore: avgLoop1, totalCost: 4.68, scores: loop1Scores }) - // Loop 2 - await sleep(500) - events.send({ type: 'phase_change', phase: 'docs_writer', loop: 2 }) - events.send({ type: 'docs_writer', action: 'start', loop: 2, suggestionCount: 8 }) - await sleep(2000) - events.send({ type: 'docs_writer', action: 'complete', loop: 2 }) - - await sleep(300) - events.send({ type: 'phase_change', phase: 'evaluating', round: 2, loop: 2, detail: 'Re-eval with updated docs' }) - - const loop2Scores: Record = {} - for (const id of featureIds) { - events.send({ type: 'feature_status', featureId: id, status: 'agent_running' }) - await sleep(1000 + Math.random() * 1000) - events.send({ type: 'feature_status', featureId: id, status: 'judging' }) - await sleep(500 + Math.random() * 400) - const improvement = 0.3 + Math.random() * 1.0 - const score = Math.min(10, loop1Scores[id] + improvement) - loop2Scores[id] = Math.round(score * 10) / 10 - events.send({ type: 'feature_status', featureId: id, status: 'scored', score: loop2Scores[id], cost: 0.15 + Math.random() * 0.3 }) - } - - const avgLoop2 = Object.values(loop2Scores).reduce((a, b) => a + b, 0) / featureIds.length - events.send({ type: 'round_complete', round: 2, avgScore: avgLoop2, totalCost: 7.02, scores: loop2Scores }) - await sleep(500) events.send({ type: 'run_complete', - scoreProgression: [avgBaseline, avgLoop1, avgLoop2], - totalCost: 7.02, - duration: '3m 45s', + scoreProgression: [avgBaseline, avgLoop1], + 
totalCost: 4.68, + duration: '2m 10s', }) } @@ -369,14 +343,12 @@ async function main() { const repoPath = getArg('repo') const n = parseInt(getArg('n', '5')) - const parallelism = parseInt(getArg('parallelism', '1')) - const loops = parseInt(getArg('loops', '1')) const initCommand = hasArg('init-command') ? getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') const cachedFeatures = hasArg('cached-features') ? getArg('cached-features') : undefined - runEvalbuff({ repoPath, n, parallelism, loops, initCommand, codingModel, docsModel, cachedFeatures }).catch(err => { + runEvalbuff({ repoPath, n, initCommand, codingModel, docsModel, cachedFeatures }).catch(err => { events.log(`Run failed: ${err}`, 'error') }) } else { From 355f094cb0ed34ec000c0c496dbb35c2f39a432e Mon Sep 17 00:00:00 2001 From: James Grugett Date: Wed, 8 Apr 2026 20:19:03 -0700 Subject: [PATCH 4/9] Save run artifacts when running gated doc checks --- docs/run-artifacts.md | 18 +++- src/__tests__/report.test.ts | 109 +++++++++++++++++++++++++ src/__tests__/run-evalbuff.e2e.test.ts | 3 + src/report.ts | 77 +++++++++++++++++ src/run-evalbuff.ts | 73 ++++++++++++++--- 5 files changed, 266 insertions(+), 14 deletions(-) create mode 100644 src/__tests__/report.test.ts diff --git a/docs/run-artifacts.md b/docs/run-artifacts.md index cf232e4..500cf65 100644 --- a/docs/run-artifacts.md +++ b/docs/run-artifacts.md @@ -30,6 +30,21 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ │ ├── judge-suggestions-loop-1.txt # Human-readable summary of accepted/rejected/overfit-skipped doc candidates ├── doc-gates-loop-1.json # Detailed per-candidate gate results for loop 1, including overfit and low-priority rejections +├── doc-candidates-loop-1/ # Per-candidate validation artifacts for every considered docs change +│ └── / +│ └── candidate-01/ +│ ├── metadata.json # Summary row for this candidate (status, scores, reason, docsDiff) +│ ├── 
suggestion.txt # Raw suggestion text +│ ├── docs.patch # Proposed docs patch, when available +│ ├── docs-diff.txt # Docs diff that was tested for this candidate +│ ├── rejudge.json # Full rejudge output for the previous trace with updated docs +│ ├── rerun-trace.txt # Validation rerun trace when a rerun happened +│ ├── rerun-trace.txt.compressed +│ ├── rerun-trace.txt.sidecars/ +│ ├── rerun-diff.txt +│ ├── rerun-judging.json +│ ├── rerun-score.txt +│ └── rerun-agent-suggestions.json ├── docs-diff-loop-1.txt # Before/after diff of docs for loop 1 ├── docs-state-loop-1.json # Snapshot of all docs after loop 1 │ @@ -60,10 +75,11 @@ $TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ ## Loop Artifact Timing -Loop artifacts (`judge-suggestions-loop-N.txt`, `doc-gates-loop-N.json`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json`) are written at the **log-dir root** after the sequential doc-gating pass, **before** the corresponding `round-N/` directory is created by `saveRoundResults()`. This means: +Loop artifacts (`judge-suggestions-loop-N.txt`, `doc-gates-loop-N.json`, `doc-candidates-loop-N/`, `docs-diff-loop-N.txt`, `docs-state-loop-N.json`) are written at the **log-dir root** after the sequential doc-gating pass, **before** the corresponding `round-N/` directory is created by `saveRoundResults()`. This means: - `judge-suggestions-loop-N.txt` should exist for every completed loop, even if it is empty. - `doc-gates-loop-N.json` contains every considered docs candidate for the loop, including accepted/rejected status, overfit/low-priority filtering, and rejudge/rerun scores when applicable. +- `doc-candidates-loop-N/` contains one directory per considered candidate with the tested docs diff, the full rejudge output when available, and the full rerun trace/diff/judging bundle when a rerun happened. - `docs-diff-loop-N.txt` must always exist after the docs-writer step — empty string when nothing changed. 
- `docs-state-loop-N.json` must always exist — contains the `getDocsSnapshot(repoPath)` result after refactoring. diff --git a/src/__tests__/report.test.ts b/src/__tests__/report.test.ts new file mode 100644 index 0000000..cbb59bb --- /dev/null +++ b/src/__tests__/report.test.ts @@ -0,0 +1,109 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, describe, expect, it } from 'bun:test' + +import { saveLoopDocGateArtifacts } from '../report' + +import type { FeatureDocGateArtifacts } from '../report' + +const tempDirs: string[] = [] + +afterEach(() => { + for (const dir of tempDirs.splice(0)) { + fs.rmSync(dir, { recursive: true, force: true }) + } +}) + +describe('saveLoopDocGateArtifacts', () => { + it('persists per-candidate docs diffs, rejudge output, and rerun artifacts', () => { + const logDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-report-test-')) + tempDirs.push(logDir) + + const artifacts: FeatureDocGateArtifacts[] = [ + { + featureId: 'feature-a', + candidates: [ + { + summary: { + source: 'judge', + priority: 80, + text: 'Document env setup for config-sensitive tests', + accepted: true, + fastAccepted: false, + status: 'accepted', + reason: 'Reusable and verified', + baseScore: 6, + rejudgeScore: 5.5, + rerunScore: 6.4, + gateDelta: 0.9, + docsDiff: '--- a/docs/testing.md\n+++ b/docs/testing.md\n+Set APP_MODE=test\n', + }, + docsPatchText: 'diff --git a/docs/testing.md b/docs/testing.md\n--- a/docs/testing.md\n+++ b/docs/testing.md\n@@ -1 +1,2 @@\n # Testing\n+Set APP_MODE=test\n', + rejudgeJudging: { + analysis: 'More discerning with updated docs.', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 5.5, + codeQualityScore: 5.5, + e2eScore: 5.5, + overallScore: 5.5, + }, + rerunTask: { + featureId: 'feature-a', + prompt: 'Restore feature a', + score: 6.4, + diff: 'diff --git a/src/a.ts b/src/a.ts\n', + trace: '{"type":"text","text":"rerun trace"}\n', + judging: { + analysis: 
'Improved rerun.', + strengths: [], + weaknesses: [], + e2eTestsPerformed: [], + completionScore: 6.4, + codeQualityScore: 6.4, + e2eScore: 6.4, + overallScore: 6.4, + docSuggestions: [], + projectSuggestions: [], + }, + costEstimate: 1.25, + docsRead: ['docs/testing.md'], + agentDocSuggestions: [{ text: 'Keep env setup docs', priority: 40 }], + agentProjectSuggestions: [{ text: 'Add config helper', priority: 30 }], + }, + }, + ], + }, + ] + + saveLoopDocGateArtifacts(logDir, 1, artifacts) + + const candidateDir = path.join(logDir, 'doc-candidates-loop-1', 'feature-a', 'candidate-01') + expect(fs.existsSync(candidateDir)).toBe(true) + expect(fs.readFileSync(path.join(candidateDir, 'suggestion.txt'), 'utf-8')).toContain('Document env setup') + expect(fs.readFileSync(path.join(candidateDir, 'docs.patch'), 'utf-8')).toContain('diff --git') + expect(fs.readFileSync(path.join(candidateDir, 'docs-diff.txt'), 'utf-8')).toContain('APP_MODE=test') + + const metadata = JSON.parse(fs.readFileSync(path.join(candidateDir, 'metadata.json'), 'utf-8')) + expect(metadata.rejudgeScore).toBe(5.5) + expect(metadata.rerunScore).toBe(6.4) + + const rejudge = JSON.parse(fs.readFileSync(path.join(candidateDir, 'rejudge.json'), 'utf-8')) + expect(rejudge.overallScore).toBe(5.5) + + expect(fs.readFileSync(path.join(candidateDir, 'rerun-trace.txt'), 'utf-8')).toContain('rerun trace') + expect(fs.readFileSync(path.join(candidateDir, 'rerun-diff.txt'), 'utf-8')).toContain('src/a.ts') + + const rerunJudging = JSON.parse(fs.readFileSync(path.join(candidateDir, 'rerun-judging.json'), 'utf-8')) + expect(rerunJudging.overallScore).toBe(6.4) + + const rerunSuggestions = JSON.parse( + fs.readFileSync(path.join(candidateDir, 'rerun-agent-suggestions.json'), 'utf-8'), + ) + expect(rerunSuggestions.docSuggestions[0]?.text).toBe('Keep env setup docs') + expect(fs.readFileSync(path.join(candidateDir, 'rerun-score.txt'), 'utf-8').trim()).toBe('6.4') + }) +}) diff --git 
a/src/__tests__/run-evalbuff.e2e.test.ts b/src/__tests__/run-evalbuff.e2e.test.ts index 2c8116a..d17f177 100644 --- a/src/__tests__/run-evalbuff.e2e.test.ts +++ b/src/__tests__/run-evalbuff.e2e.test.ts @@ -404,6 +404,9 @@ describe('Evalbuff pipeline e2e', () => { const docGatesPath = path.join(logDir, 'doc-gates-loop-1.json') expect(fs.existsSync(docGatesPath)).toBe(true) + const docCandidatesDir = path.join(logDir, 'doc-candidates-loop-1') + expect(fs.existsSync(docCandidatesDir)).toBe(true) + // --- Verify overall summary --- const summaryPath = path.join(logDir, 'summary.json') expect(fs.existsSync(summaryPath)).toBe(true) diff --git a/src/report.ts b/src/report.ts index 25683a3..2aec6c6 100644 --- a/src/report.ts +++ b/src/report.ts @@ -44,6 +44,18 @@ export interface FeatureDocGateResult { candidates: DocChangeGateCandidateResult[] } +export interface DocChangeGateCandidateArtifacts { + summary: DocChangeGateCandidateResult + docsPatchText?: string + rejudgeJudging?: JudgingResult + rerunTask?: TaskResult +} + +export interface FeatureDocGateArtifacts { + featureId: string + candidates: DocChangeGateCandidateArtifacts[] +} + export interface LoopDocGateResult { loop: number threshold: number @@ -114,6 +126,71 @@ export function saveLoopDocGateResults( ) } +function sanitizePathSegment(value: string): string { + const normalized = value.trim().replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, '') + return normalized || 'item' +} + +export function saveLoopDocGateArtifacts( + logDir: string, + loop: number, + features: FeatureDocGateArtifacts[], +): void { + const rootDir = path.join(logDir, `doc-candidates-loop-${loop}`) + fs.mkdirSync(rootDir, { recursive: true }) + + for (const feature of features) { + const featureDir = path.join(rootDir, sanitizePathSegment(feature.featureId)) + fs.mkdirSync(featureDir, { recursive: true }) + + for (let i = 0; i < feature.candidates.length; i++) { + const candidate = feature.candidates[i] + const candidateDir = 
path.join(featureDir, `candidate-${String(i + 1).padStart(2, '0')}`) + fs.mkdirSync(candidateDir, { recursive: true }) + + fs.writeFileSync(path.join(candidateDir, 'metadata.json'), JSON.stringify(candidate.summary, null, 2)) + fs.writeFileSync(path.join(candidateDir, 'suggestion.txt'), candidate.summary.text + '\n') + + if (candidate.docsPatchText && candidate.docsPatchText.trim()) { + fs.writeFileSync(path.join(candidateDir, 'docs.patch'), candidate.docsPatchText) + } + + if (candidate.summary.docsDiff.trim()) { + fs.writeFileSync(path.join(candidateDir, 'docs-diff.txt'), candidate.summary.docsDiff) + } + + if (candidate.rejudgeJudging) { + fs.writeFileSync( + path.join(candidateDir, 'rejudge.json'), + JSON.stringify(candidate.rejudgeJudging, null, 2), + ) + } + + if (candidate.rerunTask) { + const tracePath = path.join(candidateDir, 'rerun-trace.txt') + fs.writeFileSync(tracePath, candidate.rerunTask.trace) + compressAndSave(tracePath, candidate.rerunTask.trace).catch((err: unknown) => { + console.warn(`[report] Failed to compress rerun trace for ${feature.featureId}: ${err}`) + }) + + fs.writeFileSync(path.join(candidateDir, 'rerun-diff.txt'), candidate.rerunTask.diff) + fs.writeFileSync( + path.join(candidateDir, 'rerun-judging.json'), + JSON.stringify(candidate.rerunTask.judging, null, 2), + ) + fs.writeFileSync(path.join(candidateDir, 'rerun-score.txt'), candidate.rerunTask.score.toString()) + fs.writeFileSync( + path.join(candidateDir, 'rerun-agent-suggestions.json'), + JSON.stringify({ + docSuggestions: candidate.rerunTask.agentDocSuggestions, + projectSuggestions: candidate.rerunTask.agentProjectSuggestions, + }, null, 2), + ) + } + } + } +} + export function saveRoundResults(logDir: string, roundResult: RoundResult): void { const roundDir = path.join(logDir, `round-${roundResult.round}`) fs.mkdirSync(roundDir, { recursive: true }) diff --git a/src/run-evalbuff.ts b/src/run-evalbuff.ts index 4ef9d75..b398d7b 100644 --- a/src/run-evalbuff.ts +++ 
b/src/run-evalbuff.ts @@ -37,7 +37,7 @@ import { printHeader, printRoundScores, printBaselineRejudge, printScoreTable, printProjectPrompts, printFinalSummary, } from './log' -import { saveRoundResults, saveBaselineRejudgeResults, saveLoopDocGateResults, saveSummary } from './report' +import { saveRoundResults, saveBaselineRejudgeResults, saveLoopDocGateArtifacts, saveLoopDocGateResults, saveSummary } from './report' import { events } from './tui/events' import type { CarvedFeature } from './carve-features' @@ -47,6 +47,7 @@ import type { EvalSummary, DocChangeGateCandidateResult, FeatureDocGateResult, + FeatureDocGateArtifacts, LoopDocGateResult, } from './report' @@ -381,14 +382,30 @@ export async function gateDocsChangesForTask(args: { loop: number }, deps: GateDocsChangesDeps = defaultGateDocsChangesDeps): Promise<{ result: FeatureDocGateResult + artifacts: FeatureDocGateArtifacts validationCost: number }> { const allSuggestions = deps.collectTaskDocSuggestions(args.task) const suggestions = deps.filterDocSuggestionsForPlanning(allSuggestions) const gatedCandidates: DocChangeGateCandidateResult[] = [] + const gateArtifacts: FeatureDocGateArtifacts = { + featureId: args.feature.id, + candidates: [], + } let validationCost = 0 let enteredDocsWriterPhase = false + function recordCandidate( + candidate: DocChangeGateCandidateResult, + artifacts: Omit = {}, + ): void { + gatedCandidates.push(candidate) + gateArtifacts.candidates.push({ + summary: candidate, + ...artifacts, + }) + } + if (allSuggestions.length === 0 || args.task.score < 0) { return { result: { @@ -396,6 +413,7 @@ export async function gateDocsChangesForTask(args: { baseScore: args.task.score, candidates: [], }, + artifacts: gateArtifacts, validationCost, } } @@ -417,7 +435,7 @@ export async function gateDocsChangesForTask(args: { let plan: Awaited> | null = null try { for (const skipped of allSuggestions.filter((suggestion) => suggestion.priority < DEFAULT_DOC_SUGGESTION_PRIORITY_FLOOR)) { - 
gatedCandidates.push({ + recordCandidate({ source: skipped.source, priority: skipped.priority, text: skipped.text, @@ -437,6 +455,7 @@ export async function gateDocsChangesForTask(args: { baseScore: args.task.score, candidates: gatedCandidates, }, + artifacts: gateArtifacts, validationCost, } } @@ -449,7 +468,7 @@ export async function gateDocsChangesForTask(args: { ) if (!plan) { for (const suggestion of suggestions) { - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -467,13 +486,14 @@ export async function gateDocsChangesForTask(args: { baseScore: args.task.score, candidates: gatedCandidates, }, + artifacts: gateArtifacts, validationCost, } } for (const suggestion of plan.candidates) { if (!suggestion.accepted) { - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -483,12 +503,14 @@ export async function gateDocsChangesForTask(args: { reason: suggestion.reason, baseScore: args.task.score, docsDiff: suggestion.diffText || '', + }, { + docsPatchText: suggestion.patchText, }) continue } if (!suggestion.patchText) { - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -504,7 +526,7 @@ export async function gateDocsChangesForTask(args: { const draft = deps.materializeDocsChangeFromPatch(args.opts.repoPath, suggestion.patchText) if (!draft) { - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -514,13 +536,15 @@ export async function gateDocsChangesForTask(args: { reason: `Failed to materialize docs change: ${suggestion.reason}`, baseScore: args.task.score, docsDiff: suggestion.diffText || '', + }, { + docsPatchText: suggestion.patchText, }) continue } if (!draft.diffText.trim()) { deps.cleanupDraftedDocsChange(draft) - gatedCandidates.push({ + recordCandidate({ 
source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -529,12 +553,15 @@ export async function gateDocsChangesForTask(args: { status: 'rejected_no_change', reason: 'The planned docs change produced no effective diff when applied to the current docs.', baseScore: args.task.score, - docsDiff: suggestion.diffText || '', + docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, }) continue } let rejudgeScore: number | undefined + let rejudgeJudging: Awaited> | undefined try { const rejudged = await deps.rejudgeTaskWithCurrentDocs({ idx: 0, @@ -546,11 +573,12 @@ export async function gateDocsChangesForTask(args: { initCommand: args.opts.initCommand, docsSourcePath: draft.repoDir, }) + rejudgeJudging = rejudged rejudgeScore = rejudged.overallScore } catch (error) { deps.cleanupDraftedDocsChange(draft) const msg = error instanceof Error ? error.message : String(error) - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -560,13 +588,15 @@ export async function gateDocsChangesForTask(args: { reason: `Rejudge failed: ${msg.slice(0, 200)}`, baseScore: args.task.score, docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, }) continue } if (rejudgeScore === undefined) { deps.cleanupDraftedDocsChange(draft) - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -576,6 +606,8 @@ export async function gateDocsChangesForTask(args: { reason: 'Rejudge did not produce a score.', baseScore: args.task.score, docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, }) continue } @@ -586,7 +618,7 @@ export async function gateDocsChangesForTask(args: { }) if (fastDecision.accepted && fastDecision.fastAccepted) { deps.acceptDraftedDocsChange(args.opts.repoPath, draft) - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: 
suggestion.priority, text: suggestion.text, @@ -598,6 +630,9 @@ export async function gateDocsChangesForTask(args: { rejudgeScore, gateDelta: fastDecision.gateDelta, docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, + rejudgeJudging, }) continue } @@ -621,7 +656,7 @@ export async function gateDocsChangesForTask(args: { }) if (rerunTask.score >= 0 && decision.accepted) { deps.acceptDraftedDocsChange(args.opts.repoPath, draft) - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -634,12 +669,16 @@ export async function gateDocsChangesForTask(args: { rerunScore: rerunTask.score, gateDelta: decision.gateDelta, docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, + rejudgeJudging, + rerunTask, }) continue } deps.cleanupDraftedDocsChange(draft) - gatedCandidates.push({ + recordCandidate({ source: suggestion.source, priority: suggestion.priority, text: suggestion.text, @@ -654,6 +693,10 @@ export async function gateDocsChangesForTask(args: { rerunScore: rerunTask.score, gateDelta: decision.gateDelta, docsDiff: draft.diffText, + }, { + docsPatchText: suggestion.patchText, + rejudgeJudging, + rerunTask, }) } @@ -663,6 +706,7 @@ export async function gateDocsChangesForTask(args: { baseScore: args.task.score, candidates: gatedCandidates, }, + artifacts: gateArtifacts, validationCost, } } finally { @@ -805,6 +849,7 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) events.send({ type: 'phase_change', phase: 'evaluating', round: improvementRound, loop: improvementRound, detail: 'Re-eval with updated docs' }) const featureGateResults: FeatureDocGateResult[] = [] + const featureGateArtifacts: FeatureDocGateArtifacts[] = [] const results = await runEvalRound( features, groundTruthDiffs, @@ -820,6 +865,7 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { loop: 
improvementRound, }) featureGateResults.push(gated.result) + featureGateArtifacts.push(gated.artifacts) return gated.validationCost }, ) @@ -845,6 +891,7 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { JSON.stringify(docsAfterRefactor, null, 2), ) saveLoopDocGateResults(logDir, loopDocGateResult) + saveLoopDocGateArtifacts(logDir, improvementRound, featureGateArtifacts) saveRoundResults(logDir, results) const rejudged = await runBaselineRejudgeRound(baseline, features, groundTruthDiffs, opts, improvementRound) From 3a1bcaa94ac2e587f9d43ebd74d2f1005a9d2532 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 9 Apr 2026 11:11:53 -0700 Subject: [PATCH 5/9] Better error handling for codex --- src/carve-features.ts | 59 +++++++++++++++++++++++++++++++++++-------- src/runners/codex.ts | 23 +++++++++++++++-- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/carve-features.ts b/src/carve-features.ts index 05b5847..03d8cbc 100644 --- a/src/carve-features.ts +++ b/src/carve-features.ts @@ -128,6 +128,7 @@ After your analysis, write a file called \`${RESULT_FILE}\` with this JSON struc You MUST write the result file as your last action.` const result = await thread.run(prompt) + console.log(`planFeatures: Codex finished. Response length: ${result.finalResponse.length}`) // Read the result file const resultPath = path.join(repoPath, RESULT_FILE) @@ -135,9 +136,12 @@ You MUST write the result file as your last action.` // Try to extract from the agent's final response const jsonMatch = result.finalResponse.match(/\{[\s\S]*"candidates"[\s\S]*\}/) if (jsonMatch) { + console.log('planFeatures: extracted plan from final response (no result file written)') return JSON.parse(jsonMatch[0]) as CarvePlan } - throw new Error('Codex agent did not produce a result file') + throw new Error( + `Codex agent did not produce a result file or extractable JSON. 
Final response: ${result.finalResponse?.slice(0, 500) || '(empty)'}`, + ) } try { @@ -168,10 +172,16 @@ export async function carveFeature( const branchName = `evalbuff-carve-${candidate.id}-${Date.now()}` try { - execSync(`git worktree add -b "${branchName}" "${worktreePath}" HEAD`, { - cwd: repoPath, - stdio: 'ignore', - }) + try { + execSync(`git worktree add -b "${branchName}" "${worktreePath}" HEAD`, { + cwd: repoPath, + stdio: 'pipe', + }) + } catch (error) { + throw new Error( + `Failed to create worktree for ${candidate.id}: ${error instanceof Error ? error.message : error}`, + ) + } // Run the Codex agent in the worktree to carve the feature const codex = new Codex({ @@ -214,6 +224,7 @@ export async function carveFeature( Do NOT create any result files — just make the edits directly.` await thread.run(prompt) + console.log(` [carve:${candidate.id}] Codex finished`) // Capture the diff execSync('git add -A', { cwd: worktreePath, stdio: 'ignore' }) @@ -224,12 +235,19 @@ Do NOT create any result files — just make the edits directly.` }) if (!diff.trim()) { + console.warn( + ` [carve:${candidate.id}] Empty diff — Codex made no changes. Skipping.`, + ) return null } // Build operations from the actual git diff const operations = buildOperationsFromDiff(worktreePath, repoPath, candidate.files) + console.log( + ` [carve:${candidate.id}] Success: ${operations.length} file operations, ${diff.length} bytes diff`, + ) + return { id: candidate.id, prompt: candidate.prompt, @@ -239,8 +257,12 @@ Do NOT create any result files — just make the edits directly.` operations, diff, } - } catch { - return null + } catch (error) { + console.error( + ` [carve:${candidate.id}] Failed:`, + error instanceof Error ? 
error.message : error, + ) + throw error } finally { // Clean up worktree and branch try { @@ -316,13 +338,28 @@ export async function carveFeatures( // Phase 2: Carve each feature const features: CarvedFeature[] = [] + const failures: { id: string; error: string }[] = [] for (const candidate of selected) { - const carved = await carveFeature(repoPath, candidate) - if (carved) { - features.push(carved) + console.log( + `\nCarving [${features.length + 1}/${selected.length}]: ${candidate.id} (${candidate.complexity})`, + ) + try { + const carved = await carveFeature(repoPath, candidate) + if (carved) { + features.push(carved) + } + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + failures.push({ id: candidate.id, error: message }) + } + } + console.log(`\nCarved ${features.length}/${selected.length} features`) + if (failures.length > 0) { + console.warn(`Failed carves (${failures.length}):`) + for (const f of failures) { + console.warn(` - ${f.id}: ${f.error}`) } } - console.log(`Carved ${features.length}/${selected.length} features`) const result: CarveResult = { repoPath, diff --git a/src/runners/codex.ts b/src/runners/codex.ts index 5e5280a..2e48d36 100644 --- a/src/runners/codex.ts +++ b/src/runners/codex.ts @@ -46,7 +46,18 @@ export class CodexRunner implements Runner { usage = event.usage break case 'turn.failed': + console.error(`[codex-runner] Turn failed:`, event.error.message) + steps.push({ + type: 'text', + text: `[ERROR] Codex turn failed: ${event.error.message}`, + }) + break case 'error': + console.error(`[codex-runner] Stream error:`, event.message) + steps.push({ + type: 'text', + text: `[ERROR] Codex stream error: ${event.message}`, + }) break } } @@ -55,8 +66,11 @@ export class CodexRunner implements Runner { let diff = '' try { diff = captureGitDiff(this.cwd, { baseRef: baseSha }) - } catch { - // Ignore git errors + } catch (error) { + console.error( + `[codex-runner] Failed to capture git diff:`, + 
error instanceof Error ? error.message : error, + ) } // Estimate cost from token usage (rough GPT-5.1-codex pricing) @@ -132,6 +146,11 @@ function processItem(item: ThreadItem, steps: AgentStep[]): void { // Skip todo lists break case 'error': + console.error(`[codex-runner] Item error:`, item) + steps.push({ + type: 'text', + text: `[ERROR] Codex item error: ${'message' in item ? (item as any).message : JSON.stringify(item)}`, + }) break } } From de2051a31550ead98ea856cbdcef73b1ba9d3a12 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 9 Apr 2026 11:28:52 -0700 Subject: [PATCH 6/9] Add test repos --- docs/cli.md | 18 + docs/eval-helpers.md | 15 + docs/testing.md | 70 ++- package.json | 3 +- src/__tests__/e2e-repos.test.ts | 169 +++++++ .../run-evalbuff-orchestration.test.ts | 116 +++++ src/debug-codex-plan.ts | 46 ++ src/e2e-repos.ts | 465 ++++++++++++++++++ src/run-evalbuff.ts | 431 ++++++++-------- src/setup-e2e-repos.ts | 128 +++++ test-repos/.gitignore | 3 + test-repos/README.md | 16 + 12 files changed, 1272 insertions(+), 208 deletions(-) create mode 100644 src/__tests__/e2e-repos.test.ts create mode 100644 src/__tests__/run-evalbuff-orchestration.test.ts create mode 100644 src/debug-codex-plan.ts create mode 100644 src/e2e-repos.ts create mode 100644 src/setup-e2e-repos.ts create mode 100644 test-repos/.gitignore create mode 100644 test-repos/README.md diff --git a/docs/cli.md b/docs/cli.md index 033b573..a1e2642 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -40,6 +40,24 @@ bun run src/trace-compressor.ts --restore # Restore Options: `--output`, `--sidecar-dir`, `--threshold `, `--format auto|jsonl|text`, `--summarize heuristic|claude|none`. Supports stdin/stdout with `-`. 
+## E2E Benchmark Repo Setup + +```bash +bun run setup:e2e-repos +bun run setup:e2e-repos -- --repo mock-simple +bun run setup:e2e-repos -- --root /tmp/evalbuff-test-repos --force +``` + +Creates deterministic local benchmark repos under `test-repos/` by default: +- `mock-simple` — generated locally for fast/mock E2E coverage +- `codebuff` — pinned checkout of `CodebuffAI/codebuff` +- `manifold` — pinned checkout of `manifoldmarkets/manifold`, plus a local fixture commit that renames `docs/` to `external-docs/` + +Flags: +- `--root ` chooses the target directory +- `--repo ` limits setup to specific repo ids and may be repeated +- `--force` rebuilds fixture directories that already exist + ## TUI Dashboard ```bash diff --git a/docs/eval-helpers.md b/docs/eval-helpers.md index 0410b5f..32fe309 100644 --- a/docs/eval-helpers.md +++ b/docs/eval-helpers.md @@ -42,6 +42,21 @@ Both helpers use `getErrorObject()` from `src/vendor/error.ts` for logging init-command failures. +## Worktree Isolation Pattern (Carve Features) + +`carveFeature()` in `src/carve-features.ts` uses git worktrees instead of full clones to isolate each carve operation. The lifecycle: + +1. **Create**: `git worktree add -b "" "" HEAD` — creates a new worktree checked out at the current HEAD on a temporary branch. The worktree path is constructed inline as `${repoPath}-carve-${candidate.id}` and the branch as `evalbuff-carve-${candidate.id}-${Date.now()}`. +2. **Run**: The Codex agent operates inside the worktree directory, making changes to remove the feature. +3. **Capture**: Diff and file operations are captured from the worktree before cleanup. +4. **Cleanup** (in a `finally` block): + - `git worktree remove --force ""` + - `git branch -D ""` + +**Why worktrees over clones**: Worktrees share the parent repo's object store rather than duplicating it. 
This avoids network I/O, object copying, and disk duplication — important when carving runs a parallel worker pool (`CARVE_PARALLELISM`) and each worker needs an isolated checkout. The tradeoff is that worktrees are coupled to the parent repo (deleting the parent breaks them), but carve worktrees are ephemeral and cleaned up in the same function call. + +**Cleanup safety**: Both the `worktree remove` and `branch -D` commands run inside a `finally` block so worktrees and branches are cleaned up even when the carve agent fails. Tests should verify no leaked worktrees after carving by asserting `git worktree list` shows exactly one entry (the main working tree). + ## Testing Helpers See `docs/testing.md` section "Helper-Contract Tests" for the required test patterns when modifying or extending these helpers. Key rule: always test against real temp git repos, not mocked filesystem calls. diff --git a/docs/testing.md b/docs/testing.md index 08a7744..3974860 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -4,21 +4,73 @@ ```bash bun run typecheck # TypeScript strict check -bun run test # Unit tests only (excludes *.e2e.test.ts) +bun run test # Unit tests only (excludes *.e2e.test.ts and test-repos/**) bun run test:all # All tests including E2E bun run test:e2e # E2E tests only ``` -**`bun run test` vs `bun test`**: `bun run test` is the unit-test entrypoint because it applies the repo's `--path-ignore-patterns '**/*.e2e.test.ts'` filter. Bare `bun test` executes all discovered tests and may run live-model E2E cases when provider environment variables are present. Use `bun run typecheck && bun run test` for local verification of ordinary code changes. Reserve `bun test` or `bun run test:all` for environments where network access and provider credentials are intentionally available. 
+**`bun run test` vs `bun test`**: `bun run test` is the unit-test entrypoint because it applies the repo's `--path-ignore-patterns '**/*.e2e.test.ts'` and `--path-ignore-patterns 'test-repos/**'` filters. Bare `bun test` executes all discovered tests and may run live-model E2E cases when provider environment variables are present. It can also descend into generated benchmark repos under `test-repos/`. Use `bun run typecheck && bun run test` for local verification of ordinary code changes. Reserve `bun test` or `bun run test:all` for environments where network access and provider credentials are intentionally available. **Verification workflow**: For code changes, run in this order: (1) `bun test src/__tests__/.test.ts` for the changed area, (2) `bun run typecheck`, (3) `bun run test` for all unit tests, (4) optionally `bun test` for live-model E2E coverage. Provider/network failures in step 4 should be reported separately from patch regressions, with the exact failing file and provider error message. +### Subprocess Verification for Bun Startup Behavior + +Changes to `bunfig.toml`, `src/load-env.ts`, or any other Bun preload script require a **subprocess-level** verification step in addition to parser unit tests. Parser-only unit tests are insufficient because Bun itself may prepopulate `process.env` before preload scripts execute, and the preload lifecycle (file resolution, load order, double-loading between top-level and `[test]` sections) is only exercised in a real Bun subprocess. + +**Recipe**: Create temporary `.env.local` and `.env` files in the repo root (or a temp cwd that shares the same `bunfig.toml`), then run both `bun run ` and `bun test ` as subprocesses and assert on their stdout/exit codes. The probe scripts should cover at least these cases: + +1. **Precedence**: A key defined in both `.env.local` and `.env` must resolve to the `.env.local` value. +2. **Fallback**: A key present only in `.env` (absent from `.env.local`) must still load. +3. 
**Export syntax**: `export KEY=value` lines must be accepted (the `export` prefix is optional). +4. **Hash preservation**: `#` inside URLs or quoted strings (e.g., `URL=https://example.com/#frag`, `QUOTED="#val"`) must be preserved as literal characters, while `value # comment` must have the inline comment trimmed. + +This ensures the full Bun → preload → env file chain works end-to-end, not just the parser in isolation. + ## Prerequisites Fresh workspaces (e.g., carved eval repos) may not have dependencies installed. Always run `bun install` or `bash setup.sh` before expecting `bun run typecheck` or `bun test` to succeed. A task is not complete until both commands pass after dependencies are installed. Local developer credentials can live in `.env.local`. `bunfig.toml` preloads `src/load-env.ts`, so direct Bun invocations such as `bun test src/__tests__/docs-writer.e2e.test.ts` and `bun run src/run-evalbuff.ts ...` automatically read `.env.local` first, then `.env`, without requiring wrapper scripts. +### `parseEnvFile` Contract + +`parseEnvFile(content)` in `src/load-env.ts` is a **pure parser** (no I/O, no `process.env` access). It returns `Array<[key, value]>` with these rules: + +| Rule | Detail | +|---|---| +| Key format | Must match `^[A-Za-z_][A-Za-z0-9_]*$`. Lines with invalid keys are silently skipped. | +| `export` prefix | Optional — `export KEY=value` and `KEY=value` are both accepted. | +| Blank lines / comments | Lines that are empty or start with `#` are skipped. | +| Quoting | Surrounding `"..."` or `'...'` are stripped from the value. | +| Inline comments | `#` preceded by whitespace, outside quotes, is treated as a comment start; the value is trimmed before it. | +| Hash in values | `#` inside quoted strings or not preceded by whitespace is preserved literally. 
| + +**Example input and expected output**: + +``` +# Database config +DB_HOST=localhost +export DB_PORT=5432 +API_URL=https://example.com/api#v2 +QUOTED_HASH="#still-a-value" +SECRET="s3cret" # rotate quarterly +MALFORMED LINE +``` + +Expected parse result: + +``` +[ + ["DB_HOST", "localhost"], + ["DB_PORT", "5432"], + ["API_URL", "https://example.com/api#v2"], + ["QUOTED_HASH", "#still-a-value"], + ["SECRET", "s3cret"] +] +``` + +Note: `MALFORMED LINE` is silently skipped (key contains a space). The inline comment on `SECRET` is stripped. The `#v2` fragment in `API_URL` is preserved because `#` is not preceded by whitespace. + ## Test File Layout - Unit tests: `src/__tests__/.test.ts` @@ -76,6 +128,20 @@ When tests produce diffs, validate both representations: This catches cases where patch text looks valid but serialized file operations do not recreate the same filesystem state. +### All Operation Types + +Carve diffs can include file deletions, modifications, and additions. Git status `'A'` (added files) is mapped to `FileOperation` with `action: 'modify'` and the full file content as `newContent` — there is no separate `'add'` action. Diff validation tests must cover all three git-level operation types: + +- **Delete**: Assert the file is absent after `applyCarveOperations()` and that the diff contains the deletion. +- **Modify**: Assert the file content matches `op.newContent`. +- **Add (as modify)**: Create a test where the carve introduces a new file. Assert that `applyCarveOperations()` creates the file with the correct content (it calls `fs.mkdirSync` with `{ recursive: true }` before writing, so nested new paths are handled). Verify the diff also includes the addition and passes `git apply --check`. + +When testing carve output end-to-end, apply `applyCarveOperations(repoDir, feature.operations)` to a fresh checkout at the same base SHA and compare the resulting filesystem to the actual carved worktree state. 
This catches drift between the diff text and the serialized operations. + +### No-Op Carve Behavior + +`carveFeature()` returns `null` when the carve produces an empty diff (`!diff.trim()`). Callers skip null results — a no-op carve is not treated as a successful `CarvedFeature`. Tests should assert that `carveFeature()` returns `null` (not an empty-diff `CarvedFeature`) when the agent makes no changes, and that the caller's feature list does not contain entries with empty diffs or empty `originalFiles`. + ## Infrastructure Failure Testing `runAgentOnCarve()` must never throw for infrastructure failures — it returns a `TaskResult` with `score: -1`, empty `diff`, `costEstimate: 0`, `trace` starting with `Agent error:`, and all judging scores set to `-1`. Test by calling with a nonexistent `repoPath` so `git clone` fails before the agent runs. diff --git a/package.json b/package.json index 76c4350..bcc50f8 100644 --- a/package.json +++ b/package.json @@ -18,10 +18,11 @@ "build:smoke": "npm run build && node dist/cli.js --help", "prepublishOnly": "npm run build:smoke", "typecheck": "tsc --noEmit -p .", - "test": "bun test --path-ignore-patterns '**/*.e2e.test.ts'", + "test": "bun test --path-ignore-patterns '**/*.e2e.test.ts' --path-ignore-patterns 'test-repos/**'", "test:all": "bun test", "test:e2e": "bun test src/__tests__/*.e2e.test.ts", "run": "bun run src/run-evalbuff.ts", + "setup:e2e-repos": "bun run src/setup-e2e-repos.ts", "tui": "bun run src/tui/main.tsx", "tui:demo": "bun run src/tui/main.tsx --demo", "tui:watch": "bun run src/tui/main.tsx --log-dir", diff --git a/src/__tests__/e2e-repos.test.ts b/src/__tests__/e2e-repos.test.ts new file mode 100644 index 0000000..fcc52f5 --- /dev/null +++ b/src/__tests__/e2e-repos.test.ts @@ -0,0 +1,169 @@ +import { execFileSync, execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' +import { fileURLToPath } from 'url' + +import { afterEach, describe, expect, it } from 
'bun:test' + +import { + FIXTURE_MANIFEST_FILENAME, + patchCodebuffFixture, + renameDocsDirForManifoldFixture, + setupBenchmarkRepos, +} from '../e2e-repos' + +const tempDirs: string[] = [] +const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..') + +afterEach(() => { + for (const dir of tempDirs.splice(0)) { + fs.rmSync(dir, { recursive: true, force: true }) + } +}) + +describe('setupBenchmarkRepos', () => { + it('creates only the selected mock fixture and writes a manifest', () => { + const rootDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repos-')) + tempDirs.push(rootDir) + + const results = setupBenchmarkRepos({ + rootDir, + repoIds: ['mock-simple'], + }) + + expect(results).toHaveLength(1) + expect(results[0]?.id).toBe('mock-simple') + expect(fs.existsSync(path.join(rootDir, 'mock-simple', '.git'))).toBe(true) + expect(fs.existsSync(path.join(rootDir, 'codebuff'))).toBe(false) + expect(fs.existsSync(path.join(rootDir, 'manifold'))).toBe(false) + + const manifest = JSON.parse( + fs.readFileSync(path.join(rootDir, FIXTURE_MANIFEST_FILENAME), 'utf-8'), + ) as { repos: Array<{ id: string; headSha: string }> } + + expect(manifest.repos).toHaveLength(1) + expect(manifest.repos[0]?.id).toBe('mock-simple') + expect(git(path.join(rootDir, 'mock-simple'), 'rev-parse HEAD')).toBe(results[0]?.headSha) + expect(git(path.join(rootDir, 'mock-simple'), 'status --short')).toBe('') + }) + + it('rebuilds a matching fixture when force is enabled', () => { + const rootDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repos-force-')) + tempDirs.push(rootDir) + + const firstRun = setupBenchmarkRepos({ + rootDir, + repoIds: ['mock-simple'], + }) + const firstHead = firstRun[0]?.headSha + + const secondRun = setupBenchmarkRepos({ + rootDir, + repoIds: ['mock-simple'], + force: true, + }) + + expect(secondRun).toHaveLength(1) + expect(secondRun[0]?.id).toBe('mock-simple') + expect(secondRun[0]?.headSha).toBe(firstHead) + 
expect(git(path.join(rootDir, 'mock-simple'), 'status --short')).toBe('') + }) +}) + +describe('renameDocsDirForManifoldFixture', () => { + it('moves docs to external-docs and leaves a clean commit', () => { + const repoDir = createRepo({ + 'docs/guide.md': '# Guide\nfixture docs\n', + 'src/index.ts': "export const version = '1.0.0'\n", + }) + + const baseSha = git(repoDir, 'rev-parse HEAD') + + renameDocsDirForManifoldFixture(repoDir) + + expect(fs.existsSync(path.join(repoDir, 'docs'))).toBe(false) + expect(fs.existsSync(path.join(repoDir, 'external-docs', 'guide.md'))).toBe(true) + expect( + fs.readFileSync(path.join(repoDir, 'external-docs', 'guide.md'), 'utf-8'), + ).toContain('fixture docs') + expect(git(repoDir, 'log -1 --pretty=%s')).toBe( + 'evalbuff: move upstream docs to external-docs', + ) + expect(git(repoDir, 'status --short')).toBe('') + expect(git(repoDir, 'rev-parse HEAD')).not.toBe(baseSha) + }) +}) + +describe('patchCodebuffFixture', () => { + it('removes docs and rewrites AGENTS without docs references', () => { + const repoDir = createRepo({ + 'docs/guide.md': '# Guide\nfixture docs\n', + 'AGENTS.md': '# Upstream\nSee docs/guide.md\n', + 'src/index.ts': "export const version = '1.0.0'\n", + }) + + const baseSha = git(repoDir, 'rev-parse HEAD') + + patchCodebuffFixture(repoDir) + + expect(fs.existsSync(path.join(repoDir, 'docs'))).toBe(false) + const agentsMd = fs.readFileSync(path.join(repoDir, 'AGENTS.md'), 'utf-8') + expect(agentsMd).toContain('# Codebuff') + expect(agentsMd.toLowerCase()).not.toContain('docs') + expect(git(repoDir, 'log -1 --pretty=%s')).toBe( + 'evalbuff: remove bundled docs and simplify AGENTS', + ) + expect(git(repoDir, 'status --short')).toBe('') + expect(git(repoDir, 'rev-parse HEAD')).not.toBe(baseSha) + }) +}) + +describe('setup-e2e-repos CLI', () => { + it('honors --root and --repo for a mock-only setup', () => { + const rootDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-cli-')) + tempDirs.push(rootDir) + 
+ execFileSync( + process.execPath, + ['run', 'src/setup-e2e-repos.ts', '--root', rootDir, '--repo', 'mock-simple'], + { + cwd: repoRoot, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'], + env: process.env, + }, + ) + + expect(fs.existsSync(path.join(rootDir, 'mock-simple', '.git'))).toBe(true) + expect(fs.existsSync(path.join(rootDir, 'codebuff'))).toBe(false) + expect(fs.existsSync(path.join(rootDir, FIXTURE_MANIFEST_FILENAME))).toBe(true) + }) +}) + +function createRepo(files: Record): string { + const repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-repo-fixture-')) + tempDirs.push(repoDir) + + git(repoDir, 'init') + git(repoDir, 'config user.name "Test User"') + git(repoDir, 'config user.email "test@evalbuff.test"') + + for (const [filePath, content] of Object.entries(files)) { + const absolutePath = path.join(repoDir, filePath) + fs.mkdirSync(path.dirname(absolutePath), { recursive: true }) + fs.writeFileSync(absolutePath, content) + } + + git(repoDir, 'add -A') + git(repoDir, 'commit -m "Initial commit"') + return repoDir +} + +function git(repoDir: string, command: string): string { + return execSync(`git ${command}`, { + cwd: repoDir, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'ignore'], + }).trim() +} diff --git a/src/__tests__/run-evalbuff-orchestration.test.ts b/src/__tests__/run-evalbuff-orchestration.test.ts new file mode 100644 index 0000000..e6cc175 --- /dev/null +++ b/src/__tests__/run-evalbuff-orchestration.test.ts @@ -0,0 +1,116 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, describe, expect, it, mock } from 'bun:test' + +import { events } from '../tui/events' + +async function waitForFlushedEvents(logDir: string): Promise { + const eventsPath = path.join(logDir, 'events.jsonl') + + for (let attempt = 0; attempt < 50; attempt++) { + if (fs.existsSync(eventsPath)) { + const text = fs.readFileSync(eventsPath, 'utf-8') + if (text.includes('"type":"run_complete"')) { + return text 
+ } + } + await Bun.sleep(20) + } + + throw new Error(`Timed out waiting for flushed events at ${eventsPath}`) +} + +describe('runEvalbuff orchestration', () => { + let repoDir: string | undefined + + afterEach(() => { + events.close() + events.clearBuffer() + mock.restore() + if (repoDir) { + fs.rmSync(repoDir, { recursive: true, force: true }) + repoDir = undefined + } + }) + + it('emits explicit carve failures and terminal events when all selected carves fail', async () => { + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'run-evalbuff-orch-')) + + mock.module('../carve-features', () => ({ + planFeatures: async () => ({ + reasoning: 'test', + candidates: [ + { + id: 'throwing-feature', + name: 'Throwing feature', + prompt: 'Rebuild throwing feature', + description: 'Throws during carve', + files: [], + relevantFiles: [], + complexity: 'small' as const, + }, + { + id: 'noop-feature', + name: 'No-op feature', + prompt: 'Rebuild noop feature', + description: 'Produces no carve diff', + files: [], + relevantFiles: [], + complexity: 'small' as const, + }, + ], + }), + carveFeature: async (_repoPath: string, candidate: { id: string }) => { + if (candidate.id === 'throwing-feature') { + throw new Error('worktree add failed') + } + return null + }, + })) + + const { runEvalbuff } = await import('../run-evalbuff') + await runEvalbuff({ + repoPath: repoDir, + n: 2, + codingModel: 'sonnet', + docsModel: 'opus', + }) + + let logDir: string | undefined + events.replay(({ event }) => { + if (event.type === 'run_start') { + logDir = event.logDir + } + }) + + expect(logDir).toBeDefined() + const rawEvents = await waitForFlushedEvents(logDir as string) + const parsedEvents = rawEvents.trim().split('\n').map((line) => JSON.parse(line).event) + + expect(parsedEvents).toContainEqual({ + type: 'feature_status', + featureId: 'throwing-feature', + status: 'carve_failed', + detail: 'worktree add failed', + }) + expect(parsedEvents).toContainEqual({ + type: 'feature_status', + 
featureId: 'noop-feature', + status: 'carve_failed', + detail: 'Carve produced no changes.', + }) + expect(parsedEvents).toContainEqual({ + type: 'phase_change', + phase: 'complete', + detail: 'Run aborted: no features were successfully carved.', + }) + expect(parsedEvents.at(-1)).toEqual({ + type: 'run_complete', + scoreProgression: [], + totalCost: 0, + duration: expect.any(String), + }) + }) +}) diff --git a/src/debug-codex-plan.ts b/src/debug-codex-plan.ts new file mode 100644 index 0000000..4f72f82 --- /dev/null +++ b/src/debug-codex-plan.ts @@ -0,0 +1,46 @@ +import { planFeatures } from './carve-features' + +function getArg(args: string[], name: string, defaultValue?: string): string { + const idx = args.indexOf(`--${name}`) + if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] + if (defaultValue !== undefined) return defaultValue + throw new Error(`Missing required argument: --${name}`) +} + +async function main(): Promise { + const args = process.argv.slice(2) + const repoPath = getArg(args, 'repo') + const timeoutMs = Number.parseInt(getArg(args, 'timeout-ms', '60000'), 10) + const startedAt = Date.now() + + const timeout = setTimeout(() => { + console.error(`Codex planning timed out after ${timeoutMs}ms for ${repoPath}`) + process.exit(2) + }, timeoutMs) + + try { + const plan = await planFeatures(repoPath) + clearTimeout(timeout) + console.log(JSON.stringify({ + ok: true, + repoPath, + durationMs: Date.now() - startedAt, + candidateCount: plan.candidates.length, + candidateIds: plan.candidates.map((candidate) => candidate.id), + }, null, 2)) + } catch (error) { + clearTimeout(timeout) + const message = error instanceof Error ? 
error.message : String(error) + console.error(JSON.stringify({ + ok: false, + repoPath, + durationMs: Date.now() - startedAt, + error: message, + }, null, 2)) + process.exit(1) + } +} + +if (import.meta.main) { + main() +} diff --git a/src/e2e-repos.ts b/src/e2e-repos.ts new file mode 100644 index 0000000..1ffa450 --- /dev/null +++ b/src/e2e-repos.ts @@ -0,0 +1,465 @@ +import { execFileSync } from 'child_process' +import fs from 'fs' +import path from 'path' +import { fileURLToPath } from 'url' + +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..') + +export const DEFAULT_TEST_REPOS_ROOT = path.join(REPO_ROOT, 'test-repos') +export const FIXTURE_LAYOUT_VERSION = 1 +export const FIXTURE_METADATA_FILENAME = 'evalbuff-fixture.json' +export const FIXTURE_MANIFEST_FILENAME = 'manifest.json' + +const FIXTURE_COMMIT_ENV = { + ...process.env, + GIT_AUTHOR_NAME: 'Evalbuff Fixtures', + GIT_AUTHOR_EMAIL: 'fixtures@evalbuff.local', + GIT_COMMITTER_NAME: 'Evalbuff Fixtures', + GIT_COMMITTER_EMAIL: 'fixtures@evalbuff.local', + GIT_AUTHOR_DATE: '2026-04-08T00:00:00Z', + GIT_COMMITTER_DATE: '2026-04-08T00:00:00Z', +} + +const CODEBUFF_FIXTURE_AGENTS_MD = `# Codebuff + +Codebuff is an advanced coding agent with a composable agent framework. It also includes: +- freebuff, the free coding agent + +## Goal + +Make an efficient learning agent that can do anything. + +## Key Technologies + +- TypeScript monorepo (Bun workspaces) +- Bun runtime + package manager +- Next.js (web app + API routes) +- Multiple LLM providers (Anthropic/OpenAI/Gemini/etc.) 
+ +## Repo Map + +- \`cli/\` — TUI client (OpenTUI + React) and local UX +- \`sdk/\` — JS/TS SDK used by the CLI and external users +- \`web/\` — Next.js app + API routes (the "web API") +- \`packages/agent-runtime/\` — agent runtime + tool handling (server-side) +- \`common/\` — shared types, tools, schemas, utilities +- \`agents/\` — main agents shipped with codebuff +- \`.agents/\` — local agent templates (prompt + programmatic agents) +- \`freebuff/\` — a free coding agent built from configuring the codebuff CLI + +## Conventions + +- Prefer reading the implementation directly before making changes. +- Never force-push \`main\` unless explicitly requested. +- Run interactive git commands in tmux (anything that opens an editor or prompts). +` + +export type BenchmarkRepoId = 'mock-simple' | 'codebuff' | 'manifold' + +export interface FixtureMetadata { + id: BenchmarkRepoId + fixtureVersion: number + description: string + repoPath: string + sourceRepoUrl: string | null + sourceCommitSha: string | null + headSha: string + notes: string[] +} + +export interface SetupBenchmarkReposOptions { + rootDir?: string + repoIds?: BenchmarkRepoId[] + force?: boolean + log?: (message: string) => void +} + +interface BenchmarkRepoDefinition { + id: BenchmarkRepoId + dirName: string + description: string + fixtureVersion: number + sourceRepoUrl: string | null + sourceCommitSha: string | null + setup: (repoDir: string) => FixtureMetadata +} + +const MOCK_SIMPLE_FILES: Record = { + 'package.json': JSON.stringify( + { + name: 'evalbuff-mock-simple', + version: '1.0.0', + private: true, + type: 'module', + scripts: { + test: 'bun test', + typecheck: 'tsc --noEmit', + }, + }, + null, + 2, + ) + '\n', + 'tsconfig.json': JSON.stringify( + { + compilerOptions: { + target: 'ES2020', + module: 'ESNext', + moduleResolution: 'bundler', + strict: true, + noEmit: true, + }, + include: ['src'], + }, + null, + 2, + ) + '\n', + 'README.md': `# Mock Simple Repo + +Small deterministic TypeScript 
repo for evalbuff E2E coverage. +`, + 'src/index.ts': `import { add, fibonacci } from './math' +import { slugify } from './strings' +import { Stack } from './stack' + +console.log(add(2, 3)) +console.log(fibonacci(8)) +console.log(slugify('Evalbuff Fixtures')) + +const stack = new Stack() +stack.push(1) +stack.push(2) +console.log(stack.peek()) +`, + 'src/math.ts': `export function add(a: number, b: number): number { + return a + b +} + +export function fibonacci(n: number): number { + if (n <= 0) return 0 + if (n === 1) return 1 + + let a = 0 + let b = 1 + for (let i = 2; i <= n; i += 1) { + ;[a, b] = [b, a + b] + } + return b +} +`, + 'src/math.test.ts': `import { expect, test } from 'bun:test' + +import { add, fibonacci } from './math' + +test('math helpers', () => { + expect(add(2, 3)).toBe(5) + expect(fibonacci(8)).toBe(21) +}) +`, + 'src/strings.ts': `export function slugify(value: string): string { + return value + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-|-$/g, '') +} +`, + 'src/strings.test.ts': `import { expect, test } from 'bun:test' + +import { slugify } from './strings' + +test('slugify', () => { + expect(slugify('Evalbuff Fixtures')).toBe('evalbuff-fixtures') +}) +`, + 'src/stack.ts': `export class Stack { + private items: T[] = [] + + push(item: T): void { + this.items.push(item) + } + + peek(): T | undefined { + return this.items[this.items.length - 1] + } +} +`, + 'src/stack.test.ts': `import { expect, test } from 'bun:test' + +import { Stack } from './stack' + +test('stack push and peek', () => { + const stack = new Stack() + stack.push(1) + stack.push(2) + expect(stack.peek()).toBe(2) +}) +`, +} + +const BENCHMARK_REPO_DEFINITIONS: readonly BenchmarkRepoDefinition[] = [ + { + id: 'mock-simple', + dirName: 'mock-simple', + description: 'Generated deterministic TypeScript repo for fast local E2E runs.', + fixtureVersion: FIXTURE_LAYOUT_VERSION, + sourceRepoUrl: null, + sourceCommitSha: null, + setup: (repoDir) => 
createMockSimpleFixture(repoDir), + }, + { + id: 'codebuff', + dirName: 'codebuff', + description: + 'Pinned checkout of CodebuffAI/codebuff main with fixture cleanup for evalbuff E2E runs.', + fixtureVersion: FIXTURE_LAYOUT_VERSION, + sourceRepoUrl: 'https://github.com/CodebuffAI/codebuff.git', + sourceCommitSha: 'f95f9a58ebcfcfecc8c6ffcfbe6d606ec1278e54', + setup: (repoDir) => + createPinnedRemoteFixture(repoDir, { + id: 'codebuff', + description: + 'Pinned checkout of CodebuffAI/codebuff main with fixture cleanup for evalbuff E2E runs.', + repoUrl: 'https://github.com/CodebuffAI/codebuff.git', + commitSha: 'f95f9a58ebcfcfecc8c6ffcfbe6d606ec1278e54', + postCheckout: patchCodebuffFixture, + }), + }, + { + id: 'manifold', + dirName: 'manifold', + description: + 'Pinned checkout of manifoldmarkets/manifold main with docs/ renamed to external-docs/.', + fixtureVersion: FIXTURE_LAYOUT_VERSION, + sourceRepoUrl: 'https://github.com/manifoldmarkets/manifold.git', + sourceCommitSha: '89c1b733190ff717ff7f7d7fb6206b09c61aebd1', + setup: (repoDir) => + createPinnedRemoteFixture(repoDir, { + id: 'manifold', + description: + 'Pinned checkout of manifoldmarkets/manifold main with docs/ renamed to external-docs/.', + repoUrl: 'https://github.com/manifoldmarkets/manifold.git', + commitSha: '89c1b733190ff717ff7f7d7fb6206b09c61aebd1', + postCheckout: renameDocsDirForManifoldFixture, + }), + }, +] as const + +export const BENCHMARK_REPO_IDS = BENCHMARK_REPO_DEFINITIONS.map((repo) => repo.id) + +export function setupBenchmarkRepos( + options: SetupBenchmarkReposOptions = {}, +): FixtureMetadata[] { + const rootDir = path.resolve(options.rootDir ?? DEFAULT_TEST_REPOS_ROOT) + const repoIds = options.repoIds ?? [...BENCHMARK_REPO_IDS] + const force = options.force ?? false + const log = options.log ?? 
(() => {}) + + fs.mkdirSync(rootDir, { recursive: true }) + + const selectedDefinitions = repoIds.map((repoId) => { + const definition = BENCHMARK_REPO_DEFINITIONS.find((repo) => repo.id === repoId) + if (!definition) { + throw new Error(`Unknown benchmark repo id: ${repoId}`) + } + return definition + }) + + const results = selectedDefinitions.map((definition) => { + const repoDir = path.join(rootDir, definition.dirName) + const existingMetadata = readFixtureMetadata(repoDir) + + if (fs.existsSync(repoDir)) { + if (!force && existingMetadata && fixtureMatchesDefinition(existingMetadata, definition)) { + const currentHeadSha = gitOutput(repoDir, ['rev-parse', 'HEAD']) + const gitStatus = gitOutput(repoDir, ['status', '--porcelain']) + if (currentHeadSha === existingMetadata.headSha && gitStatus === '') { + log(`Skipping ${definition.id}; fixture already matches ${currentHeadSha}.`) + return existingMetadata + } + } + + if (!force) { + throw new Error( + `Fixture directory already exists and does not match the expected state: ${repoDir}. 
Re-run with --force to rebuild it.`, + ) + } + + fs.rmSync(repoDir, { recursive: true, force: true }) + } + + log(`Setting up ${definition.id} in ${repoDir}`) + const metadata = definition.setup(repoDir) + writeFixtureMetadata(repoDir, metadata) + return metadata + }) + + const manifestPath = path.join(rootDir, FIXTURE_MANIFEST_FILENAME) + fs.writeFileSync( + manifestPath, + JSON.stringify( + { + layoutVersion: FIXTURE_LAYOUT_VERSION, + generatedAt: new Date().toISOString(), + repos: results, + }, + null, + 2, + ) + '\n', + ) + + return results +} + +export function renameDocsDirForManifoldFixture(repoDir: string): void { + const docsDir = path.join(repoDir, 'docs') + const externalDocsDir = path.join(repoDir, 'external-docs') + + if (!fs.existsSync(docsDir)) { + throw new Error(`Expected docs/ to exist before manifold patch in ${repoDir}`) + } + if (fs.existsSync(externalDocsDir)) { + throw new Error(`Refusing to overwrite existing external-docs/ in ${repoDir}`) + } + + fs.renameSync(docsDir, externalDocsDir) + gitOutput(repoDir, ['add', '-A']) + gitOutput(repoDir, ['commit', '-m', 'evalbuff: move upstream docs to external-docs'], { + env: FIXTURE_COMMIT_ENV, + }) +} + +export function patchCodebuffFixture(repoDir: string): void { + const docsDir = path.join(repoDir, 'docs') + if (fs.existsSync(docsDir)) { + fs.rmSync(docsDir, { recursive: true, force: true }) + } + + fs.writeFileSync(path.join(repoDir, 'AGENTS.md'), CODEBUFF_FIXTURE_AGENTS_MD) + gitOutput(repoDir, ['add', '-A']) + gitOutput( + repoDir, + ['commit', '-m', 'evalbuff: remove bundled docs and simplify AGENTS'], + { + env: FIXTURE_COMMIT_ENV, + }, + ) +} + +function createMockSimpleFixture(repoDir: string): FixtureMetadata { + initializeRepo(repoDir) + + for (const [filePath, content] of Object.entries(MOCK_SIMPLE_FILES)) { + writeFile(repoDir, filePath, content) + } + + gitOutput(repoDir, ['add', '-A']) + gitOutput(repoDir, ['commit', '-m', 'evalbuff fixture: create mock simple repo'], { + env: 
FIXTURE_COMMIT_ENV, + }) + + return { + id: 'mock-simple', + fixtureVersion: FIXTURE_LAYOUT_VERSION, + description: 'Generated deterministic TypeScript repo for fast local E2E runs.', + repoPath: repoDir, + sourceRepoUrl: null, + sourceCommitSha: null, + headSha: gitOutput(repoDir, ['rev-parse', 'HEAD']), + notes: ['Generated locally by evalbuff.'], + } +} + +function createPinnedRemoteFixture( + repoDir: string, + options: { + id: Exclude + description: string + repoUrl: string + commitSha: string + postCheckout?: (repoDir: string) => void + }, +): FixtureMetadata { + initializeRepo(repoDir) + gitOutput(repoDir, ['remote', 'add', 'origin', options.repoUrl]) + gitOutput(repoDir, ['fetch', '--depth', '1', 'origin', options.commitSha]) + gitOutput(repoDir, ['checkout', '--detach', 'FETCH_HEAD']) + + const notes = [`Pinned to upstream commit ${options.commitSha}.`] + if (options.postCheckout) { + options.postCheckout(repoDir) + } + if (options.id === 'codebuff') { + notes.push('Local fixture commit removes docs/ and rewrites AGENTS.md.') + } + if (options.id === 'manifold') { + notes.push('Local fixture commit renames docs/ to external-docs/.') + } + + return { + id: options.id, + fixtureVersion: FIXTURE_LAYOUT_VERSION, + description: options.description, + repoPath: repoDir, + sourceRepoUrl: options.repoUrl, + sourceCommitSha: options.commitSha, + headSha: gitOutput(repoDir, ['rev-parse', 'HEAD']), + notes, + } +} + +function initializeRepo(repoDir: string): void { + fs.mkdirSync(repoDir, { recursive: true }) + gitOutput(repoDir, ['init', '--initial-branch=main']) + gitOutput(repoDir, ['config', 'user.name', FIXTURE_COMMIT_ENV.GIT_AUTHOR_NAME]) + gitOutput(repoDir, ['config', 'user.email', FIXTURE_COMMIT_ENV.GIT_AUTHOR_EMAIL]) +} + +function fixtureMatchesDefinition( + metadata: FixtureMetadata, + definition: BenchmarkRepoDefinition, +): boolean { + return ( + metadata.id === definition.id && + metadata.fixtureVersion === definition.fixtureVersion && + 
metadata.sourceRepoUrl === definition.sourceRepoUrl && + metadata.sourceCommitSha === definition.sourceCommitSha + ) +} + +function readFixtureMetadata(repoDir: string): FixtureMetadata | null { + const metadataPath = getFixtureMetadataPath(repoDir) + if (!fs.existsSync(metadataPath)) { + return null + } + return JSON.parse(fs.readFileSync(metadataPath, 'utf-8')) as FixtureMetadata +} + +function writeFixtureMetadata(repoDir: string, metadata: FixtureMetadata): void { + const metadataPath = getFixtureMetadataPath(repoDir) + fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2) + '\n') +} + +function getFixtureMetadataPath(repoDir: string): string { + return path.join( + path.dirname(repoDir), + `.${path.basename(repoDir)}.${FIXTURE_METADATA_FILENAME}`, + ) +} + +function writeFile(repoDir: string, filePath: string, content: string): void { + const absolutePath = path.join(repoDir, filePath) + fs.mkdirSync(path.dirname(absolutePath), { recursive: true }) + fs.writeFileSync(absolutePath, content) +} + +function gitOutput( + repoDir: string, + args: string[], + options?: { env?: NodeJS.ProcessEnv }, +): string { + return execFileSync('git', args, { + cwd: repoDir, + encoding: 'utf-8', + env: options?.env ?? process.env, + stdio: ['ignore', 'pipe', 'pipe'], + }).trim() +} diff --git a/src/run-evalbuff.ts b/src/run-evalbuff.ts index b398d7b..483ae43 100644 --- a/src/run-evalbuff.ts +++ b/src/run-evalbuff.ts @@ -66,6 +66,10 @@ const DOC_CHANGE_ACCEPTANCE_THRESHOLD = 0.5 const DOC_CHANGE_FAST_ACCEPT_THRESHOLD = DOC_CHANGE_ACCEPTANCE_THRESHOLD * 2 const CARVE_PARALLELISM = 10 +function getErrorMessage(error: unknown): string { + return error instanceof Error ? 
error.message : String(error) +} + export function evaluateDocChangeGate(args: { baseScore: number rejudgeScore: number @@ -733,6 +737,10 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { const logDir = path.join(os.tmpdir(), `evalbuff-run-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`) fs.mkdirSync(logDir, { recursive: true }) + let totalCost = 0 + let scoreProgression: number[] = [] + let completionDetail = 'Run completed.' + events.initLog(logDir) events.send({ type: 'run_start', @@ -743,227 +751,240 @@ export async function runEvalbuff(opts: EvalbuffOptions): Promise { logDir, }) - printHeader({ - repoPath: opts.repoPath, - n: opts.n, - codingModel: opts.codingModel, - docsModel: opts.docsModel, - logDir, - }) - - let features: CarvedFeature[] - - if (opts.cachedFeatures) { - const cached: CarvedFeature[] = JSON.parse(fs.readFileSync(opts.cachedFeatures, 'utf-8')) - features = selectRandom(cached, opts.n) - console.log(`\n Loaded ${features.length} cached features`) + try { + printHeader({ + repoPath: opts.repoPath, + n: opts.n, + codingModel: opts.codingModel, + docsModel: opts.docsModel, + logDir, + }) - events.send({ type: 'feature_planned', totalCandidates: cached.length, selectedIds: features.map(f => f.id) }) - fs.writeFileSync(path.join(logDir, 'features.json'), JSON.stringify(features, null, 2)) - } else { - events.send({ type: 'phase_change', phase: 'planning', detail: 'Analyzing codebase...' 
}) - startSpinner('Planning features...') - - const plan = await planFeatures(opts.repoPath) - stopSpinner(` Found ${plan.candidates.length} candidates`) - - fs.writeFileSync(path.join(logDir, 'plan.json'), JSON.stringify(plan, null, 2)) - - const selected = selectRandom(plan.candidates, opts.n) - - events.send({ type: 'feature_planned', totalCandidates: plan.candidates.length, selectedIds: selected.map(c => c.id) }) - events.send({ type: 'phase_change', phase: 'carving', detail: `Carving ${selected.length} features...` }) - - features = [] - { - const carveQueue = [...selected] - let carveNext = 0 - const carveResults: (CarvedFeature | null)[] = new Array(carveQueue.length).fill(null) - - startSpinner(`Carving 0/${carveQueue.length} features...`) - let carveCompleted = 0 - - async function carveWorker(): Promise { - while (carveNext < carveQueue.length) { - const idx = carveNext++ - const candidate = carveQueue[idx] - try { - events.send({ type: 'feature_status', featureId: candidate.id, status: 'carving' }) - const carved = await carveFeature(opts.repoPath, candidate) - if (carved) { - carveResults[idx] = carved - events.send({ type: 'feature_status', featureId: candidate.id, status: 'carved', detail: `${carved.operations.length} file operations` }) + let features: CarvedFeature[] + + if (opts.cachedFeatures) { + const cached: CarvedFeature[] = JSON.parse(fs.readFileSync(opts.cachedFeatures, 'utf-8')) + features = selectRandom(cached, opts.n) + console.log(`\n Loaded ${features.length} cached features`) + + events.send({ type: 'feature_planned', totalCandidates: cached.length, selectedIds: features.map(f => f.id) }) + fs.writeFileSync(path.join(logDir, 'features.json'), JSON.stringify(features, null, 2)) + } else { + events.send({ type: 'phase_change', phase: 'planning', detail: 'Analyzing codebase...' 
}) + startSpinner('Planning features...') + + const plan = await planFeatures(opts.repoPath) + stopSpinner(` Found ${plan.candidates.length} candidates`) + + fs.writeFileSync(path.join(logDir, 'plan.json'), JSON.stringify(plan, null, 2)) + + const selected = selectRandom(plan.candidates, opts.n) + + events.send({ type: 'feature_planned', totalCandidates: plan.candidates.length, selectedIds: selected.map(c => c.id) }) + events.send({ type: 'phase_change', phase: 'carving', detail: `Carving ${selected.length} features...` }) + + features = [] + { + const carveQueue = [...selected] + let carveNext = 0 + const carveResults: (CarvedFeature | null)[] = new Array(carveQueue.length).fill(null) + + startSpinner(`Carving 0/${carveQueue.length} features...`) + let carveCompleted = 0 + + async function carveWorker(): Promise { + while (carveNext < carveQueue.length) { + const idx = carveNext++ + const candidate = carveQueue[idx] + try { + events.send({ type: 'feature_status', featureId: candidate.id, status: 'carving' }) + const carved = await carveFeature(opts.repoPath, candidate) + if (carved) { + carveResults[idx] = carved + events.send({ type: 'feature_status', featureId: candidate.id, status: 'carved', detail: `${carved.operations.length} file operations` }) + } else { + events.send({ type: 'feature_status', featureId: candidate.id, status: 'carve_failed', detail: 'Carve produced no changes.' }) + } + } catch (error) { + const msg = getErrorMessage(error) + events.send({ type: 'feature_status', featureId: candidate.id, status: 'carve_failed', detail: msg.slice(0, 200) }) } - } catch (error) { - const msg = error instanceof Error ? 
error.message : String(error) - events.send({ type: 'feature_status', featureId: candidate.id, status: 'carve_failed', detail: msg.slice(0, 200) }) + carveCompleted++ + updateSpinner(`Carving ${carveCompleted}/${carveQueue.length} features...`) } - carveCompleted++ - updateSpinner(`Carving ${carveCompleted}/${carveQueue.length} features...`) } - } - await Promise.all( - Array.from({ length: Math.min(CARVE_PARALLELISM, carveQueue.length) }, () => carveWorker()), - ) - for (const result of carveResults) { - if (result) features.push(result) + await Promise.all( + Array.from({ length: Math.min(CARVE_PARALLELISM, carveQueue.length) }, () => carveWorker()), + ) + for (const result of carveResults) { + if (result) features.push(result) + } + stopSpinner(` Carved ${features.length}/${carveQueue.length} features`) } - stopSpinner(` Carved ${features.length}/${carveQueue.length} features`) } - } - - if (features.length === 0) { - console.error('No features were successfully carved. Aborting.') - return - } - - // Pre-compute ground truth diffs - const groundTruthDiffs = new Map() - for (const feature of features) { - groundTruthDiffs.set(feature.id, getGroundTruthDiff(feature)) - } - fs.writeFileSync(path.join(logDir, 'features.json'), JSON.stringify(features, null, 2)) - - // --- Baseline evaluation --- - events.send({ type: 'phase_change', phase: 'evaluating', round: 0, detail: 'Baseline' }) - - const baseline = await runEvalRound(features, groundTruthDiffs, opts, 0) - saveRoundResults(logDir, baseline) - - let totalCost = baseline.totalCost - const roundResults: RoundResult[] = [baseline] - const baselineRejudgeResults: RoundResult[] = [] - const loopDocGateResults: LoopDocGateResult[] = [] - const allProjectSuggestionSections: string[] = [] - - // Collect project suggestions from baseline - const baselineProjectSuggestions = collectProjectSuggestions(baseline.tasks.filter(t => t.score >= 0)) - if (baselineProjectSuggestions) allProjectSuggestionSections.push(`## 
Baseline Round\n\n${baselineProjectSuggestions}`) - - // --- Improvement round --- - const improvementRound = 1 - console.log(`\n\x1b[1mImprovement Round\x1b[0m`) - - const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) - events.send({ type: 'phase_change', phase: 'evaluating', round: improvementRound, loop: improvementRound, detail: 'Re-eval with updated docs' }) - const featureGateResults: FeatureDocGateResult[] = [] - const featureGateArtifacts: FeatureDocGateArtifacts[] = [] - const results = await runEvalRound( - features, - groundTruthDiffs, - opts, - improvementRound, - baseline.avgScore, - async ({ feature, task }) => { - const gated = await gateDocsChangesForTask({ - feature, - task, - opts, - groundTruthDiffs, - loop: improvementRound, - }) - featureGateResults.push(gated.result) - featureGateArtifacts.push(gated.artifacts) - return gated.validationCost - }, - ) - - totalCost += results.totalCost - roundResults.push(results) - - const loopDocGateResult: LoopDocGateResult = { - loop: improvementRound, - threshold: DOC_CHANGE_ACCEPTANCE_THRESHOLD, - fastAcceptThreshold: DOC_CHANGE_FAST_ACCEPT_THRESHOLD, - features: featureGateResults, - } - loopDocGateResults.push(loopDocGateResult) - - const docsAfterRefactor = getDocsSnapshot(opts.repoPath) - const docsDiffText = computeDocsDiffText(docsSnapshotBefore, docsAfterRefactor) - const loopSummaryText = renderLoopDocGateSummary(loopDocGateResult) - fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${improvementRound}.txt`), loopSummaryText) - fs.writeFileSync(path.join(logDir, `docs-diff-loop-${improvementRound}.txt`), docsDiffText) - fs.writeFileSync( - path.join(logDir, `docs-state-loop-${improvementRound}.json`), - JSON.stringify(docsAfterRefactor, null, 2), - ) - saveLoopDocGateResults(logDir, loopDocGateResult) - saveLoopDocGateArtifacts(logDir, improvementRound, featureGateArtifacts) - saveRoundResults(logDir, results) - - const rejudged = await runBaselineRejudgeRound(baseline, features, 
groundTruthDiffs, opts, improvementRound) - saveBaselineRejudgeResults(logDir, rejudged) - baselineRejudgeResults.push(rejudged) - - const loopProjectSuggestions = collectProjectSuggestions(results.tasks.filter(t => t.score >= 0)) - if (loopProjectSuggestions) allProjectSuggestionSections.push(`## Improvement Round\n\n${loopProjectSuggestions}`) - - // --- Generate project improvement prompts --- - let projectPrompts: string[] = [] - const allProjectSuggestionsText = allProjectSuggestionSections.join('\n\n') - if (allProjectSuggestionsText.trim()) { - fs.writeFileSync(path.join(logDir, 'project-suggestions-raw.txt'), allProjectSuggestionsText) - startSpinner('Generating project improvement prompts...') - projectPrompts = await runPromptWriterAgent(opts.repoPath, allProjectSuggestionsText, opts.docsModel) - stopSpinner() - if (projectPrompts.length > 0) { - fs.writeFileSync(path.join(logDir, 'project-prompts.json'), JSON.stringify(projectPrompts, null, 2)) + if (features.length === 0) { + completionDetail = 'Run aborted: no features were successfully carved.' + console.error('No features were successfully carved. 
Aborting.') + return } - } - // --- Final output --- - const endTime = new Date().toISOString() + // Pre-compute ground truth diffs + const groundTruthDiffs = new Map() + for (const feature of features) { + groundTruthDiffs.set(feature.id, getGroundTruthDiff(feature)) + } - const summary: EvalSummary = { - repoPath: opts.repoPath, - startTime, - endTime, - featuresCarved: features.length, - rounds: roundResults.map((r) => ({ - round: r.round, - avgScore: r.avgScore, - scores: Object.fromEntries(r.tasks.map((t) => [t.featureId, t.score])), - totalCost: r.totalCost, - })), - totalCost, - scoreProgression: roundResults.map((r) => r.avgScore), - baselineRejudgeProgression: baselineRejudgeResults.map((r) => r.avgScore), - consideredDocChangesByLoop: loopDocGateResults.map((result) => countLoopDocChanges(result).considered), - acceptedDocChangesByLoop: loopDocGateResults.map((result) => countLoopDocChanges(result).accepted), - projectPrompts, - } + fs.writeFileSync(path.join(logDir, 'features.json'), JSON.stringify(features, null, 2)) - saveSummary(logDir, summary, roundResults, opts, baselineRejudgeResults, loopDocGateResults, projectPrompts) + // --- Baseline evaluation --- + events.send({ type: 'phase_change', phase: 'evaluating', round: 0, detail: 'Baseline' }) + + const baseline = await runEvalRound(features, groundTruthDiffs, opts, 0) + saveRoundResults(logDir, baseline) + + totalCost = baseline.totalCost + const roundResults: RoundResult[] = [baseline] + scoreProgression = roundResults.map((round) => round.avgScore) + const baselineRejudgeResults: RoundResult[] = [] + const loopDocGateResults: LoopDocGateResult[] = [] + const allProjectSuggestionSections: string[] = [] + + // Collect project suggestions from baseline + const baselineProjectSuggestions = collectProjectSuggestions(baseline.tasks.filter(t => t.score >= 0)) + if (baselineProjectSuggestions) allProjectSuggestionSections.push(`## Baseline Round\n\n${baselineProjectSuggestions}`) + + // --- Improvement 
round --- + const improvementRound = 1 + console.log(`\n\x1b[1mImprovement Round\x1b[0m`) + + const docsSnapshotBefore = getDocsSnapshot(opts.repoPath) + events.send({ type: 'phase_change', phase: 'evaluating', round: improvementRound, loop: improvementRound, detail: 'Re-eval with updated docs' }) + const featureGateResults: FeatureDocGateResult[] = [] + const featureGateArtifacts: FeatureDocGateArtifacts[] = [] + const results = await runEvalRound( + features, + groundTruthDiffs, + opts, + improvementRound, + baseline.avgScore, + async ({ feature, task }) => { + const gated = await gateDocsChangesForTask({ + feature, + task, + opts, + groundTruthDiffs, + loop: improvementRound, + }) + featureGateResults.push(gated.result) + featureGateArtifacts.push(gated.artifacts) + return gated.validationCost + }, + ) - events.send({ - type: 'run_complete', - scoreProgression: summary.scoreProgression, - totalCost, - duration: `${startTime} → ${endTime}`, - }) - events.close() + totalCost += results.totalCost + roundResults.push(results) + scoreProgression = roundResults.map((round) => round.avgScore) - // Print score table across all rounds - printScoreTable(roundResults, baselineRejudgeResults) + const loopDocGateResult: LoopDocGateResult = { + loop: improvementRound, + threshold: DOC_CHANGE_ACCEPTANCE_THRESHOLD, + fastAcceptThreshold: DOC_CHANGE_FAST_ACCEPT_THRESHOLD, + features: featureGateResults, + } + loopDocGateResults.push(loopDocGateResult) + + const docsAfterRefactor = getDocsSnapshot(opts.repoPath) + const docsDiffText = computeDocsDiffText(docsSnapshotBefore, docsAfterRefactor) + const loopSummaryText = renderLoopDocGateSummary(loopDocGateResult) + fs.writeFileSync(path.join(logDir, `judge-suggestions-loop-${improvementRound}.txt`), loopSummaryText) + fs.writeFileSync(path.join(logDir, `docs-diff-loop-${improvementRound}.txt`), docsDiffText) + fs.writeFileSync( + path.join(logDir, `docs-state-loop-${improvementRound}.json`), + JSON.stringify(docsAfterRefactor, 
null, 2), + ) + saveLoopDocGateResults(logDir, loopDocGateResult) + saveLoopDocGateArtifacts(logDir, improvementRound, featureGateArtifacts) + saveRoundResults(logDir, results) + + const rejudged = await runBaselineRejudgeRound(baseline, features, groundTruthDiffs, opts, improvementRound) + saveBaselineRejudgeResults(logDir, rejudged) + baselineRejudgeResults.push(rejudged) + + const loopProjectSuggestions = collectProjectSuggestions(results.tasks.filter(t => t.score >= 0)) + if (loopProjectSuggestions) allProjectSuggestionSections.push(`## Improvement Round\n\n${loopProjectSuggestions}`) + + // --- Generate project improvement prompts --- + let projectPrompts: string[] = [] + const allProjectSuggestionsText = allProjectSuggestionSections.join('\n\n') + if (allProjectSuggestionsText.trim()) { + fs.writeFileSync(path.join(logDir, 'project-suggestions-raw.txt'), allProjectSuggestionsText) + startSpinner('Generating project improvement prompts...') + projectPrompts = await runPromptWriterAgent(opts.repoPath, allProjectSuggestionsText, opts.docsModel) + stopSpinner() + if (projectPrompts.length > 0) { + fs.writeFileSync(path.join(logDir, 'project-prompts.json'), JSON.stringify(projectPrompts, null, 2)) + } + } - // Print project improvement prompts - printProjectPrompts(projectPrompts) + // --- Final output --- + const endTime = new Date().toISOString() + + const summary: EvalSummary = { + repoPath: opts.repoPath, + startTime, + endTime, + featuresCarved: features.length, + rounds: roundResults.map((r) => ({ + round: r.round, + avgScore: r.avgScore, + scores: Object.fromEntries(r.tasks.map((t) => [t.featureId, t.score])), + totalCost: r.totalCost, + })), + totalCost, + scoreProgression: roundResults.map((r) => r.avgScore), + baselineRejudgeProgression: baselineRejudgeResults.map((r) => r.avgScore), + consideredDocChangesByLoop: loopDocGateResults.map((result) => countLoopDocChanges(result).considered), + acceptedDocChangesByLoop: loopDocGateResults.map((result) => 
countLoopDocChanges(result).accepted), + projectPrompts, + } - // Final summary line - printFinalSummary({ - startTime, - endTime, - features: features.length, - totalCost, - scoreProgression: summary.scoreProgression, - baselineRejudgeProgression: summary.baselineRejudgeProgression || [], - promptCount: projectPrompts.length, - logDir, - reportPath: path.join(logDir, 'report.md'), - }) + scoreProgression = summary.scoreProgression + saveSummary(logDir, summary, roundResults, opts, baselineRejudgeResults, loopDocGateResults, projectPrompts) + + // Print score table across all rounds + printScoreTable(roundResults, baselineRejudgeResults) + + // Print project improvement prompts + printProjectPrompts(projectPrompts) + + // Final summary line + printFinalSummary({ + startTime, + endTime, + features: features.length, + totalCost, + scoreProgression: summary.scoreProgression, + baselineRejudgeProgression: summary.baselineRejudgeProgression || [], + promptCount: projectPrompts.length, + logDir, + reportPath: path.join(logDir, 'report.md'), + }) + } catch (error) { + completionDetail = `Run failed: ${getErrorMessage(error).slice(0, 200)}` + throw error + } finally { + const endTime = new Date().toISOString() + events.send({ type: 'phase_change', phase: 'complete', detail: completionDetail }) + events.send({ + type: 'run_complete', + scoreProgression, + totalCost, + duration: `${startTime} → ${endTime}`, + }) + events.close() + } } // --- CLI entry point --- diff --git a/src/setup-e2e-repos.ts b/src/setup-e2e-repos.ts new file mode 100644 index 0000000..2b556a7 --- /dev/null +++ b/src/setup-e2e-repos.ts @@ -0,0 +1,128 @@ +#!/usr/bin/env bun +/** + * Usage: + * bun run src/setup-e2e-repos.ts [--root /path/to/test-repos] + * [--repo mock-simple --repo codebuff --repo manifold] + * [--force] + */ +import path from 'path' + +import { + BENCHMARK_REPO_IDS, + DEFAULT_TEST_REPOS_ROOT, + FIXTURE_MANIFEST_FILENAME, + type BenchmarkRepoId, + setupBenchmarkRepos, +} from 
'./e2e-repos' + +interface SetupE2EReposCliOptions { + rootDir: string + repoIds?: BenchmarkRepoId[] + force: boolean +} + +function parseArgs(argv: string[]): SetupE2EReposCliOptions { + const options: SetupE2EReposCliOptions = { + rootDir: DEFAULT_TEST_REPOS_ROOT, + force: false, + } + + const repoIds: BenchmarkRepoId[] = [] + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index] + + if (arg === '--root') { + const value = argv[index + 1] + if (!value) { + throw new Error('Missing value for --root') + } + options.rootDir = path.resolve(value) + index += 1 + continue + } + + if (arg === '--repo') { + const value = argv[index + 1] + if (!value) { + throw new Error('Missing value for --repo') + } + if (!BENCHMARK_REPO_IDS.includes(value as BenchmarkRepoId)) { + throw new Error( + `Unknown repo id "${value}". Expected one of: ${BENCHMARK_REPO_IDS.join(', ')}`, + ) + } + repoIds.push(value as BenchmarkRepoId) + index += 1 + continue + } + + if (arg === '--force') { + options.force = true + continue + } + + throw new Error(`Unknown argument: ${arg}`) + } + + if (repoIds.length > 0) { + options.repoIds = repoIds + } + + return options +} + +function printHelp(): void { + console.log(`Usage: bun run src/setup-e2e-repos.ts [options] + +Options: + --root Directory where benchmark repos will be created + --repo Repo to set up; may be repeated + --force Rebuild existing fixtures in place + -h, --help Show this help message + +Repo ids: + ${BENCHMARK_REPO_IDS.join(', ')}`) +} + +function main(): void { + const args = process.argv.slice(2) + + if (args.includes('--help') || args.includes('-h')) { + printHelp() + return + } + + const options = parseArgs(args) + + if (options.rootDir !== DEFAULT_TEST_REPOS_ROOT) { + console.log(`Using custom root: ${options.rootDir}`) + } + if (options.repoIds) { + console.log(`Selected repos: ${options.repoIds.join(', ')}`) + } + if (options.force) { + console.log('Force rebuild enabled.') + } + + const results = 
setupBenchmarkRepos({ + ...options, + log: (message) => console.log(message), + }) + + console.log(`Wrote manifest: ${path.join(options.rootDir, FIXTURE_MANIFEST_FILENAME)}`) + for (const result of results) { + console.log(`${result.id}: ${result.repoPath} @ ${result.headSha}`) + } +} + +if (import.meta.main) { + try { + main() + } catch (error) { + console.error( + error instanceof Error ? error.message : `Unexpected error: ${String(error)}`, + ) + process.exit(1) + } +} diff --git a/test-repos/.gitignore b/test-repos/.gitignore new file mode 100644 index 0000000..7c9d611 --- /dev/null +++ b/test-repos/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!README.md diff --git a/test-repos/README.md b/test-repos/README.md new file mode 100644 index 0000000..6dd8519 --- /dev/null +++ b/test-repos/README.md @@ -0,0 +1,16 @@ +# E2E Test Repos + +Generated benchmark repos live here and stay untracked. + +Set them up with: + +```bash +bun run setup:e2e-repos +``` + +Pinned upstream sources: + +- `codebuff`: `CodebuffAI/codebuff` at `f95f9a58ebcfcfecc8c6ffcfbe6d606ec1278e54`, plus a local commit that removes `docs/` and rewrites `AGENTS.md` +- `manifold`: `manifoldmarkets/manifold` at `89c1b733190ff717ff7f7d7fb6206b09c61aebd1`, plus a local commit that renames `docs/` to `external-docs/` + +The generated mock repo is created locally and committed deterministically. From a71c017cfb00995f2b6c16783c9f8dc6fc7f3b4e Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 9 Apr 2026 13:03:14 -0700 Subject: [PATCH 7/9] Cleanup carve features. Use full access sandbox. 
Propogate and print errors --- src/carve-features.ts | 119 ++++++++++++++++++------------------------ src/judge.ts | 2 +- src/runners/codex.ts | 1 + 3 files changed, 52 insertions(+), 70 deletions(-) diff --git a/src/carve-features.ts b/src/carve-features.ts index 03d8cbc..c995a55 100644 --- a/src/carve-features.ts +++ b/src/carve-features.ts @@ -52,11 +52,36 @@ export interface CarveResult { repoPath: string generationDate: string features: CarvedFeature[] + failures: { id: string; error: string }[] } -// --- Constants --- - -const RESULT_FILE = 'evalbuff-carve-result.json' +// --- Structured output schema for planFeatures --- + +const carvePlanSchema = { + type: 'object', + properties: { + reasoning: { type: 'string', description: 'Analysis of the codebase and approach to selecting features' }, + candidates: { + type: 'array', + items: { + type: 'object', + properties: { + id: { type: 'string', description: 'Short kebab-case identifier' }, + name: { type: 'string', description: 'Human readable name' }, + prompt: { type: 'string', description: 'Natural prompt a developer would use to ask for this feature' }, + description: { type: 'string', description: 'What this feature does and why it exists' }, + files: { type: 'array', items: { type: 'string' }, description: 'Files that ARE the feature (to delete or modify)' }, + relevantFiles: { type: 'array', items: { type: 'string' }, description: 'Other files that import or reference the feature' }, + complexity: { type: 'string', enum: ['small', 'medium', 'large'] }, + }, + required: ['id', 'name', 'prompt', 'description', 'files', 'relevantFiles', 'complexity'], + additionalProperties: false, + }, + }, + }, + required: ['reasoning', 'candidates'], + additionalProperties: false, +} as const // --- Phase 1: Identify features to carve (Codex agent) --- @@ -69,6 +94,7 @@ export async function planFeatures(repoPath: string): Promise { model: 'gpt-5.4', workingDirectory: repoPath, approvalPolicy: 'never', + sandboxMode: 
'danger-full-access', webSearchMode: 'live', modelReasoningEffort: 'high', }) @@ -103,53 +129,28 @@ Each feature should: ## Output -After your analysis, write a file called \`${RESULT_FILE}\` with this JSON structure: - -\`\`\`json -{ - "reasoning": "Your analysis of the codebase and approach to selecting features", - "candidates": [ - { - "id": "short-kebab-id", - "name": "Human readable name", - "prompt": "Natural prompt a developer would use to ask for this feature, 1-2 sentences", - "description": "What this feature does and why it exists", - "files": ["path/to/file1.ts", "path/to/file2.tsx"], - "relevantFiles": ["path/to/importer.ts"], - "complexity": "small|medium|large" - } - ] -} -\`\`\` - - **files**: The files that ARE the feature (to be deleted or modified to remove it). Be thorough — missing a file means the carve won't be clean. -- **relevantFiles**: Other files that import or reference the feature. - -You MUST write the result file as your last action.` +- **relevantFiles**: Other files that import or reference the feature.` - const result = await thread.run(prompt) - console.log(`planFeatures: Codex finished. Response length: ${result.finalResponse.length}`) + const result = await thread.run(prompt, { outputSchema: carvePlanSchema }) - // Read the result file - const resultPath = path.join(repoPath, RESULT_FILE) - if (!fs.existsSync(resultPath)) { - // Try to extract from the agent's final response - const jsonMatch = result.finalResponse.match(/\{[\s\S]*"candidates"[\s\S]*\}/) - if (jsonMatch) { - console.log('planFeatures: extracted plan from final response (no result file written)') - return JSON.parse(jsonMatch[0]) as CarvePlan - } + // With structured output, finalResponse is guaranteed valid JSON + let plan: CarvePlan + try { + plan = JSON.parse(result.finalResponse) as CarvePlan + } catch (error) { throw new Error( - `Codex agent did not produce a result file or extractable JSON. 
Final response: ${result.finalResponse?.slice(0, 500) || '(empty)'}`, + `Failed to parse structured output: ${error instanceof Error ? error.message : error}. Response: ${result.finalResponse?.slice(0, 500) || '(empty)'}`, ) } - try { - const raw = fs.readFileSync(resultPath, 'utf-8') - return JSON.parse(raw) as CarvePlan - } finally { - fs.rmSync(resultPath, { force: true }) + if (!plan.candidates?.length) { + throw new Error( + `Codex returned 0 candidates. Reasoning: ${plan.reasoning?.slice(0, 300) || '(none)'}`, + ) } + + return plan } // --- Phase 2: Carve a feature in an isolated worktree --- @@ -192,6 +193,7 @@ export async function carveFeature( model: 'gpt-5.4', workingDirectory: worktreePath, approvalPolicy: 'never', + sandboxMode: 'danger-full-access', webSearchMode: 'live', modelReasoningEffort: 'high', }) @@ -224,7 +226,6 @@ export async function carveFeature( Do NOT create any result files — just make the edits directly.` await thread.run(prompt) - console.log(` [carve:${candidate.id}] Codex finished`) // Capture the diff execSync('git add -A', { cwd: worktreePath, stdio: 'ignore' }) @@ -235,19 +236,12 @@ Do NOT create any result files — just make the edits directly.` }) if (!diff.trim()) { - console.warn( - ` [carve:${candidate.id}] Empty diff — Codex made no changes. Skipping.`, - ) return null } // Build operations from the actual git diff const operations = buildOperationsFromDiff(worktreePath, repoPath, candidate.files) - console.log( - ` [carve:${candidate.id}] Success: ${operations.length} file operations, ${diff.length} bytes diff`, - ) - return { id: candidate.id, prompt: candidate.prompt, @@ -258,10 +252,6 @@ Do NOT create any result files — just make the edits directly.` diff, } } catch (error) { - console.error( - ` [carve:${candidate.id}] Failed:`, - error instanceof Error ? 
error.message : error, - ) throw error } finally { // Clean up worktree and branch @@ -323,11 +313,8 @@ export async function carveFeatures( ): Promise { const { count = 10, outputPath } = options - console.log(`Carving features from: ${repoPath} (target: ${count})`) - // Phase 1: Plan const plan = await planFeatures(repoPath) - console.log(`Found ${plan.candidates.length} candidates`) // Select top N candidates (prefer medium complexity) const ranked = [...plan.candidates].sort((a, b) => { @@ -340,31 +327,23 @@ export async function carveFeatures( const features: CarvedFeature[] = [] const failures: { id: string; error: string }[] = [] for (const candidate of selected) { - console.log( - `\nCarving [${features.length + 1}/${selected.length}]: ${candidate.id} (${candidate.complexity})`, - ) try { const carved = await carveFeature(repoPath, candidate) if (carved) { features.push(carved) + } else { + failures.push({ id: candidate.id, error: 'empty diff' }) } } catch (error) { const message = error instanceof Error ? error.message : String(error) failures.push({ id: candidate.id, error: message }) } } - console.log(`\nCarved ${features.length}/${selected.length} features`) - if (failures.length > 0) { - console.warn(`Failed carves (${failures.length}):`) - for (const f of failures) { - console.warn(` - ${f.id}: ${f.error}`) - } - } - const result: CarveResult = { repoPath, generationDate: new Date().toISOString(), features, + failures, } // Save output @@ -372,7 +351,6 @@ export async function carveFeatures( outputPath || path.join(repoPath, `carve-${new Date().toISOString().slice(0, 10)}.json`) fs.writeFileSync(outPath, JSON.stringify(result, null, 2)) - console.log(`\nSaved ${features.length} carved features to: ${outPath}`) return result } @@ -396,6 +374,9 @@ if (import.meta.main) { carveFeatures(repoPath, { count, outputPath }) .then((result) => { console.log(`\nDone! 
Carved ${result.features.length} features.`) + for (const f of result.failures) { + console.error(` Failed: ${f.id} — ${f.error}`) + } }) .catch((error) => { console.error('Carving failed:', error) diff --git a/src/judge.ts b/src/judge.ts index 153dd78..48cf6c2 100644 --- a/src/judge.ts +++ b/src/judge.ts @@ -239,7 +239,7 @@ async function runCodexReviewer( model: 'gpt-5.4', workingDirectory: cwd, approvalPolicy: 'never', - sandboxMode: 'workspace-write', + sandboxMode: 'danger-full-access', webSearchMode: 'live', modelReasoningEffort: 'high', }) diff --git a/src/runners/codex.ts b/src/runners/codex.ts index 2e48d36..6208dfa 100644 --- a/src/runners/codex.ts +++ b/src/runners/codex.ts @@ -31,6 +31,7 @@ export class CodexRunner implements Runner { model: 'gpt-5.4', workingDirectory: this.cwd, approvalPolicy: 'never', + sandboxMode: 'danger-full-access', webSearchMode: 'live', }) From be764877fb7cafd37faaa0ee72e4de06d279090c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 9 Apr 2026 14:55:04 -0700 Subject: [PATCH 8/9] Save logs to .evalbuff which can be overriden with --output-dir --- docs/cli.md | 10 ++++++---- docs/run-artifacts.md | 8 +++++--- src/cli.ts | 3 +++ src/perfect-feature.ts | 6 +++++- src/run-evalbuff.ts | 23 +++++++++++++++++++++-- src/tui/app.tsx | 17 +++++++++++++++-- src/tui/main.tsx | 29 +++++++++++++++-------------- 7 files changed, 70 insertions(+), 26 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index a1e2642..ab25d36 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -9,10 +9,11 @@ [--init-command "npm install"] \ [--coding-model sonnet] \ [--docs-model opus] \ - [--cached-features /path/to/features.json] + [--cached-features /path/to/features.json] \ + [--output-dir /path/to/output] ``` -All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. 
Evalbuff now always runs a single sequential improvement round after baseline, and carve concurrency is an internal fixed constant rather than a public flag. +All flags are parsed explicitly in the `import.meta.main` block. Required flags must be validated with helpful errors. The `--cached-features` flag skips planning/carving and loads pre-carved features directly. The `--output-dir` flag overrides the default artifact location (`/.evalbuff`). Evalbuff now always runs a single sequential improvement round after baseline, and carve concurrency is an internal fixed constant rather than a public flag. ## Perfect Feature (Single-Feature Optimizer) @@ -26,7 +27,8 @@ bun run src/perfect-feature.ts \ [--judge-model opus] \ [--analyzer-model opus] \ [--docs-model opus] \ - [--init-command "npm install"] + [--init-command "npm install"] \ + [--output-dir /path/to/output] ``` Iteratively rebuilds a single feature: rebuild → judge → diagnose → update docs → repeat until 10/10 or max rounds. @@ -71,7 +73,7 @@ bun run tui -- --repo /path/to/repo # Start a live run with TUI attac **Navigation**: `Enter` drills into detail screens, `Esc` goes back, `q` quits. Arrow keys and `j`/`k` navigate lists. -**Run discovery**: On macOS, run directories may appear under both `os.tmpdir()` (which resolves through `/private/var/...`) and `/tmp`. Discovery logic must scan both locations to find all runs. +**Run discovery**: The TUI scans `.evalbuff/` in the current working directory (the default output location) as well as legacy temp locations (`os.tmpdir()` and `/tmp` on macOS) to find all runs. 
## CLI Conventions diff --git a/docs/run-artifacts.md b/docs/run-artifacts.md index 500cf65..47c3a34 100644 --- a/docs/run-artifacts.md +++ b/docs/run-artifacts.md @@ -1,9 +1,9 @@ # Run Artifacts -Every evalbuff run writes artifacts to a timestamped directory under `os.tmpdir()`: +Every evalbuff run writes artifacts to a timestamped directory under `/.evalbuff/` by default (overridable with `--output-dir`): ``` -$TMPDIR/evalbuff-run-YYYY-MM-DDTHH-MM-SS/ +/.evalbuff/run-YYYY-MM-DDTHH-MM-SS/ ├── plan.json # CarvePlan (only if features were freshly planned) ├── features.json # CarvedFeature[] — the selected features ├── events.jsonl # Timestamped event stream for TUI replay @@ -128,7 +128,9 @@ Cumulative metrics like `totalCost` from `round_complete` events are **run total ## Run Directory Naming -Valid run directories match `evalbuff-run-YYYY-MM-DDTHH-MM-SS`. Scratch directories like `evalbuff-run-review-*` or `evalbuff-run-` are not real runs and must not be treated as such by discovery logic. +Valid run directories match `run-YYYY-MM-DDTHH-MM-SS` (under `.evalbuff/`) or the legacy `evalbuff-run-YYYY-MM-DDTHH-MM-SS` (under temp). Scratch directories like `evalbuff-run-review-*` or `evalbuff-run-` are not real runs and must not be treated as such by discovery logic. + +The `.evalbuff/` directory is automatically added to the repo's `.gitignore` on first run. ## report.md Contract diff --git a/src/cli.ts b/src/cli.ts index 92d87e7..0db6294 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -31,6 +31,7 @@ Options: --coding-model Model for coding agent (default: sonnet) --docs-model Model for docs agent (default: opus) --cached-features Path to pre-computed features JSON + --output-dir Base directory for run artifacts (default: /.evalbuff) -V, --version Show version number -h, --help Show this help message`) process.exit(0) @@ -50,6 +51,7 @@ const initCommand = hasArg('init-command') ? 
getArg('init-command') : undefined const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') const cachedFeatures = hasArg('cached-features') ? getArg('cached-features') : undefined +const outputDir = hasArg('output-dir') ? getArg('output-dir') : undefined runEvalbuff({ repoPath, @@ -58,6 +60,7 @@ runEvalbuff({ codingModel, docsModel, cachedFeatures, + outputDir, }).catch((error) => { console.error('Evalbuff run failed:', error) process.exit(1) diff --git a/src/perfect-feature.ts b/src/perfect-feature.ts index 4ce6e9a..be98284 100644 --- a/src/perfect-feature.ts +++ b/src/perfect-feature.ts @@ -58,6 +58,7 @@ interface PerfectFeatureOptions { analyzerModel: string docsModel: string initCommand?: string + outputDir?: string } interface RoundOutcome { @@ -550,7 +551,8 @@ async function runRebuildAndJudge(opts: { async function perfectFeature(opts: PerfectFeatureOptions): Promise { const startTime = new Date().toISOString() - const logDir = path.join(os.tmpdir(), `evalbuff-perfect-${opts.featureId}-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`) + const baseDir = opts.outputDir ?? path.join(opts.repoPath, '.evalbuff') + const logDir = path.join(baseDir, `perfect-${opts.featureId}-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`) fs.mkdirSync(logDir, { recursive: true }) console.log(`\nPerfect Feature`) @@ -815,6 +817,7 @@ if (import.meta.main) { const analyzerModel = getArg('analyzer-model', 'opus') const docsModel = getArg('docs-model', 'opus') const initCommand = hasArg('init-command') ? getArg('init-command') : undefined + const outputDir = hasArg('output-dir') ? 
getArg('output-dir') : undefined perfectFeature({ repoPath, @@ -826,6 +829,7 @@ if (import.meta.main) { analyzerModel, docsModel, initCommand, + outputDir, }).catch((error) => { console.error('Perfect feature run failed:', error) process.exit(1) diff --git a/src/run-evalbuff.ts b/src/run-evalbuff.ts index 483ae43..4bb69a6 100644 --- a/src/run-evalbuff.ts +++ b/src/run-evalbuff.ts @@ -14,7 +14,6 @@ * bun run src/run-evalbuff.ts --repo /path/to/repo [--n 5] [--init-command "npm install"] */ import fs from 'fs' -import os from 'os' import path from 'path' import { planFeatures, carveFeature } from './carve-features' @@ -60,12 +59,28 @@ export interface EvalbuffOptions { codingModel: string // model for coding agents (default: sonnet) docsModel: string // model for docs agents (default: opus) cachedFeatures?: string // path to a features.json from a previous run + outputDir?: string // base directory for run artifacts (default: /.evalbuff) } const DOC_CHANGE_ACCEPTANCE_THRESHOLD = 0.5 const DOC_CHANGE_FAST_ACCEPT_THRESHOLD = DOC_CHANGE_ACCEPTANCE_THRESHOLD * 2 const CARVE_PARALLELISM = 10 +/** Ensure an entry exists in the repo's .gitignore. Creates the file if needed. */ +function ensureGitignore(repoPath: string, entry: string): void { + const gitignorePath = path.join(repoPath, '.gitignore') + try { + const content = fs.existsSync(gitignorePath) ? fs.readFileSync(gitignorePath, 'utf-8') : '' + const lines = content.split('\n') + if (!lines.some(line => line.trim() === entry)) { + const suffix = content.length > 0 && !content.endsWith('\n') ? '\n' : '' + fs.appendFileSync(gitignorePath, `${suffix}${entry}\n`) + } + } catch { + // Best-effort — don't fail the run over .gitignore + } +} + function getErrorMessage(error: unknown): string { return error instanceof Error ? 
error.message : String(error) } @@ -734,8 +749,10 @@ export async function gateDocsChangesForTask(args: { export async function runEvalbuff(opts: EvalbuffOptions): Promise { const startTime = new Date().toISOString() - const logDir = path.join(os.tmpdir(), `evalbuff-run-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`) + const baseDir = opts.outputDir ?? path.join(opts.repoPath, '.evalbuff') + const logDir = path.join(baseDir, `run-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`) fs.mkdirSync(logDir, { recursive: true }) + ensureGitignore(opts.repoPath, '.evalbuff') let totalCost = 0 let scoreProgression: number[] = [] @@ -1006,6 +1023,7 @@ if (import.meta.main) { const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') const cachedFeatures = hasArg('cached-features') ? getArg('cached-features') : undefined + const outputDir = hasArg('output-dir') ? getArg('output-dir') : undefined runEvalbuff({ repoPath, @@ -1014,6 +1032,7 @@ if (import.meta.main) { codingModel, docsModel, cachedFeatures, + outputDir, }).catch((error) => { console.error('Evalbuff run failed:', error) process.exit(1) diff --git a/src/tui/app.tsx b/src/tui/app.tsx index 72ccd62..cbdf1d3 100644 --- a/src/tui/app.tsx +++ b/src/tui/app.tsx @@ -746,7 +746,7 @@ function SummaryView({ state, logData, onBack }: { // Run Picker View // ============================================================ -/** Scan for evalbuff run directories across temp locations */ +/** Scan for evalbuff run directories across temp and .evalbuff locations */ function scanRunDirs(): RunInfo[] { const os = require('os') const fs = require('fs') @@ -754,7 +754,7 @@ function scanRunDirs(): RunInfo[] { const tmpDir = os.tmpdir() const dirs: string[] = [] - // Scan both os.tmpdir() and /tmp (they can differ on macOS) + // Scan legacy temp locations (os.tmpdir() and /tmp on macOS) for (const base of [tmpDir, '/tmp']) { try { for (const name of fs.readdirSync(base)) { @@ 
-768,6 +768,19 @@ function scanRunDirs(): RunInfo[] { } catch {} } + // Scan .evalbuff/ in the current working directory + const evalbuffDir = path.join(process.cwd(), '.evalbuff') + try { + for (const name of fs.readdirSync(evalbuffDir)) { + if (name.startsWith('run-') || name.startsWith('perfect-')) { + const full = path.join(evalbuffDir, name) + try { + if (fs.statSync(full).isDirectory()) dirs.push(full) + } catch {} + } + } + } catch {} + // Deduplicate and sort newest first const unique = [...new Set(dirs)].sort().reverse() diff --git a/src/tui/main.tsx b/src/tui/main.tsx index 6bdeb6e..9d6e65c 100644 --- a/src/tui/main.tsx +++ b/src/tui/main.tsx @@ -203,20 +203,20 @@ async function replayLogDir(logDir: string) { // --- Find recent log dirs --- function findRecentLogDirs(): string[] { - const tmpDir = require('os').tmpdir() + const dirs: string[] = [] + + // New default: scan .evalbuff/ in current working directory + const evalbuffDir = path.join(process.cwd(), '.evalbuff') try { - return fs.readdirSync(tmpDir) - .filter(name => name.startsWith('evalbuff-run-')) - .map(name => path.join(tmpDir, name)) - .filter(p => { - try { return fs.statSync(p).isDirectory() } catch { return false } - }) - .sort() - .reverse() - .slice(0, 10) - } catch { - return [] - } + for (const name of fs.readdirSync(evalbuffDir)) { + if (name.startsWith('run-') || name.startsWith('perfect-')) { + const full = path.join(evalbuffDir, name) + try { if (fs.statSync(full).isDirectory()) dirs.push(full) } catch {} + } + } + } catch {} + + return [...new Set(dirs)].sort().reverse().slice(0, 10) } // --- Demo mode --- @@ -347,8 +347,9 @@ async function main() { const codingModel = getArg('coding-model', 'sonnet') const docsModel = getArg('docs-model', 'opus') const cachedFeatures = hasArg('cached-features') ? getArg('cached-features') : undefined + const outputDir = hasArg('output-dir') ? 
getArg('output-dir') : undefined

-  runEvalbuff({ repoPath, n, initCommand, codingModel, docsModel, cachedFeatures }).catch(err => {
+  runEvalbuff({ repoPath, n, initCommand, codingModel, docsModel, cachedFeatures, outputDir }).catch(err => {
     events.log(`Run failed: ${err}`, 'error')
   })
 } else {

From da89de054ba565794874c3423849f0afc4d183b4 Mon Sep 17 00:00:00 2001
From: James Grugett
Date: Thu, 9 Apr 2026 18:09:37 -0700
Subject: [PATCH 9/9] Fix symlink copy in new repo

---
 src/eval-helpers.ts | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/eval-helpers.ts b/src/eval-helpers.ts
index 62eb469..2a48c4d 100644
--- a/src/eval-helpers.ts
+++ b/src/eval-helpers.ts
@@ -187,6 +187,22 @@ export function syncDocsIntoRepo(
   const targetDocs = getDocsSnapshot(targetRepoPath)
   const changed = new Set()
 
+  // Detect symlinks among root doc files so we can preserve them.
+  // When CLAUDE.md is a symlink to AGENTS.md, writing it as a regular file
+  // breaks the relationship — git patches that modify AGENTS.md won't
+  // propagate to CLAUDE.md, leaving it stale. 
+ const docSymlinks = new Map() + for (const file of ["AGENTS.md", "CLAUDE.md"]) { + const p = path.join(sourceRepoPath, file) + try { + if (fs.lstatSync(p).isSymbolicLink()) { + docSymlinks.set(file, fs.readlinkSync(p)) + } + } catch { + // file doesn't exist + } + } + for (const filePath of Object.keys(targetDocs)) { if (filePath in sourceDocs) continue fs.rmSync(path.join(targetRepoPath, filePath), { force: true }) @@ -195,6 +211,7 @@ export function syncDocsIntoRepo( } for (const [filePath, content] of Object.entries(sourceDocs)) { + if (docSymlinks.has(filePath)) continue if (targetDocs[filePath] === content) continue const absolutePath = path.join(targetRepoPath, filePath) fs.mkdirSync(path.dirname(absolutePath), { recursive: true }) @@ -202,6 +219,23 @@ export function syncDocsIntoRepo( changed.add(filePath) } + // Recreate symlinks so git patches propagate correctly + for (const [file, linkTarget] of docSymlinks) { + const absolutePath = path.join(targetRepoPath, file) + let alreadyCorrect = false + try { + alreadyCorrect = fs.lstatSync(absolutePath).isSymbolicLink() + && fs.readlinkSync(absolutePath) === linkTarget + } catch { + // doesn't exist yet + } + if (!alreadyCorrect) { + fs.rmSync(absolutePath, { force: true }) + fs.symlinkSync(linkTarget, absolutePath) + changed.add(file) + } + } + return [...changed].sort() }