From aecc736151b7068e0f58f087980f6b3d4070ce62 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 05:55:39 +0000 Subject: [PATCH 1/9] feat(#1048): capture workspace artifacts via snapshot baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When git is unavailable for a workspace (e.g. read-only path, no git binary), fall back to filesystem snapshots (captureSnapshot / diffFromSnapshots) so file_changes still captures files written by the agent. Additionally, copilot providers (cli, log, sdk) now scan their session-state files/ directory after each run and return those as ProviderResponse.fileChanges. The orchestrator merges provider-reported fileChanges into the result, capturing artifacts written outside workspace_path entirely (the core RED case from #1048). Changes: - file-changes.ts: add WorkspaceSnapshot, captureSnapshot, diffFromSnapshots, generateNewFileDiff, captureSessionArtifacts - types.ts: add fileChanges?: string to ProviderResponse - copilot-log/cli/sdk.ts: scan session-state/files/ and populate providerResponse.fileChanges - orchestrator.ts: three-strategy file_changes capture (git → snapshot fallback → provider-reported merge) - examples/features/workspace-artifact: GREEN example eval with code-grader proving file_changes captures CSV under workspace_path - file-changes.test.ts: 13 new tests for snapshot/diff/artifacts Co-Authored-By: Claude Sonnet 4.6 --- .../workspace-artifact/.agentv/targets.yaml | 11 + .../evals/dataset.eval.yaml | 50 +++++ .../scripts/check-csv-artifact.ts | 72 ++++++ .../workspace-template/.gitkeep | 0 packages/core/src/evaluation/orchestrator.ts | 72 +++++- .../src/evaluation/providers/copilot-cli.ts | 13 ++ .../src/evaluation/providers/copilot-log.ts | 16 ++ .../src/evaluation/providers/copilot-sdk.ts | 15 ++ .../core/src/evaluation/providers/types.ts | 10 + .../src/evaluation/workspace/file-changes.ts | 212 ++++++++++++++++++ 
.../evaluation/workspace/file-changes.test.ts | 130 +++++++++++ 11 files changed, 592 insertions(+), 9 deletions(-) create mode 100644 examples/features/workspace-artifact/.agentv/targets.yaml create mode 100644 examples/features/workspace-artifact/evals/dataset.eval.yaml create mode 100644 examples/features/workspace-artifact/scripts/check-csv-artifact.ts create mode 100644 examples/features/workspace-artifact/workspace-template/.gitkeep diff --git a/examples/features/workspace-artifact/.agentv/targets.yaml b/examples/features/workspace-artifact/.agentv/targets.yaml new file mode 100644 index 000000000..d30997ce7 --- /dev/null +++ b/examples/features/workspace-artifact/.agentv/targets.yaml @@ -0,0 +1,11 @@ +targets: + # Mock CLI agent that writes a CSV report to outputs/report.csv under workspace_path. + # Simulates what a real agent (e.g. Copilot) would do when asked to generate a report. + - name: mock_csv_agent + provider: cli + command: >- + bash -c ' + mkdir -p outputs && + printf "metric,value,status\ncoverage,87.3,pass\nlatency_p99_ms,142,pass\nerror_rate,0.02,pass\n" > outputs/report.csv && + echo "Generated outputs/report.csv" > {OUTPUT_FILE} + ' diff --git a/examples/features/workspace-artifact/evals/dataset.eval.yaml b/examples/features/workspace-artifact/evals/dataset.eval.yaml new file mode 100644 index 000000000..7d8d9124a --- /dev/null +++ b/examples/features/workspace-artifact/evals/dataset.eval.yaml @@ -0,0 +1,50 @@ +# Workspace artifact example +# +# Demonstrates that file_changes captures files generated by agents under +# workspace_path even when workspace_path is not a pre-existing git repo. +# +# Scenario: +# A mock CLI agent is asked to produce a CSV report. It writes the CSV +# directly into workspace_path (the temp workspace created from the +# template). AgentV takes a baseline snapshot before the agent runs and +# diffs it afterwards, populating file_changes with the new CSV content. 
+# A code grader then checks the CSV is present via {{file_changes}}. +# +# RED (before fix): Without workspace configured, agents like Copilot that +# save artifacts to their session-state path can't be evaluated because +# file_changes is always empty. +# +# GREEN (after fix): With workspace configured, the snapshot baseline tracks +# any file written under workspace_path — no git required. Provider-reported +# fileChanges additionally surfaces files written outside workspace_path +# (e.g. Copilot session-state) directly from the provider response. + +name: workspace-artifact +description: Verify file_changes captures generated artifacts (CSV) under workspace_path + +workspace: + template: ../workspace-template + +execution: + target: mock_csv_agent + +tests: + - id: csv-report-generated + criteria: >- + The agent must produce a CSV report at outputs/report.csv. + The file_changes diff should show the CSV was created with the correct + header row and at least one data row. + + input: + - role: user + content: + - type: text + value: >- + Generate a CSV report with analysis results and save it to + outputs/report.csv. The CSV must have a header row and at least + one data row. + + assertions: + - name: csv-in-file-changes + type: code-grader + command: ["bun", "run", "../scripts/check-csv-artifact.ts"] diff --git a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts new file mode 100644 index 000000000..018f4b21f --- /dev/null +++ b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts @@ -0,0 +1,72 @@ +#!/usr/bin/env bun +/** + * Code grader: checks that file_changes contains outputs/report.csv + * with a header row and at least one data row. + * + * This grader is intentionally self-contained — no LLM required. + * It proves the workspace-snapshot feature is working by inspecting + * the file_changes diff captured from the temp workspace. 
+ */ +import { readFileSync } from 'node:fs'; + +const input = JSON.parse(readFileSync('/dev/stdin', 'utf-8')) as { + file_changes: string | null; + criteria: string | null; +}; + +const fileChanges: string = input.file_changes ?? ''; + +const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; + +// Check 1: file_changes is non-empty +if (!fileChanges || fileChanges.trim().length === 0) { + assertions.push({ + text: 'file_changes is non-empty', + passed: false, + evidence: + 'file_changes is empty — workspace snapshot or git baseline may not be configured', + }); + console.log(JSON.stringify({ score: 0, assertions })); + process.exit(0); +} + +assertions.push({ text: 'file_changes is non-empty', passed: true }); + +// Check 2: diff mentions outputs/report.csv +const hasCsvFile = fileChanges.includes('outputs/report.csv'); +assertions.push({ + text: 'diff contains outputs/report.csv', + passed: hasCsvFile, + evidence: hasCsvFile + ? undefined + : `file_changes did not mention outputs/report.csv. Got:\n${fileChanges.slice(0, 500)}`, +}); + +// Extract CSV lines from the diff (lines starting with '+' that are not '+++') +const csvLines = fileChanges + .split('\n') + .filter((line) => line.startsWith('+') && !line.startsWith('+++')) + .map((line) => line.slice(1)); // strip leading '+' + +// Check 3: has header row (non-empty first content line) +const headerLine = csvLines[0] ?? ''; +const hasHeader = headerLine.includes(','); +assertions.push({ + text: 'CSV has a header row', + passed: hasHeader, + evidence: hasHeader ? undefined : `First CSV line: "${headerLine}"`, +}); + +// Check 4: has at least one data row +const dataRows = csvLines.slice(1).filter((l) => l.trim().length > 0 && l.includes(',')); +const hasDataRow = dataRows.length > 0; +assertions.push({ + text: 'CSV has at least one data row', + passed: hasDataRow, + evidence: hasDataRow ? 
undefined : 'No data rows found after the header', +}); + +const passed = assertions.filter((a) => a.passed).length; +const score = passed / assertions.length; + +console.log(JSON.stringify({ score, assertions })); diff --git a/examples/features/workspace-artifact/workspace-template/.gitkeep b/examples/features/workspace-artifact/workspace-template/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cbc8ef3e9..ca419efb7 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -66,8 +66,11 @@ import type { } from './types.js'; import { captureFileChanges as captureWorkspaceFileChanges, + captureSnapshot, + diffFromSnapshots, initializeBaseline, } from './workspace/file-changes.js'; +import type { WorkspaceSnapshot } from './workspace/file-changes.js'; import { cleanupEvalWorkspaces, cleanupWorkspace, @@ -192,6 +195,8 @@ export interface RunEvalCaseOptions { readonly sharedWorkspacePath?: string; /** Pre-initialized baseline commit for shared workspace */ readonly sharedBaselineCommit?: string; + /** Snapshot baseline for shared workspace (fallback when git is unavailable) */ + readonly sharedBaselineSnapshot?: WorkspaceSnapshot; /** Suite-level .code-workspace file (resolved from workspace.template) */ readonly suiteWorkspaceFile?: string; /** Real-time observability callbacks passed to the provider */ @@ -616,6 +621,7 @@ export async function runEvaluation( const limit = pLimit(workers); let sharedWorkspacePath: string | undefined; let sharedBaselineCommit: string | undefined; + let sharedBaselineSnapshot: WorkspaceSnapshot | undefined; let beforeAllOutput: string | undefined; let poolManager: WorkspacePoolManager | undefined; @@ -625,6 +631,7 @@ export async function runEvaluation( const poolSlots: PoolSlot[] = []; const availablePoolSlots: PoolSlot[] = []; const poolSlotBaselines = new Map(); + 
const poolSlotSnapshots = new Map(); // Pool capacity: how many slots can exist on disk (independent of worker count). // Workers acquire slots from the pool; the pool itself can be larger than any single run needs. @@ -832,18 +839,23 @@ export async function runEvaluation( } } - // Initialize git baseline for shared workspace + // Initialize baseline for shared workspace (git first, snapshot fallback) if (sharedWorkspacePath) { try { sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath); setupLog(`shared baseline initialized: ${sharedBaselineCommit}`); } catch { - // Non-fatal: file change tracking is best-effort - setupLog('shared baseline initialization skipped (non-fatal)'); + // Git failed — try snapshot fallback + try { + sharedBaselineSnapshot = await captureSnapshot(sharedWorkspacePath); + setupLog('shared baseline snapshot captured (git unavailable)'); + } catch { + setupLog('shared baseline initialization skipped (non-fatal)'); + } } } - // Multi-slot pool: initialize baselines per slot + // Multi-slot pool: initialize baselines per slot (git first, snapshot fallback) if (availablePoolSlots.length > 0) { for (const slot of availablePoolSlots) { try { @@ -851,7 +863,13 @@ export async function runEvaluation( poolSlotBaselines.set(slot.path, baseline); setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`); } catch { - setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`); + try { + const snapshot = await captureSnapshot(slot.path); + poolSlotSnapshots.set(slot.path, snapshot); + setupLog(`pool slot ${slot.index} baseline snapshot captured (git unavailable)`); + } catch { + setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`); + } } } } @@ -965,6 +983,9 @@ export async function runEvaluation( const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit; + const testBaselineSnapshot = testPoolSlot + ? 
poolSlotSnapshots.get(testPoolSlot.path) + : sharedBaselineSnapshot; try { const graderProvider = await resolveGraderProvider(target); @@ -988,6 +1009,7 @@ export async function runEvaluation( retainOnFailure: resolvedRetainOnFailure, sharedWorkspacePath: testWorkspacePath, sharedBaselineCommit: testBaselineCommit, + sharedBaselineSnapshot: testBaselineSnapshot, suiteWorkspaceFile, streamCallbacks, typeRegistry, @@ -1395,6 +1417,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise 0) { @@ -1774,6 +1810,24 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise 0) { + fileChanges = diff; + } + } catch { + // Non-fatal + } + } + + // Strategy 3: provider-reported artifacts (files written outside workspace_path, + // e.g. copilot session-state). Merged on top of any workspace-based diff. + const providerFileChanges = providerResponse?.fileChanges; + if (providerFileChanges) { + fileChanges = fileChanges ? `${fileChanges}\n${providerFileChanges}` : providerFileChanges; } const providerError = extractProviderError(providerResponse); diff --git a/packages/core/src/evaluation/providers/copilot-cli.ts b/packages/core/src/evaluation/providers/copilot-cli.ts index 31684af05..b95a85bf2 100644 --- a/packages/core/src/evaluation/providers/copilot-cli.ts +++ b/packages/core/src/evaluation/providers/copilot-cli.ts @@ -1,5 +1,6 @@ import { randomUUID } from 'node:crypto'; import { mkdir } from 'node:fs/promises'; +import { homedir } from 'node:os'; import path from 'node:path'; import { Readable, Writable } from 'node:stream'; @@ -8,6 +9,7 @@ import { spawn } from 'node:child_process'; import * as acp from '@agentclientprotocol/sdk'; +import { captureSessionArtifacts } from '../workspace/file-changes.js'; import { recordCopilotCliLogEntry } from './copilot-cli-log-tracker.js'; import { CopilotStreamLogger, @@ -298,6 +300,16 @@ export class CopilotCliProvider implements Provider { }); } + // Capture session artifacts from session-state 
`files/` directory. + // Copilot may write generated files (e.g. CSV reports) there instead of + // the session cwd, so they wouldn't be captured by workspace git diff. + const sessionId = session.sessionId as string | undefined; + const fileChanges = sessionId + ? await captureSessionArtifacts( + path.join(homedir(), '.copilot', 'session-state', sessionId, 'files'), + ).catch(() => undefined) + : undefined; + return { raw: { model: this.config.model, @@ -310,6 +322,7 @@ export class CopilotCliProvider implements Provider { durationMs, startTime, endTime, + ...(fileChanges ? { fileChanges } : {}), }; } finally { await logger?.close(); diff --git a/packages/core/src/evaluation/providers/copilot-log.ts b/packages/core/src/evaluation/providers/copilot-log.ts index cd5471204..02b8ef2d5 100644 --- a/packages/core/src/evaluation/providers/copilot-log.ts +++ b/packages/core/src/evaluation/providers/copilot-log.ts @@ -16,11 +16,20 @@ * The invoke() method ignores request.question since no process is spawned. * It reads the transcript file and returns a ProviderResponse with the * parsed Message[] in the output field. + * + * File-change tracking: + * After reading the transcript, the provider automatically scans the + * session's `files/` subdirectory for artifacts generated during the + * session (e.g. CSV / Markdown reports saved by Copilot). Any files + * found are returned as synthetic unified diffs in `fileChanges` so that + * LLM and code graders can evaluate them via `{{file_changes}}` without + * requiring the agent to echo file contents in its final answer. 
*/ import { readFile } from 'node:fs/promises'; import { homedir } from 'node:os'; import path from 'node:path'; +import { captureSessionArtifacts } from '../workspace/file-changes.js'; import { parseCopilotEvents } from './copilot-log-parser.js'; import { discoverCopilotSessions } from './copilot-session-discovery.js'; import type { CopilotLogResolvedConfig } from './targets.js'; @@ -54,11 +63,18 @@ export class CopilotLogProvider implements Provider { const parsed = parseCopilotEvents(eventsContent); + // Scan session-state `files/` directory for artifacts generated during + // the session (e.g. CSV reports). Return as synthetic diffs so graders + // can evaluate them via {{file_changes}} without special eval wiring. + const filesDir = path.join(sessionDir, 'files'); + const fileChanges = await captureSessionArtifacts(filesDir).catch(() => undefined); + return { output: parsed.messages, tokenUsage: parsed.tokenUsage, durationMs: parsed.durationMs, startTime: parsed.meta.startedAt, + ...(fileChanges ? { fileChanges } : {}), }; } diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index b18a10a65..fa6fd638d 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -1,8 +1,10 @@ import { randomUUID } from 'node:crypto'; import { existsSync } from 'node:fs'; import { mkdir } from 'node:fs/promises'; +import { homedir } from 'node:os'; import path from 'node:path'; +import { captureSessionArtifacts } from '../workspace/file-changes.js'; import { recordCopilotSdkLogEntry } from './copilot-sdk-log-tracker.js'; import { CopilotStreamLogger, @@ -262,6 +264,18 @@ export class CopilotSdkProvider implements Provider { }); } + // Capture session artifacts from session-state `files/` directory. + // Copilot SDK may write generated files (e.g. CSV reports) to the + // session-state directory instead of the workspace cwd. 
+ // biome-ignore lint/suspicious/noExplicitAny: SDK session shape is dynamic + const sessionId = (session as any).id ?? (session as any).sessionId; + const fileChanges = + typeof sessionId === 'string' && sessionId + ? await captureSessionArtifacts( + path.join(homedir(), '.copilot', 'session-state', sessionId, 'files'), + ).catch(() => undefined) + : undefined; + return { raw: { model: this.config.model, @@ -274,6 +288,7 @@ export class CopilotSdkProvider implements Provider { durationMs, startTime, endTime, + ...(fileChanges ? { fileChanges } : {}), }; } finally { unsubscribe(); diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index f9ed86758..80222455c 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -239,6 +239,16 @@ export interface ProviderResponse { readonly startTime?: string; /** ISO 8601 timestamp when execution ended (optional) */ readonly endTime?: string; + /** + * Synthetic unified diff of files generated by the provider outside the + * eval workspace_path (e.g. copilot session-state artifacts in + * `~/.copilot/session-state/<session-id>/files/`). + * + * When set, the orchestrator merges this into `file_changes` so that LLM + * and code graders can inspect agent-generated artifacts even when they are + * written to a path agentv does not track via git or snapshot. + */ + readonly fileChanges?: string; } /** diff --git a/packages/core/src/evaluation/workspace/file-changes.ts b/packages/core/src/evaluation/workspace/file-changes.ts index 12fd8d81a..b88f7f204 100644 --- a/packages/core/src/evaluation/workspace/file-changes.ts +++ b/packages/core/src/evaluation/workspace/file-changes.ts @@ -1,10 +1,53 @@ +/** + * Workspace file-change tracking for AgentV evaluation. + * + * Three strategies are supported — all produce unified-diff output that is + * stored in `file_changes` and surfaced to LLM / code graders: + * + * 1. 
**Git baseline** (default when `git` is available in workspace_path): + * - `initializeBaseline` runs `git init`, stages all existing files, and + * creates a baseline commit so a clean diff is available after the agent runs. + * - `captureFileChanges` stages everything and emits `git diff <baseline-commit>`. + * - Supports nested git repos via `--submodule=diff`. + * + * 2. **Snapshot baseline** (fallback when git is unavailable / path is read-only): + * - `captureSnapshot` walks the directory tree and records every text file's + * content as a `Map<string, string>`. + * - `diffFromSnapshots` compares two snapshots and emits synthetic unified + * diffs for new, modified, and deleted files. + * - Use this when `initializeBaseline` throws (git not installed, permissions, + * read-only session-state directories, etc.). + * + * 3. **Provider-reported artifacts** (for agents that write outside workspace_path): + * - `generateNewFileDiff` creates a synthetic "new file" diff for a + * single file, given its relative path and content. + * - Copilot providers use this (via `captureSessionArtifacts`) to surface files written into the agent's own + * session-state directory (e.g. `~/.copilot/session-state/<session-id>/files/`). + * + * To extend: + * - Add a new capture strategy here as an exported function. + * - Call it from `orchestrator.ts` alongside the existing git / snapshot logic. + */ + import { exec as execCallback } from 'node:child_process'; +import { readFile, readdir, stat } from 'node:fs/promises'; import { readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import { promisify } from 'node:util'; const execAsync = promisify(execCallback); +/** Maximum file size (bytes) to include in snapshot diffs. Larger files are skipped. */ +const SNAPSHOT_MAX_FILE_BYTES = 512 * 1024; // 512 KB + +/** Directories always excluded from snapshot walks. */ +const SNAPSHOT_EXCLUDE_DIRS = new Set(['.git', 'node_modules', '.agentv', '__pycache__']); + +/** A point-in-time snapshot of a directory: relative path → UTF-8 content. 
*/ +export type WorkspaceSnapshot = Map; + +// ─── Git baseline ──────────────────────────────────────────────────────────── + /** * Build exec options that ensure git commands target the workspace, * not a parent repo. Clears GIT_DIR/GIT_WORK_TREE which may be set @@ -87,3 +130,172 @@ async function stageNestedRepoChanges(workspacePath: string): Promise { await execAsync('git add -A', childOpts); } } + +// ─── Snapshot baseline ─────────────────────────────────────────────────────── + +/** + * Walk `dir` recursively and return a snapshot of every readable text file. + * Binary files and files larger than SNAPSHOT_MAX_FILE_BYTES are omitted. + * Standard noise directories (.git, node_modules, etc.) are skipped. + */ +export async function captureSnapshot(dir: string): Promise { + const snapshot: WorkspaceSnapshot = new Map(); + await walkDir(dir, dir, snapshot); + return snapshot; +} + +async function walkDir( + rootDir: string, + currentDir: string, + snapshot: WorkspaceSnapshot, +): Promise { + let entries: string[]; + try { + entries = await readdir(currentDir); + } catch { + return; + } + + for (const entry of entries) { + if (SNAPSHOT_EXCLUDE_DIRS.has(entry)) continue; + + const fullPath = path.join(currentDir, entry); + let fileStat: Awaited>; + try { + fileStat = await stat(fullPath); + } catch { + continue; + } + + if (fileStat.isDirectory()) { + await walkDir(rootDir, fullPath, snapshot); + } else if (fileStat.isFile()) { + if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue; + let content: string; + try { + content = await readFile(fullPath, 'utf8'); + // Skip if not valid UTF-8 text (binary file heuristic: contains null bytes) + if (content.includes('\0')) continue; + } catch { + continue; + } + const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, '/'); + snapshot.set(relativePath, content); + } + } +} + +/** + * Compare two snapshots and return a synthetic unified-diff string. + * Covers new files, modified files, and deleted files. 
+ * Returns empty string when the snapshots are identical. + */ +export function diffFromSnapshots( + baseline: WorkspaceSnapshot, + current: WorkspaceSnapshot, +): string { + const parts: string[] = []; + + // New and modified files + for (const [relPath, currentContent] of current) { + const baseContent = baseline.get(relPath); + if (baseContent === undefined) { + // New file + parts.push(generateNewFileDiff(relPath, currentContent)); + } else if (baseContent !== currentContent) { + // Modified file + parts.push(generateModifiedFileDiff(relPath, baseContent, currentContent)); + } + } + + // Deleted files + for (const [relPath, baseContent] of baseline) { + if (!current.has(relPath)) { + parts.push(generateDeletedFileDiff(relPath, baseContent)); + } + } + + return parts.join('\n'); +} + +// ─── Synthetic diff helpers ────────────────────────────────────────────────── + +/** + * Generate a synthetic unified diff entry for a newly created file. + * Suitable both for snapshot diffs and provider-reported session artifacts. + */ +export function generateNewFileDiff(relativePath: string, content: string): string { + const lines = content.endsWith('\n') ? content.slice(0, -1).split('\n') : content.split('\n'); + const addedLines = lines.map((l) => `+${l}`).join('\n'); + return [ + `diff --git a/${relativePath} b/${relativePath}`, + 'new file mode 100644', + '--- /dev/null', + `+++ b/${relativePath}`, + `@@ -0,0 +1,${lines.length} @@`, + addedLines, + ].join('\n'); +} + +function generateDeletedFileDiff(relativePath: string, content: string): string { + const lines = content.endsWith('\n') ? 
content.slice(0, -1).split('\n') : content.split('\n'); + const removedLines = lines.map((l) => `-${l}`).join('\n'); + return [ + `diff --git a/${relativePath} b/${relativePath}`, + 'deleted file mode 100644', + `--- a/${relativePath}`, + '+++ /dev/null', + `@@ -1,${lines.length} +0,0 @@`, + removedLines, + ].join('\n'); +} + +function generateModifiedFileDiff( + relativePath: string, + oldContent: string, + newContent: string, +): string { + // Simple full-file replacement diff (no line-level hunk optimization) + const oldLines = oldContent.endsWith('\n') + ? oldContent.slice(0, -1).split('\n') + : oldContent.split('\n'); + const newLines = newContent.endsWith('\n') + ? newContent.slice(0, -1).split('\n') + : newContent.split('\n'); + const removedLines = oldLines.map((l) => `-${l}`).join('\n'); + const addedLines = newLines.map((l) => `+${l}`).join('\n'); + return [ + `diff --git a/${relativePath} b/${relativePath}`, + `--- a/${relativePath}`, + `+++ b/${relativePath}`, + `@@ -1,${oldLines.length} +1,${newLines.length} @@`, + removedLines, + addedLines, + ].join('\n'); +} + +// ─── Session-state artifact capture ───────────────────────────────────────── + +/** + * Scan a directory (e.g. copilot session-state `files/`) for text files and + * return a synthetic unified diff string showing all of them as new additions. + * + * Returns undefined when the directory does not exist or is empty. + * + * Used by copilot providers to surface files that the agent wrote into its + * own session-state rather than the eval workspace_path. + */ +export async function captureSessionArtifacts( + filesDir: string, + pathPrefix = '', +): Promise { + const snapshot = await captureSnapshot(filesDir).catch(() => undefined); + if (!snapshot || snapshot.size === 0) return undefined; + + const parts: string[] = []; + for (const [relPath, content] of snapshot) { + const displayPath = pathPrefix ? 
`${pathPrefix}/${relPath}` : relPath; + parts.push(generateNewFileDiff(displayPath, content)); + } + return parts.join('\n'); +} diff --git a/packages/core/test/evaluation/workspace/file-changes.test.ts b/packages/core/test/evaluation/workspace/file-changes.test.ts index 1e7da6155..5b77d016e 100644 --- a/packages/core/test/evaluation/workspace/file-changes.test.ts +++ b/packages/core/test/evaluation/workspace/file-changes.test.ts @@ -7,6 +7,10 @@ import path from 'node:path'; import { captureFileChanges, + captureSessionArtifacts, + captureSnapshot, + diffFromSnapshots, + generateNewFileDiff, initializeBaseline, } from '../../../src/evaluation/workspace/file-changes.js'; @@ -117,3 +121,129 @@ describe('workspace file-changes', () => { expect(diff).toContain('updated library'); }); }); + +describe('captureSnapshot / diffFromSnapshots', () => { + let dir: string; + + beforeEach(async () => { + dir = await mkdtemp(path.join(tmpdir(), 'agentv-snap-test-')); + }); + + afterEach(async () => { + await rm(dir, { recursive: true, force: true }).catch(() => {}); + }); + + it('captures an empty directory', async () => { + const snap = await captureSnapshot(dir); + expect(snap.size).toBe(0); + }); + + it('captures text files recursively', async () => { + await writeFile(path.join(dir, 'file.txt'), 'hello\n', 'utf8'); + await mkdir(path.join(dir, 'sub'), { recursive: true }); + await writeFile(path.join(dir, 'sub', 'nested.txt'), 'nested\n', 'utf8'); + + const snap = await captureSnapshot(dir); + expect(snap.size).toBe(2); + expect(snap.get('file.txt')).toBe('hello\n'); + expect(snap.get('sub/nested.txt')).toBe('nested\n'); + }); + + it('diffFromSnapshots returns empty string when nothing changed', async () => { + await writeFile(path.join(dir, 'file.txt'), 'hello\n', 'utf8'); + const baseline = await captureSnapshot(dir); + const current = await captureSnapshot(dir); + expect(diffFromSnapshots(baseline, current)).toBe(''); + }); + + it('diffFromSnapshots shows new file as 
addition', async () => { + const baseline = await captureSnapshot(dir); + await writeFile(path.join(dir, 'new.txt'), 'brand new\n', 'utf8'); + const current = await captureSnapshot(dir); + + const diff = diffFromSnapshots(baseline, current); + expect(diff).toContain('new.txt'); + expect(diff).toContain('+brand new'); + expect(diff).toContain('new file mode'); + }); + + it('diffFromSnapshots shows deleted file', async () => { + await writeFile(path.join(dir, 'gone.txt'), 'will be deleted\n', 'utf8'); + const baseline = await captureSnapshot(dir); + await rm(path.join(dir, 'gone.txt')); + const current = await captureSnapshot(dir); + + const diff = diffFromSnapshots(baseline, current); + expect(diff).toContain('gone.txt'); + expect(diff).toContain('deleted file mode'); + expect(diff).toContain('-will be deleted'); + }); + + it('diffFromSnapshots shows modified file', async () => { + await writeFile(path.join(dir, 'mod.txt'), 'original\n', 'utf8'); + const baseline = await captureSnapshot(dir); + await writeFile(path.join(dir, 'mod.txt'), 'changed\n', 'utf8'); + const current = await captureSnapshot(dir); + + const diff = diffFromSnapshots(baseline, current); + expect(diff).toContain('mod.txt'); + expect(diff).toContain('-original'); + expect(diff).toContain('+changed'); + }); +}); + +describe('generateNewFileDiff', () => { + it('generates a valid unified diff for a new file', () => { + const diff = generateNewFileDiff('outputs/report.csv', 'metric,value\ncpu,0.5\n'); + expect(diff).toContain('diff --git a/outputs/report.csv b/outputs/report.csv'); + expect(diff).toContain('new file mode 100644'); + expect(diff).toContain('--- /dev/null'); + expect(diff).toContain('+++ b/outputs/report.csv'); + expect(diff).toContain('+metric,value'); + expect(diff).toContain('+cpu,0.5'); + }); + + it('handles content without trailing newline', () => { + const diff = generateNewFileDiff('out.txt', 'no newline'); + expect(diff).toContain('+no newline'); + expect(diff).toContain('@@ 
-0,0 +1,1 @@'); + }); +}); + +describe('captureSessionArtifacts', () => { + let artifactsDir: string; + + beforeEach(async () => { + artifactsDir = await mkdtemp(path.join(tmpdir(), 'agentv-artifacts-test-')); + }); + + afterEach(async () => { + await rm(artifactsDir, { recursive: true, force: true }).catch(() => {}); + }); + + it('returns undefined for empty directory', async () => { + const result = await captureSessionArtifacts(artifactsDir); + expect(result).toBeUndefined(); + }); + + it('returns undefined for non-existent directory', async () => { + const result = await captureSessionArtifacts('/non/existent/path'); + expect(result).toBeUndefined(); + }); + + it('returns synthetic diff for files in directory', async () => { + await writeFile(path.join(artifactsDir, 'report.csv'), 'metric,value\ncpu,0.5\n', 'utf8'); + + const result = await captureSessionArtifacts(artifactsDir); + expect(result).toBeDefined(); + expect(result).toContain('report.csv'); + expect(result).toContain('+metric,value'); + }); + + it('applies pathPrefix to file paths', async () => { + await writeFile(path.join(artifactsDir, 'data.csv'), 'a,b\n1,2\n', 'utf8'); + + const result = await captureSessionArtifacts(artifactsDir, 'session-state/files'); + expect(result).toContain('session-state/files/data.csv'); + }); +}); From ce7d8957d760ab2e8e3e2040b648f640785b2ca0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 05:57:11 +0000 Subject: [PATCH 2/9] fix: apply biome lint/format corrections Co-Authored-By: Claude Sonnet 4.6 --- .../workspace-artifact/scripts/check-csv-artifact.ts | 3 +-- packages/core/src/evaluation/orchestrator.ts | 2 +- packages/core/src/evaluation/workspace/file-changes.ts | 7 ++----- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts index 018f4b21f..43809cdea 100644 --- 
a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts +++ b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts @@ -23,8 +23,7 @@ if (!fileChanges || fileChanges.trim().length === 0) { assertions.push({ text: 'file_changes is non-empty', passed: false, - evidence: - 'file_changes is empty — workspace snapshot or git baseline may not be configured', + evidence: 'file_changes is empty — workspace snapshot or git baseline may not be configured', }); console.log(JSON.stringify({ score: 0, assertions })); process.exit(0); diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ca419efb7..cefdbd9ca 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -65,8 +65,8 @@ import type { WorkspaceScriptConfig, } from './types.js'; import { - captureFileChanges as captureWorkspaceFileChanges, captureSnapshot, + captureFileChanges as captureWorkspaceFileChanges, diffFromSnapshots, initializeBaseline, } from './workspace/file-changes.js'; diff --git a/packages/core/src/evaluation/workspace/file-changes.ts b/packages/core/src/evaluation/workspace/file-changes.ts index b88f7f204..028c16b4a 100644 --- a/packages/core/src/evaluation/workspace/file-changes.ts +++ b/packages/core/src/evaluation/workspace/file-changes.ts @@ -30,8 +30,8 @@ */ import { exec as execCallback } from 'node:child_process'; -import { readFile, readdir, stat } from 'node:fs/promises'; import { readdirSync, statSync } from 'node:fs'; +import { readFile, readdir, stat } from 'node:fs/promises'; import path from 'node:path'; import { promisify } from 'node:util'; @@ -190,10 +190,7 @@ async function walkDir( * Covers new files, modified files, and deleted files. * Returns empty string when the snapshots are identical. 
*/ -export function diffFromSnapshots( - baseline: WorkspaceSnapshot, - current: WorkspaceSnapshot, -): string { +export function diffFromSnapshots(baseline: WorkspaceSnapshot, current: WorkspaceSnapshot): string { const parts: string[] = []; // New and modified files From a73780dd53046870f28c82b91701f5f11f365c64 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 06:50:40 +0000 Subject: [PATCH 3/9] feat: add file-changes-with-repos example and fix docs Example: - examples/features/file-changes-with-repos/ proves that file_changes captures BOTH workspace-root files AND changes inside nested git repos. Uses a before_all hook to init my-lib/ as a nested repo, then verifies that report.txt (workspace root) and my-lib/utils.ts (nested repo) both appear in file_changes. Docs (code-graders.mdx, llm-graders.mdx): - Replace deprecated "workspace_template" field name with workspace - Expand file_changes description to mention nested repos and Copilot session-state artifacts - Add "What file_changes covers" section explaining the 3 strategies - Fix YAML example to use workspace.template (eval YAML form) - Add examples table pointing to the four file-changes examples Co-Authored-By: Claude Sonnet 4.6 --- .../docs/docs/evaluators/code-graders.mdx | 36 +++++++---- .../docs/docs/evaluators/llm-graders.mdx | 4 +- .../.agentv/targets.yaml | 11 ++++ .../evals/dataset.eval.yaml | 59 ++++++++++++++++++ .../scripts/check-file-changes.ts | 60 +++++++++++++++++++ .../workspace-template/README.md | 3 + .../workspace-template/my-lib/utils.ts | 7 +++ 7 files changed, 168 insertions(+), 12 deletions(-) create mode 100644 examples/features/file-changes-with-repos/.agentv/targets.yaml create mode 100644 examples/features/file-changes-with-repos/evals/dataset.eval.yaml create mode 100644 examples/features/file-changes-with-repos/scripts/check-file-changes.ts create mode 100644 examples/features/file-changes-with-repos/workspace-template/README.md create mode 100644 
examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts diff --git a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx index 3befe3581..7f989f8aa 100644 --- a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx +++ b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx @@ -188,8 +188,8 @@ Beyond the basic text fields (`input`, `output`, `expected_output`, `criteria`), | `duration_ms` | `number` | Total execution duration | | `start_time` | `string` | ISO timestamp of first event | | `end_time` | `string` | ISO timestamp of last event | -| `file_changes` | `string \| null` | Unified diff of workspace file changes (when `workspace_template` is configured) | -| `workspace_path` | `string \| null` | Absolute path to the workspace directory (when `workspace_template` is configured) | +| `file_changes` | `string \| null` | Unified diff of workspace file changes (populated when `workspace` is configured; includes files at workspace root, changes inside nested repos, and Copilot session-state artifacts) | +| `workspace_path` | `string \| null` | Absolute path to the temp workspace directory (populated when `workspace` is configured) | ### trace structure @@ -215,13 +215,21 @@ Use `expected_output` for retrieval context in RAG evals (tool calls with output ## Workspace Access -When `workspace_template` is configured on a target, code graders receive the workspace path in two ways: +When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.path`, or `workspace.repos`), code graders receive the workspace path in two ways: 1. **JSON payload**: `workspace_path` field in the stdin input 2. **Environment variable**: `AGENTV_WORKSPACE_PATH` This enables **functional grading** — running commands like `npm test`, `pytest`, or `cargo test` directly in the agent's workspace. 
+#### What `file_changes` covers + +`file_changes` is a unified diff built from up to three sources, merged in order: + +1. **Git baseline** (default): `git diff` against a baseline commit taken before the agent ran. Captures edits to tracked files, new files at workspace root, and changes inside any nested git repos that were materialized via `workspace.repos` or set up via a `before_all` hook. +2. **Snapshot fallback**: When git is unavailable (read-only path, no `git` binary), AgentV falls back to a filesystem snapshot taken before the agent ran and diffs file contents directly. +3. **Provider-reported artifacts**: Copilot providers scan their session-state `files/` directory after each run and append those as synthetic diffs. This surfaces files the agent wrote *outside* `workspace_path` entirely (e.g. `~/.copilot/session-state//files/`). + ### Example: Deploy-and-Test Pattern ```typescript @@ -260,14 +268,13 @@ console.log(JSON.stringify({ ``` ```yaml -# targets.yaml -targets: - - name: my_agent - provider: cli - command: "my-agent --task {INPUT_FILE} --output {OUTPUT_FILE}" - workspace_template: ./workspace-template - # dataset.eval.yaml +workspace: + template: ./workspace-template # copied into a temp dir before each run + +execution: + target: my_agent + tests: - id: implement-feature criteria: Agent implements the feature correctly @@ -280,6 +287,15 @@ tests: See `examples/features/functional-grading/` for a complete working example. 
+#### Examples + +| Example | What it demonstrates | +|---------|----------------------| +| `examples/features/functional-grading/` | `workspace_path` — deploy-and-test with `npm install` + `tsc` + `npm test` | +| `examples/features/file-changes/` | `file_changes` — edits, creates, and deletes captured via git baseline | +| `examples/features/workspace-artifact/` | `file_changes` — new files captured even in a non-git workspace (snapshot fallback) | +| `examples/features/file-changes-with-repos/` | `file_changes` — workspace-root files AND changes inside nested repos both captured | + ## Testing Locally ### With `agentv eval assert` diff --git a/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx b/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx index 51d0db372..12f54ffe3 100644 --- a/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx @@ -72,7 +72,7 @@ Score the response from 0.0 to 1.0 based on: | `input` | Full resolved input array, JSON-serialized | | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | -| `file_changes` | Unified diff of workspace file changes (when `workspace_template` is configured) | +| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | ## Per-Grader Target @@ -227,7 +227,7 @@ Derived strings injected into grader prompts: | `input` | Full resolved input array, JSON-serialized | | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | -| `file_changes` | Unified diff of workspace file changes (when `workspace_template` is configured) | +| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | **Example flow:** diff --git a/examples/features/file-changes-with-repos/.agentv/targets.yaml 
b/examples/features/file-changes-with-repos/.agentv/targets.yaml new file mode 100644 index 000000000..2077d535a --- /dev/null +++ b/examples/features/file-changes-with-repos/.agentv/targets.yaml @@ -0,0 +1,11 @@ +targets: + # Mock agent that writes to the workspace root AND edits a file inside a nested git repo. + # Simulates an agent that produces an artifact alongside making code changes. + - name: mock_agent + provider: cli + command: >- + bash -c ' + echo "Analysis complete: 3 tests passed, 0 failed." > report.txt && + printf "\nexport function add(a: number, b: number): number {\n return a + b;\n}\n" >> my-lib/utils.ts && + echo "Done" > {OUTPUT_FILE} + ' diff --git a/examples/features/file-changes-with-repos/evals/dataset.eval.yaml b/examples/features/file-changes-with-repos/evals/dataset.eval.yaml new file mode 100644 index 000000000..d89d35062 --- /dev/null +++ b/examples/features/file-changes-with-repos/evals/dataset.eval.yaml @@ -0,0 +1,59 @@ +# File-changes with nested git repos +# +# Proves that file_changes captures BOTH: +# 1. Files created at the workspace root (alongside repos) +# 2. 
Changes made inside nested git repositories +# +# Setup: +# - workspace.template copies workspace-template/ into the temp workspace +# - before_all hook initialises my-lib/ as a git repo inside the workspace +# - initializeBaseline (runs after before_all) sees my-lib/.git as a gitlink +# +# Agent behaviour: +# - Writes report.txt to workspace root (not inside any repo) +# - Appends a function to my-lib/utils.ts (inside the nested repo) +# +# How file_changes captures both: +# - Workspace-root diff: report.txt shows as a new file in the outer git diff +# - Nested repo diff: my-lib gitlink hash changes; AgentV diffs my-lib/ +# individually and stitches the per-file diffs into file_changes + +name: file-changes-with-repos +description: Verify file_changes captures workspace-root files AND changes inside nested repos + +workspace: + template: ../workspace-template + hooks: + before_all: + command: + - bash + - -c + - >- + cd "{{workspace_path}}/my-lib" && + git -c init.defaultBranch=main init && + git -c user.email=test@agentv.dev -c user.name="AgentV Test" add . && + git -c user.email=test@agentv.dev -c user.name="AgentV Test" commit -m "init" + +execution: + target: mock_agent + +tests: + - id: root-file-and-repo-change + criteria: >- + The agent writes report.txt to the workspace root and appends an add() + function to my-lib/utils.ts. + file_changes must show both: the new workspace-root file and the + modification inside the nested repo. + + input: + - role: user + content: + - type: text + value: >- + Write a one-line summary to report.txt at the workspace root. + Then add an add(a, b) function to my-lib/utils.ts. 
+ + assertions: + - name: check-root-and-repo-changes + type: code-grader + command: ["bun", "run", "../scripts/check-file-changes.ts"] diff --git a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts new file mode 100644 index 000000000..b198b7a35 --- /dev/null +++ b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts @@ -0,0 +1,60 @@ +#!/usr/bin/env bun +/** + * Code grader: verifies file_changes captures BOTH workspace-root files + * and changes inside nested git repos. + * + * Expected diff should include: + * - report.txt (new file at workspace root) + * - my-lib/utils.ts (modification inside the nested repo) + */ +import { readFileSync } from 'node:fs'; + +const input = JSON.parse(readFileSync('/dev/stdin', 'utf-8')) as { + file_changes: string | null; +}; + +const fileChanges = input.file_changes ?? ''; +const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; + +if (!fileChanges || fileChanges.trim().length === 0) { + assertions.push({ + text: 'file_changes is non-empty', + passed: false, + evidence: 'file_changes is empty — workspace not configured or file tracking failed', + }); + console.log(JSON.stringify({ score: 0, assertions })); + process.exit(0); +} + +assertions.push({ text: 'file_changes is non-empty', passed: true }); + +// Check 1: workspace-root file appears in diff +const hasRootFile = fileChanges.includes('report.txt'); +assertions.push({ + text: 'diff captures workspace-root file (report.txt)', + passed: hasRootFile, + evidence: hasRootFile + ? 
undefined + : `file_changes did not mention report.txt.\nDiff:\n${fileChanges.slice(0, 500)}`, +}); + +// Check 2: nested repo change appears in diff +const hasRepoChange = fileChanges.includes('my-lib/utils.ts') || fileChanges.includes('utils.ts'); +assertions.push({ + text: 'diff captures nested-repo change (my-lib/utils.ts)', + passed: hasRepoChange, + evidence: hasRepoChange + ? undefined + : `file_changes did not mention utils.ts.\nDiff:\n${fileChanges.slice(0, 500)}`, +}); + +// Check 3: diff shows the add function was added +const hasAddFn = fileChanges.includes('+export function add'); +assertions.push({ + text: 'diff shows add() function was added', + passed: hasAddFn, + evidence: hasAddFn ? undefined : 'add() function not found in diff', +}); + +const passed = assertions.filter((a) => a.passed).length; +console.log(JSON.stringify({ score: passed / assertions.length, assertions })); diff --git a/examples/features/file-changes-with-repos/workspace-template/README.md b/examples/features/file-changes-with-repos/workspace-template/README.md new file mode 100644 index 000000000..567cfe096 --- /dev/null +++ b/examples/features/file-changes-with-repos/workspace-template/README.md @@ -0,0 +1,3 @@ +# My Project + +A sample project workspace used for AgentV evaluation. diff --git a/examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts b/examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts new file mode 100644 index 000000000..d11317afb --- /dev/null +++ b/examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts @@ -0,0 +1,7 @@ +/** + * Utility functions for the project. 
+ */ + +export function greet(name: string): string { + return `Hello, ${name}!`; +} From 3f4efc1320c6df0d89da9c15b18011fef265a6d0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 06:55:32 +0000 Subject: [PATCH 4/9] =?UTF-8?q?rename=20dataset.eval.yaml=20=E2=86=92=20ev?= =?UTF-8?q?al.yaml=20in=20new=20examples?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../evals/{dataset.eval.yaml => eval.yaml} | 0 .../workspace-artifact/evals/{dataset.eval.yaml => eval.yaml} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/features/file-changes-with-repos/evals/{dataset.eval.yaml => eval.yaml} (100%) rename examples/features/workspace-artifact/evals/{dataset.eval.yaml => eval.yaml} (100%) diff --git a/examples/features/file-changes-with-repos/evals/dataset.eval.yaml b/examples/features/file-changes-with-repos/evals/eval.yaml similarity index 100% rename from examples/features/file-changes-with-repos/evals/dataset.eval.yaml rename to examples/features/file-changes-with-repos/evals/eval.yaml diff --git a/examples/features/workspace-artifact/evals/dataset.eval.yaml b/examples/features/workspace-artifact/evals/eval.yaml similarity index 100% rename from examples/features/workspace-artifact/evals/dataset.eval.yaml rename to examples/features/workspace-artifact/evals/eval.yaml From 8423ea8ad6b3d5feb52dfbb607686007efb69765 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:08:23 +0000 Subject: [PATCH 5/9] fix(copilot-sdk): use session.workspacePath for artifact dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDK exposes session.workspacePath as the authoritative path to the session state directory (contains files/, checkpoints/, plan.md). Using it directly avoids the previous guess of ~/.copilot/session-state//files which relied on session.id or session.sessionId being the right UUID. 
Also clarify in copilot-cli.ts that session.sessionId (from ACP) is expected to match the session-state dir name — if it doesn't exist the call silently returns undefined. Co-Authored-By: Claude Sonnet 4.6 --- .../src/evaluation/providers/copilot-cli.ts | 3 +++ .../src/evaluation/providers/copilot-sdk.ts | 20 +++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/packages/core/src/evaluation/providers/copilot-cli.ts b/packages/core/src/evaluation/providers/copilot-cli.ts index b95a85bf2..3cb4a55ed 100644 --- a/packages/core/src/evaluation/providers/copilot-cli.ts +++ b/packages/core/src/evaluation/providers/copilot-cli.ts @@ -303,6 +303,9 @@ export class CopilotCliProvider implements Provider { // Capture session artifacts from session-state `files/` directory. // Copilot may write generated files (e.g. CSV reports) there instead of // the session cwd, so they wouldn't be captured by workspace git diff. + // ACP session.sessionId is the UUID Copilot assigns at session creation + // and is expected to match the ~/.copilot/session-state/<session-id>/ directory + // name. If the directory doesn't exist the call silently returns undefined. const sessionId = session.sessionId as string | undefined; const fileChanges = sessionId ? await captureSessionArtifacts( diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index fa6fd638d..aade67937 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -1,7 +1,6 @@ import { randomUUID } from 'node:crypto'; import { existsSync } from 'node:fs'; import { mkdir } from 'node:fs/promises'; -import { homedir } from 'node:os'; import path from 'node:path'; import { captureSessionArtifacts } from '../workspace/file-changes.js'; @@ -265,16 +264,15 @@ export class CopilotSdkProvider implements Provider { } // Capture session artifacts from session-state `files/` directory.
- // Copilot SDK may write generated files (e.g. CSV reports) to the - // session-state directory instead of the workspace cwd. - // biome-ignore lint/suspicious/noExplicitAny: SDK session shape is dynamic - const sessionId = (session as any).id ?? (session as any).sessionId; - const fileChanges = - typeof sessionId === 'string' && sessionId - ? await captureSessionArtifacts( - path.join(homedir(), '.copilot', 'session-state', sessionId, 'files'), - ).catch(() => undefined) - : undefined; + // The SDK's session.workspacePath is the authoritative path to the + // session state directory (contains files/, checkpoints/, plan.md). + // Only populated when infinite sessions are enabled on the server. + const sessionWorkspacePath = session.workspacePath; + const fileChanges = sessionWorkspacePath + ? await captureSessionArtifacts( + path.join(sessionWorkspacePath, 'files'), + ).catch(() => undefined) + : undefined; return { raw: { From abdc00a60839417a51a6535b9b5c4c880f5bbb79 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:09:48 +0000 Subject: [PATCH 6/9] fix: biome format --- packages/core/src/evaluation/providers/copilot-sdk.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index aade67937..ffe246c1b 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -269,9 +269,9 @@ export class CopilotSdkProvider implements Provider { // Only populated when infinite sessions are enabled on the server. const sessionWorkspacePath = session.workspacePath; const fileChanges = sessionWorkspacePath - ? await captureSessionArtifacts( - path.join(sessionWorkspacePath, 'files'), - ).catch(() => undefined) + ? 
await captureSessionArtifacts(path.join(sessionWorkspacePath, 'files')).catch( + () => undefined, + ) : undefined; return { From 3fb4dbf8b96265eff88079b5688e91249326c46a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:33:22 +0000 Subject: [PATCH 7/9] refactor: remove snapshot fallback from orchestrator The snapshot fallback (Strategy 2) was dead code: git init never fails in a freshly-created temp workspace, and if it does that's an environment error that should be surfaced, not silently papered over with a worse diff. Remove: - sharedBaselineSnapshot / poolSlotSnapshots / testBaselineSnapshot - captureSnapshot / diffFromSnapshots / WorkspaceSnapshot imports - snapshot catch blocks in shared/pool/per-case baseline init - Strategy 2 branch in file_changes capture captureSnapshot / diffFromSnapshots remain in file-changes.ts (tested, used internally by captureSessionArtifacts). git init failures now log a warning in verbose mode and leave file_changes empty. Co-Authored-By: Claude Sonnet 4.6 --- packages/core/src/evaluation/orchestrator.ts | 80 +++++--------------- 1 file changed, 21 insertions(+), 59 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cefdbd9ca..5f8843665 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -65,12 +65,9 @@ import type { WorkspaceScriptConfig, } from './types.js'; import { - captureSnapshot, captureFileChanges as captureWorkspaceFileChanges, - diffFromSnapshots, initializeBaseline, } from './workspace/file-changes.js'; -import type { WorkspaceSnapshot } from './workspace/file-changes.js'; import { cleanupEvalWorkspaces, cleanupWorkspace, @@ -195,8 +192,6 @@ export interface RunEvalCaseOptions { readonly sharedWorkspacePath?: string; /** Pre-initialized baseline commit for shared workspace */ readonly sharedBaselineCommit?: string; - /** Snapshot baseline for shared workspace 
(fallback when git is unavailable) */ - readonly sharedBaselineSnapshot?: WorkspaceSnapshot; /** Suite-level .code-workspace file (resolved from workspace.template) */ readonly suiteWorkspaceFile?: string; /** Real-time observability callbacks passed to the provider */ @@ -621,7 +616,6 @@ export async function runEvaluation( const limit = pLimit(workers); let sharedWorkspacePath: string | undefined; let sharedBaselineCommit: string | undefined; - let sharedBaselineSnapshot: WorkspaceSnapshot | undefined; let beforeAllOutput: string | undefined; let poolManager: WorkspacePoolManager | undefined; @@ -631,7 +625,7 @@ export async function runEvaluation( const poolSlots: PoolSlot[] = []; const availablePoolSlots: PoolSlot[] = []; const poolSlotBaselines = new Map(); - const poolSlotSnapshots = new Map(); + // Pool capacity: how many slots can exist on disk (independent of worker count). // Workers acquire slots from the pool; the pool itself can be larger than any single run needs. @@ -839,37 +833,29 @@ export async function runEvaluation( } } - // Initialize baseline for shared workspace (git first, snapshot fallback) + // Initialize git baseline for shared workspace if (sharedWorkspacePath) { try { sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath); setupLog(`shared baseline initialized: ${sharedBaselineCommit}`); - } catch { - // Git failed — try snapshot fallback - try { - sharedBaselineSnapshot = await captureSnapshot(sharedWorkspacePath); - setupLog('shared baseline snapshot captured (git unavailable)'); - } catch { - setupLog('shared baseline initialization skipped (non-fatal)'); - } + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + setupLog(`shared baseline initialization failed (file_changes unavailable): ${message}`); } } - // Multi-slot pool: initialize baselines per slot (git first, snapshot fallback) + // Multi-slot pool: initialize git baselines per slot if (availablePoolSlots.length > 0) { for (const slot of availablePoolSlots) { try { const baseline = await initializeBaseline(slot.path); poolSlotBaselines.set(slot.path, baseline); setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`); - } catch { - try { - const snapshot = await captureSnapshot(slot.path); - poolSlotSnapshots.set(slot.path, snapshot); - setupLog(`pool slot ${slot.index} baseline snapshot captured (git unavailable)`); - } catch { - setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`); - } + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + setupLog( + `pool slot ${slot.index} baseline initialization failed (file_changes unavailable): ${message}`, + ); } } } @@ -983,9 +969,6 @@ export async function runEvaluation( const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit; - const testBaselineSnapshot = testPoolSlot - ? 
poolSlotSnapshots.get(testPoolSlot.path) - : sharedBaselineSnapshot; try { const graderProvider = await resolveGraderProvider(target); @@ -1009,7 +992,6 @@ export async function runEvaluation( retainOnFailure: resolvedRetainOnFailure, sharedWorkspacePath: testWorkspacePath, sharedBaselineCommit: testBaselineCommit, - sharedBaselineSnapshot: testBaselineSnapshot, suiteWorkspaceFile, streamCallbacks, typeRegistry, @@ -1417,7 +1399,6 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise 0) { @@ -1810,20 +1783,9 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise 0) { - fileChanges = diff; - } - } catch { - // Non-fatal - } } - // Strategy 3: provider-reported artifacts (files written outside workspace_path, + // Provider-reported artifacts (files written outside workspace_path, // e.g. copilot session-state). Merged on top of any workspace-based diff. const providerFileChanges = providerResponse?.fileChanges; if (providerFileChanges) { From 76f3b253d65226ed1ae2def620acf3b765facc99 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:34:52 +0000 Subject: [PATCH 8/9] fix: remove extra blank line --- packages/core/src/evaluation/orchestrator.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 5f8843665..0acf6dc24 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -626,7 +626,6 @@ export async function runEvaluation( const availablePoolSlots: PoolSlot[] = []; const poolSlotBaselines = new Map(); - // Pool capacity: how many slots can exist on disk (independent of worker count). // Workers acquire slots from the pool; the pool itself can be larger than any single run needs. const poolMaxSlots = Math.min(configPoolMaxSlots ?? 
10, 50); From c0e2afd0cd313827a12871bc6cfcfe0166f23049 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 12 Apr 2026 07:45:09 +0000 Subject: [PATCH 9/9] docs: remove snapshot fallback from file_changes description Co-Authored-By: Claude Sonnet 4.6 --- .../src/content/docs/docs/evaluators/code-graders.mdx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx index 7f989f8aa..1bec26b3b 100644 --- a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx +++ b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx @@ -224,11 +224,10 @@ This enables **functional grading** — running commands like `npm test`, `pytes #### What `file_changes` covers -`file_changes` is a unified diff built from up to three sources, merged in order: +`file_changes` is a unified diff built from two sources, merged in order: -1. **Git baseline** (default): `git diff` against a baseline commit taken before the agent ran. Captures edits to tracked files, new files at workspace root, and changes inside any nested git repos that were materialized via `workspace.repos` or set up via a `before_all` hook. -2. **Snapshot fallback**: When git is unavailable (read-only path, no `git` binary), AgentV falls back to a filesystem snapshot taken before the agent ran and diffs file contents directly. -3. **Provider-reported artifacts**: Copilot providers scan their session-state `files/` directory after each run and append those as synthetic diffs. This surfaces files the agent wrote *outside* `workspace_path` entirely (e.g. `~/.copilot/session-state//files/`). +1. **Git baseline**: `git diff` against a baseline commit taken before the agent ran. Captures edits, new files at workspace root, and changes inside any nested git repos materialized via `workspace.repos` or set up via a `before_all` hook. +2. 
**Provider-reported artifacts**: Copilot providers scan their session-state `files/` directory after each run and append those as synthetic diffs. This surfaces files the agent wrote *outside* `workspace_path` entirely (e.g. `~/.copilot/session-state/<session-id>/files/`). ### Example: Deploy-and-Test Pattern @@ -293,7 +292,7 @@ See `examples/features/functional-grading/` for a complete working example. |---------|----------------------| | `examples/features/functional-grading/` | `workspace_path` — deploy-and-test with `npm install` + `tsc` + `npm test` | | `examples/features/file-changes/` | `file_changes` — edits, creates, and deletes captured via git baseline | -| `examples/features/workspace-artifact/` | `file_changes` — new files captured even in a non-git workspace (snapshot fallback) | +| `examples/features/workspace-artifact/` | `file_changes` — new file generated by agent (CSV) captured via git baseline | | `examples/features/file-changes-with-repos/` | `file_changes` — workspace-root files AND changes inside nested repos both captured | ## Testing Locally