EntityProcess · christso · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026
diff --git a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx
@@ -188,8 +188,8 @@ Beyond the basic text fields (`input`, `output`, `expected_output`, `criteria`),
 | `duration_ms` | `number` | Total execution duration |
 | `start_time` | `string` | ISO timestamp of first event |
 | `end_time` | `string` | ISO timestamp of last event |
-| `file_changes` | `string \| null` | Unified diff of workspace file changes (when `workspace_template` is configured) |
-| `workspace_path` | `string \| null` | Absolute path to the workspace directory (when `workspace_template` is configured) |
+| `file_changes` | `string \| null` | Unified diff of workspace file changes (populated when `workspace` is configured; includes files at workspace root, changes inside nested repos, and Copilot session-state artifacts) |
+| `workspace_path` | `string \| null` | Absolute path to the temp workspace directory (populated when `workspace` is configured) |
 
 ### trace structure
 
@@ -215,13 +215,20 @@ Use `expected_output` for retrieval context in RAG evals (tool calls with output
 
 ## Workspace Access
 
-When `workspace_template` is configured on a target, code graders receive the workspace path in two ways:
+When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.path`, or `workspace.repos`), code graders receive the workspace path in two ways:
 
 1. **JSON payload**: `workspace_path` field in the stdin input
 2. **Environment variable**: `AGENTV_WORKSPACE_PATH`
 
 This enables **functional grading** — running commands like `npm test`, `pytest`, or `cargo test` directly in the agent's workspace.
 
+#### What `file_changes` covers
+
+`file_changes` is a unified diff built from two sources, merged in order:
+
+1. **Git baseline**: `git diff` against a baseline commit taken before the agent ran. Captures edits, new files at workspace root, and changes inside any nested git repos materialized via `workspace.repos` or set up via a `before_all` hook.
+2. **Provider-reported artifacts**: Copilot providers scan their session-state `files/` directory after each run and append the files found there as synthetic diffs. This surfaces files the agent wrote *outside* `workspace_path` entirely (e.g. `~/.copilot/session-state/<uuid>/files/`).
+
 ### Example: Deploy-and-Test Pattern
 
 ```typescript
@@ -260,14 +267,13 @@ console.log(JSON.stringify({
 ```
 
 ```yaml
-# targets.yaml
-targets:
-  - name: my_agent
-    provider: cli
-    command: "my-agent --task {INPUT_FILE} --output {OUTPUT_FILE}"
-    workspace_template: ./workspace-template
-
 # dataset.eval.yaml
+workspace:
+  template: ./workspace-template   # copied into a temp dir before each run
+
+execution:
+  target: my_agent
+
 tests:
   - id: implement-feature
     criteria: Agent implements the feature correctly
@@ -280,6 +286,15 @@ tests:
 
 See `examples/features/functional-grading/` for a complete working example.
 
+#### Examples
+
+| Example | What it demonstrates |
+|---------|----------------------|
+| `examples/features/functional-grading/` | `workspace_path` — deploy-and-test with `npm install` + `tsc` + `npm test` |
+| `examples/features/file-changes/` | `file_changes` — edits, creates, and deletes captured via git baseline |
+| `examples/features/workspace-artifact/` | `file_changes` — new file generated by agent (CSV) captured via git baseline |
+| `examples/features/file-changes-with-repos/` | `file_changes` — workspace-root files AND changes inside nested repos both captured |
+
 ## Testing Locally
 
 ### With `agentv eval assert`

diff --git a/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx b/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx
@@ -72,7 +72,7 @@ Score the response from 0.0 to 1.0 based on:
 | `input` | Full resolved input array, JSON-serialized |
 | `expected_output` | Full resolved expected array, JSON-serialized |
 | `output` | Full provider output array, JSON-serialized |
-| `file_changes` | Unified diff of workspace file changes (when `workspace_template` is configured) |
+| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 
 ## Per-Grader Target
 
@@ -227,7 +227,7 @@ Derived strings injected into grader prompts:
 | `input` | Full resolved input array, JSON-serialized |
 | `expected_output` | Full resolved expected array, JSON-serialized |
 | `output` | Full provider output array, JSON-serialized |
-| `file_changes` | Unified diff of workspace file changes (when `workspace_template` is configured) |
+| `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 
 **Example flow:**
 

diff --git a/examples/features/file-changes-with-repos/.agentv/targets.yaml b/examples/features/file-changes-with-repos/.agentv/targets.yaml
@@ -0,0 +1,11 @@
+targets:
+  # Mock agent that writes to the workspace root AND edits a file inside a nested git repo.
+  # Simulates an agent that produces an artifact alongside making code changes.
+  - name: mock_agent
+    provider: cli
+    command: >-
+      bash -c '
+      echo "Analysis complete: 3 tests passed, 0 failed." > report.txt &&
+      printf "\nexport function add(a: number, b: number): number {\n  return a + b;\n}\n" >> my-lib/utils.ts &&
+      echo "Done" > {OUTPUT_FILE}
+      '
diff --git a/examples/features/file-changes-with-repos/evals/eval.yaml b/examples/features/file-changes-with-repos/evals/eval.yaml
@@ -0,0 +1,59 @@
+# File-changes with nested git repos
+#
+# Proves that file_changes captures BOTH:
+#   1. Files created at the workspace root (alongside repos)
+#   2. Changes made inside nested git repositories
+#
+# Setup:
+#   - workspace.template copies workspace-template/ into the temp workspace
+#   - before_all hook initialises my-lib/ as a git repo inside the workspace
+#   - initializeBaseline (runs after before_all) sees my-lib/ as a gitlink, because it now contains its own .git
+#
+# Agent behaviour:
+#   - Writes report.txt to workspace root (not inside any repo)
+#   - Appends a function to my-lib/utils.ts (inside the nested repo)
+#
+# How file_changes captures both:
+#   - Workspace-root diff: report.txt shows as a new file in the outer git diff
+#   - Nested repo diff:    my-lib gitlink hash changes; AgentV diffs my-lib/
+#     individually and stitches the per-file diffs into file_changes
+
+name: file-changes-with-repos
+description: Verify file_changes captures workspace-root files AND changes inside nested repos
+
+workspace:
+  template: ../workspace-template
+  hooks:
+    before_all:
+      command:
+        - bash
+        - -c
+        - >-
+          cd "{{workspace_path}}/my-lib" &&
+          git -c init.defaultBranch=main init &&
+          git -c user.email=test@agentv.dev -c user.name="AgentV Test" add . &&
+          git -c user.email=test@agentv.dev -c user.name="AgentV Test" commit -m "init"
+
+execution:
+  target: mock_agent
+
+tests:
+  - id: root-file-and-repo-change
+    criteria: >-
+      The agent writes report.txt to the workspace root and appends an add()
+      function to my-lib/utils.ts.
+      file_changes must show both: the new workspace-root file and the
+      modification inside the nested repo.
+
+    input:
+      - role: user
+        content:
+          - type: text
+            value: >-
+              Write a one-line summary to report.txt at the workspace root.
+              Then add an add(a, b) function to my-lib/utils.ts.
+
+    assertions:
+      - name: check-root-and-repo-changes
+        type: code-grader
+        command: ["bun", "run", "../scripts/check-file-changes.ts"]
diff --git a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts
@@ -0,0 +1,60 @@
+#!/usr/bin/env bun
+/**
+ * Code grader: reads the grader payload (JSON) from stdin and verifies that
+ * file_changes captures BOTH workspace-root files and nested-repo changes.
+ * Prints `{ score, assertions }` as JSON; score = passed / total assertions.
+ * Expected diff should include:
+ *   - report.txt       (new file at workspace root)
+ *   - my-lib/utils.ts  (modification inside the nested repo)
+ */
+import { readFileSync } from 'node:fs';
+
+const input = JSON.parse(readFileSync('/dev/stdin', 'utf-8')) as {
+  file_changes: string | null;
+};
+
+const fileChanges = input.file_changes ?? ''; // a null diff is treated as an empty one
+const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = [];
+
+if (!fileChanges || fileChanges.trim().length === 0) {
+  assertions.push({
+    text: 'file_changes is non-empty',
+    passed: false,
+    evidence: 'file_changes is empty — workspace not configured or file tracking failed',
+  });
+  console.log(JSON.stringify({ score: 0, assertions }));
+  process.exit(0); // the remaining checks all need a diff, so stop after reporting score 0
+}
+
+assertions.push({ text: 'file_changes is non-empty', passed: true });
+
+// Check 1: workspace-root file appears in diff (plain substring match on the file name)
+const hasRootFile = fileChanges.includes('report.txt');
+assertions.push({
+  text: 'diff captures workspace-root file (report.txt)',
+  passed: hasRootFile,
+  evidence: hasRootFile
+    ? undefined
+    : `file_changes did not mention report.txt.\nDiff:\n${fileChanges.slice(0, 500)}`, // first 500 chars only
+});
+
+// Check 2: nested repo change appears in diff; a bare "utils.ts" also matches, so any path prefix is accepted
+const hasRepoChange = fileChanges.includes('my-lib/utils.ts') || fileChanges.includes('utils.ts');
+assertions.push({
+  text: 'diff captures nested-repo change (my-lib/utils.ts)',
+  passed: hasRepoChange,
+  evidence: hasRepoChange
+    ? undefined
+    : `file_changes did not mention utils.ts.\nDiff:\n${fileChanges.slice(0, 500)}`, // first 500 chars only
+});
+
+// Check 3: diff shows the add function was added (looks for the literal "+export function add")
+const hasAddFn = fileChanges.includes('+export function add');
+assertions.push({
+  text: 'diff shows add() function was added',
+  passed: hasAddFn,
+  evidence: hasAddFn ? undefined : 'add() function not found in diff',
+});
+
+const passed = assertions.filter((a) => a.passed).length; // number of passing assertions
+console.log(JSON.stringify({ score: passed / assertions.length, assertions }));
diff --git a/examples/features/file-changes-with-repos/workspace-template/README.md b/examples/features/file-changes-with-repos/workspace-template/README.md
@@ -0,0 +1,3 @@
+# My Project
+
+A sample project workspace used for AgentV evaluation.
diff --git a/examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts b/examples/features/file-changes-with-repos/workspace-template/my-lib/utils.ts
@@ -0,0 +1,7 @@
+/**
+ * Utility functions for the project.
+ */
+
+export function greet(name: string): string {
+  return `Hello, ${name}!`;
+}
diff --git a/examples/features/workspace-artifact/.agentv/targets.yaml b/examples/features/workspace-artifact/.agentv/targets.yaml
@@ -0,0 +1,11 @@
+targets:
+  # Mock CLI agent that writes a CSV report to outputs/report.csv under workspace_path.
+  # Simulates what a real agent (e.g. Copilot) would do when asked to generate a report.
+  - name: mock_csv_agent
+    provider: cli
+    command: >-
+      bash -c '
+      mkdir -p outputs &&
+      printf "metric,value,status\ncoverage,87.3,pass\nlatency_p99_ms,142,pass\nerror_rate,0.02,pass\n" > outputs/report.csv &&
+      echo "Generated outputs/report.csv" > {OUTPUT_FILE}
+      '
diff --git a/examples/features/workspace-artifact/evals/eval.yaml b/examples/features/workspace-artifact/evals/eval.yaml
@@ -0,0 +1,50 @@
+# Workspace artifact example
+#
+# Demonstrates that file_changes captures files generated by agents under
+# workspace_path even when workspace_path is not a pre-existing git repo.
+#
+# Scenario:
+#   A mock CLI agent is asked to produce a CSV report. It writes the CSV
+#   directly into workspace_path (the temp workspace created from the
+#   template). AgentV takes a baseline snapshot before the agent runs and
+#   diffs it afterwards, populating file_changes with the new CSV content.
+#   A code grader then checks the CSV is present via the file_changes field of its stdin payload.
+#
+# RED (before fix): Without workspace configured, agents like Copilot that
+#   save artifacts to their session-state path can't be evaluated because
+#   file_changes is always empty.
+#
+# GREEN (after fix): With workspace configured, the snapshot baseline tracks
+#   any file written under workspace_path — no git required. Provider-reported
+#   fileChanges additionally surfaces files written outside workspace_path
+#   (e.g. Copilot session-state) directly from the provider response.
+
+name: workspace-artifact
+description: Verify file_changes captures generated artifacts (CSV) under workspace_path
+
+workspace:
+  template: ../workspace-template
+
+execution:
+  target: mock_csv_agent
+
+tests:
+  - id: csv-report-generated
+    criteria: >-
+      The agent must produce a CSV report at outputs/report.csv.
+      The file_changes diff should show the CSV was created with the correct
+      header row and at least one data row.
+
+    input:
+      - role: user
+        content:
+          - type: text
+            value: >-
+              Generate a CSV report with analysis results and save it to
+              outputs/report.csv. The CSV must have a header row and at least
+              one data row.
+
+    assertions:
+      - name: csv-in-file-changes
+        type: code-grader
+        command: ["bun", "run", "../scripts/check-csv-artifact.ts"]
diff --git a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts
@@ -0,0 +1,96 @@
+#!/usr/bin/env bun
+/**
+ * Code grader: checks that file_changes contains outputs/report.csv
+ * with a header row and at least one data row.
+ *
+ * This grader is intentionally self-contained — no LLM required.
+ * It proves the workspace-snapshot feature is working by inspecting
+ * the file_changes diff captured from the temp workspace.
+ *
+ * Input:  JSON on stdin; only `file_changes` (unified diff or null) is used.
+ * Output: JSON on stdout, `{ score, assertions }`, score = passed / total.
+ */
+import { readFileSync } from 'node:fs';
+
+/** Path of the expected artifact as it appears in the diff. */
+const CSV_PATH = 'outputs/report.csv';
+
+/**
+ * Returns the per-file section(s) of a unified diff whose `diff --git` header
+ * names `path`, so that added lines from unrelated files are not mistaken for
+ * CSV rows. Falls back to the whole diff when no such header is found (for
+ * example, when the diff carries no `diff --git` headers at all).
+ */
+function diffSectionFor(diff: string, path: string): string {
+  // Zero-width split keeps each `diff --git` header attached to its section.
+  const sections = diff.split(/^(?=diff --git )/m);
+  const matching = sections.filter((section) => {
+    const header = section.split('\n', 1)[0] ?? '';
+    return header.startsWith('diff --git ') && header.includes(path);
+  });
+  return matching.length > 0 ? matching.join('') : diff;
+}
+
+const input = JSON.parse(readFileSync('/dev/stdin', 'utf-8')) as {
+  file_changes: string | null;
+  criteria: string | null;
+};
+
+// A null file_changes is handled the same way as an empty diff.
+const fileChanges: string = input.file_changes ?? '';
+
+const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = [];
+
+// Check 1: file_changes is non-empty. Nothing else can be checked without a
+// diff, so report a zero score and stop.
+if (fileChanges.trim().length === 0) {
+  assertions.push({
+    text: 'file_changes is non-empty',
+    passed: false,
+    evidence: 'file_changes is empty — workspace snapshot or git baseline may not be configured',
+  });
+  console.log(JSON.stringify({ score: 0, assertions }));
+  process.exit(0);
+}
+
+assertions.push({ text: 'file_changes is non-empty', passed: true });
+
+// Check 2: diff mentions outputs/report.csv
+const hasCsvFile = fileChanges.includes(CSV_PATH);
+assertions.push({
+  text: 'diff contains outputs/report.csv',
+  passed: hasCsvFile,
+  evidence: hasCsvFile
+    ? undefined
+    : `file_changes did not mention outputs/report.csv. Got:\n${fileChanges.slice(0, 500)}`,
+});
+
+// Extract CSV lines from the report's own diff section: added lines start
+// with '+', while the '+++' file header is not content.
+const csvLines = diffSectionFor(fileChanges, CSV_PATH)
+  .split('\n')
+  .filter((line) => line.startsWith('+') && !line.startsWith('+++'))
+  .map((line) => line.slice(1)); // strip leading '+'
+
+// Check 3: has header row (first added line contains a comma)
+const headerLine = csvLines[0] ?? '';
+const hasHeader = headerLine.includes(',');
+assertions.push({
+  text: 'CSV has a header row',
+  passed: hasHeader,
+  evidence: hasHeader ? undefined : `First CSV line: "${headerLine}"`,
+});
+
+// Check 4: has at least one data row
+const dataRows = csvLines.slice(1).filter((l) => l.trim().length > 0 && l.includes(','));
+const hasDataRow = dataRows.length > 0;
+assertions.push({
+  text: 'CSV has at least one data row',
+  passed: hasDataRow,
+  evidence: hasDataRow ? undefined : 'No data rows found after the header',
+});
+
+const passed = assertions.filter((a) => a.passed).length;
+const score = passed / assertions.length;
+
+console.log(JSON.stringify({ score, assertions }));
diff --git a/examples/features/workspace-artifact/workspace-template/.gitkeep b/examples/features/workspace-artifact/workspace-template/.gitkeep
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# My Project

		A sample project workspace used for AgentV evaluation.