From 547e782a648a218dce17fdc8dfc14778501a5a12 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 13 Apr 2026 22:57:47 +0000 Subject: [PATCH 1/2] fix(dry-run): return schema-valid mock responses to prevent grader crashes VSCode provider dry-run mode returned empty `output: []`, causing evaluators to receive an empty candidate string: - `is-json` would fail (empty string is not valid JSON) - `contains`/`equals`/`regex` would fail trivially - `execution-metrics` would report "Token usage data not available" Fix: return `output: [{ role: 'assistant', content: '{}' }]` plus zeroed `tokenUsage: { input: 0, output: 0 }` in both `invoke()` and `invokeBatch()` dry-run paths. The `'{}'` response is valid JSON and a non-empty string, satisfying all built-in graders without crashing. Closes #1088 Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 2 +- .../evaluation/providers/vscode-provider.ts | 6 +- .../providers/vscode-provider-dry-run.test.ts | 89 +++++++++++++++++++ 3 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts diff --git a/AGENTS.md b/AGENTS.md index 4ca7d7e8b..49b50760f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -284,7 +284,7 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m 4. **Update baseline files** if output format changes (e.g., type name renames). Baseline files live alongside eval YAML files as `*.baseline.jsonl` and contain expected `scores[].type` values. There are 30+ baseline files across `examples/`. -5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic. +5. **Note:** `--dry-run` returns schema-valid mock responses (`{}` as output, zeroed `tokenUsage`). Built-in graders will not crash, but scores are meaningless. Use it for testing harness flow, not evaluator logic. ### Completing Work — E2E Checklist diff --git a/packages/core/src/evaluation/providers/vscode-provider.ts b/packages/core/src/evaluation/providers/vscode-provider.ts index dc41c2a6f..2e97c5654 100644 --- a/packages/core/src/evaluation/providers/vscode-provider.ts +++ b/packages/core/src/evaluation/providers/vscode-provider.ts @@ -75,8 +75,9 @@ export class VSCodeProvider implements Provider { if (this.config.dryRun) { return { - output: [], + output: [{ role: 'assistant' as const, content: '{}' }], durationMs, + tokenUsage: { input: 0, output: 0 }, raw: { session, inputFiles, @@ -146,8 +147,9 @@ export class VSCodeProvider implements Provider { if (this.config.dryRun) { return normalizedRequests.map(({ inputFiles }) => ({ - output: [], + output: [{ role: 'assistant' as const, content: '{}' }], durationMs: perRequestDurationMs, + tokenUsage: { input: 0, output: 0 }, raw: { session, inputFiles, diff --git a/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts b/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts new file mode 100644 index 000000000..989cc9ea0 --- /dev/null +++ b/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts @@ -0,0 +1,89 @@ +import os from 'node:os'; +import path from 'node:path'; +import { rm, mkdir, writeFile } from 'node:fs/promises'; +import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; + +// Mock vscode dispatch to skip actual VS Code invocation +vi.mock('../../../src/evaluation/providers/vscode/index.js', () => ({ + dispatchAgentSession: vi.fn().mockResolvedValue({ + exitCode: 0, + subagentName: 'subagent-1', + responseFile: '/fake/response.md', + tempFile: '/fake/response.tmp.md', + }), + dispatchBatchAgent: vi.fn().mockResolvedValue({ + exitCode: 0, + responseFiles: ['/fake/response1.md'], + }), + getSubagentRoot: vi.fn().mockReturnValue('/fake/subagents'), + provisionSubagents: vi.fn().mockResolvedValue({ created: [], skippedExisting: [] }), +})); + +import { VSCodeProvider } from '../../../src/evaluation/providers/vscode-provider.js'; + +let tmpDir: string; +let fakeExecutable: string; + +beforeAll(async () => { + const dir = path.join(os.tmpdir(), `agentv-test-dry-run-${Date.now()}`); + await mkdir(dir, { recursive: true }); + fakeExecutable = path.join(dir, 'code'); + await writeFile(fakeExecutable, '#!/bin/sh\n'); + tmpDir = dir; +}); + +afterAll(async () => { + await rm(tmpDir, { recursive: true, force: true }); +}); + +describe('VSCodeProvider dry-run response shape', () => { + it('returns non-empty output so graders do not crash', async () => { + const provider = new VSCodeProvider( + 'test', + { executable: fakeExecutable, waitForResponse: true, dryRun: true }, + 'vscode', + ); + const response = await provider.invoke({ question: 'ping' }); + + expect(response.output).toHaveLength(1); + expect(response.output![0]!.role).toBe('assistant'); + }); + + it('returns valid JSON content so is-json grader passes', async () => { + const provider = new VSCodeProvider( + 'test', + { executable: fakeExecutable, waitForResponse: true, dryRun: true }, + 'vscode', + ); + const response = await provider.invoke({ question: 'ping' }); + + const content = response.output![0]!.content; + expect(() => JSON.parse(content as string)).not.toThrow(); + }); + + it('returns zeroed tokenUsage so execution-metrics grader does not report missing data', async () => { + const provider = new VSCodeProvider( + 'test', + { executable: fakeExecutable, waitForResponse: true, dryRun: true }, + 'vscode', + ); + const response = await provider.invoke({ question: 'ping' }); + + expect(response.tokenUsage).toEqual({ input: 0, output: 0 }); + }); + + it('batch invoke returns the same schema-valid shape per response', async () => { + const provider = new VSCodeProvider( + 'test', + { executable: fakeExecutable, waitForResponse: true, dryRun: true }, + 'vscode', + ); + const responses = await provider.invokeBatch!([{ question: 'ping' }]); + + expect(responses).toHaveLength(1); + expect(responses[0]!.output).toHaveLength(1); + expect(responses[0]!.output![0]!.role).toBe('assistant'); + expect(() => JSON.parse(responses[0]!.output![0]!.content as string)).not.toThrow(); + expect(responses[0]!.tokenUsage).toEqual({ input: 0, output: 0 }); + }); +}); From cdcee62a00fa25ab529feb5a3b8bbc3fd2d93576 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 13 Apr 2026 22:59:53 +0000 Subject: [PATCH 2/2] test(dry-run): fix noNonNullAssertion lint errors in regression test Co-Authored-By: Claude Sonnet 4.6 --- .../providers/vscode-provider-dry-run.test.ts | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts b/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts index 989cc9ea0..87e2f2c44 100644 --- a/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts +++ b/packages/core/test/evaluation/providers/vscode-provider-dry-run.test.ts @@ -1,6 +1,6 @@ +import { mkdir, rm, writeFile } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; -import { rm, mkdir, writeFile } from 'node:fs/promises'; import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; // Mock vscode dispatch to skip actual VS Code invocation @@ -46,7 +46,7 @@ describe('VSCodeProvider dry-run response shape', () => { const response = await provider.invoke({ question: 'ping' }); expect(response.output).toHaveLength(1); - expect(response.output![0]!.role).toBe('assistant'); + expect(response.output?.at(0)?.role).toBe('assistant'); }); it('returns valid JSON content so is-json grader passes', async () => { @@ -57,7 +57,7 @@ describe('VSCodeProvider dry-run response shape', () => { ); const response = await provider.invoke({ question: 'ping' }); - const content = response.output![0]!.content; + const content = response.output?.at(0)?.content; expect(() => JSON.parse(content as string)).not.toThrow(); }); @@ -78,12 +78,13 @@ describe('VSCodeProvider dry-run response shape', () => { { executable: fakeExecutable, waitForResponse: true, dryRun: true }, 'vscode', ); - const responses = await provider.invokeBatch!([{ question: 'ping' }]); + const responses = await provider.invokeBatch?.([{ question: 'ping' }]); expect(responses).toHaveLength(1); - expect(responses[0]!.output).toHaveLength(1); - expect(responses[0]!.output![0]!.role).toBe('assistant'); - expect(() => JSON.parse(responses[0]!.output![0]!.content as string)).not.toThrow(); - expect(responses[0]!.tokenUsage).toEqual({ input: 0, output: 0 }); + const first = responses?.at(0); + expect(first?.output).toHaveLength(1); + expect(first?.output?.at(0)?.role).toBe('assistant'); + expect(() => JSON.parse(first?.output?.at(0)?.content as string)).not.toThrow(); + expect(first?.tokenUsage).toEqual({ input: 0, output: 0 }); }); });