diff --git a/apps/cli/src/commands/validate/validate-files.ts b/apps/cli/src/commands/validate/validate-files.ts index b00c27b47..f9a5021ba 100644 --- a/apps/cli/src/commands/validate/validate-files.ts +++ b/apps/cli/src/commands/validate/validate-files.ts @@ -5,6 +5,7 @@ import { type ValidationResult, type ValidationSummary, detectFileType, + validateCasesFile, validateConfigFile, validateEvalFile, validateFileReferences, @@ -17,12 +18,7 @@ import fg from 'fast-glob'; */ export async function validateFiles(paths: readonly string[]): Promise { const filePaths = await expandPaths(paths); - const results: ValidationResult[] = []; - - for (const filePath of filePaths) { - const result = await validateSingleFile(filePath); - results.push(result); - } + const results = await Promise.all(filePaths.map((filePath) => validateSingleFile(filePath))); const validFiles = results.filter((r) => r.valid).length; const invalidFiles = results.filter((r) => !r.valid).length; @@ -58,10 +54,27 @@ async function validateSingleFile(filePath: string): Promise { }; } } + } else if (fileType === 'cases') { + result = await validateCasesFile(absolutePath); } else if (fileType === 'targets') { result = await validateTargetsFile(absolutePath); - } else { + } else if (fileType === 'config') { result = await validateConfigFile(absolutePath); + } else { + // Unknown file type — skip validation, report as skipped + result = { + valid: true, + filePath: absolutePath, + fileType: 'unknown', + errors: [ + { + severity: 'warning', + filePath: absolutePath, + message: + 'File type not recognized. Eval files must end in .eval.yaml. Skipping validation.', + }, + ], + }; } return result; @@ -130,7 +143,7 @@ async function findYamlFiles(dirPath: string): Promise { } const subFiles = await findYamlFiles(fullPath); results.push(...subFiles); - } else if (entry.isFile() && isYamlFile(entry.name)) { + } else if (entry.isFile() && isEvalYamlFile(entry.name)) { results.push(fullPath); } } @@ -145,3 +158,9 @@ function isYamlFile(filePath: string): boolean { const ext = path.extname(filePath).toLowerCase(); return ext === '.yaml' || ext === '.yml'; } + +/** Returns true only for *.eval.yaml / *.eval.yml files (used for directory scanning). */ +function isEvalYamlFile(filePath: string): boolean { + const lower = path.basename(filePath).toLowerCase(); + return lower.endsWith('.eval.yaml') || lower.endsWith('.eval.yml'); +} diff --git a/examples/features/basic-jsonl/evals/dataset.eval.yaml b/examples/features/basic-jsonl/evals/dataset.eval.yaml index c226536db..508c684c0 100644 --- a/examples/features/basic-jsonl/evals/dataset.eval.yaml +++ b/examples/features/basic-jsonl/evals/dataset.eval.yaml @@ -7,6 +7,4 @@ name: basic-jsonl execution: target: llm -evaluator: llm_grader - tests: ./dataset.jsonl diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml index 6917de1bd..81e221837 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml @@ -18,8 +18,9 @@ tests: - type: text value: What are the main benefits of TypeScript over JavaScript? - reference_answer: |- - TypeScript provides static type checking, better IDE support, and improved maintainability. + expected_output: + - role: assistant + content: TypeScript provides static type checking, better IDE support, and improved maintainability. assertions: - name: custom-prompt-eval @@ -37,8 +38,9 @@ tests: - type: text value: Explain async/await in JavaScript. - reference_answer: |- - Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous. + expected_output: + - role: assistant + content: Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous. assertions: - name: strict-eval diff --git a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml index 07609020a..a978cee75 100644 --- a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml +++ b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml @@ -23,10 +23,8 @@ tests: # Expected score: 1.0 (webSearch before fetchPage is satisfied) # ============================================================================= - id: in-order-validation - description: |- - Validates that the agent performs web search before fetching page details. - Mode 'in_order' allows other tool calls between expected tools. - + # Validates that the agent performs web search before fetching page details. + # Mode 'in_order' allows other tool calls between expected tools. criteria: |- Agent searches for product information, then fetches detailed specs. @@ -49,10 +47,8 @@ tests: # Expected score: 1.0 (matches full trace exactly) # ============================================================================= - id: exact-sequence-validation - description: |- - Validates the exact sequence of all tool calls in the trace. - Mode 'exact' requires the trace to match precisely. - + # Validates the exact sequence of all tool calls in the trace. + # Mode 'exact' requires the trace to match precisely. criteria: |- Agent follows the exact research workflow: search, fetch, search reviews, summarize. @@ -76,10 +72,8 @@ tests: # Expected score: 1.0 (meets all minimums) # ============================================================================= - id: any-order-with-minimums - description: |- - Validates that the agent performs adequate research by checking minimum - tool call counts. Mode 'any_order' with minimums is flexible on sequence. - + # Validates that the agent performs adequate research by checking minimum + # tool call counts. Mode 'any_order' with minimums is flexible on sequence. criteria: |- Agent performs at least 2 web searches and 1 page fetch for thorough research. @@ -101,10 +95,8 @@ tests: # Expected score: 1.0 (inputs match expected patterns) # ============================================================================= - id: tool-input-validation - description: |- - Validates that tool calls include appropriate input parameters. - Useful for ensuring the agent provides correct context to tools. - + # Validates that tool calls include appropriate input parameters. + # Useful for ensuring the agent provides correct context to tools. criteria: |- Agent searches with relevant product keywords and fetches from authoritative source. @@ -130,10 +122,8 @@ tests: # Expected score: 1.0 (outputs contain expected fields) # ============================================================================= - id: tool-output-validation - description: |- - Validates that tool outputs contain expected data. - Useful for regression testing when tool behavior changes. - + # Validates that tool outputs contain expected data. + # Useful for regression testing when tool behavior changes. criteria: |- Web search returns results with links and snippets; fetch returns product specs. @@ -161,10 +151,8 @@ tests: # This mirrors patterns used in complex agent pipelines # ============================================================================= - id: combined-validation - description: |- - Production-style evaluation combining sequence validation with input/output - checks. Demonstrates a realistic multi-turn agent workflow. - + # Production-style evaluation combining sequence validation with input/output + # checks. Demonstrates a realistic multi-turn agent workflow. criteria: |- Agent performs comprehensive product research: 1. Initial web search for product specs diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md index fe779397a..cbf91eaca 100644 --- a/examples/showcase/tool-evaluation-plugins/README.md +++ b/examples/showcase/tool-evaluation-plugins/README.md @@ -56,7 +56,7 @@ evaluators: export TOOL_EVAL_PLUGINS_DIR=$(pwd)/examples/showcase/tool-evaluation-plugins # Run the demo -npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml +npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml ``` ## Input Contract diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml similarity index 98% rename from examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml rename to examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml index 0dab62823..5ba64e4d2 100644 --- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml @@ -5,7 +5,7 @@ # semantic evaluation capabilities that require domain-specific logic. # # Run: cd examples/showcase/tool-evaluation-plugins -# npx agentv eval tool-eval-demo.yaml --target mock_agent +# npx agentv eval tool-eval-demo.eval.yaml --target mock_agent description: Showcase of tool evaluation plugin patterns diff --git a/packages/core/src/evaluation/validation/cases-validator.ts b/packages/core/src/evaluation/validation/cases-validator.ts new file mode 100644 index 000000000..d88a7a232 --- /dev/null +++ b/packages/core/src/evaluation/validation/cases-validator.ts @@ -0,0 +1,98 @@ +import { readFile } from 'node:fs/promises'; +import path from 'node:path'; +import { parse } from 'yaml'; + +import type { ValidationError, ValidationResult } from './types.js'; + +type JsonValue = string | number | boolean | null | JsonObject | JsonArray; +type JsonObject = { readonly [key: string]: JsonValue }; +type JsonArray = readonly JsonValue[]; + +function isObject(value: unknown): value is JsonObject { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +/** + * Validate a cases file — a YAML file whose root is an array of test case objects. + * + * Cases files are referenced from eval files via `tests: path/to/cases.yaml` or + * `file://cases/accuracy.yaml` entries in the tests array. Each item must have + * at least an `id` (non-empty string) and an `input` (string or array). + */ +export async function validateCasesFile(filePath: string): Promise { + const errors: ValidationError[] = []; + const absolutePath = path.resolve(filePath); + + let parsed: unknown; + try { + const content = await readFile(absolutePath, 'utf8'); + parsed = parse(content); + } catch (error) { + errors.push({ + severity: 'error', + filePath: absolutePath, + message: `Failed to parse YAML: ${(error as Error).message}`, + }); + return { valid: false, filePath: absolutePath, fileType: 'cases', errors }; + } + + if (!Array.isArray(parsed)) { + errors.push({ + severity: 'error', + filePath: absolutePath, + message: 'Cases file must contain a YAML array of test case objects', + }); + return { valid: false, filePath: absolutePath, fileType: 'cases', errors }; + } + + for (let i = 0; i < parsed.length; i++) { + const item = parsed[i]; + const location = `[${i}]`; + + if (!isObject(item)) { + errors.push({ + severity: 'error', + filePath: absolutePath, + location, + message: 'Each test case must be an object', + }); + continue; + } + + // Required: id + const id = item.id; + if (typeof id !== 'string' || id.trim().length === 0) { + errors.push({ + severity: 'error', + filePath: absolutePath, + location: `${location}.id`, + message: "Missing or invalid 'id' field (must be a non-empty string)", + }); + } + + // Required: input + const input = item.input; + if (input === undefined) { + errors.push({ + severity: 'error', + filePath: absolutePath, + location: `${location}.input`, + message: "Missing 'input' field (must be a string or array of messages)", + }); + } else if (typeof input !== 'string' && !Array.isArray(input)) { + errors.push({ + severity: 'error', + filePath: absolutePath, + location: `${location}.input`, + message: "Invalid 'input' field (must be a string or array of messages)", + }); + } + } + + return { + valid: errors.filter((e) => e.severity === 'error').length === 0, + filePath: absolutePath, + fileType: 'cases', + errors, + }; +} diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index e35f46287..7c6bbc392 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -382,7 +382,6 @@ const EvalTestSchema = z.object({ metadata: z.record(z.unknown()).optional(), conversation_id: z.string().optional(), suite: z.string().optional(), - note: z.string().optional(), depends_on: z.array(z.string()).optional(), on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(), mode: z.enum(['conversation']).optional(), diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 576dc9f78..f22f82e15 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -1,4 +1,4 @@ -import { readFile } from 'node:fs/promises'; +import { readFile, readdir } from 'node:fs/promises'; import path from 'node:path'; import { parse } from 'yaml'; @@ -36,6 +36,7 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([ '$schema', 'name', 'description', + 'category', 'version', 'author', 'tags', @@ -44,14 +45,25 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([ 'input', 'input_files', 'tests', - 'eval_cases', 'target', 'execution', 'assertions', 'evaluators', + 'preprocessors', 'workspace', ]); +/** + * Deprecated top-level fields with migration hints. + * These are still processed by yaml-parser but authors should migrate. + */ +const DEPRECATED_TOP_LEVEL_FIELDS = new Map([ + ['eval_cases', "'eval_cases' is deprecated. Use 'tests' instead."], + ['evalcases', "'evalcases' is deprecated. Use 'tests' instead."], + ['evaluator', "'evaluator' is deprecated. Use 'assertions' instead."], + ['assert', "'assert' is deprecated. Use 'assertions' instead."], +]); + /** Known fields at the test level. */ const KNOWN_TEST_FIELDS = new Set([ 'id', @@ -61,12 +73,12 @@ const KNOWN_TEST_FIELDS = new Set([ 'expected_output', 'assertions', 'evaluators', + 'rubrics', 'execution', 'workspace', 'metadata', 'conversation_id', 'suite', - 'note', 'depends_on', 'on_dependency_failure', 'mode', @@ -76,9 +88,66 @@ const KNOWN_TEST_FIELDS = new Set([ 'window_size', ]); +/** + * Deprecated test-level fields with migration hints. + * These are still processed by yaml-parser but authors should migrate. + */ +const DEPRECATED_TEST_FIELDS = new Map([ + ['evaluator', "'evaluator' is deprecated. Use 'assertions' instead."], + ['assert', "'assert' is deprecated. Use 'assertions' instead."], + ['expected_outcome', "'expected_outcome' is deprecated. Use 'criteria' instead."], +]); + /** Name field pattern: lowercase alphanumeric with hyphens. */ const NAME_PATTERN = /^[a-z0-9-]+$/; +/** Script file extensions recognised as custom assertion plugins. */ +const ASSERTION_SCRIPT_EXTENSIONS = new Set(['.ts', '.js', '.mts', '.mjs', '.cts', '.cjs']); + +/** Cache: directory path → promise of discovered type names. */ +const customAssertionCache = new Map>>(); + +/** + * Walk up the directory tree from `baseDir` collecting type names from + * `.agentv/assertions/` directories — mirrors the runtime discovery in + * `assertion-discovery.ts`. + * + * Results are cached by directory so concurrent validation of many files + * in the same directory only does the filesystem walk once. + */ +function discoverCustomAssertionTypes(baseDir: string): Promise> { + const resolved = path.resolve(baseDir); + const cached = customAssertionCache.get(resolved); + if (cached) return cached; + + const promise = (async () => { + const types = new Set(); + let dir = resolved; + const root = path.parse(dir).root; + + while (dir !== root) { + const assertionsDir = path.join(dir, '.agentv', 'assertions'); + try { + const entries = await readdir(assertionsDir, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isFile()) continue; + const ext = path.extname(entry.name).toLowerCase(); + if (!ASSERTION_SCRIPT_EXTENSIONS.has(ext)) continue; + types.add(entry.name.slice(0, -ext.length)); + } + } catch { + // Directory doesn't exist — skip + } + dir = path.dirname(dir); + } + + return types; + })(); + + customAssertionCache.set(resolved, promise); + return promise; +} + function isObject(value: unknown): value is JsonObject { return typeof value === 'object' && value !== null && !Array.isArray(value); } @@ -89,6 +158,7 @@ function isObject(value: unknown): value is JsonObject { export async function validateEvalFile(filePath: string): Promise { const errors: ValidationError[] = []; const absolutePath = path.resolve(filePath); + const customAssertionTypes = await discoverCustomAssertionTypes(path.dirname(absolutePath)); let parsed: unknown; try { @@ -125,9 +195,17 @@ export async function validateEvalFile(filePath: string): Promise = new Set(), ): void { if (!Array.isArray(assertField)) { errors.push({ @@ -669,7 +756,7 @@ function validateAssertArray( // Normalize snake_case to kebab-case for backward compatibility const typeValue = rawTypeValue.replace(/_/g, '-'); - if (!isEvaluatorKind(typeValue)) { + if (!isEvaluatorKind(typeValue) && !customAssertionTypes.has(typeValue)) { errors.push({ severity: 'warning', filePath, diff --git a/packages/core/src/evaluation/validation/file-type.ts b/packages/core/src/evaluation/validation/file-type.ts index 46e87bef5..464610c61 100644 --- a/packages/core/src/evaluation/validation/file-type.ts +++ b/packages/core/src/evaluation/validation/file-type.ts @@ -20,6 +20,11 @@ export async function detectFileType(filePath: string): Promise { const content = await readFile(filePath, 'utf8'); const parsed = parse(content) as unknown; + // YAML array root → cases file (array of test case objects) + if (Array.isArray(parsed)) { + return 'cases'; + } + if (typeof parsed !== 'object' || parsed === null) { return inferFileTypeFromPath(filePath); } @@ -65,8 +70,14 @@ function inferFileTypeFromPath(filePath: string): FileType { } } - // Default to eval file - return 'eval'; + // Require .eval.yaml / .eval.yml suffix for eval files + const lower = basename.toLowerCase(); + if (lower.endsWith('.eval.yaml') || lower.endsWith('.eval.yml')) { + return 'eval'; + } + + // Unrecognized — do not assume eval type + return 'unknown'; } /** diff --git a/packages/core/src/evaluation/validation/index.ts b/packages/core/src/evaluation/validation/index.ts index 9347974f3..f5874ee9d 100644 --- a/packages/core/src/evaluation/validation/index.ts +++ b/packages/core/src/evaluation/validation/index.ts @@ -4,6 +4,7 @@ export { detectFileType, isValidSchema, getExpectedSchema } from './file-type.js'; export { validateEvalFile } from './eval-validator.js'; +export { validateCasesFile } from './cases-validator.js'; export { validateTargetsFile } from './targets-validator.js'; export { validateConfigFile } from './config-validator.js'; export { validateFileReferences } from './file-reference-validator.js'; diff --git a/packages/core/src/evaluation/validation/types.ts b/packages/core/src/evaluation/validation/types.ts index b49a0bed8..8aafe5aee 100644 --- a/packages/core/src/evaluation/validation/types.ts +++ b/packages/core/src/evaluation/validation/types.ts @@ -2,7 +2,7 @@ * Validation result types for AgentV file validation. */ -export type FileType = 'eval' | 'targets' | 'config' | 'unknown'; +export type FileType = 'eval' | 'targets' | 'config' | 'cases' | 'unknown'; export type ValidationSeverity = 'error' | 'warning'; diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index f6b819479..7992160b8 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -993,7 +993,6 @@ tests: value: "world" metadata: tag: test - note: A note `, ); @@ -1051,8 +1050,8 @@ tests: }); describe('removed legacy fields', () => { - it('warns on expected_outcome as unknown field', async () => { - const filePath = path.join(tempDir, 'expected-outcome-unknown.yaml'); + it('warns on expected_outcome as deprecated field', async () => { + const filePath = path.join(tempDir, 'expected-outcome-deprecated.yaml'); await writeFile( filePath, `tests: @@ -1068,13 +1067,17 @@ tests: expect(result.valid).toBe(true); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes("Unknown field 'expected_outcome'"))).toBe( - true, - ); + expect( + warnings.some( + (e) => + e.message.includes("'expected_outcome' is deprecated") && + e.message.includes("'criteria'"), + ), + ).toBe(true); }); - it('warns on assert as unknown field at test level', async () => { - const filePath = path.join(tempDir, 'assert-unknown.yaml'); + it('warns on assert as deprecated field at test level', async () => { + const filePath = path.join(tempDir, 'assert-deprecated.yaml'); await writeFile( filePath, `tests: @@ -1090,11 +1093,15 @@ tests: expect(result.valid).toBe(true); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes("Unknown field 'assert'"))).toBe(true); + expect( + warnings.some( + (e) => e.message.includes("'assert' is deprecated") && e.message.includes("'assertions'"), + ), + ).toBe(true); }); - it('warns on assert as unknown field at top level', async () => { - const filePath = path.join(tempDir, 'assert-top-unknown.yaml'); + it('warns on assert as deprecated field at top level', async () => { + const filePath = path.join(tempDir, 'assert-top-deprecated.yaml'); await writeFile( filePath, `assert: @@ -1109,7 +1116,11 @@ tests: const result = await validateEvalFile(filePath); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes("Unknown field 'assert'"))).toBe(true); + expect( + warnings.some( + (e) => e.message.includes("'assert' is deprecated") && e.message.includes("'assertions'"), + ), + ).toBe(true); }); }); }); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 80dc2ebd8..885d275b3 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -5102,9 +5102,6 @@ "suite": { "type": "string" }, - "note": { - "type": "string" - }, "depends_on": { "type": "array", "items": { @@ -11373,9 +11370,6 @@ "suite": { "type": "string" }, - "note": { - "type": "string" - }, "depends_on": { "type": "array", "items": {