From d77a19967e8bd45cc865ed3db61688e7d31d593d Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 4 Apr 2026 07:46:51 +0000 Subject: [PATCH] feat(cli): support repeatable test-id filters --- AGENTS.md | 16 +++++++++---- apps/cli/src/commands/eval/commands/run.ts | 6 ++--- apps/cli/src/commands/eval/run-eval.ts | 17 ++++++++++++-- apps/cli/test/eval.integration.test.ts | 20 ++++++++++++++++ apps/cli/test/fixtures/mock-run-evaluation.ts | 16 +++++++++++-- packages/core/src/evaluation/evaluate.ts | 4 ++-- .../src/evaluation/loaders/jsonl-parser.ts | 12 +++++++--- packages/core/src/evaluation/orchestrator.ts | 23 +++++++++++++++---- packages/core/src/evaluation/yaml-parser.ts | 12 +++++++--- .../evaluation/loaders/jsonl-parser.test.ts | 19 +++++++++++++++ 10 files changed, 121 insertions(+), 24 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index cdf5f284d..dd7529314 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -334,7 +334,13 @@ When working on a GitHub issue, **ALWAYS** follow this workflow: ``` If the issue has the `in-progress` label, **do not work on it** — pick a different issue. -2. **Create a worktree** with a feature branch: +2. **Update local `main` to the latest `origin/main`** before branching: + ```bash + git checkout main + git pull --ff-only origin main + ``` + +3. **Create a worktree** with a feature branch: ```bash git worktree add agentv.worktrees/ -b /- cd agentv.worktrees/ @@ -343,15 +349,17 @@ When working on a GitHub issue, **ALWAYS** follow this workflow: # Example: git worktree add agentv.worktrees/feat/42-add-new-embedder -b feat/42-add-new-embedder ``` -3. **Implement the changes** and commit following the commit convention + The feature branch must be based on the freshly updated `main`, not a stale local checkout. + +4. **Implement the changes** and commit following the commit convention -4. **Push the branch and create a Pull Request**: +5. **Push the branch and create a Pull Request**: ```bash git push -u origin gh pr create --title "(scope): description" --body "Closes #" ``` -5. **Before merging**, ensure: +6. **Before merging**, ensure: - **E2E verification completed** (see "Completing Work — E2E Checklist") - CI pipeline passes (all checks green) - Code has been reviewed if required diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 14f33f97e..8e6903c52 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -32,10 +32,10 @@ export const evalRunCommand = command({ long: 'targets', description: 'Path to targets.yaml (overrides discovery)', }), - testId: option({ - type: optional(string), + testId: multioption({ + type: array(string), long: 'test-id', - description: 'Filter tests by ID pattern (glob supported, e.g., "summary-*")', + description: 'Filter tests by ID pattern (repeatable, OR logic; glob supported)', }), workers: option({ type: optional(number), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 70f8bc26e..1a26fff4b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -60,7 +60,7 @@ interface NormalizedOptions { readonly target?: string; readonly cliTargets: readonly string[]; readonly targetsPath?: string; - readonly filter?: string; + readonly filter?: string | readonly string[]; readonly workers?: number; readonly outPath?: string; readonly outputPaths: readonly string[]; @@ -149,6 +149,18 @@ function normalizeStringArray(value: unknown): readonly string[] { return []; } +function normalizeFilter(value: unknown): string | readonly string[] | undefined { + if (Array.isArray(value)) { + const filters = normalizeStringArray(value); + if (filters.length === 0) { + return undefined; + } + return filters.length === 1 ? filters[0] : filters; + } + + return normalizeString(value); +} + /** * Check whether an eval file's tags satisfy --tag / --exclude-tag filters. * @@ -298,7 +310,7 @@ function normalizeOptions( target: singleTarget, cliTargets, targetsPath: normalizeString(rawOptions.targets), - filter: normalizeString(rawOptions.filter), + filter: normalizeFilter(rawOptions.filter), workers: workers > 0 ? workers : undefined, outPath: cliOut ?? configOut, outputPaths, @@ -718,6 +730,7 @@ async function runSingleEvalFile(params: { } return true; })(), + filter: options.filter, evalCases, verbose: options.verbose, maxConcurrency: resolvedWorkers, diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 0a9ed4a52..5b3a8b8a3 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -247,4 +247,24 @@ describe('agentv eval CLI', () => { await rm(fixture.baseDir, { recursive: true, force: true }); } }); + + it('supports repeatable --test-id flags with OR matching', async () => { + const fixture = await createFixture(); + try { + await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--test-id', + 'case-alpha', + '--test-id', + 'case-beta', + ]); + + const diagnostics = await readDiagnostics(fixture); + expect(diagnostics.filter).toEqual(['case-alpha', 'case-beta']); + expect(diagnostics.evalCaseIds).toEqual(['case-alpha', 'case-beta']); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }); }); diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index e07cb4568..524f4b016 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -15,7 +15,7 @@ interface RunEvaluationOptionsLike { readonly promptDumpDir?: string; readonly cache?: unknown; readonly useCache?: boolean; - readonly testId?: string; + readonly filter?: string | readonly string[]; readonly evalCases?: ReadonlyArray; readonly verbose?: boolean; readonly onResult?: (result: EvaluationResultLike) => Promise | void; @@ -77,11 +77,23 @@ async function maybeWriteDiagnostics( targetKind: options.target?.kind, agentTimeoutMs: options.agentTimeoutMs ?? null, promptDumpDir: options.promptDumpDir, - testId: options.testId ?? null, + filter: options.filter ?? null, useCache: options.useCache ?? false, envSample: process.env.CLI_ENV_SAMPLE ?? null, envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null, envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null, + evalCaseIds: Array.isArray(options.evalCases) + ? options.evalCases + .map((evalCase) => + evalCase && + typeof evalCase === 'object' && + 'id' in evalCase && + typeof evalCase.id === 'string' + ? evalCase.id + : null, + ) + .filter((id): id is string => id !== null) + : null, resultCount: results.length, } satisfies Record; diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index fe4678472..d3bc52eb4 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -144,8 +144,8 @@ export interface EvalConfig { readonly task?: (input: string) => string | Promise; /** Suite-level assertions applied to all tests */ readonly assert?: readonly AssertEntry[]; - /** Filter tests by ID pattern (glob supported) */ - readonly filter?: string; + /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */ + readonly filter?: string | readonly string[]; /** Maximum concurrent workers (default: 3) */ readonly workers?: number; /** Maximum retries on failure (default: 2) */ diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 50ae1ee4b..28b3efa3f 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -23,10 +23,16 @@ const ANSI_RESET = '\u001b[0m'; type LoadOptions = { readonly verbose?: boolean; - /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */ - readonly filter?: string; + /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */ + readonly filter?: string | readonly string[]; }; +function matchesFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); +} + /** * Sidecar metadata structure for JSONL datasets. */ @@ -178,7 +184,7 @@ export async function loadTestsFromJsonl( const id = asString(evalcase.id); // Skip eval cases that don't match the filter pattern (glob supported) - if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) { + if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { continue; } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index c04293d8e..f5ad8060f 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -228,8 +228,8 @@ export interface RunEvaluationOptions { readonly cache?: EvaluationCache; readonly useCache?: boolean; readonly now?: () => Date; - /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */ - readonly filter?: string; + /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */ + readonly filter?: string | readonly string[]; readonly verbose?: boolean; readonly maxConcurrency?: number; readonly evalCases?: readonly EvalTest[]; @@ -329,7 +329,7 @@ export async function runEvaluation( const filteredEvalCases = filterEvalCases(evalCases, filter); if (filteredEvalCases.length === 0) { if (filter) { - throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`); + throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`); } return []; } @@ -2488,11 +2488,24 @@ async function runEvaluatorList(options: { return { score, scores }; } -function filterEvalCases(evalCases: readonly EvalTest[], filter?: string): readonly EvalTest[] { +function formatFilter(filter: string | readonly string[]): string { + return typeof filter === 'string' ? filter : filter.join(', '); +} + +function matchesFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); +} + +function filterEvalCases( + evalCases: readonly EvalTest[], + filter?: string | readonly string[], +): readonly EvalTest[] { if (!filter) { return evalCases; } - return evalCases.filter((evalCase) => micromatch.isMatch(evalCase.id, filter)); + return evalCases.filter((evalCase) => matchesFilter(evalCase.id, filter)); } function buildEvaluatorRegistry( diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 3fb9dfa8f..0e0cc962f 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -75,12 +75,18 @@ const ANSI_RESET = '\u001b[0m'; type LoadOptions = { readonly verbose?: boolean; - /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */ - readonly filter?: string; + /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */ + readonly filter?: string | readonly string[]; /** Category derived from the eval file's directory path */ readonly category?: string; }; +function matchesFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); +} + type RawTestSuite = JsonObject & { readonly tests?: JsonValue; /** @deprecated Use `tests` instead */ @@ -333,7 +339,7 @@ async function loadTestsFromYaml( const id = asString(evalcase.id); // Skip tests that don't match the filter pattern (glob supported) - if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) { + if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { continue; } diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts index 87f75f832..5a1b2e4b3 100644 --- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts +++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts @@ -255,6 +255,25 @@ describe('loadTestsFromJsonl', () => { expect(cases.map((c) => c.id)).toEqual(['summary-basic', 'summary-advanced']); }); + it('filters by multiple patterns with OR logic', async () => { + const jsonlPath = path.join(tempDir, 'filter-multi.jsonl'); + await writeFile( + jsonlPath, + [ + '{"id": "alpha-case", "criteria": "Goal 1", "input": [{"role": "user", "content": "Query 1"}]}', + '{"id": "beta-case", "criteria": "Goal 2", "input": [{"role": "user", "content": "Query 2"}]}', + '{"id": "gamma-case", "criteria": "Goal 3", "input": [{"role": "user", "content": "Query 3"}]}', + ].join('\n'), + ); + + const cases = await loadTestsFromJsonl(jsonlPath, tempDir, { + filter: ['alpha-*', 'beta-case'], + }); + + expect(cases).toHaveLength(2); + expect(cases.map((c) => c.id)).toEqual(['alpha-case', 'beta-case']); + }); + it('supports conversation_id field', async () => { const jsonlPath = path.join(tempDir, 'with-conv-id.jsonl'); await writeFile(