From d77a19967e8bd45cc865ed3db61688e7d31d593d Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Sat, 4 Apr 2026 07:46:51 +0000
Subject: [PATCH] feat(cli): support repeatable test-id filters

---
 AGENTS.md                                     | 16 +++++++++----
 apps/cli/src/commands/eval/commands/run.ts    |  6 ++---
 apps/cli/src/commands/eval/run-eval.ts        | 17 ++++++++++++--
 apps/cli/test/eval.integration.test.ts        | 20 ++++++++++++++++
 apps/cli/test/fixtures/mock-run-evaluation.ts | 16 +++++++++++--
 packages/core/src/evaluation/evaluate.ts      |  4 ++--
 .../src/evaluation/loaders/jsonl-parser.ts    | 12 +++++++---
 packages/core/src/evaluation/orchestrator.ts  | 23 +++++++++++++++----
 packages/core/src/evaluation/yaml-parser.ts   | 12 +++++++---
 .../evaluation/loaders/jsonl-parser.test.ts   | 19 +++++++++++++++
 10 files changed, 121 insertions(+), 24 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index cdf5f284d..dd7529314 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -334,7 +334,13 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
    ```
    If the issue has the `in-progress` label, **do not work on it** — pick a different issue.
 
-2. **Create a worktree** with a feature branch:
+2. **Update local `main` to the latest `origin/main`** before branching:
+   ```bash
+   git checkout main
+   git pull --ff-only origin main
+   ```
+
+3. **Create a worktree** with a feature branch:
    ```bash
    git worktree add agentv.worktrees/<branch-name> -b <type>/<issue-number>-<short-description>
    cd agentv.worktrees/<branch-name>
@@ -343,15 +349,17 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
    # Example: git worktree add agentv.worktrees/feat/42-add-new-embedder -b feat/42-add-new-embedder
    ```
 
-3. **Implement the changes** and commit following the commit convention
+   The feature branch must be based on the freshly updated `main`, not a stale local checkout.
+
+4. **Implement the changes** and commit following the commit convention
 
-4. **Push the branch and create a Pull Request**:
+5. **Push the branch and create a Pull Request**:
    ```bash
    git push -u origin <branch-name>
    gh pr create --title "<type>(scope): description" --body "Closes #<issue-number>"
    ```
 
-5. **Before merging**, ensure:
+6. **Before merging**, ensure:
    - **E2E verification completed** (see "Completing Work — E2E Checklist")
    - CI pipeline passes (all checks green)
    - Code has been reviewed if required
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 14f33f97e..8e6903c52 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -32,10 +32,10 @@ export const evalRunCommand = command({
       long: 'targets',
       description: 'Path to targets.yaml (overrides discovery)',
     }),
-    testId: option({
-      type: optional(string),
+    testId: multioption({
+      type: array(string),
       long: 'test-id',
-      description: 'Filter tests by ID pattern (glob supported, e.g., "summary-*")',
+      description: 'Filter tests by ID pattern (repeatable, OR logic; glob supported)',
     }),
     workers: option({
       type: optional(number),
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 70f8bc26e..1a26fff4b 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -60,7 +60,7 @@ interface NormalizedOptions {
   readonly target?: string;
   readonly cliTargets: readonly string[];
   readonly targetsPath?: string;
-  readonly filter?: string;
+  readonly filter?: string | readonly string[];
   readonly workers?: number;
   readonly outPath?: string;
   readonly outputPaths: readonly string[];
@@ -149,6 +149,18 @@ function normalizeStringArray(value: unknown): readonly string[] {
   return [];
 }
 
+/** Normalizes a raw filter option; lists yield undefined, their sole entry, or all entries. */
+function normalizeFilter(value: unknown): string | readonly string[] | undefined {
+  if (!Array.isArray(value)) {
+    return normalizeString(value);
+  }
+  const patterns = normalizeStringArray(value);
+  if (patterns.length === 0) {
+    return undefined;
+  }
+  return patterns.length === 1 ? patterns[0] : patterns;
+}
+
 /**
  * Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
  *
@@ -298,7 +310,7 @@ function normalizeOptions(
     target: singleTarget,
     cliTargets,
     targetsPath: normalizeString(rawOptions.targets),
-    filter: normalizeString(rawOptions.filter),
+    filter: normalizeFilter(rawOptions.filter),
     workers: workers > 0 ? workers : undefined,
     outPath: cliOut ?? configOut,
     outputPaths,
@@ -718,6 +730,7 @@ async function runSingleEvalFile(params: {
       }
       return true;
     })(),
+    filter: options.filter,
     evalCases,
     verbose: options.verbose,
     maxConcurrency: resolvedWorkers,
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 0a9ed4a52..5b3a8b8a3 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -247,4 +247,24 @@ describe('agentv eval CLI', () => {
       await rm(fixture.baseDir, { recursive: true, force: true });
     }
   });
+
+  it('supports repeatable --test-id flags with OR matching', async () => {
+    const fixture = await createFixture();
+    try {
+      await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--test-id',
+        'case-alpha',
+        '--test-id',
+        'case-beta',
+      ]);
+
+      const diagnostics = await readDiagnostics(fixture);
+      expect(diagnostics.filter).toEqual(['case-alpha', 'case-beta']);
+      expect(diagnostics.evalCaseIds).toEqual(['case-alpha', 'case-beta']);
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  });
 });
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
index e07cb4568..524f4b016 100644
--- a/apps/cli/test/fixtures/mock-run-evaluation.ts
+++ b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -15,7 +15,7 @@ interface RunEvaluationOptionsLike {
   readonly promptDumpDir?: string;
   readonly cache?: unknown;
   readonly useCache?: boolean;
-  readonly testId?: string;
+  readonly filter?: string | readonly string[];
   readonly evalCases?: ReadonlyArray<unknown>;
   readonly verbose?: boolean;
   readonly onResult?: (result: EvaluationResultLike) => Promise<void> | void;
@@ -77,11 +77,23 @@ async function maybeWriteDiagnostics(
     targetKind: options.target?.kind,
     agentTimeoutMs: options.agentTimeoutMs ?? null,
     promptDumpDir: options.promptDumpDir,
-    testId: options.testId ?? null,
+    filter: options.filter ?? null,
     useCache: options.useCache ?? false,
     envSample: process.env.CLI_ENV_SAMPLE ?? null,
     envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
     envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
+    evalCaseIds: Array.isArray(options.evalCases)
+      ? options.evalCases
+          .map((evalCase) =>
+            evalCase &&
+            typeof evalCase === 'object' &&
+            'id' in evalCase &&
+            typeof evalCase.id === 'string'
+              ? evalCase.id
+              : null,
+          )
+          .filter((id): id is string => id !== null)
+      : null,
     resultCount: results.length,
   } satisfies Record<string, unknown>;
 
diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts
index fe4678472..d3bc52eb4 100644
--- a/packages/core/src/evaluation/evaluate.ts
+++ b/packages/core/src/evaluation/evaluate.ts
@@ -144,8 +144,8 @@ export interface EvalConfig {
   readonly task?: (input: string) => string | Promise<string>;
   /** Suite-level assertions applied to all tests */
   readonly assert?: readonly AssertEntry[];
-  /** Filter tests by ID pattern (glob supported) */
-  readonly filter?: string;
+  /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
+  readonly filter?: string | readonly string[];
   /** Maximum concurrent workers (default: 3) */
   readonly workers?: number;
   /** Maximum retries on failure (default: 2) */
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index 50ae1ee4b..28b3efa3f 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -23,10 +23,16 @@ const ANSI_RESET = '\u001b[0m';
 
 type LoadOptions = {
   readonly verbose?: boolean;
-  /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
-  readonly filter?: string;
+  /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
+  readonly filter?: string | readonly string[];
 };
 
+/** True when `id` matches any pattern (OR logic); an empty pattern list matches everything. */
+function matchesFilter(id: string, filter: string | readonly string[]): boolean {
+  const patterns = typeof filter === 'string' ? [filter] : filter;
+  return patterns.length === 0 || patterns.some((pattern) => micromatch.isMatch(id, pattern));
+}
+
 /**
  * Sidecar metadata structure for JSONL datasets.
  */
@@ -178,7 +184,7 @@ export async function loadTestsFromJsonl(
     const id = asString(evalcase.id);
 
     // Skip eval cases that don't match the filter pattern (glob supported)
-    if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
+    if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
       continue;
     }
 
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index c04293d8e..f5ad8060f 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -228,8 +228,8 @@ export interface RunEvaluationOptions {
   readonly cache?: EvaluationCache;
   readonly useCache?: boolean;
   readonly now?: () => Date;
-  /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
-  readonly filter?: string;
+  /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
+  readonly filter?: string | readonly string[];
   readonly verbose?: boolean;
   readonly maxConcurrency?: number;
   readonly evalCases?: readonly EvalTest[];
@@ -329,7 +329,7 @@ export async function runEvaluation(
   const filteredEvalCases = filterEvalCases(evalCases, filter);
   if (filteredEvalCases.length === 0) {
     if (filter) {
-      throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
+      throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
     }
     return [];
   }
@@ -2488,11 +2488,24 @@ async function runEvaluatorList(options: {
   return { score, scores };
 }
 
-function filterEvalCases(evalCases: readonly EvalTest[], filter?: string): readonly EvalTest[] {
+/** Renders a filter for error messages, joining multiple patterns with ", ". */
+function formatFilter(filter: string | readonly string[]): string {
+  return typeof filter === 'string' ? filter : filter.join(', ');
+}
+/** True when `id` matches any pattern (OR logic); an empty pattern list matches everything. */
+function matchesFilter(id: string, filter: string | readonly string[]): boolean {
+  const patterns = typeof filter === 'string' ? [filter] : filter;
+  return patterns.length === 0 || patterns.some((pattern) => micromatch.isMatch(id, pattern));
+}
+/** Keeps the eval cases whose ID matches `filter`; a missing or empty filter keeps every case. */
+function filterEvalCases(
+  evalCases: readonly EvalTest[],
+  filter?: string | readonly string[],
+): readonly EvalTest[] {
   if (!filter) {
     return evalCases;
   }
-  return evalCases.filter((evalCase) => micromatch.isMatch(evalCase.id, filter));
+  return evalCases.filter((evalCase) => matchesFilter(evalCase.id, filter));
 }
 
 function buildEvaluatorRegistry(
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 3fb9dfa8f..0e0cc962f 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -75,12 +75,18 @@ const ANSI_RESET = '\u001b[0m';
 
 type LoadOptions = {
   readonly verbose?: boolean;
-  /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
-  readonly filter?: string;
+  /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
+  readonly filter?: string | readonly string[];
   /** Category derived from the eval file's directory path */
   readonly category?: string;
 };
 
+/** True when `id` matches any pattern (OR logic); an empty pattern list matches everything. */
+function matchesFilter(id: string, filter: string | readonly string[]): boolean {
+  const patterns = typeof filter === 'string' ? [filter] : filter;
+  return patterns.length === 0 || patterns.some((pattern) => micromatch.isMatch(id, pattern));
+}
+
 type RawTestSuite = JsonObject & {
   readonly tests?: JsonValue;
   /** @deprecated Use `tests` instead */
@@ -333,7 +339,7 @@ async function loadTestsFromYaml(
     const id = asString(evalcase.id);
 
     // Skip tests that don't match the filter pattern (glob supported)
-    if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
+    if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
       continue;
     }
 
diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
index 87f75f832..5a1b2e4b3 100644
--- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
+++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
@@ -255,6 +255,25 @@ describe('loadTestsFromJsonl', () => {
     expect(cases.map((c) => c.id)).toEqual(['summary-basic', 'summary-advanced']);
   });
 
+  it('filters by multiple patterns with OR logic', async () => {
+    const jsonlPath = path.join(tempDir, 'filter-multi.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "alpha-case", "criteria": "Goal 1", "input": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "beta-case", "criteria": "Goal 2", "input": [{"role": "user", "content": "Query 2"}]}',
+        '{"id": "gamma-case", "criteria": "Goal 3", "input": [{"role": "user", "content": "Query 3"}]}',
+      ].join('\n'),
+    );
+
+    const cases = await loadTestsFromJsonl(jsonlPath, tempDir, {
+      filter: ['alpha-*', 'beta-case'],
+    });
+
+    expect(cases).toHaveLength(2);
+    expect(cases.map((c) => c.id)).toEqual(['alpha-case', 'beta-case']);
+  });
+
   it('supports conversation_id field', async () => {
     const jsonlPath = path.join(tempDir, 'with-conv-id.jsonl');
     await writeFile(