Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,13 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
```
If the issue has the `in-progress` label, **do not work on it** — pick a different issue.

2. **Create a worktree** with a feature branch:
2. **Update local `main` to the latest `origin/main`** before branching:
```bash
git checkout main
git pull --ff-only origin main
```

3. **Create a worktree** with a feature branch:
```bash
git worktree add agentv.worktrees/<branch-name> -b <type>/<issue-number>-<short-description>
cd agentv.worktrees/<branch-name>
Expand All @@ -343,15 +349,17 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
# Example: git worktree add agentv.worktrees/feat/42-add-new-embedder -b feat/42-add-new-embedder
```

3. **Implement the changes** and commit following the commit convention
The feature branch must be based on the freshly updated `main`, not a stale local checkout.

4. **Implement the changes** and commit following the commit convention

4. **Push the branch and create a Pull Request**:
5. **Push the branch and create a Pull Request**:
```bash
git push -u origin <branch-name>
gh pr create --title "<type>(scope): description" --body "Closes #<issue-number>"
```

5. **Before merging**, ensure:
6. **Before merging**, ensure:
- **E2E verification completed** (see "Completing Work — E2E Checklist")
- CI pipeline passes (all checks green)
- Code has been reviewed if required
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ export const evalRunCommand = command({
long: 'targets',
description: 'Path to targets.yaml (overrides discovery)',
}),
testId: option({
type: optional(string),
testId: multioption({
type: array(string),
long: 'test-id',
description: 'Filter tests by ID pattern (glob supported, e.g., "summary-*")',
description: 'Filter tests by ID pattern (repeatable, OR logic; glob supported)',
}),
workers: option({
type: optional(number),
Expand Down
17 changes: 15 additions & 2 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ interface NormalizedOptions {
readonly target?: string;
readonly cliTargets: readonly string[];
readonly targetsPath?: string;
readonly filter?: string;
readonly filter?: string | readonly string[];
readonly workers?: number;
readonly outPath?: string;
readonly outputPaths: readonly string[];
Expand Down Expand Up @@ -149,6 +149,18 @@ function normalizeStringArray(value: unknown): readonly string[] {
return [];
}

/**
 * Collapse the raw `filter` option into the narrowest shape callers need.
 *
 * - Non-array input is handed to `normalizeString` unchanged.
 * - Array input is first passed through `normalizeStringArray`; an empty
 *   result becomes `undefined`, a single entry is unwrapped to a bare string,
 *   and two or more entries are returned as the array itself.
 *
 * @param value - Raw option value of unknown shape.
 * @returns A single pattern, a list of patterns, or `undefined` when no
 *   usable pattern remains.
 */
function normalizeFilter(value: unknown): string | readonly string[] | undefined {
  if (!Array.isArray(value)) {
    return normalizeString(value);
  }

  const patterns = normalizeStringArray(value);
  switch (patterns.length) {
    case 0:
      return undefined;
    case 1:
      // Unwrap a lone pattern so single-filter callers keep receiving a string.
      return patterns[0];
    default:
      return patterns;
  }
}

/**
* Check whether an eval file's tags satisfy --tag / --exclude-tag filters.
*
Expand Down Expand Up @@ -298,7 +310,7 @@ function normalizeOptions(
target: singleTarget,
cliTargets,
targetsPath: normalizeString(rawOptions.targets),
filter: normalizeString(rawOptions.filter),
filter: normalizeFilter(rawOptions.filter),
workers: workers > 0 ? workers : undefined,
outPath: cliOut ?? configOut,
outputPaths,
Expand Down Expand Up @@ -718,6 +730,7 @@ async function runSingleEvalFile(params: {
}
return true;
})(),
filter: options.filter,
evalCases,
verbose: options.verbose,
maxConcurrency: resolvedWorkers,
Expand Down
20 changes: 20 additions & 0 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -247,4 +247,24 @@ describe('agentv eval CLI', () => {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});

it('supports repeatable --test-id flags with OR matching', async () => {
const fixture = await createFixture();
try {
await runCli(fixture, [
'eval',
fixture.testFilePath,
'--test-id',
'case-alpha',
'--test-id',
'case-beta',
]);

const diagnostics = await readDiagnostics(fixture);
expect(diagnostics.filter).toEqual(['case-alpha', 'case-beta']);
expect(diagnostics.evalCaseIds).toEqual(['case-alpha', 'case-beta']);
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
});
16 changes: 14 additions & 2 deletions apps/cli/test/fixtures/mock-run-evaluation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ interface RunEvaluationOptionsLike {
readonly promptDumpDir?: string;
readonly cache?: unknown;
readonly useCache?: boolean;
readonly testId?: string;
readonly filter?: string | readonly string[];
readonly evalCases?: ReadonlyArray<unknown>;
readonly verbose?: boolean;
readonly onResult?: (result: EvaluationResultLike) => Promise<void> | void;
Expand Down Expand Up @@ -77,11 +77,23 @@ async function maybeWriteDiagnostics(
targetKind: options.target?.kind,
agentTimeoutMs: options.agentTimeoutMs ?? null,
promptDumpDir: options.promptDumpDir,
testId: options.testId ?? null,
filter: options.filter ?? null,
useCache: options.useCache ?? false,
envSample: process.env.CLI_ENV_SAMPLE ?? null,
envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
evalCaseIds: Array.isArray(options.evalCases)
? options.evalCases
.map((evalCase) =>
evalCase &&
typeof evalCase === 'object' &&
'id' in evalCase &&
typeof evalCase.id === 'string'
? evalCase.id
: null,
)
.filter((id): id is string => id !== null)
: null,
resultCount: results.length,
} satisfies Record<string, unknown>;

Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/evaluation/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ export interface EvalConfig {
readonly task?: (input: string) => string | Promise<string>;
/** Suite-level assertions applied to all tests */
readonly assert?: readonly AssertEntry[];
/** Filter tests by ID pattern (glob supported) */
readonly filter?: string;
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
readonly filter?: string | readonly string[];
/** Maximum concurrent workers (default: 3) */
readonly workers?: number;
/** Maximum retries on failure (default: 2) */
Expand Down
12 changes: 9 additions & 3 deletions packages/core/src/evaluation/loaders/jsonl-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,16 @@ const ANSI_RESET = '\u001b[0m';

type LoadOptions = {
readonly verbose?: boolean;
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
readonly filter?: string;
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
readonly filter?: string | readonly string[];
};

/**
 * Test a test-case ID against one glob pattern or a list of glob patterns.
 *
 * @param id - Test-case identifier to check.
 * @param filter - A single pattern, or several patterns combined with OR.
 * @returns `true` when at least one pattern matches `id`.
 */
function matchesFilter(id: string, filter: string | readonly string[]): boolean {
  // Treat a lone pattern as a one-element list so both shapes share one path.
  const patterns: readonly string[] = typeof filter === 'string' ? [filter] : filter;
  return patterns.some((pattern) => micromatch.isMatch(id, pattern));
}

/**
* Sidecar metadata structure for JSONL datasets.
*/
Expand Down Expand Up @@ -178,7 +184,7 @@ export async function loadTestsFromJsonl(
const id = asString(evalcase.id);

// Skip eval cases that don't match the filter pattern (glob supported)
if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
continue;
}

Expand Down
23 changes: 18 additions & 5 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ export interface RunEvaluationOptions {
readonly cache?: EvaluationCache;
readonly useCache?: boolean;
readonly now?: () => Date;
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
readonly filter?: string;
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
readonly filter?: string | readonly string[];
readonly verbose?: boolean;
readonly maxConcurrency?: number;
readonly evalCases?: readonly EvalTest[];
Expand Down Expand Up @@ -329,7 +329,7 @@ export async function runEvaluation(
const filteredEvalCases = filterEvalCases(evalCases, filter);
if (filteredEvalCases.length === 0) {
if (filter) {
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
}
return [];
}
Expand Down Expand Up @@ -2488,11 +2488,24 @@ async function runEvaluatorList(options: {
return { score, scores };
}

function filterEvalCases(evalCases: readonly EvalTest[], filter?: string): readonly EvalTest[] {
/**
 * Render a filter for use in a human-readable message.
 *
 * @param filter - A single pattern or a list of patterns.
 * @returns The pattern itself, or the patterns joined with `", "`.
 */
function formatFilter(filter: string | readonly string[]): string {
  if (typeof filter === 'string') {
    return filter;
  }
  return filter.join(', ');
}

/**
 * Decide whether `id` satisfies the given filter.
 *
 * A string filter is matched as a single glob pattern; an array filter
 * matches when any one of its patterns matches (OR logic).
 *
 * @param id - Test-case identifier to check.
 * @param filter - One glob pattern or a list of glob patterns.
 * @returns `true` if `id` matches the filter.
 */
function matchesFilter(id: string, filter: string | readonly string[]): boolean {
  if (typeof filter === 'string') {
    return micromatch.isMatch(id, filter);
  }
  return filter.some((candidate) => micromatch.isMatch(id, candidate));
}

/**
 * Narrow a list of eval cases to those whose `id` matches the filter.
 *
 * @param evalCases - Candidate eval cases.
 * @param filter - Optional glob pattern or list of patterns; when falsy the
 *   input list is returned unchanged.
 * @returns The eval cases whose `id` matches the filter.
 */
function filterEvalCases(
evalCases: readonly EvalTest[],
filter?: string | readonly string[],
): readonly EvalTest[] {
if (!filter) {
// No filter supplied: hand back the original list as-is.
return evalCases;
}
// NOTE(review): two consecutive return statements follow. The first looks like
// the pre-change line from the diff and the second its replacement — confirm
// that only the `matchesFilter` version is meant to remain.
return evalCases.filter((evalCase) => micromatch.isMatch(evalCase.id, filter));
return evalCases.filter((evalCase) => matchesFilter(evalCase.id, filter));
}

function buildEvaluatorRegistry(
Expand Down
12 changes: 9 additions & 3 deletions packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,18 @@ const ANSI_RESET = '\u001b[0m';

type LoadOptions = {
readonly verbose?: boolean;
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
readonly filter?: string;
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
readonly filter?: string | readonly string[];
/** Category derived from the eval file's directory path */
readonly category?: string;
};

/**
 * Check a test ID against a filter made of one or more glob patterns.
 *
 * @param id - Test identifier to check.
 * @param filter - A single glob pattern, or several patterns where any match
 *   is sufficient.
 * @returns `true` when `id` matches the pattern (or at least one of them).
 */
function matchesFilter(id: string, filter: string | readonly string[]): boolean {
  const isHit = (pattern: string): boolean => micromatch.isMatch(id, pattern);

  if (typeof filter === 'string') {
    return isHit(filter);
  }
  // Wrap the call so `some` passes only the pattern, not index/array extras.
  return filter.some((pattern) => isHit(pattern));
}

type RawTestSuite = JsonObject & {
readonly tests?: JsonValue;
/** @deprecated Use `tests` instead */
Expand Down Expand Up @@ -333,7 +339,7 @@ async function loadTestsFromYaml(
const id = asString(evalcase.id);

// Skip tests that don't match the filter pattern (glob supported)
if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
continue;
}

Expand Down
19 changes: 19 additions & 0 deletions packages/core/test/evaluation/loaders/jsonl-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,25 @@ describe('loadTestsFromJsonl', () => {
expect(cases.map((c) => c.id)).toEqual(['summary-basic', 'summary-advanced']);
});

it('filters by multiple patterns with OR logic', async () => {
const jsonlPath = path.join(tempDir, 'filter-multi.jsonl');
await writeFile(
jsonlPath,
[
'{"id": "alpha-case", "criteria": "Goal 1", "input": [{"role": "user", "content": "Query 1"}]}',
'{"id": "beta-case", "criteria": "Goal 2", "input": [{"role": "user", "content": "Query 2"}]}',
'{"id": "gamma-case", "criteria": "Goal 3", "input": [{"role": "user", "content": "Query 3"}]}',
].join('\n'),
);

const cases = await loadTestsFromJsonl(jsonlPath, tempDir, {
filter: ['alpha-*', 'beta-case'],
});

expect(cases).toHaveLength(2);
expect(cases.map((c) => c.id)).toEqual(['alpha-case', 'beta-case']);
});

it('supports conversation_id field', async () => {
const jsonlPath = path.join(tempDir, 'with-conv-id.jsonl');
await writeFile(
Expand Down
Loading