diff --git a/apps/cli/src/commands/inspect/filter.ts b/apps/cli/src/commands/inspect/filter.ts
new file mode 100644
index 000000000..89f81c77d
--- /dev/null
+++ b/apps/cli/src/commands/inspect/filter.ts
@@ -0,0 +1,363 @@
+/**
+ * `agentv inspect filter` — filter evaluation results by metadata criteria.
+ *
+ * Scans JSONL index files in `.agentv/results/runs/` and applies filters
+ * such as target name, experiment name, score thresholds, execution status,
+ * and tool usage. Outputs matching test IDs with summary info.
+ *
+ * Each filter is optional and combinable (AND logic). Results must match
+ * all specified filters to be included.
+ *
+ * To extend: add new filter predicates in `buildFilterPredicate()`.
+ */
+
+import { existsSync, readFileSync, readdirSync, statSync } from 'node:fs';
+import path from 'node:path';
+import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';
+import { c, formatScore, padLeft, padRight } from './utils.js';
+
+/** A lightweight result record with fields needed for filtering. */
+export interface FilterableRecord {
+  file: string;
+  test_id: string;
+  suite?: string;
+  target?: string;
+  experiment?: string;
+  score: number;
+  execution_status?: string;
+  error?: string;
+  timestamp?: string;
+  /** Flattened set of tool names found in trace.tool_calls or output messages. */
+  tool_names: string[];
+}
+
+/**
+ * Recursively collect all index.jsonl files under the runs directory.
+ */
+function collectIndexFiles(dir: string): string[] {
+  const files: string[] = [];
+  try {
+    const entries = readdirSync(dir, { withFileTypes: true });
+    for (const entry of entries) {
+      const fullPath = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        files.push(...collectIndexFiles(fullPath));
+      } else if (entry.name === 'index.jsonl') {
+        files.push(fullPath);
+      }
+    }
+  } catch {
+    // Directory may not exist
+  }
+  return files;
+}
+
+/**
+ * Extract tool names from a result record.
+ * Looks in trace.tool_calls (Record<string, number>) and output messages (tool_calls[].tool).
+ */
+function extractToolNames(record: Record<string, unknown>): string[] {
+  const tools = new Set<string>();
+
+  // From trace.tool_calls
+  const trace = record.trace as Record<string, unknown> | undefined;
+  if (trace?.tool_calls && typeof trace.tool_calls === 'object') {
+    for (const name of Object.keys(trace.tool_calls as Record<string, unknown>)) {
+      tools.add(name);
+    }
+  }
+
+  // From output messages (array of messages with tool_calls)
+  const output = record.output;
+  if (Array.isArray(output)) {
+    for (const msg of output) {
+      if (
+        typeof msg === 'object' &&
+        msg !== null &&
+        Array.isArray((msg as Record<string, unknown>).tool_calls)
+      ) {
+        for (const tc of (msg as Record<string, unknown>).tool_calls as Record<string, unknown>[]) {
+          if (typeof tc.tool === 'string') {
+            tools.add(tc.tool);
+          }
+        }
+      }
+    }
+  }
+
+  // From scores[].type or scores[].assertions evidence mentioning tools
+  // (kept minimal — primary source is trace.tool_calls and output messages)
+
+  return [...tools];
+}
+
+/**
+ * Parse a single JSONL index file into filterable records.
+ */
+export function parseFilterableRecords(filePath: string): FilterableRecord[] {
+  let content: string;
+  try {
+    content = readFileSync(filePath, 'utf8');
+  } catch {
+    return [];
+  }
+
+  const lines = content.split('\n').filter((line) => line.trim());
+  const records: FilterableRecord[] = [];
+
+  for (const line of lines) {
+    let raw: Record<string, unknown>;
+    try {
+      raw = JSON.parse(line) as Record<string, unknown>;
+    } catch {
+      continue;
+    }
+
+    // Determine experiment from record or from directory path
+    let experiment = typeof raw.experiment === 'string' ? raw.experiment : undefined;
+    if (!experiment) {
+      // Infer from path: .agentv/results/runs/<experiment>/<timestamp>/index.jsonl
+      const parts = filePath.split(path.sep);
+      const runsIdx = parts.indexOf('runs');
+      // If there are 2+ segments between "runs" and the file, the first is the experiment
+      if (runsIdx !== -1 && parts.length - runsIdx >= 3) {
+        const candidate = parts[runsIdx + 1];
+        // "default" experiment or named experiments; skip if it looks like a timestamp
+        if (candidate && !/^\d{4}-\d{2}-\d{2}T/.test(candidate)) {
+          experiment = candidate;
+        }
+      }
+    }
+
+    records.push({
+      file: filePath,
+      test_id: typeof raw.test_id === 'string' ? raw.test_id : 'unknown',
+      suite: typeof raw.suite === 'string' ? raw.suite : undefined,
+      target: typeof raw.target === 'string' ? raw.target : undefined,
+      experiment,
+      score: typeof raw.score === 'number' ? raw.score : 0,
+      execution_status: typeof raw.execution_status === 'string' ? raw.execution_status : undefined,
+      error: typeof raw.error === 'string' ? raw.error : undefined,
+      timestamp: typeof raw.timestamp === 'string' ? raw.timestamp : undefined,
+      tool_names: extractToolNames(raw),
+    });
+  }
+
+  return records;
+}
+
+/** Filter predicate that checks all criteria. */
+export type FilterPredicate = (record: FilterableRecord) => boolean;
+
+export function buildFilterPredicate(opts: {
+  target?: string;
+  experiment?: string;
+  scoreBelow?: number;
+  scoreAbove?: number;
+  status?: string;
+  hasTool?: string;
+}): FilterPredicate {
+  return (record) => {
+    if (opts.target && record.target !== opts.target) return false;
+    if (opts.experiment && record.experiment !== opts.experiment) return false;
+    if (opts.scoreBelow !== undefined && record.score >= opts.scoreBelow) return false;
+    if (opts.scoreAbove !== undefined && record.score <= opts.scoreAbove) return false;
+    if (opts.status) {
+      // Map user-friendly names to execution_status values
+      const statusMap: Record<string, string[]> = {
+        pass: ['ok'],
+        fail: ['quality_failure'],
+        error: ['error', 'timeout', 'provider_error'],
+      };
+      const allowedStatuses = statusMap[opts.status] ?? [opts.status];
+      if (record.execution_status && !allowedStatuses.includes(record.execution_status))
+        return false;
+      if (!record.execution_status) {
+        // Infer from score if execution_status is missing
+        if (opts.status === 'pass' && record.score < 1) return false;
+        if (opts.status === 'fail' && record.score >= 1) return false;
+        if (opts.status === 'error' && !record.error) return false;
+      }
+    }
+    if (opts.hasTool) {
+      const toolPattern = opts.hasTool.toLowerCase();
+      const hasMatch = record.tool_names.some((t) => t.toLowerCase().includes(toolPattern));
+      if (!hasMatch) return false;
+    }
+    return true;
+  };
+}
+
+function discoverFilterSources(searchPath: string | undefined, cwd: string): string[] {
+  if (searchPath) {
+    const resolved = path.isAbsolute(searchPath) ?
+      searchPath : path.resolve(cwd, searchPath);
+    if (!existsSync(resolved)) {
+      console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
+      process.exit(1);
+    }
+    try {
+      if (statSync(resolved).isDirectory()) {
+        return collectIndexFiles(resolved);
+      }
+    } catch {
+      // Fall through
+    }
+    return [resolved];
+  }
+
+  return collectIndexFiles(path.join(cwd, '.agentv', 'results', 'runs'));
+}
+
+function formatFilterTable(records: FilterableRecord[]): string {
+  const lines: string[] = [];
+
+  if (records.length === 0) {
+    lines.push(`${c.yellow}No matching results found.${c.reset}`);
+    return lines.join('\n');
+  }
+
+  lines.push('');
+  lines.push(
+    `${c.bold}Filtered Results${c.reset} ${c.dim}(${records.length} match${records.length !== 1 ? 'es' : ''})${c.reset}`,
+  );
+  lines.push('');
+
+  // Compute column widths
+  const maxIdLen = Math.min(32, Math.max(7, ...records.map((r) => r.test_id.length)));
+  const maxTargetLen = Math.min(16, Math.max(6, ...records.map((r) => (r.target ?? '').length)));
+  const maxExpLen = Math.min(20, Math.max(10, ...records.map((r) => (r.experiment ?? '').length)));
+
+  // Header
+  const header = ` ${padRight('Test ID', maxIdLen)} ${padRight('Target', maxTargetLen)} ${padRight('Experiment', maxExpLen)} ${padLeft('Score', 6)} Status`;
+  lines.push(`${c.dim}${header}${c.reset}`);
+  lines.push(
+    `${c.dim} ${'─'.repeat(maxIdLen)} ${'─'.repeat(maxTargetLen)} ${'─'.repeat(maxExpLen)} ${'─'.repeat(6)} ${'─'.repeat(16)}${c.reset}`,
+  );
+
+  for (const record of records) {
+    const scoreColor = record.score >= 1 ? c.green : record.score >= 0.5 ? c.yellow : c.red;
+    const status =
+      record.execution_status ??
+      (record.error ? 'error' : record.score >= 1 ? 'ok' : 'quality_failure');
+    const statusColor = status === 'ok' ? c.green : status === 'error' ? c.red : c.yellow;
+
+    // Pad the plain score text first, then colorize: padding a string that already
+    // contains ANSI escapes counts the escape bytes and misaligns the column.
+    const row = ` ${padRight(record.test_id.slice(0, maxIdLen), maxIdLen)} ${padRight((record.target ?? '-').slice(0, maxTargetLen), maxTargetLen)} ${padRight((record.experiment ?? '-').slice(0, maxExpLen), maxExpLen)} ${scoreColor}${padLeft(formatScore(record.score), 6)}${c.reset} ${statusColor}${status}${c.reset}`;
+    lines.push(row);
+  }
+
+  // Summary
+  lines.push('');
+  const passCount = records.filter((r) => r.score >= 1).length;
+  const avgScore =
+    records.length > 0 ? records.reduce((sum, r) => sum + r.score, 0) / records.length : 0;
+  lines.push(
+    `${c.dim}${records.length} result${records.length !== 1 ? 's' : ''} | ${passCount} passed | avg score: ${formatScore(avgScore)}${c.reset}`,
+  );
+  lines.push('');
+
+  return lines.join('\n');
+}
+
+export const inspectFilterCommand = command({
+  name: 'filter',
+  description: 'Filter evaluation results by target, experiment, score, status, or tool usage',
+  args: {
+    path: positional({
+      type: optional(string),
+      displayName: 'path',
+      description: 'Directory or file to filter (default: .agentv/results/runs/)',
+    }),
+    target: option({
+      type: optional(string),
+      long: 'target',
+      description: 'Filter by target name',
+    }),
+    experiment: option({
+      type: optional(string),
+      long: 'experiment',
+      description: 'Filter by experiment name',
+    }),
+    scoreBelow: option({
+      type: optional(number),
+      long: 'score-below',
+      description: 'Filter to results with score below this value',
+    }),
+    scoreAbove: option({
+      type: optional(number),
+      long: 'score-above',
+      description: 'Filter to results with score above this value',
+    }),
+    status: option({
+      type: optional(string),
+      long: 'status',
+      description:
+        'Filter by execution status: pass, fail, error (or raw value like ok, quality_failure)',
+    }),
+    hasTool: option({
+      type: optional(string),
+      long: 'has-tool',
+      description: 'Filter to results that used a specific tool (substring match)',
+    }),
+    dir: option({
+      type: optional(string),
+      long: 'dir',
+      short: 'd',
+      description: 'Working directory (default: current directory)',
+    }),
+    format: option({
+      type:
+        optional(oneOf(['table', 'json'])),
+      long: 'format',
+      short: 'f',
+      description: 'Output format: table (default) or json',
+    }),
+  },
+  handler: async ({
+    path: searchPath,
+    target,
+    experiment,
+    scoreBelow,
+    scoreAbove,
+    status,
+    hasTool,
+    dir,
+    format,
+  }) => {
+    const cwd = dir ?? process.cwd();
+
+    // Discover sources
+    const sources = discoverFilterSources(searchPath, cwd);
+    if (sources.length === 0) {
+      console.error(`${c.yellow}No result files found.${c.reset}`);
+      console.error(`${c.dim}Run an evaluation first, or specify a path.${c.reset}`);
+      process.exit(0);
+    }
+
+    // Load all records
+    const allRecords: FilterableRecord[] = [];
+    for (const source of sources) {
+      allRecords.push(...parseFilterableRecords(source));
+    }
+
+    if (allRecords.length === 0) {
+      console.error(`${c.yellow}No results found in the specified path.${c.reset}`);
+      process.exit(0);
+    }
+
+    // Apply filters
+    const predicate = buildFilterPredicate({
+      target,
+      experiment,
+      scoreBelow,
+      scoreAbove,
+      status,
+      hasTool,
+    });
+    const filtered = allRecords.filter(predicate);
+
+    if (format === 'json') {
+      console.log(JSON.stringify(filtered, null, 2));
+    } else {
+      console.log(formatFilterTable(filtered));
+    }
+  },
+});
diff --git a/apps/cli/src/commands/inspect/index.ts b/apps/cli/src/commands/inspect/index.ts
index 94d0a0b0e..a12c04c32 100644
--- a/apps/cli/src/commands/inspect/index.ts
+++ b/apps/cli/src/commands/inspect/index.ts
@@ -1,7 +1,9 @@
 import { subcommands } from 'cmd-ts';
+import { inspectFilterCommand } from './filter.js';
 import { traceListCommand } from './list.js';
 import { traceScoreCommand } from './score.js';
+import { inspectSearchCommand } from './search.js';
 import { traceShowCommand } from './show.js';
 import { traceStatsCommand } from './stats.js';
@@ -9,8 +11,10 @@ export const inspectCommand = subcommands({
   name: 'inspect',
   description: 'Inspect and analyze evaluation results',
   cmds: {
+    filter: inspectFilterCommand,
     list: traceListCommand,
     score: traceScoreCommand,
+    search: inspectSearchCommand,
     show: traceShowCommand,
     stats: traceStatsCommand,
   },
diff --git a/apps/cli/src/commands/inspect/search.ts b/apps/cli/src/commands/inspect/search.ts
new file mode 100644
index 000000000..7c551742c
--- /dev/null
+++ b/apps/cli/src/commands/inspect/search.ts
@@ -0,0 +1,285 @@
+/**
+ * `agentv inspect search` — regex search across evaluation results and transcripts.
+ *
+ * Scans JSONL files in `.agentv/results/runs/` and `.agentv/transcripts/` for
+ * lines matching a regex pattern. Outputs file path, test_id, and matching
+ * content with surrounding context.
+ *
+ * Supported sources:
+ * - Run result manifests (index.jsonl) — searches serialized JSON content
+ * - Transcript JSONL files — searches message content and tool call data
+ *
+ * To extend: add new scanners in the `scanSources()` function for additional
+ * JSONL-based data directories.
+ */
+
+import { existsSync, readFileSync, readdirSync, statSync } from 'node:fs';
+import path from 'node:path';
+import { command, oneOf, option, optional, positional, string } from 'cmd-ts';
+import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
+import { c } from './utils.js';
+
+/** A single search match within a JSONL line. */
+export interface SearchMatch {
+  /** Absolute path to the source file. */
+  file: string;
+  /** Identifier extracted from the record (test_id, session_id, etc.). */
+  id: string;
+  /** The line number within the file (1-based). */
+  lineNumber: number;
+  /** The matched text snippet with surrounding context. */
+  snippet: string;
+  /** Optional metadata: target, experiment, score. */
+  target?: string;
+  experiment?: string;
+  score?: number;
+}
+
+/**
+ * Recursively collect all JSONL files under a directory.
+ */
+function collectJsonlFiles(dir: string): string[] {
+  const files: string[] = [];
+  try {
+    const entries = readdirSync(dir, { withFileTypes: true });
+    for (const entry of entries) {
+      const fullPath = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        files.push(...collectJsonlFiles(fullPath));
+      } else if (entry.name.endsWith('.jsonl')) {
+        files.push(fullPath);
+      }
+    }
+  } catch {
+    // Directory may not exist
+  }
+  return files;
+}
+
+/**
+ * Extract a human-readable snippet around a regex match within a string.
+ * Returns up to `contextChars` characters on each side of the match.
+ */
+function extractSnippet(
+  text: string,
+  matchIndex: number,
+  matchLength: number,
+  contextChars = 60,
+): string {
+  const start = Math.max(0, matchIndex - contextChars);
+  const end = Math.min(text.length, matchIndex + matchLength + contextChars);
+  let snippet = text.slice(start, end);
+  if (start > 0) snippet = `...${snippet}`;
+  if (end < text.length) snippet = `${snippet}...`;
+  // Collapse whitespace for readability
+  return snippet.replace(/\n/g, '\\n').replace(/\r/g, '');
+}
+
+/**
+ * Search a single JSONL file for regex matches.
+ */
+export function searchJsonlFile(
+  filePath: string,
+  regex: RegExp,
+  targetFilter?: string,
+  experimentFilter?: string,
+): SearchMatch[] {
+  const matches: SearchMatch[] = [];
+  let content: string;
+  try {
+    content = readFileSync(filePath, 'utf8');
+  } catch {
+    return matches;
+  }
+
+  const lines = content.split('\n').filter((line) => line.trim());
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    let record: Record<string, unknown>;
+    try {
+      record = JSON.parse(line) as Record<string, unknown>;
+    } catch {
+      continue;
+    }
+
+    // Extract metadata for filtering
+    const target = typeof record.target === 'string' ? record.target : undefined;
+    const experiment = typeof record.experiment === 'string' ? record.experiment : undefined;
+    const score = typeof record.score === 'number' ? record.score : undefined;
+    const testId =
+      typeof record.test_id === 'string'
+        ? record.test_id
+        : typeof record.source === 'object' && record.source !== null
+          ? ((record.source as Record<string, unknown>).session_id as string | undefined)
+          : undefined;
+
+    // Apply metadata filters before regex search
+    if (targetFilter && target !== targetFilter) continue;
+    if (experimentFilter && experiment !== experimentFilter) continue;
+
+    // Search the entire serialized line for the pattern
+    const match = regex.exec(line);
+    if (match) {
+      matches.push({
+        file: filePath,
+        id: testId ?? `line-${i + 1}`,
+        lineNumber: i + 1,
+        snippet: extractSnippet(line, match.index, match[0].length),
+        target,
+        experiment,
+        score,
+      });
+    }
+  }
+
+  return matches;
+}
+
+/**
+ * Discover all searchable JSONL sources under a base path.
+ * If the path is a file, search that single file.
+ * If it's a directory, recursively find all .jsonl files.
+ * If not specified, scan both .agentv/results/runs/ and .agentv/transcripts/.
+ */
+function discoverSources(basePath: string | undefined, cwd: string): string[] {
+  if (basePath) {
+    const resolved = path.isAbsolute(basePath) ?
+      basePath : path.resolve(cwd, basePath);
+    if (!existsSync(resolved)) {
+      console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
+      process.exit(1);
+    }
+    try {
+      if (statSync(resolved).isDirectory()) {
+        return collectJsonlFiles(resolved);
+      }
+    } catch {
+      // Not a directory — treat as file
+    }
+    return [resolved];
+  }
+
+  // Default: scan both results and transcripts
+  const sources: string[] = [];
+  sources.push(...collectJsonlFiles(path.join(cwd, '.agentv', 'results', 'runs')));
+  sources.push(...collectJsonlFiles(path.join(cwd, '.agentv', 'transcripts')));
+  return sources;
+}
+
+function formatSearchResults(matches: SearchMatch[], pattern: string): string {
+  const lines: string[] = [];
+
+  if (matches.length === 0) {
+    lines.push(`${c.yellow}No matches found for pattern: ${pattern}${c.reset}`);
+    return lines.join('\n');
+  }
+
+  lines.push('');
+  lines.push(`${c.bold}Search Results${c.reset} ${c.dim}pattern: /${pattern}/${c.reset}`);
+  lines.push(
+    `${c.dim}${matches.length} record${matches.length !== 1 ? 's' : ''} matched${c.reset}`,
+  );
+  lines.push('');
+
+  // Group by file
+  const byFile = new Map<string, SearchMatch[]>();
+  for (const match of matches) {
+    const existing = byFile.get(match.file) ?? [];
+    existing.push(match);
+    byFile.set(match.file, existing);
+  }
+
+  for (const [file, fileMatches] of byFile) {
+    lines.push(`${c.cyan}${file}${c.reset}`);
+
+    for (const match of fileMatches) {
+      const meta: string[] = [];
+      if (match.target) meta.push(`target:${match.target}`);
+      if (match.experiment) meta.push(`exp:${match.experiment}`);
+      if (match.score !== undefined) meta.push(`score:${match.score}`);
+      const metaStr = meta.length > 0 ?
+        ` ${c.dim}[${meta.join(', ')}]${c.reset}` : '';
+
+      lines.push(
+        ` ${c.bold}${match.id}${c.reset} ${c.dim}(line ${match.lineNumber})${c.reset}${metaStr}`,
+      );
+      lines.push(` ${match.snippet}`);
+    }
+    lines.push('');
+  }
+
+  return lines.join('\n');
+}
+
+export const inspectSearchCommand = command({
+  name: 'search',
+  description: 'Search across evaluation results and transcripts for a regex pattern',
+  args: {
+    pattern: option({
+      type: string,
+      long: 'pattern',
+      short: 'p',
+      description: 'Regex pattern to search for in result/transcript content',
+    }),
+    path: positional({
+      type: optional(string),
+      displayName: 'path',
+      description:
+        'Directory or file to search (default: .agentv/results/runs/ and .agentv/transcripts/)',
+    }),
+    target: option({
+      type: optional(string),
+      long: 'target',
+      description: 'Filter results to a specific target name',
+    }),
+    experiment: option({
+      type: optional(string),
+      long: 'experiment',
+      description: 'Filter results to a specific experiment name',
+    }),
+    dir: option({
+      type: optional(string),
+      long: 'dir',
+      short: 'd',
+      description: 'Working directory (default: current directory)',
+    }),
+    format: option({
+      type: optional(oneOf(['table', 'json'])),
+      long: 'format',
+      short: 'f',
+      description: 'Output format: table (default) or json',
+    }),
+  },
+  handler: async ({ pattern, path: searchPath, target, experiment, dir, format }) => {
+    const cwd = dir ??
+      process.cwd();
+
+    // Compile the regex
+    let regex: RegExp;
+    try {
+      regex = new RegExp(pattern, 'i');
+    } catch (err) {
+      console.error(`${c.red}Error:${c.reset} Invalid regex pattern: ${(err as Error).message}`);
+      process.exit(1);
+    }
+
+    // Discover files to search
+    const sources = discoverSources(searchPath, cwd);
+    if (sources.length === 0) {
+      console.error(`${c.yellow}No JSONL files found to search.${c.reset}`);
+      console.error(`${c.dim}Run an evaluation first, or specify a path to search.${c.reset}`);
+      process.exit(0);
+    }
+
+    // Search all sources
+    const allMatches: SearchMatch[] = [];
+    for (const source of sources) {
+      const fileMatches = searchJsonlFile(source, regex, target, experiment);
+      allMatches.push(...fileMatches);
+    }
+
+    if (format === 'json') {
+      console.log(JSON.stringify(toSnakeCaseDeep(allMatches), null, 2));
+    } else {
+      console.log(formatSearchResults(allMatches, pattern));
+    }
+  },
+});
diff --git a/apps/cli/test/commands/inspect/filter.test.ts b/apps/cli/test/commands/inspect/filter.test.ts
new file mode 100644
index 000000000..7bbb56c03
--- /dev/null
+++ b/apps/cli/test/commands/inspect/filter.test.ts
@@ -0,0 +1,304 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+
+import {
+  type FilterableRecord,
+  buildFilterPredicate,
+  parseFilterableRecords,
+} from '../../../src/commands/inspect/filter.js';
+
+// Minimal index.jsonl records for filter testing
+const PASS_RECORD = JSON.stringify({
+  test_id: 'test-pass',
+  target: 'claude',
+  score: 1,
+  execution_status: 'ok',
+  timestamp: '2026-04-01T10:00:00.000Z',
+  trace: { tool_calls: { read_file: 3, write_file: 1 } },
+});
+
+const FAIL_RECORD = JSON.stringify({
+  test_id: 'test-fail',
+  target: 'gpt-4',
+  score: 0.3,
+  execution_status: 'quality_failure',
+  timestamp:
+    '2026-04-01T10:01:00.000Z',
+  trace: { tool_calls: { read_file: 1 } },
+});
+
+const ERROR_RECORD = JSON.stringify({
+  test_id: 'test-error',
+  target: 'claude',
+  score: 0,
+  execution_status: 'error',
+  error: 'Agent timed out',
+  timestamp: '2026-04-01T10:02:00.000Z',
+});
+
+const NO_STATUS_PASS = JSON.stringify({
+  test_id: 'test-implicit-pass',
+  target: 'codex',
+  score: 1,
+  timestamp: '2026-04-01T10:03:00.000Z',
+});
+
+const NO_STATUS_FAIL = JSON.stringify({
+  test_id: 'test-implicit-fail',
+  target: 'codex',
+  score: 0.5,
+  timestamp: '2026-04-01T10:04:00.000Z',
+});
+
+const OUTPUT_WITH_TOOLS = JSON.stringify({
+  test_id: 'test-tools',
+  target: 'claude',
+  score: 0.8,
+  output: [
+    {
+      role: 'assistant',
+      tool_calls: [
+        { tool: 'execute_command', input: 'ls' },
+        { tool: 'read_file', input: 'README.md' },
+      ],
+    },
+  ],
+});
+
+describe('inspect filter', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-filter-test-'));
+  });
+
+  afterEach(() => {
+    rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  describe('parseFilterableRecords', () => {
+    it('parses valid index.jsonl into filterable records', () => {
+      const filePath = path.join(tempDir, 'index.jsonl');
+      writeFileSync(filePath, `${PASS_RECORD}\n${FAIL_RECORD}\n`);
+
+      const records = parseFilterableRecords(filePath);
+
+      expect(records).toHaveLength(2);
+      expect(records[0].test_id).toBe('test-pass');
+      expect(records[0].target).toBe('claude');
+      expect(records[0].score).toBe(1);
+      expect(records[0].execution_status).toBe('ok');
+      expect(records[0].tool_names).toContain('read_file');
+      expect(records[0].tool_names).toContain('write_file');
+
+      expect(records[1].test_id).toBe('test-fail');
+      expect(records[1].target).toBe('gpt-4');
+      expect(records[1].score).toBe(0.3);
+    });
+
+    it('extracts tool names from output messages', () => {
+      const filePath = path.join(tempDir, 'index.jsonl');
+      writeFileSync(filePath,
+        `${OUTPUT_WITH_TOOLS}\n`);
+
+      const records = parseFilterableRecords(filePath);
+
+      expect(records).toHaveLength(1);
+      expect(records[0].tool_names).toContain('execute_command');
+      expect(records[0].tool_names).toContain('read_file');
+    });
+
+    it('returns empty array for unreadable files', () => {
+      const records = parseFilterableRecords(path.join(tempDir, 'nonexistent.jsonl'));
+
+      expect(records).toHaveLength(0);
+    });
+
+    it('skips invalid JSON lines', () => {
+      const filePath = path.join(tempDir, 'mixed.jsonl');
+      writeFileSync(filePath, `not json\n${PASS_RECORD}\n{broken\n`);
+
+      const records = parseFilterableRecords(filePath);
+
+      expect(records).toHaveLength(1);
+      expect(records[0].test_id).toBe('test-pass');
+    });
+
+    it('infers experiment name from directory path', () => {
+      const expDir = path.join(
+        tempDir,
+        '.agentv',
+        'results',
+        'runs',
+        'my-experiment',
+        '2026-04-01T10-00-00-000Z',
+      );
+      mkdirSync(expDir, { recursive: true });
+      const filePath = path.join(expDir, 'index.jsonl');
+      writeFileSync(filePath, `${PASS_RECORD}\n`);
+
+      const records = parseFilterableRecords(filePath);
+
+      expect(records).toHaveLength(1);
+      expect(records[0].experiment).toBe('my-experiment');
+    });
+
+    it('defaults test_id to "unknown" when missing', () => {
+      const record = JSON.stringify({ score: 0.5 });
+      const filePath = path.join(tempDir, 'index.jsonl');
+      writeFileSync(filePath, `${record}\n`);
+
+      const records = parseFilterableRecords(filePath);
+
+      expect(records).toHaveLength(1);
+      expect(records[0].test_id).toBe('unknown');
+    });
+  });
+
+  describe('buildFilterPredicate', () => {
+    const makeRecord = (overrides: Partial<FilterableRecord> = {}): FilterableRecord => ({
+      file: '/fake/path',
+      test_id: 'test-1',
+      score: 0.8,
+      tool_names: [],
+      ...overrides,
+    });
+
+    it('returns all records when no filters are specified', () => {
+      const predicate = buildFilterPredicate({});
+      expect(predicate(makeRecord())).toBe(true);
+      expect(predicate(makeRecord({ score: 0
+      }))).toBe(true);
+      expect(predicate(makeRecord({ score: 1, target: 'claude' }))).toBe(true);
+    });
+
+    it('filters by target', () => {
+      const predicate = buildFilterPredicate({ target: 'claude' });
+
+      expect(predicate(makeRecord({ target: 'claude' }))).toBe(true);
+      expect(predicate(makeRecord({ target: 'gpt-4' }))).toBe(false);
+      expect(predicate(makeRecord({}))).toBe(false);
+    });
+
+    it('filters by experiment', () => {
+      const predicate = buildFilterPredicate({ experiment: 'baseline' });
+
+      expect(predicate(makeRecord({ experiment: 'baseline' }))).toBe(true);
+      expect(predicate(makeRecord({ experiment: 'with-skills' }))).toBe(false);
+      expect(predicate(makeRecord({}))).toBe(false);
+    });
+
+    it('filters by score-below', () => {
+      const predicate = buildFilterPredicate({ scoreBelow: 0.5 });
+
+      expect(predicate(makeRecord({ score: 0.3 }))).toBe(true);
+      expect(predicate(makeRecord({ score: 0.5 }))).toBe(false);
+      expect(predicate(makeRecord({ score: 1 }))).toBe(false);
+    });
+
+    it('filters by score-above', () => {
+      const predicate = buildFilterPredicate({ scoreAbove: 0.5 });
+
+      expect(predicate(makeRecord({ score: 0.8 }))).toBe(true);
+      expect(predicate(makeRecord({ score: 0.5 }))).toBe(false);
+      expect(predicate(makeRecord({ score: 0 }))).toBe(false);
+    });
+
+    it('filters by status=pass using execution_status', () => {
+      const predicate = buildFilterPredicate({ status: 'pass' });
+
+      expect(predicate(makeRecord({ execution_status: 'ok' }))).toBe(true);
+      expect(predicate(makeRecord({ execution_status: 'quality_failure' }))).toBe(false);
+      expect(predicate(makeRecord({ execution_status: 'error' }))).toBe(false);
+    });
+
+    it('filters by status=fail using execution_status', () => {
+      const predicate = buildFilterPredicate({ status: 'fail' });
+
+      expect(predicate(makeRecord({ execution_status: 'quality_failure' }))).toBe(true);
+      expect(predicate(makeRecord({ execution_status: 'ok' }))).toBe(false);
+    });
+
+    it('filters by status=error using execution_status', () => {
+      const predicate = buildFilterPredicate({ status: 'error' });
+
+      expect(predicate(makeRecord({ execution_status: 'error' }))).toBe(true);
+      expect(predicate(makeRecord({ execution_status: 'timeout' }))).toBe(true);
+      expect(predicate(makeRecord({ execution_status: 'ok' }))).toBe(false);
+    });
+
+    it('infers status from score when execution_status is missing', () => {
+      const passPredicate = buildFilterPredicate({ status: 'pass' });
+      expect(passPredicate(makeRecord({ score: 1 }))).toBe(true);
+      expect(passPredicate(makeRecord({ score: 0.5 }))).toBe(false);
+
+      const failPredicate = buildFilterPredicate({ status: 'fail' });
+      expect(failPredicate(makeRecord({ score: 0.5 }))).toBe(true);
+      expect(failPredicate(makeRecord({ score: 1 }))).toBe(false);
+
+      const errorPredicate = buildFilterPredicate({ status: 'error' });
+      expect(errorPredicate(makeRecord({ error: 'timeout' }))).toBe(true);
+      expect(errorPredicate(makeRecord({}))).toBe(false);
+    });
+
+    it('filters by has-tool (substring match)', () => {
+      const predicate = buildFilterPredicate({ hasTool: 'read' });
+
+      expect(predicate(makeRecord({ tool_names: ['read_file', 'write_file'] }))).toBe(true);
+      expect(predicate(makeRecord({ tool_names: ['execute_command'] }))).toBe(false);
+      expect(predicate(makeRecord({ tool_names: [] }))).toBe(false);
+    });
+
+    it('combines multiple filters with AND logic', () => {
+      const predicate = buildFilterPredicate({
+        target: 'claude',
+        scoreAbove: 0.5,
+        hasTool: 'read',
+      });
+
+      // Matches all criteria
+      expect(
+        predicate(
+          makeRecord({
+            target: 'claude',
+            score: 0.9,
+            tool_names: ['read_file'],
+          }),
+        ),
+      ).toBe(true);
+
+      // Fails target
+      expect(
+        predicate(
+          makeRecord({
+            target: 'gpt-4',
+            score: 0.9,
+            tool_names: ['read_file'],
+          }),
+        ),
+      ).toBe(false);
+
+      // Fails score
+      expect(
+        predicate(
+          makeRecord({
+            target: 'claude',
+            score: 0.3,
+            tool_names: ['read_file'],
+          }),
+        ),
+      ).toBe(false);
+
+      // Fails tool
+      expect(
+        predicate(
+          makeRecord({
+            target: 'claude',
+            score: 0.9,
+            tool_names: ['write_file'],
+          }),
+        ),
+      ).toBe(false);
+    });
+  });
+});
diff --git a/apps/cli/test/commands/inspect/search.test.ts b/apps/cli/test/commands/inspect/search.test.ts
new file mode 100644
index 000000000..b9d4e2b77
--- /dev/null
+++ b/apps/cli/test/commands/inspect/search.test.ts
@@ -0,0 +1,161 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+
+import { searchJsonlFile } from '../../../src/commands/inspect/search.js';
+
+// Minimal JSONL records for search testing
+const RECORD_A = JSON.stringify({
+  test_id: 'test-alpha',
+  target: 'claude',
+  experiment: 'baseline',
+  score: 1,
+  output: 'The quick brown fox jumps over the lazy dog',
+});
+
+const RECORD_B = JSON.stringify({
+  test_id: 'test-beta',
+  target: 'gpt-4',
+  experiment: 'baseline',
+  score: 0.5,
+  output: 'Hello world from the agent',
+  error: 'partial failure',
+});
+
+const RECORD_C = JSON.stringify({
+  test_id: 'test-gamma',
+  target: 'claude',
+  experiment: 'with-skills',
+  score: 0,
+  output: 'Something completely different',
+});
+
+describe('inspect search', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-search-test-'));
+  });
+
+  afterEach(() => {
+    rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  describe('searchJsonlFile', () => {
+    it('finds matches in JSONL content', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n${RECORD_C}\n`);
+
+      const matches = searchJsonlFile(filePath, /quick brown fox/i);
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].id).toBe('test-alpha');
+      expect(matches[0].lineNumber).toBe(1);
+      expect(matches[0].snippet).toContain('quick brown fox');
+      expect(matches[0].target).toBe('claude');
+      expect(matches[0].experiment).toBe('baseline');
+      expect(matches[0].score).toBe(1);
+    });
+
+    it('returns empty array when no matches', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n`);
+
+      const matches = searchJsonlFile(filePath, /nonexistent pattern/);
+
+      expect(matches).toHaveLength(0);
+    });
+
+    it('matches across multiple lines', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n${RECORD_C}\n`);
+
+      // "test-" appears in all records
+      const matches = searchJsonlFile(filePath, /test-/);
+
+      expect(matches).toHaveLength(3);
+      expect(matches.map((m) => m.id)).toEqual(['test-alpha', 'test-beta', 'test-gamma']);
+    });
+
+    it('applies target filter', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n${RECORD_C}\n`);
+
+      // Search for "test-" but only target=claude
+      const matches = searchJsonlFile(filePath, /test-/, 'claude');
+
+      expect(matches).toHaveLength(2);
+      expect(matches.every((m) => m.target === 'claude')).toBe(true);
+    });
+
+    it('applies experiment filter', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n${RECORD_C}\n`);
+
+      const matches = searchJsonlFile(filePath, /test-/, undefined, 'with-skills');
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].id).toBe('test-gamma');
+      expect(matches[0].experiment).toBe('with-skills');
+    });
+
+    it('applies both target and experiment filters', () => {
+      const filePath = path.join(tempDir, 'results.jsonl');
+      writeFileSync(filePath, `${RECORD_A}\n${RECORD_B}\n${RECORD_C}\n`);
+
+      const matches = searchJsonlFile(filePath, /test-/, 'claude', 'with-skills');
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].id).toBe('test-gamma');
+    });
+
+    it('returns empty array for unreadable files', () => {
+      const matches = searchJsonlFile(path.join(tempDir, 'nonexistent.jsonl'), /pattern/);
+
+      expect(matches).toHaveLength(0);
+    });
+
+    it('skips invalid JSON lines', () => {
+      const filePath = path.join(tempDir, 'mixed.jsonl');
+      writeFileSync(filePath, `not json\n${RECORD_A}\n{broken\n`);
+
+      const matches = searchJsonlFile(filePath, /quick brown fox/);
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].id).toBe('test-alpha');
+    });
+
+    it('uses line number as fallback id when test_id is missing', () => {
+      const recordNoId = JSON.stringify({
+        score: 0.5,
+        output: 'something searchable',
+      });
+      const filePath = path.join(tempDir, 'no-id.jsonl');
+      writeFileSync(filePath, `${recordNoId}\n`);
+
+      const matches = searchJsonlFile(filePath, /searchable/);
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].id).toBe('line-1');
+    });
+
+    it('extracts snippet with context around the match', () => {
+      const longOutput = `${'A'.repeat(100)}NEEDLE${'B'.repeat(100)}`;
+      const record = JSON.stringify({
+        test_id: 'test-long',
+        score: 1,
+        output: longOutput,
+      });
+      const filePath = path.join(tempDir, 'long.jsonl');
+      writeFileSync(filePath, `${record}\n`);
+
+      const matches = searchJsonlFile(filePath, /NEEDLE/);
+
+      expect(matches).toHaveLength(1);
+      expect(matches[0].snippet).toContain('NEEDLE');
+      // Snippet should be truncated (not the full line)
+      expect(matches[0].snippet.length).toBeLessThan(record.length);
+    });
+  });
+});