From fc6b3175e196bde3fd652c51854470a92576e528 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 12 Apr 2026 08:06:33 +0000 Subject: [PATCH 1/2] feat(eval): add dependency-aware eval ordering with DAG wave scheduler (#331) Add `depends_on` and `on_dependency_failure` fields to EvalTest, enabling multi-agent swarm evaluation with dependency ordering between tests. The flat pLimit + Promise.allSettled dispatch is replaced with a DAG-aware wave scheduler that: - Validates the dependency graph (rejects cycles, missing IDs, self-deps) - Computes execution waves via topological sort - Dispatches independent tests in parallel within each wave - Tracks completed results for downstream context injection - Supports three failure policies: skip (default), fail, run - Injects `dependency_results` into evaluator context for dependent tests Tests without `depends_on` behave identically to before (single wave). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/src/evaluation/evaluators/types.ts | 3 + packages/core/src/evaluation/orchestrator.ts | 682 ++++++++++++------ packages/core/src/evaluation/types.ts | 23 + .../evaluation/validation/eval-file.schema.ts | 2 + packages/core/src/evaluation/yaml-parser.ts | 16 + .../evaluation/dependency-scheduling.test.ts | 362 ++++++++++ .../references/eval-schema.json | 20 + 7 files changed, 905 insertions(+), 203 deletions(-) create mode 100644 packages/core/test/evaluation/dependency-scheduling.test.ts diff --git a/packages/core/src/evaluation/evaluators/types.ts b/packages/core/src/evaluation/evaluators/types.ts index 6d299d4d4..9fb35ab30 100644 --- a/packages/core/src/evaluation/evaluators/types.ts +++ b/packages/core/src/evaluation/evaluators/types.ts @@ -2,6 +2,7 @@ import type { ResolvedTarget } from '../providers/targets.js'; import type { ChatPrompt, Message, Provider } from '../providers/types.js'; import type { TokenUsage, TraceSummary } from '../trace.js'; import type { + DependencyResult, DockerWorkspaceConfig, EvalTest, EvaluationVerdict, @@ -58,6 +59,8 @@ export interface EvaluationContext { readonly workspacePath?: string; /** Docker workspace config: when present, code-grader commands run inside a container */ readonly dockerConfig?: DockerWorkspaceConfig; + /** Results from dependency tests (only present when the test has depends_on) */ + readonly dependencyResults?: Readonly>; } export interface EvaluationScore { diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cbc8ef3e9..0996b4923 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -47,6 +47,7 @@ import { import { aggregateTrials } from './trials.js'; import type { AssertionEntry, + DependencyResult, EvalTest, EvaluationResult, EvaluationVerdict, @@ -157,6 +158,114 @@ function getWorkspaceTemplate(target: ResolvedTarget): string | undefined { return undefined; } +/** + * Validate the dependency DAG for a set of eval tests. + * Rejects circular dependencies and references to missing test IDs. + * Returns silently when the graph is valid. + */ +function validateDependencyGraph(tests: readonly EvalTest[]): void { + const ids = new Set(tests.map((t) => t.id)); + + // Check for missing dependency IDs + for (const test of tests) { + if (!test.depends_on) continue; + for (const dep of test.depends_on) { + if (!ids.has(dep)) { + throw new Error( + `Test '${test.id}' depends on '${dep}', but no test with that ID exists in this suite`, + ); + } + if (dep === test.id) { + throw new Error(`Test '${test.id}' depends on itself`); + } + } + } + + // Detect cycles via DFS + const depMap = new Map(); + for (const test of tests) { + if (test.depends_on && test.depends_on.length > 0) { + depMap.set(test.id, test.depends_on); + } + } + + const visited = new Set(); + const visiting = new Set(); + + function visit(id: string, path: string[]): void { + if (visiting.has(id)) { + const cycle = [...path.slice(path.indexOf(id)), id]; + throw new Error(`Circular dependency detected: ${cycle.join(' → ')}`); + } + if (visited.has(id)) return; + visiting.add(id); + path.push(id); + for (const dep of depMap.get(id) ?? []) { + visit(dep, path); + } + path.pop(); + visiting.delete(id); + visited.add(id); + } + + for (const test of tests) { + visit(test.id, []); + } +} + +/** + * Compute execution waves via topological sort. + * Each wave contains tests whose dependencies have all been satisfied by prior waves. + * Tests without dependencies land in wave 0. + */ +function computeWaves(tests: readonly EvalTest[]): EvalTest[][] { + const hasDeps = tests.some((t) => t.depends_on && t.depends_on.length > 0); + if (!hasDeps) { + // Fast path: no dependencies, single wave with all tests + return [tests.slice()]; + } + + const inDegree = new Map(); + const dependents = new Map(); + const testById = new Map(); + + for (const test of tests) { + testById.set(test.id, test); + inDegree.set(test.id, 0); + } + + for (const test of tests) { + if (!test.depends_on) continue; + inDegree.set(test.id, test.depends_on.length); + for (const dep of test.depends_on) { + const list = dependents.get(dep) ?? []; + list.push(test.id); + dependents.set(dep, list); + } + } + + const waves: EvalTest[][] = []; + let ready = tests.filter((t) => (inDegree.get(t.id) ?? 0) === 0); + + while (ready.length > 0) { + waves.push(ready); + const nextReady: EvalTest[] = []; + for (const test of ready) { + for (const depId of dependents.get(test.id) ?? []) { + const newDeg = (inDegree.get(depId) ?? 1) - 1; + inDegree.set(depId, newDeg); + if (newDeg === 0) { + const depTest = testById.get(depId); + if (depTest) nextReady.push(depTest); + } + } + } + ready = nextReady; + } + + return waves; +} + export interface EvaluationCache { get(key: string): MaybePromise; set(key: string, value: ProviderResponse): MaybePromise; @@ -206,6 +315,8 @@ export interface RunEvalCaseOptions { readonly verbose?: boolean; /** Per-test score threshold for pass/fail (default: 0.8) */ readonly threshold?: number; + /** Results from dependency tests (only present when the test has depends_on) */ + readonly dependencyResults?: Readonly>; } export interface ProgressEvent { @@ -868,236 +979,389 @@ export async function runEvaluation( // fail_on_error tracking (best-effort under concurrency > 1, matching budgetExhausted semantics) let failOnErrorTriggered = false; - // Map test cases to limited promises for parallel execution - const promises = filteredEvalCases.map((evalCase) => - limit(async () => { - // Assign worker ID when test starts executing - const workerId = nextWorkerId++; - workerIdByEvalId.set(evalCase.id, workerId); - - // Check suite-level budget before dispatching - if (totalBudgetUsd !== undefined && budgetExhausted) { - const budgetResult: EvaluationResult = { - timestamp: (now ?? (() => new Date()))().toISOString(), - testId: evalCase.id, - suite: evalCase.suite, - category: evalCase.category, - score: 0, - assertions: [], - output: [], - target: target.name, - error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, - budgetExceeded: true, - executionStatus: 'execution_error', - failureStage: 'setup', - failureReasonCode: 'budget_exceeded', - executionError: { - message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, - stage: 'setup', - }, - }; + // --- Validate dependency graph and compute execution waves --- + validateDependencyGraph(filteredEvalCases); + const waves = computeWaves(filteredEvalCases); - if (onProgress) { - await onProgress({ - workerId, - testId: evalCase.id, - status: 'failed', - completedAt: Date.now(), - error: budgetResult.error, - score: budgetResult.score, - executionStatus: budgetResult.executionStatus, - }); - } - if (onResult) { - await onResult(budgetResult); + // Track completed test results for dependency injection + const completedResults = new Map(); + const results: EvaluationResult[] = []; + + // Helper: build a DependencyResult from a completed EvaluationResult + function toDependencyResult(r: EvaluationResult): DependencyResult { + const outputText = extractLastAssistantContent(r.output); + return { + score: r.score, + output: outputText, + workspace_path: r.workspacePath, + details: r.scores + ? (Object.fromEntries( + r.scores.map((s) => [s.name, { score: s.score, verdict: s.verdict }]), + ) as JsonObject) + : undefined, + status: + r.executionStatus === 'ok' + ? 'passed' + : r.executionStatus === 'execution_error' + ? 'error' + : 'failed', + }; + } + + // Helper: check whether all dependencies passed for a given test + function checkDependencies(evalCase: EvalTest): { + ok: boolean; + depResults: Record; + } { + const depResults: Record = {}; + if (!evalCase.depends_on || evalCase.depends_on.length === 0) { + return { ok: true, depResults }; + } + let allPassed = true; + for (const depId of evalCase.depends_on) { + const depResult = completedResults.get(depId); + if (depResult) { + depResults[depId] = toDependencyResult(depResult); + if (depResult.executionStatus !== 'ok') { + allPassed = false; } - return budgetResult; + } else { + // Dependency didn't run (should not happen with valid DAG) + allPassed = false; } + } + return { ok: allPassed, depResults }; + } - // Check fail_on_error before dispatching - if (failOnError === true && failOnErrorTriggered) { - const errorMsg = 'Halted: execution error encountered with fail_on_error enabled'; - const haltResult: EvaluationResult = { - timestamp: (now ?? (() => new Date()))().toISOString(), + // Worker function: dispatches a single eval case with dependency context + async function dispatchTest( + evalCase: EvalTest, + depResults?: Record, + ): Promise { + const workerId = nextWorkerId++; + workerIdByEvalId.set(evalCase.id, workerId); + + // Check suite-level budget before dispatching + if (totalBudgetUsd !== undefined && budgetExhausted) { + const budgetResult: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: 0, + assertions: [], + output: [], + target: target.name, + error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + budgetExceeded: true, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'budget_exceeded', + executionError: { + message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + stage: 'setup', + }, + }; + + if (onProgress) { + await onProgress({ + workerId, testId: evalCase.id, - suite: evalCase.suite, - category: evalCase.category, - score: 0, - assertions: [], - output: [], - target: target.name, - error: errorMsg, - executionStatus: 'execution_error', - failureStage: 'setup', - failureReasonCode: 'error_threshold_exceeded', - executionError: { message: errorMsg, stage: 'setup' }, - }; + status: 'failed', + completedAt: Date.now(), + error: budgetResult.error, + score: budgetResult.score, + executionStatus: budgetResult.executionStatus, + }); + } + if (onResult) { + await onResult(budgetResult); + } + return budgetResult; + } + + // Check fail_on_error before dispatching + if (failOnError === true && failOnErrorTriggered) { + const errorMsg = 'Halted: execution error encountered with fail_on_error enabled'; + const haltResult: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: 0, + assertions: [], + output: [], + target: target.name, + error: errorMsg, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'error_threshold_exceeded', + executionError: { message: errorMsg, stage: 'setup' }, + }; + + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: haltResult.error, + score: haltResult.score, + executionStatus: haltResult.executionStatus, + }); + } + if (onResult) { + await onResult(haltResult); + } + return haltResult; + } + + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'running', + startedAt: Date.now(), + }); + } - if (onProgress) { - await onProgress({ - workerId, - testId: evalCase.id, - status: 'failed', - completedAt: Date.now(), - error: haltResult.error, - score: haltResult.score, - executionStatus: haltResult.executionStatus, - }); + // Multi-slot pool: each test grabs its own pool slot + const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined; + const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath; + const testBaselineCommit = testPoolSlot + ? poolSlotBaselines.get(testPoolSlot.path) + : sharedBaselineCommit; + + try { + const graderProvider = await resolveGraderProvider(target); + const runCaseOptions: RunEvalCaseOptions = { + evalCase: evalCase, + provider: primaryProvider, + target, + evaluators: evaluatorRegistry, + maxRetries, + agentTimeoutMs, + cache, + useCache, + now, + graderProvider, + targetResolver, + availableTargets, + evalRunId, + keepWorkspaces, + cleanupWorkspaces, + retainOnSuccess: resolvedRetainOnSuccess, + retainOnFailure: resolvedRetainOnFailure, + sharedWorkspacePath: testWorkspacePath, + sharedBaselineCommit: testBaselineCommit, + suiteWorkspaceFile, + streamCallbacks, + typeRegistry, + repoManager, + evalDir, + verbose, + threshold: scoreThreshold, + ...(depResults && Object.keys(depResults).length > 0 + ? { dependencyResults: depResults } + : {}), + }; + let result = + trials && trials.count > 1 + ? await runEvalCaseWithTrials(runCaseOptions, trials) + : await runEvalCase(runCaseOptions); + + // Track suite-level budget + if (totalBudgetUsd !== undefined) { + // Sum all trial costs when trials are used, otherwise use trace cost + let caseCost: number | undefined; + if (result.trials && result.trials.length > 0) { + const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0); + if (trialCostSum > 0) { + caseCost = trialCostSum; + } + } else { + caseCost = result.costUsd; } - if (onResult) { - await onResult(haltResult); + if (caseCost !== undefined) { + cumulativeBudgetCost += caseCost; + if (cumulativeBudgetCost >= totalBudgetUsd) { + budgetExhausted = true; + } } - return haltResult; + } + + // Track fail_on_error + if (failOnError === true && result.executionStatus === 'execution_error') { + failOnErrorTriggered = true; + } + + // Attach beforeAllOutput to first result only + if (beforeAllOutput && !beforeAllOutputAttached) { + result = { ...result, beforeAllOutput }; + beforeAllOutputAttached = true; } if (onProgress) { await onProgress({ workerId, testId: evalCase.id, - status: 'running', - startedAt: Date.now(), + status: result.error ? 'failed' : 'completed', + startedAt: 0, // Not used for completed status + completedAt: Date.now(), + error: result.error, + score: result.score, + executionStatus: result.executionStatus, }); } - // Multi-slot pool: each test grabs its own pool slot - const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined; - const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath; - const testBaselineCommit = testPoolSlot - ? poolSlotBaselines.get(testPoolSlot.path) - : sharedBaselineCommit; + if (onResult) { + await onResult(result); + } + return result; + } catch (error) { + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: error instanceof Error ? error.message : String(error), + }); + } + throw error; + } finally { + // Return pool slot for reuse by next test + if (testPoolSlot) { + availablePoolSlots.push(testPoolSlot); + } + } + } - try { - const graderProvider = await resolveGraderProvider(target); - const runCaseOptions: RunEvalCaseOptions = { - evalCase: evalCase, - provider: primaryProvider, - target, - evaluators: evaluatorRegistry, - maxRetries, - agentTimeoutMs, - cache, - useCache, - now, - graderProvider, - targetResolver, - availableTargets, - evalRunId, - keepWorkspaces, - cleanupWorkspaces, - retainOnSuccess: resolvedRetainOnSuccess, - retainOnFailure: resolvedRetainOnFailure, - sharedWorkspacePath: testWorkspacePath, - sharedBaselineCommit: testBaselineCommit, - suiteWorkspaceFile, - streamCallbacks, - typeRegistry, - repoManager, - evalDir, - verbose, - threshold: scoreThreshold, - }; - let result = - trials && trials.count > 1 - ? await runEvalCaseWithTrials(runCaseOptions, trials) - : await runEvalCase(runCaseOptions); - - // Track suite-level budget - if (totalBudgetUsd !== undefined) { - // Sum all trial costs when trials are used, otherwise use trace cost - let caseCost: number | undefined; - if (result.trials && result.trials.length > 0) { - const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0); - if (trialCostSum > 0) { - caseCost = trialCostSum; + // --- DAG-aware wave dispatch --- + // Dispatch each wave sequentially; tests within a wave run in parallel via pLimit. + for (const wave of waves) { + const wavePromises = wave.map((evalCase) => + limit(async () => { + // Check dependency status for tests with depends_on + if (evalCase.depends_on && evalCase.depends_on.length > 0) { + const { ok, depResults } = checkDependencies(evalCase); + if (!ok) { + const policy = evalCase.on_dependency_failure ?? 'skip'; + if (policy === 'skip') { + const failedDeps = evalCase.depends_on.filter((d) => { + const r = completedResults.get(d); + return !r || r.executionStatus !== 'ok'; + }); + const skipResult: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: 0, + assertions: [], + output: [], + target: target.name, + error: `Skipped: dependency failed (${failedDeps.join(', ')})`, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'dependency_failed', + executionError: { + message: `Skipped: dependency failed (${failedDeps.join(', ')})`, + stage: 'setup', + }, + }; + if (onProgress) { + await onProgress({ + workerId: nextWorkerId++, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: skipResult.error, + score: 0, + executionStatus: skipResult.executionStatus, + }); + } + if (onResult) { + await onResult(skipResult); + } + return skipResult; } - } else { - caseCost = result.costUsd; - } - if (caseCost !== undefined) { - cumulativeBudgetCost += caseCost; - if (cumulativeBudgetCost >= totalBudgetUsd) { - budgetExhausted = true; + if (policy === 'fail') { + const failedDeps = evalCase.depends_on.filter((d) => { + const r = completedResults.get(d); + return !r || r.executionStatus !== 'ok'; + }); + const failResult: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: evalCase.id, + suite: evalCase.suite, + category: evalCase.category, + score: 0, + assertions: [], + output: [], + target: target.name, + error: `Failed: dependency failed (${failedDeps.join(', ')})`, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'dependency_failed', + executionError: { + message: `Failed: dependency failed (${failedDeps.join(', ')})`, + stage: 'setup', + }, + }; + if (onProgress) { + await onProgress({ + workerId: nextWorkerId++, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: failResult.error, + score: 0, + executionStatus: failResult.executionStatus, + }); + } + if (onResult) { + await onResult(failResult); + } + return failResult; } + // policy === 'run': fall through to dispatch with dependency results } + return dispatchTest(evalCase, depResults); } + return dispatchTest(evalCase); + }), + ); - // Track fail_on_error - if (failOnError === true && result.executionStatus === 'execution_error') { - failOnErrorTriggered = true; - } - - // Attach beforeAllOutput to first result only - if (beforeAllOutput && !beforeAllOutputAttached) { - result = { ...result, beforeAllOutput }; - beforeAllOutputAttached = true; - } - - if (onProgress) { - await onProgress({ - workerId, - testId: evalCase.id, - status: result.error ? 'failed' : 'completed', - startedAt: 0, // Not used for completed status - completedAt: Date.now(), - error: result.error, - score: result.score, - executionStatus: result.executionStatus, - }); - } + const settled = await Promise.allSettled(wavePromises); + // Collect wave results + for (let i = 0; i < settled.length; i++) { + const outcome = settled[i]; + const evalCase = wave[i]; + if (outcome.status === 'fulfilled') { + completedResults.set(evalCase.id, outcome.value); + results.push(outcome.value); + } else { + const formattingMode = usesFileReferencePrompt(primaryProvider) ? 'agent' : 'lm'; + const promptInputs = await buildPromptInputs(evalCase, formattingMode); + const errorResult = buildErrorResult( + evalCase, + target.name, + (now ?? (() => new Date()))(), + outcome.reason, + promptInputs, + primaryProvider, + 'agent', + 'provider_error', + verbose, + ); + completedResults.set(evalCase.id, errorResult); + results.push(errorResult); if (onResult) { - await onResult(result); - } - return result; - } catch (error) { - if (onProgress) { - await onProgress({ - workerId, - testId: evalCase.id, - status: 'failed', - completedAt: Date.now(), - error: error instanceof Error ? error.message : String(error), - }); - } - throw error; - } finally { - // Return pool slot for reuse by next test - if (testPoolSlot) { - availablePoolSlots.push(testPoolSlot); + await onResult(errorResult); } } - }), - ); - - // Wait for all workers to complete - const settled = await Promise.allSettled(promises); - - // Extract results, handling both fulfilled and rejected promises - const results: EvaluationResult[] = []; - for (let i = 0; i < settled.length; i++) { - const outcome = settled[i]; - if (outcome.status === 'fulfilled') { - results.push(outcome.value); - } else { - // Build error result for rejected promise - const evalCase = filteredEvalCases[i]; - const formattingMode = usesFileReferencePrompt(primaryProvider) ? 'agent' : 'lm'; - const promptInputs = await buildPromptInputs(evalCase, formattingMode); - const errorResult = buildErrorResult( - evalCase, - target.name, - (now ?? (() => new Date()))(), - outcome.reason, - promptInputs, - primaryProvider, - 'agent', - 'provider_error', - verbose, - ); - results.push(errorResult); - if (onResult) { - await onResult(errorResult); - } } } @@ -1401,6 +1665,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise>; }): Promise { const { evalCase, @@ -2133,6 +2400,7 @@ async function evaluateCandidate(options: { workspacePath, dockerConfig, threshold: evalThreshold, + dependencyResults, } = options; const gradeTimestamp = nowFn(); @@ -2161,6 +2429,7 @@ async function evaluateCandidate(options: { workspacePath, dockerConfig, threshold: evalThreshold, + dependencyResults, }); const completedAt = nowFn(); @@ -2247,6 +2516,7 @@ async function runEvaluatorsForCase(options: { readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; + readonly dependencyResults?: Readonly>; }): Promise<{ score: EvaluationScore; scores?: EvaluatorResult[] }> { const { evalCase, @@ -2273,6 +2543,7 @@ async function runEvaluatorsForCase(options: { workspacePath, dockerConfig, threshold, + dependencyResults, } = options; if (evalCase.assertions && evalCase.assertions.length > 0) { @@ -2302,6 +2573,7 @@ async function runEvaluatorsForCase(options: { workspacePath, dockerConfig, threshold, + dependencyResults, }); } @@ -2336,6 +2608,7 @@ async function runEvaluatorsForCase(options: { fileChanges, workspacePath, dockerConfig, + dependencyResults, ...(implicitEvaluator ? { evaluator: implicitEvaluator } : {}), }); @@ -2382,6 +2655,7 @@ async function runEvaluatorList(options: { readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; + readonly dependencyResults?: Readonly>; }): Promise<{ score: EvaluationScore; scores: EvaluatorResult[] }> { const { evalCase, @@ -2408,6 +2682,7 @@ async function runEvaluatorList(options: { fileChanges, workspacePath, dockerConfig, + dependencyResults, } = options; const scored: Array<{ @@ -2442,6 +2717,7 @@ async function runEvaluatorList(options: { fileChanges, workspacePath, dockerConfig, + dependencyResults, }; // Build the dispatch context for evaluator factories diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 792d3bcd1..3b4adbaec 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -884,6 +884,29 @@ export interface EvalTest { readonly targets?: readonly string[]; /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */ readonly threshold?: number; + /** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */ + readonly depends_on?: readonly string[]; + /** What to do when a dependency fails: skip (default), fail, or run anyway. */ + readonly on_dependency_failure?: DependencyFailurePolicy; +} + +/** + * Policy for handling dependency failures. + * - skip: skip the dependent test (default) + * - fail: mark the dependent test as failed without running + * - run: run the dependent test regardless of dependency outcome + */ +export type DependencyFailurePolicy = 'skip' | 'fail' | 'run'; + +/** + * Result summary for a completed dependency, injected into downstream evaluator context. + */ +export interface DependencyResult { + readonly score: number; + readonly output: string; + readonly workspace_path?: string; + readonly details?: JsonObject; + readonly status: 'passed' | 'failed' | 'error'; } /** @deprecated Use `EvalTest` instead */ diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 609f04544..5de36a1a8 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -373,6 +373,8 @@ const EvalTestSchema = z.object({ conversation_id: z.string().optional(), suite: z.string().optional(), note: z.string().optional(), + depends_on: z.array(z.string()).optional(), + on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(), }); // --------------------------------------------------------------------------- diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 9e6f7de1e..1b3ace68c 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -501,6 +501,20 @@ async function loadTestsFromYaml( // Extract per-test targets override (matrix evaluation) const caseTargets = extractTargetsFromTestCase(testCaseConfig as JsonObject); + // Extract dependency fields + const dependsOn = Array.isArray(testCaseConfig.depends_on) + ? (testCaseConfig.depends_on as readonly string[]).filter( + (v): v is string => typeof v === 'string', + ) + : undefined; + const onDependencyFailureRaw = asString(testCaseConfig.on_dependency_failure); + const onDependencyFailure = + onDependencyFailureRaw === 'skip' || + onDependencyFailureRaw === 'fail' || + onDependencyFailureRaw === 'run' + ? (onDependencyFailureRaw as import('./types.js').DependencyFailurePolicy) + : undefined; + const testCase: EvalTest = { id, suite: suiteName, @@ -519,6 +533,8 @@ async function loadTestsFromYaml( metadata, targets: caseTargets, ...(caseThreshold !== undefined ? { threshold: caseThreshold } : {}), + ...(dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {}), + ...(onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}), }; results.push(testCase); diff --git a/packages/core/test/evaluation/dependency-scheduling.test.ts b/packages/core/test/evaluation/dependency-scheduling.test.ts new file mode 100644 index 000000000..77bcf1c00 --- /dev/null +++ b/packages/core/test/evaluation/dependency-scheduling.test.ts @@ -0,0 +1,362 @@ +import { describe, expect, it } from 'bun:test'; + +import { runEvaluation } from '../../src/evaluation/orchestrator.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { + Provider, + ProviderRequest, + ProviderResponse, +} from '../../src/evaluation/providers/types.js'; +import type { DependencyFailurePolicy, EvalTest } from '../../src/evaluation/types.js'; + +/** + * Mock provider returning a fixed response. + */ +class FixedProvider implements Provider { + readonly id: string; + readonly kind = 'mock' as const; + readonly targetName: string; + + constructor( + targetName: string, + private readonly response: ProviderResponse, + ) { + this.id = `mock:${targetName}`; + this.targetName = targetName; + } + + async invoke(_request: ProviderRequest): Promise { + return this.response; + } +} + +const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, +}; + +const passingEvaluatorRegistry = { + 'llm-grader': { + kind: 'llm-grader', + async evaluate() { + return { + score: 0.9, + verdict: 'pass' as const, + assertions: [{ text: 'ok', passed: true }], + expectedAspectCount: 1, + }; + }, + }, +}; + +const failingEvaluatorRegistry = { + 'llm-grader': { + kind: 'llm-grader', + async evaluate() { + return { + score: 0.2, + verdict: 'fail' as const, + assertions: [{ text: 'nope', passed: false }], + expectedAspectCount: 1, + }; + }, + }, +}; + +function makeTest( + id: string, + opts?: { depends_on?: string[]; on_dependency_failure?: DependencyFailurePolicy }, +): EvalTest { + return { + id, + suite: 'dep-test', + question: `Task ${id}`, + input: [{ role: 'user', content: `Do ${id}` }], + expected_output: [], + file_paths: [], + criteria: `Criteria for ${id}`, + evaluator: 'llm-grader', + ...(opts?.depends_on ? { depends_on: opts.depends_on } : {}), + ...(opts?.on_dependency_failure ? { on_dependency_failure: opts.on_dependency_failure } : {}), + }; +} + +describe('dependency-aware scheduling', () => { + describe('backward compatibility', () => { + it('tests without depends_on run exactly as before', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: passingEvaluatorRegistry, + evalCases: [makeTest('a'), makeTest('b'), makeTest('c')], + }); + + expect(results).toHaveLength(3); + expect(results.every((r) => r.score > 0)).toBe(true); + }); + }); + + describe('DAG validation', () => { + it('rejects circular dependencies', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + await expect( + runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: undefined, + evalCases: [makeTest('a', { depends_on: ['b'] }), makeTest('b', { depends_on: ['a'] })], + }), + ).rejects.toThrow(/[Cc]ircular dependency/); + }); + + it('rejects references to missing test IDs', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + await expect( + runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: undefined, + evalCases: [makeTest('a', { depends_on: ['nonexistent'] })], + }), + ).rejects.toThrow(/no test with that ID/); + }); + + it('rejects self-dependency', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + await expect( + runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: undefined, + evalCases: [makeTest('a', { depends_on: ['a'] })], + }), + ).rejects.toThrow(/depends on itself/); + }); + }); + + describe('wave scheduling', () => { + it('runs independent tests in parallel, dependents after', async () => { + const executionOrder: string[] = []; + + const trackingProvider: Provider = { + id: 'mock:tracking', + kind: 'mock' as const, + targetName: 'tracking', + async invoke(request: ProviderRequest): Promise { + const testId = request.evalCaseId ?? 'unknown'; + executionOrder.push(testId); + // Add small delay to check parallel execution within waves + await new Promise((r) => setTimeout(r, 10)); + return { output: [{ role: 'assistant', content: `Output for ${testId}` }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => trackingProvider, + evaluators: passingEvaluatorRegistry, + evalCases: [ + makeTest('backend'), + makeTest('frontend'), + makeTest('integration', { depends_on: ['backend', 'frontend'] }), + ], + }); + + expect(results).toHaveLength(3); + // Integration must run after both backend and frontend + const integrationIdx = executionOrder.indexOf('integration'); + const backendIdx = executionOrder.indexOf('backend'); + const frontendIdx = executionOrder.indexOf('frontend'); + expect(integrationIdx).toBeGreaterThan(backendIdx); + expect(integrationIdx).toBeGreaterThan(frontendIdx); + }); + + it('supports multi-level dependency chains', async () => { + const executionOrder: string[] = []; + + const trackingProvider: Provider = { + id: 'mock:tracking', + kind: 'mock' as const, + targetName: 'tracking', + async invoke(request: ProviderRequest): Promise { + const testId = request.evalCaseId ?? 'unknown'; + executionOrder.push(testId); + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => trackingProvider, + evaluators: passingEvaluatorRegistry, + evalCases: [ + makeTest('level-0a'), + makeTest('level-0b'), + makeTest('level-1', { depends_on: ['level-0a'] }), + makeTest('level-2', { depends_on: ['level-1', 'level-0b'] }), + ], + }); + + expect(results).toHaveLength(4); + // Verify ordering: level-2 must be last + const idx2 = executionOrder.indexOf('level-2'); + const idx1 = executionOrder.indexOf('level-1'); + const idx0a = executionOrder.indexOf('level-0a'); + const idx0b = executionOrder.indexOf('level-0b'); + expect(idx1).toBeGreaterThan(idx0a); + expect(idx2).toBeGreaterThan(idx1); + expect(idx2).toBeGreaterThan(idx0b); + }); + }); + + describe('on_dependency_failure policies', () => { + it('skip (default): skips downstream when dependency fails', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: failingEvaluatorRegistry, + evalCases: [makeTest('dep'), makeTest('downstream', { depends_on: ['dep'] })], + }); + + expect(results).toHaveLength(2); + const downstream = results.find((r) => r.testId === 'downstream'); + expect(downstream).toBeDefined(); + expect(downstream?.error).toContain('dependency failed'); + expect(downstream?.error).toContain('dep'); + expect(downstream?.executionStatus).toBe('execution_error'); + }); + + it('fail: marks downstream as failed when dependency fails', async () => { + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: failingEvaluatorRegistry, + evalCases: [ + makeTest('dep'), + makeTest('downstream', { depends_on: ['dep'], on_dependency_failure: 'fail' }), + ], + }); + + expect(results).toHaveLength(2); + const downstream = results.find((r) => r.testId === 'downstream'); + expect(downstream).toBeDefined(); + expect(downstream?.error).toContain('Failed: dependency failed'); + expect(downstream?.score).toBe(0); + }); + + it('run: executes downstream even when dependency fails', async () => { + const executionOrder: string[] = []; + + const trackingProvider: Provider = { + id: 'mock:tracking', + kind: 'mock' as const, + targetName: 'tracking', + async invoke(request: ProviderRequest): Promise { + const testId = request.evalCaseId ?? 'unknown'; + executionOrder.push(testId); + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => trackingProvider, + evaluators: failingEvaluatorRegistry, + evalCases: [ + makeTest('dep'), + makeTest('downstream', { depends_on: ['dep'], on_dependency_failure: 'run' }), + ], + }); + + expect(results).toHaveLength(2); + // Both tests should have been executed + expect(executionOrder).toContain('dep'); + expect(executionOrder).toContain('downstream'); + }); + }); + + describe('dependency_results in evaluator context', () => { + it('passes dependency results to downstream evaluator', async () => { + let capturedContext: unknown = undefined; + + const contextCapturingRegistry = { + 'llm-grader': { + kind: 'llm-grader', + async evaluate(ctx: unknown) { + capturedContext = ctx; + return { + score: 0.9, + verdict: 'pass' as const, + assertions: [{ text: 'ok', passed: true }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new FixedProvider('mock', { + output: [{ role: 'assistant', content: 'answer' }], + }); + + await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => provider, + evaluators: contextCapturingRegistry, + evalCases: [makeTest('dep'), makeTest('downstream', { depends_on: ['dep'] })], + }); + + // The last evaluation context should be for 'downstream' and include dependencyResults + const ctx = capturedContext as { + evalCase: EvalTest; + dependencyResults?: Record; + }; + expect(ctx.evalCase.id).toBe('downstream'); + expect(ctx.dependencyResults).toBeDefined(); + expect(ctx.dependencyResults?.dep).toBeDefined(); + expect((ctx.dependencyResults?.dep as { score: number }).score).toBe(0.9); + expect((ctx.dependencyResults?.dep as { status: string }).status).toBe('passed'); + }); + }); +}); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 0c7805ba1..69d694bbe 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -5104,6 +5104,16 @@ }, "note": { "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] } }, "required": ["id"], @@ -10124,6 +10134,16 @@ }, "note": { "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] } }, "required": ["id"], From 04686dc369fad96106272f4f6848497fa767090f Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 12 Apr 2026 08:23:19 +0000 Subject: [PATCH 2/2] fix(eval): address code review findings for DAG scheduler - Only treat execution_error (not quality_failure) as dependency failure A dependency test scoring 0.2 now correctly allows downstream to run. Only actual crashes/errors trigger skip/fail policies. - Add duplicate test ID validation in validateDependencyGraph - Add defensive assertion in computeWaves for unscheduled tests - Deduplicate skip/fail result construction into single branch - Add tests: transitive cascade (A->B->C), quality_failure distinction Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/orchestrator.ts | 87 +++++------- .../evaluation/dependency-scheduling.test.ts | 129 +++++++++++++++--- 2 files changed, 140 insertions(+), 76 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 0996b4923..5a1120cfb 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -164,7 +164,13 @@ function getWorkspaceTemplate(target: ResolvedTarget): string | undefined { * Returns silently when the graph is valid. */ function validateDependencyGraph(tests: readonly EvalTest[]): void { - const ids = new Set(tests.map((t) => t.id)); + const ids = new Set(); + for (const test of tests) { + if (ids.has(test.id)) { + throw new Error(`Duplicate test ID '${test.id}' — each test must have a unique ID`); + } + ids.add(test.id); + } // Check for missing dependency IDs for (const test of tests) { @@ -263,6 +269,14 @@ function computeWaves(tests: readonly EvalTest[]): EvalTest[][] { ready = nextReady; } + // Defensive: if validation missed a cycle, Kahn's algorithm leaves unscheduled nodes + const totalScheduled = waves.reduce((sum, w) => sum + w.length, 0); + if (totalScheduled !== tests.length) { + throw new Error( + `Internal error: ${tests.length - totalScheduled} tests were not scheduled (possible undetected cycle)`, + ); + } + return waves; } @@ -1022,7 +1036,9 @@ export async function runEvaluation( const depResult = completedResults.get(depId); if (depResult) { depResults[depId] = toDependencyResult(depResult); - if (depResult.executionStatus !== 'ok') { + // Only execution errors count as dependency failures — quality failures + // (low scores) still mean the test ran successfully, just scored poorly. + if (depResult.executionStatus === 'execution_error') { allPassed = false; } } else { @@ -1247,51 +1263,13 @@ export async function runEvaluation( const { ok, depResults } = checkDependencies(evalCase); if (!ok) { const policy = evalCase.on_dependency_failure ?? 'skip'; - if (policy === 'skip') { - const failedDeps = evalCase.depends_on.filter((d) => { - const r = completedResults.get(d); - return !r || r.executionStatus !== 'ok'; - }); - const skipResult: EvaluationResult = { - timestamp: (now ?? (() => new Date()))().toISOString(), - testId: evalCase.id, - suite: evalCase.suite, - category: evalCase.category, - score: 0, - assertions: [], - output: [], - target: target.name, - error: `Skipped: dependency failed (${failedDeps.join(', ')})`, - executionStatus: 'execution_error', - failureStage: 'setup', - failureReasonCode: 'dependency_failed', - executionError: { - message: `Skipped: dependency failed (${failedDeps.join(', ')})`, - stage: 'setup', - }, - }; - if (onProgress) { - await onProgress({ - workerId: nextWorkerId++, - testId: evalCase.id, - status: 'failed', - completedAt: Date.now(), - error: skipResult.error, - score: 0, - executionStatus: skipResult.executionStatus, - }); - } - if (onResult) { - await onResult(skipResult); - } - return skipResult; - } - if (policy === 'fail') { - const failedDeps = evalCase.depends_on.filter((d) => { - const r = completedResults.get(d); - return !r || r.executionStatus !== 'ok'; - }); - const failResult: EvaluationResult = { + if (policy === 'skip' || policy === 'fail') { + const failedDeps = evalCase.depends_on.filter( + (d) => completedResults.get(d)?.executionStatus === 'execution_error', + ); + const prefix = policy === 'skip' ? 'Skipped' : 'Failed'; + const errorMsg = `${prefix}: dependency failed (${failedDeps.join(', ')})`; + const depFailResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, suite: evalCase.suite, @@ -1300,14 +1278,11 @@ export async function runEvaluation( assertions: [], output: [], target: target.name, - error: `Failed: dependency failed (${failedDeps.join(', ')})`, + error: errorMsg, executionStatus: 'execution_error', failureStage: 'setup', failureReasonCode: 'dependency_failed', - executionError: { - message: `Failed: dependency failed (${failedDeps.join(', ')})`, - stage: 'setup', - }, + executionError: { message: errorMsg, stage: 'setup' }, }; if (onProgress) { await onProgress({ @@ -1315,15 +1290,15 @@ export async function runEvaluation( testId: evalCase.id, status: 'failed', completedAt: Date.now(), - error: failResult.error, + error: depFailResult.error, score: 0, - executionStatus: failResult.executionStatus, + executionStatus: depFailResult.executionStatus, }); } if (onResult) { - await onResult(failResult); + await onResult(depFailResult); } - return failResult; + return depFailResult; } // policy === 'run': fall through to dispatch with dependency results } diff --git a/packages/core/test/evaluation/dependency-scheduling.test.ts b/packages/core/test/evaluation/dependency-scheduling.test.ts index 77bcf1c00..b08f873e2 100644 --- a/packages/core/test/evaluation/dependency-scheduling.test.ts +++ b/packages/core/test/evaluation/dependency-scheduling.test.ts @@ -236,17 +236,26 @@ describe('dependency-aware scheduling', () => { }); describe('on_dependency_failure policies', () => { - it('skip (default): skips downstream when dependency fails', async () => { - const provider = new FixedProvider('mock', { - output: [{ role: 'assistant', content: 'answer' }], - }); - + // Use a provider that throws for 'dep' to produce an execution_error + const errorOnDepProvider: Provider = { + id: 'mock:error-on-dep', + kind: 'mock' as const, + targetName: 'error-on-dep', + async invoke(request: ProviderRequest): Promise { + if (request.evalCaseId === 'dep') { + throw new Error('Simulated provider crash'); + } + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + it('skip (default): skips downstream when dependency has execution error', async () => { const results = await runEvaluation({ testFilePath: 'in-memory.yaml', repoRoot: '/tmp', target: baseTarget, - providerFactory: () => provider, - evaluators: failingEvaluatorRegistry, + providerFactory: () => errorOnDepProvider, + evaluators: passingEvaluatorRegistry, evalCases: [makeTest('dep'), makeTest('downstream', { depends_on: ['dep'] })], }); @@ -258,17 +267,13 @@ describe('dependency-aware scheduling', () => { expect(downstream?.executionStatus).toBe('execution_error'); }); - it('fail: marks downstream as failed when dependency fails', async () => { - const provider = new FixedProvider('mock', { - output: [{ role: 'assistant', content: 'answer' }], - }); - + it('fail: marks downstream as failed when dependency has execution error', async () => { const results = await runEvaluation({ testFilePath: 'in-memory.yaml', repoRoot: '/tmp', target: baseTarget, - providerFactory: () => provider, - evaluators: failingEvaluatorRegistry, + providerFactory: () => errorOnDepProvider, + evaluators: passingEvaluatorRegistry, evalCases: [ makeTest('dep'), makeTest('downstream', { depends_on: ['dep'], on_dependency_failure: 'fail' }), @@ -282,16 +287,94 @@ describe('dependency-aware scheduling', () => { expect(downstream?.score).toBe(0); }); - it('run: executes downstream even when dependency fails', async () => { + it('run: executes downstream even when dependency has execution error', async () => { const executionOrder: string[] = []; + const trackingErrorProvider: Provider = { + id: 'mock:tracking-error', + kind: 'mock' as const, + targetName: 'tracking-error', + async invoke(request: ProviderRequest): Promise { + const testId = request.evalCaseId ?? 'unknown'; + executionOrder.push(testId); + if (testId === 'dep') { + throw new Error('Simulated provider crash'); + } + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => trackingErrorProvider, + evaluators: passingEvaluatorRegistry, + evalCases: [ + makeTest('dep'), + makeTest('downstream', { depends_on: ['dep'], on_dependency_failure: 'run' }), + ], + }); + + expect(results).toHaveLength(2); + // Both tests should have been executed (dep threw but downstream runs anyway) + expect(executionOrder).toContain('dep'); + expect(executionOrder).toContain('downstream'); + }); + }); + + describe('transitive dependency cascade', () => { + it('cascades skip across A -> B -> C when A has execution error', async () => { + // Provider that throws for test 'a' (execution error) + const errorProvider: Provider = { + id: 'mock:error', + kind: 'mock' as const, + targetName: 'error', + async invoke(request: ProviderRequest): Promise { + if (request.evalCaseId === 'a') { + throw new Error('Simulated provider crash'); + } + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: '/tmp', + target: baseTarget, + providerFactory: () => errorProvider, + evaluators: passingEvaluatorRegistry, + evalCases: [ + makeTest('a'), + makeTest('b', { depends_on: ['a'] }), + makeTest('c', { depends_on: ['b'] }), + ], + }); + + expect(results).toHaveLength(3); + const resultA = results.find((r) => r.testId === 'a'); + const resultB = results.find((r) => r.testId === 'b'); + const resultC = results.find((r) => r.testId === 'c'); + // A has execution error (provider threw) + expect(resultA?.executionStatus).toBe('execution_error'); + // B is skipped because A failed + expect(resultB?.error).toContain('dependency failed'); + expect(resultB?.executionStatus).toBe('execution_error'); + // C is skipped because B was skipped (cascade) + expect(resultC?.error).toContain('dependency failed'); + expect(resultC?.executionStatus).toBe('execution_error'); + }); + }); + + describe('quality_failure does NOT trigger dependency failure', () => { + it('runs downstream even when dependency scores below threshold', async () => { + const executionOrder: string[] = []; const trackingProvider: Provider = { id: 'mock:tracking', kind: 'mock' as const, targetName: 'tracking', async invoke(request: ProviderRequest): Promise { - const testId = request.evalCaseId ?? 'unknown'; - executionOrder.push(testId); + executionOrder.push(request.evalCaseId ?? 'unknown'); return { output: [{ role: 'assistant', content: 'ok' }] }; }, }; @@ -301,17 +384,23 @@ describe('dependency-aware scheduling', () => { repoRoot: '/tmp', target: baseTarget, providerFactory: () => trackingProvider, - evaluators: failingEvaluatorRegistry, + evaluators: failingEvaluatorRegistry, // scores 0.2 — quality_failure, not execution_error evalCases: [ makeTest('dep'), - makeTest('downstream', { depends_on: ['dep'], on_dependency_failure: 'run' }), + makeTest('downstream', { depends_on: ['dep'] }), // default: skip ], }); expect(results).toHaveLength(2); - // Both tests should have been executed + // Both tests should execute — quality failure is NOT a dependency failure expect(executionOrder).toContain('dep'); expect(executionOrder).toContain('downstream'); + // dep scored poorly but ran fine + const depResult = results.find((r) => r.testId === 'dep'); + expect(depResult?.executionStatus).toBe('quality_failure'); + // downstream ran (not skipped) + const downstreamResult = results.find((r) => r.testId === 'downstream'); + expect(downstreamResult?.error).toBeUndefined(); }); });