diff --git a/README.md b/README.md index 64f5a3b..c61ff9c 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ const config: SkillGymConfig = { reporter: "standard", schedule: "serial", maxParallel: 4, + retryFailed: 1, maxSteps: 4, }, defaults: { @@ -144,6 +145,7 @@ Most important config properties: - `run.reporter`: built-in `standard` reporter or a custom reporter module path - `run.schedule`: execution scheduling mode for case x runner pairs - `run.maxParallel`: maximum concurrent executions for non-serial schedules, defaulting to available CPU parallelism +- `run.retryFailed`: rerun only failed case x runner executions up to this many additional attempts - `run.maxSteps`: best-effort limit on streamed agent steps before skillgym terminates the run - `run.workspace`: default workspace mode for the suite - `defaults.timeoutMs`: default per-case timeout @@ -165,6 +167,8 @@ For concurrent schedules, `run.maxParallel` defaults to `os.availableParallelism Concurrent schedules do not copy or isolate the workspace by themselves. Overlapping runs may still interact through the same filesystem state and live runner output unless you use isolated workspaces. OpenCode, Codex, and Claude Code runtime state are isolated per run under each artifact directory. +`run.retryFailed` is useful when broad benchmark runs include occasional flaky agent failures. SkillGym only retries executions that still count as failed after result classification, keeps each attempt's artifacts, and reports whether a final pass came from a retry. + `run.maxSteps` is enforced on a best-effort basis by monitoring each runner's streamed JSONL output. A step is one observed model round, not one token and not necessarily one tool call, but the exact boundary is still runner-defined, so the same prompt may consume different numbers of steps across agents. When the observed step count exceeds the configured limit, skillgym kills the agent process, fails the run with origin `max-steps`, and preserves raw stdout/stderr artifacts for debugging. No partial normalized report is produced for that failure. ## Workspaces diff --git a/examples/flaky-retry-suite.ts b/examples/flaky-retry-suite.ts new file mode 100644 index 0000000..f53ab28 --- /dev/null +++ b/examples/flaky-retry-suite.ts @@ -0,0 +1,25 @@ +import { access, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { assert, type TestCase } from "skillgym"; + +const markerPath = path.join(os.tmpdir(), "skillgym-flaky-retry-example-6.marker"); + +const suite: TestCase[] = [ + { + id: "retry-once", + prompt: "Reply exactly: skillgym retry example", + async assert(_report, ctx) { + try { + await access(markerPath); + } catch { + await writeFile(markerPath, "seen", "utf8"); + throw new Error("Intentional first-run failure. Run the same suite again."); + } + + assert.match(ctx.finalOutput(), /skillgym retry example/i); + }, + }, +]; + +export default suite; diff --git a/src/cli.ts b/src/cli.ts index c60efa8..3f56174 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -22,6 +22,7 @@ async function main(): Promise { const scheduleOption = parsed.options.schedule; const configOption = parsed.options.config; const maxParallelOption = parsed.options["max-parallel"]; + const retryFailedOption = parsed.options["retry-failed"]; const updateSnapshotsOption = parsed.options["update-snapshots"]; const snapshotsOption = parsed.options.snapshots; const tagOption = parsed.options.tag; @@ -35,6 +36,7 @@ async function main(): Promise { reporter: getStringOption(reporterOption), schedule: getStringOption(scheduleOption), maxParallel: getStringOption(maxParallelOption), + retryFailed: getStringOption(retryFailedOption), tags: parseTagOption(tagOption), reporterCwd: process.cwd(), configPath: getStringOption(configOption), diff --git a/src/cli/help.ts b/src/cli/help.ts index 8ab9111..5383076 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -17,6 +17,7 @@ ${theme.bold("Run Options:")} --output-dir ${theme.accent("")} Override where run artifacts are written --schedule ${theme.accent("")} Choose ${theme.light("serial")}, ${theme.light("parallel")}, or ${theme.light("isolated-by-runner")} --max-parallel ${theme.accent("")} Cap concurrent executions for non-serial schedules + --retry-failed ${theme.accent("")} Retry only failed case x runner executions up to ${theme.light("n")} extra times --case ${theme.accent("")} Filter the configured suite to one case id --tag ${theme.accent("")} Filter cases by tag; repeat or comma-separate for OR matching --runner ${theme.accent("")} Filter the configured runner set by runner id @@ -32,6 +33,7 @@ ${theme.bold("Examples:")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --reporter standard")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule isolated-by-runner")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule parallel --max-parallel 4")} + ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --retry-failed 2")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --update-snapshots")} `); } diff --git a/src/cli/run.ts b/src/cli/run.ts index 3b34a22..f088591 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -19,6 +19,7 @@ export async function runCommand(options: { outputDir?: string; schedule?: string; maxParallel?: string; + retryFailed?: string; caseId?: string; runner?: string; reporter?: string; @@ -38,6 +39,7 @@ export async function runCommand(options: { outputDir: options.outputDir, schedule: options.schedule, maxParallel: options.maxParallel, + retryFailed: options.retryFailed, tags: options.tags, }, loadedConfig.config, @@ -76,6 +78,7 @@ export async function runCommand(options: { outputDir: runOptions.outputDir, schedule: runOptions.schedule, maxParallel: runOptions.maxParallel, + retryFailed: runOptions.retryFailed, caseId: options.caseId, runner: options.runner, tags: runOptions.tags, diff --git a/src/config.ts b/src/config.ts index e47caa3..5bae9cb 100644 --- a/src/config.ts +++ b/src/config.ts @@ -24,6 +24,7 @@ const RUN_KEYS = [ "workspace", "maxSteps", "maxParallel", + "retryFailed", "tags", ] as const; const DEFAULT_KEYS = ["timeoutMs"] as const; @@ -64,6 +65,7 @@ export interface SkillGymConfig { workspace?: SuiteWorkspaceConfig; maxSteps?: number; maxParallel?: number; + retryFailed?: number; tags?: string[]; }; defaults?: { @@ -121,6 +123,7 @@ export function resolveRunOptions( outputDir?: string; schedule?: string; maxParallel?: string; + retryFailed?: string; tags?: string[]; }, config: SkillGymConfig, @@ -129,12 +132,17 @@ export function resolveRunOptions( outputDir?: string; schedule: ScheduleMode; maxParallel?: number; + retryFailed: number; tags: string[]; } { const maxParallel = cliOptions.maxParallel !== undefined ? parseIntegerString(cliOptions.maxParallel, "CLI option --max-parallel", 1) : config.run?.maxParallel; + const retryFailed = + cliOptions.retryFailed !== undefined + ? parseIntegerString(cliOptions.retryFailed, "CLI option --retry-failed", 0) + : (config.run?.retryFailed ?? 0); return { cwd: @@ -150,6 +158,7 @@ export function resolveRunOptions( ? parseScheduleMode(cliOptions.schedule, "CLI option --schedule") : (config.run?.schedule ?? "serial"), ...(maxParallel === undefined ? {} : { maxParallel }), + retryFailed, tags: cliOptions.tags ?? config.run?.tags ?? [], }; } @@ -256,6 +265,9 @@ function resolveConfigPaths(config: SkillGymConfig, configDir: string): SkillGym ...(config.run.maxParallel === undefined ? {} : { maxParallel: config.run.maxParallel }), + ...(config.run.retryFailed === undefined + ? {} + : { retryFailed: config.run.retryFailed }), ...(config.run.tags === undefined ? {} : { tags: config.run.tags }), }, defaults: @@ -346,6 +358,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run const maxSteps = parseOptionalInteger(record.maxSteps, `${configPath}.maxSteps`, 1); const maxParallel = parseOptionalInteger(record.maxParallel, `${configPath}.maxParallel`, 1); + const retryFailed = parseOptionalInteger(record.retryFailed, `${configPath}.retryFailed`, 0); return { cwd: parseOptionalNonEmptyString(record.cwd, `${configPath}.cwd`), @@ -355,6 +368,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run workspace: parseOptionalWorkspaceConfig(record.workspace, `${configPath}.workspace`), ...(maxSteps === undefined ? {} : { maxSteps }), ...(maxParallel === undefined ? {} : { maxParallel }), + ...(retryFailed === undefined ? {} : { retryFailed }), tags: parseOptionalStringArray(record.tags, `${configPath}.tags`), }; } diff --git a/src/domain/result.ts b/src/domain/result.ts index 3d13a53..0699428 100644 --- a/src/domain/result.ts +++ b/src/domain/result.ts @@ -1,7 +1,7 @@ import type { RunnerInfo } from "./runner.js"; import type { SessionReport } from "./session-report.js"; -export interface RunnerResult { +interface BaseRunnerResult { runner: RunnerInfo; passed: boolean; status: RunnerResultStatus; @@ -15,6 +15,15 @@ export interface RunnerResult { failureLogPath?: string; } +export interface RunnerAttemptResult extends BaseRunnerResult { + attempt: number; +} + +export interface RunnerResult extends BaseRunnerResult { + attempt?: number; + attempts?: RunnerAttemptResult[]; +} + export interface FailureClass { id: string; label?: string; diff --git a/src/index.ts b/src/index.ts index 900007e..28af200 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ export type { } from "./domain/test-case.js"; export type { FailureClass, + RunnerAttemptResult, RunnerFailureOrigin, RunnerFailureType, RunnerResult, diff --git a/src/reporters/contract.ts b/src/reporters/contract.ts index 8bfec39..35ef646 100644 --- a/src/reporters/contract.ts +++ b/src/reporters/contract.ts @@ -38,6 +38,8 @@ export interface RunnerStartEvent { context: ReporterContext; testCase: TestCase; runner: RunnerInfo; + attempt?: number; + maxAttempts?: number; caseIndex: number; totalCases: number; } @@ -47,6 +49,8 @@ export interface RunnerFinishEvent { testCase: TestCase; runner: RunnerInfo; result: RunnerResult; + attempt?: number; + maxAttempts?: number; caseIndex: number; totalCases: number; } diff --git a/src/reporters/github-actions.ts b/src/reporters/github-actions.ts index 2ada8d4..5ddac96 100644 --- a/src/reporters/github-actions.ts +++ b/src/reporters/github-actions.ts @@ -49,6 +49,11 @@ function formatAnnotationCommand(caseId: string, result: RunnerResult): string { function formatAnnotationMessage(result: RunnerResult): string { const lines = [`failure type: ${result.failureType ?? "unknown"}`]; + const retryCount = countRetries(result); + + if (retryCount > 0) { + lines.push(`retries: ${String(retryCount)}`); + } if (result.failureOrigin !== undefined) { lines.push(`failure origin: ${result.failureOrigin}`); @@ -147,7 +152,8 @@ function formatRunnerAgentLabel(runner: RunnerSummary["runner"]): string { function formatRunnerCaseRow(caseId: string, result: RunnerResult): string { const status = result.passed ? "✅" : "❌"; const usage = result.report.usage; - return `| ${status} \`${caseId}\` | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`; + const retryLabel = formatRetryLabel(result); + return `| ${status} \`${caseId}\`${retryLabel === undefined ? "" : ` ${retryLabel}`} | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`; } function getRunnerCases( @@ -167,6 +173,12 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string `artifacts: \`${result.artifactDir}\``, ]; + const retryCount = countRetries(result); + + if (retryCount > 0) { + segments.splice(2, 0, `retries: ${String(retryCount)}`); + } + if (result.failureClass !== undefined) { segments.splice(2, 0, `class: \`${result.failureClass.id}\``); } @@ -182,6 +194,19 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string return segments.join("; "); } +function formatRetryLabel(result: RunnerResult): string | undefined { + const retryCount = countRetries(result); + if (retryCount === 0) { + return undefined; + } + + return `(${retryCount === 1 ? "1 retry" : `${String(retryCount)} retries`})`; +} + +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); +} + function listFailures(result: SuiteRunResult): Array<{ caseId: string; result: RunnerResult }> { const failures: Array<{ caseId: string; result: RunnerResult }> = []; diff --git a/src/reporters/json-summary.ts b/src/reporters/json-summary.ts index aa6f86a..c59e7c2 100644 --- a/src/reporters/json-summary.ts +++ b/src/reporters/json-summary.ts @@ -14,6 +14,23 @@ interface SummaryError { interface SummaryRunnerResult { runner: RunnerResult["runner"]; passed: boolean; + status: RunnerResult["status"]; + attempt?: number; + retryCount: number; + durationMs: number; + artifactDir: string; + usage: RunnerResult["report"]["usage"]; + attempts?: SummaryAttemptResult[]; + error?: SummaryError; + failureType?: RunnerResult["failureType"]; + failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: FailureClass; +} + +interface SummaryAttemptResult { + passed: boolean; + status: RunnerResult["status"]; + attempt: number; durationMs: number; artifactDir: string; usage: RunnerResult["report"]["usage"]; @@ -46,6 +63,48 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult { const summary: SummaryRunnerResult = { runner: result.runner, passed: result.passed, + status: result.status, + attempt: result.attempt, + retryCount: countRetries(result), + durationMs: result.durationMs, + artifactDir: result.artifactDir, + usage: result.report.usage, + }; + + if (result.attempts !== undefined) { + summary.attempts = result.attempts.map(summarizeAttemptResult); + } + + if (result.error !== undefined) { + summary.error = { name: result.error.name, message: result.error.message }; + } + + if (result.failureType !== undefined) { + summary.failureType = result.failureType; + } + + if (result.failureOrigin !== undefined) { + summary.failureOrigin = result.failureOrigin; + } + + if (result.failureClass !== undefined) { + summary.failureClass = result.failureClass; + } + + return summary; +} + +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); +} + +function summarizeAttemptResult( + result: NonNullable[number], +): SummaryAttemptResult { + const summary: SummaryAttemptResult = { + passed: result.passed, + status: result.status, + attempt: result.attempt, durationMs: result.durationMs, artifactDir: result.artifactDir, usage: result.report.usage, diff --git a/src/reporters/standard.ts b/src/reporters/standard.ts index cf2a7a1..7e289b0 100644 --- a/src/reporters/standard.ts +++ b/src/reporters/standard.ts @@ -29,6 +29,7 @@ interface FailureEntry { caseId: string; runner: RunnerInfo; artifactDir: string; + attempts?: RunnerResult["attempts"]; error?: SerializedError; failureType?: RunnerFailureType; failureOrigin?: RunnerFailureOrigin; @@ -61,6 +62,7 @@ interface InteractiveRunEntry { caseId: string; runner: RunnerInfo; status: InteractiveRunStatus; + retryCount: number; } interface InteractiveState { @@ -93,7 +95,6 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B colors.isColorSupported ? `${ACCENT_OPEN}${value}${ACCENT_CLOSE}` : value; const symbols: ReporterSymbols = getSymbols(unicode); const spinner = unicode ? cliSpinners.dots : cliSpinners.line; - const failures: FailureEntry[] = []; let interactiveState: InteractiveState | undefined; return { @@ -148,37 +149,26 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B } const key = createRunKey(event.testCase.id, event.runner.id); - setInteractiveRunStatus(interactiveState, key, "running"); + setInteractiveRunResult(interactiveState, key, { status: "running", retryCount: 0 }); interactiveState.spinnerFrameIndex = 0; renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); startSpinner(interactiveState, stdout, colors, symbols, spinner.frames, spinner.interval); }, onRunnerFinish(event) { if (interactive && interactiveState !== undefined) { - setInteractiveRunStatus( + setInteractiveRunResult( interactiveState, createRunKey(event.testCase.id, event.runner.id), - event.result.status, + { + status: event.result.status, + retryCount: countRetries(event.result), + }, ); if (!hasRunningEntries(interactiveState)) { stopSpinner(interactiveState); } renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); } - - if (!event.result.passed) { - failures.push({ - caseId: event.testCase.id, - runner: event.result.runner, - artifactDir: event.result.artifactDir, - error: event.result.error, - failureType: event.result.failureType, - failureOrigin: event.result.failureOrigin, - failureClass: event.result.failureClass, - failureLogPath: event.result.failureLogPath, - status: event.result.status, - }); - } }, onCaseFinish(event) { if (interactive) { @@ -188,6 +178,8 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B writeLine(formatCaseRow(event.result, symbols), stdout); }, onSuiteFinish(event) { + const failures = collectFinalFailures(event.result); + if (interactiveState !== undefined) { stopSpinner(interactiveState); renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); @@ -200,7 +192,13 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B writeLine(formatRunnerLegend(colors), stdout); for (const caseResult of getRunnerCases(event.result, summary.runner.id)) { writeLine( - formatRunnerCaseRow(caseResult.caseId, caseResult.runnerResult, symbols, accent), + formatRunnerCaseRow( + caseResult.caseId, + caseResult.runnerResult, + symbols, + accent, + colors, + ), stdout, ); } @@ -299,13 +297,16 @@ function formatRunnerCaseRow( result: RunnerResult, symbols: ReturnType, accent: (value: string) => string, + _colors: ReturnType, ): string { const color = result.passed ? pc.green : pc.red; const statusLabel = formatStatusLabel(result.status); + const caseLabel = `${result.passed ? symbols.pass : symbols.fail} ${caseId}`; + return [ - color(padCell(`${result.passed ? symbols.pass : symbols.fail} ${caseId}`, RUNNER_CASE_WIDTH)), + color(padCell(caseLabel, RUNNER_CASE_WIDTH)), padCell(formatDuration(result.durationMs), RUNNER_TIME_WIDTH), - statusLabel === undefined + (statusLabel === undefined ? "" : statusLabel) === "" ? formatTokenSummary(result.report.usage, accent) : `${formatTokenSummary(result.report.usage, accent)} ${pc.dim(statusLabel)}`, ].join(" "); @@ -381,6 +382,10 @@ function formatFailureBlock( lines.push(colors.dim(`Log: ${failure.failureLogPath}`)); } + if (failure.attempts !== undefined && failure.attempts.length > 1) { + lines.push(colors.dim(`Attempts: ${String(failure.attempts.length)}`)); + } + lines.push(colors.dim(`Artifacts: ${failure.artifactDir}`)); return lines.join("\n"); } @@ -588,6 +593,31 @@ function getRunnerCases( }); } +function collectFinalFailures(result: SuiteRunResult): FailureEntry[] { + return result.cases.flatMap((caseResult) => + caseResult.runnerResults.flatMap((runnerResult) => { + if (runnerResult.passed) { + return []; + } + + return [ + { + caseId: caseResult.caseId, + runner: runnerResult.runner, + artifactDir: runnerResult.artifactDir, + attempts: runnerResult.attempts, + error: runnerResult.error, + failureType: runnerResult.failureType, + failureOrigin: runnerResult.failureOrigin, + failureClass: runnerResult.failureClass, + failureLogPath: runnerResult.failureLogPath, + status: runnerResult.status, + }, + ]; + }), + ); +} + function createInteractiveState(event: SuiteStartEvent): InteractiveState { const entries = event.cases.flatMap((testCase) => { return event.runners.map((runner) => ({ @@ -595,6 +625,7 @@ function createInteractiveState(event: SuiteStartEvent): InteractiveState { caseId: testCase.id, runner, status: "queued" as const, + retryCount: 0, })); }); @@ -610,10 +641,10 @@ function createRunKey(caseId: string, runnerId: string): string { return `${caseId}\u0000${runnerId}`; } -function setInteractiveRunStatus( +function setInteractiveRunResult( state: InteractiveState, key: string, - status: InteractiveRunStatus, + result: { status: InteractiveRunStatus; retryCount: number }, ): void { const index = state.entryIndexByKey.get(key); @@ -623,7 +654,8 @@ function setInteractiveRunStatus( state.entries[index] = { ...state.entries[index]!, - status, + status: result.status, + retryCount: result.retryCount, }; } @@ -685,8 +717,10 @@ function formatInteractiveRunRow( ): string { const statusIcon = formatInteractiveStatusIcon(entry, state, colors, symbols, frames); const statusLabel = formatInteractiveStatusLabel(entry.status); + const retryLabel = formatInteractiveRetryLabel(entry, colors); const row = `${statusIcon} ${padCell(entry.caseId, caseWidth)} / ${entry.runner.id}${statusLabel}`; const runnerMeta = ` ${formatRunnerAgentLabel(entry.runner)}`; + const retryMeta = retryLabel === undefined ? "" : ` ${retryLabel}`; switch (entry.status) { case "queued": @@ -695,7 +729,7 @@ function formatInteractiveRunRow( return `${row}${colors.dim(runnerMeta)}`; case "passed": case "expected-failed": - return `${colors.green(row)}${colors.dim(runnerMeta)}`; + return `${colors.green(row)}${colors.dim(runnerMeta)}${retryMeta}`; case "failed": case "unexpected-passed": return `${colors.red(row)}${colors.dim(runnerMeta)}`; @@ -717,6 +751,25 @@ function formatInteractiveStatusLabel(status: InteractiveRunStatus): string { } } +function formatInteractiveRetryLabel( + entry: InteractiveRunEntry, + colors: ReturnType, +): string | undefined { + if (entry.status !== "passed" || entry.retryCount === 0) { + return undefined; + } + + return colors.yellow(formatRetryCountLabel(entry.retryCount)); +} + +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); +} + +function formatRetryCountLabel(retryCount: number): string { + return `(${retryCount === 1 ? "1 retry" : `${String(retryCount)} retries`})`; +} + function formatInteractiveStatusIcon( entry: InteractiveRunEntry, state: InteractiveState, diff --git a/src/runner/execute-suite.ts b/src/runner/execute-suite.ts index b60a7af..77f57d3 100644 --- a/src/runner/execute-suite.ts +++ b/src/runner/execute-suite.ts @@ -2,7 +2,13 @@ import path from "node:path"; import process from "node:process"; import os from "node:os"; import { getCaseExecutionOptions } from "../config.js"; -import type { CaseResult, RunnerResult, RunnerSummary, SuiteRunResult } from "../domain/result.js"; +import type { + CaseResult, + RunnerAttemptResult, + RunnerResult, + RunnerSummary, + SuiteRunResult, +} from "../domain/result.js"; import type { ResolvedRunner, RunnerConfig, RunnerInfo } from "../domain/runner.js"; import type { ScheduleMode } from "../domain/schedule.js"; import type { SuiteWorkspaceConfig, TestCase } from "../domain/test-case.js"; @@ -49,6 +55,7 @@ export async function executeSuite( outputDir?: string; schedule?: ScheduleMode; maxParallel?: number; + retryFailed?: number; caseId?: string; runner?: string; tags?: string[]; @@ -59,6 +66,7 @@ export async function executeSuite( run?: { workspace?: SuiteWorkspaceConfig; maxSteps?: number; + retryFailed?: number; tags?: string[]; }; runners: Record; @@ -76,6 +84,7 @@ export async function executeSuite( const outputDir = path.resolve(options.outputDir ?? ".skillgym-results", timestampDirName()); const scheduleMode = options.schedule ?? "serial"; const maxParallel = resolveMaxParallel(scheduleMode, options.maxParallel); + const retryFailed = options.retryFailed ?? options.config.run?.retryFailed ?? 0; await ensureDir(outputDir); const selectedRunners = selectRunners(options.config.runners, options.runner); const normalizedCases = normalizeTestCases(testCases); @@ -196,6 +205,7 @@ export async function executeSuite( snapshots: options.snapshots, snapshotStore, maxSteps: options.config.run?.maxSteps, + retryFailed, reporter: options.reporter, rejectedRunners, }); @@ -214,6 +224,7 @@ export async function executeSuite( snapshots: options.snapshots, snapshotStore, maxSteps: options.config.run?.maxSteps, + retryFailed, reporter: options.reporter, rejectedRunners, }); @@ -317,6 +328,7 @@ async function executePlannedExecution( snapshots?: SnapshotRuntimeOptions; snapshotStore?: SnapshotStore; maxSteps?: number; + retryFailed: number; reporter?: BenchmarkReporter; rejectedRunners: Map; }, @@ -338,14 +350,6 @@ async function executePlannedExecution( }); } - await options.reporter?.onRunnerStart?.({ - context: options.context, - testCase: item.testCase, - runner: item.runner.info, - caseIndex: item.caseIndex + 1, - totalCases: options.selectedCases.length, - }); - const artifactDir = path.join( options.outputDir, sanitizePathSegment(item.testCase.id), @@ -353,47 +357,87 @@ async function executePlannedExecution( ); await ensureDir(artifactDir); - const rejectedResult = options.rejectedRunners.get(item.runner.id); - const rawResult = - rejectedResult === undefined - ? await runExecution(item, { - resolvedWorkspace: options.resolvedWorkspace, - executeRunnerFn: options.executeRunnerFn, - outputDir: options.outputDir, - maxSteps: options.maxSteps, - snapshots: options.snapshots, - snapshotStore: options.snapshotStore, - }) - : await createRejectedModelResult(item, artifactDir); - - if (rejectedResult === undefined && (await isModelRejectedResult(rawResult))) { - rawResult.failureType = "runner-crash"; - rawResult.failureOrigin = "model-rejected"; - if (rawResult.error?.name === "AssertionError" || rawResult.error === undefined) { - rawResult.error = { - name: "Error", - message: `Runner rejected configured model "${item.runner.info.agent.model ?? "unknown"}" during initial execution.`, - }; + const maxAttempts = options.retryFailed + 1; + const attempts: RunnerAttemptResult[] = []; + let result: RunnerResult | undefined; + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + await options.reporter?.onRunnerStart?.({ + context: options.context, + testCase: item.testCase, + runner: item.runner.info, + attempt, + maxAttempts, + caseIndex: item.caseIndex + 1, + totalCases: options.selectedCases.length, + }); + + const attemptArtifactDir = resolveAttemptArtifactDir(artifactDir, attempt); + await ensureDir(attemptArtifactDir); + + const rejectedResult = options.rejectedRunners.get(item.runner.id); + const rawResult = + rejectedResult === undefined + ? await runExecution(item, { + artifactDir: attemptArtifactDir, + resolvedWorkspace: options.resolvedWorkspace, + executeRunnerFn: options.executeRunnerFn, + outputDir: options.outputDir, + maxSteps: options.maxSteps, + snapshots: options.snapshots, + snapshotStore: options.snapshotStore, + }) + : await createRejectedModelResult(item, attemptArtifactDir); + + if (rejectedResult === undefined && (await isModelRejectedResult(rawResult))) { + rawResult.failureType = "runner-crash"; + rawResult.failureOrigin = "model-rejected"; + if (rawResult.error?.name === "AssertionError" || rawResult.error === undefined) { + rawResult.error = { + name: "Error", + message: `Runner rejected configured model "${item.runner.info.agent.model ?? "unknown"}" during initial execution.`, + }; + } + rawResult.failureLogPath ??= path.join(attemptArtifactDir, "stderr.log"); + options.rejectedRunners.set(item.runner.id, rawResult); + await writeJson(path.join(attemptArtifactDir, "error.json"), rawResult.error); + await writeJson(path.join(attemptArtifactDir, "report.json"), rawResult.report); + } + + const classifiedAttempt = createAttemptResult( + classifyExpectedFailure(item.testCase, rawResult), + attempt, + ); + attempts.push(classifiedAttempt); + result = { + ...classifiedAttempt, + attempts: [...attempts], + }; + + await options.reporter?.onRunnerFinish?.({ + context: options.context, + testCase: item.testCase, + runner: item.runner.info, + result, + attempt, + maxAttempts, + caseIndex: item.caseIndex + 1, + totalCases: options.selectedCases.length, + }); + + if (!shouldRetry(classifiedAttempt, options.retryFailed, attempt)) { + break; } - rawResult.failureLogPath ??= path.join(artifactDir, "stderr.log"); - options.rejectedRunners.set(item.runner.id, rawResult); - await writeJson(path.join(artifactDir, "error.json"), rawResult.error); - await writeJson(path.join(artifactDir, "report.json"), rawResult.report); } - const result = classifyExpectedFailure(item.testCase, rawResult); + if (result === undefined) { + throw new Error( + `Execution finished without a result for ${item.testCase.id} > ${item.runner.id}`, + ); + } options.caseResults[item.caseIndex]!.runnerResults[item.runnerIndex] = result; - await options.reporter?.onRunnerFinish?.({ - context: options.context, - testCase: item.testCase, - runner: item.runner.info, - result, - caseIndex: item.caseIndex + 1, - totalCases: options.selectedCases.length, - }); - state.completedRuns += 1; if (state.completedRuns === options.selectedRunners.length) { @@ -412,6 +456,7 @@ async function executePlannedExecution( async function runExecution( item: PlannedSuiteExecution, options: { + artifactDir: string; resolvedWorkspace: ReturnType; executeRunnerFn: typeof executeRunner; outputDir: string; @@ -420,18 +465,13 @@ async function runExecution( snapshotStore?: SnapshotStore; }, ): Promise { - const artifactDir = path.join( - options.outputDir, - sanitizePathSegment(item.testCase.id), - item.runner.info.pathKey, - ); const executionStartedMs = Date.now(); let result: RunnerResult; let preparedWorkspace; try { preparedWorkspace = await prepareWorkspace(options.resolvedWorkspace, { - artifactDir, + artifactDir: options.artifactDir, outputDir: options.outputDir, testCase: item.testCase, runner: item.runner.info, @@ -444,7 +484,7 @@ async function runExecution( getAdapter(item.runner.config.agent), { cwd: preparedWorkspace.cwd, - artifactDir, + artifactDir: options.artifactDir, timeoutMs: item.timeoutMs, maxSteps: options.maxSteps, snapshots: @@ -458,19 +498,19 @@ async function runExecution( result = createExecutionFailureResult(error, { testCase: item.testCase, runner: item.runner.info, - artifactDir, + artifactDir: options.artifactDir, durationMs: Date.now() - executionStartedMs, failureOrigin: isWorkspaceFailure ? classifyWorkspaceFailureOrigin(error) : undefined, failureLogPath: isWorkspaceFailure - ? resolveWorkspaceFailureLogPath(artifactDir, error) + ? resolveWorkspaceFailureLogPath(options.artifactDir, error) : undefined, }); - await writeJson(path.join(artifactDir, "error.json"), result.error); - await writeJson(path.join(artifactDir, "report.json"), result.report); + await writeJson(path.join(options.artifactDir, "error.json"), result.error); + await writeJson(path.join(options.artifactDir, "report.json"), result.report); } finally { if (preparedWorkspace !== undefined) { await finalizeWorkspace(preparedWorkspace, { - artifactDir, + artifactDir: options.artifactDir, passed: result!.passed, }); } @@ -479,6 +519,21 @@ async function runExecution( return result; } +function createAttemptResult(result: RunnerResult, attempt: number): RunnerAttemptResult { + return { + ...result, + attempt, + }; +} + +function shouldRetry(result: RunnerAttemptResult, retryFailed: number, attempt: number): boolean { + return !result.passed && attempt <= retryFailed && result.failureOrigin !== "model-rejected"; +} + +function resolveAttemptArtifactDir(artifactDir: string, attempt: number): string { + return attempt === 1 ? artifactDir : path.join(artifactDir, `attempt-${String(attempt)}`); +} + async function createRejectedModelResult( item: PlannedSuiteExecution, artifactDir: string, diff --git a/test/cli.test.ts b/test/cli.test.ts index 60b9a8b..a5291a8 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -42,6 +42,7 @@ test("cli help prints full MOTD banner and help sections", async () => { expect(result.stdout).toContain("Run Options:"); expect(result.stdout).toContain("--schedule "); expect(result.stdout).toContain("--max-parallel "); + expect(result.stdout).toContain("--retry-failed "); expect(result.stdout).toContain("Examples:"); }); @@ -339,6 +340,81 @@ test("cli run passes repeated and comma-separated tag filters to execution", asy unmockRunCommandDependencies(); }); +test("cli run passes retryFailed through to execution", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "skillgym-cli-")); + tempDirs.push(tempDir); + const executeSuite = vi.fn(async () => ({ + suitePath: path.join(tempDir, "suite.ts"), + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:01.000Z", + durationMs: 1_000, + outputDir: path.join(tempDir, ".skillgym-results", "run-1"), + declaredTags: [], + selectedTags: [], + cases: [ + { + caseId: "alpha", + tags: [], + passed: true, + runnerResults: [{ passed: true, status: "passed" }], + }, + ], + runners: [], + })); + + vi.resetModules(); + vi.doMock("../src/config.js", () => ({ + loadConfig: vi.fn(async () => ({ + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + filePath: path.join(tempDir, "skillgym.config.ts"), + })), + resolveReporterOptions: vi.fn(() => ({ reporter: undefined, cwd: tempDir })), + resolveRunOptions: vi.fn((options) => ({ + cwd: tempDir, + outputDir: path.join(tempDir, ".skillgym-results"), + schedule: "serial", + retryFailed: Number(options.retryFailed ?? 0), + tags: options.tags, + })), + })); + vi.doMock("../src/reporters/index.js", () => ({ + loadReporter: vi.fn(async () => undefined), + })); + vi.doMock("../src/snapshots/store.js", () => ({ + createSnapshotRuntimeOptions: vi.fn(() => undefined), + })); + vi.doMock("../src/runner/load-suite.js", () => ({ + loadSuite: vi.fn(async () => ({ + cases: [{ id: "alpha", prompt: "Say hello", tags: ["smoke"], assert() {} }], + workspace: undefined, + dirPath: tempDir, + })), + })); + vi.doMock("../src/runner/workspace.js", () => ({ + resolveEffectiveWorkspace: vi.fn(() => ({ mode: "shared", cwd: tempDir })), + })); + vi.doMock("../src/runner/execute-suite.js", () => ({ + executeSuite, + })); + + const { runCommand } = await import("../src/cli/run.js"); + + await expect( + runCommand({ suitePath: "./suite.ts", cwd: tempDir, retryFailed: "2" }), + ).resolves.toBeUndefined(); + expect(executeSuite).toHaveBeenCalledWith( + "./suite.ts", + expect.any(Array), + expect.objectContaining({ retryFailed: 2 }), + ); + + unmockRunCommandDependencies(); +}); + async function execCli(args: string[], cwd = repoRoot) { return execFileCapture( process.execPath, diff --git a/test/config.test.ts b/test/config.test.ts index 66f28a2..1bfb663 100644 --- a/test/config.test.ts +++ b/test/config.test.ts @@ -247,6 +247,15 @@ describe("config", () => { expect(parsed.run?.maxParallel).toBe(3); }); + test("parses run retryFailed", () => { + const parsed = parseConfig({ + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }); + + expect(parsed.run?.retryFailed).toBe(2); + }); + test("accepts cursor-agent runner configs", () => { const parsed = parseConfig({ runners: { @@ -312,6 +321,7 @@ describe("config", () => { outputDir: path.join(tempDir, "config-results"), schedule: "parallel", maxParallel: 2, + retryFailed: 0, tags: [], }); }); @@ -335,6 +345,35 @@ describe("config", () => { ).toThrow("Invalid config at CLI option --max-parallel: expected integer >= 1"); }); + test("run options support retryFailed in config and CLI", () => { + expect( + resolveRunOptions( + {}, + { + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }, + ), + ).toMatchObject({ retryFailed: 2 }); + + expect( + resolveRunOptions( + { retryFailed: "3" }, + { + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }, + ), + ).toMatchObject({ retryFailed: 3 }); + + expect(() => + resolveRunOptions( + { retryFailed: "-1" }, + { runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } } }, + ), + ).toThrow("Invalid config at CLI option --retry-failed: expected integer >= 0"); + }); + test("run options support config tags and let CLI tags override config", () => { expect( resolveRunOptions( diff --git a/test/reporters/github-actions.test.ts b/test/reporters/github-actions.test.ts index df390ec..c9c75dd 100644 --- a/test/reporters/github-actions.test.ts +++ b/test/reporters/github-actions.test.ts @@ -30,14 +30,19 @@ test("github-actions reporter formats escaped annotations for failed runs", asyn await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case,a", errorMessage: "boom,\n100%" }), + result: createSuiteResult({ + runner, + caseId: "case,a", + errorMessage: "boom,\n100%", + attempts: 2, + }), }); expect(writes.join("")).toContain( "::error title=case%2Ca > code%3Amain,file=/workspace/examples/basic-suite.ts,line=14,col=15::", ); expect(writes.join("")).toContain( - "failure type: assertion%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", + "failure type: assertion%0Aretries: 1%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", ); expect(writes.join("")).toContain("artifacts: .skillgym-results/run-1/case,a/code-main"); }); @@ -57,7 +62,7 @@ test("github-actions reporter includes file metadata from user stack frames", as await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case-a" }), + result: createSuiteResult({ runner, caseId: "case-a", attempts: 2 }), }); expect(writes.join("")).toContain("file=/workspace/examples/basic-suite.ts,line=14,col=15"); @@ -79,7 +84,7 @@ test("github-actions reporter writes a job summary when GITHUB_STEP_SUMMARY is s await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case-a" }), + result: createSuiteResult({ runner, caseId: "case-a", attempts: 2 }), }); const summary = await readFile(summaryPath, "utf8"); @@ -89,9 +94,9 @@ test("github-actions reporter writes a job summary when GITHUB_STEP_SUMMARY is s expect(summary).toContain("- Runs: 0 passed, 1 failed"); expect(summary).toContain("### Runner: `open-main` (opencode, openai/gpt-5)"); expect(summary).toContain("| Case | Duration | Input | Output | Reasoning | Cache | Billable |"); - expect(summary).toContain("| ❌ `case-a` | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |"); + expect(summary).toContain("| ❌ `case-a` (1 retry) | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |"); expect(summary).toContain( - "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", + "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; retries: 1; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", ); }); @@ -137,11 +142,13 @@ function createSuiteResult(options: { runner: RunnerInfo; caseId: string; errorMessage?: string; + attempts?: number; }): SuiteRunResult { const runnerResult = createFailedRunnerResult( options.runner, options.caseId, options.errorMessage, + options.attempts, ); return { @@ -161,13 +168,64 @@ function createFailedRunnerResult( runner: RunnerInfo, caseId: string, errorMessage = "expected skill to be loaded before command execution", + attempts = 1, ): RunnerResult { + const artifactDir = `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}`; + return { runner, passed: false, status: "failed", + attempt: attempts, durationMs: 24_800, - artifactDir: `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}`, + artifactDir, + attempts: Array.from({ length: attempts }, (_, index) => ({ + runner, + passed: false, + status: "failed", + attempt: index + 1, + durationMs: 24_800, + artifactDir: + index === 0 ? artifactDir : path.join(artifactDir, `attempt-${String(index + 1)}`), + error: { + name: "AssertionError", + message: errorMessage, + stack: [ + `AssertionError: ${errorMessage}`, + " at assert (/workspace/src/assertions/output.ts:88:10)", + " at Object.assert (/workspace/examples/basic-suite.ts:14:15)", + " at executeRunner (/workspace/src/runner/execute-runner.ts:91:7)", + ].join("\n"), + }, + failureType: "assertion", + failureOrigin: "assertion", + failureLogPath: + index === 0 + ? `${artifactDir}/stderr.log` + : `${path.join(artifactDir, `attempt-${String(index + 1)}`)}/stderr.log`, + report: createSessionReport({ + runner, + usage: { + cacheTokens: 0, + totalTokens: 12_000, + inputTokens: 9_830, + outputTokens: 1_104, + reasoningTokens: 0, + inputChars: 10, + outputChars: 5, + reasoningChars: 0, + source: { + input: "provider", + output: "provider", + reasoning: "provider", + }, + }, + files: { + observedReads: ["a"], + observedSkillReads: [], + }, + }), + })), error: { name: "AssertionError", message: errorMessage, @@ -180,7 +238,7 @@ function createFailedRunnerResult( }, failureType: "assertion", failureOrigin: "assertion", - failureLogPath: `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}/stderr.log`, + failureLogPath: `${artifactDir}/stderr.log`, report: createSessionReport({ runner, usage: { diff --git a/test/reporters/json-summary.test.ts b/test/reporters/json-summary.test.ts index f95d15e..65d9556 100644 --- a/test/reporters/json-summary.test.ts +++ b/test/reporters/json-summary.test.ts @@ -33,8 +33,121 @@ test("json-summary reporter omits session internals and prints summary on suite runner, passed: false, status: "failed", + attempt: 2, durationMs: 18_200, artifactDir: ".skillgym-results/run-1/case-a/open-main", + attempts: [ + { + runner, + passed: false, + status: "failed", + attempt: 1, + durationMs: 20_000, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + error: { + name: "AssertionError", + message: "expected skill to be loaded", + }, + report: { + runner, + sessionId: "sess-attempt-1", + prompt: "Do the thing", + usage: { + inputTokens: 900, + outputTokens: 180, + reasoningTokens: 40, + cacheTokens: 350, + totalTokens: 1080, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + files: { + observedReads: ["/workspace/src/index.ts"], + observedSkillReads: ["/workspace/.claude/skills/my-skill.md"], + }, + detectedSkills: [], + events: [], + finalOutput: "Done.", + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:20.000Z", + durationMs: 20_000, + rawArtifacts: {}, + }, + }, + { + runner, + passed: false, + status: "failed", + attempt: 2, + durationMs: 18_200, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + error: { + name: "AssertionError", + message: "expected skill to be loaded", + stack: + "AssertionError: expected skill to be loaded\n at /workspace/suite.ts:10:5", + }, + report: { + runner, + sessionId: "sess-abc123", + prompt: "Do the thing", + usage: { + inputTokens: 1000, + outputTokens: 200, + reasoningTokens: 50, + cacheTokens: 400, + totalTokens: 1200, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + files: { + observedReads: ["/workspace/src/index.ts"], + observedSkillReads: ["/workspace/.claude/skills/my-skill.md"], + }, + detectedSkills: [ + { skill: "my-skill", confidence: "explicit", evidence: ["loaded skill"] }, + ], + events: [ + { + type: "toolCall", + tool: "Read", + args: { file_path: "/workspace/src/index.ts" }, + at: "2026-04-02T12:00:01.000Z", + }, + { + type: "message", + role: "assistant", + text: "I'll read the file.", + at: "2026-04-02T12:00:02.000Z", + }, + ], + finalOutput: "Done.", + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:18.000Z", + durationMs: 18_200, + rawArtifacts: { + stdoutPath: ".skillgym-results/run-1/case-a/open-main/stdout.log", + sessionPath: ".skillgym-results/run-1/case-a/open-main/session.json", + }, + }, + }, + ], failureType: "assertion", failureOrigin: "assertion", failureClass: { @@ -156,6 +269,9 @@ test("json-summary reporter omits session internals and prints summary on suite const runnerResult = caseResult.runnerResults[0]; expect(runnerResult.runner.id).toBe("open-main"); expect(runnerResult.passed).toBe(false); + expect(runnerResult.status).toBe("failed"); + expect(runnerResult.attempt).toBe(2); + expect(runnerResult.retryCount).toBe(1); expect(runnerResult.durationMs).toBe(18_200); expect(runnerResult.artifactDir).toBe(".skillgym-results/run-1/case-a/open-main"); expect(runnerResult.failureType).toBe("assertion"); @@ -173,6 +289,58 @@ test("json-summary reporter omits session internals and prints summary on suite // usage preserved expect(runnerResult.usage.inputTokens).toBe(1000); expect(runnerResult.usage.totalTokens).toBe(1200); + expect(runnerResult.attempts).toEqual([ + { + passed: false, + status: "failed", + attempt: 1, + durationMs: 20_000, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + usage: { + inputTokens: 900, + outputTokens: 180, + reasoningTokens: 40, + cacheTokens: 350, + totalTokens: 1080, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + error: { name: "AssertionError", message: "expected skill to be loaded" }, + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + }, + { + passed: false, + status: "failed", + attempt: 2, + durationMs: 18_200, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + usage: { + inputTokens: 1000, + outputTokens: 200, + reasoningTokens: 50, + cacheTokens: 400, + totalTokens: 1200, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + error: { name: "AssertionError", message: "expected skill to be loaded" }, + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + }, + ]); // session internals omitted expect(runnerResult.report).toBeUndefined(); diff --git a/test/reporters/standard.test.ts b/test/reporters/standard.test.ts index 013ac3e..e0ece3c 100644 --- a/test/reporters/standard.test.ts +++ b/test/reporters/standard.test.ts @@ -1,3 +1,4 @@ +import path from "node:path"; import { afterEach, expect, test, vi } from "vitest"; import type { CaseResult, @@ -68,6 +69,7 @@ test("standard reporter prints runner-grouped results and failure artifacts", as passed: false, artifactDir: ".skillgym-results/run-1/case-a/code-main", totalTokens: 12_000, + attempts: 2, }), ], }), @@ -168,6 +170,7 @@ test("standard reporter prints runner-grouped results and failure artifacts", as expect(output).toContain("✗ case-a > code-main (codex, gpt-5.4)"); expect(output).toContain("AssertionError: expected skill to be loaded before command execution"); expect(output).toContain("at /workspace/examples/basic-suite.ts:14:15"); + expect(output).toContain("Attempts: 2"); expect(output).not.toContain("skillgym could not complete the run"); expect(output).not.toContain("Run did not complete because the runner crashed"); expect(output).toContain("Artifacts: .skillgym-results/run-1/case-a/code-main"); @@ -420,6 +423,152 @@ test("standard reporter labels expected failures and unexpected passes", async ( expect(output).not.toContain("known-gap > open-main"); }); +test("standard reporter shows recovered retries inline without failure blocks", async () => { + const writes: string[] = []; + const reporter = createStandardReporter({ + stdout: { + isTTY: true, + columns: 120, + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + isInteractive: false, + isUnicode: true, + }); + const runner = createRunnerInfo("cursor-main", { type: "cursor-agent", model: "auto" }); + const context = { + isInteractive: false, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: "examples/flaky-retry-suite.ts", + outputDir: ".skillgym-results/run-1", + selectedCaseCount: 1, + selectedRunnerCount: 1, + selectedExecutionCount: 1, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + const suiteResult: SuiteRunResult = { + suitePath: context.suitePath, + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:12.000Z", + durationMs: 12_000, + outputDir: context.outputDir, + declaredTags: [], + selectedTags: [], + cases: [ + createCaseResult({ + caseId: "retry-once", + runnerResults: [ + createRunnerResult({ + runner, + passed: true, + artifactDir: ".skillgym-results/run-1/retry-once/cursor-main/attempt-2", + totalTokens: 12_000, + attempts: 2, + }), + ], + }), + ], + runners: [ + createRunnerSummary({ + runner, + passedCases: 1, + totalCases: 1, + averageDurationMs: 24_800, + averageTotalTokens: 12_000, + }), + ], + }; + + await reporter.onSuiteStart?.({ + context, + cases: [], + runners: [runner], + startedAt: suiteResult.startedAt, + }); + await reporter.onRunnerFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + runner, + result: suiteResult.cases[0]!.runnerResults[0]!, + caseIndex: 1, + totalCases: 1, + }); + await reporter.onCaseFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + result: suiteResult.cases[0]!, + caseIndex: 1, + totalCases: 1, + }); + await reporter.onSuiteFinish?.({ context, result: suiteResult }); + + const output = writes.join(""); + expect(output).toContain("retry-once"); + expect(output).not.toContain("Failure Classes"); + expect(output).not.toContain("Failures"); + expect(output).not.toContain("Artifacts:"); +}); + +test("standard reporter interactive mode shows retry warning on recovered run", async () => { + const writes: string[] = []; + const reporter = createStandardReporter({ + stdout: { + isTTY: true, + columns: 120, + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + isInteractive: true, + isUnicode: true, + }); + + const context = { + isInteractive: true, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: "examples/flaky-retry-suite.ts", + outputDir: ".skillgym-results/run-1", + selectedCaseCount: 1, + selectedRunnerCount: 1, + selectedExecutionCount: 1, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + const runner = createRunnerInfo("cursor-main", { type: "cursor-agent", model: "auto" }); + + await reporter.onSuiteStart?.({ + context, + cases: [{ id: "retry-once", prompt: "", assert() {} }], + runners: [runner], + startedAt: "2026-04-02T12:00:00.000Z", + }); + + await reporter.onRunnerFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + runner, + result: createRunnerResult({ + runner, + passed: true, + artifactDir: "x", + totalTokens: 10_000, + attempts: 2, + }), + caseIndex: 1, + totalCases: 1, + }); + + expect(writes.join("")).toContain("(1 retry)"); +}); + test("standard reporter prints warning line for overlapping shared-workspace schedules", async () => { const parallelWrites: string[] = []; const serialWrites: string[] = []; @@ -1033,14 +1182,71 @@ function createRunnerResult(options: { status?: RunnerResult["status"]; artifactDir: string; totalTokens: number; + attempts?: number; failureClass?: RunnerResult["failureClass"]; }): RunnerResult { + const attempts = options.attempts ?? 1; return { runner: options.runner, passed: options.passed, status: options.status ?? (options.passed ? "passed" : "failed"), + attempt: attempts, durationMs: 24_800, artifactDir: options.artifactDir, + attempts: Array.from({ length: attempts }, (_, index) => ({ + runner: options.runner, + passed: options.passed, + status: options.status ?? (options.passed ? "passed" : "failed"), + attempt: index + 1, + durationMs: 24_800, + artifactDir: + index === 0 + ? options.artifactDir + : path.join(options.artifactDir, `attempt-${String(index + 1)}`), + error: + options.passed || options.status === "unexpected-passed" + ? undefined + : { + name: "AssertionError", + message: "expected skill to be loaded before command execution", + stack: [ + "AssertionError: expected skill to be loaded before command execution", + " at assert (/workspace/src/assertions/output.ts:88:10)", + " at Object.assert (/workspace/examples/basic-suite.ts:14:15)", + " at executeRunner (/workspace/src/runner/execute-runner.ts:91:7)", + ].join("\n"), + }, + failureType: + options.passed || options.status === "unexpected-passed" ? undefined : "assertion", + failureOrigin: + options.passed || options.status === "unexpected-passed" ? undefined : "assertion", + failureClass: + options.passed || options.status === "unexpected-passed" + ? undefined + : (options.failureClass ?? { id: "assertion", label: "Assertion failure" }), + report: createSessionReport({ + runner: options.runner, + usage: { + cacheTokens: 7_233, + totalTokens: options.totalTokens, + inputTokens: 9_830, + outputTokens: 1_104, + reasoningTokens: 0, + inputChars: 10, + outputChars: 5, + reasoningChars: 0, + source: { + input: "provider", + output: "provider", + reasoning: "provider", + }, + }, + files: { + observedReads: ["a", "b", "c"], + observedSkillReads: [], + }, + }), + })), error: options.passed || options.status === "unexpected-passed" ? undefined diff --git a/test/runner/execute-suite.reporter.test.ts b/test/runner/execute-suite.reporter.test.ts index fc40e9f..6fe843e 100644 --- a/test/runner/execute-suite.reporter.test.ts +++ b/test/runner/execute-suite.reporter.test.ts @@ -331,6 +331,102 @@ test("executeSuite with parallel schedule respects maxParallel", async () => { expect(maxActive).toBe(2); }); +test("executeSuite retries only failed executions and preserves attempt artifacts", async () => { + const outputDir = await createTempDir(); + const attemptsByRun = new Map(); + const runnerPathKey = createRunnerInfo("open", { + type: "opencode", + model: "openai/gpt-5", + }).pathKey; + + const result = await executeSuite("./suite.ts", [{ id: "flaky", prompt: "a", assert() {} }], { + cwd: outputDir, + outputDir, + retryFailed: 2, + isInteractive: false, + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + executeRunnerFn: async (testCase, runner, _adapter, options) => { + const key = `${testCase.id}:${runner.id}`; + const attempt = (attemptsByRun.get(key) ?? 0) + 1; + attemptsByRun.set(key, attempt); + + return createRunnerResult({ + caseId: testCase.id, + runner, + passed: attempt >= 2, + durationMs: attempt * 10, + artifactDir: options.artifactDir, + totalTokens: 100 * attempt, + outputTokens: 20, + observedReads: 1, + }); + }, + }); + + const runnerResult = result.cases[0]!.runnerResults[0]!; + expect(attemptsByRun.get("flaky:open")).toBe(2); + expect(runnerResult).toMatchObject({ + passed: true, + attempt: 2, + artifactDir: path.join(result.outputDir, "flaky", runnerPathKey, "attempt-2"), + }); + expect(runnerResult.attempts).toHaveLength(2); + expect(runnerResult.attempts?.map((attempt) => attempt.artifactDir)).toEqual([ + path.join(result.outputDir, "flaky", runnerPathKey), + path.join(result.outputDir, "flaky", runnerPathKey, "attempt-2"), + ]); + + const saved = JSON.parse( + await readFile(path.join(result.outputDir, "results.json"), "utf8"), + ) as SuiteRunResult; + expect(saved.cases[0]?.runnerResults[0]?.attempts).toHaveLength(2); +}); + +test("executeSuite does not retry expected failures", async () => { + const outputDir = await createTempDir(); + let attempts = 0; + + const result = await executeSuite( + "./suite.ts", + [{ id: "known-gap", prompt: "a", expectedFail: true, assert() {} }], + { + cwd: outputDir, + outputDir, + retryFailed: 2, + isInteractive: false, + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + executeRunnerFn: async (testCase, runner, _adapter, options) => { + attempts += 1; + return createRunnerResult({ + caseId: testCase.id, + runner, + passed: false, + durationMs: 10, + artifactDir: options.artifactDir, + totalTokens: 100, + outputTokens: 20, + observedReads: 1, + }); + }, + }, + ); + + expect(attempts).toBe(1); + expect(result.cases[0]?.runnerResults[0]).toMatchObject({ + passed: true, + status: "expected-failed", + attempt: 1, + }); +}); + test("executeSuite with isolated-by-runner runs serially within a runner and concurrently across runners", async () => { const outputDir = await createTempDir(); const started: string[] = [];