From 075ac672babb4bc80afff2515029a03354fe2f67 Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Tue, 5 May 2026 08:56:19 +0200 Subject: [PATCH 1/2] Add retry support for failed executions --- README.md | 4 + src/cli.ts | 2 + src/cli/help.ts | 2 + src/cli/run.ts | 2 + src/config.ts | 14 ++ src/domain/result.ts | 11 +- src/index.ts | 1 + src/reporters/contract.ts | 4 + src/reporters/github-actions.ts | 21 ++- src/reporters/json-summary.ts | 53 +++++++ src/reporters/standard.ts | 20 ++- src/runner/execute-suite.ts | 167 ++++++++++++++------- test/cli.test.ts | 1 + test/config.test.ts | 39 +++++ test/reporters/github-actions.test.ts | 76 +++++++++- test/reporters/json-summary.test.ts | 167 +++++++++++++++++++++ test/reporters/standard.test.ts | 61 ++++++++ test/runner/execute-suite.reporter.test.ts | 96 ++++++++++++ 18 files changed, 674 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 64f5a3b..c61ff9c 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ const config: SkillGymConfig = { reporter: "standard", schedule: "serial", maxParallel: 4, + retryFailed: 1, maxSteps: 4, }, defaults: { @@ -144,6 +145,7 @@ Most important config properties: - `run.reporter`: built-in `standard` reporter or a custom reporter module path - `run.schedule`: execution scheduling mode for case x runner pairs - `run.maxParallel`: maximum concurrent executions for non-serial schedules, defaulting to available CPU parallelism +- `run.retryFailed`: rerun only failed case x runner executions up to this many additional attempts - `run.maxSteps`: best-effort limit on streamed agent steps before skillgym terminates the run - `run.workspace`: default workspace mode for the suite - `defaults.timeoutMs`: default per-case timeout @@ -165,6 +167,8 @@ For concurrent schedules, `run.maxParallel` defaults to `os.availableParallelism Concurrent schedules do not copy or isolate the workspace by themselves. Overlapping runs may still interact through the same filesystem state and live runner output unless you use isolated workspaces. OpenCode, Codex, and Claude Code runtime state are isolated per run under each artifact directory. +`run.retryFailed` is useful when broad benchmark runs include occasional flaky agent failures. SkillGym only retries executions that still count as failed after result classification, keeps each attempt's artifacts, and reports whether a final pass came from a retry. + `run.maxSteps` is enforced on a best-effort basis by monitoring each runner's streamed JSONL output. A step is one observed model round, not one token and not necessarily one tool call, but the exact boundary is still runner-defined, so the same prompt may consume different numbers of steps across agents. When the observed step count exceeds the configured limit, skillgym kills the agent process, fails the run with origin `max-steps`, and preserves raw stdout/stderr artifacts for debugging. No partial normalized report is produced for that failure. ## Workspaces diff --git a/src/cli.ts b/src/cli.ts index c60efa8..3f56174 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -22,6 +22,7 @@ async function main(): Promise { const scheduleOption = parsed.options.schedule; const configOption = parsed.options.config; const maxParallelOption = parsed.options["max-parallel"]; + const retryFailedOption = parsed.options["retry-failed"]; const updateSnapshotsOption = parsed.options["update-snapshots"]; const snapshotsOption = parsed.options.snapshots; const tagOption = parsed.options.tag; @@ -35,6 +36,7 @@ async function main(): Promise { reporter: getStringOption(reporterOption), schedule: getStringOption(scheduleOption), maxParallel: getStringOption(maxParallelOption), + retryFailed: getStringOption(retryFailedOption), tags: parseTagOption(tagOption), reporterCwd: process.cwd(), configPath: getStringOption(configOption), diff --git a/src/cli/help.ts b/src/cli/help.ts index 8ab9111..5383076 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -17,6 +17,7 @@ ${theme.bold("Run Options:")} --output-dir ${theme.accent("")} Override where run artifacts are written --schedule ${theme.accent("")} Choose ${theme.light("serial")}, ${theme.light("parallel")}, or ${theme.light("isolated-by-runner")} --max-parallel ${theme.accent("")} Cap concurrent executions for non-serial schedules + --retry-failed ${theme.accent("")} Retry only failed case x runner executions up to ${theme.light("n")} extra times --case ${theme.accent("")} Filter the configured suite to one case id --tag ${theme.accent("")} Filter cases by tag; repeat or comma-separate for OR matching --runner ${theme.accent("")} Filter the configured runner set by runner id @@ -32,6 +33,7 @@ ${theme.bold("Examples:")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --reporter standard")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule isolated-by-runner")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule parallel --max-parallel 4")} + ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --retry-failed 2")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --update-snapshots")} `); } diff --git a/src/cli/run.ts b/src/cli/run.ts index 3b34a22..18006e2 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -19,6 +19,7 @@ export async function runCommand(options: { outputDir?: string; schedule?: string; maxParallel?: string; + retryFailed?: string; caseId?: string; runner?: string; reporter?: string; @@ -76,6 +77,7 @@ export async function runCommand(options: { outputDir: runOptions.outputDir, schedule: runOptions.schedule, maxParallel: runOptions.maxParallel, + retryFailed: runOptions.retryFailed, caseId: options.caseId, runner: options.runner, tags: runOptions.tags, diff --git a/src/config.ts b/src/config.ts index e47caa3..5bae9cb 100644 --- a/src/config.ts +++ b/src/config.ts @@ -24,6 +24,7 @@ const RUN_KEYS = [ "workspace", "maxSteps", "maxParallel", + "retryFailed", "tags", ] as const; const DEFAULT_KEYS = ["timeoutMs"] as const; @@ -64,6 +65,7 @@ export interface SkillGymConfig { workspace?: SuiteWorkspaceConfig; maxSteps?: number; maxParallel?: number; + retryFailed?: number; tags?: string[]; }; defaults?: { @@ -121,6 +123,7 @@ export function resolveRunOptions( outputDir?: string; schedule?: string; maxParallel?: string; + retryFailed?: string; tags?: string[]; }, config: SkillGymConfig, @@ -129,12 +132,17 @@ export function resolveRunOptions( outputDir?: string; schedule: ScheduleMode; maxParallel?: number; + retryFailed: number; tags: string[]; } { const maxParallel = cliOptions.maxParallel !== undefined ? parseIntegerString(cliOptions.maxParallel, "CLI option --max-parallel", 1) : config.run?.maxParallel; + const retryFailed = + cliOptions.retryFailed !== undefined + ? parseIntegerString(cliOptions.retryFailed, "CLI option --retry-failed", 0) + : (config.run?.retryFailed ?? 0); return { cwd: @@ -150,6 +158,7 @@ export function resolveRunOptions( ? parseScheduleMode(cliOptions.schedule, "CLI option --schedule") : (config.run?.schedule ?? "serial"), ...(maxParallel === undefined ? {} : { maxParallel }), + retryFailed, tags: cliOptions.tags ?? config.run?.tags ?? [], }; } @@ -256,6 +265,9 @@ function resolveConfigPaths(config: SkillGymConfig, configDir: string): SkillGym ...(config.run.maxParallel === undefined ? {} : { maxParallel: config.run.maxParallel }), + ...(config.run.retryFailed === undefined + ? {} + : { retryFailed: config.run.retryFailed }), ...(config.run.tags === undefined ? {} : { tags: config.run.tags }), }, defaults: @@ -346,6 +358,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run const maxSteps = parseOptionalInteger(record.maxSteps, `${configPath}.maxSteps`, 1); const maxParallel = parseOptionalInteger(record.maxParallel, `${configPath}.maxParallel`, 1); + const retryFailed = parseOptionalInteger(record.retryFailed, `${configPath}.retryFailed`, 0); return { cwd: parseOptionalNonEmptyString(record.cwd, `${configPath}.cwd`), @@ -355,6 +368,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run workspace: parseOptionalWorkspaceConfig(record.workspace, `${configPath}.workspace`), ...(maxSteps === undefined ? {} : { maxSteps }), ...(maxParallel === undefined ? {} : { maxParallel }), + ...(retryFailed === undefined ? {} : { retryFailed }), tags: parseOptionalStringArray(record.tags, `${configPath}.tags`), }; } diff --git a/src/domain/result.ts b/src/domain/result.ts index 3d13a53..0699428 100644 --- a/src/domain/result.ts +++ b/src/domain/result.ts @@ -1,7 +1,7 @@ import type { RunnerInfo } from "./runner.js"; import type { SessionReport } from "./session-report.js"; -export interface RunnerResult { +interface BaseRunnerResult { runner: RunnerInfo; passed: boolean; status: RunnerResultStatus; @@ -15,6 +15,15 @@ export interface RunnerResult { failureLogPath?: string; } +export interface RunnerAttemptResult extends BaseRunnerResult { + attempt: number; +} + +export interface RunnerResult extends BaseRunnerResult { + attempt?: number; + attempts?: RunnerAttemptResult[]; +} + export interface FailureClass { id: string; label?: string; diff --git a/src/index.ts b/src/index.ts index 900007e..28af200 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ export type { } from "./domain/test-case.js"; export type { FailureClass, + RunnerAttemptResult, RunnerFailureOrigin, RunnerFailureType, RunnerResult, diff --git a/src/reporters/contract.ts b/src/reporters/contract.ts index 8bfec39..35ef646 100644 --- a/src/reporters/contract.ts +++ b/src/reporters/contract.ts @@ -38,6 +38,8 @@ export interface RunnerStartEvent { context: ReporterContext; testCase: TestCase; runner: RunnerInfo; + attempt?: number; + maxAttempts?: number; caseIndex: number; totalCases: number; } @@ -47,6 +49,8 @@ export interface RunnerFinishEvent { testCase: TestCase; runner: RunnerInfo; result: RunnerResult; + attempt?: number; + maxAttempts?: number; caseIndex: number; totalCases: number; } diff --git a/src/reporters/github-actions.ts b/src/reporters/github-actions.ts index 2ada8d4..e5eca4e 100644 --- a/src/reporters/github-actions.ts +++ b/src/reporters/github-actions.ts @@ -50,6 +50,10 @@ function formatAnnotationCommand(caseId: string, result: RunnerResult): string { function formatAnnotationMessage(result: RunnerResult): string { const lines = [`failure type: ${result.failureType ?? "unknown"}`]; + if (result.attempts !== undefined && result.attempts.length > 1) { + lines.push(`attempts: ${String(result.attempts.length)}`); + } + if (result.failureOrigin !== undefined) { lines.push(`failure origin: ${result.failureOrigin}`); } @@ -147,7 +151,8 @@ function formatRunnerAgentLabel(runner: RunnerSummary["runner"]): string { function formatRunnerCaseRow(caseId: string, result: RunnerResult): string { const status = result.passed ? "✅" : "❌"; const usage = result.report.usage; - return `| ${status} \`${caseId}\` | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`; + const retryLabel = formatRetryLabel(result); + return `| ${status} \`${caseId}\`${retryLabel === undefined ? "" : ` ${retryLabel}`} | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`; } function getRunnerCases( @@ -167,6 +172,10 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string `artifacts: \`${result.artifactDir}\``, ]; + if (result.attempts !== undefined && result.attempts.length > 1) { + segments.splice(2, 0, `attempts: ${String(result.attempts.length)}`); + } + if (result.failureClass !== undefined) { segments.splice(2, 0, `class: \`${result.failureClass.id}\``); } @@ -182,6 +191,16 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string return segments.join("; "); } +function formatRetryLabel(result: RunnerResult): string | undefined { + if (result.attempts === undefined || result.attempts.length <= 1) { + return undefined; + } + + return result.passed + ? `(passed on retry ${String(result.attempt ?? result.attempts.length)}/${String(result.attempts.length)})` + : `(failed after ${String(result.attempts.length)} attempts)`; +} + function listFailures(result: SuiteRunResult): Array<{ caseId: string; result: RunnerResult }> { const failures: Array<{ caseId: string; result: RunnerResult }> = []; diff --git a/src/reporters/json-summary.ts b/src/reporters/json-summary.ts index aa6f86a..42cd3aa 100644 --- a/src/reporters/json-summary.ts +++ b/src/reporters/json-summary.ts @@ -14,6 +14,22 @@ interface SummaryError { interface SummaryRunnerResult { runner: RunnerResult["runner"]; passed: boolean; + status: RunnerResult["status"]; + attempt?: number; + durationMs: number; + artifactDir: string; + usage: RunnerResult["report"]["usage"]; + attempts?: SummaryAttemptResult[]; + error?: SummaryError; + failureType?: RunnerResult["failureType"]; + failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: FailureClass; +} + +interface SummaryAttemptResult { + passed: boolean; + status: RunnerResult["status"]; + attempt: number; durationMs: number; artifactDir: string; usage: RunnerResult["report"]["usage"]; @@ -46,6 +62,43 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult { const summary: SummaryRunnerResult = { runner: result.runner, passed: result.passed, + status: result.status, + attempt: result.attempt, + durationMs: result.durationMs, + artifactDir: result.artifactDir, + usage: result.report.usage, + }; + + if (result.attempts !== undefined) { + summary.attempts = result.attempts.map(summarizeAttemptResult); + } + + if (result.error !== undefined) { + summary.error = { name: result.error.name, message: result.error.message }; + } + + if (result.failureType !== undefined) { + summary.failureType = result.failureType; + } + + if (result.failureOrigin !== undefined) { + summary.failureOrigin = result.failureOrigin; + } + + if (result.failureClass !== undefined) { + summary.failureClass = result.failureClass; + } + + return summary; +} + +function summarizeAttemptResult( + result: NonNullable[number], +): SummaryAttemptResult { + const summary: SummaryAttemptResult = { + passed: result.passed, + status: result.status, + attempt: result.attempt, durationMs: result.durationMs, artifactDir: result.artifactDir, usage: result.report.usage, diff --git a/src/reporters/standard.ts b/src/reporters/standard.ts index cf2a7a1..287331d 100644 --- a/src/reporters/standard.ts +++ b/src/reporters/standard.ts @@ -29,6 +29,7 @@ interface FailureEntry { caseId: string; runner: RunnerInfo; artifactDir: string; + attempts?: RunnerResult["attempts"]; error?: SerializedError; failureType?: RunnerFailureType; failureOrigin?: RunnerFailureOrigin; @@ -171,6 +172,7 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B caseId: event.testCase.id, runner: event.result.runner, artifactDir: event.result.artifactDir, + attempts: event.result.attempts, error: event.result.error, failureType: event.result.failureType, failureOrigin: event.result.failureOrigin, @@ -301,7 +303,9 @@ function formatRunnerCaseRow( accent: (value: string) => string, ): string { const color = result.passed ? pc.green : pc.red; - const statusLabel = formatStatusLabel(result.status); + const statusLabel = [formatStatusLabel(result.status), formatRetryLabel(result)] + .filter((label): label is string => label !== undefined) + .join(", "); return [ color(padCell(`${result.passed ? symbols.pass : symbols.fail} ${caseId}`, RUNNER_CASE_WIDTH)), padCell(formatDuration(result.durationMs), RUNNER_TIME_WIDTH), @@ -381,6 +385,10 @@ function formatFailureBlock( lines.push(colors.dim(`Log: ${failure.failureLogPath}`)); } + if (failure.attempts !== undefined && failure.attempts.length > 1) { + lines.push(colors.dim(`Attempts: ${String(failure.attempts.length)}`)); + } + lines.push(colors.dim(`Artifacts: ${failure.artifactDir}`)); return lines.join("\n"); } @@ -431,6 +439,16 @@ function formatFailureClassLabel(failureClass: FailureClass): string { return `${failureClass.label} [${failureClass.id}]`; } +function formatRetryLabel(result: RunnerResult): string | undefined { + if (result.attempts === undefined || result.attempts.length <= 1) { + return undefined; + } + + return result.passed + ? `passed on retry ${String(result.attempt ?? result.attempts.length)}/${String(result.attempts.length)}` + : `failed after ${String(result.attempts.length)} attempts`; +} + function formatErrorLocation(error: SerializedError): string | undefined { const location = extractUserStackFrame(error); return location === undefined ? undefined : formatStackFrameLocation(location); diff --git a/src/runner/execute-suite.ts b/src/runner/execute-suite.ts index b60a7af..77f57d3 100644 --- a/src/runner/execute-suite.ts +++ b/src/runner/execute-suite.ts @@ -2,7 +2,13 @@ import path from "node:path"; import process from "node:process"; import os from "node:os"; import { getCaseExecutionOptions } from "../config.js"; -import type { CaseResult, RunnerResult, RunnerSummary, SuiteRunResult } from "../domain/result.js"; +import type { + CaseResult, + RunnerAttemptResult, + RunnerResult, + RunnerSummary, + SuiteRunResult, +} from "../domain/result.js"; import type { ResolvedRunner, RunnerConfig, RunnerInfo } from "../domain/runner.js"; import type { ScheduleMode } from "../domain/schedule.js"; import type { SuiteWorkspaceConfig, TestCase } from "../domain/test-case.js"; @@ -49,6 +55,7 @@ export async function executeSuite( outputDir?: string; schedule?: ScheduleMode; maxParallel?: number; + retryFailed?: number; caseId?: string; runner?: string; tags?: string[]; @@ -59,6 +66,7 @@ export async function executeSuite( run?: { workspace?: SuiteWorkspaceConfig; maxSteps?: number; + retryFailed?: number; tags?: string[]; }; runners: Record; @@ -76,6 +84,7 @@ export async function executeSuite( const outputDir = path.resolve(options.outputDir ?? ".skillgym-results", timestampDirName()); const scheduleMode = options.schedule ?? "serial"; const maxParallel = resolveMaxParallel(scheduleMode, options.maxParallel); + const retryFailed = options.retryFailed ?? options.config.run?.retryFailed ?? 0; await ensureDir(outputDir); const selectedRunners = selectRunners(options.config.runners, options.runner); const normalizedCases = normalizeTestCases(testCases); @@ -196,6 +205,7 @@ export async function executeSuite( snapshots: options.snapshots, snapshotStore, maxSteps: options.config.run?.maxSteps, + retryFailed, reporter: options.reporter, rejectedRunners, }); @@ -214,6 +224,7 @@ export async function executeSuite( snapshots: options.snapshots, snapshotStore, maxSteps: options.config.run?.maxSteps, + retryFailed, reporter: options.reporter, rejectedRunners, }); @@ -317,6 +328,7 @@ async function executePlannedExecution( snapshots?: SnapshotRuntimeOptions; snapshotStore?: SnapshotStore; maxSteps?: number; + retryFailed: number; reporter?: BenchmarkReporter; rejectedRunners: Map; }, @@ -338,14 +350,6 @@ async function executePlannedExecution( }); } - await options.reporter?.onRunnerStart?.({ - context: options.context, - testCase: item.testCase, - runner: item.runner.info, - caseIndex: item.caseIndex + 1, - totalCases: options.selectedCases.length, - }); - const artifactDir = path.join( options.outputDir, sanitizePathSegment(item.testCase.id), @@ -353,47 +357,87 @@ async function executePlannedExecution( ); await ensureDir(artifactDir); - const rejectedResult = options.rejectedRunners.get(item.runner.id); - const rawResult = - rejectedResult === undefined - ? await runExecution(item, { - resolvedWorkspace: options.resolvedWorkspace, - executeRunnerFn: options.executeRunnerFn, - outputDir: options.outputDir, - maxSteps: options.maxSteps, - snapshots: options.snapshots, - snapshotStore: options.snapshotStore, - }) - : await createRejectedModelResult(item, artifactDir); - - if (rejectedResult === undefined && (await isModelRejectedResult(rawResult))) { - rawResult.failureType = "runner-crash"; - rawResult.failureOrigin = "model-rejected"; - if (rawResult.error?.name === "AssertionError" || rawResult.error === undefined) { - rawResult.error = { - name: "Error", - message: `Runner rejected configured model "${item.runner.info.agent.model ?? "unknown"}" during initial execution.`, - }; + const maxAttempts = options.retryFailed + 1; + const attempts: RunnerAttemptResult[] = []; + let result: RunnerResult | undefined; + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + await options.reporter?.onRunnerStart?.({ + context: options.context, + testCase: item.testCase, + runner: item.runner.info, + attempt, + maxAttempts, + caseIndex: item.caseIndex + 1, + totalCases: options.selectedCases.length, + }); + + const attemptArtifactDir = resolveAttemptArtifactDir(artifactDir, attempt); + await ensureDir(attemptArtifactDir); + + const rejectedResult = options.rejectedRunners.get(item.runner.id); + const rawResult = + rejectedResult === undefined + ? await runExecution(item, { + artifactDir: attemptArtifactDir, + resolvedWorkspace: options.resolvedWorkspace, + executeRunnerFn: options.executeRunnerFn, + outputDir: options.outputDir, + maxSteps: options.maxSteps, + snapshots: options.snapshots, + snapshotStore: options.snapshotStore, + }) + : await createRejectedModelResult(item, attemptArtifactDir); + + if (rejectedResult === undefined && (await isModelRejectedResult(rawResult))) { + rawResult.failureType = "runner-crash"; + rawResult.failureOrigin = "model-rejected"; + if (rawResult.error?.name === "AssertionError" || rawResult.error === undefined) { + rawResult.error = { + name: "Error", + message: `Runner rejected configured model "${item.runner.info.agent.model ?? "unknown"}" during initial execution.`, + }; + } + rawResult.failureLogPath ??= path.join(attemptArtifactDir, "stderr.log"); + options.rejectedRunners.set(item.runner.id, rawResult); + await writeJson(path.join(attemptArtifactDir, "error.json"), rawResult.error); + await writeJson(path.join(attemptArtifactDir, "report.json"), rawResult.report); + } + + const classifiedAttempt = createAttemptResult( + classifyExpectedFailure(item.testCase, rawResult), + attempt, + ); + attempts.push(classifiedAttempt); + result = { + ...classifiedAttempt, + attempts: [...attempts], + }; + + await options.reporter?.onRunnerFinish?.({ + context: options.context, + testCase: item.testCase, + runner: item.runner.info, + result, + attempt, + maxAttempts, + caseIndex: item.caseIndex + 1, + totalCases: options.selectedCases.length, + }); + + if (!shouldRetry(classifiedAttempt, options.retryFailed, attempt)) { + break; } - rawResult.failureLogPath ??= path.join(artifactDir, "stderr.log"); - options.rejectedRunners.set(item.runner.id, rawResult); - await writeJson(path.join(artifactDir, "error.json"), rawResult.error); - await writeJson(path.join(artifactDir, "report.json"), rawResult.report); } - const result = classifyExpectedFailure(item.testCase, rawResult); + if (result === undefined) { + throw new Error( + `Execution finished without a result for ${item.testCase.id} > ${item.runner.id}`, + ); + } options.caseResults[item.caseIndex]!.runnerResults[item.runnerIndex] = result; - await options.reporter?.onRunnerFinish?.({ - context: options.context, - testCase: item.testCase, - runner: item.runner.info, - result, - caseIndex: item.caseIndex + 1, - totalCases: options.selectedCases.length, - }); - state.completedRuns += 1; if (state.completedRuns === options.selectedRunners.length) { @@ -412,6 +456,7 @@ async function executePlannedExecution( async function runExecution( item: PlannedSuiteExecution, options: { + artifactDir: string; resolvedWorkspace: ReturnType; executeRunnerFn: typeof executeRunner; outputDir: string; @@ -420,18 +465,13 @@ async function runExecution( snapshotStore?: SnapshotStore; }, ): Promise { - const artifactDir = path.join( - options.outputDir, - sanitizePathSegment(item.testCase.id), - item.runner.info.pathKey, - ); const executionStartedMs = Date.now(); let result: RunnerResult; let preparedWorkspace; try { preparedWorkspace = await prepareWorkspace(options.resolvedWorkspace, { - artifactDir, + artifactDir: options.artifactDir, outputDir: options.outputDir, testCase: item.testCase, runner: item.runner.info, @@ -444,7 +484,7 @@ async function runExecution( getAdapter(item.runner.config.agent), { cwd: preparedWorkspace.cwd, - artifactDir, + artifactDir: options.artifactDir, timeoutMs: item.timeoutMs, maxSteps: options.maxSteps, snapshots: @@ -458,19 +498,19 @@ async function runExecution( result = createExecutionFailureResult(error, { testCase: item.testCase, runner: item.runner.info, - artifactDir, + artifactDir: options.artifactDir, durationMs: Date.now() - executionStartedMs, failureOrigin: isWorkspaceFailure ? classifyWorkspaceFailureOrigin(error) : undefined, failureLogPath: isWorkspaceFailure - ? resolveWorkspaceFailureLogPath(artifactDir, error) + ? resolveWorkspaceFailureLogPath(options.artifactDir, error) : undefined, }); - await writeJson(path.join(artifactDir, "error.json"), result.error); - await writeJson(path.join(artifactDir, "report.json"), result.report); + await writeJson(path.join(options.artifactDir, "error.json"), result.error); + await writeJson(path.join(options.artifactDir, "report.json"), result.report); } finally { if (preparedWorkspace !== undefined) { await finalizeWorkspace(preparedWorkspace, { - artifactDir, + artifactDir: options.artifactDir, passed: result!.passed, }); } @@ -479,6 +519,21 @@ async function runExecution( return result; } +function createAttemptResult(result: RunnerResult, attempt: number): RunnerAttemptResult { + return { + ...result, + attempt, + }; +} + +function shouldRetry(result: RunnerAttemptResult, retryFailed: number, attempt: number): boolean { + return !result.passed && attempt <= retryFailed && result.failureOrigin !== "model-rejected"; +} + +function resolveAttemptArtifactDir(artifactDir: string, attempt: number): string { + return attempt === 1 ? artifactDir : path.join(artifactDir, `attempt-${String(attempt)}`); +} + async function createRejectedModelResult( item: PlannedSuiteExecution, artifactDir: string, diff --git a/test/cli.test.ts b/test/cli.test.ts index 60b9a8b..71a5ab2 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -42,6 +42,7 @@ test("cli help prints full MOTD banner and help sections", async () => { expect(result.stdout).toContain("Run Options:"); expect(result.stdout).toContain("--schedule "); expect(result.stdout).toContain("--max-parallel "); + expect(result.stdout).toContain("--retry-failed "); expect(result.stdout).toContain("Examples:"); }); diff --git a/test/config.test.ts b/test/config.test.ts index 66f28a2..1bfb663 100644 --- a/test/config.test.ts +++ b/test/config.test.ts @@ -247,6 +247,15 @@ describe("config", () => { expect(parsed.run?.maxParallel).toBe(3); }); + test("parses run retryFailed", () => { + const parsed = parseConfig({ + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }); + + expect(parsed.run?.retryFailed).toBe(2); + }); + test("accepts cursor-agent runner configs", () => { const parsed = parseConfig({ runners: { @@ -312,6 +321,7 @@ describe("config", () => { outputDir: path.join(tempDir, "config-results"), schedule: "parallel", maxParallel: 2, + retryFailed: 0, tags: [], }); }); @@ -335,6 +345,35 @@ describe("config", () => { ).toThrow("Invalid config at CLI option --max-parallel: expected integer >= 1"); }); + test("run options support retryFailed in config and CLI", () => { + expect( + resolveRunOptions( + {}, + { + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }, + ), + ).toMatchObject({ retryFailed: 2 }); + + expect( + resolveRunOptions( + { retryFailed: "3" }, + { + run: { retryFailed: 2 }, + runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } }, + }, + ), + ).toMatchObject({ retryFailed: 3 }); + + expect(() => + resolveRunOptions( + { retryFailed: "-1" }, + { runners: { open: { agent: { type: "opencode", model: "openai/gpt-5" } } } }, + ), + ).toThrow("Invalid config at CLI option --retry-failed: expected integer >= 0"); + }); + test("run options support config tags and let CLI tags override config", () => { expect( resolveRunOptions( diff --git a/test/reporters/github-actions.test.ts b/test/reporters/github-actions.test.ts index df390ec..3c00e94 100644 --- a/test/reporters/github-actions.test.ts +++ b/test/reporters/github-actions.test.ts @@ -30,14 +30,19 @@ test("github-actions reporter formats escaped annotations for failed runs", asyn await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case,a", errorMessage: "boom,\n100%" }), + result: createSuiteResult({ + runner, + caseId: "case,a", + errorMessage: "boom,\n100%", + attempts: 2, + }), }); expect(writes.join("")).toContain( "::error title=case%2Ca > code%3Amain,file=/workspace/examples/basic-suite.ts,line=14,col=15::", ); expect(writes.join("")).toContain( - "failure type: assertion%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", + "failure type: assertion%0Aattempts: 2%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", ); expect(writes.join("")).toContain("artifacts: .skillgym-results/run-1/case,a/code-main"); }); @@ -57,7 +62,7 @@ test("github-actions reporter includes file metadata from user stack frames", as await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case-a" }), + result: createSuiteResult({ runner, caseId: "case-a", attempts: 2 }), }); expect(writes.join("")).toContain("file=/workspace/examples/basic-suite.ts,line=14,col=15"); @@ -79,7 +84,7 @@ test("github-actions reporter writes a job summary when GITHUB_STEP_SUMMARY is s await reporter.onSuiteFinish?.({ context: createContext(), - result: createSuiteResult({ runner, caseId: "case-a" }), + result: createSuiteResult({ runner, caseId: "case-a", attempts: 2 }), }); const summary = await readFile(summaryPath, "utf8"); @@ -89,9 +94,11 @@ test("github-actions reporter writes a job summary when GITHUB_STEP_SUMMARY is s expect(summary).toContain("- Runs: 0 passed, 1 failed"); expect(summary).toContain("### Runner: `open-main` (opencode, openai/gpt-5)"); expect(summary).toContain("| Case | Duration | Input | Output | Reasoning | Cache | Billable |"); - expect(summary).toContain("| ❌ `case-a` | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |"); expect(summary).toContain( - "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", + "| ❌ `case-a` (failed after 2 attempts) | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |", + ); + expect(summary).toContain( + "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; attempts: 2; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", ); }); @@ -137,11 +144,13 @@ function createSuiteResult(options: { runner: RunnerInfo; caseId: string; errorMessage?: string; + attempts?: number; }): SuiteRunResult { const runnerResult = createFailedRunnerResult( options.runner, options.caseId, options.errorMessage, + options.attempts, ); return { @@ -161,13 +170,64 @@ function createFailedRunnerResult( runner: RunnerInfo, caseId: string, errorMessage = "expected skill to be loaded before command execution", + attempts = 1, ): RunnerResult { + const artifactDir = `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}`; + return { runner, passed: false, status: "failed", + attempt: attempts, durationMs: 24_800, - artifactDir: `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}`, + artifactDir, + attempts: Array.from({ length: attempts }, (_, index) => ({ + runner, + passed: false, + status: "failed", + attempt: index + 1, + durationMs: 24_800, + artifactDir: + index === 0 ? artifactDir : path.join(artifactDir, `attempt-${String(index + 1)}`), + error: { + name: "AssertionError", + message: errorMessage, + stack: [ + `AssertionError: ${errorMessage}`, + " at assert (/workspace/src/assertions/output.ts:88:10)", + " at Object.assert (/workspace/examples/basic-suite.ts:14:15)", + " at executeRunner (/workspace/src/runner/execute-runner.ts:91:7)", + ].join("\n"), + }, + failureType: "assertion", + failureOrigin: "assertion", + failureLogPath: + index === 0 + ? `${artifactDir}/stderr.log` + : `${path.join(artifactDir, `attempt-${String(index + 1)}`)}/stderr.log`, + report: createSessionReport({ + runner, + usage: { + cacheTokens: 0, + totalTokens: 12_000, + inputTokens: 9_830, + outputTokens: 1_104, + reasoningTokens: 0, + inputChars: 10, + outputChars: 5, + reasoningChars: 0, + source: { + input: "provider", + output: "provider", + reasoning: "provider", + }, + }, + files: { + observedReads: ["a"], + observedSkillReads: [], + }, + }), + })), error: { name: "AssertionError", message: errorMessage, @@ -180,7 +240,7 @@ function createFailedRunnerResult( }, failureType: "assertion", failureOrigin: "assertion", - failureLogPath: `.skillgym-results/run-1/${caseId}/${runner.id.replace(/[:]/g, "-")}/stderr.log`, + failureLogPath: `${artifactDir}/stderr.log`, report: createSessionReport({ runner, usage: { diff --git a/test/reporters/json-summary.test.ts b/test/reporters/json-summary.test.ts index f95d15e..6ec7452 100644 --- a/test/reporters/json-summary.test.ts +++ b/test/reporters/json-summary.test.ts @@ -33,8 +33,121 @@ test("json-summary reporter omits session internals and prints summary on suite runner, passed: false, status: "failed", + attempt: 2, durationMs: 18_200, artifactDir: ".skillgym-results/run-1/case-a/open-main", + attempts: [ + { + runner, + passed: false, + status: "failed", + attempt: 1, + durationMs: 20_000, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + error: { + name: "AssertionError", + message: "expected skill to be loaded", + }, + report: { + runner, + sessionId: "sess-attempt-1", + prompt: "Do the thing", + usage: { + inputTokens: 900, + outputTokens: 180, + reasoningTokens: 40, + cacheTokens: 350, + totalTokens: 1080, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + files: { + observedReads: ["/workspace/src/index.ts"], + observedSkillReads: ["/workspace/.claude/skills/my-skill.md"], + }, + detectedSkills: [], + events: [], + finalOutput: "Done.", + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:20.000Z", + durationMs: 20_000, + rawArtifacts: {}, + }, + }, + { + runner, + passed: false, + status: "failed", + attempt: 2, + durationMs: 18_200, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + error: { + name: "AssertionError", + message: "expected skill to be loaded", + stack: + "AssertionError: expected skill to be loaded\n at /workspace/suite.ts:10:5", + }, + report: { + runner, + sessionId: "sess-abc123", + prompt: "Do the thing", + usage: { + inputTokens: 1000, + outputTokens: 200, + reasoningTokens: 50, + cacheTokens: 400, + totalTokens: 1200, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + files: { + observedReads: ["/workspace/src/index.ts"], + observedSkillReads: ["/workspace/.claude/skills/my-skill.md"], + }, + detectedSkills: [ + { skill: "my-skill", confidence: "explicit", evidence: ["loaded skill"] }, + ], + events: [ + { + type: "toolCall", + tool: "Read", + args: { file_path: "/workspace/src/index.ts" }, + at: "2026-04-02T12:00:01.000Z", + }, + { + type: "message", + role: "assistant", + text: "I'll read the file.", + at: "2026-04-02T12:00:02.000Z", + }, + ], + finalOutput: "Done.", + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:18.000Z", + durationMs: 18_200, + rawArtifacts: { + stdoutPath: ".skillgym-results/run-1/case-a/open-main/stdout.log", + sessionPath: ".skillgym-results/run-1/case-a/open-main/session.json", + }, + }, + }, + ], failureType: "assertion", failureOrigin: "assertion", failureClass: { @@ -156,6 +269,8 @@ test("json-summary reporter omits session internals and prints summary on suite const runnerResult = caseResult.runnerResults[0]; expect(runnerResult.runner.id).toBe("open-main"); expect(runnerResult.passed).toBe(false); + expect(runnerResult.status).toBe("failed"); + expect(runnerResult.attempt).toBe(2); expect(runnerResult.durationMs).toBe(18_200); expect(runnerResult.artifactDir).toBe(".skillgym-results/run-1/case-a/open-main"); expect(runnerResult.failureType).toBe("assertion"); @@ -173,6 +288,58 @@ test("json-summary reporter omits session internals and prints summary on suite // usage preserved expect(runnerResult.usage.inputTokens).toBe(1000); expect(runnerResult.usage.totalTokens).toBe(1200); + expect(runnerResult.attempts).toEqual([ + { + passed: false, + status: "failed", + attempt: 1, + durationMs: 20_000, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + usage: { + inputTokens: 900, + outputTokens: 180, + reasoningTokens: 40, + cacheTokens: 350, + totalTokens: 1080, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + error: { name: "AssertionError", message: "expected skill to be loaded" }, + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + }, + { + passed: false, + status: "failed", + attempt: 2, + durationMs: 18_200, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + usage: { + inputTokens: 1000, + outputTokens: 200, + reasoningTokens: 50, + cacheTokens: 400, + totalTokens: 1200, + inputChars: 4000, + outputChars: 800, + reasoningChars: 200, + source: { input: "provider", output: "provider", reasoning: "derived" }, + }, + error: { name: "AssertionError", message: "expected skill to be loaded" }, + failureType: "assertion", + failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, + }, + ]); // session internals omitted expect(runnerResult.report).toBeUndefined(); diff --git a/test/reporters/standard.test.ts b/test/reporters/standard.test.ts index 013ac3e..3026421 100644 --- a/test/reporters/standard.test.ts +++ b/test/reporters/standard.test.ts @@ -1,3 +1,4 @@ +import path from "node:path"; import { afterEach, expect, test, vi } from "vitest"; import type { CaseResult, @@ -68,6 +69,7 @@ test("standard reporter prints runner-grouped results and failure artifacts", as passed: false, artifactDir: ".skillgym-results/run-1/case-a/code-main", totalTokens: 12_000, + attempts: 2, }), ], }), @@ -153,6 +155,7 @@ test("standard reporter prints runner-grouped results and failure artifacts", as ); expect(output).toContain("✓ case-a"); expect(output).toContain("✗ case-a"); + expect(output).toContain("failed after 2 attempts"); expect(output).toContain("Cases 1 failed | 1 passed (2)"); expect(output).toContain("Runs 1 failed | 3 passed (4)"); expect(output).toContain("Statuses 0 expected failures | 0 unexpected passes"); @@ -168,6 +171,7 @@ test("standard reporter prints runner-grouped results and failure artifacts", as expect(output).toContain("✗ case-a > code-main (codex, gpt-5.4)"); expect(output).toContain("AssertionError: expected skill to be loaded before command execution"); expect(output).toContain("at /workspace/examples/basic-suite.ts:14:15"); + expect(output).toContain("Attempts: 2"); expect(output).not.toContain("skillgym could not complete the run"); expect(output).not.toContain("Run did not complete because the runner crashed"); expect(output).toContain("Artifacts: .skillgym-results/run-1/case-a/code-main"); @@ -1033,14 +1037,71 @@ function createRunnerResult(options: { status?: RunnerResult["status"]; artifactDir: string; totalTokens: number; + attempts?: number; failureClass?: RunnerResult["failureClass"]; }): RunnerResult { + const attempts = options.attempts ?? 1; return { runner: options.runner, passed: options.passed, status: options.status ?? (options.passed ? "passed" : "failed"), + attempt: attempts, durationMs: 24_800, artifactDir: options.artifactDir, + attempts: Array.from({ length: attempts }, (_, index) => ({ + runner: options.runner, + passed: options.passed, + status: options.status ?? (options.passed ? "passed" : "failed"), + attempt: index + 1, + durationMs: 24_800, + artifactDir: + index === 0 + ? options.artifactDir + : path.join(options.artifactDir, `attempt-${String(index + 1)}`), + error: + options.passed || options.status === "unexpected-passed" + ? undefined + : { + name: "AssertionError", + message: "expected skill to be loaded before command execution", + stack: [ + "AssertionError: expected skill to be loaded before command execution", + " at assert (/workspace/src/assertions/output.ts:88:10)", + " at Object.assert (/workspace/examples/basic-suite.ts:14:15)", + " at executeRunner (/workspace/src/runner/execute-runner.ts:91:7)", + ].join("\n"), + }, + failureType: + options.passed || options.status === "unexpected-passed" ? undefined : "assertion", + failureOrigin: + options.passed || options.status === "unexpected-passed" ? undefined : "assertion", + failureClass: + options.passed || options.status === "unexpected-passed" + ? undefined + : (options.failureClass ?? { id: "assertion", label: "Assertion failure" }), + report: createSessionReport({ + runner: options.runner, + usage: { + cacheTokens: 7_233, + totalTokens: options.totalTokens, + inputTokens: 9_830, + outputTokens: 1_104, + reasoningTokens: 0, + inputChars: 10, + outputChars: 5, + reasoningChars: 0, + source: { + input: "provider", + output: "provider", + reasoning: "provider", + }, + }, + files: { + observedReads: ["a", "b", "c"], + observedSkillReads: [], + }, + }), + })), error: options.passed || options.status === "unexpected-passed" ? undefined diff --git a/test/runner/execute-suite.reporter.test.ts b/test/runner/execute-suite.reporter.test.ts index fc40e9f..6fe843e 100644 --- a/test/runner/execute-suite.reporter.test.ts +++ b/test/runner/execute-suite.reporter.test.ts @@ -331,6 +331,102 @@ test("executeSuite with parallel schedule respects maxParallel", async () => { expect(maxActive).toBe(2); }); +test("executeSuite retries only failed executions and preserves attempt artifacts", async () => { + const outputDir = await createTempDir(); + const attemptsByRun = new Map(); + const runnerPathKey = createRunnerInfo("open", { + type: "opencode", + model: "openai/gpt-5", + }).pathKey; + + const result = await executeSuite("./suite.ts", [{ id: "flaky", prompt: "a", assert() {} }], { + cwd: outputDir, + outputDir, + retryFailed: 2, + isInteractive: false, + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + executeRunnerFn: async (testCase, runner, _adapter, options) => { + const key = `${testCase.id}:${runner.id}`; + const attempt = (attemptsByRun.get(key) ?? 0) + 1; + attemptsByRun.set(key, attempt); + + return createRunnerResult({ + caseId: testCase.id, + runner, + passed: attempt >= 2, + durationMs: attempt * 10, + artifactDir: options.artifactDir, + totalTokens: 100 * attempt, + outputTokens: 20, + observedReads: 1, + }); + }, + }); + + const runnerResult = result.cases[0]!.runnerResults[0]!; + expect(attemptsByRun.get("flaky:open")).toBe(2); + expect(runnerResult).toMatchObject({ + passed: true, + attempt: 2, + artifactDir: path.join(result.outputDir, "flaky", runnerPathKey, "attempt-2"), + }); + expect(runnerResult.attempts).toHaveLength(2); + expect(runnerResult.attempts?.map((attempt) => attempt.artifactDir)).toEqual([ + path.join(result.outputDir, "flaky", runnerPathKey), + path.join(result.outputDir, "flaky", runnerPathKey, "attempt-2"), + ]); + + const saved = JSON.parse( + await readFile(path.join(result.outputDir, "results.json"), "utf8"), + ) as SuiteRunResult; + expect(saved.cases[0]?.runnerResults[0]?.attempts).toHaveLength(2); +}); + +test("executeSuite does not retry expected failures", async () => { + const outputDir = await createTempDir(); + let attempts = 0; + + const result = await executeSuite( + "./suite.ts", + [{ id: "known-gap", prompt: "a", expectedFail: true, assert() {} }], + { + cwd: outputDir, + outputDir, + retryFailed: 2, + isInteractive: false, + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + executeRunnerFn: async (testCase, runner, _adapter, options) => { + attempts += 1; + return createRunnerResult({ + caseId: testCase.id, + runner, + passed: false, + durationMs: 10, + artifactDir: options.artifactDir, + totalTokens: 100, + outputTokens: 20, + observedReads: 1, + }); + }, + }, + ); + + expect(attempts).toBe(1); + expect(result.cases[0]?.runnerResults[0]).toMatchObject({ + passed: true, + status: "expected-failed", + attempt: 1, + }); +}); + test("executeSuite with isolated-by-runner runs serially within a runner and concurrently across runners", async () => { const outputDir = await createTempDir(); const started: string[] = []; From 1d44e13d0dd81314dbbc49d24965640991ce853c Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Tue, 5 May 2026 09:46:17 +0200 Subject: [PATCH 2/2] Refine retry reporting output --- examples/flaky-retry-suite.ts | 25 +++++ src/cli/run.ts | 1 + src/reporters/github-actions.ts | 22 ++-- src/reporters/json-summary.ts | 6 ++ src/reporters/standard.ts | 113 +++++++++++++------- test/cli.test.ts | 75 +++++++++++++ test/reporters/github-actions.test.ts | 8 +- test/reporters/json-summary.test.ts | 1 + test/reporters/standard.test.ts | 147 +++++++++++++++++++++++++- 9 files changed, 345 insertions(+), 53 deletions(-) create mode 100644 examples/flaky-retry-suite.ts diff --git a/examples/flaky-retry-suite.ts b/examples/flaky-retry-suite.ts new file mode 100644 index 0000000..f53ab28 --- /dev/null +++ b/examples/flaky-retry-suite.ts @@ -0,0 +1,25 @@ +import { access, writeFile } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { assert, type TestCase } from "skillgym"; + +const markerPath = path.join(os.tmpdir(), "skillgym-flaky-retry-example-6.marker"); + +const suite: TestCase[] = [ + { + id: "retry-once", + prompt: "Reply exactly: skillgym retry example", + async assert(_report, ctx) { + try { + await access(markerPath); + } catch { + await writeFile(markerPath, "seen", "utf8"); + throw new Error("Intentional first-run failure. Run the same suite again."); + } + + assert.match(ctx.finalOutput(), /skillgym retry example/i); + }, + }, +]; + +export default suite; diff --git a/src/cli/run.ts b/src/cli/run.ts index 18006e2..f088591 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -39,6 +39,7 @@ export async function runCommand(options: { outputDir: options.outputDir, schedule: options.schedule, maxParallel: options.maxParallel, + retryFailed: options.retryFailed, tags: options.tags, }, loadedConfig.config, diff --git a/src/reporters/github-actions.ts b/src/reporters/github-actions.ts index e5eca4e..5ddac96 100644 --- a/src/reporters/github-actions.ts +++ b/src/reporters/github-actions.ts @@ -49,9 +49,10 @@ function formatAnnotationCommand(caseId: string, result: RunnerResult): string { function formatAnnotationMessage(result: RunnerResult): string { const lines = [`failure type: ${result.failureType ?? "unknown"}`]; + const retryCount = countRetries(result); - if (result.attempts !== undefined && result.attempts.length > 1) { - lines.push(`attempts: ${String(result.attempts.length)}`); + if (retryCount > 0) { + lines.push(`retries: ${String(retryCount)}`); } if (result.failureOrigin !== undefined) { @@ -172,8 +173,10 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string `artifacts: \`${result.artifactDir}\``, ]; - if (result.attempts !== undefined && result.attempts.length > 1) { - segments.splice(2, 0, `attempts: ${String(result.attempts.length)}`); + const retryCount = countRetries(result); + + if (retryCount > 0) { + segments.splice(2, 0, `retries: ${String(retryCount)}`); } if (result.failureClass !== undefined) { @@ -192,13 +195,16 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string } function formatRetryLabel(result: RunnerResult): string | undefined { - if (result.attempts === undefined || result.attempts.length <= 1) { + const retryCount = countRetries(result); + if (retryCount === 0) { return undefined; } - return result.passed - ? `(passed on retry ${String(result.attempt ?? result.attempts.length)}/${String(result.attempts.length)})` - : `(failed after ${String(result.attempts.length)} attempts)`; + return `(${retryCount === 1 ? "1 retry" : `${String(retryCount)} retries`})`; +} + +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); } function listFailures(result: SuiteRunResult): Array<{ caseId: string; result: RunnerResult }> { diff --git a/src/reporters/json-summary.ts b/src/reporters/json-summary.ts index 42cd3aa..c59e7c2 100644 --- a/src/reporters/json-summary.ts +++ b/src/reporters/json-summary.ts @@ -16,6 +16,7 @@ interface SummaryRunnerResult { passed: boolean; status: RunnerResult["status"]; attempt?: number; + retryCount: number; durationMs: number; artifactDir: string; usage: RunnerResult["report"]["usage"]; @@ -64,6 +65,7 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult { passed: result.passed, status: result.status, attempt: result.attempt, + retryCount: countRetries(result), durationMs: result.durationMs, artifactDir: result.artifactDir, usage: result.report.usage, @@ -92,6 +94,10 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult { return summary; } +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); +} + function summarizeAttemptResult( result: NonNullable[number], ): SummaryAttemptResult { diff --git a/src/reporters/standard.ts b/src/reporters/standard.ts index 287331d..7e289b0 100644 --- a/src/reporters/standard.ts +++ b/src/reporters/standard.ts @@ -62,6 +62,7 @@ interface InteractiveRunEntry { caseId: string; runner: RunnerInfo; status: InteractiveRunStatus; + retryCount: number; } interface InteractiveState { @@ -94,7 +95,6 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B colors.isColorSupported ? `${ACCENT_OPEN}${value}${ACCENT_CLOSE}` : value; const symbols: ReporterSymbols = getSymbols(unicode); const spinner = unicode ? cliSpinners.dots : cliSpinners.line; - const failures: FailureEntry[] = []; let interactiveState: InteractiveState | undefined; return { @@ -149,38 +149,26 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B } const key = createRunKey(event.testCase.id, event.runner.id); - setInteractiveRunStatus(interactiveState, key, "running"); + setInteractiveRunResult(interactiveState, key, { status: "running", retryCount: 0 }); interactiveState.spinnerFrameIndex = 0; renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); startSpinner(interactiveState, stdout, colors, symbols, spinner.frames, spinner.interval); }, onRunnerFinish(event) { if (interactive && interactiveState !== undefined) { - setInteractiveRunStatus( + setInteractiveRunResult( interactiveState, createRunKey(event.testCase.id, event.runner.id), - event.result.status, + { + status: event.result.status, + retryCount: countRetries(event.result), + }, ); if (!hasRunningEntries(interactiveState)) { stopSpinner(interactiveState); } renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); } - - if (!event.result.passed) { - failures.push({ - caseId: event.testCase.id, - runner: event.result.runner, - artifactDir: event.result.artifactDir, - attempts: event.result.attempts, - error: event.result.error, - failureType: event.result.failureType, - failureOrigin: event.result.failureOrigin, - failureClass: event.result.failureClass, - failureLogPath: event.result.failureLogPath, - status: event.result.status, - }); - } }, onCaseFinish(event) { if (interactive) { @@ -190,6 +178,8 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B writeLine(formatCaseRow(event.result, symbols), stdout); }, onSuiteFinish(event) { + const failures = collectFinalFailures(event.result); + if (interactiveState !== undefined) { stopSpinner(interactiveState); renderInteractiveRunList(interactiveState, stdout, colors, symbols, spinner.frames); @@ -202,7 +192,13 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B writeLine(formatRunnerLegend(colors), stdout); for (const caseResult of getRunnerCases(event.result, summary.runner.id)) { writeLine( - formatRunnerCaseRow(caseResult.caseId, caseResult.runnerResult, symbols, accent), + formatRunnerCaseRow( + caseResult.caseId, + caseResult.runnerResult, + symbols, + accent, + colors, + ), stdout, ); } @@ -301,15 +297,16 @@ function formatRunnerCaseRow( result: RunnerResult, symbols: ReturnType, accent: (value: string) => string, + _colors: ReturnType, ): string { const color = result.passed ? pc.green : pc.red; - const statusLabel = [formatStatusLabel(result.status), formatRetryLabel(result)] - .filter((label): label is string => label !== undefined) - .join(", "); + const statusLabel = formatStatusLabel(result.status); + const caseLabel = `${result.passed ? symbols.pass : symbols.fail} ${caseId}`; + return [ - color(padCell(`${result.passed ? symbols.pass : symbols.fail} ${caseId}`, RUNNER_CASE_WIDTH)), + color(padCell(caseLabel, RUNNER_CASE_WIDTH)), padCell(formatDuration(result.durationMs), RUNNER_TIME_WIDTH), - statusLabel === undefined + (statusLabel === undefined ? "" : statusLabel) === "" ? formatTokenSummary(result.report.usage, accent) : `${formatTokenSummary(result.report.usage, accent)} ${pc.dim(statusLabel)}`, ].join(" "); @@ -439,16 +436,6 @@ function formatFailureClassLabel(failureClass: FailureClass): string { return `${failureClass.label} [${failureClass.id}]`; } -function formatRetryLabel(result: RunnerResult): string | undefined { - if (result.attempts === undefined || result.attempts.length <= 1) { - return undefined; - } - - return result.passed - ? `passed on retry ${String(result.attempt ?? result.attempts.length)}/${String(result.attempts.length)}` - : `failed after ${String(result.attempts.length)} attempts`; -} - function formatErrorLocation(error: SerializedError): string | undefined { const location = extractUserStackFrame(error); return location === undefined ? undefined : formatStackFrameLocation(location); @@ -606,6 +593,31 @@ function getRunnerCases( }); } +function collectFinalFailures(result: SuiteRunResult): FailureEntry[] { + return result.cases.flatMap((caseResult) => + caseResult.runnerResults.flatMap((runnerResult) => { + if (runnerResult.passed) { + return []; + } + + return [ + { + caseId: caseResult.caseId, + runner: runnerResult.runner, + artifactDir: runnerResult.artifactDir, + attempts: runnerResult.attempts, + error: runnerResult.error, + failureType: runnerResult.failureType, + failureOrigin: runnerResult.failureOrigin, + failureClass: runnerResult.failureClass, + failureLogPath: runnerResult.failureLogPath, + status: runnerResult.status, + }, + ]; + }), + ); +} + function createInteractiveState(event: SuiteStartEvent): InteractiveState { const entries = event.cases.flatMap((testCase) => { return event.runners.map((runner) => ({ @@ -613,6 +625,7 @@ function createInteractiveState(event: SuiteStartEvent): InteractiveState { caseId: testCase.id, runner, status: "queued" as const, + retryCount: 0, })); }); @@ -628,10 +641,10 @@ function createRunKey(caseId: string, runnerId: string): string { return `${caseId}\u0000${runnerId}`; } -function setInteractiveRunStatus( +function setInteractiveRunResult( state: InteractiveState, key: string, - status: InteractiveRunStatus, + result: { status: InteractiveRunStatus; retryCount: number }, ): void { const index = state.entryIndexByKey.get(key); @@ -641,7 +654,8 @@ function setInteractiveRunStatus( state.entries[index] = { ...state.entries[index]!, - status, + status: result.status, + retryCount: result.retryCount, }; } @@ -703,8 +717,10 @@ function formatInteractiveRunRow( ): string { const statusIcon = formatInteractiveStatusIcon(entry, state, colors, symbols, frames); const statusLabel = formatInteractiveStatusLabel(entry.status); + const retryLabel = formatInteractiveRetryLabel(entry, colors); const row = `${statusIcon} ${padCell(entry.caseId, caseWidth)} / ${entry.runner.id}${statusLabel}`; const runnerMeta = ` ${formatRunnerAgentLabel(entry.runner)}`; + const retryMeta = retryLabel === undefined ? "" : ` ${retryLabel}`; switch (entry.status) { case "queued": @@ -713,7 +729,7 @@ function formatInteractiveRunRow( return `${row}${colors.dim(runnerMeta)}`; case "passed": case "expected-failed": - return `${colors.green(row)}${colors.dim(runnerMeta)}`; + return `${colors.green(row)}${colors.dim(runnerMeta)}${retryMeta}`; case "failed": case "unexpected-passed": return `${colors.red(row)}${colors.dim(runnerMeta)}`; @@ -735,6 +751,25 @@ function formatInteractiveStatusLabel(status: InteractiveRunStatus): string { } } +function formatInteractiveRetryLabel( + entry: InteractiveRunEntry, + colors: ReturnType, +): string | undefined { + if (entry.status !== "passed" || entry.retryCount === 0) { + return undefined; + } + + return colors.yellow(formatRetryCountLabel(entry.retryCount)); +} + +function countRetries(result: RunnerResult): number { + return Math.max(0, (result.attempts?.length ?? 1) - 1); +} + +function formatRetryCountLabel(retryCount: number): string { + return `(${retryCount === 1 ? "1 retry" : `${String(retryCount)} retries`})`; +} + function formatInteractiveStatusIcon( entry: InteractiveRunEntry, state: InteractiveState, diff --git a/test/cli.test.ts b/test/cli.test.ts index 71a5ab2..a5291a8 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -340,6 +340,81 @@ test("cli run passes repeated and comma-separated tag filters to execution", asy unmockRunCommandDependencies(); }); +test("cli run passes retryFailed through to execution", async () => { + const tempDir = await mkdtemp(path.join(os.tmpdir(), "skillgym-cli-")); + tempDirs.push(tempDir); + const executeSuite = vi.fn(async () => ({ + suitePath: path.join(tempDir, "suite.ts"), + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:01.000Z", + durationMs: 1_000, + outputDir: path.join(tempDir, ".skillgym-results", "run-1"), + declaredTags: [], + selectedTags: [], + cases: [ + { + caseId: "alpha", + tags: [], + passed: true, + runnerResults: [{ passed: true, status: "passed" }], + }, + ], + runners: [], + })); + + vi.resetModules(); + vi.doMock("../src/config.js", () => ({ + loadConfig: vi.fn(async () => ({ + config: { + runners: { + open: { agent: { type: "opencode", model: "openai/gpt-5" } }, + }, + }, + filePath: path.join(tempDir, "skillgym.config.ts"), + })), + resolveReporterOptions: vi.fn(() => ({ reporter: undefined, cwd: tempDir })), + resolveRunOptions: vi.fn((options) => ({ + cwd: tempDir, + outputDir: path.join(tempDir, ".skillgym-results"), + schedule: "serial", + retryFailed: Number(options.retryFailed ?? 0), + tags: options.tags, + })), + })); + vi.doMock("../src/reporters/index.js", () => ({ + loadReporter: vi.fn(async () => undefined), + })); + vi.doMock("../src/snapshots/store.js", () => ({ + createSnapshotRuntimeOptions: vi.fn(() => undefined), + })); + vi.doMock("../src/runner/load-suite.js", () => ({ + loadSuite: vi.fn(async () => ({ + cases: [{ id: "alpha", prompt: "Say hello", tags: ["smoke"], assert() {} }], + workspace: undefined, + dirPath: tempDir, + })), + })); + vi.doMock("../src/runner/workspace.js", () => ({ + resolveEffectiveWorkspace: vi.fn(() => ({ mode: "shared", cwd: tempDir })), + })); + vi.doMock("../src/runner/execute-suite.js", () => ({ + executeSuite, + })); + + const { runCommand } = await import("../src/cli/run.js"); + + await expect( + runCommand({ suitePath: "./suite.ts", cwd: tempDir, retryFailed: "2" }), + ).resolves.toBeUndefined(); + expect(executeSuite).toHaveBeenCalledWith( + "./suite.ts", + expect.any(Array), + expect.objectContaining({ retryFailed: 2 }), + ); + + unmockRunCommandDependencies(); +}); + async function execCli(args: string[], cwd = repoRoot) { return execFileCapture( process.execPath, diff --git a/test/reporters/github-actions.test.ts b/test/reporters/github-actions.test.ts index 3c00e94..c9c75dd 100644 --- a/test/reporters/github-actions.test.ts +++ b/test/reporters/github-actions.test.ts @@ -42,7 +42,7 @@ test("github-actions reporter formats escaped annotations for failed runs", asyn "::error title=case%2Ca > code%3Amain,file=/workspace/examples/basic-suite.ts,line=14,col=15::", ); expect(writes.join("")).toContain( - "failure type: assertion%0Aattempts: 2%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", + "failure type: assertion%0Aretries: 1%0Afailure origin: assertion%0Aerror: AssertionError: boom,%0A100%25", ); expect(writes.join("")).toContain("artifacts: .skillgym-results/run-1/case,a/code-main"); }); @@ -94,11 +94,9 @@ test("github-actions reporter writes a job summary when GITHUB_STEP_SUMMARY is s expect(summary).toContain("- Runs: 0 passed, 1 failed"); expect(summary).toContain("### Runner: `open-main` (opencode, openai/gpt-5)"); expect(summary).toContain("| Case | Duration | Input | Output | Reasoning | Cache | Billable |"); + expect(summary).toContain("| ❌ `case-a` (1 retry) | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |"); expect(summary).toContain( - "| ❌ `case-a` (failed after 2 attempts) | 24s | 9,830 | 1,104 | 0 | 0 | 12,000 |", - ); - expect(summary).toContain( - "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; attempts: 2; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", + "- `case-a > open-main`; assertion; AssertionError: expected skill to be loaded before command execution; retries: 1; artifacts: `.skillgym-results/run-1/case-a/open-main`; log: `.skillgym-results/run-1/case-a/open-main/stderr.log`", ); }); diff --git a/test/reporters/json-summary.test.ts b/test/reporters/json-summary.test.ts index 6ec7452..65d9556 100644 --- a/test/reporters/json-summary.test.ts +++ b/test/reporters/json-summary.test.ts @@ -271,6 +271,7 @@ test("json-summary reporter omits session internals and prints summary on suite expect(runnerResult.passed).toBe(false); expect(runnerResult.status).toBe("failed"); expect(runnerResult.attempt).toBe(2); + expect(runnerResult.retryCount).toBe(1); expect(runnerResult.durationMs).toBe(18_200); expect(runnerResult.artifactDir).toBe(".skillgym-results/run-1/case-a/open-main"); expect(runnerResult.failureType).toBe("assertion"); diff --git a/test/reporters/standard.test.ts b/test/reporters/standard.test.ts index 3026421..e0ece3c 100644 --- a/test/reporters/standard.test.ts +++ b/test/reporters/standard.test.ts @@ -155,7 +155,6 @@ test("standard reporter prints runner-grouped results and failure artifacts", as ); expect(output).toContain("✓ case-a"); expect(output).toContain("✗ case-a"); - expect(output).toContain("failed after 2 attempts"); expect(output).toContain("Cases 1 failed | 1 passed (2)"); expect(output).toContain("Runs 1 failed | 3 passed (4)"); expect(output).toContain("Statuses 0 expected failures | 0 unexpected passes"); @@ -424,6 +423,152 @@ test("standard reporter labels expected failures and unexpected passes", async ( expect(output).not.toContain("known-gap > open-main"); }); +test("standard reporter shows recovered retries inline without failure blocks", async () => { + const writes: string[] = []; + const reporter = createStandardReporter({ + stdout: { + isTTY: true, + columns: 120, + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + isInteractive: false, + isUnicode: true, + }); + const runner = createRunnerInfo("cursor-main", { type: "cursor-agent", model: "auto" }); + const context = { + isInteractive: false, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: "examples/flaky-retry-suite.ts", + outputDir: ".skillgym-results/run-1", + selectedCaseCount: 1, + selectedRunnerCount: 1, + selectedExecutionCount: 1, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + const suiteResult: SuiteRunResult = { + suitePath: context.suitePath, + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:00:12.000Z", + durationMs: 12_000, + outputDir: context.outputDir, + declaredTags: [], + selectedTags: [], + cases: [ + createCaseResult({ + caseId: "retry-once", + runnerResults: [ + createRunnerResult({ + runner, + passed: true, + artifactDir: ".skillgym-results/run-1/retry-once/cursor-main/attempt-2", + totalTokens: 12_000, + attempts: 2, + }), + ], + }), + ], + runners: [ + createRunnerSummary({ + runner, + passedCases: 1, + totalCases: 1, + averageDurationMs: 24_800, + averageTotalTokens: 12_000, + }), + ], + }; + + await reporter.onSuiteStart?.({ + context, + cases: [], + runners: [runner], + startedAt: suiteResult.startedAt, + }); + await reporter.onRunnerFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + runner, + result: suiteResult.cases[0]!.runnerResults[0]!, + caseIndex: 1, + totalCases: 1, + }); + await reporter.onCaseFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + result: suiteResult.cases[0]!, + caseIndex: 1, + totalCases: 1, + }); + await reporter.onSuiteFinish?.({ context, result: suiteResult }); + + const output = writes.join(""); + expect(output).toContain("retry-once"); + expect(output).not.toContain("Failure Classes"); + expect(output).not.toContain("Failures"); + expect(output).not.toContain("Artifacts:"); +}); + +test("standard reporter interactive mode shows retry warning on recovered run", async () => { + const writes: string[] = []; + const reporter = createStandardReporter({ + stdout: { + isTTY: true, + columns: 120, + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + isInteractive: true, + isUnicode: true, + }); + + const context = { + isInteractive: true, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: "examples/flaky-retry-suite.ts", + outputDir: ".skillgym-results/run-1", + selectedCaseCount: 1, + selectedRunnerCount: 1, + selectedExecutionCount: 1, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + const runner = createRunnerInfo("cursor-main", { type: "cursor-agent", model: "auto" }); + + await reporter.onSuiteStart?.({ + context, + cases: [{ id: "retry-once", prompt: "", assert() {} }], + runners: [runner], + startedAt: "2026-04-02T12:00:00.000Z", + }); + + await reporter.onRunnerFinish?.({ + context, + testCase: { id: "retry-once", prompt: "", assert() {} }, + runner, + result: createRunnerResult({ + runner, + passed: true, + artifactDir: "x", + totalTokens: 10_000, + attempts: 2, + }), + caseIndex: 1, + totalCases: 1, + }); + + expect(writes.join("")).toContain("(1 retry)"); +}); + test("standard reporter prints warning line for overlapping shared-workspace schedules", async () => { const parallelWrites: string[] = []; const serialWrites: string[] = [];