callstackincubator · V3RON · May 6, 2026 · May 5, 2026 · May 5, 2026
diff --git a/README.md b/README.md
@@ -40,6 +40,7 @@ const config: SkillGymConfig = {
     reporter: "standard",
     schedule: "serial",
     maxParallel: 4,
+    retryFailed: 1,
     maxSteps: 4,
   },
   defaults: {
@@ -144,6 +145,7 @@ Most important config properties:
 - `run.reporter`: built-in `standard` reporter or a custom reporter module path
 - `run.schedule`: execution scheduling mode for case x runner pairs
 - `run.maxParallel`: maximum concurrent executions for non-serial schedules, defaulting to available CPU parallelism
+- `run.retryFailed`: retry only failed case x runner executions, up to this many additional attempts each (defaults to `0`, meaning no retries)
 - `run.maxSteps`: best-effort limit on streamed agent steps before skillgym terminates the run
 - `run.workspace`: default workspace mode for the suite
 - `defaults.timeoutMs`: default per-case timeout
@@ -165,6 +167,8 @@ For concurrent schedules, `run.maxParallel` defaults to `os.availableParallelism
 
 Concurrent schedules do not copy or isolate the workspace by themselves. Overlapping runs may still interact through the same filesystem state and live runner output unless you use isolated workspaces. OpenCode, Codex, and Claude Code runtime state are isolated per run under each artifact directory.
 
+`run.retryFailed` is useful when broad benchmark runs include occasional flaky agent failures. SkillGym only retries executions that still count as failed after result classification, keeps each attempt's artifacts, and reports whether a final pass came from a retry.
+
 `run.maxSteps` is enforced on a best-effort basis by monitoring each runner's streamed JSONL output. A step is one observed model round, not one token and not necessarily one tool call, but the exact boundary is still runner-defined, so the same prompt may consume different numbers of steps across agents. When the observed step count exceeds the configured limit, skillgym kills the agent process, fails the run with origin `max-steps`, and preserves raw stdout/stderr artifacts for debugging. No partial normalized report is produced for that failure.
 
 ## Workspaces

diff --git a/examples/flaky-retry-suite.ts b/examples/flaky-retry-suite.ts
@@ -0,0 +1,25 @@
+import { access, writeFile } from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { assert, type TestCase } from "skillgym";
+
+const markerPath = path.join(os.tmpdir(), "skillgym-flaky-retry-example-6.marker");
+// NOTE(review): nothing in this file deletes the marker, so later runs find it and skip the intentional failure.
+const suite: TestCase[] = [
+  {
+    id: "retry-once",
+    prompt: "Reply exactly: skillgym retry example",
+    async assert(_report, ctx) { // throws once to simulate a flaky failure, then checks the reply
+      try {
+        await access(markerPath); // rejects when the marker cannot be accessed (e.g. not created yet)
+      } catch {
+        await writeFile(markerPath, "seen", "utf8"); // create the marker so the next evaluation gets past this check
+        throw new Error("Intentional first-run failure. Run the same suite again.");
+      }
+
+      assert.match(ctx.finalOutput(), /skillgym retry example/i); // case-insensitive match on the final output
+    },
+  },
+];
+
+export default suite;
diff --git a/src/cli.ts b/src/cli.ts
@@ -22,6 +22,7 @@ async function main(): Promise<void> {
       const scheduleOption = parsed.options.schedule;
       const configOption = parsed.options.config;
       const maxParallelOption = parsed.options["max-parallel"];
+      const retryFailedOption = parsed.options["retry-failed"];
       const updateSnapshotsOption = parsed.options["update-snapshots"];
       const snapshotsOption = parsed.options.snapshots;
       const tagOption = parsed.options.tag;
@@ -35,6 +36,7 @@ async function main(): Promise<void> {
         reporter: getStringOption(reporterOption),
         schedule: getStringOption(scheduleOption),
         maxParallel: getStringOption(maxParallelOption),
+        retryFailed: getStringOption(retryFailedOption),
         tags: parseTagOption(tagOption),
         reporterCwd: process.cwd(),
         configPath: getStringOption(configOption),

diff --git a/src/cli/help.ts b/src/cli/help.ts
@@ -17,6 +17,7 @@ ${theme.bold("Run Options:")}
   --output-dir ${theme.accent("<path>")}    Override where run artifacts are written
   --schedule ${theme.accent("<mode>")}      Choose ${theme.light("serial")}, ${theme.light("parallel")}, or ${theme.light("isolated-by-runner")}
   --max-parallel ${theme.accent("<n>")}     Cap concurrent executions for non-serial schedules
+  --retry-failed ${theme.accent("<n>")}     Retry only failed case x runner executions up to ${theme.light("n")} extra times
   --case ${theme.accent("<id>")}            Filter the configured suite to one case id
   --tag ${theme.accent("<tag>")}            Filter cases by tag; repeat or comma-separate for OR matching
   --runner ${theme.accent("<runner-id>")}   Filter the configured runner set by runner id
@@ -32,6 +33,7 @@ ${theme.bold("Examples:")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --reporter standard")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule isolated-by-runner")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule parallel --max-parallel 4")}
+  ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --retry-failed 2")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --update-snapshots")}
 `);
 }
diff --git a/src/cli/run.ts b/src/cli/run.ts
@@ -19,6 +19,7 @@ export async function runCommand(options: {
   outputDir?: string;
   schedule?: string;
   maxParallel?: string;
+  retryFailed?: string;
   caseId?: string;
   runner?: string;
   reporter?: string;
@@ -38,6 +39,7 @@ export async function runCommand(options: {
       outputDir: options.outputDir,
       schedule: options.schedule,
       maxParallel: options.maxParallel,
+      retryFailed: options.retryFailed,
       tags: options.tags,
     },
     loadedConfig.config,
@@ -76,6 +78,7 @@ export async function runCommand(options: {
     outputDir: runOptions.outputDir,
     schedule: runOptions.schedule,
     maxParallel: runOptions.maxParallel,
+    retryFailed: runOptions.retryFailed,
     caseId: options.caseId,
     runner: options.runner,
     tags: runOptions.tags,

diff --git a/src/config.ts b/src/config.ts
@@ -24,6 +24,7 @@ const RUN_KEYS = [
   "workspace",
   "maxSteps",
   "maxParallel",
+  "retryFailed",
   "tags",
 ] as const;
 const DEFAULT_KEYS = ["timeoutMs"] as const;
@@ -64,6 +65,7 @@ export interface SkillGymConfig {
     workspace?: SuiteWorkspaceConfig;
     maxSteps?: number;
     maxParallel?: number;
+    retryFailed?: number;
     tags?: string[];
   };
   defaults?: {
@@ -121,6 +123,7 @@ export function resolveRunOptions(
     outputDir?: string;
     schedule?: string;
     maxParallel?: string;
+    retryFailed?: string;
     tags?: string[];
   },
   config: SkillGymConfig,
@@ -129,12 +132,17 @@ export function resolveRunOptions(
   outputDir?: string;
   schedule: ScheduleMode;
   maxParallel?: number;
+  retryFailed: number;
   tags: string[];
 } {
   const maxParallel =
     cliOptions.maxParallel !== undefined
       ? parseIntegerString(cliOptions.maxParallel, "CLI option --max-parallel", 1)
       : config.run?.maxParallel;
+  const retryFailed =
+    cliOptions.retryFailed !== undefined
+      ? parseIntegerString(cliOptions.retryFailed, "CLI option --retry-failed", 0)
+      : (config.run?.retryFailed ?? 0);
 
   return {
     cwd:
@@ -150,6 +158,7 @@ export function resolveRunOptions(
         ? parseScheduleMode(cliOptions.schedule, "CLI option --schedule")
         : (config.run?.schedule ?? "serial"),
     ...(maxParallel === undefined ? {} : { maxParallel }),
+    retryFailed,
     tags: cliOptions.tags ?? config.run?.tags ?? [],
   };
 }
@@ -256,6 +265,9 @@ function resolveConfigPaths(config: SkillGymConfig, configDir: string): SkillGym
             ...(config.run.maxParallel === undefined
               ? {}
               : { maxParallel: config.run.maxParallel }),
+            ...(config.run.retryFailed === undefined
+              ? {}
+              : { retryFailed: config.run.retryFailed }),
             ...(config.run.tags === undefined ? {} : { tags: config.run.tags }),
           },
     defaults:
@@ -346,6 +358,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run
 
   const maxSteps = parseOptionalInteger(record.maxSteps, `${configPath}.maxSteps`, 1);
   const maxParallel = parseOptionalInteger(record.maxParallel, `${configPath}.maxParallel`, 1);
+  const retryFailed = parseOptionalInteger(record.retryFailed, `${configPath}.retryFailed`, 0);
 
   return {
     cwd: parseOptionalNonEmptyString(record.cwd, `${configPath}.cwd`),
@@ -355,6 +368,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run
     workspace: parseOptionalWorkspaceConfig(record.workspace, `${configPath}.workspace`),
     ...(maxSteps === undefined ? {} : { maxSteps }),
     ...(maxParallel === undefined ? {} : { maxParallel }),
+    ...(retryFailed === undefined ? {} : { retryFailed }),
     tags: parseOptionalStringArray(record.tags, `${configPath}.tags`),
   };
 }

diff --git a/src/domain/result.ts b/src/domain/result.ts
@@ -1,7 +1,7 @@
 import type { RunnerInfo } from "./runner.js";
 import type { SessionReport } from "./session-report.js";
 
-export interface RunnerResult {
+interface BaseRunnerResult {
   runner: RunnerInfo;
   passed: boolean;
   status: RunnerResultStatus;
@@ -15,6 +15,15 @@ export interface RunnerResult {
   failureLogPath?: string;
 }
 
+export interface RunnerAttemptResult extends BaseRunnerResult { // base result fields plus a required attempt number
+  attempt: number; // NOTE(review): numbering base (0 or 1) is not visible here — confirm against the runner loop
+}
+
+export interface RunnerResult extends BaseRunnerResult { // base result fields plus optional retry metadata
+  attempt?: number; // presumably the attempt that produced this result — confirm against the runner loop
+  attempts?: RunnerAttemptResult[]; // per-attempt results; reporters count retries as attempts.length - 1
+}
+
 export interface FailureClass {
   id: string;
   label?: string;

diff --git a/src/index.ts b/src/index.ts
@@ -17,6 +17,7 @@ export type {
 } from "./domain/test-case.js";
 export type {
   FailureClass,
+  RunnerAttemptResult,
   RunnerFailureOrigin,
   RunnerFailureType,
   RunnerResult,

diff --git a/src/reporters/contract.ts b/src/reporters/contract.ts
@@ -38,6 +38,8 @@ export interface RunnerStartEvent {
   context: ReporterContext;
   testCase: TestCase;
   runner: RunnerInfo;
+  attempt?: number;
+  maxAttempts?: number;
   caseIndex: number;
   totalCases: number;
 }
@@ -47,6 +49,8 @@ export interface RunnerFinishEvent {
   testCase: TestCase;
   runner: RunnerInfo;
   result: RunnerResult;
+  attempt?: number;
+  maxAttempts?: number;
   caseIndex: number;
   totalCases: number;
 }

diff --git a/src/reporters/github-actions.ts b/src/reporters/github-actions.ts
@@ -49,6 +49,11 @@ function formatAnnotationCommand(caseId: string, result: RunnerResult): string {
 
 function formatAnnotationMessage(result: RunnerResult): string {
   const lines = [`failure type: ${result.failureType ?? "unknown"}`];
+  const retryCount = countRetries(result);
+
+  if (retryCount > 0) {
+    lines.push(`retries: ${String(retryCount)}`);
+  }
 
   if (result.failureOrigin !== undefined) {
     lines.push(`failure origin: ${result.failureOrigin}`);
@@ -147,7 +152,8 @@ function formatRunnerAgentLabel(runner: RunnerSummary["runner"]): string {
 function formatRunnerCaseRow(caseId: string, result: RunnerResult): string {
   const status = result.passed ? "✅" : "❌";
   const usage = result.report.usage;
-  return `| ${status} \`${caseId}\` | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`;
+  const retryLabel = formatRetryLabel(result);
+  return `| ${status} \`${caseId}\`${retryLabel === undefined ? "" : ` ${retryLabel}`} | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`;
 }
 
 function getRunnerCases(
@@ -167,6 +173,12 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string
     `artifacts: \`${result.artifactDir}\``,
   ];
 
+  const retryCount = countRetries(result);
+
+  if (retryCount > 0) {
+    segments.splice(2, 0, `retries: ${String(retryCount)}`);
+  }
+
   if (result.failureClass !== undefined) {
     segments.splice(2, 0, `class: \`${result.failureClass.id}\``);
   }
@@ -182,6 +194,19 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string
   return segments.join("; ");
 }
 
+function formatRetryLabel(result: RunnerResult): string | undefined {
+  // Suffix such as "(1 retry)" or "(3 retries)"; undefined when countRetries reports none.
+  const retries = countRetries(result);
+  if (retries === 0) {
+    return undefined;
+  }
+  return `(${String(retries)} ${retries === 1 ? "retry" : "retries"})`;
+}
+
+function countRetries(result: RunnerResult): number { // number of attempts made after the first one
+  return Math.max(0, (result.attempts?.length ?? 1) - 1); // a missing attempt list counts as one attempt; never negative
+}
+
 function listFailures(result: SuiteRunResult): Array<{ caseId: string; result: RunnerResult }> {
   const failures: Array<{ caseId: string; result: RunnerResult }> = [];
 

diff --git a/src/reporters/json-summary.ts b/src/reporters/json-summary.ts
@@ -14,6 +14,23 @@ interface SummaryError {
 interface SummaryRunnerResult {
   runner: RunnerResult["runner"];
   passed: boolean;
+  status: RunnerResult["status"];
+  attempt?: number; // copied from RunnerResult.attempt
+  retryCount: number; // attempts beyond the first; 0 when no attempt list is present
+  durationMs: number;
+  artifactDir: string;
+  usage: RunnerResult["report"]["usage"];
+  attempts?: SummaryAttemptResult[]; // one summary per entry of RunnerResult.attempts, when present
+  error?: SummaryError; // name and message of RunnerResult.error
+  failureType?: RunnerResult["failureType"];
+  failureOrigin?: RunnerResult["failureOrigin"];
+  failureClass?: FailureClass;
+}
+
+interface SummaryAttemptResult {
+  passed: boolean;
+  status: RunnerResult["status"];
+  attempt: number;
   durationMs: number;
   artifactDir: string;
   usage: RunnerResult["report"]["usage"];
@@ -46,6 +63,48 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult {
   const summary: SummaryRunnerResult = {
     runner: result.runner,
     passed: result.passed,
+    status: result.status,
+    attempt: result.attempt,
+    retryCount: countRetries(result),
+    durationMs: result.durationMs,
+    artifactDir: result.artifactDir,
+    usage: result.report.usage,
+  };
+
+  if (result.attempts !== undefined) {
+    summary.attempts = result.attempts.map(summarizeAttemptResult);
+  }
+
+  if (result.error !== undefined) {
+    summary.error = { name: result.error.name, message: result.error.message };
+  }
+
+  if (result.failureType !== undefined) {
+    summary.failureType = result.failureType;
+  }
+
+  if (result.failureOrigin !== undefined) {
+    summary.failureOrigin = result.failureOrigin;
+  }
+
+  if (result.failureClass !== undefined) {
+    summary.failureClass = result.failureClass;
+  }
+
+  return summary;
+}
+
+function countRetries(result: RunnerResult): number { // number of attempts made after the first one
+  return Math.max(0, (result.attempts?.length ?? 1) - 1); // a missing attempt list counts as one attempt; never negative
+}
+
+function summarizeAttemptResult(
+  result: NonNullable<RunnerResult["attempts"]>[number],
+): SummaryAttemptResult {
+  const summary: SummaryAttemptResult = {
+    passed: result.passed,
+    status: result.status,
+    attempt: result.attempt,
     durationMs: result.durationMs,
     artifactDir: result.artifactDir,
     usage: result.report.usage,