Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const config: SkillGymConfig = {
reporter: "standard",
schedule: "serial",
maxParallel: 4,
retryFailed: 1,
maxSteps: 4,
},
defaults: {
Expand Down Expand Up @@ -144,6 +145,7 @@ Most important config properties:
- `run.reporter`: built-in `standard` reporter or a custom reporter module path
- `run.schedule`: execution scheduling mode for case x runner pairs
- `run.maxParallel`: maximum concurrent executions for non-serial schedules, defaulting to available CPU parallelism
- `run.retryFailed`: number of additional attempts allowed for each failed case x runner execution (default `0`); executions that pass are never rerun
- `run.maxSteps`: best-effort limit on streamed agent steps before skillgym terminates the run
- `run.workspace`: default workspace mode for the suite
- `defaults.timeoutMs`: default per-case timeout
Expand All @@ -165,6 +167,8 @@ For concurrent schedules, `run.maxParallel` defaults to `os.availableParallelism

Concurrent schedules do not copy or isolate the workspace by themselves. Overlapping runs may still interact through the same filesystem state and live runner output unless you use isolated workspaces. OpenCode, Codex, and Claude Code runtime state are isolated per run under each artifact directory.

`run.retryFailed` is useful when broad benchmark runs include occasional flaky agent failures. SkillGym only retries executions that still count as failed after result classification, keeps each attempt's artifacts, and reports whether a final pass came from a retry.

`run.maxSteps` is enforced on a best-effort basis by monitoring each runner's streamed JSONL output. A step is one observed model round, not one token and not necessarily one tool call, but the exact boundary is still runner-defined, so the same prompt may consume different numbers of steps across agents. When the observed step count exceeds the configured limit, skillgym kills the agent process, fails the run with origin `max-steps`, and preserves raw stdout/stderr artifacts for debugging. No partial normalized report is produced for that failure.

## Workspaces
Expand Down
25 changes: 25 additions & 0 deletions examples/flaky-retry-suite.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { access, writeFile } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { assert, type TestCase } from "skillgym";

// Marker file in the OS temp directory; its presence means an earlier attempt already ran.
const markerPath = path.join(os.tmpdir(), "skillgym-flaky-retry-example-6.marker");

/** Reports whether the marker file can currently be accessed. */
async function markerExists(): Promise<boolean> {
  try {
    await access(markerPath);
    return true;
  } catch {
    return false;
  }
}

/**
 * Example suite with a single case that fails on purpose the first time its
 * assertion runs (no marker file yet) and checks the final output on later
 * runs, once the marker file exists.
 */
const suite: TestCase[] = [
  {
    id: "retry-once",
    prompt: "Reply exactly: skillgym retry example",
    async assert(_report, ctx) {
      const alreadySeen = await markerExists();

      if (!alreadySeen) {
        // Leave the marker behind before failing so the next attempt takes the other path.
        await writeFile(markerPath, "seen", "utf8");
        throw new Error("Intentional first-run failure. Run the same suite again.");
      }

      assert.match(ctx.finalOutput(), /skillgym retry example/i);
    },
  },
];

export default suite;
2 changes: 2 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ async function main(): Promise<void> {
const scheduleOption = parsed.options.schedule;
const configOption = parsed.options.config;
const maxParallelOption = parsed.options["max-parallel"];
const retryFailedOption = parsed.options["retry-failed"];
const updateSnapshotsOption = parsed.options["update-snapshots"];
const snapshotsOption = parsed.options.snapshots;
const tagOption = parsed.options.tag;
Expand All @@ -35,6 +36,7 @@ async function main(): Promise<void> {
reporter: getStringOption(reporterOption),
schedule: getStringOption(scheduleOption),
maxParallel: getStringOption(maxParallelOption),
retryFailed: getStringOption(retryFailedOption),
tags: parseTagOption(tagOption),
reporterCwd: process.cwd(),
configPath: getStringOption(configOption),
Expand Down
2 changes: 2 additions & 0 deletions src/cli/help.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ${theme.bold("Run Options:")}
--output-dir ${theme.accent("<path>")} Override where run artifacts are written
--schedule ${theme.accent("<mode>")} Choose ${theme.light("serial")}, ${theme.light("parallel")}, or ${theme.light("isolated-by-runner")}
--max-parallel ${theme.accent("<n>")} Cap concurrent executions for non-serial schedules
--retry-failed ${theme.accent("<n>")} Retry only failed case x runner executions up to ${theme.light("n")} extra times
--case ${theme.accent("<id>")} Filter the configured suite to one case id
--tag ${theme.accent("<tag>")} Filter cases by tag; repeat or comma-separate for OR matching
--runner ${theme.accent("<runner-id>")} Filter the configured runner set by runner id
Expand All @@ -32,6 +33,7 @@ ${theme.bold("Examples:")}
${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --reporter standard")}
${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule isolated-by-runner")}
${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --schedule parallel --max-parallel 4")}
${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --retry-failed 2")}
${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --update-snapshots")}
`);
}
3 changes: 3 additions & 0 deletions src/cli/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export async function runCommand(options: {
outputDir?: string;
schedule?: string;
maxParallel?: string;
retryFailed?: string;
caseId?: string;
runner?: string;
reporter?: string;
Expand All @@ -38,6 +39,7 @@ export async function runCommand(options: {
outputDir: options.outputDir,
schedule: options.schedule,
maxParallel: options.maxParallel,
retryFailed: options.retryFailed,
tags: options.tags,
},
loadedConfig.config,
Expand Down Expand Up @@ -76,6 +78,7 @@ export async function runCommand(options: {
outputDir: runOptions.outputDir,
schedule: runOptions.schedule,
maxParallel: runOptions.maxParallel,
retryFailed: runOptions.retryFailed,
caseId: options.caseId,
runner: options.runner,
tags: runOptions.tags,
Expand Down
14 changes: 14 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ const RUN_KEYS = [
"workspace",
"maxSteps",
"maxParallel",
"retryFailed",
"tags",
] as const;
const DEFAULT_KEYS = ["timeoutMs"] as const;
Expand Down Expand Up @@ -64,6 +65,7 @@ export interface SkillGymConfig {
workspace?: SuiteWorkspaceConfig;
maxSteps?: number;
maxParallel?: number;
retryFailed?: number;
tags?: string[];
};
defaults?: {
Expand Down Expand Up @@ -121,6 +123,7 @@ export function resolveRunOptions(
outputDir?: string;
schedule?: string;
maxParallel?: string;
retryFailed?: string;
tags?: string[];
},
config: SkillGymConfig,
Expand All @@ -129,12 +132,17 @@ export function resolveRunOptions(
outputDir?: string;
schedule: ScheduleMode;
maxParallel?: number;
retryFailed: number;
tags: string[];
} {
const maxParallel =
cliOptions.maxParallel !== undefined
? parseIntegerString(cliOptions.maxParallel, "CLI option --max-parallel", 1)
: config.run?.maxParallel;
const retryFailed =
cliOptions.retryFailed !== undefined
? parseIntegerString(cliOptions.retryFailed, "CLI option --retry-failed", 0)
: (config.run?.retryFailed ?? 0);

return {
cwd:
Expand All @@ -150,6 +158,7 @@ export function resolveRunOptions(
? parseScheduleMode(cliOptions.schedule, "CLI option --schedule")
: (config.run?.schedule ?? "serial"),
...(maxParallel === undefined ? {} : { maxParallel }),
retryFailed,
tags: cliOptions.tags ?? config.run?.tags ?? [],
};
}
Expand Down Expand Up @@ -256,6 +265,9 @@ function resolveConfigPaths(config: SkillGymConfig, configDir: string): SkillGym
...(config.run.maxParallel === undefined
? {}
: { maxParallel: config.run.maxParallel }),
...(config.run.retryFailed === undefined
? {}
: { retryFailed: config.run.retryFailed }),
...(config.run.tags === undefined ? {} : { tags: config.run.tags }),
},
defaults:
Expand Down Expand Up @@ -346,6 +358,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run

const maxSteps = parseOptionalInteger(record.maxSteps, `${configPath}.maxSteps`, 1);
const maxParallel = parseOptionalInteger(record.maxParallel, `${configPath}.maxParallel`, 1);
const retryFailed = parseOptionalInteger(record.retryFailed, `${configPath}.retryFailed`, 0);

return {
cwd: parseOptionalNonEmptyString(record.cwd, `${configPath}.cwd`),
Expand All @@ -355,6 +368,7 @@ function parseRunConfig(value: unknown, configPath: string): SkillGymConfig["run
workspace: parseOptionalWorkspaceConfig(record.workspace, `${configPath}.workspace`),
...(maxSteps === undefined ? {} : { maxSteps }),
...(maxParallel === undefined ? {} : { maxParallel }),
...(retryFailed === undefined ? {} : { retryFailed }),
tags: parseOptionalStringArray(record.tags, `${configPath}.tags`),
};
}
Expand Down
11 changes: 10 additions & 1 deletion src/domain/result.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { RunnerInfo } from "./runner.js";
import type { SessionReport } from "./session-report.js";

export interface RunnerResult {
interface BaseRunnerResult {
runner: RunnerInfo;
passed: boolean;
status: RunnerResultStatus;
Expand All @@ -15,6 +15,15 @@ export interface RunnerResult {
failureLogPath?: string;
}

/**
 * Result of a single attempt at a case x runner execution.
 *
 * Carries every field of `BaseRunnerResult` plus a mandatory `attempt` number.
 */
export interface RunnerAttemptResult extends BaseRunnerResult {
// Identifies which attempt produced this result.
// NOTE(review): whether numbering starts at 0 or 1 is not visible here — confirm against the scheduler.
attempt: number;
}

/**
 * Result of a case x runner execution as handed to reporters.
 *
 * Both retry-related fields are optional, so a result that carries no attempt
 * information is still a valid `RunnerResult`.
 */
export interface RunnerResult extends BaseRunnerResult {
// Attempt number associated with this result.
// NOTE(review): presumably the attempt that produced the final outcome — confirm.
attempt?: number;
// Per-attempt results. Reporters derive the retry count from this list's length (length - 1, floored at 0).
attempts?: RunnerAttemptResult[];
}

export interface FailureClass {
id: string;
label?: string;
Expand Down
1 change: 1 addition & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export type {
} from "./domain/test-case.js";
export type {
FailureClass,
RunnerAttemptResult,
RunnerFailureOrigin,
RunnerFailureType,
RunnerResult,
Expand Down
4 changes: 4 additions & 0 deletions src/reporters/contract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ export interface RunnerStartEvent {
context: ReporterContext;
testCase: TestCase;
runner: RunnerInfo;
attempt?: number;
maxAttempts?: number;
caseIndex: number;
totalCases: number;
}
Expand All @@ -47,6 +49,8 @@ export interface RunnerFinishEvent {
testCase: TestCase;
runner: RunnerInfo;
result: RunnerResult;
attempt?: number;
maxAttempts?: number;
caseIndex: number;
totalCases: number;
}
Expand Down
27 changes: 26 additions & 1 deletion src/reporters/github-actions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ function formatAnnotationCommand(caseId: string, result: RunnerResult): string {

function formatAnnotationMessage(result: RunnerResult): string {
const lines = [`failure type: ${result.failureType ?? "unknown"}`];
const retryCount = countRetries(result);

if (retryCount > 0) {
lines.push(`retries: ${String(retryCount)}`);
}

if (result.failureOrigin !== undefined) {
lines.push(`failure origin: ${result.failureOrigin}`);
Expand Down Expand Up @@ -147,7 +152,8 @@ function formatRunnerAgentLabel(runner: RunnerSummary["runner"]): string {
/**
 * Renders one markdown table row for a case executed by a runner.
 *
 * The first cell holds the pass/fail icon, the case id in backticks and, when
 * the result was retried, the retry label. The remaining cells are the
 * duration followed by the input, output, reasoning, cache and total token
 * counts.
 *
 * @param caseId - Id of the test case the row describes.
 * @param result - Runner result supplying status, duration, usage and attempts.
 * @returns The row text, starting and ending with a `|` delimiter.
 */
function formatRunnerCaseRow(caseId: string, result: RunnerResult): string {
  const status = result.passed ? "✅" : "❌";
  const usage = result.report.usage;
  // The stale early return that used to sit here made the retry label unreachable.
  // `formatRetryLabel` yields undefined when there were no retries; the cell then stays as before.
  const retryLabel = formatRetryLabel(result);
  return `| ${status} \`${caseId}\`${retryLabel === undefined ? "" : ` ${retryLabel}`} | ${formatDuration(result.durationMs)} | ${formatTokens(usage.inputTokens)} | ${formatTokens(usage.outputTokens)} | ${formatTokens(usage.reasoningTokens)} | ${formatTokens(usage.cacheTokens)} | ${formatTokens(usage.totalTokens)} |`;
}

function getRunnerCases(
Expand All @@ -167,6 +173,12 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string
`artifacts: \`${result.artifactDir}\``,
];

const retryCount = countRetries(result);

if (retryCount > 0) {
segments.splice(2, 0, `retries: ${String(retryCount)}`);
}

if (result.failureClass !== undefined) {
segments.splice(2, 0, `class: \`${result.failureClass.id}\``);
}
Expand All @@ -182,6 +194,19 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string
return segments.join("; ");
}

/**
 * Builds the parenthesised retry suffix for a result.
 *
 * @param result - Runner result whose retries are counted.
 * @returns `"(1 retry)"`, `"(N retries)"`, or `undefined` when the result was not retried.
 */
function formatRetryLabel(result: RunnerResult): string | undefined {
  const retries = countRetries(result);

  switch (retries) {
    case 0:
      return undefined;
    case 1:
      return "(1 retry)";
    default:
      return `(${String(retries)} retries)`;
  }
}

/**
 * Counts the retries recorded on a result: one less than the number of
 * attempts, never negative. A result without an `attempts` list is treated
 * as a single attempt, i.e. zero retries.
 */
function countRetries(result: RunnerResult): number {
  const attemptCount = result.attempts?.length ?? 1;
  const retries = attemptCount - 1;
  return retries > 0 ? retries : 0;
}

function listFailures(result: SuiteRunResult): Array<{ caseId: string; result: RunnerResult }> {
const failures: Array<{ caseId: string; result: RunnerResult }> = [];

Expand Down
59 changes: 59 additions & 0 deletions src/reporters/json-summary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@ interface SummaryError {
/**
 * Summary shape of one `RunnerResult` as produced by `summarizeRunnerResult`.
 *
 * The failure-related fields and `attempts` are only set when the source
 * result defines them.
 */
interface SummaryRunnerResult {
runner: RunnerResult["runner"];
passed: boolean;
status: RunnerResult["status"];
// Copied from the result's own `attempt` field, which may be absent.
attempt?: number;
// Number of attempts beyond the first; 0 when the result lists no attempts.
retryCount: number;
durationMs: number;
artifactDir: string;
// Token usage taken from the result's report.
usage: RunnerResult["report"]["usage"];
// One summarized entry per element of the result's `attempts` list.
attempts?: SummaryAttemptResult[];
// Name and message of the result's error.
error?: SummaryError;
failureType?: RunnerResult["failureType"];
failureOrigin?: RunnerResult["failureOrigin"];
failureClass?: FailureClass;
}

interface SummaryAttemptResult {
passed: boolean;
status: RunnerResult["status"];
attempt: number;
durationMs: number;
artifactDir: string;
usage: RunnerResult["report"]["usage"];
Expand Down Expand Up @@ -46,6 +63,48 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult {
const summary: SummaryRunnerResult = {
runner: result.runner,
passed: result.passed,
status: result.status,
attempt: result.attempt,
retryCount: countRetries(result),
durationMs: result.durationMs,
artifactDir: result.artifactDir,
usage: result.report.usage,
};

if (result.attempts !== undefined) {
summary.attempts = result.attempts.map(summarizeAttemptResult);
}

if (result.error !== undefined) {
summary.error = { name: result.error.name, message: result.error.message };
}

if (result.failureType !== undefined) {
summary.failureType = result.failureType;
}

if (result.failureOrigin !== undefined) {
summary.failureOrigin = result.failureOrigin;
}

if (result.failureClass !== undefined) {
summary.failureClass = result.failureClass;
}

return summary;
}

/**
 * Returns how many attempts beyond the first a result records.
 *
 * A missing, empty, or single-element `attempts` list yields 0.
 */
function countRetries(result: RunnerResult): number {
  const { attempts } = result;

  if (attempts == null || attempts.length <= 1) {
    return 0;
  }

  return attempts.length - 1;
}

function summarizeAttemptResult(
result: NonNullable<RunnerResult["attempts"]>[number],
): SummaryAttemptResult {
const summary: SummaryAttemptResult = {
passed: result.passed,
status: result.status,
attempt: result.attempt,
durationMs: result.durationMs,
artifactDir: result.artifactDir,
usage: result.report.usage,
Expand Down
Loading
Loading