From a3a5bd21bd159ff9c7b4521cc162774e4e963fc3 Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Mon, 4 May 2026 11:15:20 +0200 Subject: [PATCH 1/2] add structured failure classification --- src/assertions/assert.ts | 9 ++ src/assertions/index.ts | 1 + src/assertions/types.ts | 6 ++ src/domain/result.ts | 6 ++ src/domain/test-case.ts | 2 + src/failure-classification.ts | 78 ++++++++++++++++ src/index.ts | 1 + src/reporters/github-actions.ts | 8 ++ src/reporters/json-summary.ts | 7 +- src/reporters/standard.ts | 63 +++++++++++++ src/runner/execute-runner.ts | 3 + src/runner/execute-suite.ts | 40 ++++++-- src/runner/workspace.ts | 7 ++ test/reporters/json-summary.test.ts | 8 ++ test/reporters/standard.test.ts | 104 +++++++++++++++++++++ test/runner/execute-runner.test.ts | 35 +++++++ test/runner/execute-suite.reporter.test.ts | 39 ++++++++ 17 files changed, 409 insertions(+), 8 deletions(-) create mode 100644 src/failure-classification.ts diff --git a/src/assertions/assert.ts b/src/assertions/assert.ts index adaf77f..c4b2bec 100644 --- a/src/assertions/assert.ts +++ b/src/assertions/assert.ts @@ -1,4 +1,5 @@ import nodeAssert from "node:assert/strict"; +import { attachFailureClass, type FailureClassInput } from "../failure-classification.js"; import { commandAssertions } from "./commands.js"; import { fileReadAssertions } from "./file-reads.js"; import { outputAssertions } from "./output.js"; @@ -7,6 +8,14 @@ import { toolCallAssertions } from "./tool-calls.js"; import type { SkillGymAssert } from "./types.js"; export const assert: SkillGymAssert = Object.assign(nodeAssert, { + classify(failureClass: FailureClassInput, callback: () => T): T { + try { + return callback(); + } catch (error) { + attachFailureClass(error, failureClass); + throw error; + } + }, skills: skillAssertions, commands: commandAssertions, fileReads: fileReadAssertions, diff --git a/src/assertions/index.ts b/src/assertions/index.ts index 58b462f..4df11d3 100644 --- a/src/assertions/index.ts +++ b/src/assertions/index.ts @@ -1,6 +1,7 @@ export { assert } from "./assert.js"; export { CommandMatcherBuilder, commandMatcher } from "./command-matcher.js"; export type { + AssertionClassifier, AssertionOptions, CommandMatcher, CommandMatcherBuilderLike, diff --git a/src/assertions/types.ts b/src/assertions/types.ts index 28eadf4..ec3cf73 100644 --- a/src/assertions/types.ts +++ b/src/assertions/types.ts @@ -1,4 +1,5 @@ import type nodeAssert from "node:assert/strict"; +import type { FailureClass } from "../domain/result.js"; import type { SessionReport, ToolCallEvent } from "../domain/session-report.js"; export type Matcher = string | RegExp; @@ -154,10 +155,15 @@ export interface OutputAssertions { notEmpty(report: SessionReport, options?: AssertionOptions): void; } +export interface AssertionClassifier { + (failureClass: string | FailureClass, callback: () => T): T; +} + export type SkillGymAssert = typeof nodeAssert & { skills: SkillAssertions; commands: CommandAssertions; fileReads: FileReadAssertions; toolCalls: ToolCallAssertions; output: OutputAssertions; + classify: AssertionClassifier; }; diff --git a/src/domain/result.ts b/src/domain/result.ts index c90a8a4..3d13a53 100644 --- a/src/domain/result.ts +++ b/src/domain/result.ts @@ -11,9 +11,15 @@ export interface RunnerResult { error?: SerializedError; failureType?: RunnerFailureType; failureOrigin?: RunnerFailureOrigin; + failureClass?: FailureClass; failureLogPath?: string; } +export interface FailureClass { + id: string; + label?: string; +} + export type RunnerResultStatus = "passed" | "failed" | "expected-failed" | "unexpected-passed"; export interface CaseResult { diff --git a/src/domain/test-case.ts b/src/domain/test-case.ts index 96b03bf..4816f4b 100644 --- a/src/domain/test-case.ts +++ b/src/domain/test-case.ts @@ -1,3 +1,4 @@ +import type { FailureClass, RunnerResult } from "./result.js"; import type { SessionEvent, SessionReport, SkillDetection } from "./session-report.js"; export interface WorkspaceBootstrapConfig { @@ -27,6 +28,7 @@ export interface TestCase { tags?: string[]; timeoutMs?: number; expectedFail?: boolean; + classifyFailure?(result: RunnerResult): FailureClass | string | undefined; assert(report: SessionReport, ctx: AssertionContext): void | Promise; } diff --git a/src/failure-classification.ts b/src/failure-classification.ts new file mode 100644 index 0000000..861f17f --- /dev/null +++ b/src/failure-classification.ts @@ -0,0 +1,78 @@ +import type { FailureClass, RunnerFailureOrigin, RunnerFailureType } from "./domain/result.js"; + +const FAILURE_CLASS_SYMBOL = Symbol.for("skillgym.failureClass"); + +export type FailureClassInput = string | FailureClass; + +type ErrorWithFailureClass = Error & { + [FAILURE_CLASS_SYMBOL]?: FailureClass; +}; + +export function normalizeFailureClass(input: FailureClassInput): FailureClass { + if (typeof input === "string") { + return { id: input }; + } + + return input.label === undefined ? { id: input.id } : { id: input.id, label: input.label }; +} + +export function attachFailureClass(error: unknown, input: FailureClassInput): void { + if (!(error instanceof Error)) { + return; + } + + (error as ErrorWithFailureClass)[FAILURE_CLASS_SYMBOL] = normalizeFailureClass(input); +} + +export function getAttachedFailureClass(error: unknown): FailureClass | undefined { + if (!(error instanceof Error)) { + return undefined; + } + + return (error as ErrorWithFailureClass)[FAILURE_CLASS_SYMBOL]; +} + +export function resolveFailureClass(options: { + failureClass?: FailureClassInput; + failureType?: RunnerFailureType; + failureOrigin?: RunnerFailureOrigin; +}): FailureClass | undefined { + if (options.failureClass !== undefined) { + return normalizeFailureClass(options.failureClass); + } + + if (options.failureType === "assertion") { + return { id: "assertion", label: "Assertion failure" }; + } + + if (options.failureType === "timeout") { + return { id: "timeout", label: "Timeout" }; + } + + switch (options.failureOrigin) { + case "assert-hook": + return { id: "assert-hook", label: "Assert hook crash" }; + case "max-steps": + return { id: "max-steps", label: "Max steps exceeded" }; + case "model-rejected": + return { id: "model-rejected", label: "Rejected model" }; + case "workspace-bootstrap": + return { id: "workspace-bootstrap", label: "Workspace bootstrap" }; + case "workspace-setup": + return { id: "workspace-setup", label: "Workspace setup" }; + case "collection": + return { id: "collection", label: "Artifact collection" }; + case "normalization": + return { id: "normalization", label: "Report normalization" }; + case "snapshot": + return { id: "snapshot", label: "Snapshot verification" }; + case "runner": + return { id: "runner-crash", label: "Runner crash" }; + case "assertion": + return { id: "assertion", label: "Assertion failure" }; + case undefined: + return options.failureType === "runner-crash" + ? { id: "runner-crash", label: "Runner crash" } + : undefined; + } +} diff --git a/src/index.ts b/src/index.ts index 8a95ce4..5d0108f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,7 @@ export type { WorkspaceBootstrapConfig, } from "./domain/test-case.js"; export type { + FailureClass, RunnerFailureOrigin, RunnerFailureType, RunnerResult, diff --git a/src/reporters/github-actions.ts b/src/reporters/github-actions.ts index 2348ef7..2ada8d4 100644 --- a/src/reporters/github-actions.ts +++ b/src/reporters/github-actions.ts @@ -54,6 +54,10 @@ function formatAnnotationMessage(result: RunnerResult): string { lines.push(`failure origin: ${result.failureOrigin}`); } + if (result.failureClass !== undefined) { + lines.push(`failure class: ${result.failureClass.id}`); + } + if (result.error !== undefined) { lines.push(`error: ${result.error.name}: ${result.error.message}`); } @@ -163,6 +167,10 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string `artifacts: \`${result.artifactDir}\``, ]; + if (result.failureClass !== undefined) { + segments.splice(2, 0, `class: \`${result.failureClass.id}\``); + } + if (result.failureLogPath !== undefined) { segments.push(`log: \`${result.failureLogPath}\``); } diff --git a/src/reporters/json-summary.ts b/src/reporters/json-summary.ts index 40d8906..aa6f86a 100644 --- a/src/reporters/json-summary.ts +++ b/src/reporters/json-summary.ts @@ -1,5 +1,5 @@ import process from "node:process"; -import type { CaseResult, RunnerResult, SuiteRunResult } from "../domain/result.js"; +import type { CaseResult, FailureClass, RunnerResult, SuiteRunResult } from "../domain/result.js"; import type { BenchmarkReporter } from "./contract.js"; interface JsonSummaryReporterOptions { @@ -20,6 +20,7 @@ interface SummaryRunnerResult { error?: SummaryError; failureType?: RunnerResult["failureType"]; failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: FailureClass; } interface SummaryCaseResult { @@ -62,6 +63,10 @@ function summarizeRunnerResult(result: RunnerResult): SummaryRunnerResult { summary.failureOrigin = result.failureOrigin; } + if (result.failureClass !== undefined) { + summary.failureClass = result.failureClass; + } + return summary; } diff --git a/src/reporters/standard.ts b/src/reporters/standard.ts index e578682..cf2a7a1 100644 --- a/src/reporters/standard.ts +++ b/src/reporters/standard.ts @@ -5,6 +5,7 @@ import { printBanner } from "../cli/branding.js"; import pc from "picocolors"; import type { CaseResult, + FailureClass, RunnerFailureOrigin, RunnerFailureType, RunnerResult, @@ -31,10 +32,16 @@ interface FailureEntry { error?: SerializedError; failureType?: RunnerFailureType; failureOrigin?: RunnerFailureOrigin; + failureClass?: FailureClass; failureLogPath?: string; status: RunnerResult["status"]; } +interface FailureGroup { + failureClass: FailureClass; + failures: FailureEntry[]; +} + interface StandardReporterOptions { stdout?: Pick; isInteractive?: boolean; @@ -167,6 +174,7 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B error: event.result.error, failureType: event.result.failureType, failureOrigin: event.result.failureOrigin, + failureClass: event.result.failureClass, failureLogPath: event.result.failureLogPath, status: event.result.status, }); @@ -200,6 +208,15 @@ export function createStandardReporter(options: StandardReporterOptions = {}): B } if (failures.length > 0) { + writeLine("", stdout); + writeLine(colors.bold("Failure Classes"), stdout); + writeLine("", stdout); + + for (const group of groupFailures(failures)) { + writeLine(formatFailureGroup(group, colors), stdout); + writeLine("", stdout); + } + writeLine("", stdout); writeLine(colors.bold("Failures"), stdout); writeLine("", stdout); @@ -368,6 +385,52 @@ function formatFailureBlock( return lines.join("\n"); } +function formatFailureGroup( + group: FailureGroup, + colors: ReturnType, +): string { + const lines = [ + `${colors.bold(formatFailureClassLabel(group.failureClass))}${colors.dim(` (${group.failures.length})`)}`, + ]; + + for (const failure of group.failures) { + lines.push(colors.dim(`- ${failure.caseId} > ${failure.runner.id}: ${failure.artifactDir}`)); + } + + return lines.join("\n"); +} + +function groupFailures(failures: FailureEntry[]): FailureGroup[] { + const groups = new Map(); + + for (const failure of failures) { + const failureClass = + failure.failureClass ?? + (failure.status === "unexpected-passed" + ? { id: "unexpected-passed", label: "Unexpected pass" } + : { id: "unknown", label: "Unclassified" }); + const key = failureClass.id; + const existing = groups.get(key); + + if (existing === undefined) { + groups.set(key, { failureClass, failures: [failure] }); + continue; + } + + existing.failures.push(failure); + } + + return Array.from(groups.values()); +} + +function formatFailureClassLabel(failureClass: FailureClass): string { + if (failureClass.label === undefined || failureClass.label === failureClass.id) { + return failureClass.id; + } + + return `${failureClass.label} [${failureClass.id}]`; +} + function formatErrorLocation(error: SerializedError): string | undefined { const location = extractUserStackFrame(error); return location === undefined ? undefined : formatStackFrameLocation(location); diff --git a/src/runner/execute-runner.ts b/src/runner/execute-runner.ts index 04523cc..4c84df2 100644 --- a/src/runner/execute-runner.ts +++ b/src/runner/execute-runner.ts @@ -6,6 +6,7 @@ import type { RunnerInfo } from "../domain/runner.js"; import type { SessionReport } from "../domain/session-report.js"; import type { TestCase } from "../domain/test-case.js"; import { createAssertionContext } from "../assertions/context.js"; +import { getAttachedFailureClass } from "../failure-classification.js"; import type { SnapshotRuntimeOptions, SnapshotStore } from "../snapshots/store.js"; import { ensureDir, writeJson } from "../utils/fs.js"; import { isCommandTimeoutError, isMaxStepsExceededError } from "../utils/process.js"; @@ -106,6 +107,7 @@ export async function executeRunner( durationMs: Date.now() - startedMs, failureType: isAssertionFailure ? "assertion" : "runner-crash", failureOrigin: isAssertionFailure ? "assertion" : "assert-hook", + failureClass: getAttachedFailureClass(error), report, }); } @@ -172,6 +174,7 @@ async function writeAndReturnFailure( durationMs: number; failureType?: RunnerResult["failureType"]; failureOrigin?: RunnerFailureOrigin; + failureClass?: RunnerResult["failureClass"]; failureLogPath?: string; report?: SessionReport; }, diff --git a/src/runner/execute-suite.ts b/src/runner/execute-suite.ts index bcc56e9..b60a7af 100644 --- a/src/runner/execute-suite.ts +++ b/src/runner/execute-suite.ts @@ -7,6 +7,7 @@ import type { ResolvedRunner, RunnerConfig, RunnerInfo } from "../domain/runner. import type { ScheduleMode } from "../domain/schedule.js"; import type { SuiteWorkspaceConfig, TestCase } from "../domain/test-case.js"; import { getAdapter } from "../adapters/index.js"; +import { normalizeFailureClass } from "../failure-classification.js"; import type { BenchmarkReporter, ReporterContext } from "../reporters/contract.js"; import { SnapshotStore, type SnapshotRuntimeOptions } from "../snapshots/store.js"; import { ensureDir, writeJson } from "../utils/fs.js"; @@ -248,35 +249,60 @@ export async function executeSuite( } export function classifyExpectedFailure(testCase: TestCase, result: RunnerResult): RunnerResult { + const classifiedResult = applyTestCaseFailureClass(testCase, result); + if (testCase.expectedFail !== true) { return { - ...result, - status: result.passed ? "passed" : "failed", + ...classifiedResult, + status: classifiedResult.passed ? "passed" : "failed", }; } - if (result.passed) { + if (classifiedResult.passed) { return { - ...result, + ...classifiedResult, passed: false, status: "unexpected-passed", + failureClass: classifiedResult.failureClass ?? { + id: "unexpected-passed", + label: "Unexpected pass", + }, }; } - if (result.failureType === "assertion" && result.failureOrigin === "assertion") { + if ( + classifiedResult.failureType === "assertion" && + classifiedResult.failureOrigin === "assertion" + ) { return { - ...result, + ...classifiedResult, passed: true, status: "expected-failed", }; } return { - ...result, + ...classifiedResult, status: "failed", }; } +function applyTestCaseFailureClass(testCase: TestCase, result: RunnerResult): RunnerResult { + if (result.passed) { + return result; + } + + const failureClass = testCase.classifyFailure?.(result); + if (failureClass === undefined) { + return result; + } + + return { + ...result, + failureClass: normalizeFailureClass(failureClass), + }; +} + async function executePlannedExecution( item: PlannedSuiteExecution, options: { diff --git a/src/runner/workspace.ts b/src/runner/workspace.ts index fe89f66..83b6c78 100644 --- a/src/runner/workspace.ts +++ b/src/runner/workspace.ts @@ -8,6 +8,7 @@ import type { TestCase, WorkspaceBootstrapConfig, } from "../domain/test-case.js"; +import { resolveFailureClass, type FailureClassInput } from "../failure-classification.js"; import { serializeError } from "../utils/error.js"; import { copyDir, @@ -235,6 +236,7 @@ export function createExecutionFailureResult( durationMs: number; failureType?: RunnerFailureType; failureOrigin?: RunnerFailureOrigin; + failureClass?: FailureClassInput; failureLogPath?: string; report?: SessionReport; }, @@ -278,6 +280,11 @@ export function createExecutionFailureResult( error: serializedError, failureType: options.failureType ?? "runner-crash", failureOrigin: options.failureOrigin, + failureClass: resolveFailureClass({ + failureClass: options.failureClass, + failureType: options.failureType ?? "runner-crash", + failureOrigin: options.failureOrigin, + }), failureLogPath: options.failureLogPath, }; } diff --git a/test/reporters/json-summary.test.ts b/test/reporters/json-summary.test.ts index a76b9fd..f95d15e 100644 --- a/test/reporters/json-summary.test.ts +++ b/test/reporters/json-summary.test.ts @@ -37,6 +37,10 @@ test("json-summary reporter omits session internals and prints summary on suite artifactDir: ".skillgym-results/run-1/case-a/open-main", failureType: "assertion", failureOrigin: "assertion", + failureClass: { + id: "missing-flag", + label: "Missing required flag", + }, failureLogPath: ".skillgym-results/run-1/case-a/open-main/stderr.log", error: { name: "AssertionError", @@ -156,6 +160,10 @@ test("json-summary reporter omits session internals and prints summary on suite expect(runnerResult.artifactDir).toBe(".skillgym-results/run-1/case-a/open-main"); expect(runnerResult.failureType).toBe("assertion"); expect(runnerResult.failureOrigin).toBe("assertion"); + expect(runnerResult.failureClass).toEqual({ + id: "missing-flag", + label: "Missing required flag", + }); // error: name and message preserved, stack omitted expect(runnerResult.error.name).toBe("AssertionError"); diff --git a/test/reporters/standard.test.ts b/test/reporters/standard.test.ts index 2083147..013ac3e 100644 --- a/test/reporters/standard.test.ts +++ b/test/reporters/standard.test.ts @@ -162,6 +162,9 @@ test("standard reporter prints runner-grouped results and failure artifacts", as expect(output).toContain("Tokens 9,830 / 1,104 / 0 / 7,233 / 15,201"); expect(output).toContain("Output .skillgym-results/run-1"); expect(output).toContain("Failures"); + expect(output).toContain("Failure Classes"); + expect(output).toContain("Assertion failure [assertion] (1)"); + expect(output).toContain("- case-a > code-main: .skillgym-results/run-1/case-a/code-main"); expect(output).toContain("✗ case-a > code-main (codex, gpt-5.4)"); expect(output).toContain("AssertionError: expected skill to be loaded before command execution"); expect(output).toContain("at /workspace/examples/basic-suite.ts:14:15"); @@ -919,6 +922,102 @@ test("standard reporter formats model-rejected failures as runner crashes with s ); }); +test("standard reporter groups failures by custom failure class", async () => { + const writes: string[] = []; + const reporter = createStandardReporter({ + stdout: { + isTTY: false, + columns: 120, + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + isInteractive: false, + isUnicode: true, + }); + + const runner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" }); + const context = { + isInteractive: false, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: "examples/basic-suite.ts", + outputDir: ".skillgym-results/run-1", + selectedCaseCount: 2, + selectedRunnerCount: 1, + selectedExecutionCount: 2, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + const suiteResult: SuiteRunResult = { + suitePath: context.suitePath, + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:01:42.000Z", + durationMs: 102_000, + outputDir: context.outputDir, + declaredTags: [], + selectedTags: [], + cases: [ + createCaseResult({ + caseId: "case-a", + runnerResults: [ + { + ...createRunnerResult({ + runner, + passed: false, + artifactDir: ".skillgym-results/run-1/case-a/open-main", + totalTokens: 12_000, + }), + failureClass: { id: "wrong-cli-alias", label: "Wrong CLI alias" }, + }, + ], + }), + createCaseResult({ + caseId: "case-b", + runnerResults: [ + { + ...createRunnerResult({ + runner, + passed: false, + artifactDir: ".skillgym-results/run-1/case-b/open-main", + totalTokens: 12_000, + }), + failureClass: { id: "wrong-cli-alias", label: "Wrong CLI alias" }, + }, + ], + }), + ], + runners: [ + createRunnerSummary({ + runner, + passedCases: 0, + totalCases: 2, + averageDurationMs: 24_800, + averageTotalTokens: 12_000, + }), + ], + }; + + for (const caseResult of suiteResult.cases) { + await reporter.onRunnerFinish?.({ + context, + testCase: { id: caseResult.caseId, prompt: "", assert() {} }, + runner, + result: caseResult.runnerResults[0]!, + caseIndex: 1, + totalCases: 2, + }); + } + await reporter.onSuiteFinish?.({ context, result: suiteResult }); + + const output = writes.join(""); + expect(output).toContain("Wrong CLI alias [wrong-cli-alias] (2)"); + expect(output).toContain("- case-a > open-main: .skillgym-results/run-1/case-a/open-main"); + expect(output).toContain("- case-b > open-main: .skillgym-results/run-1/case-b/open-main"); +}); + function createCaseResult(options: { caseId: string; runnerResults: RunnerResult[] }): CaseResult { return { caseId: options.caseId, @@ -934,6 +1033,7 @@ function createRunnerResult(options: { status?: RunnerResult["status"]; artifactDir: string; totalTokens: number; + failureClass?: RunnerResult["failureClass"]; }): RunnerResult { return { runner: options.runner, @@ -957,6 +1057,10 @@ function createRunnerResult(options: { failureType: options.passed || options.status === "unexpected-passed" ? undefined : "assertion", failureOrigin: options.passed || options.status === "unexpected-passed" ? undefined : "assertion", + failureClass: + options.passed || options.status === "unexpected-passed" + ? undefined + : (options.failureClass ?? { id: "assertion", label: "Assertion failure" }), report: createSessionReport({ runner: options.runner, usage: { diff --git a/test/runner/execute-runner.test.ts b/test/runner/execute-runner.test.ts index b0aa6a7..92c1a2a 100644 --- a/test/runner/execute-runner.test.ts +++ b/test/runner/execute-runner.test.ts @@ -3,6 +3,7 @@ import os from "node:os"; import path from "node:path"; import { AssertionError } from "node:assert"; import { afterEach, expect, test } from "vitest"; +import { assert } from "../../src/assertions/assert.js"; import type { RawRunArtifacts, RunHandle, @@ -149,10 +150,44 @@ test("executeRunner marks AssertionError failures separately from runner crashes expect(result.passed).toBe(false); expect(result.failureType).toBe("assertion"); expect(result.failureOrigin).toBe("assertion"); + expect(result.failureClass).toEqual({ id: "assertion", label: "Assertion failure" }); expect(result.error?.message).toBe("expected a skill read"); expect(result.report.usage.totalTokens).toBe(120); }); +test("executeRunner preserves assertion failure classes attached with assert.classify", async () => { + const outputDir = await createTempDir(); + const runner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" }); + const adapter = createSuccessfulAdapter(runner, { totalTokens: 120 }); + + const result = await executeRunner( + { + id: "alpha", + prompt: "prompt", + assert(report) { + assert.classify({ id: "missing-flag", label: "Missing required flag" }, () => { + assert.output.includes(report, /--json/, { + message: "expected the agent to pass --json", + }); + }); + }, + }, + runner, + adapter, + { + cwd: outputDir, + artifactDir: path.join(outputDir, "alpha", runner.pathKey), + timeoutMs: 5_000, + }, + ); + + expect(result.passed).toBe(false); + expect(result.failureType).toBe("assertion"); + expect(result.failureOrigin).toBe("assertion"); + expect(result.failureClass).toEqual({ id: "missing-flag", label: "Missing required flag" }); + expect(result.error?.message).toContain("expected the agent to pass --json"); +}); + test("executeRunner treats non-AssertionError exceptions from assert as run failures", async () => { const outputDir = await createTempDir(); const runner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" }); diff --git a/test/runner/execute-suite.reporter.test.ts b/test/runner/execute-suite.reporter.test.ts index 137e1c6..fc40e9f 100644 --- a/test/runner/execute-suite.reporter.test.ts +++ b/test/runner/execute-suite.reporter.test.ts @@ -626,6 +626,41 @@ test("classifyExpectedFailure maps raw outcomes to expectation-aware statuses", ).toMatchObject({ passed: false, status: "failed" }); }); +test("classifyExpectedFailure applies custom failure classification hooks", async () => { + const outputDir = await createTempDir(); + const runner = createRunnerInfo("open", { type: "opencode", model: "openai/gpt-5" }); + const testCase: TestCase = { + id: "expected", + prompt: "a", + classifyFailure(result) { + return result.error?.message.includes("alias") + ? { id: "wrong-cli-alias", label: "Wrong CLI alias" } + : undefined; + }, + assert() {}, + }; + + const result = classifyExpectedFailure(testCase, { + ...createRunnerResult({ + caseId: "expected", + runner, + passed: false, + durationMs: 10, + artifactDir: path.join(outputDir, "expected", runner.pathKey), + totalTokens: 100, + outputTokens: 20, + observedReads: 1, + }), + error: { + name: "AssertionError", + message: "wrong cli alias used", + }, + }); + + expect(result.failureClass).toEqual({ id: "wrong-cli-alias", label: "Wrong CLI alias" }); + expect(result.status).toBe("failed"); +}); + test("executeSuite aggregates runner summaries from case-centric results", async () => { const outputDir = await createTempDir(); let suiteResult: SuiteRunResult | undefined; @@ -1025,6 +1060,7 @@ function createRunnerResult(options: { observedReads: number; failureType?: RunnerResult["failureType"]; failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: RunnerResult["failureClass"]; }): RunnerResult { const inputTokens = options.totalTokens === undefined @@ -1045,6 +1081,9 @@ function createRunnerResult(options: { }, failureType: options.passed ? undefined : (options.failureType ?? "assertion"), failureOrigin: options.passed ? undefined : (options.failureOrigin ?? "assertion"), + failureClass: options.passed + ? undefined + : (options.failureClass ?? { id: "assertion", label: "Assertion failure" }), report: createSessionReport({ runner: options.runner, usage: { From 40bce45576a3f8df0cae1f739dcb53856e4d0bdf Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Mon, 4 May 2026 11:43:10 +0200 Subject: [PATCH 2/2] document failure classification usage --- README.md | 6 +++ docs/assertions.md | 22 +++++++++++ docs/reporters.md | 5 ++- docs/test-cases.md | 45 ++++++++++++++++++++++ examples/failure-classification-suite.ts | 48 ++++++++++++++++++++++++ 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 examples/failure-classification-suite.ts diff --git a/README.md b/README.md index 80830d1..d86e518 100644 --- a/README.md +++ b/README.md @@ -254,6 +254,12 @@ The [workspace isolation suite](examples/workspace-isolation-suite.ts) demonstra npx skillgym run ./examples/workspace-isolation-suite.ts ``` +The [failure classification suite](examples/failure-classification-suite.ts) demonstrates `assert.classify(...)` and `classifyFailure(...)` so reporters can group related failures under one class: + +```bash +npx skillgym run ./examples/failure-classification-suite.ts +``` + ## Docs The documentation site is at [incubator.callstack.com/skillgym](https://incubator.callstack.com/skillgym/). Repository docs: diff --git a/docs/assertions.md b/docs/assertions.md index a92658d..f311284 100644 --- a/docs/assertions.md +++ b/docs/assertions.md @@ -4,6 +4,7 @@ - Node's `node:assert/strict` API - grouped helpers for normalized session reports +- `assert.classify(...)` for attaching structured failure classes to assertion failures ```ts import { assert } from "skillgym"; @@ -11,8 +12,29 @@ import { assert } from "skillgym"; assert.ok(true); assert.equal(1, 1); assert.match("skillgym ready", /ready/); +assert.classify("missing-flag", () => { + assert.match("--json", /--yaml/); +}); ``` +## Failure classification + +Use `assert.classify(...)` when you want an assertion failure to carry a stable structured class that reporters can group across runs. + +```ts +assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => { + assert.doesNotMatch(ctx.finalOutput(), /\bcursr\b/i, "wrong Cursor CLI alias in final output"); +}); +``` + +Rules: + +- `id` is the stable machine-readable key used for grouping +- `label` is optional and gives reporters a human-friendly display name +- passing a string such as `assert.classify("wrong-cli-alias", ...)` sets only the `id` +- if the callback does not throw, no failure class is recorded +- if the callback throws, the thrown error keeps the attached failure class through the runner and reporter pipeline + ## Report helper groups - `assert.skills.*` diff --git a/docs/reporters.md b/docs/reporters.md index fe41470..ccbeef1 100644 --- a/docs/reporters.md +++ b/docs/reporters.md @@ -107,6 +107,7 @@ The built-in `standard` reporter is optimized for polished CLI output. - Expected failures count as passed suite health, are labeled `expected failure`, and are excluded from the failure details section. - Unexpected passes count as failures and are labeled `unexpected pass` because the benchmark expectation may be stale. - Summary output includes expected-failure and unexpected-pass counts in addition to pass/fail totals. +- Failure summaries are grouped by structured `failureClass` when available, with grouped counts and per-run artifact paths. - Full stack traces are not shown by default. Reporter-visible token metrics on `RunnerSummary` include: @@ -133,6 +134,7 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout — - It ignores live progress hooks. - The output includes per-case and per-runner results with token usage, pass/fail status, artifact paths, and error details, but omits the full session events and raw artifacts. +- Per-runner results include `failureClass` when present so downstream tooling can keep grouped-failure semantics. - It is useful for post-run analysis steps or feeding results to an LLM. ## GitHub Actions reporter @@ -140,9 +142,10 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout — The built-in `github-actions` reporter is designed for GitHub CI. - Failed runs emit `::error` workflow command annotations, including file and line when a stack frame is available. +- GitHub annotations and job summaries include `failureClass.id` when present. - When `GITHUB_STEP_SUMMARY` is set, the reporter appends a Markdown job summary containing: - Suite metadata (path, case/run counts, duration, output dir) - A per-runner section with a table of all cases (pass/fail status, duration, and token columns: input, output, reasoning, cache, billable) - - A failures section listing up to 10 failures with error name/message, artifact dir, and log path + - A failures section listing up to 10 failures with error name/message, failure class, artifact dir, and log path - When `GITHUB_STEP_SUMMARY` is missing, summary writing is skipped. - PR comments stay out of scope for the reporter itself; add those in a separate CI step if needed. diff --git a/docs/test-cases.md b/docs/test-cases.md index e5863c5..430aba8 100644 --- a/docs/test-cases.md +++ b/docs/test-cases.md @@ -52,6 +52,7 @@ export interface TestCase { tags?: string[]; timeoutMs?: number; expectedFail?: boolean; + classifyFailure?(result: RunnerResult): FailureClass | string | undefined; assert(report: SessionReport, ctx: AssertionContext): void | Promise; } ``` @@ -63,6 +64,7 @@ Field meanings: - `tags`: optional labels for selecting cases with `--tag`; multiple selected tags use OR matching - `timeoutMs`: optional per-case timeout override - `expectedFail`: mark assertion failures as expected benchmark signal, not suite-health failures +- `classifyFailure(result)`: optional post-processing hook for assigning or overriding structured failure classes - `assert(report, ctx)`: pass or fail logic for that execution `TestCase` does not include runner selection. Each case runs against the selected configured runners. @@ -124,6 +126,49 @@ const suite: TestCase[] = [ See `assertions.md` for the full assertion reference. +## Failure classification hooks + +Use failure classification when you want to group multiple failing runs under one shared cause, such as a pseudo command, wrong CLI alias, missing required flag, or wrong command family. + +There are two integration points: + +- `assert.classify(...)` attaches a failure class directly where an assertion is made +- `classifyFailure(result)` lets the test case assign or override the final class after the run result is available + +Example: + +```ts +import { assert, type TestCase } from "skillgym"; + +const suite: TestCase[] = [ + { + id: "cursor-alias-check", + prompt: 'Say you would run: cursr agent "open README.md".', + classifyFailure(result) { + return result.error?.message.includes("wrong Cursor CLI alias") + ? { id: "wrong-cli-alias", label: "Wrong CLI alias" } + : undefined; + }, + assert(_report, ctx) { + assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => { + assert.doesNotMatch( + ctx.finalOutput(), + /\bcursr\s+agent\b/i, + "wrong Cursor CLI alias in final output", + ); + }); + }, + }, +]; +``` + +Notes: + +- `assert.classify(...)` is the smallest way to tag a single assertion failure +- `classifyFailure(result)` is useful when several different assertion messages should collapse into one shared class +- if both are used, `classifyFailure(result)` runs later and can override the attached class +- built-in infrastructure failures still receive default classes such as `Assertion failure`, `Timeout`, `Runner crash`, or `Max steps exceeded` + ## Expected failures Use `expectedFail: true` for benchmark cases that intentionally capture a known model or agent gap. Expected failures only apply to assertion failures. Runner crashes, timeouts, workspace setup failures, collection failures, normalization failures, snapshot failures, and `run.maxSteps` failures still fail suite health because they indicate infrastructure or benchmark integrity problems. diff --git a/examples/failure-classification-suite.ts b/examples/failure-classification-suite.ts new file mode 100644 index 0000000..fca3792 --- /dev/null +++ b/examples/failure-classification-suite.ts @@ -0,0 +1,48 @@ +import { assert, type TestCase } from "skillgym"; + +const suite: TestCase[] = [ + { + id: "wrong-cli-alias-echo", + prompt: [ + "Reply with exactly: I will use cursr agent for this task.", + "Do not mention the correct CLI alias.", + ].join(" "), + classifyFailure(result) { + return result.error?.message.includes("wrong Cursor CLI alias") + ? { id: "wrong-cli-alias", label: "Wrong CLI alias" } + : undefined; + }, + async assert(_report, ctx) { + assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => { + assert.doesNotMatch( + ctx.finalOutput(), + /\bcursr\b/i, + "wrong Cursor CLI alias in final output", + ); + }); + }, + }, + { + id: "wrong-cli-alias-command", + prompt: [ + 'Say you would run this command: cursr agent "open README.md".', + "Do not correct the alias.", + ].join(" "), + classifyFailure(result) { + return result.error?.message.includes("wrong Cursor CLI alias") + ? { id: "wrong-cli-alias", label: "Wrong CLI alias" } + : undefined; + }, + async assert(_report, ctx) { + assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => { + assert.doesNotMatch( + ctx.finalOutput(), + /\bcursr\s+agent\b/i, + "wrong Cursor CLI alias in final output", + ); + }); + }, + }, +]; + +export default suite;