6 changes: 6 additions & 0 deletions README.md
@@ -261,6 +261,12 @@ The [workspace isolation suite](examples/workspace-isolation-suite.ts) demonstra
npx skillgym run ./examples/workspace-isolation-suite.ts
```

The [failure classification suite](examples/failure-classification-suite.ts) demonstrates `assert.classify(...)` and `classifyFailure(...)` so reporters can group related failures under one class:

```bash
npx skillgym run ./examples/failure-classification-suite.ts
```

## Docs

The documentation site is at [incubator.callstack.com/skillgym](https://incubator.callstack.com/skillgym/). Repository docs:
22 changes: 22 additions & 0 deletions docs/assertions.md
@@ -5,6 +5,7 @@
- Node's `node:assert/strict` API
- grouped helpers for normalized session reports
- `assert.soft.*` for Jest/Vitest-style sync soft assertions
- `assert.classify(...)` for attaching structured failure classes to assertion failures

```ts
import { assert } from "skillgym";
@@ -13,8 +14,29 @@ assert.ok(true);
assert.equal(1, 1);
assert.match("skillgym ready", /ready/);
assert.soft.match("skillgym ready", /ready/);
assert.classify("missing-flag", () => {
assert.match("--json", /--yaml/);
});
```

## Failure classification

Use `assert.classify(...)` when you want an assertion failure to carry a stable, structured class that reporters can use to group failures across runs.

```ts
assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
assert.doesNotMatch(ctx.finalOutput(), /\bcursr\b/i, "wrong Cursor CLI alias in final output");
});
```

Rules:

- `id` is the stable machine-readable key used for grouping
- `label` is optional and gives reporters a human-friendly display name
- passing a string such as `assert.classify("wrong-cli-alias", ...)` sets only the `id`
- if the callback does not throw, no failure class is recorded
- if the callback throws, the thrown error keeps the attached failure class through the runner and reporter pipeline
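
The mechanics behind those rules can be sketched in a standalone mock — `classify` and `getClass` below are illustrative stand-ins for this doc, not the skillgym exports:

```typescript
// Standalone sketch of the classify contract, not the skillgym implementation.
type FailureClass = { id: string; label?: string };

const CLASS_KEY = Symbol.for("sketch.failureClass");

type Classified = Error & { [CLASS_KEY]?: FailureClass };

function classify<T>(input: string | FailureClass, callback: () => T): T {
  try {
    // If the callback does not throw, no failure class is recorded.
    return callback();
  } catch (error) {
    if (error instanceof Error) {
      // String shorthand sets only the id; the object form may carry a label.
      (error as Classified)[CLASS_KEY] =
        typeof input === "string" ? { id: input } : input;
    }
    throw error; // the original error keeps flowing, now tagged with its class
  }
}

function getClass(error: unknown): FailureClass | undefined {
  return error instanceof Error ? (error as Classified)[CLASS_KEY] : undefined;
}

try {
  classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
    throw new Error("wrong Cursor CLI alias in final output");
  });
} catch (error) {
  console.log(getClass(error)?.id); // prints: wrong-cli-alias
}
```

Tagging through a symbol-keyed property rather than a plain field keeps the class invisible to ordinary serialization while surviving rethrows.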

## Report helper groups

- `assert.soft.*`
5 changes: 4 additions & 1 deletion docs/reporters.md
@@ -107,6 +107,7 @@ The built-in `standard` reporter is optimized for polished CLI output.
- Expected failures count as passed suite health, are labeled `expected failure`, and are excluded from the failure details section.
- Unexpected passes count as failures and are labeled `unexpected pass` because the benchmark expectation may be stale.
- Summary output includes expected-failure and unexpected-pass counts in addition to pass/fail totals.
- Failure summaries are grouped by structured `failureClass` when available, with grouped counts and per-run artifact paths.
- Full stack traces are not shown by default.

Reporter-visible token metrics on `RunnerSummary` include:
@@ -133,16 +134,18 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout —

- It ignores live progress hooks.
- The output includes per-case and per-runner results with token usage, pass/fail status, artifact paths, and error details, but omits the full session events and raw artifacts.
- Per-runner results include `failureClass` when present so downstream tooling can keep grouped-failure semantics.
- It is useful for post-run analysis steps or feeding results to an LLM.
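
For downstream tooling, the grouped-failure semantics can be kept with a few lines of parsing. The entry shape below is illustrative only: `failureClass` matches the documented field, while the other property names are assumptions for the sketch, not the reporter's actual schema:

```typescript
// Illustrative shape for one per-runner entry in the trimmed JSON summary.
// Only `failureClass` is documented verbatim; the other field names are
// assumptions made for this sketch.
interface SummaryEntry {
  caseId: string;
  runner: string;
  passed: boolean;
  failureClass?: { id: string; label?: string };
  artifactDir?: string;
}

// Count failed entries per class id, so related failures collapse into one bucket.
function countByClass(entries: SummaryEntry[]): Record<string, number> {
  const counts: Record<string, number> = {};
  for (const entry of entries) {
    if (entry.passed) continue; // passed runs never contribute a failure class
    const key = entry.failureClass?.id ?? "unclassified";
    counts[key] = (counts[key] ?? 0) + 1;
  }
  return counts;
}
```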

## GitHub Actions reporter

The built-in `github-actions` reporter is designed for GitHub CI.

- Failed runs emit `::error` workflow command annotations, including file and line when a stack frame is available.
- GitHub annotations and job summaries include `failureClass.id` when present.
- When `GITHUB_STEP_SUMMARY` is set, the reporter appends a Markdown job summary containing:
- Suite metadata (path, case/run counts, duration, output dir)
- A per-runner section with a table of all cases (pass/fail status, duration, and token columns: input, output, reasoning, cache, billable)
- A failures section listing up to 10 failures with error name/message, artifact dir, and log path
- A failures section listing up to 10 failures with error name/message, failure class, artifact dir, and log path
- When `GITHUB_STEP_SUMMARY` is missing, summary writing is skipped.
- PR comments stay out of scope for the reporter itself; add those in a separate CI step if needed.
45 changes: 45 additions & 0 deletions docs/test-cases.md
@@ -52,6 +52,7 @@ export interface TestCase {
tags?: string[];
timeoutMs?: number;
expectedFail?: boolean;
classifyFailure?(result: RunnerResult): FailureClass | string | undefined;
assert(report: SessionReport, ctx: AssertionContext): void | Promise<void>;
}
```
@@ -63,6 +64,7 @@ Field meanings:
- `tags`: optional labels for selecting cases with `--tag`; multiple selected tags use OR matching
- `timeoutMs`: optional per-case timeout override
- `expectedFail`: mark assertion failures as expected benchmark signal, not suite-health failures
- `classifyFailure(result)`: optional post-processing hook for assigning or overriding structured failure classes
- `assert(report, ctx)`: pass or fail logic for that execution

`TestCase` does not include runner selection. Each case runs against the selected configured runners.
@@ -124,6 +126,49 @@ const suite: TestCase[] = [

See `assertions.md` for the full assertion reference.

## Failure classification hooks

Use failure classification when you want to group multiple failing runs under one shared cause, such as a pseudo command, a wrong CLI alias, a missing required flag, or a wrong command family.

There are two integration points:

- `assert.classify(...)` attaches a failure class directly where an assertion is made
- `classifyFailure(result)` lets the test case assign or override the final class after the run result is available

Example:

```ts
import { assert, type TestCase } from "skillgym";

const suite: TestCase[] = [
{
id: "cursor-alias-check",
prompt: 'Say you would run: cursr agent "open README.md".',
classifyFailure(result) {
return result.error?.message.includes("wrong Cursor CLI alias")
? { id: "wrong-cli-alias", label: "Wrong CLI alias" }
: undefined;
},
assert(_report, ctx) {
assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
assert.doesNotMatch(
ctx.finalOutput(),
/\bcursr\s+agent\b/i,
"wrong Cursor CLI alias in final output",
);
});
},
},
];
```

Notes:

- `assert.classify(...)` is the smallest way to tag a single assertion failure
- `classifyFailure(result)` is useful when several different assertion messages should collapse into one shared class
- if both are used, `classifyFailure(result)` runs later and can override the attached class
- built-in infrastructure failures still receive default classes such as `Assertion failure`, `Timeout`, `Runner crash`, or `Max steps exceeded`
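
The override ordering in those notes can be sketched as a standalone resolution step — a mock of the documented precedence, not skillgym internals:

```typescript
type FailureClass = { id: string; label?: string };

// Standalone sketch of the precedence rule: the class attached at the
// assertion site is the default, and the classifyFailure(result) hook,
// modeled here as its return value, wins when it returns anything.
function resolveFinalClass(
  attached: FailureClass | undefined,
  hookResult: FailureClass | string | undefined,
): FailureClass | undefined {
  if (hookResult !== undefined) {
    // A string return is shorthand for { id }.
    return typeof hookResult === "string" ? { id: hookResult } : hookResult;
  }
  return attached;
}

console.log(resolveFinalClass({ id: "assertion" }, "wrong-cli-alias")?.id); // prints: wrong-cli-alias
```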

## Expected failures

Use `expectedFail: true` for benchmark cases that intentionally capture a known model or agent gap. Expected failures only apply to assertion failures. Runner crashes, timeouts, workspace setup failures, collection failures, normalization failures, snapshot failures, and `run.maxSteps` failures still fail suite health because they indicate infrastructure or benchmark integrity problems.
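
Combined with the `RunnerResultStatus` union from `src/domain/result.ts`, that bookkeeping can be sketched as a simplified model covering assertion outcomes only — infrastructure failures bypass this mapping, and `finalStatus` is a hypothetical helper for this doc, not runner code:

```typescript
type RunnerResultStatus = "passed" | "failed" | "expected-failed" | "unexpected-passed";

// Map an assertion outcome plus the expectedFail flag to a final status.
// A pass under expectedFail is flagged because the expectation may be stale.
function finalStatus(assertionPassed: boolean, expectedFail: boolean): RunnerResultStatus {
  if (expectedFail) {
    return assertionPassed ? "unexpected-passed" : "expected-failed";
  }
  return assertionPassed ? "passed" : "failed";
}
```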
48 changes: 48 additions & 0 deletions examples/failure-classification-suite.ts
@@ -0,0 +1,48 @@
import { assert, type TestCase } from "skillgym";

const suite: TestCase[] = [
{
id: "wrong-cli-alias-echo",
prompt: [
"Reply with exactly: I will use cursr agent for this task.",
"Do not mention the correct CLI alias.",
].join(" "),
classifyFailure(result) {
return result.error?.message.includes("wrong Cursor CLI alias")
? { id: "wrong-cli-alias", label: "Wrong CLI alias" }
: undefined;
},
async assert(_report, ctx) {
assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
assert.doesNotMatch(
ctx.finalOutput(),
/\bcursr\b/i,
"wrong Cursor CLI alias in final output",
);
});
},
},
{
id: "wrong-cli-alias-command",
prompt: [
'Say you would run this command: cursr agent "open README.md".',
"Do not correct the alias.",
].join(" "),
classifyFailure(result) {
return result.error?.message.includes("wrong Cursor CLI alias")
? { id: "wrong-cli-alias", label: "Wrong CLI alias" }
: undefined;
},
async assert(_report, ctx) {
assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
assert.doesNotMatch(
ctx.finalOutput(),
/\bcursr\s+agent\b/i,
"wrong Cursor CLI alias in final output",
);
});
},
},
];

export default suite;
9 changes: 9 additions & 0 deletions src/assertions/assert.ts
@@ -1,4 +1,5 @@
import nodeAssert from "node:assert/strict";
import { attachFailureClass, type FailureClassInput } from "../failure-classification.js";
import { commandAssertions } from "./commands.js";
import { fileReadAssertions } from "./file-reads.js";
import { outputAssertions } from "./output.js";
@@ -8,6 +9,14 @@ import { toolCallAssertions } from "./tool-calls.js";
import type { SkillGymAssert } from "./types.js";

export const assert: SkillGymAssert = Object.assign(nodeAssert, {
classify<T>(failureClass: FailureClassInput, callback: () => T): T {
try {
return callback();
} catch (error) {
attachFailureClass(error, failureClass);
throw error;
}
},
soft: softAssert,
skills: skillAssertions,
commands: commandAssertions,
1 change: 1 addition & 0 deletions src/assertions/index.ts
@@ -2,6 +2,7 @@ export { assert } from "./assert.js";
export { runWithSoftAssertions } from "./soft.js";
export { CommandMatcherBuilder, commandMatcher } from "./command-matcher.js";
export type {
AssertionClassifier,
AssertionOptions,
CommandMatcher,
CommandMatcherBuilderLike,
6 changes: 6 additions & 0 deletions src/assertions/types.ts
@@ -1,4 +1,5 @@
import type nodeAssert from "node:assert/strict";
import type { FailureClass } from "../domain/result.js";
import type { SessionReport, ToolCallEvent } from "../domain/session-report.js";

export type Matcher = string | RegExp;
@@ -154,6 +155,10 @@ export interface OutputAssertions {
notEmpty(report: SessionReport, options?: AssertionOptions): void;
}

export interface AssertionClassifier {
<T>(failureClass: string | FailureClass, callback: () => T): T;
}

export type SkillGymSoftAssert = typeof nodeAssert & {
skills: SkillAssertions;
commands: CommandAssertions;
@@ -169,4 +174,5 @@ export type SkillGymAssert = typeof nodeAssert & {
fileReads: FileReadAssertions;
toolCalls: ToolCallAssertions;
output: OutputAssertions;
classify: AssertionClassifier;
};
6 changes: 6 additions & 0 deletions src/domain/result.ts
@@ -11,9 +11,15 @@ export interface RunnerResult {
error?: SerializedError;
failureType?: RunnerFailureType;
failureOrigin?: RunnerFailureOrigin;
failureClass?: FailureClass;
failureLogPath?: string;
}

export interface FailureClass {
id: string;
label?: string;
}

export type RunnerResultStatus = "passed" | "failed" | "expected-failed" | "unexpected-passed";

export interface CaseResult {
2 changes: 2 additions & 0 deletions src/domain/test-case.ts
@@ -1,3 +1,4 @@
import type { FailureClass, RunnerResult } from "./result.js";
import type { SessionEvent, SessionReport, SkillDetection } from "./session-report.js";

export interface WorkspaceBootstrapConfig {
@@ -27,6 +28,7 @@ export interface TestCase {
tags?: string[];
timeoutMs?: number;
expectedFail?: boolean;
classifyFailure?(result: RunnerResult): FailureClass | string | undefined;
assert(report: SessionReport, ctx: AssertionContext): void | Promise<void>;
}

78 changes: 78 additions & 0 deletions src/failure-classification.ts
@@ -0,0 +1,78 @@
import type { FailureClass, RunnerFailureOrigin, RunnerFailureType } from "./domain/result.js";

const FAILURE_CLASS_SYMBOL = Symbol.for("skillgym.failureClass");

export type FailureClassInput = string | FailureClass;

type ErrorWithFailureClass = Error & {
[FAILURE_CLASS_SYMBOL]?: FailureClass;
};

export function normalizeFailureClass(input: FailureClassInput): FailureClass {
if (typeof input === "string") {
return { id: input };
}

return input.label === undefined ? { id: input.id } : { id: input.id, label: input.label };
}

export function attachFailureClass(error: unknown, input: FailureClassInput): void {
if (!(error instanceof Error)) {
return;
}

(error as ErrorWithFailureClass)[FAILURE_CLASS_SYMBOL] = normalizeFailureClass(input);
}

export function getAttachedFailureClass(error: unknown): FailureClass | undefined {
if (!(error instanceof Error)) {
return undefined;
}

return (error as ErrorWithFailureClass)[FAILURE_CLASS_SYMBOL];
}

export function resolveFailureClass(options: {
failureClass?: FailureClassInput;
failureType?: RunnerFailureType;
failureOrigin?: RunnerFailureOrigin;
}): FailureClass | undefined {
if (options.failureClass !== undefined) {
return normalizeFailureClass(options.failureClass);
}

if (options.failureType === "assertion") {
return { id: "assertion", label: "Assertion failure" };
}

if (options.failureType === "timeout") {
return { id: "timeout", label: "Timeout" };
}

switch (options.failureOrigin) {
case "assert-hook":
return { id: "assert-hook", label: "Assert hook crash" };
case "max-steps":
return { id: "max-steps", label: "Max steps exceeded" };
case "model-rejected":
return { id: "model-rejected", label: "Rejected model" };
case "workspace-bootstrap":
return { id: "workspace-bootstrap", label: "Workspace bootstrap" };
case "workspace-setup":
return { id: "workspace-setup", label: "Workspace setup" };
case "collection":
return { id: "collection", label: "Artifact collection" };
case "normalization":
return { id: "normalization", label: "Report normalization" };
case "snapshot":
return { id: "snapshot", label: "Snapshot verification" };
case "runner":
return { id: "runner-crash", label: "Runner crash" };
case "assertion":
return { id: "assertion", label: "Assertion failure" };
case undefined:
return options.failureType === "runner-crash"
? { id: "runner-crash", label: "Runner crash" }
: undefined;
}
}
1 change: 1 addition & 0 deletions src/index.ts
@@ -16,6 +16,7 @@ export type {
WorkspaceBootstrapConfig,
} from "./domain/test-case.js";
export type {
FailureClass,
RunnerFailureOrigin,
RunnerFailureType,
RunnerResult,
8 changes: 8 additions & 0 deletions src/reporters/github-actions.ts
@@ -54,6 +54,10 @@ function formatAnnotationMessage(result: RunnerResult): string {
lines.push(`failure origin: ${result.failureOrigin}`);
}

if (result.failureClass !== undefined) {
lines.push(`failure class: ${result.failureClass.id}`);
}

if (result.error !== undefined) {
lines.push(`error: ${result.error.name}: ${result.error.message}`);
}
@@ -163,6 +167,10 @@ function formatFailureSummaryItem(caseId: string, result: RunnerResult): string
`artifacts: \`${result.artifactDir}\``,
];

if (result.failureClass !== undefined) {
segments.splice(2, 0, `class: \`${result.failureClass.id}\``);
}

if (result.failureLogPath !== undefined) {
segments.push(`log: \`${result.failureLogPath}\``);
}