From 8d31ce917b9600402395de014dcdad9cdb116859 Mon Sep 17 00:00:00 2001
From: Szymon Chmal <szymon@chmal.it>
Date: Tue, 5 May 2026 18:44:31 +0200
Subject: [PATCH] Add bundled CLI skills

---
 README.md            |   7 +++
 package.json         |   3 +-
 skills/assertions.md |  97 ++++++++++++++++++++++++++++++++++++++++
 skills/core.md       | 103 +++++++++++++++++++++++++++++++++++++++++++
 skills/reporters.md  |  64 +++++++++++++++++++++++++++
 skills/snapshots.md  |  52 ++++++++++++++++++++++
 skills/test-cases.md |  84 +++++++++++++++++++++++++++++++++++
 skills/workspaces.md |  73 ++++++++++++++++++++++++++++++
 src/cli.ts           |  21 +++++++++
 src/cli/branding.ts  |   4 ++
 src/cli/help.ts      |   9 +++-
 src/cli/skills.ts    |  42 ++++++++++++++++++
 test/cli.test.ts     |  35 +++++++++++++++
 13 files changed, 591 insertions(+), 3 deletions(-)
 create mode 100644 skills/assertions.md
 create mode 100644 skills/core.md
 create mode 100644 skills/reporters.md
 create mode 100644 skills/snapshots.md
 create mode 100644 skills/test-cases.md
 create mode 100644 skills/workspaces.md
 create mode 100644 src/cli/skills.ts

diff --git a/README.md b/README.md
index 64f5a3b..043bce0 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,13 @@ View CLI help:
 npx skillgym help
 ```
 
+List bundled agent-facing skills and read the main one:
+
+```bash
+npx skillgym skills list
+npx skillgym skills get core
+```
+
 By default, `skillgym` uses the built-in `standard` reporter.
 
 TypeScript config, suite, and reporter modules work out of the box on Node `>=22.18.0` using Node's built-in TypeScript stripping.
diff --git a/package.json b/package.json
index b2088cf..7143e9a 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,8 @@
   },
   "files": [
     "dist",
-    "bin.js"
+    "bin.js",
+    "skills"
   ],
   "type": "module",
   "main": "./dist/index.js",
diff --git a/skills/assertions.md b/skills/assertions.md
new file mode 100644
index 0000000..eb9899a
--- /dev/null
+++ b/skills/assertions.md
@@ -0,0 +1,97 @@
+---
+name: assertions
+description: Assertion authoring for Skillgym benchmark suites. Covers hard and soft assertions, grouped helpers, skill detection checks, command matching, and failure classification.
+---
+
+# skillgym assertions
+
+Use this skill when you are writing or debugging `assert(report, ctx)` logic.
+
+## What Skillgym gives you
+
+`skillgym` exports a root `assert` object that combines Node strict assertions with benchmark-focused helpers.
+
+Available groups:
+
+- `assert.soft.*`
+- `assert.skills.*`
+- `assert.commands.*`
+- `assert.fileReads.*`
+- `assert.toolCalls.*`
+- `assert.output.*`
+
+## Typical patterns
+
+```ts
+import { assert, commandMatcher } from "skillgym";
+
+assert.skills.has(report, "find-skills");
+assert.commands.includes(report, commandMatcher("pnpm").arg("test"));
+assert.fileReads.includes(report, /README\.md$/);
+assert.toolCalls.includes(report, { tool: /skill/i });
+assert.match(ctx.finalOutput(), /expo/i);
+```
+
+## Hard vs soft assertions
+
+- Use hard assertions when one failure should stop the case immediately.
+- Use `assert.soft.*` when you want one run to report multiple mismatches together.
+
+```ts
+assert.soft.skills.has(report, "find-skills");
+assert.soft.commands.includes(report, "npx skills find");
+assert.soft.output.notEmpty(report);
+```
+
+## Skill assertions
+
+Use `assert.skills.*` against `report.detectedSkills`.
+
+Most useful methods:
+
+- `has`
+- `notHas`
+- `includes`
+- `count`
+- `exactlyOne`
+- `only`
+
+Confidence can be filtered with `minConfidence`:
+
+```ts
+assert.skills.has(report, "find-skills", { minConfidence: "strong" });
+```
+
+## Command assertions
+
+Use raw strings or `RegExp` for quick checks. Use `commandMatcher(...)` for stable matching on executable, args, and options.
+
+```ts
+assert.commands.includes(report, "pnpm test");
+assert.commands.before(report, /skills find/, /pnpm install/);
+assert.commands.includes(report, commandMatcher("pnpm").arg("test").option("--filter", "unit"));
+```
+
+## Output assertions
+
+Use output assertions when the final agent response matters directly. Use them together with normalized report assertions, not instead of them.
+
+## Failure classification
+
+Use `assert.classify(...)` when multiple failures should collapse into one stable machine-readable cause.
+
+```ts
+assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => {
+  assert.doesNotMatch(ctx.finalOutput(), /\bcursr\b/i);
+});
+```
+
+This improves grouped reporting across runs.
+
+## Good assertion style
+
+- assert the smallest behavior that proves the benchmark intent
+- prefer normalized report fields over fragile prose matching
+- use command and tool-call assertions for workflow checks
+- use output assertions for user-visible wording checks
+- classify repeated failure modes with stable ids
diff --git a/skills/core.md b/skills/core.md
new file mode 100644
index 0000000..ce63092
--- /dev/null
+++ b/skills/core.md
@@ -0,0 +1,103 @@
+---
+name: core
+description: Core Skillgym workflow for agents. Read this first before writing or debugging suites. Covers how to structure a suite, run it, interpret results, and when to read deeper feature-specific skills.
+---
+
+# skillgym core
+
+Skillgym benchmarks coding-agent behavior by running real agent sessions and asserting on the normalized execution report.
+
+Read this skill first. It gives you the default workflow, the minimum suite shape, and the map to deeper feature skills.
+
+## Core loop
+
+```bash
+skillgym skills get core
+skillgym run <suite.ts>
+skillgym run <suite.ts> --case <id>
+skillgym run <suite.ts> --runner <runner-id>
+```
+
+Typical agent loop:
+
+1. Read the target suite or create one.
+2. Write or refine prompts and assertions.
+3. Run one suite, case, or runner slice.
+4. Inspect failures from the output directory and normalized report.
+5. Tighten assertions or workspace setup until the benchmark captures the intended behavior.
+
+## Minimum suite shape
+
+```ts
+import { assert, type TestCase } from "skillgym";
+
+const suite: TestCase[] = [
+  {
+    id: "uses-correct-skill",
+    prompt: "Find the right skill and explain how to install it.",
+    assert(report) {
+      assert.skills.has(report, "find-skills");
+      assert.match(report.finalOutput, /install/i);
+    },
+  },
+];
+
+export default suite;
+```
+
+Use stable `id` values. Keep prompts exact. Put benchmark intent in assertions, not in prose comments.
+
+## Primary commands
+
+```bash
+skillgym help
+skillgym skills list
+skillgym skills get <name>
+
+skillgym run <suite.ts>
+skillgym run <suite.ts> --case <id>
+skillgym run <suite.ts> --tag <tag>
+skillgym run <suite.ts> --runner <runner-id>
+skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --schedule parallel --max-parallel 4
+skillgym run <suite.ts> --update-snapshots
+```
+
+## Mental model
+
+- A configured runner is one agent target.
+- Each selected case runs once per selected runner.
+- Assertions evaluate the normalized session report after the run finishes.
+- Output artifacts are written under the configured `outputDir`.
+- An assertion failure in a case marked `expectedFail` is a tracked benchmark signal, not a suite failure; infrastructure failures are still real failures.
+
+## When to read deeper skills
+
+Read the focused skills only when the task needs them:
+
+- `skillgym skills get test-cases`
+  Use when creating or reshaping suite files, tags, expected failures, or per-case timeouts.
+- `skillgym skills get assertions`
+  Use when writing pass/fail logic against skills, commands, tool calls, output, or failure classes.
+- `skillgym skills get workspaces`
+  Use when the agent needs isolated filesystem state, template repos, or bootstrap commands.
+- `skillgym skills get snapshots`
+  Use when benchmarking token regressions or updating snapshot baselines.
+- `skillgym skills get reporters`
+  Use when choosing built-in reporters or wiring a custom reporter.
+
+## Suggested authoring order
+
+1. Start with one small case and one runner.
+2. Make the assertion explicit and narrow.
+3. Add tags or expected-failure behavior only after the baseline case works.
+4. Add workspace isolation when shared state can affect the benchmark.
+5. Add snapshots when behavior is stable enough to guard token regressions.
+
+## Common mistakes
+
+- asserting on vague output instead of checking the normalized report
+- trying to select runners inside `TestCase` instead of config plus CLI filters
+- using shared workspaces for stateful tasks that need isolation
+- treating snapshot mismatches like functional failures instead of cost regressions
+- writing one huge suite before proving one small representative case
diff --git a/skills/reporters.md b/skills/reporters.md
new file mode 100644
index 0000000..3b37430
--- /dev/null
+++ b/skills/reporters.md
@@ -0,0 +1,64 @@
+---
+name: reporters
+description: Skillgym reporter selection and customization. Covers built-in reporters, CLI/config selection, lifecycle hooks, and when to use machine-readable output.
+---
+
+# skillgym reporters
+
+Use this skill when choosing how benchmark results should be rendered or consumed.
+
+## Built-in reporters
+
+- `standard`
+- `json`
+- `json-summary`
+- `github-actions`
+
+## Main commands
+
+```bash
+skillgym run <suite.ts> --reporter standard
+skillgym run <suite.ts> --reporter json
+skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --reporter github-actions
+skillgym run <suite.ts> --reporter ./path/to/custom-reporter.ts
+```
+
+## Selection rules
+
+- omitting `--reporter` uses the built-in `standard` reporter
+- CLI `--reporter` overrides config `run.reporter`
+- relative custom reporter paths given to `--reporter` resolve from `process.cwd()`
+
+## When to use each built-in reporter
+
+- `standard`: default interactive CLI output for humans
+- `json`: full aggregated result on stdout for machine consumers
+- `json-summary`: trimmed result for post-processing or LLM consumption
+- `github-actions`: CI annotations and job summary output
+
+## Custom reporter shape
+
+```ts
+import type { BenchmarkReporter } from "skillgym";
+
+const reporter: BenchmarkReporter = {
+  onSuiteFinish(event) {
+    console.log(event.result.outputDir);
+  },
+};
+
+export default reporter;
+```
+
+## Reporter lifecycle hooks
+
+- `onSuiteStart`
+- `onCaseStart`
+- `onRunnerStart`
+- `onRunnerFinish`
+- `onCaseFinish`
+- `onSuiteFinish`
+- `onError`
+
+Use `json-summary` when another agent or tool needs a smaller result than the full session report.
diff --git a/skills/snapshots.md b/skills/snapshots.md
new file mode 100644
index 0000000..b325514
--- /dev/null
+++ b/skills/snapshots.md
@@ -0,0 +1,52 @@
+---
+name: snapshots
+description: Token regression snapshots in Skillgym. Covers baseline creation, tolerance checks, update flows, metrics, and when snapshots should be used.
+---
+
+# skillgym snapshots
+
+Use this skill when the benchmark should guard token usage regressions.
+
+## Purpose
+
+Snapshots compare the current run against a stored baseline for each `caseId + runner.id` pair.
+
+This is useful for catching regressions caused by:
+
+- skill changes that make the agent do more work
+- tool changes that return too much data
+- model behavior changes that increase token usage
+
+## Important behavior
+
+- default metric is `totalTokens`
+- missing snapshot entries are created automatically
+- the run fails when usage exceeds the configured tolerance
+- snapshots are cost guards, not functional assertions
+
+## Main commands
+
+```bash
+skillgym run <suite.ts> --update-snapshots
+skillgym run <suite.ts> --snapshots ./skillgym.snapshots.json
+```
+
+## Supported metrics
+
+- `totalTokens`
+- `inputTokens`
+- `outputTokens`
+- `reasoningTokens`
+- `cacheTokens`
+
+## When to add snapshots
+
+- after the benchmark behavior is already stable
+- when you want to catch prompt or tooling cost regressions
+- when the selected runner reports the needed token metric reliably
+
+## When not to rely on snapshots alone
+
+- when the case still lacks functional assertions
+- when the run is flaky for reasons unrelated to token usage
+- when you are still exploring prompt shape and workflow
diff --git a/skills/test-cases.md b/skills/test-cases.md
new file mode 100644
index 0000000..e4e3dd6
--- /dev/null
+++ b/skills/test-cases.md
@@ -0,0 +1,84 @@
+---
+name: test-cases
+description: Defining Skillgym suites and test cases. Covers suite exports, test case fields, tags, expected failures, timeouts, and assertion context usage.
+---
+
+# skillgym test-cases
+
+Use this skill when creating or restructuring suite files.
+
+## Suite exports
+
+A suite module must export a default suite value.
+
+Supported shapes:
+
+- array of `TestCase`
+- object map of named `TestCase`
+
+```ts
+import { assert, type TestCase } from "skillgym";
+
+const suite: TestCase[] = [
+  {
+    id: "always-passes",
+    prompt: "Say only: skillgym ready",
+    assert(report, ctx) {
+      assert.match(ctx.finalOutput(), /skillgym ready/);
+    },
+  },
+];
+
+export default suite;
+```
+
+## Important fields
+
+- `id`: stable identifier used in results and artifact paths
+- `prompt`: exact prompt sent to the runner
+- `tags`: optional labels for `--tag`
+- `timeoutMs`: per-case timeout override
+- `expectedFail`: mark assertion failures as an expected benchmark signal
+- `classifyFailure(result)`: assign or override a structured failure class
+- `assert(report, ctx)`: pass/fail logic
+
+## Tags
+
+Tags let you run slices of a suite without changing file structure.
+
+```bash
+skillgym run ./suite.ts --tag smoke
+skillgym run ./suite.ts --tag smoke,auth
+skillgym run ./suite.ts --tag smoke --tag auth
+```
+
+Tag matching is OR-based: a case is selected when it has at least one of the requested tags.
+
+## Expected failures
+
+Use `expectedFail: true` for known model or agent gaps that you want to track without failing suite health.
+
+- assertion failure becomes `expected-failed`
+- assertion success becomes `unexpected-passed`
+- infrastructure failures still fail the suite
+
+## Assertion context
+
+`ctx` is a convenience wrapper over the normalized report.
+
+Useful helpers:
+
+- `ctx.getCommands()`
+- `ctx.getToolCalls(tool?)`
+- `ctx.getFileReads()`
+- `ctx.detectedSkills()`
+- `ctx.finalOutput()`
+
+## Recommended case shape
+
+- one benchmark intent per case
+- one stable prompt per case
+- assertions that prove the intent directly
+- tags only when they improve filtering
+
+Keep cases small and composable. If a case is checking multiple unrelated things, split it.
diff --git a/skills/workspaces.md b/skills/workspaces.md
new file mode 100644
index 0000000..09ffa02
--- /dev/null
+++ b/skills/workspaces.md
@@ -0,0 +1,73 @@
+---
+name: workspaces
+description: Skillgym workspace setup for benchmark runs. Covers shared vs isolated mode, template directories, bootstrap commands, cleanup rules, and path resolution.
+---
+
+# skillgym workspaces
+
+Use this skill when benchmark runs need specific filesystem state.
+
+## Workspace modes
+
+- `shared`: run directly in a real working directory
+- `isolated`: create a fresh workspace per case x runner execution
+
+Use isolated workspaces when runs should not mutate the original checkout or when each execution needs a prepared template.
+
+## Shared mode
+
+```ts
+export const workspace = {
+  mode: "shared",
+  cwd: "./fixtures/repo-a",
+};
+```
+
+Behavior:
+
+- runs execute directly in that directory
+- `cwd` is optional
+- if omitted, Skillgym falls back to config `run.cwd`, then `process.cwd()`
+
+## Isolated mode
+
+```ts
+export const workspace = {
+  mode: "isolated",
+  templateDir: "./fixtures/repo-template",
+  bootstrap: {
+    command: "sh",
+    args: ["./scripts/bootstrap-workspace.sh"],
+  },
+};
+```
+
+Behavior:
+
+- each case x runner execution gets its own workspace
+- `templateDir` copies a starter project into that workspace
+- `bootstrap` runs before the agent starts
+- workspaces of successful isolated runs are deleted
+- workspaces of failed isolated runs are preserved under `outputDir/workspaces`
+
+## When to use isolated mode
+
+- the agent edits files
+- a case depends on seed files or a fixture repo
+- concurrent runs could interfere with each other
+- you need reproducible filesystem setup per execution
+
+## Runtime environment for bootstrap
+
+- `SKILLGYM_WORKSPACE`
+- `SKILLGYM_CASE_ID`
+- `SKILLGYM_RUNNER_ID`
+- `SKILLGYM_OUTPUT_DIR`
+- `SKILLGYM_ARTIFACT_DIR`
+
+## Good workspace practice
+
+- use `shared` only for read-only or intentionally real-project checks
+- switch to `isolated` when filesystem mutations matter
+- keep bootstrap short and deterministic
+- keep templates close to the suite when they are suite-specific
diff --git a/src/cli.ts b/src/cli.ts
index c60efa8..487b239 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -2,6 +2,7 @@ import { printHelp } from "./cli/help.js";
 import { printBanner } from "./cli/branding.js";
 import { formatCliError } from "./cli/error.js";
 import { RunFailuresError, runCommand } from "./cli/run.js";
+import { listBundledSkills, readBundledSkill } from "./cli/skills.js";
 import { parseArgs } from "./utils/cli.js";
 
 async function main(): Promise<void> {
@@ -47,6 +48,26 @@ async function main(): Promise<void> {
       printBanner({ kind: "full" });
       printHelp();
       return;
+    case "skills": {
+      const subcommand = parsed.positionals[0] ?? "list";
+
+      if (subcommand === "list") {
+        console.log(listBundledSkills().join("\n"));
+        return;
+      }
+
+      if (subcommand === "get") {
+        const skillName = parsed.positionals[1];
+        if (skillName === undefined) {
+          throw new Error("Missing skill name. Usage: skillgym skills get <name>");
+        }
+
+        console.log(readBundledSkill(skillName));
+        return;
+      }
+
+      throw new Error(`Unknown skills subcommand: ${subcommand}`);
+    }
     default:
       printBanner({ kind: "full" });
       printHelp();
diff --git a/src/cli/branding.ts b/src/cli/branding.ts
index 1b986da..47684ee 100644
--- a/src/cli/branding.ts
+++ b/src/cli/branding.ts
@@ -62,6 +62,10 @@ export function printBanner(options: { kind: "compact" | "full"; stdout?: Writer
       `  ${theme.dim("$")} ${theme.light("skillgym help")}                   ${theme.dim("Show CLI help")}`,
       stdout,
     );
+    writeLine(
+      `  ${theme.dim("$")} ${theme.light("skillgym skills get")} ${theme.accent("core")}       ${theme.dim("Read the main bundled skill")}`,
+      stdout,
+    );
     writeLine("", stdout);
   }
 }
diff --git a/src/cli/help.ts b/src/cli/help.ts
index 8ab9111..e31a6e8 100644
--- a/src/cli/help.ts
+++ b/src/cli/help.ts
@@ -7,9 +7,13 @@ export function printHelp(): void {
   console.log(`
 ${theme.bold("Usage:")} skillgym <command> [options]
 
+If you are an LLM agent, run ${theme.light("skillgym skills get core")} before authoring or debugging a benchmark suite.
+
 ${theme.bold("Commands:")}
-  run ${theme.accent("<suite.ts>")}    Execute a benchmark suite against the configured runners
-  help              Show this help message
+  run ${theme.accent("<suite.ts>")}     Execute a benchmark suite against the configured runners
+  skills list        List bundled skill files
+  skills get ${theme.accent("<name>")}  Print one bundled skill file
+  help               Show this help message
 
 ${theme.bold("Run Options:")}
   --config ${theme.accent("<path>")}        Load an explicit skillgym config file
@@ -25,6 +29,7 @@ ${theme.bold("Run Options:")}
   --update-snapshots       Refresh snapshot baselines for the executed runs
 
 ${theme.bold("Examples:")}
+  ${theme.dim("$")} ${theme.light("skillgym skills get core")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --runner open-main")}
   ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --case always-passes")}
diff --git a/src/cli/skills.ts b/src/cli/skills.ts
new file mode 100644
index 0000000..b668653
--- /dev/null
+++ b/src/cli/skills.ts
@@ -0,0 +1,42 @@
+import { existsSync, readdirSync, readFileSync } from "node:fs";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const SKILL_FILE_EXTENSION = ".md";
+
+export function listBundledSkills(): string[] { // Returns the bundled skill names (file names without the skill extension), sorted.
+  return readdirSync(resolveSkillsDir())
+    .filter((entry) => entry.endsWith(SKILL_FILE_EXTENSION)) // keep only entries whose name ends with the skill extension
+    .map((entry) => entry.slice(0, -SKILL_FILE_EXTENSION.length)) // strip the extension to get the skill name
+    .sort(); // default sort() ordering, so the listing does not depend on directory read order
+}
+
+export function readBundledSkill(name: string): string { // Returns the UTF-8 text of the bundled skill `name`; throws Error when the name is invalid or no such skill file exists.
+  if (!/^[a-z0-9-]+$/i.test(name)) { // only ASCII letters, digits and "-" pass, so `name` cannot carry dots or path separators
+    throw new Error(`Invalid skill name: ${name}`);
+  }
+
+  const filePath = path.join(resolveSkillsDir(), `${name}${SKILL_FILE_EXTENSION}`);
+
+  if (!existsSync(filePath)) { // a missing file is reported as an explicit "unknown skill" error
+    throw new Error(`Unknown bundled skill: ${name}`);
+  }
+
+  return readFileSync(filePath, "utf8");
+}
+
+function resolveSkillsDir(): string { // Locates the bundled skills directory relative to this module's own file path; throws Error if none is found.
+  const moduleDir = path.dirname(fileURLToPath(import.meta.url));
+  const candidates = [ // tried in order; presumably one matches the source layout and the other the built layout — TODO confirm
+    path.resolve(moduleDir, "..", "..", "skills"),
+    path.resolve(moduleDir, "..", "skills"),
+  ];
+
+  for (const candidate of candidates) {
+    if (existsSync(path.join(candidate, `core${SKILL_FILE_EXTENSION}`))) { // presence of the core skill file marks a valid skills directory
+      return candidate;
+    }
+  }
+
+  throw new Error("Could not locate bundled skills directory.");
+}
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 60b9a8b..9d7a217 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -38,13 +38,48 @@ test("cli help prints full MOTD banner and help sections", async () => {
   expect(result.stdout).toContain("Prove your agent skills work before you ship them.");
   expect(result.stdout).not.toContain("by Callstack");
   expect(result.stdout).toContain("Usage:");
+  expect(result.stdout).toContain("skillgym skills get core");
   expect(result.stdout).toContain("Commands:");
+  expect(result.stdout).toContain("skills list");
+  expect(result.stdout).toContain("skills get <name>");
   expect(result.stdout).toContain("Run Options:");
   expect(result.stdout).toContain("--schedule <mode>");
   expect(result.stdout).toContain("--max-parallel <n>");
   expect(result.stdout).toContain("Examples:");
 });
 
+test("cli skills list prints bundled skill names", async () => { // `skills list` should exit cleanly and name each expected skill
+  const result = await execCli(["skills", "list"]);
+
+  expect(result.exitCode).toBe(0);
+  expect(result.stderr).toBe(""); // the success path must not write to stderr
+  expect(result.stdout).toContain("assertions"); // one check per expected skill name
+  expect(result.stdout).toContain("core");
+  expect(result.stdout).toContain("reporters");
+  expect(result.stdout).toContain("snapshots");
+  expect(result.stdout).toContain("test-cases");
+  expect(result.stdout).toContain("workspaces");
+});
+
+test("cli skills get core prints the bundled core skill", async () => { // `skills get core` should print the core skill text on stdout
+  const result = await execCli(["skills", "get", "core"]);
+
+  expect(result.exitCode).toBe(0);
+  expect(result.stderr).toBe(""); // the success path must not write to stderr
+  expect(result.stdout).toContain("# skillgym core"); // expected heading text
+  expect(result.stdout).toContain("skillgym skills get test-cases"); // expected reference to another skill
+  expect(result.stdout).toContain("skillgym run <suite.ts>"); // expected run command text
+});
+
+test("cli skills get reports missing skill name without printing MOTD banner", async () => { // error path: no skill name after `get`
+  const result = await execCli(["skills", "get"]);
+
+  expect(result.exitCode).toBe(1);
+  expect(result.stdout).toBe(""); // nothing is printed on stdout when the command fails
+  expect(result.stderr).toContain("Missing skill name. Usage: skillgym skills get <name>");
+  expect(result.stderr).not.toContain("Prove your agent skills work before you ship them."); // the banner tagline must not appear in the error output
+});
+
 test("cli run reports missing suite path without printing MOTD banner", async () => {
   const result = await execCli(["run"]);