From 8d31ce917b9600402395de014dcdad9cdb116859 Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Tue, 5 May 2026 18:44:31 +0200 Subject: [PATCH] Add bundled CLI skills --- README.md | 7 +++ package.json | 3 +- skills/assertions.md | 97 ++++++++++++++++++++++++++++++++++++++++ skills/core.md | 103 +++++++++++++++++++++++++++++++++++++++++++ skills/reporters.md | 64 +++++++++++++++++++++++++++ skills/snapshots.md | 52 ++++++++++++++++++++++ skills/test-cases.md | 84 +++++++++++++++++++++++++++++++++++ skills/workspaces.md | 73 ++++++++++++++++++++++++++++++ src/cli.ts | 21 +++++++++ src/cli/branding.ts | 4 ++ src/cli/help.ts | 9 +++- src/cli/skills.ts | 42 ++++++++++++++++++ test/cli.test.ts | 35 +++++++++++++++ 13 files changed, 591 insertions(+), 3 deletions(-) create mode 100644 skills/assertions.md create mode 100644 skills/core.md create mode 100644 skills/reporters.md create mode 100644 skills/snapshots.md create mode 100644 skills/test-cases.md create mode 100644 skills/workspaces.md create mode 100644 src/cli/skills.ts diff --git a/README.md b/README.md index 64f5a3b..043bce0 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,13 @@ View CLI help: npx skillgym help ``` +List bundled agent-facing skills and read the main one: + +```bash +npx skillgym skills list +npx skillgym skills get core +``` + By default, `skillgym` uses the built-in `standard` reporter. TypeScript config, suite, and reporter modules work out of the box on Node `>=22.18.0` using Node's built-in TypeScript stripping. diff --git a/package.json b/package.json index b2088cf..7143e9a 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,8 @@ }, "files": [ "dist", - "bin.js" + "bin.js", + "skills" ], "type": "module", "main": "./dist/index.js", diff --git a/skills/assertions.md b/skills/assertions.md new file mode 100644 index 0000000..eb9899a --- /dev/null +++ b/skills/assertions.md @@ -0,0 +1,97 @@ +--- +name: assertions +description: Assertion authoring for Skillgym benchmark suites. 
Covers hard and soft assertions, grouped helpers, skill detection checks, command matching, and failure classification. +--- + +# skillgym assertions + +Use this skill when you are writing or debugging `assert(report, ctx)` logic. + +## What Skillgym gives you + +`skillgym` exports a root `assert` object that combines Node strict assertions with benchmark-focused helpers. + +Available groups: + +- `assert.soft.*` +- `assert.skills.*` +- `assert.commands.*` +- `assert.fileReads.*` +- `assert.toolCalls.*` +- `assert.output.*` + +## Typical patterns + +```ts +import { assert, commandMatcher } from "skillgym"; + +assert.skills.has(report, "find-skills"); +assert.commands.includes(report, commandMatcher("pnpm").arg("test")); +assert.fileReads.includes(report, /README\.md$/); +assert.toolCalls.includes(report, { tool: /skill/i }); +assert.match(ctx.finalOutput(), /expo/i); +``` + +## Hard vs soft assertions + +- Use hard assertions when one failure should stop the case immediately. +- Use `assert.soft.*` when you want one run to report multiple mismatches together. + +```ts +assert.soft.skills.has(report, "find-skills"); +assert.soft.commands.includes(report, "npx skills find"); +assert.soft.output.notEmpty(report); +``` + +## Skill assertions + +Use `assert.skills.*` against `report.detectedSkills`. + +Most useful methods: + +- `has` +- `notHas` +- `includes` +- `count` +- `exactlyOne` +- `only` + +Confidence can be filtered with `minConfidence`: + +```ts +assert.skills.has(report, "find-skills", { minConfidence: "strong" }); +``` + +## Command assertions + +Use raw strings or `RegExp` for quick checks. Use `commandMatcher(...)` for stable matching on executable, args, and options. 
+ +```ts +assert.commands.includes(report, "pnpm test"); +assert.commands.before(report, /skills find/, /pnpm install/); +assert.commands.includes(report, commandMatcher("pnpm").arg("test").option("--filter", "unit")); +``` + +## Output assertions + +Use output assertions when the final agent response matters directly. Use them together with normalized report assertions, not instead of them. + +## Failure classification + +Use `assert.classify(...)` when multiple failures should collapse into one stable machine-readable cause. + +```ts +assert.classify({ id: "wrong-cli-alias", label: "Wrong CLI alias" }, () => { + assert.doesNotMatch(ctx.finalOutput(), /\bcursr\b/i); +}); +``` + +This improves grouped reporting across runs. + +## Good assertion style + +- assert the smallest behavior that proves the benchmark intent +- prefer normalized report fields over fragile prose matching +- use command and tool-call assertions for workflow checks +- use output assertions for user-visible wording checks +- classify repeated failure modes with stable ids diff --git a/skills/core.md b/skills/core.md new file mode 100644 index 0000000..ce63092 --- /dev/null +++ b/skills/core.md @@ -0,0 +1,103 @@ +--- +name: core +description: Core Skillgym workflow for agents. Read this first before writing or debugging suites. Covers how to structure a suite, run it, interpret results, and when to read deeper feature-specific skills. +--- + +# skillgym core + +Skillgym benchmarks coding-agent behavior by running real agent sessions and asserting on the normalized execution report. + +Read this skill first. It gives you the default workflow, the minimum suite shape, and the map to deeper feature skills. + +## Core loop + +```bash +skillgym skills get core +skillgym run <suite> +skillgym run <suite> --case <case-id> +skillgym run <suite> --runner <runner-id> +``` + +Typical agent loop: + +1. Read the target suite or create one. +2. Write or refine prompts and assertions. +3. Run one suite, case, or runner slice. +4. 
Inspect failures from the output directory and normalized report. +5. Tighten assertions or workspace setup until the benchmark captures the intended behavior. + +## Minimum suite shape + +```ts +import { assert, type TestCase } from "skillgym"; + +const suite: TestCase[] = [ + { + id: "uses-correct-skill", + prompt: "Find the right skill and explain how to install it.", + assert(report) { + assert.skills.has(report, "find-skills"); + assert.match(report.finalOutput, /install/i); + }, + }, +]; + +export default suite; +``` + +Use stable `id` values. Keep prompts exact. Put benchmark intent in assertions, not in prose comments. + +## Primary commands + +```bash +skillgym help +skillgym skills list +skillgym skills get <name> + +skillgym run <suite> +skillgym run <suite> --case <case-id> +skillgym run <suite> --tag <tag> +skillgym run <suite> --runner <runner-id> +skillgym run <suite> --reporter json-summary +skillgym run <suite> --schedule parallel --max-parallel 4 +skillgym run <suite> --update-snapshots +``` + +## Mental model + +- A configured runner is one agent target. +- Each selected case runs once per selected runner. +- Assertions evaluate the normalized session report after the run finishes. +- Output artifacts are written under the configured `outputDir`. +- Expected assertion failures can be benchmark-successful; infrastructure failures are still real failures. + +## When to read deeper skills + +Read the focused skills only when the task needs them: + +- `skillgym skills get test-cases` + Use when creating or reshaping suite files, tags, expected failures, or per-case timeouts. +- `skillgym skills get assertions` + Use when writing pass/fail logic against skills, commands, tool calls, output, or failure classes. +- `skillgym skills get workspaces` + Use when the agent needs isolated filesystem state, template repos, or bootstrap commands. +- `skillgym skills get snapshots` + Use when benchmarking token regressions or updating snapshot baselines. 
+- `skillgym skills get reporters` + Use when choosing built-in reporters or wiring a custom reporter. + +## Suggested authoring order + +1. Start with one small case and one runner. +2. Make the assertion explicit and narrow. +3. Add tags or expected-failure behavior only after the baseline case works. +4. Add workspace isolation when shared state can affect the benchmark. +5. Add snapshots when behavior is stable enough to guard token regressions. + +## Common mistakes + +- asserting on vague output instead of checking the normalized report +- trying to select runners inside `TestCase` instead of config plus CLI filters +- using shared workspaces for stateful tasks that need isolation +- treating snapshot mismatches like functional failures instead of cost regressions +- writing one huge suite before proving one small representative case diff --git a/skills/reporters.md b/skills/reporters.md new file mode 100644 index 0000000..3b37430 --- /dev/null +++ b/skills/reporters.md @@ -0,0 +1,64 @@ +--- +name: reporters +description: Skillgym reporter selection and customization. Covers built-in reporters, CLI/config selection, lifecycle hooks, and when to use machine-readable output. +--- + +# skillgym reporters + +Use this skill when choosing how benchmark results should be rendered or consumed. 
+ +## Built-in reporters + +- `standard` +- `json` +- `json-summary` +- `github-actions` + +## Main commands + +```bash +skillgym run --reporter standard +skillgym run --reporter json +skillgym run --reporter json-summary +skillgym run --reporter github-actions +skillgym run --reporter ./path/to/custom-reporter.ts +``` + +## Selection rules + +- omitting `--reporter` uses the built-in `standard` reporter +- CLI `--reporter` overrides config `run.reporter` +- relative custom reporter paths resolve from `process.cwd()` on CLI input + +## When to use each built-in reporter + +- `standard`: default interactive CLI output for humans +- `json`: full aggregated result on stdout for machine consumers +- `json-summary`: trimmed result for post-processing or LLM consumption +- `github-actions`: CI annotations and job summary output + +## Custom reporter shape + +```ts +import type { BenchmarkReporter } from "skillgym"; + +const reporter: BenchmarkReporter = { + onSuiteFinish(event) { + console.log(event.result.outputDir); + }, +}; + +export default reporter; +``` + +## Reporter lifecycle hooks + +- `onSuiteStart` +- `onCaseStart` +- `onRunnerStart` +- `onRunnerFinish` +- `onCaseFinish` +- `onSuiteFinish` +- `onError` + +Use `json-summary` when another agent or tool needs a smaller result than the full session report. diff --git a/skills/snapshots.md b/skills/snapshots.md new file mode 100644 index 0000000..b325514 --- /dev/null +++ b/skills/snapshots.md @@ -0,0 +1,52 @@ +--- +name: snapshots +description: Token regression snapshots in Skillgym. Covers baseline creation, tolerance checks, update flows, metrics, and when snapshots should be used. +--- + +# skillgym snapshots + +Use this skill when the benchmark should guard token usage regressions. + +## Purpose + +Snapshots compare the current run against a stored baseline for each `caseId + runner.id` pair. 
+ +This is useful for catching regressions caused by: + +- skill changes that make the agent do more work +- tool changes that return too much data +- model behavior changes that increase token usage + +## Important behavior + +- default metric is `totalTokens` +- missing snapshot entries are created automatically +- the run fails when usage exceeds the configured tolerance +- snapshots are cost guards, not functional assertions + +## Main commands + +```bash +skillgym run --update-snapshots +skillgym run --snapshots ./skillgym.snapshots.json +``` + +## Supported metrics + +- `totalTokens` +- `inputTokens` +- `outputTokens` +- `reasoningTokens` +- `cacheTokens` + +## When to add snapshots + +- after the benchmark behavior is already stable +- when you want to catch prompt or tooling cost regressions +- when the selected runner reports the needed token metric reliably + +## When not to rely on snapshots alone + +- when the case still lacks functional assertions +- when the run is flaky for reasons unrelated to token usage +- when you are still exploring prompt shape and workflow diff --git a/skills/test-cases.md b/skills/test-cases.md new file mode 100644 index 0000000..e4e3dd6 --- /dev/null +++ b/skills/test-cases.md @@ -0,0 +1,84 @@ +--- +name: test-cases +description: Defining Skillgym suites and test cases. Covers suite exports, test case fields, tags, expected failures, timeouts, and assertion context usage. +--- + +# skillgym test-cases + +Use this skill when creating or restructuring suite files. + +## Suite exports + +A suite module must export a default suite value. 
+ +Supported shapes: + +- array of `TestCase` +- object map of named `TestCase` + +```ts +import { assert, type TestCase } from "skillgym"; + +const suite: TestCase[] = [ + { + id: "always-passes", + prompt: "Say only: skillgym ready", + assert(report, ctx) { + assert.match(ctx.finalOutput(), /skillgym ready/); + }, + }, +]; + +export default suite; +``` + +## Important fields + +- `id`: stable identifier used in results and artifact paths +- `prompt`: exact prompt sent to the runner +- `tags`: optional labels for `--tag` +- `timeoutMs`: per-case timeout override +- `expectedFail`: mark assertion failures as expected benchmark signal +- `classifyFailure(result)`: assign or override a structured failure class +- `assert(report, ctx)`: pass/fail logic + +## Tags + +Tags let you run slices of a suite without changing file structure. + +```bash +skillgym run ./suite.ts --tag smoke +skillgym run ./suite.ts --tag smoke,auth +skillgym run ./suite.ts --tag smoke --tag auth +``` + +Tag matching is OR-based. + +## Expected failures + +Use `expectedFail: true` for known model or agent gaps that you want to track without failing suite health. + +- assertion failure becomes `expected-failed` +- assertion success becomes `unexpected-passed` +- infrastructure failures still fail the suite + +## Assertion context + +`ctx` is a convenience wrapper over the normalized report. + +Useful helpers: + +- `ctx.getCommands()` +- `ctx.getToolCalls(tool?)` +- `ctx.getFileReads()` +- `ctx.detectedSkills()` +- `ctx.finalOutput()` + +## Recommended case shape + +- one benchmark intent per case +- one stable prompt per case +- assertions that prove the intent directly +- tags only when they improve filtering + +Keep cases small and composable. If a case is checking multiple unrelated things, split it. 
diff --git a/skills/workspaces.md b/skills/workspaces.md new file mode 100644 index 0000000..09ffa02 --- /dev/null +++ b/skills/workspaces.md @@ -0,0 +1,73 @@ +--- +name: workspaces +description: Skillgym workspace setup for benchmark runs. Covers shared vs isolated mode, template directories, bootstrap commands, cleanup rules, and path resolution. +--- + +# skillgym workspaces + +Use this skill when benchmark runs need specific filesystem state. + +## Workspace modes + +- `shared`: run directly in a real working directory +- `isolated`: create a fresh workspace per case x runner execution + +Use isolated workspaces when runs should not mutate the original checkout or when each execution needs a prepared template. + +## Shared mode + +```ts +export const workspace = { + mode: "shared", + cwd: "./fixtures/repo-a", +}; +``` + +Behavior: + +- runs execute directly in that directory +- `cwd` is optional +- if omitted, Skillgym falls back to config `run.cwd`, then `process.cwd()` + +## Isolated mode + +```ts +export const workspace = { + mode: "isolated", + templateDir: "./fixtures/repo-template", + bootstrap: { + command: "sh", + args: ["./scripts/bootstrap-workspace.sh"], + }, +}; +``` + +Behavior: + +- each case x runner execution gets its own workspace +- `templateDir` copies a starter project into that workspace +- `bootstrap` runs before the agent starts +- successful isolated runs are deleted +- failed isolated runs are preserved under `outputDir/workspaces` + +## When to use isolated mode + +- the agent edits files +- a case depends on seed files or a fixture repo +- concurrent runs could interfere with each other +- you need reproducible filesystem setup per execution + +## Runtime environment for bootstrap + +- `SKILLGYM_WORKSPACE` +- `SKILLGYM_CASE_ID` +- `SKILLGYM_RUNNER_ID` +- `SKILLGYM_OUTPUT_DIR` +- `SKILLGYM_ARTIFACT_DIR` + +## Good workspace practice + +- default to `shared` only for read-only or intentionally real-project checks +- switch to `isolated` 
when filesystem mutations matter +- keep bootstrap short and deterministic +- preserve templates close to the suite when they are suite-specific diff --git a/src/cli.ts b/src/cli.ts index c60efa8..487b239 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -2,6 +2,7 @@ import { printHelp } from "./cli/help.js"; import { printBanner } from "./cli/branding.js"; import { formatCliError } from "./cli/error.js"; import { RunFailuresError, runCommand } from "./cli/run.js"; +import { listBundledSkills, readBundledSkill } from "./cli/skills.js"; import { parseArgs } from "./utils/cli.js"; async function main(): Promise { @@ -47,6 +48,26 @@ async function main(): Promise { printBanner({ kind: "full" }); printHelp(); return; + case "skills": { + const subcommand = parsed.positionals[0] ?? "list"; + + if (subcommand === "list") { + console.log(listBundledSkills().join("\n")); + return; + } + + if (subcommand === "get") { + const skillName = parsed.positionals[1]; + if (skillName === undefined) { + throw new Error("Missing skill name. 
Usage: skillgym skills get "); + } + + console.log(readBundledSkill(skillName)); + return; + } + + throw new Error(`Unknown skills subcommand: ${subcommand}`); + } default: printBanner({ kind: "full" }); printHelp(); diff --git a/src/cli/branding.ts b/src/cli/branding.ts index 1b986da..47684ee 100644 --- a/src/cli/branding.ts +++ b/src/cli/branding.ts @@ -62,6 +62,10 @@ export function printBanner(options: { kind: "compact" | "full"; stdout?: Writer ` ${theme.dim("$")} ${theme.light("skillgym help")} ${theme.dim("Show CLI help")}`, stdout, ); + writeLine( + ` ${theme.dim("$")} ${theme.light("skillgym skills get")} ${theme.accent("core")} ${theme.dim("Read the main bundled skill")}`, + stdout, + ); writeLine("", stdout); } } diff --git a/src/cli/help.ts b/src/cli/help.ts index 8ab9111..e31a6e8 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -7,9 +7,13 @@ export function printHelp(): void { console.log(` ${theme.bold("Usage:")} skillgym [options] +If you are an LLM agent, run ${theme.light("skillgym skills get core")} before authoring or debugging a benchmark suite. 
+ ${theme.bold("Commands:")} - run ${theme.accent("")} Execute a benchmark suite against the configured runners - help Show this help message + run ${theme.accent("")} Execute a benchmark suite against the configured runners + skills list List bundled skill files + skills get ${theme.accent("")} Print one bundled skill file + help Show this help message ${theme.bold("Run Options:")} --config ${theme.accent("")} Load an explicit skillgym config file @@ -25,6 +29,7 @@ ${theme.bold("Run Options:")} --update-snapshots Refresh snapshot baselines for the executed runs ${theme.bold("Examples:")} + ${theme.dim("$")} ${theme.light("skillgym skills get core")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --runner open-main")} ${theme.dim("$")} ${theme.light("skillgym run ./examples/basic-suite.ts --case always-passes")} diff --git a/src/cli/skills.ts b/src/cli/skills.ts new file mode 100644 index 0000000..b668653 --- /dev/null +++ b/src/cli/skills.ts @@ -0,0 +1,42 @@ +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +const SKILL_FILE_EXTENSION = ".md"; + +export function listBundledSkills(): string[] { + return readdirSync(resolveSkillsDir()) + .filter((entry) => entry.endsWith(SKILL_FILE_EXTENSION)) + .map((entry) => entry.slice(0, -SKILL_FILE_EXTENSION.length)) + .sort(); +} + +export function readBundledSkill(name: string): string { + if (!/^[a-z0-9-]+$/i.test(name)) { + throw new Error(`Invalid skill name: ${name}`); + } + + const filePath = path.join(resolveSkillsDir(), `${name}${SKILL_FILE_EXTENSION}`); + + if (!existsSync(filePath)) { + throw new Error(`Unknown bundled skill: ${name}`); + } + + return readFileSync(filePath, "utf8"); +} + +function resolveSkillsDir(): string { + const moduleDir = path.dirname(fileURLToPath(import.meta.url)); + const candidates = [ + 
path.resolve(moduleDir, "..", "..", "skills"), + path.resolve(moduleDir, "..", "skills"), + ]; + + for (const candidate of candidates) { + if (existsSync(path.join(candidate, `core${SKILL_FILE_EXTENSION}`))) { + return candidate; + } + } + + throw new Error("Could not locate bundled skills directory."); +} diff --git a/test/cli.test.ts b/test/cli.test.ts index 60b9a8b..9d7a217 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -38,13 +38,48 @@ test("cli help prints full MOTD banner and help sections", async () => { expect(result.stdout).toContain("Prove your agent skills work before you ship them."); expect(result.stdout).not.toContain("by Callstack"); expect(result.stdout).toContain("Usage:"); + expect(result.stdout).toContain("skillgym skills get core"); expect(result.stdout).toContain("Commands:"); + expect(result.stdout).toContain("skills list"); + expect(result.stdout).toContain("skills get "); expect(result.stdout).toContain("Run Options:"); expect(result.stdout).toContain("--schedule "); expect(result.stdout).toContain("--max-parallel "); expect(result.stdout).toContain("Examples:"); }); +test("cli skills list prints bundled skill names", async () => { + const result = await execCli(["skills", "list"]); + + expect(result.exitCode).toBe(0); + expect(result.stderr).toBe(""); + expect(result.stdout).toContain("assertions"); + expect(result.stdout).toContain("core"); + expect(result.stdout).toContain("reporters"); + expect(result.stdout).toContain("snapshots"); + expect(result.stdout).toContain("test-cases"); + expect(result.stdout).toContain("workspaces"); +}); + +test("cli skills get core prints the bundled core skill", async () => { + const result = await execCli(["skills", "get", "core"]); + + expect(result.exitCode).toBe(0); + expect(result.stderr).toBe(""); + expect(result.stdout).toContain("# skillgym core"); + expect(result.stdout).toContain("skillgym skills get test-cases"); + expect(result.stdout).toContain("skillgym run "); +}); + +test("cli 
skills get reports missing skill name without printing MOTD banner", async () => { + const result = await execCli(["skills", "get"]); + + expect(result.exitCode).toBe(1); + expect(result.stdout).toBe(""); + expect(result.stderr).toContain("Missing skill name. Usage: skillgym skills get "); + expect(result.stderr).not.toContain("Prove your agent skills work before you ship them."); +}); + test("cli run reports missing suite path without printing MOTD banner", async () => { const result = await execCli(["run"]);