diff --git a/extensions/models/ci_git.ts b/extensions/models/ci_git.ts new file mode 100644 index 00000000..fd6e2f7e --- /dev/null +++ b/extensions/models/ci_git.ts @@ -0,0 +1,328 @@ +import { z } from "npm:zod@4"; + +const GlobalArgsSchema = z.object({ + url: z.string().describe("Git repository URL"), + workDir: z.string().optional().describe( + "Base directory for clones (defaults to a temporary directory)", + ), +}); + +const RepositorySchema = z.object({ + path: z.string(), + sha: z.string(), + branch: z.string(), + remote: z.string(), + ref: z.string(), +}); + +const DiffFileSchema = z.object({ + path: z.string(), + status: z.string(), + additions: z.number(), + deletions: z.number(), +}); + +const DiffSchema = z.object({ + base: z.string(), + head: z.string(), + files: z.array(DiffFileSchema), + totalAdditions: z.number(), + totalDeletions: z.number(), + filesChanged: z.number(), +}); + +async function runGit( + args: string[], + opts?: { cwd?: string }, +): Promise { + const cmd = new Deno.Command("git", { + args, + cwd: opts?.cwd, + stdout: "piped", + stderr: "piped", + }); + const output = await cmd.output(); + if (output.code !== 0) { + const stderr = new TextDecoder().decode(output.stderr); + throw new Error(`git ${args[0]} failed: ${stderr}`); + } + return new TextDecoder().decode(output.stdout).trim(); +} + +export const model = { + type: "@swamp/ci/git", + version: "2026.04.10.1", + globalArguments: GlobalArgsSchema, + resources: { + "repository": { + description: "Cloned or checked-out repository state", + schema: RepositorySchema, + lifetime: "infinite", + garbageCollection: 5, + }, + "diff": { + description: "Diff between two refs", + schema: DiffSchema, + lifetime: "infinite", + garbageCollection: 5, + }, + }, + methods: { + clone: { + description: "Clone a git repository (idempotent — skips if directory already exists)", + arguments: z.object({ + ref: z.string().optional().describe("Branch, tag, or commit to checkout after clone"), + depth: z.number().optional().describe("Shallow clone depth"), + }), + execute: async ( + args: { ref?: string; depth?: number }, + context: { + globalArgs: { url: string; workDir?: string }; + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + const { url, workDir } = context.globalArgs; + const repoName = url.split("/").pop()?.replace(/\.git$/, "") ?? "repo"; + const baseDir = workDir ?? 
await Deno.makeTempDir({ prefix: "swamp-ci-" });
+        const repoPath = `${baseDir}/${repoName}`;
+
+        context.logger.info(`Cloning ${url} to ${repoPath}`);
+
+        // `git clone --branch` only accepts branch/tag names, so a commit-SHA
+        // ref is checked out in a separate step after the clone completes.
+        const refIsCommitSha = args.ref !== undefined &&
+          /^[0-9a-f]{7,40}$/.test(args.ref);
+
+        // Skip if already cloned
+        try {
+          const stat = await Deno.stat(repoPath);
+          if (stat.isDirectory) {
+            context.logger.info(
+              `Repository already exists at ${repoPath}, fetching latest`,
+            );
+            await runGit(["fetch", "--all"], { cwd: repoPath });
+            if (args.ref) {
+              await runGit(["checkout", args.ref], { cwd: repoPath });
+            }
+          }
+        } catch {
+          // Directory doesn't exist, proceed with clone
+          const cloneArgs = ["clone"];
+          if (args.depth) {
+            cloneArgs.push("--depth", String(args.depth));
+          }
+          if (args.ref && !refIsCommitSha) {
+            cloneArgs.push("--branch", args.ref);
+          }
+          cloneArgs.push(url, repoPath);
+          await runGit(cloneArgs);
+        }
+
+        // If ref is a specific commit (not a branch/tag), checkout after clone
+        if (args.ref && refIsCommitSha) {
+          await runGit(["checkout", args.ref], { cwd: repoPath });
+        }
+
+        const sha = await runGit(["rev-parse", "HEAD"], { cwd: repoPath });
+        const branch = await runGit(
+          ["rev-parse", "--abbrev-ref", "HEAD"],
+          { cwd: repoPath },
+        ).catch(() => "HEAD");
+        const remote = await runGit(
+          ["remote", "get-url", "origin"],
+          { cwd: repoPath },
+        ).catch(() => url);
+
+        context.logger.info(`Cloned ${url} at ${sha}`);
+
+        const handle = await context.writeResource("repository", "repository", {
+          path: repoPath,
+          sha,
+          branch,
+          remote,
+          ref: args.ref ?? branch,
+        });
+        return { dataHandles: [handle] };
+      },
+    },
+
+    checkout: {
+      description: "Checkout a specific ref in an existing repository",
+      arguments: z.object({
+        path: z.string().describe("Path to the git repository"),
+        ref: z.string().describe("Branch, tag, or commit SHA to checkout"),
+      }),
+      execute: async (
+        args: { path: string; ref: string },
+        context: {
+          globalArgs: { url: string };
+          logger: {
+            info: (msg: string, data?: Record) => void;
+          };
+          writeResource: (
+            specName: string,
+            name: string,
+            data: Record,
+          ) => Promise;
+        },
+      ) => {
+        context.logger.info(`Checking out ${args.ref} in ${args.path}`);
+
+        await runGit(["checkout", args.ref], { cwd: args.path });
+
+        const sha = await runGit(["rev-parse", "HEAD"], { cwd: args.path });
+        const branch = await runGit(
+          ["rev-parse", "--abbrev-ref", "HEAD"],
+          { cwd: args.path },
+        ).catch(() => "HEAD");
+        const remote = await runGit(
+          ["remote", "get-url", "origin"],
+          { cwd: args.path },
+        ).catch(() => context.globalArgs.url);
+
+        context.logger.info(`Checked out ${args.ref} at ${sha}`);
+
+        const handle = await context.writeResource("repository", "repository", {
+          path: args.path,
+          sha,
+          branch,
+          remote,
+          ref: args.ref,
+        });
+        return { dataHandles: [handle] };
+      },
+    },
+
+    fetch: {
+      description: "Fetch latest refs from a remote",
+      arguments: z.object({
+        path: z.string().describe("Path to the git repository"),
+        remote: z.string().default("origin").describe("Remote name"),
+        ref: z.string().optional().describe("Specific ref to fetch"),
+      }),
+      execute: async (
+        args: { path: string; remote: string; ref?: string },
+        context: {
+          globalArgs: { url: string };
+          logger: {
+            info: (msg: string, data?: Record) => void;
+          };
+          writeResource: (
+            specName: string,
+            name: string,
+            data: Record,
+          ) => Promise;
+        },
+      ) => {
+        const fetchArgs = ["fetch", args.remote];
+        if (args.ref) {
+          fetchArgs.push(args.ref);
+        }
+
+        context.logger.info(`Fetching from ${args.remote}`);
+
+        await runGit(fetchArgs, { cwd: args.path });
+
+        const sha = await runGit(["rev-parse", "HEAD"],
{ cwd: args.path }); + const branch = await runGit( + ["rev-parse", "--abbrev-ref", "HEAD"], + { cwd: args.path }, + ).catch(() => "HEAD"); + const remote = await runGit( + ["remote", "get-url", args.remote], + { cwd: args.path }, + ).catch(() => context.globalArgs.url); + + const handle = await context.writeResource("repository", "repository", { + path: args.path, + sha, + branch, + remote, + ref: branch, + }); + return { dataHandles: [handle] }; + }, + }, + + diff: { + description: "Show diff stats between two refs", + arguments: z.object({ + path: z.string().describe("Path to the git repository"), + base: z.string().describe("Base ref (branch, tag, or SHA)"), + head: z.string().default("HEAD").describe("Head ref to compare against"), + }), + execute: async ( + args: { path: string; base: string; head: string }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + context.logger.info(`Computing diff ${args.base}..${args.head}`); + + const numstat = await runGit( + ["diff", "--numstat", `${args.base}...${args.head}`], + { cwd: args.path }, + ); + + const files = numstat + .split("\n") + .filter((line) => line.trim()) + .map((line) => { + const [additions, deletions, path] = line.split("\t"); + return { + path: path ?? "", + status: "modified", + additions: additions === "-" ? 0 : parseInt(additions, 10), + deletions: deletions === "-" ? 0 : parseInt(deletions, 10), + }; + }); + + const totalAdditions = files.reduce((sum, f) => sum + f.additions, 0); + const totalDeletions = files.reduce((sum, f) => sum + f.deletions, 0); + + context.logger.info( + `Diff: ${files.length} files, +${totalAdditions} -${totalDeletions}`, + ); + + const handle = await context.writeResource("diff", "diff", { + base: args.base, + head: args.head, + files, + totalAdditions, + totalDeletions, + filesChanged: files.length, + }); + return { dataHandles: [handle] }; + }, + }, + + clean: { + description: "Remove a cloned repository directory", + arguments: z.object({ + path: z.string().describe("Path to the repository to remove"), + }), + execute: async ( + args: { path: string }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + }, + ) => { + context.logger.info(`Removing ${args.path}`); + await Deno.remove(args.path, { recursive: true }); + context.logger.info(`Cleaned up ${args.path}`); + return { dataHandles: [] }; + }, + }, + }, +}; diff --git a/extensions/models/ci_promptfoo_eval.ts b/extensions/models/ci_promptfoo_eval.ts new file mode 100644 index 00000000..1545e610 --- /dev/null +++ b/extensions/models/ci_promptfoo_eval.ts @@ -0,0 +1,432 @@ +import { z } from "npm:zod@4"; + +const TOKEN_PRICING: Record = { + "sonnet": { prompt: 3.0, completion: 15.0 }, + "opus": { prompt: 15.0, completion: 75.0 }, + "gpt-5.4": { prompt: 2.0, completion: 8.0 }, + "gemini-2.5-pro": { prompt: 1.25, completion: 10.0 }, +}; + +const API_KEY_ENV: Record = { + "sonnet": "ANTHROPIC_API_KEY", + "opus": "ANTHROPIC_API_KEY", + "gpt-5.4": "OPENAI_API_KEY", + "gemini-2.5-pro": "GOOGLE_API_KEY", +}; + +const FailureSchema = z.object({ + description: z.string(), + output: z.string(), +}); + +const ResultSchema = z.object({ + model: z.string(), + total: z.number(), + passed: z.number(), + failed: z.number(), + errors: z.number(), + passRate: z.number(), + tokens: z.object({ + total: z.number(), + prompt: z.number(), + completion: z.number(), + }), + cost: z.number(), + durationMs: 
z.number(), + failures: z.array(FailureSchema), +}); + +interface EvalStats { + successes: number; + failures: number; + errors: number; + tokenUsage: { + total: number; + prompt: number; + completion: number; + cached: number; + }; + durationMs: number; +} + +interface EvalResult { + success: boolean; + testCase?: { + description?: string; + vars?: Record; + }; + response?: { + output?: unknown; + }; +} + +interface PromptfooOutput { + results: { + stats: EvalStats; + results: EvalResult[]; + }; +} + +function extractToolCallName(output: unknown): string | undefined { + if (!output) return undefined; + if (Array.isArray(output)) { + for (const item of output) { + if (item?.function?.name) return item.function.name; + if (item?.functionCall?.name) return item.functionCall.name; + if (item?.name) return item.name; + } + } + if (typeof output === "object" && output !== null) { + const obj = output as Record; + if (obj.function && typeof obj.function === "object") { + return (obj.function as Record).name as string; + } + if (obj.functionCall && typeof obj.functionCall === "object") { + return (obj.functionCall as Record).name as string; + } + } + return undefined; +} + +export const model = { + type: "@swamp/ci/promptfoo-eval", + version: "2026.04.10.1", + globalArguments: z.object({}), + reports: ["@swamp/ci/eval-result"], + resources: { + "result": { + description: "Structured eval results for a single model", + schema: ResultSchema, + lifetime: "infinite", + garbageCollection: 10, + }, + }, + files: { + "raw-results": { + description: "Full promptfoo results.json output", + contentType: "application/json", + lifetime: "infinite", + garbageCollection: 5, + }, + }, + methods: { + setupNpm: { + description: + "Install promptfoo npm dependencies once in the shared workDir. 
Run this before parallel eval steps to avoid npm install races.", + arguments: z.object({ + workDir: z.string().describe("Path to the swamp repository checkout"), + }), + execute: async ( + args: { workDir: string }, + context: { + logger: { info: (msg: string) => void }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + const configDir = `${args.workDir}/evals/promptfoo`; + context.logger.info(`Installing promptfoo dependencies in ${configDir}`); + + const installCmd = new Deno.Command("npm", { + args: ["install", "--package-lock=false"], + cwd: configDir, + stdout: "piped", + stderr: "piped", + }); + const result = await installCmd.output(); + + if (result.code !== 0) { + const stderr = new TextDecoder().decode(result.stderr); + throw new Error(`npm install failed: ${stderr}`); + } + + context.logger.info(`Promptfoo dependencies installed`); + const handle = await context.writeResource( + "result", + "npm-install-marker", + { + model: "setup", + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs: 0, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + }, + }, + run: { + description: + "Run promptfoo skill trigger evals for a specific model and capture structured results", + arguments: z.object({ + workDir: z.string().describe("Path to the swamp repository checkout"), + model: z.string().describe( + "Model alias to evaluate (sonnet, opus, gpt-5.4, gemini-2.5-pro)", + ), + concurrency: z.number().default(20).describe( + "Number of concurrent eval calls", + ), + selectedModel: z.string().default("all").describe( + "Filter: only run if model matches this value, or 'all' to run every model", + ), + }), + execute: async ( + args: { + workDir: string; + model: string; + concurrency: number; + selectedModel: string; + }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + createFileWriter: ( + specName: string, + name: string, + ) => { writeText: (content: string) => Promise }; + }, + ) => { + const { workDir, model: modelAlias, concurrency, selectedModel } = args; + const promptfooDir = `${workDir}/evals/promptfoo`; + + // Skip if this model wasn't selected + if (selectedModel !== "all" && selectedModel !== modelAlias) { + context.logger.info( + `Skipping ${modelAlias} — not selected (selected: ${selectedModel})`, + ); + const handle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs: 0, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + } + + // Create a per-model isolated work directory. This avoids collisions + // when multiple models run in parallel (they'd otherwise clobber each + // other's promptfooconfig.yaml and results.json). + const tempDir = await Deno.makeTempDir({ + prefix: `swamp-eval-${modelAlias}-`, + }); + const configPath = `${tempDir}/promptfooconfig.yaml`; + const resultsPath = `${tempDir}/results.json`; + + context.logger.info( + `Running promptfoo eval for ${modelAlias} (concurrency=${concurrency}, tempDir=${tempDir})`, + ); + + const startTime = Date.now(); + + try { + // Step 1: Generate promptfoo config for this model. 
The generator + // prints to stdout; we capture and write to our per-model path. + context.logger.info(`Generating config for ${modelAlias}`); + const genCmd = new Deno.Command("deno", { + args: [ + "run", + "--config", + `${workDir}/deno.json`, + "--allow-read", + `${promptfooDir}/generate_config.ts`, + "--model", + modelAlias, + ], + cwd: workDir, + stdout: "piped", + stderr: "piped", + }); + const genOutput = await genCmd.output(); + if (genOutput.code !== 0) { + throw new Error( + `Config generation failed: ${ + new TextDecoder().decode(genOutput.stderr) + }`, + ); + } + await Deno.writeFile(configPath, genOutput.stdout); + + // Step 2: Run promptfoo eval with per-model config and output. + // cwd is the shared promptfooDir so node_modules is found (npm + // install runs once in the setup-npm step). + context.logger.info(`Running promptfoo eval for ${modelAlias}`); + const evalCmd = new Deno.Command("npx", { + args: [ + "promptfoo", + "eval", + "-c", + configPath, + "-j", + String(concurrency), + "--no-cache", + "-o", + resultsPath, + ], + cwd: promptfooDir, + stdout: "piped", + stderr: "piped", + }); + const evalOutput = await evalCmd.output(); + const durationMs = Date.now() - startTime; + + // promptfoo exits 100 when assertions fail (expected), other + // non-zero is a hard failure (e.g., missing API key). + const stdout = new TextDecoder().decode(evalOutput.stdout); + const stderr = new TextDecoder().decode(evalOutput.stderr); + + let rawJson: string; + try { + rawJson = await Deno.readTextFile(resultsPath); + } catch { + // No results — check if this was a graceful skip (missing key). + // promptfoo itself doesn't skip; that check is in the outer + // eval-skill-triggers wrapper. Since we bypass that wrapper, + // missing keys show up as non-zero exit without results. + const isSkip = evalOutput.code !== 0 && + (stderr.includes("API key") || stderr.includes("api key")); + if (isSkip) { + context.logger.info( + `${modelAlias} skipped — API key not configured`, + ); + const handle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + } + throw new Error( + `Eval failed — no results.json produced.\nExit code: ${evalOutput.code}\nStdout: ${stdout}\nStderr: ${stderr}`, + ); + } + + return await processResults( + rawJson, + modelAlias, + durationMs, + context, + ); + } finally { + // Always clean up the per-model temp dir + await Deno.remove(tempDir, { recursive: true }).catch(() => {}); + } + }, + }, + }, +}; + +// Helper: parse results.json and write the structured resource. +async function processResults( + rawJson: string, + modelAlias: string, + durationMs: number, + context: { + logger: { info: (msg: string) => void }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + createFileWriter: ( + specName: string, + name: string, + ) => { writeText: (content: string) => Promise }; + }, +): Promise<{ dataHandles: unknown[] }> { + const data: PromptfooOutput = JSON.parse(rawJson); + const { stats, results } = data.results; + + const total = stats.successes + stats.failures; + const passRate = total > 0 ? stats.successes / total : 0; + + const pricing = TOKEN_PRICING[modelAlias]; + const cost = pricing + ? 
(stats.tokenUsage.prompt / 1_000_000) * pricing.prompt + + (stats.tokenUsage.completion / 1_000_000) * pricing.completion + : 0; + + const failures = results + .filter((r) => !r.success) + .map((r) => { + const desc = r.testCase?.description ?? + r.testCase?.vars?.query ?? "unknown"; + const calledTool = extractToolCallName(r.response?.output); + const outputStr = typeof r.response?.output === "string" + ? r.response.output.slice(0, 80) + : calledTool + ? `routed to ${calledTool}` + : "text response (no tool call)"; + return { description: desc, output: outputStr }; + }); + + context.logger.info( + `Eval complete for ${modelAlias}: ${stats.successes}/${total} passed (${(passRate * 100).toFixed(1)}%)`, + ); + + const resultHandle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total, + passed: stats.successes, + failed: stats.failures, + errors: stats.errors ?? 0, + passRate, + tokens: { + total: stats.tokenUsage.total, + prompt: stats.tokenUsage.prompt, + completion: stats.tokenUsage.completion, + }, + cost, + durationMs, + failures, + }, + ); + + const fileWriter = context.createFileWriter( + "raw-results", + `raw-results-${modelAlias}`, + ); + await fileWriter.writeText(rawJson); + + return { dataHandles: [resultHandle] }; +} diff --git a/extensions/reports/ci_eval_analysis.ts b/extensions/reports/ci_eval_analysis.ts new file mode 100644 index 00000000..12e8e097 --- /dev/null +++ b/extensions/reports/ci_eval_analysis.ts @@ -0,0 +1,325 @@ +/** + * Cross-model eval analysis report. + * + * Workflow-scope report that reads structured eval results from all + * @swamp/ci/promptfoo-eval steps, computes cross-model failure analysis, + * and produces a markdown summary matching the GitHub Actions format. + */ + +const PASS_THRESHOLD = 0.9; + +interface EvalFailure { + description: string; + output: string; +} + +interface EvalResult { + model: string; + total: number; + passed: number; + failed: number; + errors: number; + passRate: number; + tokens: { total: number; prompt: number; completion: number }; + cost: number; + durationMs: number; + failures: EvalFailure[]; + skipped?: boolean; +} + +interface StepExecution { + jobName: string; + stepName: string; + modelName: string; + modelType: string; + methodName: string; + status: "succeeded" | "failed" | "skipped"; + dataHandles: Array<{ + name: string; + specName: string; + kind: "resource" | "file"; + dataId: string; + version: number; + size: number; + tags: Record; + }>; + methodArgs: Record; + modelId: string; + globalArgs: Record; +} + +interface ModelTypeLike { + normalized: string; + raw: string; + toDirectoryPath: () => string; + toString: () => string; + equals: (other: ModelTypeLike) => boolean; +} + +interface WorkflowReportContext { + scope: "workflow"; + workflowId: string; + workflowRunId: string; + workflowName: string; + workflowStatus: "succeeded" | "failed"; + stepExecutions: StepExecution[]; + repoDir: string; + logger: { + info: (msg: string, data?: Record) => void; + warn: (msg: string, data?: Record) => void; + }; + dataRepository: { + getContent: ( + modelType: ModelTypeLike, + modelId: string, + name: string, + version?: number, + ) => Promise; + }; +} + +// Duck-typed ModelType — matches swamp's ModelType interface without +// requiring an internal import (bundled reports can't import from src/). 
+function makeModelType(rawType: string): ModelTypeLike { + const normalized = rawType + .trim() + .toLowerCase() + .replace(/::/g, "/") + .replace(/\s+/g, "/") + .replace(/\/+/g, "/") + .replace(/^\/|\/$/g, ""); + return { + raw: rawType, + normalized, + toDirectoryPath: () => normalized, + toString: () => rawType, + equals: (other: ModelTypeLike) => other.normalized === normalized, + }; +} + +type ReportContext = WorkflowReportContext | { scope: string }; + +function isWorkflowContext(ctx: ReportContext): ctx is WorkflowReportContext { + return ctx.scope === "workflow"; +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return minutes > 0 ? `${minutes}m ${remainingSeconds}s` : `${seconds}s`; +} + +function findCrossModelFailures( + results: EvalResult[], +): Map { + const failureMap = new Map(); + for (const result of results) { + for (const failure of result.failures) { + const desc = failure.description.trim(); + if (!failureMap.has(desc)) { + failureMap.set(desc, []); + } + failureMap.get(desc)!.push(result.model); + } + } + return failureMap; +} + +export const report = { + name: "@swamp/ci/eval-analysis", + description: + "Cross-model skill trigger eval analysis — reads all promptfoo-eval results and produces a comparison report", + scope: "workflow" as const, + labels: ["ci", "eval"], + + execute: async ( + context: ReportContext, + ): Promise<{ markdown: string; json: Record }> => { + if (!isWorkflowContext(context)) { + return { + markdown: "⚠️ This report only runs at workflow scope.", + json: { error: "wrong scope" }, + }; + } + + // Find all eval-runner steps and extract their result data handles + const evalSteps = context.stepExecutions.filter( + (step) => + step.modelType === "@swamp/ci/promptfoo-eval" && + step.methodName === "run", + ); + + if (evalSteps.length === 0) { + return { + markdown: "No eval results found — no @swamp/ci/promptfoo-eval steps executed.", + json: { error: "no eval steps found" }, + }; + } + + // Read result data from each eval step + const results: EvalResult[] = []; + + for (const step of evalSteps) { + const resultHandle = step.dataHandles.find( + (h) => h.specName === "result" && h.kind === "resource", + ); + if (!resultHandle) { + context.logger.warn( + "Eval step {step} has no result data handle, skipping", + { step: step.stepName }, + ); + continue; + } + + const content = await context.dataRepository.getContent( + makeModelType(step.modelType), + step.modelId, + resultHandle.name, + resultHandle.version, + ); + if (!content) { + context.logger.warn( + "Could not read result data for step {step}", + { step: step.stepName }, + ); + continue; + } + + const parsed = JSON.parse(new TextDecoder().decode(content)); + results.push(parsed as EvalResult); + } + + // Separate skipped from actually-run results — skipped models weren't + // selected for this run and shouldn't affect the verdict. 
+ const skippedResults = results.filter((r) => r.skipped === true); + const ranResults = results.filter((r) => r.skipped !== true); + + if (ranResults.length === 0) { + return { + markdown: `No eval results to analyze — ${skippedResults.length} models were skipped.`, + json: { error: "no results loaded", skipped: skippedResults.length }, + }; + } + + // Sort by pass rate descending + ranResults.sort((a, b) => b.passRate - a.passRate); + + // Compute cross-model failures (only across models that actually ran) + const crossModelFailures = findCrossModelFailures(ranResults); + const multiModelFailures = [...crossModelFailures.entries()] + .filter(([_, models]) => models.length > 1) + .sort((a, b) => b[1].length - a[1].length); + const singleModelFailures = [...crossModelFailures.entries()] + .filter(([_, models]) => models.length === 1); + + // Compute verdict — only models that actually ran count toward pass/fail + const allPassed = ranResults.every((r) => r.passRate >= PASS_THRESHOLD); + const failingModels = ranResults + .filter((r) => r.passRate < PASS_THRESHOLD) + .map((r) => r.model); + + // This report only renders cross-model comparison — it needs 2+ models + // to be meaningful. When a single model runs, @swamp/ci/eval-result + // (method-scope) handles individual results instead. + if (ranResults.length < 2) { + return { + markdown: + `_Cross-model analysis skipped — only ${ranResults.length} model(s) evaluated. See individual @swamp/ci/eval-result reports for per-model details._\n`, + json: { + skipped: true, + reason: "cross-model analysis requires 2+ models", + modelsRun: ranResults.map((r) => r.model), + modelsSkipped: skippedResults.map((r) => r.model), + }, + }; + } + + // Build markdown report + let md = "## Cross-Model Skill Trigger Eval Analysis\n\n"; + + // Results table + md += "### Results\n\n"; + md += + "| Model | Pass Rate | Passed | Failed | Tokens | Cost | Duration | Status |\n"; + md += + "|-------|-----------|--------|--------|--------|------|----------|--------|\n"; + for (const r of ranResults) { + const status = r.passRate >= PASS_THRESHOLD ? 
"✅ Pass" : "❌ Fail"; + md += `| ${r.model} | ${(r.passRate * 100).toFixed(1)}% | ${r.passed} | ${r.failed} | ${r.tokens.total.toLocaleString()} | $${r.cost.toFixed(2)} | ${formatDuration(r.durationMs)} | ${status} |\n`; + } + + if (skippedResults.length > 0) { + md += `\n_${skippedResults.length} model(s) were not selected for this run: ${skippedResults.map((r) => r.model).join(", ")}_\n`; + } + + // Cross-model failures + if (multiModelFailures.length > 0) { + md += "\n### Cross-Model Failures\n\n"; + md += + "These tests fail on multiple models, suggesting skill description issues:\n\n"; + md += "| Test | Models Failing | Count |\n"; + md += "|------|---------------|-------|\n"; + for (const [desc, models] of multiModelFailures) { + const escapedDesc = desc.replace(/\|/g, "\\|"); + md += + `| ${escapedDesc} | ${models.join(", ")} | ${models.length}/${ranResults.length} |\n`; + } + } + + // Model-specific failures + if (singleModelFailures.length > 0) { + md += "\n### Model-Specific Failures\n\n"; + md += + "These tests fail on only one model, suggesting model-specific quirks:\n\n"; + md += "| Test | Model |\n"; + md += "|------|-------|\n"; + for (const [desc, models] of singleModelFailures) { + const escapedDesc = desc.replace(/\|/g, "\\|"); + md += `| ${escapedDesc} | ${models[0]} |\n`; + } + } + + // Verdict + md += "\n### Verdict\n\n"; + if (allPassed) { + md += `✅ **All evaluated models pass** the 90% threshold (${ranResults.length}/${ranResults.length}).\n`; + } else { + md += + "❌ **Action required** — the following models are below the 90% threshold:\n\n"; + for (const model of failingModels) { + const r = ranResults.find((r) => r.model === model)!; + md += `- **${model}**: ${(r.passRate * 100).toFixed(1)}%\n`; + } + } + + // Build JSON report + const json = { + verdict: allPassed ? "pass" : "fail", + threshold: PASS_THRESHOLD, + failingModels, + skippedModels: skippedResults.map((r) => r.model), + models: ranResults.map((r) => ({ + model: r.model, + passRate: r.passRate, + passed: r.passed, + failed: r.failed, + tokens: r.tokens.total, + cost: r.cost, + durationMs: r.durationMs, + status: r.passRate >= PASS_THRESHOLD ? "pass" : "fail", + })), + crossModelFailures: multiModelFailures.map(([test, models]) => ({ + test, + models, + count: models.length, + })), + modelSpecificFailures: singleModelFailures.map(([test, models]) => ({ + test, + model: models[0], + })), + }; + + return { markdown: md, json }; + }, +}; diff --git a/extensions/reports/ci_eval_result.ts b/extensions/reports/ci_eval_result.ts new file mode 100644 index 00000000..6772a583 --- /dev/null +++ b/extensions/reports/ci_eval_result.ts @@ -0,0 +1,185 @@ +/** + * Per-model eval result report. + * + * Method-scope report that runs after each @swamp/ci/promptfoo-eval step + * and renders that specific model's results in GHA per-job summary format. 
+ */ + +interface EvalFailure { + description: string; + output: string; +} + +interface EvalResult { + model: string; + total: number; + passed: number; + failed: number; + errors: number; + passRate: number; + tokens: { total: number; prompt: number; completion: number }; + cost: number; + durationMs: number; + failures: EvalFailure[]; + skipped?: boolean; +} + +interface DataHandle { + name: string; + specName: string; + kind: "resource" | "file"; + dataId: string; + version: number; +} + +interface ModelTypeLike { + normalized: string; + raw: string; + toDirectoryPath: () => string; + toString: () => string; + equals: (other: ModelTypeLike) => boolean; +} + +interface MethodReportContext { + scope: "method"; + modelType: ModelTypeLike; + modelId: string; + definition: { + id: string; + name: string; + version: number; + tags: Record; + }; + methodName: string; + executionStatus: "succeeded" | "failed"; + dataHandles: DataHandle[]; + dataRepository: { + getContent: ( + modelType: ModelTypeLike, + modelId: string, + name: string, + version?: number, + ) => Promise; + }; + logger: { + info: (msg: string) => void; + warn: (msg: string) => void; + }; +} + +type ReportContext = MethodReportContext | { scope: string }; + +function isMethodContext(ctx: ReportContext): ctx is MethodReportContext { + return ctx.scope === "method"; +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return minutes > 0 ? `${minutes}m ${remainingSeconds}s` : `${seconds}s`; +} + +export const report = { + name: "@swamp/ci/eval-result", + description: + "Per-model skill trigger eval results — runs after each @swamp/ci/promptfoo-eval step", + scope: "method" as const, + labels: ["ci", "eval"], + + execute: async ( + context: ReportContext, + ): Promise<{ markdown: string; json: Record }> => { + if (!isMethodContext(context)) { + return { + markdown: "⚠️ This report only runs at method scope.", + json: { error: "wrong scope" }, + }; + } + + // Only applies to @swamp/ci/promptfoo-eval model runs + if (context.modelType.normalized !== "@swamp/ci/promptfoo-eval") { + return { + markdown: "", + json: { skipped: true, reason: "wrong model type" }, + }; + } + + // Find the result resource handle + const resultHandle = context.dataHandles.find( + (h) => h.specName === "result" && h.kind === "resource", + ); + + if (!resultHandle) { + return { + markdown: "_No eval result produced._", + json: { error: "no result handle" }, + }; + } + + // Load the structured result data + const content = await context.dataRepository.getContent( + context.modelType, + context.modelId, + resultHandle.name, + resultHandle.version, + ); + + if (!content) { + return { + markdown: "_Could not load eval result data._", + json: { error: "no content" }, + }; + } + + const r: EvalResult = JSON.parse(new TextDecoder().decode(content)); + + // Skipped models get a minimal summary + if (r.skipped) { + return { + markdown: `## Skill Trigger Eval Results (${r.model})\n\n_Skipped — not selected for this run._\n`, + json: { model: r.model, skipped: true }, + }; + } + + // Build the full per-model summary (GHA per-job style) + let md = `## Skill Trigger Eval Results (${r.model})\n\n`; + md += "| Metric | Value |\n|---|---|\n"; + md += `| Model | ${r.model} |\n`; + md += `| Total tests | ${r.total} |\n`; + md += `| Passed | ${r.passed} |\n`; + md += `| Failed | ${r.failed} |\n`; + md += `| Pass rate | ${(r.passRate * 100).toFixed(1)}% 
|\n`; + md += `| Estimated cost | $${r.cost.toFixed(2)} |\n`; + md += `| Tokens | ${r.tokens.total.toLocaleString()} |\n`; + md += `| Duration | ${formatDuration(r.durationMs)} |\n`; + + if (r.failures.length > 0) { + md += "\n### Failed Tests\n\n"; + md += "| Test | Output |\n|---|---|\n"; + for (const f of r.failures) { + const escapedDesc = f.description.replace(/\|/g, "\\|"); + const escapedOutput = f.output + .replace(/\|/g, "\\|") + .replace(/\n/g, " "); + md += `| ${escapedDesc} | ${escapedOutput} |\n`; + } + } + + return { + markdown: md, + json: { + model: r.model, + total: r.total, + passed: r.passed, + failed: r.failed, + passRate: r.passRate, + cost: r.cost, + tokens: r.tokens.total, + durationMs: r.durationMs, + status: r.passRate >= 0.9 ? "pass" : "fail", + failures: r.failures, + }, + }; + }, +};
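
Reviewer note (not part of the patch): a minimal TypeScript sketch of how the eval runner's cost estimate is derived, mirroring the formula in `processResults` and the per-million-token `TOKEN_PRICING` rates; the token counts below are hypothetical illustration values.

```ts
// Illustration only — applies the "sonnet" row of TOKEN_PRICING (USD per 1M tokens)
// to made-up prompt/completion usage totals, as processResults does.
const pricing = { prompt: 3.0, completion: 15.0 };
const tokenUsage = { prompt: 250_000, completion: 40_000 };

const cost = (tokenUsage.prompt / 1_000_000) * pricing.prompt +
  (tokenUsage.completion / 1_000_000) * pricing.completion;

console.log(`estimated cost: $${cost.toFixed(2)}`); // estimated cost: $1.35
```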