diff --git a/extensions/models/ci_git.ts b/extensions/models/ci_git.ts new file mode 100644 index 00000000..fd6e2f7e --- /dev/null +++ b/extensions/models/ci_git.ts @@ -0,0 +1,328 @@ +import { z } from "npm:zod@4"; + +const GlobalArgsSchema = z.object({ + url: z.string().describe("Git repository URL"), + workDir: z.string().optional().describe( + "Base directory for clones (defaults to a temporary directory)", + ), +}); + +const RepositorySchema = z.object({ + path: z.string(), + sha: z.string(), + branch: z.string(), + remote: z.string(), + ref: z.string(), +}); + +const DiffFileSchema = z.object({ + path: z.string(), + status: z.string(), + additions: z.number(), + deletions: z.number(), +}); + +const DiffSchema = z.object({ + base: z.string(), + head: z.string(), + files: z.array(DiffFileSchema), + totalAdditions: z.number(), + totalDeletions: z.number(), + filesChanged: z.number(), +}); + +async function runGit( + args: string[], + opts?: { cwd?: string }, +): Promise { + const cmd = new Deno.Command("git", { + args, + cwd: opts?.cwd, + stdout: "piped", + stderr: "piped", + }); + const output = await cmd.output(); + if (output.code !== 0) { + const stderr = new TextDecoder().decode(output.stderr); + throw new Error(`git ${args[0]} failed: ${stderr}`); + } + return new TextDecoder().decode(output.stdout).trim(); +} + +export const model = { + type: "@swamp/ci/git", + version: "2026.04.10.1", + globalArguments: GlobalArgsSchema, + resources: { + "repository": { + description: "Cloned or checked-out repository state", + schema: RepositorySchema, + lifetime: "infinite", + garbageCollection: 5, + }, + "diff": { + description: "Diff between two refs", + schema: DiffSchema, + lifetime: "infinite", + garbageCollection: 5, + }, + }, + methods: { + clone: { + description: "Clone a git repository (idempotent — skips if directory already exists)", + arguments: z.object({ + ref: z.string().optional().describe("Branch, tag, or commit to checkout after clone"), + depth: z.number().optional().describe("Shallow clone depth"), + }), + execute: async ( + args: { ref?: string; depth?: number }, + context: { + globalArgs: { url: string; workDir?: string }; + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + const { url, workDir } = context.globalArgs; + const repoName = url.split("/").pop()?.replace(/\.git$/, "") ?? "repo"; + const baseDir = workDir ?? 
await Deno.makeTempDir({ prefix: "swamp-ci-" });
+        const repoPath = `${baseDir}/${repoName}`;
+
+        context.logger.info(`Cloning ${url} to ${repoPath}`);
+
+        // `git clone --branch` only accepts branch/tag names, so a commit-SHA
+        // ref is checked out in a separate step after the clone completes.
+        const refIsCommitSha = args.ref !== undefined &&
+          /^[0-9a-f]{7,40}$/.test(args.ref);
+
+        // Skip if already cloned
+        try {
+          const stat = await Deno.stat(repoPath);
+          if (stat.isDirectory) {
+            context.logger.info(
+              `Repository already exists at ${repoPath}, fetching latest`,
+            );
+            await runGit(["fetch", "--all"], { cwd: repoPath });
+            if (args.ref) {
+              await runGit(["checkout", args.ref], { cwd: repoPath });
+            }
+          }
+        } catch {
+          // Directory doesn't exist, proceed with clone
+          const cloneArgs = ["clone"];
+          if (args.depth) {
+            cloneArgs.push("--depth", String(args.depth));
+          }
+          if (args.ref && !refIsCommitSha) {
+            cloneArgs.push("--branch", args.ref);
+          }
+          cloneArgs.push(url, repoPath);
+          await runGit(cloneArgs);
+        }
+
+        // If ref is a specific commit (not a branch/tag), checkout after clone
+        if (args.ref && refIsCommitSha) {
+          await runGit(["checkout", args.ref], { cwd: repoPath });
+        }
+
+        const sha = await runGit(["rev-parse", "HEAD"], { cwd: repoPath });
+        const branch = await runGit(
+          ["rev-parse", "--abbrev-ref", "HEAD"],
+          { cwd: repoPath },
+        ).catch(() => "HEAD");
+        const remote = await runGit(
+          ["remote", "get-url", "origin"],
+          { cwd: repoPath },
+        ).catch(() => url);
+
+        context.logger.info(`Cloned ${url} at ${sha}`);
+
+        const handle = await context.writeResource("repository", "repository", {
+          path: repoPath,
+          sha,
+          branch,
+          remote,
+          ref: args.ref ?? branch,
+        });
+        return { dataHandles: [handle] };
+      },
+    },
+
+    checkout: {
+      description: "Checkout a specific ref in an existing repository",
+      arguments: z.object({
+        path: z.string().describe("Path to the git repository"),
+        ref: z.string().describe("Branch, tag, or commit SHA to checkout"),
+      }),
+      execute: async (
+        args: { path: string; ref: string },
+        context: {
+          globalArgs: { url: string };
+          logger: {
+            info: (msg: string, data?: Record) => void;
+          };
+          writeResource: (
+            specName: string,
+            name: string,
+            data: Record,
+          ) => Promise;
+        },
+      ) => {
+        context.logger.info(`Checking out ${args.ref} in ${args.path}`);
+
+        await runGit(["checkout", args.ref], { cwd: args.path });
+
+        const sha = await runGit(["rev-parse", "HEAD"], { cwd: args.path });
+        const branch = await runGit(
+          ["rev-parse", "--abbrev-ref", "HEAD"],
+          { cwd: args.path },
+        ).catch(() => "HEAD");
+        const remote = await runGit(
+          ["remote", "get-url", "origin"],
+          { cwd: args.path },
+        ).catch(() => context.globalArgs.url);
+
+        context.logger.info(`Checked out ${args.ref} at ${sha}`);
+
+        const handle = await context.writeResource("repository", "repository", {
+          path: args.path,
+          sha,
+          branch,
+          remote,
+          ref: args.ref,
+        });
+        return { dataHandles: [handle] };
+      },
+    },
+
+    fetch: {
+      description: "Fetch latest refs from a remote",
+      arguments: z.object({
+        path: z.string().describe("Path to the git repository"),
+        remote: z.string().default("origin").describe("Remote name"),
+        ref: z.string().optional().describe("Specific ref to fetch"),
+      }),
+      execute: async (
+        args: { path: string; remote: string; ref?: string },
+        context: {
+          globalArgs: { url: string };
+          logger: {
+            info: (msg: string, data?: Record) => void;
+          };
+          writeResource: (
+            specName: string,
+            name: string,
+            data: Record,
+          ) => Promise;
+        },
+      ) => {
+        const fetchArgs = ["fetch", args.remote];
+        if (args.ref) {
+          fetchArgs.push(args.ref);
+        }
+
+        context.logger.info(`Fetching from ${args.remote}`);
+
+        await runGit(fetchArgs, { cwd: args.path });
+
+        const sha = await runGit(["rev-parse", "HEAD"],
{ cwd: args.path }); + const branch = await runGit( + ["rev-parse", "--abbrev-ref", "HEAD"], + { cwd: args.path }, + ).catch(() => "HEAD"); + const remote = await runGit( + ["remote", "get-url", args.remote], + { cwd: args.path }, + ).catch(() => context.globalArgs.url); + + const handle = await context.writeResource("repository", "repository", { + path: args.path, + sha, + branch, + remote, + ref: branch, + }); + return { dataHandles: [handle] }; + }, + }, + + diff: { + description: "Show diff stats between two refs", + arguments: z.object({ + path: z.string().describe("Path to the git repository"), + base: z.string().describe("Base ref (branch, tag, or SHA)"), + head: z.string().default("HEAD").describe("Head ref to compare against"), + }), + execute: async ( + args: { path: string; base: string; head: string }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + context.logger.info(`Computing diff ${args.base}..${args.head}`); + + const numstat = await runGit( + ["diff", "--numstat", `${args.base}...${args.head}`], + { cwd: args.path }, + ); + + const files = numstat + .split("\n") + .filter((line) => line.trim()) + .map((line) => { + const [additions, deletions, path] = line.split("\t"); + return { + path: path ?? "", + status: "modified", + additions: additions === "-" ? 0 : parseInt(additions, 10), + deletions: deletions === "-" ? 0 : parseInt(deletions, 10), + }; + }); + + const totalAdditions = files.reduce((sum, f) => sum + f.additions, 0); + const totalDeletions = files.reduce((sum, f) => sum + f.deletions, 0); + + context.logger.info( + `Diff: ${files.length} files, +${totalAdditions} -${totalDeletions}`, + ); + + const handle = await context.writeResource("diff", "diff", { + base: args.base, + head: args.head, + files, + totalAdditions, + totalDeletions, + filesChanged: files.length, + }); + return { dataHandles: [handle] }; + }, + }, + + clean: { + description: "Remove a cloned repository directory", + arguments: z.object({ + path: z.string().describe("Path to the repository to remove"), + }), + execute: async ( + args: { path: string }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + }, + ) => { + context.logger.info(`Removing ${args.path}`); + await Deno.remove(args.path, { recursive: true }); + context.logger.info(`Cleaned up ${args.path}`); + return { dataHandles: [] }; + }, + }, + }, +}; diff --git a/extensions/models/ci_promptfoo_eval.ts b/extensions/models/ci_promptfoo_eval.ts new file mode 100644 index 00000000..1545e610 --- /dev/null +++ b/extensions/models/ci_promptfoo_eval.ts @@ -0,0 +1,432 @@ +import { z } from "npm:zod@4"; + +const TOKEN_PRICING: Record = { + "sonnet": { prompt: 3.0, completion: 15.0 }, + "opus": { prompt: 15.0, completion: 75.0 }, + "gpt-5.4": { prompt: 2.0, completion: 8.0 }, + "gemini-2.5-pro": { prompt: 1.25, completion: 10.0 }, +}; + +const API_KEY_ENV: Record = { + "sonnet": "ANTHROPIC_API_KEY", + "opus": "ANTHROPIC_API_KEY", + "gpt-5.4": "OPENAI_API_KEY", + "gemini-2.5-pro": "GOOGLE_API_KEY", +}; + +const FailureSchema = z.object({ + description: z.string(), + output: z.string(), +}); + +const ResultSchema = z.object({ + model: z.string(), + total: z.number(), + passed: z.number(), + failed: z.number(), + errors: z.number(), + passRate: z.number(), + tokens: z.object({ + total: z.number(), + prompt: z.number(), + completion: z.number(), + }), + cost: z.number(), + durationMs: 
z.number(), + failures: z.array(FailureSchema), +}); + +interface EvalStats { + successes: number; + failures: number; + errors: number; + tokenUsage: { + total: number; + prompt: number; + completion: number; + cached: number; + }; + durationMs: number; +} + +interface EvalResult { + success: boolean; + testCase?: { + description?: string; + vars?: Record; + }; + response?: { + output?: unknown; + }; +} + +interface PromptfooOutput { + results: { + stats: EvalStats; + results: EvalResult[]; + }; +} + +function extractToolCallName(output: unknown): string | undefined { + if (!output) return undefined; + if (Array.isArray(output)) { + for (const item of output) { + if (item?.function?.name) return item.function.name; + if (item?.functionCall?.name) return item.functionCall.name; + if (item?.name) return item.name; + } + } + if (typeof output === "object" && output !== null) { + const obj = output as Record; + if (obj.function && typeof obj.function === "object") { + return (obj.function as Record).name as string; + } + if (obj.functionCall && typeof obj.functionCall === "object") { + return (obj.functionCall as Record).name as string; + } + } + return undefined; +} + +export const model = { + type: "@swamp/ci/promptfoo-eval", + version: "2026.04.10.1", + globalArguments: z.object({}), + reports: ["@swamp/ci/eval-result"], + resources: { + "result": { + description: "Structured eval results for a single model", + schema: ResultSchema, + lifetime: "infinite", + garbageCollection: 10, + }, + }, + files: { + "raw-results": { + description: "Full promptfoo results.json output", + contentType: "application/json", + lifetime: "infinite", + garbageCollection: 5, + }, + }, + methods: { + setupNpm: { + description: + "Install promptfoo npm dependencies once in the shared workDir. 
Run this before parallel eval steps to avoid npm install races.", + arguments: z.object({ + workDir: z.string().describe("Path to the swamp repository checkout"), + }), + execute: async ( + args: { workDir: string }, + context: { + logger: { info: (msg: string) => void }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + }, + ) => { + const configDir = `${args.workDir}/evals/promptfoo`; + context.logger.info(`Installing promptfoo dependencies in ${configDir}`); + + const installCmd = new Deno.Command("npm", { + args: ["install", "--package-lock=false"], + cwd: configDir, + stdout: "piped", + stderr: "piped", + }); + const result = await installCmd.output(); + + if (result.code !== 0) { + const stderr = new TextDecoder().decode(result.stderr); + throw new Error(`npm install failed: ${stderr}`); + } + + context.logger.info(`Promptfoo dependencies installed`); + const handle = await context.writeResource( + "result", + "npm-install-marker", + { + model: "setup", + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs: 0, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + }, + }, + run: { + description: + "Run promptfoo skill trigger evals for a specific model and capture structured results", + arguments: z.object({ + workDir: z.string().describe("Path to the swamp repository checkout"), + model: z.string().describe( + "Model alias to evaluate (sonnet, opus, gpt-5.4, gemini-2.5-pro)", + ), + concurrency: z.number().default(20).describe( + "Number of concurrent eval calls", + ), + selectedModel: z.string().default("all").describe( + "Filter: only run if model matches this value, or 'all' to run every model", + ), + }), + execute: async ( + args: { + workDir: string; + model: string; + concurrency: number; + selectedModel: string; + }, + context: { + logger: { + info: (msg: string, data?: Record) => void; + }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + createFileWriter: ( + specName: string, + name: string, + ) => { writeText: (content: string) => Promise }; + }, + ) => { + const { workDir, model: modelAlias, concurrency, selectedModel } = args; + const promptfooDir = `${workDir}/evals/promptfoo`; + + // Skip if this model wasn't selected + if (selectedModel !== "all" && selectedModel !== modelAlias) { + context.logger.info( + `Skipping ${modelAlias} — not selected (selected: ${selectedModel})`, + ); + const handle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs: 0, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + } + + // Create a per-model isolated work directory. This avoids collisions + // when multiple models run in parallel (they'd otherwise clobber each + // other's promptfooconfig.yaml and results.json). + const tempDir = await Deno.makeTempDir({ + prefix: `swamp-eval-${modelAlias}-`, + }); + const configPath = `${tempDir}/promptfooconfig.yaml`; + const resultsPath = `${tempDir}/results.json`; + + context.logger.info( + `Running promptfoo eval for ${modelAlias} (concurrency=${concurrency}, tempDir=${tempDir})`, + ); + + const startTime = Date.now(); + + try { + // Step 1: Generate promptfoo config for this model. 
The generator + // prints to stdout; we capture and write to our per-model path. + context.logger.info(`Generating config for ${modelAlias}`); + const genCmd = new Deno.Command("deno", { + args: [ + "run", + "--config", + `${workDir}/deno.json`, + "--allow-read", + `${promptfooDir}/generate_config.ts`, + "--model", + modelAlias, + ], + cwd: workDir, + stdout: "piped", + stderr: "piped", + }); + const genOutput = await genCmd.output(); + if (genOutput.code !== 0) { + throw new Error( + `Config generation failed: ${ + new TextDecoder().decode(genOutput.stderr) + }`, + ); + } + await Deno.writeFile(configPath, genOutput.stdout); + + // Step 2: Run promptfoo eval with per-model config and output. + // cwd is the shared promptfooDir so node_modules is found (npm + // install runs once in the setup-npm step). + context.logger.info(`Running promptfoo eval for ${modelAlias}`); + const evalCmd = new Deno.Command("npx", { + args: [ + "promptfoo", + "eval", + "-c", + configPath, + "-j", + String(concurrency), + "--no-cache", + "-o", + resultsPath, + ], + cwd: promptfooDir, + stdout: "piped", + stderr: "piped", + }); + const evalOutput = await evalCmd.output(); + const durationMs = Date.now() - startTime; + + // promptfoo exits 100 when assertions fail (expected), other + // non-zero is a hard failure (e.g., missing API key). + const stdout = new TextDecoder().decode(evalOutput.stdout); + const stderr = new TextDecoder().decode(evalOutput.stderr); + + let rawJson: string; + try { + rawJson = await Deno.readTextFile(resultsPath); + } catch { + // No results — check if this was a graceful skip (missing key). + // promptfoo itself doesn't skip; that check is in the outer + // eval-skill-triggers wrapper. Since we bypass that wrapper, + // missing keys show up as non-zero exit without results. + const isSkip = evalOutput.code !== 0 && + (stderr.includes("API key") || stderr.includes("api key")); + if (isSkip) { + context.logger.info( + `${modelAlias} skipped — API key not configured`, + ); + const handle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total: 0, + passed: 0, + failed: 0, + errors: 0, + passRate: 0, + tokens: { total: 0, prompt: 0, completion: 0 }, + cost: 0, + durationMs, + failures: [], + skipped: true, + }, + ); + return { dataHandles: [handle] }; + } + throw new Error( + `Eval failed — no results.json produced.\nExit code: ${evalOutput.code}\nStdout: ${stdout}\nStderr: ${stderr}`, + ); + } + + return await processResults( + rawJson, + modelAlias, + durationMs, + context, + ); + } finally { + // Always clean up the per-model temp dir + await Deno.remove(tempDir, { recursive: true }).catch(() => {}); + } + }, + }, + }, +}; + +// Helper: parse results.json and write the structured resource. +async function processResults( + rawJson: string, + modelAlias: string, + durationMs: number, + context: { + logger: { info: (msg: string) => void }; + writeResource: ( + specName: string, + name: string, + data: Record, + ) => Promise; + createFileWriter: ( + specName: string, + name: string, + ) => { writeText: (content: string) => Promise }; + }, +): Promise<{ dataHandles: unknown[] }> { + const data: PromptfooOutput = JSON.parse(rawJson); + const { stats, results } = data.results; + + const total = stats.successes + stats.failures; + const passRate = total > 0 ? stats.successes / total : 0; + + const pricing = TOKEN_PRICING[modelAlias]; + const cost = pricing + ? 
(stats.tokenUsage.prompt / 1_000_000) * pricing.prompt + + (stats.tokenUsage.completion / 1_000_000) * pricing.completion + : 0; + + const failures = results + .filter((r) => !r.success) + .map((r) => { + const desc = r.testCase?.description ?? + r.testCase?.vars?.query ?? "unknown"; + const calledTool = extractToolCallName(r.response?.output); + const outputStr = typeof r.response?.output === "string" + ? r.response.output.slice(0, 80) + : calledTool + ? `routed to ${calledTool}` + : "text response (no tool call)"; + return { description: desc, output: outputStr }; + }); + + context.logger.info( + `Eval complete for ${modelAlias}: ${stats.successes}/${total} passed (${(passRate * 100).toFixed(1)}%)`, + ); + + const resultHandle = await context.writeResource( + "result", + `result-${modelAlias}`, + { + model: modelAlias, + total, + passed: stats.successes, + failed: stats.failures, + errors: stats.errors ?? 0, + passRate, + tokens: { + total: stats.tokenUsage.total, + prompt: stats.tokenUsage.prompt, + completion: stats.tokenUsage.completion, + }, + cost, + durationMs, + failures, + }, + ); + + const fileWriter = context.createFileWriter( + "raw-results", + `raw-results-${modelAlias}`, + ); + await fileWriter.writeText(rawJson); + + return { dataHandles: [resultHandle] }; +} diff --git a/extensions/reports/ci_eval_analysis.ts b/extensions/reports/ci_eval_analysis.ts new file mode 100644 index 00000000..12e8e097 --- /dev/null +++ b/extensions/reports/ci_eval_analysis.ts @@ -0,0 +1,325 @@ +/** + * Cross-model eval analysis report. + * + * Workflow-scope report that reads structured eval results from all + * @swamp/ci/promptfoo-eval steps, computes cross-model failure analysis, + * and produces a markdown summary matching the GitHub Actions format. + */ + +const PASS_THRESHOLD = 0.9; + +interface EvalFailure { + description: string; + output: string; +} + +interface EvalResult { + model: string; + total: number; + passed: number; + failed: number; + errors: number; + passRate: number; + tokens: { total: number; prompt: number; completion: number }; + cost: number; + durationMs: number; + failures: EvalFailure[]; + skipped?: boolean; +} + +interface StepExecution { + jobName: string; + stepName: string; + modelName: string; + modelType: string; + methodName: string; + status: "succeeded" | "failed" | "skipped"; + dataHandles: Array<{ + name: string; + specName: string; + kind: "resource" | "file"; + dataId: string; + version: number; + size: number; + tags: Record; + }>; + methodArgs: Record; + modelId: string; + globalArgs: Record; +} + +interface ModelTypeLike { + normalized: string; + raw: string; + toDirectoryPath: () => string; + toString: () => string; + equals: (other: ModelTypeLike) => boolean; +} + +interface WorkflowReportContext { + scope: "workflow"; + workflowId: string; + workflowRunId: string; + workflowName: string; + workflowStatus: "succeeded" | "failed"; + stepExecutions: StepExecution[]; + repoDir: string; + logger: { + info: (msg: string, data?: Record) => void; + warn: (msg: string, data?: Record) => void; + }; + dataRepository: { + getContent: ( + modelType: ModelTypeLike, + modelId: string, + name: string, + version?: number, + ) => Promise; + }; +} + +// Duck-typed ModelType — matches swamp's ModelType interface without +// requiring an internal import (bundled reports can't import from src/). 
+function makeModelType(rawType: string): ModelTypeLike { + const normalized = rawType + .trim() + .toLowerCase() + .replace(/::/g, "/") + .replace(/\s+/g, "/") + .replace(/\/+/g, "/") + .replace(/^\/|\/$/g, ""); + return { + raw: rawType, + normalized, + toDirectoryPath: () => normalized, + toString: () => rawType, + equals: (other: ModelTypeLike) => other.normalized === normalized, + }; +} + +type ReportContext = WorkflowReportContext | { scope: string }; + +function isWorkflowContext(ctx: ReportContext): ctx is WorkflowReportContext { + return ctx.scope === "workflow"; +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return minutes > 0 ? `${minutes}m ${remainingSeconds}s` : `${seconds}s`; +} + +function findCrossModelFailures( + results: EvalResult[], +): Map { + const failureMap = new Map(); + for (const result of results) { + for (const failure of result.failures) { + const desc = failure.description.trim(); + if (!failureMap.has(desc)) { + failureMap.set(desc, []); + } + failureMap.get(desc)!.push(result.model); + } + } + return failureMap; +} + +export const report = { + name: "@swamp/ci/eval-analysis", + description: + "Cross-model skill trigger eval analysis — reads all promptfoo-eval results and produces a comparison report", + scope: "workflow" as const, + labels: ["ci", "eval"], + + execute: async ( + context: ReportContext, + ): Promise<{ markdown: string; json: Record }> => { + if (!isWorkflowContext(context)) { + return { + markdown: "⚠️ This report only runs at workflow scope.", + json: { error: "wrong scope" }, + }; + } + + // Find all eval-runner steps and extract their result data handles + const evalSteps = context.stepExecutions.filter( + (step) => + step.modelType === "@swamp/ci/promptfoo-eval" && + step.methodName === "run", + ); + + if (evalSteps.length === 0) { + return { + markdown: "No eval results found — no @swamp/ci/promptfoo-eval steps executed.", + json: { error: "no eval steps found" }, + }; + } + + // Read result data from each eval step + const results: EvalResult[] = []; + + for (const step of evalSteps) { + const resultHandle = step.dataHandles.find( + (h) => h.specName === "result" && h.kind === "resource", + ); + if (!resultHandle) { + context.logger.warn( + "Eval step {step} has no result data handle, skipping", + { step: step.stepName }, + ); + continue; + } + + const content = await context.dataRepository.getContent( + makeModelType(step.modelType), + step.modelId, + resultHandle.name, + resultHandle.version, + ); + if (!content) { + context.logger.warn( + "Could not read result data for step {step}", + { step: step.stepName }, + ); + continue; + } + + const parsed = JSON.parse(new TextDecoder().decode(content)); + results.push(parsed as EvalResult); + } + + // Separate skipped from actually-run results — skipped models weren't + // selected for this run and shouldn't affect the verdict. 
+ const skippedResults = results.filter((r) => r.skipped === true); + const ranResults = results.filter((r) => r.skipped !== true); + + if (ranResults.length === 0) { + return { + markdown: `No eval results to analyze — ${skippedResults.length} models were skipped.`, + json: { error: "no results loaded", skipped: skippedResults.length }, + }; + } + + // Sort by pass rate descending + ranResults.sort((a, b) => b.passRate - a.passRate); + + // Compute cross-model failures (only across models that actually ran) + const crossModelFailures = findCrossModelFailures(ranResults); + const multiModelFailures = [...crossModelFailures.entries()] + .filter(([_, models]) => models.length > 1) + .sort((a, b) => b[1].length - a[1].length); + const singleModelFailures = [...crossModelFailures.entries()] + .filter(([_, models]) => models.length === 1); + + // Compute verdict — only models that actually ran count toward pass/fail + const allPassed = ranResults.every((r) => r.passRate >= PASS_THRESHOLD); + const failingModels = ranResults + .filter((r) => r.passRate < PASS_THRESHOLD) + .map((r) => r.model); + + // This report only renders cross-model comparison — it needs 2+ models + // to be meaningful. When a single model runs, @swamp/ci/eval-result + // (method-scope) handles individual results instead. + if (ranResults.length < 2) { + return { + markdown: + `_Cross-model analysis skipped — only ${ranResults.length} model(s) evaluated. See individual @swamp/ci/eval-result reports for per-model details._\n`, + json: { + skipped: true, + reason: "cross-model analysis requires 2+ models", + modelsRun: ranResults.map((r) => r.model), + modelsSkipped: skippedResults.map((r) => r.model), + }, + }; + } + + // Build markdown report + let md = "## Cross-Model Skill Trigger Eval Analysis\n\n"; + + // Results table + md += "### Results\n\n"; + md += + "| Model | Pass Rate | Passed | Failed | Tokens | Cost | Duration | Status |\n"; + md += + "|-------|-----------|--------|--------|--------|------|----------|--------|\n"; + for (const r of ranResults) { + const status = r.passRate >= PASS_THRESHOLD ? 
"✅ Pass" : "❌ Fail"; + md += `| ${r.model} | ${(r.passRate * 100).toFixed(1)}% | ${r.passed} | ${r.failed} | ${r.tokens.total.toLocaleString()} | $${r.cost.toFixed(2)} | ${formatDuration(r.durationMs)} | ${status} |\n`; + } + + if (skippedResults.length > 0) { + md += `\n_${skippedResults.length} model(s) were not selected for this run: ${skippedResults.map((r) => r.model).join(", ")}_\n`; + } + + // Cross-model failures + if (multiModelFailures.length > 0) { + md += "\n### Cross-Model Failures\n\n"; + md += + "These tests fail on multiple models, suggesting skill description issues:\n\n"; + md += "| Test | Models Failing | Count |\n"; + md += "|------|---------------|-------|\n"; + for (const [desc, models] of multiModelFailures) { + const escapedDesc = desc.replace(/\|/g, "\\|"); + md += + `| ${escapedDesc} | ${models.join(", ")} | ${models.length}/${ranResults.length} |\n`; + } + } + + // Model-specific failures + if (singleModelFailures.length > 0) { + md += "\n### Model-Specific Failures\n\n"; + md += + "These tests fail on only one model, suggesting model-specific quirks:\n\n"; + md += "| Test | Model |\n"; + md += "|------|-------|\n"; + for (const [desc, models] of singleModelFailures) { + const escapedDesc = desc.replace(/\|/g, "\\|"); + md += `| ${escapedDesc} | ${models[0]} |\n`; + } + } + + // Verdict + md += "\n### Verdict\n\n"; + if (allPassed) { + md += `✅ **All evaluated models pass** the 90% threshold (${ranResults.length}/${ranResults.length}).\n`; + } else { + md += + "❌ **Action required** — the following models are below the 90% threshold:\n\n"; + for (const model of failingModels) { + const r = ranResults.find((r) => r.model === model)!; + md += `- **${model}**: ${(r.passRate * 100).toFixed(1)}%\n`; + } + } + + // Build JSON report + const json = { + verdict: allPassed ? "pass" : "fail", + threshold: PASS_THRESHOLD, + failingModels, + skippedModels: skippedResults.map((r) => r.model), + models: ranResults.map((r) => ({ + model: r.model, + passRate: r.passRate, + passed: r.passed, + failed: r.failed, + tokens: r.tokens.total, + cost: r.cost, + durationMs: r.durationMs, + status: r.passRate >= PASS_THRESHOLD ? "pass" : "fail", + })), + crossModelFailures: multiModelFailures.map(([test, models]) => ({ + test, + models, + count: models.length, + })), + modelSpecificFailures: singleModelFailures.map(([test, models]) => ({ + test, + model: models[0], + })), + }; + + return { markdown: md, json }; + }, +}; diff --git a/extensions/reports/ci_eval_result.ts b/extensions/reports/ci_eval_result.ts new file mode 100644 index 00000000..6772a583 --- /dev/null +++ b/extensions/reports/ci_eval_result.ts @@ -0,0 +1,185 @@ +/** + * Per-model eval result report. + * + * Method-scope report that runs after each @swamp/ci/promptfoo-eval step + * and renders that specific model's results in GHA per-job summary format. 
+ */ + +interface EvalFailure { + description: string; + output: string; +} + +interface EvalResult { + model: string; + total: number; + passed: number; + failed: number; + errors: number; + passRate: number; + tokens: { total: number; prompt: number; completion: number }; + cost: number; + durationMs: number; + failures: EvalFailure[]; + skipped?: boolean; +} + +interface DataHandle { + name: string; + specName: string; + kind: "resource" | "file"; + dataId: string; + version: number; +} + +interface ModelTypeLike { + normalized: string; + raw: string; + toDirectoryPath: () => string; + toString: () => string; + equals: (other: ModelTypeLike) => boolean; +} + +interface MethodReportContext { + scope: "method"; + modelType: ModelTypeLike; + modelId: string; + definition: { + id: string; + name: string; + version: number; + tags: Record; + }; + methodName: string; + executionStatus: "succeeded" | "failed"; + dataHandles: DataHandle[]; + dataRepository: { + getContent: ( + modelType: ModelTypeLike, + modelId: string, + name: string, + version?: number, + ) => Promise; + }; + logger: { + info: (msg: string) => void; + warn: (msg: string) => void; + }; +} + +type ReportContext = MethodReportContext | { scope: string }; + +function isMethodContext(ctx: ReportContext): ctx is MethodReportContext { + return ctx.scope === "method"; +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000); + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return minutes > 0 ? `${minutes}m ${remainingSeconds}s` : `${seconds}s`; +} + +export const report = { + name: "@swamp/ci/eval-result", + description: + "Per-model skill trigger eval results — runs after each @swamp/ci/promptfoo-eval step", + scope: "method" as const, + labels: ["ci", "eval"], + + execute: async ( + context: ReportContext, + ): Promise<{ markdown: string; json: Record }> => { + if (!isMethodContext(context)) { + return { + markdown: "⚠️ This report only runs at method scope.", + json: { error: "wrong scope" }, + }; + } + + // Only applies to @swamp/ci/promptfoo-eval model runs + if (context.modelType.normalized !== "@swamp/ci/promptfoo-eval") { + return { + markdown: "", + json: { skipped: true, reason: "wrong model type" }, + }; + } + + // Find the result resource handle + const resultHandle = context.dataHandles.find( + (h) => h.specName === "result" && h.kind === "resource", + ); + + if (!resultHandle) { + return { + markdown: "_No eval result produced._", + json: { error: "no result handle" }, + }; + } + + // Load the structured result data + const content = await context.dataRepository.getContent( + context.modelType, + context.modelId, + resultHandle.name, + resultHandle.version, + ); + + if (!content) { + return { + markdown: "_Could not load eval result data._", + json: { error: "no content" }, + }; + } + + const r: EvalResult = JSON.parse(new TextDecoder().decode(content)); + + // Skipped models get a minimal summary + if (r.skipped) { + return { + markdown: `## Skill Trigger Eval Results (${r.model})\n\n_Skipped — not selected for this run._\n`, + json: { model: r.model, skipped: true }, + }; + } + + // Build the full per-model summary (GHA per-job style) + let md = `## Skill Trigger Eval Results (${r.model})\n\n`; + md += "| Metric | Value |\n|---|---|\n"; + md += `| Model | ${r.model} |\n`; + md += `| Total tests | ${r.total} |\n`; + md += `| Passed | ${r.passed} |\n`; + md += `| Failed | ${r.failed} |\n`; + md += `| Pass rate | ${(r.passRate * 100).toFixed(1)}% 
|\n`; + md += `| Estimated cost | $${r.cost.toFixed(2)} |\n`; + md += `| Tokens | ${r.tokens.total.toLocaleString()} |\n`; + md += `| Duration | ${formatDuration(r.durationMs)} |\n`; + + if (r.failures.length > 0) { + md += "\n### Failed Tests\n\n"; + md += "| Test | Output |\n|---|---|\n"; + for (const f of r.failures) { + const escapedDesc = f.description.replace(/\|/g, "\\|"); + const escapedOutput = f.output + .replace(/\|/g, "\\|") + .replace(/\n/g, " "); + md += `| ${escapedDesc} | ${escapedOutput} |\n`; + } + } + + return { + markdown: md, + json: { + model: r.model, + total: r.total, + passed: r.passed, + failed: r.failed, + passRate: r.passRate, + cost: r.cost, + tokens: r.tokens.total, + durationMs: r.durationMs, + status: r.passRate >= 0.9 ? "pass" : "fail", + failures: r.failures, + }, + }; + }, +};
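
Reviewer note (not part of the patch): a minimal TypeScript sketch of how the eval runner's cost estimate is derived, mirroring the formula in `processResults` and the per-million-token `TOKEN_PRICING` rates; the token counts below are hypothetical illustration values.

```ts
// Illustration only — applies the "sonnet" row of TOKEN_PRICING (USD per 1M tokens)
// to made-up prompt/completion usage totals, as processResults does.
const pricing = { prompt: 3.0, completion: 15.0 };
const tokenUsage = { prompt: 250_000, completion: 40_000 };

const cost = (tokenUsage.prompt / 1_000_000) * pricing.prompt +
  (tokenUsage.completion / 1_000_000) * pricing.completion;

console.log(`estimated cost: $${cost.toFixed(2)}`); // estimated cost: $1.35
```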