diff --git a/.github/workflows/multi-model-eval.yml b/.github/workflows/multi-model-eval.yml
index a528fe5d..28c09b84 100644
--- a/.github/workflows/multi-model-eval.yml
+++ b/.github/workflows/multi-model-eval.yml
@@ -63,3 +63,38 @@ jobs:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
run: deno run eval-skill-triggers --model ${{ matrix.model }} --concurrency ${{ matrix.concurrency }}
+
+ - name: Upload eval results
+ if: steps.check.outputs.run == 'true' && always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: eval-results-${{ matrix.model }}
+ path: evals/promptfoo/results.json
+ retention-days: 30
+
+ analysis:
+ name: Cross-Model Analysis
+ runs-on: ubuntu-latest
+ needs: eval
+ if: always()
+ permissions:
+ contents: read
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v6
+
+ - name: Setup Deno
+ uses: denoland/setup-deno@v2
+ with:
+ deno-version: v2.x
+
+ - name: Download all eval results
+ uses: actions/download-artifact@v4
+ with:
+ pattern: eval-results-*
+ path: eval-artifacts
+
+ - name: Run cross-model analysis
+ run: deno run --allow-read --allow-env=GITHUB_STEP_SUMMARY --allow-write scripts/analyze_eval_results.ts eval-artifacts
+      # GITHUB_STEP_SUMMARY is injected into every step's environment
+      # automatically by the Actions runner; there is no `github.step_summary`
+      # context property, so overriding it here would set the variable to an
+      # empty string and silently skip the markdown summary.
diff --git a/scripts/analyze_eval_results.ts b/scripts/analyze_eval_results.ts
new file mode 100644
index 00000000..808fe560
--- /dev/null
+++ b/scripts/analyze_eval_results.ts
@@ -0,0 +1,301 @@
+// Swamp, an Automation Framework
+// Copyright (C) 2026 System Initiative, Inc.
+//
+// This file is part of Swamp.
+//
+// Swamp is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License version 3
+// as published by the Free Software Foundation, with the Swamp
+// Extension and Definition Exception (found in the "COPYING-EXCEPTION"
+// file).
+//
+// Swamp is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with Swamp. If not, see <https://www.gnu.org/licenses/>.
+
+/**
+ * Analyzes eval results from all models and produces a cross-model comparison.
+ * Reads results.json files uploaded as artifacts by each eval job.
+ *
+ * Usage: deno run --allow-read --allow-env=GITHUB_STEP_SUMMARY --allow-write scripts/analyze_eval_results.ts <artifacts-dir>
+ */
+
+import { join } from "@std/path";
+
+interface EvalResult {
+ success: boolean;
+ testCase?: {
+ description?: string;
+ vars?: Record;
+ };
+ response?: {
+ output?: unknown;
+ };
+}
+
+interface EvalStats {
+ successes: number;
+ failures: number;
+ errors: number;
+ tokenUsage: {
+ total: number;
+ prompt: number;
+ completion: number;
+ };
+ durationMs: number;
+}
+
+interface PromptfooOutput {
+ results: {
+ stats: EvalStats;
+ results: EvalResult[];
+ };
+}
+
+interface ModelSummary {
+ model: string;
+ total: number;
+ passed: number;
+ failed: number;
+ errors: number;
+ passRate: number;
+ tokens: number;
+ durationMs: number;
+ failures: string[];
+}
+
+function extractToolCallName(output: unknown): string | undefined {
+ if (!output) return undefined;
+
+ if (Array.isArray(output)) {
+ for (const item of output) {
+ // OpenAI format
+ if (item?.function?.name) return item.function.name;
+ // Google format
+ if (item?.functionCall?.name) return item.functionCall.name;
+ // Generic format
+ if (item?.name) return item.name;
+ }
+ }
+
+ if (typeof output === "object" && output !== null) {
+ const obj = output as Record;
+ if (obj.function && typeof obj.function === "object") {
+ return (obj.function as Record).name as string;
+ }
+ if (obj.functionCall && typeof obj.functionCall === "object") {
+ return (obj.functionCall as Record).name as string;
+ }
+ }
+
+ return undefined;
+}
+
+async function loadResults(artifactsDir: string): Promise {
+ const summaries: ModelSummary[] = [];
+
+ for await (const entry of Deno.readDir(artifactsDir)) {
+ if (!entry.isDirectory || !entry.name.startsWith("eval-results-")) {
+ continue;
+ }
+
+ const model = entry.name.replace("eval-results-", "");
+ const resultsPath = join(artifactsDir, entry.name, "results.json");
+
+ let data: PromptfooOutput;
+ try {
+ data = JSON.parse(await Deno.readTextFile(resultsPath));
+ } catch {
+ console.warn(`Skipping ${model}: no results.json found`);
+ continue;
+ }
+
+ const { stats, results } = data.results;
+ const total = stats.successes + stats.failures;
+ const failures = results
+ .filter((r) => !r.success)
+ .map((r) => {
+ const desc = r.testCase?.description ?? "unknown";
+ const calledTool = extractToolCallName(r.response?.output);
+ const outputStr = typeof r.response?.output === "string"
+ ? r.response.output.slice(0, 80)
+ : calledTool
+ ? `routed to ${calledTool}`
+ : "text response (no tool call)";
+ return `${desc} → ${outputStr}`;
+ });
+
+ summaries.push({
+ model,
+ total,
+ passed: stats.successes,
+ failed: stats.failures,
+ errors: stats.errors,
+ passRate: total > 0 ? stats.successes / total : 0,
+ tokens: stats.tokenUsage.total,
+ durationMs: stats.durationMs,
+ failures,
+ });
+ }
+
+ // Sort by pass rate descending
+ summaries.sort((a, b) => b.passRate - a.passRate);
+ return summaries;
+}
+
+function findCrossModelFailures(
+ summaries: ModelSummary[],
+): Map {
+ // Map test description -> list of models that failed it
+ const failureMap = new Map();
+
+ for (const summary of summaries) {
+ for (const failure of summary.failures) {
+ // Extract just the test description (before →)
+ const desc = failure.split(" → ")[0].trim();
+ if (!failureMap.has(desc)) {
+ failureMap.set(desc, []);
+ }
+ failureMap.get(desc)!.push(summary.model);
+ }
+ }
+
+ return failureMap;
+}
+
+function formatDuration(ms: number): string {
+ const seconds = Math.floor(ms / 1000);
+ const minutes = Math.floor(seconds / 60);
+ const remainingSeconds = seconds % 60;
+ return minutes > 0 ? `${minutes}m ${remainingSeconds}s` : `${seconds}s`;
+}
+
+async function main(): Promise {
+ const artifactsDir = Deno.args[0];
+ if (!artifactsDir) {
+ console.error("Usage: analyze_eval_results.ts ");
+ Deno.exit(1);
+ }
+
+ const summaries = await loadResults(artifactsDir);
+
+ if (summaries.length === 0) {
+ console.log("No eval results found.");
+ return;
+ }
+
+ // Console output
+ console.log("\n=== Cross-Model Skill Trigger Eval Analysis ===\n");
+
+ console.log("Model Results:");
+ for (const s of summaries) {
+ const status = s.passRate >= 0.9 ? "PASS" : "FAIL";
+ console.log(
+ ` ${status} ${s.model}: ${s.passed}/${s.total} (${(s.passRate * 100).toFixed(1)}%) — ${s.tokens.toLocaleString()} tokens, ${formatDuration(s.durationMs)}`,
+ );
+ }
+
+ // Cross-model failures
+ const crossModelFailures = findCrossModelFailures(summaries);
+ const multiModelFailures = [...crossModelFailures.entries()]
+ .filter(([_, models]) => models.length > 1)
+ .sort((a, b) => b[1].length - a[1].length);
+
+ if (multiModelFailures.length > 0) {
+ console.log("\nCross-Model Failures (same test fails on multiple models):");
+ for (const [desc, models] of multiModelFailures) {
+ console.log(` [${models.length}/${summaries.length}] ${desc}`);
+ console.log(` Models: ${models.join(", ")}`);
+ }
+ }
+
+ // Per-model unique failures
+ const singleModelFailures = [...crossModelFailures.entries()]
+ .filter(([_, models]) => models.length === 1);
+
+ if (singleModelFailures.length > 0) {
+ console.log("\nModel-Specific Failures (only one model fails):");
+ for (const [desc, models] of singleModelFailures) {
+ console.log(` [${models[0]}] ${desc}`);
+ }
+ }
+
+ // Detailed failure list per model
+ for (const s of summaries) {
+ if (s.failures.length > 0) {
+ console.log(`\n${s.model} failures (${s.failures.length}):`);
+ for (const f of s.failures) {
+ console.log(` ${f}`);
+ }
+ }
+ }
+
+ // Overall verdict
+ const allPassed = summaries.every((s) => s.passRate >= 0.9);
+ console.log(
+ `\nVerdict: ${allPassed ? "ALL MODELS PASS" : "ACTION REQUIRED — some models below 90% threshold"}`,
+ );
+
+ // GitHub Actions summary
+ const summaryFile = Deno.env.get("GITHUB_STEP_SUMMARY");
+ if (summaryFile) {
+ let md = "## Cross-Model Skill Trigger Eval Analysis\n\n";
+
+ // Summary table
+ md += "### Results\n\n";
+ md += "| Model | Pass Rate | Passed | Failed | Tokens | Duration | Status |\n";
+ md += "|-------|-----------|--------|--------|--------|----------|--------|\n";
+ for (const s of summaries) {
+ const status = s.passRate >= 0.9 ? "✅ Pass" : "❌ Fail";
+ md +=
+ `| ${s.model} | ${(s.passRate * 100).toFixed(1)}% | ${s.passed} | ${s.failed} | ${s.tokens.toLocaleString()} | ${formatDuration(s.durationMs)} | ${status} |\n`;
+ }
+
+ // Cross-model failures
+ if (multiModelFailures.length > 0) {
+ md += "\n### Cross-Model Failures\n\n";
+ md +=
+ "These tests fail on multiple models, suggesting skill description issues:\n\n";
+ md += "| Test | Models Failing | Count |\n";
+ md += "|------|---------------|-------|\n";
+ for (const [desc, models] of multiModelFailures) {
+ const escapedDesc = desc.replace(/\|/g, "\\|");
+ md +=
+ `| ${escapedDesc} | ${models.join(", ")} | ${models.length}/${summaries.length} |\n`;
+ }
+ }
+
+ // Model-specific failures
+ if (singleModelFailures.length > 0) {
+ md += "\n### Model-Specific Failures\n\n";
+ md +=
+ "These tests fail on only one model, suggesting model-specific quirks:\n\n";
+ md += "| Test | Model |\n";
+ md += "|------|-------|\n";
+ for (const [desc, models] of singleModelFailures) {
+ const escapedDesc = desc.replace(/\|/g, "\\|");
+ md += `| ${escapedDesc} | ${models[0]} |\n`;
+ }
+ }
+
+ // Verdict
+ md += "\n### Verdict\n\n";
+ if (allPassed) {
+ md += "✅ **All models pass** the 90% threshold.\n";
+ } else {
+ const failing = summaries.filter((s) => s.passRate < 0.9);
+ md += "❌ **Action required** — the following models are below the 90% threshold:\n\n";
+ for (const s of failing) {
+ md += `- **${s.model}**: ${(s.passRate * 100).toFixed(1)}%\n`;
+ }
+ }
+
+ await Deno.writeTextFile(summaryFile, md, { append: true });
+ }
+}
+
+await main();