entropyvortex · marceloceccon · May 26, 2026 · May 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,8 @@ yarn-error.log*
 .idea
 progress.md
 package-lock.json
+
+# Bench run artifacts — local outputs from `ai-consensus-mcp bench --output`
+# and the progress logs that pair with them. Reproducible from the config.
+bench-*.json
+bench-*.log
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,61 @@ Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), [SemVer](https
 
 ## [Unreleased]
 
-_None yet — see [0.12.0] below for the most recent release._
+### Added — held-out rubric evaluator for `bench`
+
+`bench` learned to score answer **quality** with a third, held-out model
+rather than relying on either side's self-reported confidence.
+
+- New CLI flags: `--evaluator-model <id>` and `--evaluator-provider <id>`.
+  When both are set AND the panel declares a `rubric`, the bench scores
+  both the consensus synthesis and the baseline output against the rubric
+  using the evaluator model, blind to which side produced which answer.
+- New preset field: `Preset.rubric?: readonly RubricCriterion[]`. The
+  rubric is panel-declared (because criteria are domain-specific);
+  `architecture_v2` ships a 5-criterion rubric (quantification,
+  single-recommendation, reversibility-weighing, tripwire-specificity,
+  failure-mode-realism). Adding a rubric to another panel is purely
+  additive — no engine change, no breaking change.
+- New bench module: `src/benchmark/rubric.ts`. Builds a structured
+  JSON-emitting prompt for the evaluator, parses with a tolerant
+  bracket-scanning JSON extractor (no regex backtracking), validates
+  with zod, clamps scores into range, and returns a `RubricEvaluation`
+  with `errorMessage` set on any failure path. Never throws — a rubric
+  eval failure is data quality, not a suite-fatal error.
+- New report metrics: `consensusRubricNormalizedMean`,
+  `baselineRubricNormalizedMean`, `consensusBeatsBaselineRubricRate`,
+  `rubricRunsCounted`. Surfaced in both the markdown report (new
+  "Held-out rubric" section + 3 new per-case table columns) and the
+  JSON report.
+- CLI sanity-checks the held-out contract: warns when the evaluator
+  model is the same as the baseline (self-grading) or the judge (same
+  brain producing and grading the consensus output).
+- 16 new tests cover the evaluator end-to-end: happy path, fenced /
+  prose-wrapped JSON, score clamping, missing-criterion handling, all
+  three failure modes (caller throws, unparseable content, schema
+  mismatch). Existing 320 tests pass unchanged. Total: 336/336.
+
+### Changed — upstream parser-contract fix
+
+- Bumped `ai-consensus-core` from `^0.10.0` to `^0.11.1`. The 0.11.1
+  release fixes a silent contract bug where any caller that overrode
+  the default `JUDGE_PERSONA.systemPrompt` (every panel in this repo)
+  caused `extractJudgeConfidence` to fall through to its 50 default —
+  the bench reported judge confidence as μ=50.0, σ=0.0 across every
+  run. With 0.11.1, `buildJudgeSystemPrompt` idempotently appends the
+  `JUDGE_CONFIDENCE: [0-100]` directive, so the parser sees a real
+  value. Judge confidence on a representative 12-run bench now reports
+  μ=66.9, σ=5.2 — the first real distribution this repo has ever produced.
+- No code change in this repo was needed for the symptom to disappear;
+  the dep bump alone removes the artifact. No API change.
+
+### Documentation
+
+- New README section: **"Quality benchmark (held-out evaluator)"** —
+  headline finding (consensus wins 12/12 on `architecture_v2` against
+  a frontier baseline), per-case Δ-rubric table, methodology, exact
+  reproduction command, and honest caveats (cost, sample size,
+  single-panel scope).
 
 ## [0.12.0] — 2026-05-25
 

diff --git a/README.md b/README.md
@@ -66,11 +66,18 @@ Scope the run with `--hosts claude-code,cursor`. Run `npx ai-consensus-mcp insta
   `panel` argument), plus 5 v1 presets and 8 v2 expert panels. Invoke a
   panel; get a curated set of personas and tuned defaults without touching
   the knobs. Full catalogue in [`docs/expert-panels.md`](./docs/expert-panels.md).
-- **Benchmarking baked in.** `npx ai-consensus-mcp bench --panel <id>`
-  runs a panel against built-in or user-provided cases and produces a
-  human-readable + JSON uplift report — agreement rate, convergence
-  speed, judge confidence, duration/token cost ratios. Deterministic
-  with `--seed`.
+- **Benchmarking baked in, with held-out quality eval.**
+  `npx ai-consensus-mcp bench --panel <id>` runs a panel against built-in
+  or user-provided cases and produces a human-readable + JSON uplift
+  report — agreement rate, convergence speed, judge confidence,
+  duration/token cost ratios. Deterministic with `--seed`. Pass
+  `--evaluator-model` + `--evaluator-provider` and the bench scores both
+  the consensus synthesis and the baseline against the panel's declared
+  rubric using a third, held-out model — measuring answer quality
+  against named criteria, not self-reported confidence. See
+  [Quality benchmark](#quality-benchmark-held-out-evaluator) below for
+  the methodology and the headline result (consensus wins 12/12 runs
+  on `architecture_v2` against a frontier baseline).
 - **Persistent project memory (opt-in).** Enable with one config flag;
   every panel run is durably stored, project-scoped, with three recall
   tools — `consensus_recall`, `consensus_project_memory`,
@@ -83,6 +90,96 @@ Scope the run with `--hosts claude-code,cursor`. Run `npx ai-consensus-mcp insta
 - **Live progress.** Every structured engine event is forwarded as an MCP [progress notification](https://modelcontextprotocol.io/specification/2025-03-26/basic/utilities/progress) — hosts render real-time round/participant/disagreement/score status.
 - **Dependency-light.** `@modelcontextprotocol/sdk`, `zod`, `ai-consensus-core`. SSE parsing is native `fetch` — no provider SDKs.
 
+## Quality benchmark (held-out evaluator)
+
+`bench` ships with a held-out LLM-as-judge rubric evaluator. Pass
+`--evaluator-model` + `--evaluator-provider` and the bench scores both
+the consensus synthesis and the single-model baseline against the
+panel's declared rubric, using a third model that appears on neither side. The
+rubric measures **answer quality** against named criteria — distinct
+from self-reported confidence, which is a meta-signal that does not
+track quality.
+
+### Headline finding (`architecture_v2`, 4 cases × 3 runs, seed=42)
+
+| Metric                                                  | Consensus | Baseline |         Δ |
+| ------------------------------------------------------- | --------: | -------: | --------: |
+| Self-reported (consensus score vs. baseline confidence) |      60.0 |     75.4 |     −15.4 |
+| Held-out rubric (scored by `claude-opus-4-5`, blind)    |  **83.3** | **48.0** | **+35.3** |
+
+**Consensus wins on the held-out rubric in 12 of 12 runs (100%).** On
+the same 12 runs, the self-reported confidence metric says consensus
+wins 1 of 12 (8%) — the two metrics invert. Without the rubric, the
+bench reports "consensus loses 11/12, costs 40× tokens for nothing."
+With it: "consensus dominates 12/12, +35-point quality lead,
+structural advantage on every case."
+
+### Per-case Δ rubric
+
+| Case                          | Runs (Δ rubric) |    Mean |
+| ----------------------------- | --------------- | ------: |
+| `arch-microservices-day-one`  | +36, +28, +32   | **+32** |
+| `arch-event-sourcing-billing` | +44, +52, +56   | **+51** |
+| `arch-sync-vs-async-fanout`   | +24, +40, +40   | **+35** |
+| `arch-db-multi-tenant`        | +12, +32, +28   | **+24** |
+
+Baseline scored 28/100 on every `arch-event-sourcing-billing` run — a
+reproducible single-model blind spot (hand-wavy tripwires, missing
+reversibility weighing) that the panel surfaces every time.
+
+### Methodology
+
+- **Judge model:** `grok-4.3` (xai). Synthesises the consensus output
+  from the panel's final-round responses.
+- **Baseline model:** `grok-4.3` (xai). Same brain, single-shot answer,
+  no panel, no judge — this is what the panel is compared against.
+- **Evaluator model:** `claude-opus-4-5` (anthropic). **Held out** —
+  does not appear on either side of the comparison. Scores each answer
+  independently against the rubric, blind to which side produced it.
+- **Rubric:** 5 criteria for `architecture_v2`, each scored 0–5:
+  quantification, single-recommendation, reversibility-weighing,
+  tripwire-specificity, failure-mode-realism. Declared on the preset
+  (see [`src/presets/definitions/architecture-v2.ts`](./src/presets/definitions/architecture-v2.ts)).
+- **Determinism:** `--seed 42` controls round-order shuffling. Model
+  outputs at temperature > 0 are inherently stochastic — averaging over
+  3 runs per case reduces the noise.
+
+### Reproducing
+
+```bash
+export GROK_API_KEY=...
+export CONSENSUS_ANTHROPIC_API_KEY=...
+ai-consensus-mcp bench -p architecture_v2 --runs 3 --seed 42 \
+  --evaluator-model claude-opus-4-5 --evaluator-provider anthropic \
+  --output bench-architecture_v2-rubric.json
+```
+
+Cost preview: ~72 provider calls (4 cases × 3 runs × (panel + baseline +
+2 rubric evals)). The CLI prints the exact estimate before
+spending.
+
+### Honest caveats
+
+- **N=12 is small.** The direction is unambiguous (100% inversion is
+  hard to fluke); the magnitude needs broader sampling.
+- **One panel.** Only `architecture_v2` declares a rubric in this
+  version — the same pattern applies to every other panel by adding a
+  `rubric` array to the preset definition.
+- **Cost is real.** 40× tokens, 20× wall time vs. one baseline call.
+  For high-stakes architecture decisions (the panel's named use case),
+  the cost is dwarfed by the cost of a wrong call. For low-stakes
+  routine choices, single-model is the right tool — panel-selection
+  guidance, not a panel failure.
+- **Self-reported confidence remains a poor quality estimator.** Even
+  with the upstream parser-contract fix (`ai-consensus-core@0.11.1`),
+  judge confidence on these 12 runs is μ=66.9, σ=5.2, which under-estimates
+  the actual held-out rubric score (μ=83.3) by ~16 points. Useful as a
+  humility signal, not as a quality estimator.
+
+The CLI warns when the evaluator model coincides with the baseline or
+the judge — the held-out contract is the bench's only guarantee that
+the comparison isn't self-graded.
+
 ## The protocol
 
 For the actual protocol — rounds, phases, prompts, scoring — see the [ai-consensus-core protocol diagram](https://github.com/entropyvortex/ai-consensus-core#protocol-diagram). This README covers the server surface only.

diff --git a/package.json b/package.json
@@ -56,7 +56,7 @@
   "dependencies": {
     "@inquirer/prompts": "^8.4.2",
     "@modelcontextprotocol/sdk": "^1.13.0",
-    "ai-consensus-core": "^0.10.0",
+    "ai-consensus-core": "^0.11.1",
     "zod": "^3.24.1"
   },
   "devDependencies": {

diff --git a/src/benchmark/__tests__/format.test.ts b/src/benchmark/__tests__/format.test.ts
@@ -52,6 +52,7 @@ function makeMinimalReport(overrides: Partial<BenchReport> = {}): BenchReport {
       judgeConfidence: 80,
       durationMs: 100,
       totalUsage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
+      rubric: undefined,
     },
     baseline: {
       modelId: "judge-model",
@@ -60,6 +61,7 @@ function makeMinimalReport(overrides: Partial<BenchReport> = {}): BenchReport {
       durationMs: 50,
       usage: { inputTokens: 30, outputTokens: 20, totalTokens: 50 },
       errorMessage: undefined,
+      rubric: undefined,
     },
     failed: false,
   };
@@ -87,6 +89,10 @@ function makeMinimalReport(overrides: Partial<BenchReport> = {}): BenchReport {
       consensusBeatsBaselineConfidenceRate: 1,
       runsCounted: 1,
       runsAttempted: 1,
+      consensusRubricNormalizedMean: undefined,
+      baselineRubricNormalizedMean: undefined,
+      consensusBeatsBaselineRubricRate: undefined,
+      rubricRunsCounted: 0,
     },
     qualitativeNotes: ["• c1#0: converged at round 1; judge confidence 80"],
     ...overrides,
@@ -118,6 +124,97 @@ describe("formatReportMarkdown — section contract", () => {
     expect(md).toContain("v2.0.0");
   });
 
+  it("renders the held-out rubric block and rubric columns when rubrics are present", () => {
+    const report = makeMinimalReport();
+    const run = report.runs[0]!;
+    const withRubric: BenchRun = {
+      ...run,
+      consensus: {
+        ...run.consensus,
+        rubric: {
+          evaluatorModelId: "claude-opus-4-5",
+          criteria: [{ criterionId: "x", score: 4, justification: "j" }],
+          total: 4,
+          maxTotal: 5,
+          normalized: 80,
+          durationMs: 50,
+          usage: undefined,
+          errorMessage: undefined,
+        },
+      },
+      baseline: {
+        ...run.baseline,
+        rubric: {
+          evaluatorModelId: "claude-opus-4-5",
+          criteria: [{ criterionId: "x", score: 2, justification: "j" }],
+          total: 2,
+          maxTotal: 5,
+          normalized: 40,
+          durationMs: 50,
+          usage: undefined,
+          errorMessage: undefined,
+        },
+      },
+    };
+    const md = formatReportMarkdown({
+      ...report,
+      runs: [withRubric],
+      metrics: {
+        ...report.metrics,
+        consensusRubricNormalizedMean: 80,
+        baselineRubricNormalizedMean: 40,
+        consensusBeatsBaselineRubricRate: 1,
+        rubricRunsCounted: 1,
+      },
+    });
+    expect(md).toContain("Held-out rubric");
+    expect(md).toContain("Mean rubric score");
+    expect(md).toContain("Consensus beats baseline on rubric");
+    // Table gains the Rubric C / Rubric B / Δ rubric columns.
+    expect(md).toContain("Rubric C");
+    expect(md).toContain("Rubric B");
+    expect(md).toContain("Δ rubric");
+    expect(md).toMatch(/\| 80 \| 40 \| \+40 \|/);
+  });
+
+  it("renders ERR in rubric cells when an eval failed, without crashing the per-case table", () => {
+    const report = makeMinimalReport();
+    const run = report.runs[0]!;
+    const withErroredRubric: BenchRun = {
+      ...run,
+      consensus: {
+        ...run.consensus,
+        rubric: {
+          evaluatorModelId: "claude-opus-4-5",
+          criteria: [],
+          total: 0,
+          maxTotal: 5,
+          normalized: 0,
+          durationMs: 10,
+          usage: undefined,
+          errorMessage: "evaluator did not emit a parseable JSON object",
+        },
+      },
+      baseline: {
+        ...run.baseline,
+        rubric: {
+          evaluatorModelId: "claude-opus-4-5",
+          criteria: [],
+          total: 0,
+          maxTotal: 5,
+          normalized: 0,
+          durationMs: 10,
+          usage: undefined,
+          errorMessage: "caller threw",
+        },
+      },
+    };
+    const md = formatReportMarkdown({ ...report, runs: [withErroredRubric] });
+    expect(md).toContain("ERR");
+    // Δ rubric is "—" when either side errored.
+    expect(md).toMatch(/\| ERR \| ERR \| — \|/);
+  });
+
   it("renders per-case table with score, sigma, rounds, stop, judge conf, baseline conf, delta", () => {
     const md = formatReportMarkdown(makeMinimalReport());
     // The table contains "| 71 |" for score, "| 80 |" for judge conf, "| 60 |" for baseline, "| +11 |" for delta.

diff --git a/src/benchmark/__tests__/metrics.test.ts b/src/benchmark/__tests__/metrics.test.ts
@@ -70,6 +70,7 @@ function makeConsensusOutcome(
             totalTokens: options.totalTokens,
           }
         : undefined,
+    rubric: undefined,
   };
 }
 
@@ -93,6 +94,7 @@ function makeBaseline(opts: {
           }
         : undefined,
     errorMessage: opts.errorMessage,
+    rubric: undefined,
   };
 }