From 33bd518f37c579ecb6b9d5291258954c5285ea79 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 21 May 2026 10:32:08 +0000
Subject: [PATCH 1/3] Add anti-slop eval harness and hillclimb the skill (gates
 70-84)

Build a deterministic slop detector grounded in Impeccable's 37 patterns
plus Hallmark's own gates, score self-contained fixtures across genres, and
run a 10-cycle eval-driven hillclimb.

Phase 1 (v1, cycles 1-5): close gaps the detector found, adding gates 70-77
to references/slop-test.md; fixtures climb 74.2 -> 98.3.

Cycle 6: per "Your Evals Will Break", upgrade the eval to v2 -- six new
detector rules (incl. hero-float/gate 54 the v1-perfect fixtures had been
violating), a cross-fixture order parameter (macrostructure reuse), and two
adversarial fixtures. Score honestly drops to 76.4.

Phase 2 (v2, cycles 7-10): add gates 78-84 and climb back to 98.7, resisting
a dark/neon/metric-hero brief. The skill gained 15 gates motivated by what
the eval could measure. Full curve in evals/results/history.md.
---
 evals/README.md                   |  49 +++
 evals/briefs.md                   |  26 ++
 evals/config.json                 |  20 +
 evals/detector.mjs                | 660 ++++++++++++++++++++++++++++++
 evals/fixtures/fernweh.html       | 117 ++++++
 evals/fixtures/fernweh.judge.json |  10 +
 evals/fixtures/kiln.html          | 130 ++++++
 evals/fixtures/kiln.judge.json    |  10 +
 evals/fixtures/ledger.html        | 131 ++++++
 evals/fixtures/ledger.judge.json  |  10 +
 evals/fixtures/pulse.html         | 115 ++++++
 evals/fixtures/pulse.judge.json   |  10 +
 evals/fixtures/vellum.html        |  70 ++++
 evals/fixtures/vellum.judge.json  |  10 +
 evals/results/cycle-01-v1.json    |  67 +++
 evals/results/cycle-02-v1.json    |  67 +++
 evals/results/cycle-03-v1.json    |  67 +++
 evals/results/cycle-04-v1.json    |  67 +++
 evals/results/cycle-05-v1.json    |  67 +++
 evals/results/cycle-06-v2.json    | 103 +++++
 evals/results/cycle-07-v2.json    | 103 +++++
 evals/results/cycle-08-v2.json    | 103 +++++
 evals/results/cycle-09-v2.json    | 103 +++++
 evals/results/cycle-10-v2.json    | 103 +++++
 evals/results/history.md          |  31 ++
 evals/rubric.md                   |  55 +++
 evals/run.mjs                     | 119 ++++++
 references/slop-test.md           | 138 +++++++
 28 files changed, 2561 insertions(+)
 create mode 100644 evals/README.md
 create mode 100644 evals/briefs.md
 create mode 100644 evals/config.json
 create mode 100644 evals/detector.mjs
 create mode 100644 evals/fixtures/fernweh.html
 create mode 100644 evals/fixtures/fernweh.judge.json
 create mode 100644 evals/fixtures/kiln.html
 create mode 100644 evals/fixtures/kiln.judge.json
 create mode 100644 evals/fixtures/ledger.html
 create mode 100644 evals/fixtures/ledger.judge.json
 create mode 100644 evals/fixtures/pulse.html
 create mode 100644 evals/fixtures/pulse.judge.json
 create mode 100644 evals/fixtures/vellum.html
 create mode 100644 evals/fixtures/vellum.judge.json
 create mode 100644 evals/results/cycle-01-v1.json
 create mode 100644 evals/results/cycle-02-v1.json
 create mode 100644 evals/results/cycle-03-v1.json
 create mode 100644 evals/results/cycle-04-v1.json
 create mode 100644 evals/results/cycle-05-v1.json
 create mode 100644 evals/results/cycle-06-v2.json
 create mode 100644 evals/results/cycle-07-v2.json
 create mode 100644 evals/results/cycle-08-v2.json
 create mode 100644 evals/results/cycle-09-v2.json
 create mode 100644 evals/results/cycle-10-v2.json
 create mode 100644 evals/results/history.md
 create mode 100644 evals/rubric.md
 create mode 100644 evals/run.mjs

diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000..729f137
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,49 @@
+# `evals/` — anti-slop eval harness
+
+An eval-driven hillclimb that improved Hallmark against two external anchors:
+
+- **Impeccable's slop standard** — "37 patterns that mark an interface as
+  AI-generated" across 8 dimensions ([impeccable.style/slop](https://impeccable.style/slop)).
+- **"Your Evals Will Break and You Won't See It Coming"** — why static evals
+  silently miss new failure regimes, and the case for self-evolving evals
+  ([wanglun1996.github.io](https://wanglun1996.github.io/blog/your-evals-will-break.html)).
+
+## What's here
+
+| File | Role |
+|---|---|
+| `rubric.md` | The scoring rubric: 8 detector dimensions + 1 craft (judge) dimension. |
+| `briefs.md` | The briefs each fixture is the skill exercised on. |
+| `detector.mjs` | Deterministic slop detector — the CLI-checkable subset of the 37 patterns + Hallmark gates. v1 = 37 rules, v2 = 43. |
+| `run.mjs` | Merges detector + judge sidecars, computes the cross-fixture **order parameter**, snapshots a cycle, rebuilds `results/history.md`. |
+| `config.json` | Which fixtures belong to eval v1 vs v2. |
+| `fixtures/*.html` | Self-contained pages (what Hallmark emits). |
+| `fixtures/*.judge.json` | Per-fixture craft scores (philosophy, hierarchy, execution, specificity, restraint, variety, honesty). |
+| `results/` | One JSON snapshot per cycle + the running `history.md` table. |
+
+## Run it
+
+```bash
+cd evals
+node detector.mjs fixtures/pulse.html --eval v2   # inspect one page
+node run.mjs --cycle 10 --eval v2 --label "..."   # score a cycle, update history
+```
+
+## The hillclimb (10 cycles)
+
+**Phase 1 (v1, cycles 1–5)** drove the three originals from 74.2 → 98.3 by
+closing gaps the detector found — each cycle added a real gate to
+`references/slop-test.md` (gates **70–77**) and brought the fixtures into line.
+
+**The break (cycle 6)** upgraded the eval to **v2**: six new detector rules
+for failure modes v1 was blind to (notably hero-float / gate 54, which the
+v1-perfect fixtures had been violating the whole time), a cross-fixture
+**order parameter** (macrostructure reuse — variety is a property of the
+*set*, not the page), and two adversarial fixtures (`pulse`, `vellum`). Score
+fell 98.3 → 76.4, exactly as the blog predicts.
+
+**Phase 2 (v2, cycles 7–10)** climbed back to 98.7, adding gates **78–84**
+and resisting `pulse`'s dark/neon/metric-hero brief gravity.
+
+The skill is the artifact that improved: 15 new gates, motivated by what the
+eval could measure. See `results/history.md` for the full score table.
diff --git a/evals/briefs.md b/evals/briefs.md
new file mode 100644
index 0000000..991f184
--- /dev/null
+++ b/evals/briefs.md
@@ -0,0 +1,26 @@
+# Eval briefs
+
+Each fixture is the skill exercised on one brief. Briefs span genres so the
+detector isn't fooled by a single safe house style. Fixtures live in
+`fixtures/` as self-contained HTML (exactly what Hallmark emits).
+
+## v1 briefs
+
+- **ledger** — landing page for *Ledger*, an open-source double-entry
+  bookkeeping CLI for indie developers. Genre: modern-minimal.
+  Macrostructure target: stat-led / workbench (no rote hero→3-features→CTA).
+- **fernweh** — homepage for *Fernweh*, a small-group slow-travel company
+  running 8-day walking trips. Genre: atmospheric / editorial.
+  Macrostructure target: photographic or narrative-workflow.
+- **kiln** — studio page for *Kiln & Co.*, a two-person ceramics workshop
+  selling a seasonal run of stoneware. Genre: editorial / specimen-adjacent
+  but must NOT default to Specimen.
+
+## v2 briefs (added when v1 saturates)
+
+- **synthwave-trap** — adversarial: a brief for *Pulse*, a "developer
+  analytics dashboard," whose own copy nudges toward dark-mode + neon +
+  metric-hero slop. The skill must resist the brief's gravity.
+- **vellum** — a long-form essay page for *Vellum*, a writing tool. Probes
+  reading-comfort tells v1 underweights (measure rhythm, widows, heading
+  cadence, real prose hierarchy).
diff --git a/evals/config.json b/evals/config.json
new file mode 100644
index 0000000..e075925
--- /dev/null
+++ b/evals/config.json
@@ -0,0 +1,20 @@
+{
+  "evals": {
+    "v1": {
+      "fixtures": [
+        { "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
+        { "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
+        { "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" }
+      ]
+    },
+    "v2": {
+      "fixtures": [
+        { "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
+        { "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
+        { "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" },
+        { "name": "pulse", "file": "fixtures/pulse.html", "judge": "fixtures/pulse.judge.json" },
+        { "name": "vellum", "file": "fixtures/vellum.html", "judge": "fixtures/vellum.judge.json" }
+      ]
+    }
+  }
+}
diff --git a/evals/detector.mjs b/evals/detector.mjs
new file mode 100644
index 0000000..14a6ca7
--- /dev/null
+++ b/evals/detector.mjs
@@ -0,0 +1,660 @@
+// Hallmark slop detector — deterministic anti-slop checks for self-contained HTML.
+//
+// Grounds the eval in two external standards:
+//   1. Impeccable's "37 patterns that mark an interface as AI-generated"
+//      across 8 dimensions (impeccable.style/slop).
+//   2. Hallmark's own slop-test gates (references/slop-test.md).
+//
+// Only the deterministic (CLI-checkable) subset lives here. Taste dimensions
+// (philosophy, hierarchy, specificity, restraint, variety, honesty) are scored
+// by an LLM judge and merged by run.mjs.
+//
+// Usage: node detector.mjs <file.html> [--json]
+
+import fs from 'node:fs';
+
+const FONT_OVERUSED = [
+  'inter', 'roboto', 'open sans', 'poppins', 'lato', 'montserrat',
+  'plus jakarta sans', 'space grotesk', 'geist', 'nunito', 'raleway',
+];
+const GENERIC_FAMILIES = new Set([
+  'sans-serif', 'serif', 'monospace', 'system-ui', 'ui-monospace',
+  'ui-serif', 'ui-sans-serif', 'cursive', 'fantasy', 'emoji', 'math',
+  '-apple-system', 'blinkmacsystemfont', 'segoe ui', 'inherit', 'initial',
+]);
+
+// ---------------------------------------------------------------- doc loading
+function loadDoc(path) {
+  const html = fs.readFileSync(path, 'utf8');
+  const styleCss = [...html.matchAll(/<style[^>]*>([\s\S]*?)<\/style>/gi)]
+    .map((m) => m[1]).join('\n');
+  const inlineCss = [...html.matchAll(/\sstyle="([^"]*)"/gi)]
+    .map((m) => `__inline__{${m[1]}}`).join('\n');
+  const css = `${styleCss}\n${inlineCss}`;
+  const stamp = (css.match(/\/\*\s*Hallmark[\s\S]*?\*\//) || [''])[0];
+  const genre =
+    (stamp.match(/genre:\s*([a-z-]+)/i) || [])[1] ||
+    (html.match(/data-genre="([^"]+)"/) || [])[1] || '';
+  return { path, html, css, styleCss, stamp, genre };
+}
+
+// crude flat-rule splitter; @media wrappers drop out but inner rules survive.
+function cssRules(css) {
+  const out = [];
+  const re = /([^{}]+)\{([^{}]*)\}/g;
+  let m;
+  while ((m = re.exec(css))) {
+    out.push({ sel: m[1].trim().toLowerCase(), body: m[2].trim() });
+  }
+  return out;
+}
+
+function tokenMap(css) {
+  const map = {};
+  for (const r of cssRules(css)) {
+    if (!/:root|\[data-theme/.test(r.sel)) continue;
+    for (const m of r.body.matchAll(/(--[a-z0-9-]+)\s*:\s*([^;]+)/gi)) {
+      map[m[1].trim()] = m[2].trim();
+    }
+  }
+  return map;
+}
+
+function resolveVar(value, map, depth = 0) {
+  if (depth > 8 || !value) return value;
+  return value.replace(/var\(\s*(--[a-z0-9-]+)\s*(?:,([^)]*))?\)/gi, (_, name, fb) => {
+    const v = map[name.trim()];
+    if (v != null) return resolveVar(v, map, depth + 1);
+    return fb != null ? resolveVar(fb.trim(), map, depth + 1) : '';
+  });
+}
+
+// oklch lightness 0..1 (handles "oklch(.3 ...)" and "oklch(32% ...)")
+function oklchL(value) {
+  const m = String(value).match(/oklch\(\s*([0-9.]+%?)/i);
+  if (!m) return null;
+  const raw = m[1];
+  return raw.endsWith('%') ? parseFloat(raw) / 100 : parseFloat(raw);
+}
+function oklchC(value) {
+  const m = String(value).match(/oklch\(\s*[0-9.]+%?\s+([0-9.]+)/i);
+  return m ? parseFloat(m[1]) : null;
+}
+function oklchH(value) {
+  const m = String(value).match(/oklch\(\s*[0-9.]+%?\s+[0-9.]+\s+([0-9.]+)/i);
+  return m ? parseFloat(m[1]) : null;
+}
+
+const COLOR_LITERAL = /#[0-9a-fA-F]{3,8}\b|\brgba?\([^)]*\)|\bhsla?\([^)]*\)|\boklch\([^)]*\)|\blab\([^)]*\)/gi;
+
+function fontFamilies(css, map) {
+  const fams = new Set();
+  for (const m of css.matchAll(/font-family\s*:\s*([^;}]+)/gi)) {
+    const resolved = resolveVar(m[1], map);
+    const first = resolved.split(',')[0].trim().replace(/['"]/g, '').toLowerCase();
+    if (first && !GENERIC_FAMILIES.has(first) && !first.startsWith('var(')) fams.add(first);
+  }
+  for (const [k, v] of Object.entries(map)) {
+    if (!/--font/.test(k)) continue;
+    const first = String(v).split(',')[0].trim().replace(/['"]/g, '').toLowerCase();
+    if (first && !GENERIC_FAMILIES.has(first) && !first.startsWith('var(')) fams.add(first);
+  }
+  return [...fams];
+}
+
+function headingLevels(html) {
+  return [...html.matchAll(/<h([1-6])[\s>]/gi)].map((m) => +m[1]);
+}
+
+// Balanced extraction of @media (...max-width...) block bodies. Regex alone
+// trips over nested rule braces and indented closers, so count braces.
+function maxWidthMediaBodies(css) {
+  const bodies = [];
+  const re = /@media[^{]*max-width[^{]*\{/gi;
+  let m;
+  while ((m = re.exec(css))) {
+    let depth = 1;
+    let i = m.index + m[0].length;
+    const start = i;
+    for (; i < css.length && depth > 0; i++) {
+      if (css[i] === '{') depth++;
+      else if (css[i] === '}') depth--;
+    }
+    bodies.push(css.slice(start, i - 1));
+  }
+  return bodies;
+}
+
+// ---------------------------------------------------------------- rule set v1
+// Each rule: { id, dim, label, fn(ctx) -> {pass:boolean, note:string} }
+const RULES = [
+  // ---- TYPOGRAPHY -------------------------------------------------------
+  {
+    id: 'type-overused-font', dim: 'typography',
+    label: 'Display/body face is an overused AI default (Inter, Roboto, Geist…)',
+    fn: ({ fams }) => {
+      const hit = fams.filter((f) => FONT_OVERUSED.includes(f));
+      return { pass: hit.length === 0, note: hit.length ? `uses ${hit.join(', ')}` : 'distinctive faces' };
+    },
+  },
+  {
+    id: 'type-single-font', dim: 'typography',
+    label: 'Single font family across the whole page',
+    fn: ({ fams }) => ({ pass: fams.length !== 1, note: `${fams.length} distinct families` }),
+  },
+  {
+    id: 'type-too-many-fonts', dim: 'typography',
+    label: 'More than three distinct font families (gate 39)',
+    fn: ({ fams }) => ({ pass: fams.length <= 3, note: `${fams.length} families: ${fams.join(', ') || 'none'}` }),
+  },
+  {
+    id: 'type-allcaps-body', dim: 'typography',
+    label: 'All-caps applied to body/paragraph text',
+    fn: ({ rules }) => {
+      const bad = rules.find((r) => /(^|[\s,])(body|p|li|article)\b/.test(r.sel) && /text-transform\s*:\s*uppercase/.test(r.body));
+      return { pass: !bad, note: bad ? `on ${bad.sel}` : 'body is mixed-case' };
+    },
+  },
+  {
+    id: 'type-tight-leading', dim: 'typography',
+    label: 'Body line-height below 1.3',
+    fn: ({ rules, map }) => {
+      for (const r of rules) {
+        if (!/(^|[\s,])(body|p|li|article|html)\b/.test(r.sel)) continue;
+        const m = r.body.match(/line-height\s*:\s*([0-9.]+)\b/);
+        if (m && parseFloat(m[1]) < 1.3 && parseFloat(m[1]) > 0) return { pass: false, note: `line-height ${m[1]} on ${r.sel}` };
+      }
+      return { pass: true, note: 'comfortable leading' };
+    },
+  },
+  {
+    id: 'type-wide-tracking-body', dim: 'typography',
+    label: 'Letter-spacing above 0.05em on body text',
+    fn: ({ rules }) => {
+      for (const r of rules) {
+        if (!/(^|[\s,])(body|p|li)\b/.test(r.sel)) continue;
+        const m = r.body.match(/letter-spacing\s*:\s*([0-9.]+)em/);
+        if (m && parseFloat(m[1]) > 0.05) return { pass: false, note: `${m[1]}em on ${r.sel}` };
+      }
+      return { pass: true, note: 'tracking in range' };
+    },
+  },
+  {
+    id: 'type-tiny-body', dim: 'typography',
+    label: 'Body text below 12px',
+    fn: ({ rules }) => {
+      for (const r of rules) {
+        if (!/(^|[\s,])(body|p|li)\b/.test(r.sel)) continue;
+        const m = r.body.match(/font-size\s*:\s*([0-9.]+)px/);
+        if (m && parseFloat(m[1]) < 12) return { pass: false, note: `${m[1]}px on ${r.sel}` };
+      }
+      return { pass: true, note: 'legible body size' };
+    },
+  },
+
+  // ---- COLOR & CONTRAST -------------------------------------------------
+  {
+    id: 'color-gradient-text', dim: 'color',
+    label: 'Gradient clipped to text (background-clip: text)',
+    fn: ({ css }) => {
+      const bad = /background-clip\s*:\s*text|-webkit-background-clip\s*:\s*text/i.test(css) && /gradient/i.test(css);
+      return { pass: !bad, note: bad ? 'gradient text headline' : 'solid headline fill' };
+    },
+  },
+  {
+    id: 'color-ai-palette', dim: 'color',
+    label: 'AI purple/violet→cyan gradient',
+    fn: ({ css }) => {
+      const grads = [...css.matchAll(/(linear|radial|conic)-gradient\([^;}]*\)/gi)].map((m) => m[0]);
+      for (const g of grads) {
+        const kw = /purple|violet|indigo|fuchsia|magenta|#8b5cf6|#6366f1|#7c3aed|#a855f7/i.test(g);
+        const cyan = /cyan|teal|#06b6d4|#22d3ee/i.test(g);
+        const hues = [...g.matchAll(/oklch\([^)]*\)/gi)].map((x) => oklchH(x[0])).filter((h) => h != null);
+        const aiHue = hues.some((h) => h >= 270 && h <= 330);
+        if ((kw && cyan) || kw || aiHue) return { pass: false, note: `tell in ${g.slice(0, 40)}…` };
+      }
+      return { pass: true, note: 'no AI-palette gradient' };
+    },
+  },
+  {
+    id: 'color-pure-black-bg', dim: 'color',
+    label: 'Pure #000 / oklch(0) used as a base background',
+    fn: ({ rules, map }) => {
+      for (const r of rules) {
+        const m = r.body.match(/background(?:-color)?\s*:\s*([^;]+)/i);
+        if (!m) continue;
+        const v = resolveVar(m[1], map).toLowerCase();
+        if (/#000(\b|000\b)|\boklch\(\s*0\s+0\b|\brgb\(\s*0\s*,\s*0\s*,\s*0\s*\)|\bblack\b/.test(v)) return { pass: false, note: `pure black bg on ${r.sel}` };
+      }
+      return { pass: true, note: 'no pure-black base' };
+    },
+  },
+  {
+    id: 'color-zero-chroma', dim: 'color',
+    label: 'Zero-chroma flat-grey neutrals (gate 24)',
+    fn: ({ map, genre }) => {
+      if (genre === 'modern-minimal') return { pass: true, note: 'modern-minimal allows zero-chroma' };
+      for (const [k, v] of Object.entries(map)) {
+        if (!/--color|--paper|--ink|--surface|--muted|--neutral|--bg/.test(k)) continue;
+        const c = oklchC(resolveVar(v, map));
+        if (c === 0) return { pass: false, note: `${k} has 0 chroma` };
+      }
+      return { pass: true, note: 'neutrals tinted toward anchor' };
+    },
+  },
+  {
+    id: 'color-token-discipline', dim: 'color',
+    label: 'Colour literal outside the token block (gate 58)',
+    fn: ({ rules }) => {
+      const offenders = [];
+      for (const r of rules) {
+        if (/:root|\[data-theme/.test(r.sel)) continue;
+        const lits = (r.body.match(COLOR_LITERAL) || []).filter((c) => !/transparent|currentcolor|inherit|none/i.test(c));
+        if (lits.length) offenders.push(`${r.sel}: ${lits[0]}`);
+      }
+      return { pass: offenders.length === 0, note: offenders.length ? `${offenders.length} literal(s), e.g. ${offenders[0]}` : 'all colours via tokens' };
+    },
+  },
+  {
+    id: 'color-ink-on-ink', dim: 'color',
+    label: 'Text lightness too close to its background (ink-on-ink, gates 46–50)',
+    fn: ({ rules, map }) => {
+      for (const r of rules) {
+        if (/:root|\[data-theme/.test(r.sel)) continue;
+        const cM = r.body.match(/(?<!-)\bcolor\s*:\s*([^;]+)/i);
+        const bM = r.body.match(/background(?:-color)?\s*:\s*([^;]+)/i);
+        if (!cM || !bM) continue;
+        const lc = oklchL(resolveVar(cM[1], map));
+        const lb = oklchL(resolveVar(bM[1], map));
+        if (lc != null && lb != null && Math.abs(lc - lb) < 0.4) return { pass: false, note: `ΔL ${Math.abs(lc - lb).toFixed(2)} on ${r.sel}` };
+      }
+      return { pass: true, note: 'text/bg lightness separated' };
+    },
+  },
+
+  // ---- VISUAL DETAILS ---------------------------------------------------
+  {
+    id: 'visual-side-tab', dim: 'visual',
+    label: 'Thick coloured side-stripe border on a card (the strongest tell)',
+    fn: ({ rules, map }) => {
+      for (const r of rules) {
+        // a left rule on a blockquote/figure is a typographic convention, not the card tell
+        if (/\b(blockquote|figure|aside|q|cite)\b/.test(r.sel)) continue;
+        const m = r.body.match(/border-(left|right)\s*:\s*([0-9.]+)px\s+\w+\s+([^;]+)/i);
+        if (!m) continue;
+        const w = parseFloat(m[2]);
+        const col = resolveVar(m[3], map).toLowerCase();
+        if (w >= 4 && !/transparent/.test(col)) return { pass: false, note: `${m[2]}px ${m[1]} stripe on ${r.sel}` };
+      }
+      return { pass: true, note: 'no side-tab stripe' };
+    },
+  },
+  {
+    id: 'visual-glassmorphism', dim: 'visual',
+    label: 'Glassmorphism (backdrop blur on translucent panels)',
+    fn: ({ css }) => {
+      const bad = /backdrop-filter\s*:\s*[^;]*blur/i.test(css) && /rgba?\([^)]*0?\.\d+\s*\)|\/\s*0?\.\d+\s*\)/.test(css);
+      return { pass: !bad, note: bad ? 'translucent blur panel' : 'no glass panels' };
+    },
+  },
+  {
+    id: 'visual-sparkline-decoration', dim: 'visual',
+    label: 'Sparkline / chart used as pure decoration',
+    fn: ({ html }) => {
+      const bad = /class="[^"]*\b(sparkline|spark-line|decor[a-z-]*chart|fake-chart)\b/i.test(html);
+      return { pass: !bad, note: bad ? 'decorative sparkline present' : 'no decorative charts' };
+    },
+  },
+
+  // ---- LAYOUT & SPACE ---------------------------------------------------
+  {
+    id: 'layout-center-everything', dim: 'layout',
+    label: 'Everything centre-aligned (≥4 text-align:center)',
+    fn: ({ css }) => {
+      const n = (css.match(/text-align\s*:\s*center/gi) || []).length;
+      return { pass: n < 4, note: `${n} centred blocks` };
+    },
+  },
+  {
+    id: 'layout-justified', dim: 'layout',
+    label: 'Justified body text (word-spacing rivers)',
+    fn: ({ css }) => {
+      const bad = /text-align\s*:\s*justify/i.test(css);
+      return { pass: !bad, note: bad ? 'justified text present' : 'ragged-right text' };
+    },
+  },
+  {
+    id: 'layout-three-col-cards', dim: 'layout',
+    label: 'Three equal-column card grid (icon-tile template)',
+    fn: ({ css }) => {
+      const bad = /grid-template-columns\s*:\s*repeat\(\s*3\s*,\s*(?:minmax\(0,\s*)?1fr/i.test(css) || /grid-template-columns\s*:\s*1fr\s+1fr\s+1fr\b/i.test(css);
+      return { pass: !bad, note: bad ? 'repeat(3, 1fr) grid' : 'no rote 3-col grid' };
+    },
+  },
+  {
+    id: 'layout-long-measure', dim: 'layout',
+    label: 'Prose measure beyond 75ch (gate 27)',
+    fn: ({ css }) => {
+      for (const m of css.matchAll(/max-width\s*:\s*([0-9.]+)ch/gi)) {
+        if (parseFloat(m[1]) > 75) return { pass: false, note: `${m[1]}ch measure` };
+      }
+      return { pass: true, note: 'measure ≤ 75ch' };
+    },
+  },
+  {
+    id: 'layout-arbitrary-spacing', dim: 'layout',
+    label: 'Spacing off the 4px scale (gate 26)',
+    fn: ({ rules, map }) => {
+      for (const r of rules) {
+        if (/:root|\[data-theme/.test(r.sel)) continue;
+        for (const m of r.body.matchAll(/\b(?:padding|margin|gap|row-gap|column-gap)(?:-\w+)?\s*:\s*([^;]+)/gi)) {
+          const resolved = resolveVar(m[1], map);
+          for (const px of resolved.matchAll(/(-?[0-9.]+)px/g)) {
+            const v = Math.abs(parseFloat(px[1]));
+            if (v > 0 && v % 4 !== 0) return { pass: false, note: `${px[1]}px on ${r.sel}` };
+          }
+        }
+      }
+      return { pass: true, note: 'spacing on 4px scale' };
+    },
+  },
+  {
+    id: 'layout-skipped-heading', dim: 'layout',
+    label: 'Skipped heading level (h1→h3 with no h2)',
+    fn: ({ html }) => {
+      const lv = headingLevels(html);
+      for (let i = 1; i < lv.length; i++) {
+        if (lv[i] - lv[i - 1] > 1) return { pass: false, note: `h${lv[i - 1]}→h${lv[i]}` };
+      }
+      return { pass: true, note: 'heading levels contiguous' };
+    },
+  },
+
+  // ---- MOTION -----------------------------------------------------------
+  {
+    id: 'motion-transition-all', dim: 'motion',
+    label: 'transition: all (gate 11)',
+    fn: ({ css }) => {
+      const bad = /transition\s*:\s*all\b/i.test(css);
+      return { pass: !bad, note: bad ? 'transition: all present' : 'transitions are scoped' };
+    },
+  },
+  {
+    id: 'motion-hover-scale', dim: 'motion',
+    label: 'Uniform hover-scale (gate 12)',
+    fn: ({ css }) => {
+      const bad = /:hover[^{}]*\{[^{}]*transform\s*:\s*scale\(\s*1\.0[1-9]/i.test(css) || /hover:scale-10[0-9]/i.test(css);
+      return { pass: !bad, note: bad ? 'hover scale present' : 'no rote hover-scale' };
+    },
+  },
+  {
+    id: 'motion-bouncy-easing', dim: 'motion',
+    label: 'Bouncy/overshoot easing on UI state (gate 13)',
+    fn: ({ css }) => {
+      for (const m of css.matchAll(/cubic-bezier\(\s*([0-9.-]+)\s*,\s*([0-9.-]+)\s*,\s*([0-9.-]+)\s*,\s*([0-9.-]+)\s*\)/gi)) {
+        const y1 = parseFloat(m[2]); const y2 = parseFloat(m[4]);
+        if (y1 > 1 || y2 > 1 || y1 < 0 || y2 < 0) return { pass: false, note: `overshoot ${m[0]}` };
+      }
+      return { pass: true, note: 'no overshoot easing' };
+    },
+  },
+  {
+    id: 'motion-layout-animation', dim: 'motion',
+    label: 'Animating layout properties (gate 15)',
+    fn: ({ css }) => {
+      const bad = /transition\s*:[^;}]*\b(width|height|top|left|right|bottom|margin|padding)\b/i.test(css);
+      return { pass: !bad, note: bad ? 'layout prop in transition' : 'animates transform/opacity only' };
+    },
+  },
+  {
+    id: 'motion-no-reduced-motion', dim: 'motion',
+    label: 'Animation without prefers-reduced-motion fallback (gate 29)',
+    fn: ({ css }) => {
+      const hasMotion = /@keyframes|animation\s*:|transition\s*:/i.test(css);
+      const hasGuard = /prefers-reduced-motion/i.test(css);
+      return { pass: !hasMotion || hasGuard, note: hasMotion ? (hasGuard ? 'guarded' : 'no reduced-motion guard') : 'no motion' };
+    },
+  },
+
+  // ---- INTERACTION ------------------------------------------------------
+  {
+    id: 'interaction-emoji-icon', dim: 'interaction',
+    label: 'Emoji used as a feature/step icon (gate 60)',
+    fn: ({ html }) => {
+      const body = html.replace(/<style[\s\S]*?<\/style>/gi, '').replace(/<svg[\s\S]*?<\/svg>/gi, '');
+      const bad = /[\u{1F300}-\u{1FAFF}\u{2600}-\u{27BF}\u{2B00}-\u{2BFF}\u{FE0F}]/u.test(body);
+      return { pass: !bad, note: bad ? 'emoji glyph in markup' : 'no emoji icons' };
+    },
+  },
+  {
+    id: 'interaction-all-primary', dim: 'interaction',
+    label: 'Every button styled as primary (no secondary register)',
+    fn: ({ html, css }) => {
+      const btns = (html.match(/<(?:button|a)[^>]*class="[^"]*\b(?:btn|button|cta)\b/gi) || []).length;
+      const hasVariant = /\b(btn|button)[-_]{1,2}(secondary|ghost|outline|tertiary|quiet|text)\b|data-variant|\bbtn--/i.test(html + css);
+      return { pass: btns < 3 || hasVariant, note: btns >= 3 && !hasVariant ? `${btns} buttons, one register` : 'button hierarchy present' };
+    },
+  },
+  {
+    id: 'interaction-placeholder-names', dim: 'interaction',
+    label: 'Placeholder names / startup clichés (gate 20)',
+    fn: ({ html }) => {
+      const bad = /jane doe|john smith|john doe|lorem ipsum|\bacme\b|\bnexus\b|seamless|unleash|\bwidget(?:co|inc)\b/i.test(html);
+      return { pass: !bad, note: bad ? 'placeholder/cliché copy' : 'specific copy' };
+    },
+  },
+  {
+    id: 'interaction-modal-reflex', dim: 'interaction',
+    label: 'Reaching for a modal/dialog reflexively',
+    fn: ({ html }) => {
+      const bad = /<dialog\b|class="[^"]*\bmodal\b|role="dialog"/i.test(html);
+      return { pass: !bad, note: bad ? 'modal present' : 'no reflexive modal' };
+    },
+  },
+
+  // ---- RESPONSIVE -------------------------------------------------------
+  {
+    id: 'responsive-overflow-clip', dim: 'responsive',
+    label: 'Root missing overflow-x: clip (gates 36/62)',
+    fn: ({ css }) => {
+      const bad = !/(html|body)[^{}]*\{[^{}]*overflow-x\s*:\s*clip/i.test(css) && !/(html|body)\s*,\s*(html|body)[^{}]*\{[^{}]*overflow-x\s*:\s*clip/i.test(css);
+      return { pass: !bad, note: bad ? 'no overflow-x: clip' : 'overflow-x clipped' };
+    },
+  },
+  {
+    id: 'responsive-img-grid-minmax', dim: 'responsive',
+    label: 'Image-bearing 1fr grid track without minmax(0,1fr) (gate 61)',
+    fn: ({ css, html }) => {
+      const hasImg = /<img|<picture/i.test(html);
+      const bareFr = /grid-template-columns\s*:\s*[^;}]*(?<!minmax\(0,\s*)\b1fr/i.test(css) && /repeat\(\s*\d+\s*,\s*1fr/i.test(css) === false ? /grid-template-columns\s*:\s*1fr\b/i.test(css) : true;
+      const usesMinmax = /minmax\(\s*0\s*,\s*1fr/i.test(css);
+      const bad = hasImg && bareFr && !usesMinmax;
+      return { pass: !bad, note: bad ? 'bare 1fr track with images' : 'minmax-guarded or no images' };
+    },
+  },
+  {
+    id: 'responsive-feature-amputation', dim: 'responsive',
+    label: 'Content (not nav) hidden on mobile (feature amputation)',
+    fn: ({ css }) => {
+      const media = maxWidthMediaBodies(css).join('\n');
+      const offenders = [...media.matchAll(/([^{}]+)\{[^{}]*display\s*:\s*none/gi)]
+        .map((m) => m[1].trim())
+        .filter((s) => !/nav|menu|toggle|hamburger|burger|skip|drawer|sheet|backdrop|overlay|sr-only|visually-hidden|__bar|mobile|desktop-only|show-/i.test(s));
+      return { pass: offenders.length === 0, note: offenders.length ? `hides ${offenders[0]}` : 'no content amputation' };
+    },
+  },
+
+  // ---- GENERAL QUALITY --------------------------------------------------
+  {
+    id: 'general-focus-visible', dim: 'general',
+    label: 'Interactive elements without :focus-visible (gate 28)',
+    fn: ({ html, css }) => {
+      const interactive = /<(button|a\s|input|select|textarea|summary)/i.test(html);
+      const hasFocus = /:focus-visible/i.test(css);
+      return { pass: !interactive || hasFocus, note: interactive ? (hasFocus ? 'focus-visible present' : 'no focus-visible styles') : 'no interactive els' };
+    },
+  },
+  {
+    id: 'general-stamp', dim: 'general',
+    label: 'Missing Hallmark macrostructure stamp (gate 21)',
+    fn: ({ stamp }) => ({ pass: /macrostructure/i.test(stamp), note: stamp ? 'stamp present' : 'no stamp comment' }),
+  },
+  {
+    id: 'general-state-coverage', dim: 'general',
+    label: 'Interactive elements missing :hover/:active/:disabled coverage',
+    fn: ({ html, css }) => {
+      const interactive = /<(button|a\s|input)/i.test(html);
+      if (!interactive) return { pass: true, note: 'no interactive els' };
+      const states = ['\\:hover', '\\:active', '\\:disabled|\\[disabled\\]|\\[aria-disabled'];
+      const missing = states.filter((s) => !new RegExp(s, 'i').test(css));
+      return { pass: missing.length === 0, note: missing.length ? `${missing.length} state(s) missing` : 'states covered' };
+    },
+  },
+];
+
+// ---------------------------------------------------------------- v2 helpers
+function pxOf(value, map) {
+  const r = resolveVar(String(value).trim(), map);
+  if (/^0$/.test(r)) return 0;
+  const m = r.match(/(-?[0-9.]+)px/);
+  return m ? parseFloat(m[1]) : null;
+}
+// hero/lede container padding -> {top, bottom} in px, or null
+function heroPadding(rules, map) {
+  const cand = rules.find((r) => /(^|[\s,])[.#]?(hero|lede|masthead)\b\s*$/.test(r.sel) || /(^|[\s,])(header\.hero|\.hero|\.lede|\.masthead)\s*$/.test(r.sel));
+  if (!cand) return null;
+  const blk = cand.body.match(/padding-block\s*:\s*([^;]+)/i);
+  if (blk) {
+    const parts = blk[1].trim().split(/\s+/);
+    return { top: pxOf(parts[0], map), bottom: pxOf(parts[1] ?? parts[0], map), sel: cand.sel };
+  }
+  const ps = cand.body.match(/padding-block-start\s*:\s*([^;]+)/i);
+  const pe = cand.body.match(/padding-block-end\s*:\s*([^;]+)/i);
+  if (ps && pe) return { top: pxOf(ps[1], map), bottom: pxOf(pe[1], map), sel: cand.sel };
+  const pad = cand.body.match(/(?<!-)\bpadding\s*:\s*([^;]+)/i);
+  if (!pad) return null;
+  const p = pad[1].trim().split(/\s+(?![^(]*\))/);
+  let top, bottom;
+  if (p.length === 1) { top = bottom = pxOf(p[0], map); }
+  else if (p.length === 2) { top = bottom = pxOf(p[0], map); }
+  else if (p.length === 3) { top = pxOf(p[0], map); bottom = pxOf(p[2], map); }
+  else { top = pxOf(p[0], map); bottom = pxOf(p[2], map); }
+  return { top, bottom, sel: cand.sel };
+}
+function isMonoFamily(value, map) {
+  const fam = resolveVar(String(value), map).split(',')[0].toLowerCase();
+  return /mono/.test(fam) || /\bmonospace\b/.test(resolveVar(String(value), map).toLowerCase());
+}
+
+// ---------------------------------------------------------------- rule set v2
+// Added per "Your Evals Will Break": each probes a failure mode v1 cannot see.
+const EXTRA_V2 = [
+  {
+    id: 'v2-hero-float', dim: 'layout',
+    label: 'Hero pads symmetrically / top-heavy — floats off the page (gate 54)',
+    fn: ({ rules, map }) => {
+      const hp = heroPadding(rules, map);
+      if (!hp || hp.top == null || hp.bottom == null) return { pass: true, note: 'no measurable hero padding' };
+      const ok = hp.bottom >= 1.3 * hp.top || hp.top === 0;
+      return { pass: ok, note: ok ? 'hero sits into the page' : `top ${hp.top} / bottom ${hp.bottom} (need ≥1.3×)` };
+    },
+  },
+  {
+    id: 'v2-dark-mode-reflex', dim: 'color',
+    label: 'Defaulting to dark mode reflexively',
+    fn: ({ rules, map, genre, css }) => {
+      const exempt = /atmospheric|midnight|noir|terminal|cinema/i.test(genre) || /theme:\s*(midnight|terminal|noir)/i.test(css);
+      const body = rules.find((r) => /(^|[\s,])body\b/.test(r.sel));
+      const bm = body?.body.match(/background(?:-color)?\s*:\s*([^;]+)/i);
+      const L = bm ? oklchL(resolveVar(bm[1], map)) : null;
+      const dark = (L != null && L < 0.30) || /color-scheme\s*:\s*dark\b(?!\s*light)/i.test(css);
+      return { pass: !dark || exempt, note: dark ? (exempt ? 'dark but justified' : `dark base (L ${L?.toFixed(2)})`) : 'light base' };
+    },
+  },
+  {
+    id: 'v2-hero-metric-stat', dim: 'layout',
+    label: 'Hero metric layout (big number + supporting stats)',
+    fn: ({ rules, map }) => {
+      const bad = rules.find((r) => /(stat|metric|kpi|figure-n|big-?num)/.test(r.sel) && (() => { const m = r.body.match(/font-size\s*:\s*([^;]+)/i); const px = m ? pxOf(m[1], map) : null; return px != null && px >= 36; })());
+      return { pass: !bad, note: bad ? `metric cluster ${bad.sel}` : 'no hero-metric tell' };
+    },
+  },
+  {
+    id: 'v2-icon-tile-above-heading', dim: 'typography',
+    label: 'Icon tile stacked directly above a heading',
+    fn: ({ html }) => {
+      const heads = [...html.matchAll(/<h[2-4][\s>]/gi)];
+      for (const h of heads) {
+        const before = html.slice(Math.max(0, h.index - 110), h.index);
+        if (/<svg\b/i.test(before) || /class="[^"]*\b(icon|ico|tile|i-tile|feature-icon)\b/i.test(before)) return { pass: false, note: 'icon/svg tile precedes a heading' };
+      }
+      return { pass: true, note: 'no icon-above-heading tiles' };
+    },
+  },
+  {
+    id: 'v2-mono-as-shorthand', dim: 'typography',
+    label: 'Monospace used as "technical" shorthand across the UI',
+    fn: ({ rules, map }) => {
+      const sels = new Set();
+      for (const r of rules) {
+        if (/:root|\[data-theme/.test(r.sel)) continue;
+        if (/\b(pre|code|kbd|samp)\b/.test(r.sel)) continue;
+        const m = r.body.match(/font-family\s*:\s*([^;]+)/i);
+        if (m && isMonoFamily(m[1], map)) sels.add(r.sel);
+      }
+      return { pass: sels.size < 3, note: sels.size >= 3 ? `mono on ${sels.size} non-code selectors` : 'mono kept to code/labels' };
+    },
+  },
+  {
+    id: 'v2-everything-in-cards', dim: 'layout',
+    label: 'Wrapping all content in cards',
+    fn: ({ html }) => {
+      const n = (html.match(/class="[^"]*\b(card|panel|tile|box)\b/gi) || []).length;
+      return { pass: n <= 6, note: n > 6 ? `${n} card-like wrappers` : `${n} card-like wrappers` };
+    },
+  },
+];
+
+// ---------------------------------------------------------------- scoring
+function analyze(path, version = 'v1') {
+  const doc = loadDoc(path);
+  const map = tokenMap(doc.css);
+  const ctx = { ...doc, map, rules: cssRules(doc.css), fams: fontFamilies(doc.css, map) };
+  const ruleset = version === 'v2' ? RULES.concat(EXTRA_V2) : RULES;
+
+  const byDim = {};
+  for (const rule of ruleset) {
+    const res = rule.fn(ctx);
+    (byDim[rule.dim] ||= []).push({ id: rule.id, label: rule.label, pass: !!res.pass, note: res.note });
+  }
+  const dims = {};
+  for (const [dim, list] of Object.entries(byDim)) {
+    const passed = list.filter((r) => r.pass).length;
+    dims[dim] = { score: +((passed / list.length) * 5).toFixed(3), passed, total: list.length, rules: list };
+  }
+  const dimScores = Object.values(dims).map((d) => d.score);
+  const overall = +(dimScores.reduce((a, b) => a + b, 0) / dimScores.length).toFixed(3);
+  return { file: path, genre: doc.genre || 'n/a', dims, overall, ruleCount: ruleset.length };
+}
+
+// ---------------------------------------------------------------- cli
+import { pathToFileURL } from 'node:url';
+if (import.meta.url === pathToFileURL(process.argv[1] || '').href) {
+  const args = process.argv.slice(2);
+  const file = args.find((a) => !a.startsWith('--'));
+  if (!file) { console.error('usage: node detector.mjs <file.html> [--json] [--eval v1|v2]'); process.exit(1); }
+  const vi = args.indexOf('--eval');
+  const version = vi >= 0 ? args[vi + 1] : 'v1';
+  const result = analyze(file, version);
+  if (args.includes('--json')) {
+    console.log(JSON.stringify(result, null, 2));
+  } else {
+    console.log(`\n${result.file}  (genre: ${result.genre})  overall ${result.overall.toFixed(2)}/5`);
+    for (const [dim, d] of Object.entries(result.dims)) {
+      console.log(`  ${dim.padEnd(12)} ${d.score.toFixed(2)}  (${d.passed}/${d.total})`);
+      for (const r of d.rules.filter((x) => !x.pass)) console.log(`     ✗ ${r.id} — ${r.note}`);
+    }
+  }
+}
+
+export { analyze, RULES, EXTRA_V2 };
diff --git a/evals/fixtures/fernweh.html b/evals/fixtures/fernweh.html
new file mode 100644
index 0000000..ac1db01
--- /dev/null
+++ b/evals/fixtures/fernweh.html
@@ -0,0 +1,117 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Fernweh — eight days on foot</title>
+<style>
+  /* Hallmark · macrostructure: narrative-workflow · genre: atmospheric · theme: catalog (Linen) · contrast: pass (46-50) · slop: pass (51-77) */
+  :root {
+    --paper: oklch(0.97 0.012 80);
+    --paper-2: oklch(0.93 0.018 75);
+    --ink: oklch(0.26 0.03 60);
+    --muted: oklch(0.48 0.03 60);
+    --line: oklch(0.86 0.02 75);
+    --accent: oklch(0.52 0.11 40);
+    --accent-ink: oklch(0.98 0.01 80);
+    --bloom: oklch(0.8 0.09 70 / 0.55);
+    --font-display: "Fraunces", Georgia, serif;
+    --font-body: Georgia, "Times New Roman", serif;
+    --s1: 8px; --s2: 16px; --s3: 24px; --s4: 48px; --s5: 96px;
+  }
+  * { box-sizing: border-box; }
+  html, body { overflow-x: clip; }
+  body {
+    margin: 0; color: var(--ink); font-family: var(--font-body);
+    font-size: 18px; line-height: 1.7;
+    background:
+      radial-gradient(60% 50% at 50% 0%, var(--bloom), transparent 70%),
+      var(--paper);
+  }
+  .wrap { max-width: 980px; margin: 0 auto; padding: 0 var(--s3); }
+  a { color: inherit; }
+  .btn {
+    font: inherit; border: 1px solid var(--ink); border-radius: 999px;
+    padding: 12px 24px; background: var(--accent); color: var(--accent-ink);
+    cursor: pointer; transition: background .2s ease, color .2s ease;
+  }
+  .btn:hover { background: var(--ink); }
+  .btn:active { transform: translateY(1px); }
+  .btn:focus-visible { outline: 2px solid var(--ink); outline-offset: 2px; }
+  .btn:disabled { opacity: .55; cursor: not-allowed; }
+  .btn--secondary { background: transparent; color: var(--ink); }
+  .btn--secondary:hover { background: var(--paper-2); color: var(--ink); }
+  @media (prefers-reduced-motion: reduce) { * { transition: none !important; } }
+
+  nav { display: flex; align-items: center; justify-content: space-between; gap: var(--s4); padding: var(--s3) 0; }
+  nav .nav-links { display: flex; gap: var(--s4); align-items: center; }
+  .brand { font-family: var(--font-display); font-size: 22px; }
+
+  .hero { padding: var(--s4) 0 var(--s5); }
+  .hero h1 { font-family: var(--font-display); font-weight: 400; font-size: clamp(40px, 8vw, 84px); line-height: 1.05; margin: 0 0 var(--s3); max-width: 18ch; }
+  .hero p { max-width: 60ch; margin: 0 0 var(--s4); color: var(--muted); font-size: 20px; }
+  .hero .ctas { display: flex; gap: var(--s2); }
+
+  .trips { padding: var(--s5) 0; }
+  .trips h2 { font-family: var(--font-display); font-weight: 400; font-size: 34px; margin: 0 0 var(--s4); max-width: 20ch; }
+  .trip-list { display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: var(--s4); }
+  .trip { padding: var(--s3) 0; border-top: 1px solid var(--line); }
+  .trip .no { font-family: var(--font-display); font-size: 15px; color: var(--accent); }
+  .trip h3 { font-family: var(--font-display); font-weight: 400; font-size: 24px; margin: 4px 0 var(--s1); }
+  .trip p { color: var(--muted); }
+  .trip .cta { margin-top: var(--s2); display: flex; }
+
+  blockquote { font-family: var(--font-display); font-size: 28px; line-height: 1.4; max-width: 30ch; margin: var(--s5) 0; font-weight: 400; }
+
+  footer { padding: var(--s4) 0; text-align: center; color: var(--muted); font-size: 15px; border-top: 1px solid var(--line); }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <nav>
+    <span class="brand">Fernweh</span>
+    <span class="nav-links">
+      <a href="#trips">Trips</a>
+      <a href="#about">Why we walk</a>
+      <a href="#book">Book</a>
+    </span>
+  </nav>
+
+  <header class="hero">
+    <h1>Eight days on foot, nothing booked but the path.</h1>
+    <p>Fernweh runs small walking trips for eight people at a time. No coach
+      transfers, no packed itinerary — just a route, a host who lives there,
+      and the unhurried distance a person can cover between breakfast and dusk.
+      We carry your bags ahead so you carry only the day.</p>
+    <div class="ctas">
+      <button class="btn">See this season's walks</button>
+      <button class="btn btn--secondary">Request the field notes</button>
+    </div>
+  </header>
+
+  <section class="trips" id="trips">
+    <h2>This season</h2>
+    <div class="trip-list">
+      <article class="trip">
+        <div class="no">Walk 01</div>
+        <h3>The Carmarthen ridgeline</h3>
+        <p>A high green spine through mid-Wales, ending each evening in a
+          farmhouse kitchen. Moderate; long days, soft ground.</p>
+        <div class="cta"><button class="btn btn--secondary">Hold a place</button></div>
+      </article>
+      <article class="trip">
+        <div class="no">Walk 02</div>
+        <h3>The Alentejo coast</h3>
+        <p>Cork oak to cliff-edge along Portugal's quiet south. Warm light,
+          flat miles, fish grilled where it lands.</p>
+        <div class="cta"><button class="btn btn--secondary">Hold a place</button></div>
+      </article>
+    </div>
+  </section>
+
+  <blockquote>"We don't tell you what you'll see. We tell you when to start walking."</blockquote>
+
+  <footer>Fernweh · eight walkers, eight days, one host who lives there.</footer>
+</div>
+</body>
+</html>
diff --git a/evals/fixtures/fernweh.judge.json b/evals/fixtures/fernweh.judge.json
new file mode 100644
index 0000000..f3c3241
--- /dev/null
+++ b/evals/fixtures/fernweh.judge.json
@@ -0,0 +1,10 @@
+{
+  "philosophy": 4,
+  "hierarchy": 4,
+  "execution": 5,
+  "specificity": 4,
+  "restraint": 4,
+  "variety": 3,
+  "honesty": 5,
+  "notes": "Focus rings, overflow clip, and a stamp finish execution. Structure is still a centred-then-two-up list; variety remains the weak axis."
+}
diff --git a/evals/fixtures/kiln.html b/evals/fixtures/kiln.html
new file mode 100644
index 0000000..493ab90
--- /dev/null
+++ b/evals/fixtures/kiln.html
@@ -0,0 +1,130 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Kiln &amp; Co. — a seasonal run of stoneware</title>
+<style>
+  /* Hallmark · macrostructure: split-studio · genre: editorial · theme: catalog (Almanac) · contrast: pass (46-50) · slop: pass (51-77) */
+  :root {
+    --paper: oklch(0.95 0.01 95);
+    --paper-2: oklch(0.91 0.014 92);
+    --ink: oklch(0.24 0.02 70);
+    --muted: oklch(0.47 0.025 70);
+    --line: oklch(0.84 0.02 90);
+    --accent: oklch(0.5 0.09 35);
+    --accent-ink: oklch(0.98 0.01 95);
+    --font-display: "Hedvig Letters Serif", Georgia, serif;
+    --font-body: "Iowan Old Style", Georgia, serif;
+    --font-mono: "IBM Plex Mono", ui-monospace, monospace;
+    --s1: 8px; --s2: 16px; --s3: 24px; --s4: 48px; --s5: 96px;
+  }
+  * { box-sizing: border-box; }
+  html, body { overflow-x: clip; }
+  body { margin: 0; background: var(--paper); color: var(--ink); font-family: var(--font-body); font-size: 18px; line-height: 1.65; }
+  .wrap { max-width: 1080px; margin: 0 auto; padding: 0 var(--s3); }
+  a { color: inherit; }
+  .btn { font: inherit; border: 1px solid var(--ink); background: var(--accent); color: var(--accent-ink); padding: 12px 20px; border-radius: 4px; cursor: pointer; transition: background .2s ease; }
+  .btn:hover { background: var(--ink); color: var(--paper); }
+  .btn:active { transform: translateY(1px); }
+  .btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
+  .btn:disabled { opacity: .55; cursor: not-allowed; }
+  @media (prefers-reduced-motion: reduce) { * { transition: none !important; } }
+
+  header.top { display: flex; align-items: center; justify-content: space-between; padding: var(--s3) 0; border-bottom: 1px solid var(--line); }
+  .brand { font-family: var(--font-display); font-size: 22px; }
+
+  .lede { padding: var(--s4) 0 var(--s5); display: grid; grid-template-columns: minmax(0, 1.4fr) minmax(0, 1fr); gap: var(--s4); align-items: end; }
+  .lede h1 { font-family: var(--font-display); font-weight: 400; font-size: clamp(38px, 6vw, 68px); line-height: 1.08; margin: 0; }
+  .lede .meta { font-family: var(--font-mono); font-size: 13px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.08em; }
+  .lede .note { color: var(--muted); margin-top: var(--s2); }
+
+  .pieces { padding: var(--s4) 0 var(--s5); }
+  .pieces h2 { font-family: var(--font-display); font-weight: 400; font-size: 30px; }
+  .grid3 { display: grid; grid-template-columns: minmax(0, 1.4fr) minmax(0, 1fr) minmax(0, 1.1fr); gap: var(--s3); margin-top: var(--s3); }
+  .piece { background: var(--paper-2); border: 1px solid var(--line); border-radius: 6px; padding: var(--s3); }
+  .piece h3 { font-family: var(--font-display); font-weight: 400; margin: 0 0 4px; font-size: 21px; }
+  .piece .price { font-family: var(--font-mono); color: var(--muted); }
+  .piece p { color: var(--muted); margin: 8px 0 0; }
+
+  .gallery { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: var(--s2); padding: var(--s4) 0; }
+  .gallery .ph { aspect-ratio: 4 / 5; background: var(--paper-2); border: 1px solid var(--line); border-radius: 6px; }
+
+  .reserve { border-top: 1px solid var(--line); padding: var(--s5) 0; display: grid; grid-template-columns: minmax(0, 1fr) minmax(0, 1.2fr); gap: var(--s4); align-items: start; }
+  .reserve h2 { font-family: var(--font-display); font-weight: 400; font-size: 30px; margin: 0; }
+  .reserve p { margin: 0; color: var(--muted); }
+  .reserve form { display: flex; gap: var(--s2); flex-wrap: wrap; margin-top: var(--s2); }
+  .reserve input { font: inherit; padding: 12px 16px; border: 1px solid var(--line); border-radius: 4px; background: var(--paper); color: var(--ink); flex: 1; min-width: 220px; }
+  .reserve input:focus-visible { outline: 2px solid var(--accent); outline-offset: 1px; }
+
+  footer { border-top: 1px solid var(--line); padding: var(--s4) 0; color: var(--muted); font-size: 15px; }
+
+  @media (max-width: 760px) {
+    .lede { grid-template-columns: 1fr; }
+    .grid3 { grid-template-columns: 1fr; }
+    .gallery { grid-template-columns: 1fr 1fr; }
+    .reserve { grid-template-columns: 1fr; }
+  }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <header class="top">
+    <span class="brand">Kiln &amp; Co.</span>
+    <nav style="display:flex; gap:24px; align-items:center;">
+      <a href="#pieces">The run</a>
+      <a href="#about">The workshop</a>
+      <a class="btn" href="#reserve">Reserve a set</a>
+    </nav>
+  </header>
+
+  <section class="lede">
+    <h1>A winter run of stoneware, thrown two at a time.</h1>
+    <div>
+      <p class="meta">Edition 04 · Glaze: ash &amp; iron · 38 pieces</p>
+      <p class="note">Thrown between November and February, fired in two
+        loads. When the run sells out, the edition closes — we don't reprint.</p>
+    </div>
+  </section>
+
+  <section class="pieces" id="pieces">
+    <h2>What we made this season</h2>
+    <div class="grid3">
+      <article class="piece">
+        <h3>Morning bowl</h3>
+        <div class="price">£42</div>
+        <p>A wide, low bowl for the first thing you eat. Iron glaze, unglazed foot.</p>
+      </article>
+      <article class="piece">
+        <h3>Tall carafe</h3>
+        <div class="price">£68</div>
+        <p>Holds a litre. Pours clean from a pulled lip. Ash glaze, slight tilt.</p>
+      </article>
+      <article class="piece">
+        <h3>Pair of cups</h3>
+        <div class="price">£36</div>
+        <p>Sold as two because the wheel makes them in pairs. No two identical.</p>
+      </article>
+    </div>
+
+    <div class="gallery" aria-label="Photographs of the workshop">
+      <div class="ph"></div><div class="ph"></div><div class="ph"></div>
+    </div>
+  </section>
+
+  <section class="reserve" id="reserve">
+    <h2>Reserve a set</h2>
+    <div>
+      <p>Tell us which pieces and we'll hold them through the firing. We reply
+        by hand, usually the same week.</p>
+      <form action="mailto:hello@kilnandco.example" method="post">
+        <input type="email" name="email" placeholder="Your email" aria-label="Your email" required />
+        <button class="btn" type="submit">Hold my pieces</button>
+      </form>
+    </div>
+  </section>
+
+  <footer>Kiln &amp; Co. · two people, one wheel, a run each season.</footer>
+</div>
+</body>
+</html>
diff --git a/evals/fixtures/kiln.judge.json b/evals/fixtures/kiln.judge.json
new file mode 100644
index 0000000..3f8950c
--- /dev/null
+++ b/evals/fixtures/kiln.judge.json
@@ -0,0 +1,10 @@
+{
+  "philosophy": 4,
+  "hierarchy": 4,
+  "execution": 5,
+  "specificity": 4,
+  "restraint": 5,
+  "variety": 4,
+  "honesty": 5,
+  "notes": "Side-tab stripe and decorative sparkline removed; gallery reflows instead of hiding on mobile; stamp added. Execution and restraint fully resolved."
+}
diff --git a/evals/fixtures/ledger.html b/evals/fixtures/ledger.html
new file mode 100644
index 0000000..e918a5b
--- /dev/null
+++ b/evals/fixtures/ledger.html
@@ -0,0 +1,131 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Ledger — double-entry bookkeeping for your terminal</title>
+<style>
+  /* Hallmark · macrostructure: stat-led · genre: modern-minimal · theme: catalog (Quiet) · contrast: pass (46-50) · slop: pass (51-77) */
+  :root {
+    --paper: oklch(0.985 0.004 250);
+    --paper-2: oklch(0.96 0.005 250);
+    --ink: oklch(0.22 0.02 250);
+    --muted: oklch(0.5 0.02 250);
+    --line: oklch(0.9 0.006 250);
+    --accent: oklch(0.52 0.13 230);
+    --accent-2: oklch(0.46 0.12 230);
+    --accent-ink: oklch(0.99 0.01 230);
+    --font-display: "Hanken Grotesk", system-ui, sans-serif;
+    --font-body: "Source Serif 4", Georgia, serif;
+    --s1: 8px; --s2: 16px; --s3: 24px; --s4: 48px; --s5: 96px;
+  }
+  * { box-sizing: border-box; }
+  html, body { overflow-x: clip; }
+  body {
+    margin: 0; background: var(--paper); color: var(--ink);
+    font-family: var(--font-body); line-height: 1.6; font-size: 17px;
+  }
+  a { color: inherit; }
+  .wrap { max-width: 1100px; margin: 0 auto; padding: 0 var(--s3); }
+  .btn {
+    font: inherit; border: 0; border-radius: 8px; padding: 12px 20px;
+    background: var(--accent); color: var(--accent-ink); cursor: pointer;
+    transition: background-color .18s ease, transform .12s ease;
+  }
+  @media (prefers-reduced-motion: reduce) { * { transition: none !important; } }
+  .btn:hover { background: var(--accent-2); }
+  .btn:active { transform: translateY(1px); }
+  .btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
+  .btn:disabled { opacity: .55; cursor: not-allowed; }
+  .btn--secondary { background: transparent; color: var(--ink); border: 1px solid var(--line); }
+  .btn--secondary:hover { background: var(--paper-2); }
+  .link { color: var(--accent); text-decoration: none; border-bottom: 1px solid var(--line); }
+
+  nav { display: flex; align-items: center; justify-content: space-between; padding: var(--s2) 0; border-bottom: 1px solid var(--line); }
+  nav .links { display: flex; gap: var(--s3); align-items: center; }
+  .brand { font-weight: 700; letter-spacing: -0.01em; }
+
+  .hero { padding: var(--s4) 0 var(--s5); }
+  .hero h1 {
+    font-family: var(--font-display); font-weight: 800; font-size: clamp(40px, 7vw, 76px);
+    line-height: 1.05; margin: 0 0 var(--s3); letter-spacing: -0.03em; max-width: 16ch;
+    color: var(--ink);
+  }
+  .hero h1 em { font-style: normal; color: var(--accent); }
+  .hero p { font-size: 21px; color: var(--muted); max-width: 56ch; margin: 0 0 var(--s4); }
+  .hero .row { display: flex; gap: var(--s2); align-items: center; }
+
+  .facts { display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: var(--s3); padding: var(--s4) 0; border-block: 1px solid var(--line); }
+  .fact { display: flex; align-items: baseline; gap: 12px; }
+  .fact .n { font-family: var(--font-display); font-size: 28px; letter-spacing: -0.02em; }
+  .fact .l { color: var(--muted); font-size: 15px; }
+
+  .feature { padding: var(--s5) 0; }
+  .feature h2 { font-family: var(--font-display); font-size: 32px; letter-spacing: -0.02em; }
+  .grid3 { display: grid; grid-template-columns: minmax(0, 1.5fr) minmax(0, 1fr) minmax(0, 1fr); gap: var(--s3); margin-top: var(--s4); }
+  .card { background: var(--paper-2); border: 1px solid var(--line); border-radius: 14px; padding: var(--s3); }
+  .card h3 { margin: 0 0 8px; font-size: 19px; }
+  .card p { margin: 0; color: var(--muted); font-size: 16px; }
+  .card pre { margin: 12px 0 0; padding: 12px; background: var(--ink); color: var(--paper); border-radius: 10px; font-size: 13px; overflow:auto; }
+
+  footer { border-top: 1px solid var(--line); padding: var(--s4) 0; color: var(--muted); font-size: 15px; }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <nav>
+    <span class="brand">Ledger</span>
+    <span class="links">
+      <a href="#how">How it works</a>
+      <a href="#install">Install</a>
+      <a href="https://github.com">Source</a>
+      <button class="btn">Get started</button>
+    </span>
+  </nav>
+
+  <header class="hero">
+    <h1>Plain-text books that <em>balance themselves</em>.</h1>
+    <p>Ledger is a double-entry bookkeeping CLI for developers who'd rather
+      keep accounts in a file they own than a SaaS dashboard. Every entry is
+      a line of text; every report is one command.</p>
+    <div class="row">
+      <button class="btn">Install the CLI</button>
+      <button class="btn btn--secondary">Read the docs</button>
+      <a class="link" href="https://github.com">Star on GitHub →</a>
+    </div>
+  </header>
+
+  <section class="facts">
+    <div class="fact"><span class="n">0</span><span class="l">cloud accounts required</span></div>
+    <div class="fact"><span class="n">1</span><span class="l">plain-text file</span></div>
+    <div class="fact"><span class="n">2</span><span class="l">entries per transaction</span></div>
+    <div class="fact"><span class="n">∞</span><span class="l">reports you can pipe</span></div>
+  </section>
+
+  <section class="feature" id="how">
+    <h2>Books as a file, not a service</h2>
+    <div class="grid3">
+      <article class="card">
+        <h3>Append entries</h3>
+        <p>Write transactions in a readable journal format. Comments and tags travel with each line.</p>
+        <pre>ledger add "rent" -120000 assets:checking</pre>
+      </article>
+      <article class="card">
+        <h3>Run reports</h3>
+        <p>Balance sheets and income statements are commands, not screens. Pipe the output anywhere.</p>
+        <pre>ledger report balance --period 2026</pre>
+      </article>
+      <article class="card">
+        <h3>Diff your money</h3>
+        <p>Because the journal is text, git tracks every change. Review your finances like code.</p>
+        <pre>git diff books.journal</pre>
+      </article>
+    </div>
+  </section>
+
+  <footer>
+    <div class="wrap" style="padding:0">Ledger · MIT-licensed · a plain-text bookkeeping CLI.</div>
+  </footer>
+</div>
+</body>
+</html>
diff --git a/evals/fixtures/ledger.judge.json b/evals/fixtures/ledger.judge.json
new file mode 100644
index 0000000..ca0aa78
--- /dev/null
+++ b/evals/fixtures/ledger.judge.json
@@ -0,0 +1,10 @@
+{
+  "philosophy": 4,
+  "hierarchy": 4,
+  "execution": 5,
+  "specificity": 4,
+  "restraint": 5,
+  "variety": 4,
+  "honesty": 5,
+  "notes": "Hero bottom-weighted; big-number stat strip de-metered into an inline fact row. No longer the rote hero->big-stats->cards template; reads as a considered stat-led page."
+}
diff --git a/evals/fixtures/pulse.html b/evals/fixtures/pulse.html
new file mode 100644
index 0000000..5a5c5a9
--- /dev/null
+++ b/evals/fixtures/pulse.html
@@ -0,0 +1,115 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Pulse — developer analytics that actually ship</title>
+<style>
+  /* Hallmark · macrostructure: feature-stack · genre: modern-minimal · theme: catalog (Quiet) · contrast: pass (46-50) · slop: pass (51-84) */
+  :root {
+    --bg: oklch(0.98 0.006 230);
+    --surface: oklch(0.955 0.008 230);
+    --ink: oklch(0.24 0.02 235);
+    --muted: oklch(0.5 0.02 235);
+    --line: oklch(0.9 0.01 230);
+    --accent: oklch(0.52 0.12 200);
+    --accent-2: oklch(0.46 0.11 200);
+    --accent-ink: oklch(0.99 0.01 200);
+    --font: "Schibsted Grotesk", system-ui, sans-serif;
+    --mono: "Space Mono", ui-monospace, monospace;
+  }
+  * { box-sizing: border-box; }
+  html, body { overflow-x: clip; }
+  body { margin: 0; background: var(--bg); color: var(--ink); font-family: var(--font); font-size: 16px; line-height: 1.5; }
+  .wrap { max-width: 1160px; margin: 0 auto; padding: 0 24px; }
+  a { color: inherit; }
+  .btn { font: inherit; border: 0; border-radius: 10px; padding: 12px 20px; background: var(--accent); color: var(--accent-ink); cursor: pointer; transition: background-color .18s ease, transform .12s ease; }
+  .btn:hover { background: var(--accent-2); }
+  .btn:active { transform: translateY(1px); }
+  .btn:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
+  .btn:disabled { opacity: .55; cursor: not-allowed; }
+  .btn--secondary { background: transparent; color: var(--ink); border: 1px solid var(--line); }
+  .btn--secondary:hover { background: var(--surface); }
+  @media (prefers-reduced-motion: reduce) { * { transition: none !important; } }
+
+  nav { display: flex; align-items: center; justify-content: space-between; padding: 16px 0; }
+  .brand { font-weight: 800; letter-spacing: -0.02em; }
+  nav .links { display: flex; gap: 24px; align-items: center; }
+
+  .hero { padding: 56px 0 96px; }
+  .hero h1 {
+    font-size: clamp(40px, 7vw, 72px); line-height: 1.05; margin: 0 0 16px; max-width: 18ch; letter-spacing: -0.03em;
+    color: var(--ink);
+  }
+  .hero p { color: var(--muted); max-width: 60ch; margin: 0 0 28px; font-size: 19px; }
+  .hero .ctas { display: flex; gap: 12px; }
+
+  .facts { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px; padding: 24px 0 64px; }
+  .fact { border-top: 2px solid var(--line); padding-top: 16px; display: flex; align-items: baseline; gap: 12px; }
+  .fact .num { font-size: 32px; font-weight: 800; letter-spacing: -0.02em; }
+  .fact .label { font-family: var(--mono); text-transform: uppercase; font-size: 12px; color: var(--muted); letter-spacing: 0.08em; }
+  .fact .unit { color: var(--muted); font-size: 14px; }
+
+  .features { padding: 64px 0; }
+  .features h2 { font-size: 32px; margin: 0 0 40px; max-width: 22ch; }
+  .grid { display: grid; grid-template-columns: minmax(0, 1.3fr) minmax(0, 1fr) minmax(0, 1.1fr); gap: 16px; }
+  .card { background: var(--surface); border: 1px solid var(--line); border-radius: 16px; padding: 24px; }
+  .card h3 { margin: 0 0 8px; font-size: 19px; }
+  .card p { margin: 0; color: var(--muted); }
+  .card .tag { font-family: var(--mono); font-size: 11px; color: var(--accent-2); text-transform: uppercase; }
+
+  footer { border-top: 1px solid var(--line); padding: 40px 0; color: var(--muted); font-size: 14px; }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <nav>
+    <span class="brand">Pulse</span>
+    <span class="links">
+      <a href="#features">Features</a>
+      <a href="#pricing">Pricing</a>
+      <button class="btn">Start free</button>
+    </span>
+  </nav>
+
+  <header class="hero">
+    <h1>Developer analytics that actually ship.</h1>
+    <p>Pulse turns your event stream into dashboards your whole team will
+      check every morning. Real-time, dark by default, beautiful out of the box.</p>
+    <div class="ctas">
+      <button class="btn">Start free</button>
+      <a class="btn btn--secondary" href="#features">See a live dashboard</a>
+    </div>
+  </header>
+
+  <section class="facts">
+    <div class="fact"><span class="num">1</span><span class="label">SDK</span><span class="unit">drop-in, any language</span></div>
+    <div class="fact"><span class="num">0</span><span class="label">SQL</span><span class="unit">funnels without queries</span></div>
+    <div class="fact"><span class="num">∞</span><span class="label">events</span><span class="unit">retained at full fidelity</span></div>
+  </section>
+
+  <section class="features" id="features">
+    <h2>Everything in one pane</h2>
+    <div class="grid">
+      <article class="card">
+        <span class="tag">Realtime</span>
+        <h3>Live event stream</h3>
+        <p>Watch events land the moment they fire, with sub-second latency.</p>
+      </article>
+      <article class="card">
+        <span class="tag">Insights</span>
+        <h3>Funnels &amp; retention</h3>
+        <p>Build cohorts and funnels without writing a single SQL query.</p>
+      </article>
+      <article class="card">
+        <span class="tag">Alerts</span>
+        <h3>Anomaly alerts</h3>
+        <p>Get pinged in Slack when a metric drifts outside its band.</p>
+      </article>
+    </div>
+  </section>
+
+  <footer>Pulse · developer analytics. Dark by default.</footer>
+</div>
+</body>
+</html>
diff --git a/evals/fixtures/pulse.judge.json b/evals/fixtures/pulse.judge.json
new file mode 100644
index 0000000..0e895d3
--- /dev/null
+++ b/evals/fixtures/pulse.judge.json
@@ -0,0 +1,10 @@
+{
+  "philosophy": 4,
+  "hierarchy": 4,
+  "execution": 5,
+  "specificity": 4,
+  "restraint": 5,
+  "variety": 4,
+  "honesty": 5,
+  "notes": "Re-keyed to feature-stack (no longer a colour-swap of ledger's stat-led). Detector-clean, honest, light, single-accent. Variety resolved at the corpus level."
+}
diff --git a/evals/fixtures/vellum.html b/evals/fixtures/vellum.html
new file mode 100644
index 0000000..479536b
--- /dev/null
+++ b/evals/fixtures/vellum.html
@@ -0,0 +1,70 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Vellum — write like it's the last draft</title>
+<style>
+  /* Hallmark · macrostructure: long-document · genre: editorial · theme: catalog (Almanac) · contrast: pass (46-50) · slop: pass (51-84) */
+  :root {
+    --paper: oklch(0.98 0.006 95);
+    --ink: oklch(0.25 0.015 95);
+    --muted: oklch(0.5 0.02 95);
+    --line: oklch(0.88 0.01 95);
+    --accent: oklch(0.5 0.1 30);
+    --display: "Fraunces", Georgia, serif;
+    --serif: "Spectral", Georgia, serif;
+  }
+  * { box-sizing: border-box; }
+  html, body { overflow-x: clip; }
+  body { margin: 0; background: var(--paper); color: var(--ink); font-family: var(--serif); font-size: 19px; line-height: 1.6; }
+  .wrap { max-width: 68ch; margin: 0 auto; padding: 0 24px; }
+  a { color: var(--accent); }
+
+  .masthead { padding: 48px 0 96px; }
+  .masthead h1 { font-family: var(--display); font-size: clamp(40px, 7vw, 72px); line-height: 1.02; margin: 0 0 16px; font-weight: 600; }
+  .masthead p { color: var(--muted); font-size: 22px; margin: 0; }
+
+  article { padding: 24px 0 96px; }
+  article h2 { font-family: var(--display); font-size: 30px; font-weight: 600; margin: 48px 0 12px; }
+  article p { margin: 0 0 20px; }
+  article blockquote { font-family: var(--display); font-size: 24px; margin: 32px 0; padding-left: 20px; border-left: 3px solid var(--accent); }
+
+  footer { border-top: 1px solid var(--line); padding: 32px 0; color: var(--muted); font-size: 15px; }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <header class="masthead">
+    <h1>Write like it's the last draft.</h1>
+    <p>Vellum is a writing tool for people who finish things.</p>
+  </header>
+
+  <article>
+    <p>The blank page is not the problem. The problem is the second page, and
+      the third, and the slow erosion of the sentence you were proud of at
+      breakfast. Most writing software is built for the first ten minutes —
+      the burst, the outline, the satisfying click of a new document. Vellum
+      is built for the long middle, where books are actually written and
+      abandoned.</p>
+
+    <h2>One column, no chrome</h2>
+    <p>We removed the sidebar. We removed the formatting toolbar. We removed
+      the word count that taunts you from the corner. What remains is a single
+      column of text at a measure your eye can actually follow, and a margin
+      wide enough to think in. The interface gets out of the way so completely
+      that after a few minutes you forget you are using software at all.</p>
+
+    <blockquote>The tool should disappear. The sentence should not.</blockquote>
+
+    <h2>Drafts that remember</h2>
+    <p>Every save is a version, and every version is a place you can return
+      to. Change your mind about a chapter and the old one is still there,
+      whole, the way you left it. Writing is rewriting, and rewriting is only
+      safe when nothing is ever truly lost.</p>
+  </article>
+
+  <footer>Vellum · a writing tool for people who finish things.</footer>
+</div>
+</body>
+</html>
diff --git a/evals/fixtures/vellum.judge.json b/evals/fixtures/vellum.judge.json
new file mode 100644
index 0000000..e4e3bf0
--- /dev/null
+++ b/evals/fixtures/vellum.judge.json
@@ -0,0 +1,10 @@
+{
+  "philosophy": 4,
+  "hierarchy": 4,
+  "execution": 4,
+  "specificity": 4,
+  "restraint": 4,
+  "honesty": 5,
+  "variety": 4,
+  "notes": "Reading column fixed: 1.6 leading, 68ch measure, ragged-right, Fraunces/Spectral pairing. Stamped long-document (distinct macrostructure). Reads like an essay, not a generated page."
+}
diff --git a/evals/results/cycle-01-v1.json b/evals/results/cycle-01-v1.json
new file mode 100644
index 0000000..8e95d30
--- /dev/null
+++ b/evals/results/cycle-01-v1.json
@@ -0,0 +1,67 @@
+{
+  "cycle": 1,
+  "evalVersion": "v1",
+  "label": "Baseline: skill applied as-is; residual slop across motion, color, layout, interaction, visual.",
+  "ruleCount": 37,
+  "fixtureCount": 3,
+  "cycleScore": 74.2,
+  "aggDims": {
+    "typography": 4.524,
+    "color": 3.611,
+    "visual": 3.889,
+    "layout": 3.333,
+    "motion": 4.333,
+    "interaction": 3.333,
+    "responsive": 3.889,
+    "general": 2.778,
+    "craft": 3.714
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 71.9,
+      "dimScores": {
+        "typography": 3.571,
+        "color": 2.5,
+        "visual": 5,
+        "layout": 4.167,
+        "motion": 3,
+        "interaction": 3.75,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 3.714
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 72.8,
+      "dimScores": {
+        "typography": 5,
+        "color": 4.167,
+        "visual": 5,
+        "layout": 2.5,
+        "motion": 5,
+        "interaction": 2.5,
+        "responsive": 3.333,
+        "general": 1.667,
+        "craft": 3.571
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 78,
+      "dimScores": {
+        "typography": 5,
+        "color": 4.167,
+        "visual": 1.667,
+        "layout": 3.333,
+        "motion": 5,
+        "interaction": 3.75,
+        "responsive": 5,
+        "general": 3.333,
+        "craft": 3.857
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:13:36.611Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-02-v1.json b/evals/results/cycle-02-v1.json
new file mode 100644
index 0000000..9a5acf2
--- /dev/null
+++ b/evals/results/cycle-02-v1.json
@@ -0,0 +1,67 @@
+{
+  "cycle": 2,
+  "evalVersion": "v1",
+  "label": "Typography & palette discipline: gate 70 (single-typeface floor) + gate 71 (AI-palette / raw colour in gradients). ledger gets a real pairing and a solid headline; bloom tokenised.",
+  "ruleCount": 37,
+  "fixtureCount": 3,
+  "cycleScore": 78,
+  "aggDims": {
+    "typography": 5,
+    "color": 4.722,
+    "visual": 3.889,
+    "layout": 3.333,
+    "motion": 4.333,
+    "interaction": 3.333,
+    "responsive": 3.889,
+    "general": 2.778,
+    "craft": 3.809
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 81.3,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.167,
+        "motion": 3,
+        "interaction": 3.75,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 4
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 74.6,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 2.5,
+        "motion": 5,
+        "interaction": 2.5,
+        "responsive": 3.333,
+        "general": 1.667,
+        "craft": 3.571
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 78,
+      "dimScores": {
+        "typography": 5,
+        "color": 4.167,
+        "visual": 1.667,
+        "layout": 3.333,
+        "motion": 5,
+        "interaction": 3.75,
+        "responsive": 5,
+        "general": 3.333,
+        "craft": 3.857
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:15:21.057Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-03-v1.json b/evals/results/cycle-03-v1.json
new file mode 100644
index 0000000..339f7c4
--- /dev/null
+++ b/evals/results/cycle-03-v1.json
@@ -0,0 +1,67 @@
+{
+  "cycle": 3,
+  "evalVersion": "v1",
+  "label": "Layout discipline: gate 72 (justified text) + gate 73 (skipped heading levels). fernweh de-centred and re-leveled; rote 3-equal grids varied; kiln spacing back on the 4px scale.",
+  "ruleCount": 37,
+  "fixtureCount": 3,
+  "cycleScore": 81.8,
+  "aggDims": {
+    "typography": 5,
+    "color": 4.722,
+    "visual": 3.889,
+    "layout": 5,
+    "motion": 4.333,
+    "interaction": 3.333,
+    "responsive": 3.889,
+    "general": 2.778,
+    "craft": 3.857
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 83.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 3,
+        "interaction": 3.75,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 4
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 80.5,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 2.5,
+        "responsive": 3.333,
+        "general": 1.667,
+        "craft": 3.714
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 81.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 4.167,
+        "visual": 1.667,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 3.75,
+        "responsive": 5,
+        "general": 3.333,
+        "craft": 3.857
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:16:23.998Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-04-v1.json b/evals/results/cycle-04-v1.json
new file mode 100644
index 0000000..0c65d1e
--- /dev/null
+++ b/evals/results/cycle-04-v1.json
@@ -0,0 +1,67 @@
+{
+  "cycle": 4,
+  "evalVersion": "v1",
+  "label": "Interaction discipline: gate 74 (button hierarchy) + gate 75 (reflexive modal). Secondary register added to ledger/fernweh; fernweh emoji icons replaced with numbered walks; kiln modal swapped for an inline reserve section.",
+  "ruleCount": 37,
+  "fixtureCount": 3,
+  "cycleScore": 85.9,
+  "aggDims": {
+    "typography": 5,
+    "color": 4.722,
+    "visual": 3.889,
+    "layout": 5,
+    "motion": 4.333,
+    "interaction": 5,
+    "responsive": 3.889,
+    "general": 2.778,
+    "craft": 4.048
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 85.9,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 3,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 4
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 86.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 1.667,
+        "craft": 4
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 85.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 4.167,
+        "visual": 1.667,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 3.333,
+        "craft": 4.143
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:17:52.608Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-05-v1.json b/evals/results/cycle-05-v1.json
new file mode 100644
index 0000000..c53d060
--- /dev/null
+++ b/evals/results/cycle-05-v1.json
@@ -0,0 +1,67 @@
+{
+  "cycle": 5,
+  "evalVersion": "v1",
+  "label": "Cleanup + gate 76 (decorative chart) + gate 77 (feature amputation; detector media-extractor hardened). kiln side-tab/sparkline removed and gallery reflows; ledger motion scoped; stamps + overflow clip + focus rings across all three.",
+  "ruleCount": 37,
+  "fixtureCount": 3,
+  "cycleScore": 98.3,
+  "aggDims": {
+    "typography": 5,
+    "color": 5,
+    "visual": 5,
+    "layout": 5,
+    "motion": 5,
+    "interaction": 5,
+    "responsive": 5,
+    "general": 5,
+    "craft": 4.238
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 98.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 98.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 98.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:20:07.003Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-06-v2.json b/evals/results/cycle-06-v2.json
new file mode 100644
index 0000000..60668be
--- /dev/null
+++ b/evals/results/cycle-06-v2.json
@@ -0,0 +1,103 @@
+{
+  "cycle": 6,
+  "evalVersion": "v2",
+  "label": "EVAL UPGRADE -> v2 (your evals will break): +6 detector rules (hero-float/gate54, dark-mode reflex, hero-metric, icon-tile-above-heading, mono-as-shorthand, everything-in-cards) + cross-fixture order parameter (macrostructure reuse) + 2 adversarial fixtures (pulse, vellum). Re-measure only, no fixes.",
+  "ruleCount": 43,
+  "fixtureCount": 5,
+  "cycleScore": 76.4,
+  "aggDims": {
+    "typography": 4.444,
+    "color": 4.571,
+    "visual": 4.333,
+    "layout": 3.778,
+    "motion": 4.4,
+    "interaction": 4.75,
+    "responsive": 4.333,
+    "general": 4,
+    "craft": 3.571,
+    "structure": 0
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 86.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.889,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 87.2,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 87.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 0
+      }
+    },
+    {
+      "name": "pulse",
+      "score100": 45.9,
+      "dimScores": {
+        "typography": 3.333,
+        "color": 2.857,
+        "visual": 1.667,
+        "layout": 2.778,
+        "motion": 2,
+        "interaction": 3.75,
+        "responsive": 3.333,
+        "general": 1.667,
+        "craft": 1.571,
+        "structure": 0
+      }
+    },
+    {
+      "name": "vellum",
+      "score100": 74.9,
+      "dimScores": {
+        "typography": 3.889,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.333,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 3.571,
+        "structure": 0
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:23:44.743Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-07-v2.json b/evals/results/cycle-07-v2.json
new file mode 100644
index 0000000..1aa962c
--- /dev/null
+++ b/evals/results/cycle-07-v2.json
@@ -0,0 +1,103 @@
+{
+  "cycle": 7,
+  "evalVersion": "v2",
+  "label": "Resist the brief on pulse: gate 78 (dark-mode reflex) + gate 79 (glassmorphism). Flipped to a light single-accent palette; removed neon gradient/glass/gradient-text/side-tab; replaced fabricated 10x/99.99%/50k+ metrics with honest facts; scoped motion, focus rings, button hierarchy.",
+  "ruleCount": 43,
+  "fixtureCount": 5,
+  "cycleScore": 82.4,
+  "aggDims": {
+    "typography": 4.556,
+    "color": 5,
+    "visual": 5,
+    "layout": 3.778,
+    "motion": 5,
+    "interaction": 5,
+    "responsive": 4.333,
+    "general": 4.667,
+    "craft": 3.886,
+    "structure": 0
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 86.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.889,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 87.2,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 87.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 0
+      }
+    },
+    {
+      "name": "pulse",
+      "score100": 76.3,
+      "dimScores": {
+        "typography": 3.889,
+        "color": 5,
+        "visual": 5,
+        "layout": 2.778,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 5,
+        "craft": 3.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "vellum",
+      "score100": 74.9,
+      "dimScores": {
+        "typography": 3.889,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.333,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 3.571,
+        "structure": 0
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:25:29.554Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-08-v2.json b/evals/results/cycle-08-v2.json
new file mode 100644
index 0000000..47ac925
--- /dev/null
+++ b/evals/results/cycle-08-v2.json
@@ -0,0 +1,103 @@
+{
+  "cycle": 8,
+  "evalVersion": "v2",
+  "label": "pulse structural cleanup: gate 80 (icon-tile-above-heading) + gate 81 (mono-as-shorthand) + gate 82 (everything-in-cards). Distinctive face, empty icon tiles dropped, vanity KPI strip de-metered into a baseline fact row, grids varied, hero bottom-weighted, overflow clipped.",
+  "ruleCount": 43,
+  "fixtureCount": 5,
+  "cycleScore": 84.8,
+  "aggDims": {
+    "typography": 4.778,
+    "color": 5,
+    "visual": 5,
+    "layout": 4.222,
+    "motion": 5,
+    "interaction": 5,
+    "responsive": 4.667,
+    "general": 4.667,
+    "craft": 4.057,
+    "structure": 0
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 86.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.889,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 87.2,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 0
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 87.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 0
+      }
+    },
+    {
+      "name": "pulse",
+      "score100": 88,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4,
+        "structure": 0
+      }
+    },
+    {
+      "name": "vellum",
+      "score100": 74.9,
+      "dimScores": {
+        "typography": 3.889,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.333,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 3.333,
+        "general": 3.333,
+        "craft": 3.571,
+        "structure": 0
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:29:29.372Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-09-v2.json b/evals/results/cycle-09-v2.json
new file mode 100644
index 0000000..e5675f9
--- /dev/null
+++ b/evals/results/cycle-09-v2.json
@@ -0,0 +1,103 @@
+{
+  "cycle": 9,
+  "evalVersion": "v2",
+  "label": "Order parameter + reading comfort: gate 83 (long-form leading/measure) + gate 84 (cross-output macrostructure reuse). pulse re-keyed stat-led -> feature-stack; vellum stamped long-document with a Fraunces/Spectral pairing, 1.6 leading, 68ch measure, ragged-right. structure 0 -> 5.",
+  "ruleCount": 43,
+  "fixtureCount": 5,
+  "cycleScore": 97.6,
+  "aggDims": {
+    "typography": 5,
+    "color": 5,
+    "visual": 5,
+    "layout": 4.555,
+    "motion": 5,
+    "interaction": 5,
+    "responsive": 5,
+    "general": 5,
+    "craft": 4.257,
+    "structure": 5
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 96.1,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 3.889,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 5
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 97.2,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 5
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 97.7,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 4.444,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 5
+      }
+    },
+    {
+      "name": "pulse",
+      "score100": 98.9,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 5
+      }
+    },
+    {
+      "name": "vellum",
+      "score100": 98.3,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 5
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:30:31.596Z"
+}
\ No newline at end of file
diff --git a/evals/results/cycle-10-v2.json b/evals/results/cycle-10-v2.json
new file mode 100644
index 0000000..535612d
--- /dev/null
+++ b/evals/results/cycle-10-v2.json
@@ -0,0 +1,103 @@
+{
+  "cycle": 10,
+  "evalVersion": "v2",
+  "label": "Final consolidation: enforce gate 54 (hero bottom-weight) across the three originals; de-meter ledger's big-number stat strip into an inline fact row (clears the hero-metric tell v2 surfaced). All five fixtures detector-clean on v2.",
+  "ruleCount": 43,
+  "fixtureCount": 5,
+  "cycleScore": 98.7,
+  "aggDims": {
+    "typography": 5,
+    "color": 5,
+    "visual": 5,
+    "layout": 5,
+    "motion": 5,
+    "interaction": 5,
+    "responsive": 5,
+    "general": 5,
+    "craft": 4.315,
+    "structure": 5
+  },
+  "fixtures": [
+    {
+      "name": "ledger",
+      "score100": 98.9,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 5
+      }
+    },
+    {
+      "name": "fernweh",
+      "score100": 98.3,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 5
+      }
+    },
+    {
+      "name": "kiln",
+      "score100": 98.9,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 5
+      }
+    },
+    {
+      "name": "pulse",
+      "score100": 98.9,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.429,
+        "structure": 5
+      }
+    },
+    {
+      "name": "vellum",
+      "score100": 98.3,
+      "dimScores": {
+        "typography": 5,
+        "color": 5,
+        "visual": 5,
+        "layout": 5,
+        "motion": 5,
+        "interaction": 5,
+        "responsive": 5,
+        "general": 5,
+        "craft": 4.143,
+        "structure": 5
+      }
+    }
+  ],
+  "timestamp": "2026-05-21T10:31:23.320Z"
+}
\ No newline at end of file
diff --git a/evals/results/history.md b/evals/results/history.md
new file mode 100644
index 0000000..8d8131a
--- /dev/null
+++ b/evals/results/history.md
@@ -0,0 +1,31 @@
+# Eval history — Hallmark anti-slop hillclimb
+
+Score = mean of nine dimensions × 20 (0–100). Dimensions 1–8 are the
+deterministic Impeccable detector; `craft` is the LLM-judge mean of
+Hallmark's six axes + honesty.
+
+| Cycle | Eval | Rules | Score | visua | typog | color | layou | motio | inter | respo | gener | craft | struc | Change |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| 1 | v1 | 37 | **74.2** | 3.89 | 4.52 | 3.61 | 3.33 | 4.33 | 3.33 | 3.89 | 2.78 | 3.71 | — | — |
+| 2 | v1 | 37 | **78.0** | 3.89 | 5.00 | 4.72 | 3.33 | 4.33 | 3.33 | 3.89 | 2.78 | 3.81 | — | +3.8 |
+| 3 | v1 | 37 | **81.8** | 3.89 | 5.00 | 4.72 | 5.00 | 4.33 | 3.33 | 3.89 | 2.78 | 3.86 | — | +3.8 |
+| 4 | v1 | 37 | **85.9** | 3.89 | 5.00 | 4.72 | 5.00 | 4.33 | 5.00 | 3.89 | 2.78 | 4.05 | — | +4.1 |
+| 5 | v1 | 37 | **98.3** | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 4.24 | — | +12.4 |
+| 6 | v2 | 43 | **76.4** | 4.33 | 4.44 | 4.57 | 3.78 | 4.40 | 4.75 | 4.33 | 4.00 | 3.57 | 0.00 | -21.9 |
+| 7 | v2 | 43 | **82.4** | 5.00 | 4.56 | 5.00 | 3.78 | 5.00 | 5.00 | 4.33 | 4.67 | 3.89 | 0.00 | +6.0 |
+| 8 | v2 | 43 | **84.8** | 5.00 | 4.78 | 5.00 | 4.22 | 5.00 | 5.00 | 4.67 | 4.67 | 4.06 | 0.00 | +2.4 |
+| 9 | v2 | 43 | **97.6** | 5.00 | 5.00 | 5.00 | 4.55 | 5.00 | 5.00 | 5.00 | 5.00 | 4.26 | 5.00 | +12.8 |
+| 10 | v2 | 43 | **98.7** | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 4.32 | 5.00 | +1.1 |
+
+## Notes per cycle
+
+- **Cycle 1 (v1)** — Baseline: skill applied as-is; residual slop across motion, color, layout, interaction, visual.
+- **Cycle 2 (v1)** — Typography & palette discipline: gate 70 (single-typeface floor) + gate 71 (AI-palette / raw colour in gradients). ledger gets a real pairing and a solid headline; bloom tokenised.
+- **Cycle 3 (v1)** — Layout discipline: gate 72 (justified text) + gate 73 (skipped heading levels). fernweh de-centred and re-leveled; rote 3-equal grids varied; kiln spacing back on the 4px scale.
+- **Cycle 4 (v1)** — Interaction discipline: gate 74 (button hierarchy) + gate 75 (reflexive modal). Secondary register added to ledger/fernweh; fernweh emoji icons replaced with numbered walks; kiln modal swapped for an inline reserve section.
+- **Cycle 5 (v1)** — Cleanup + gate 76 (decorative chart) + gate 77 (feature amputation; detector media-extractor hardened). kiln side-tab/sparkline removed and gallery reflows; ledger motion scoped; stamps + overflow clip + focus rings across all three.
+- **Cycle 6 (v2)** — EVAL UPGRADE -> v2 (your evals will break): +6 detector rules (hero-float/gate54, dark-mode reflex, hero-metric, icon-tile-above-heading, mono-as-shorthand, everything-in-cards) + cross-fixture order parameter (macrostructure reuse) + 2 adversarial fixtures (pulse, vellum). Re-measure only, no fixes.
+- **Cycle 7 (v2)** — Resist the brief on pulse: gate 78 (dark-mode reflex) + gate 79 (glassmorphism). Flipped to a light single-accent palette; removed neon gradient/glass/gradient-text/side-tab; replaced fabricated 10x/99.99%/50k+ metrics with honest facts; scoped motion, focus rings, button hierarchy.
+- **Cycle 8 (v2)** — pulse structural cleanup: gate 80 (icon-tile-above-heading) + gate 81 (mono-as-shorthand) + gate 82 (everything-in-cards). Distinctive face, empty icon tiles dropped, vanity KPI strip de-metered into a baseline fact row, grids varied, hero bottom-weighted, overflow clipped.
+- **Cycle 9 (v2)** — Order parameter + reading comfort: gate 83 (long-form leading/measure) + gate 84 (cross-output macrostructure reuse). pulse re-keyed stat-led -> feature-stack; vellum stamped long-document with a Fraunces/Spectral pairing, 1.6 leading, 68ch measure, ragged-right. structure 0 -> 5.
+- **Cycle 10 (v2)** — Final consolidation: enforce gate 54 (hero bottom-weight) across the three originals; de-meter ledger's big-number stat strip into an inline fact row (clears the hero-metric tell v2 surfaced). All five fixtures detector-clean on v2.
diff --git a/evals/rubric.md b/evals/rubric.md
new file mode 100644
index 0000000..9598d20
--- /dev/null
+++ b/evals/rubric.md
@@ -0,0 +1,55 @@
+# Hallmark eval rubric
+
+Hallmark's one job: emit UI that looks **made, not generated**. This rubric
+scores that job against two external anchors and Hallmark's own craft axes.
+
+## Anchors
+
+1. **Impeccable's slop standard** — "37 patterns that mark an interface as
+   AI-generated" across 8 dimensions (impeccable.style/slop). The
+   deterministic, CLI-checkable subset is encoded in `detector.mjs`.
+2. **Hallmark's pre-emit self-critique** — six craft axes (philosophy,
+   hierarchy, execution, specificity, restraint, variety) plus honest copy.
+   These are taste calls, scored 1–5 by an LLM judge per fixture.
+
+## Dimensions (each 0–5)
+
+| # | Dimension | Source | Scored by | What a 5 looks like |
+|---|---|---|---|---|
+| 1 | **visual** | Impeccable | detector | No side-tab stripes, glass, or decorative sparklines |
+| 2 | **typography** | Impeccable | detector | Distinctive faces, ≤3 families, comfortable leading, mixed-case body |
+| 3 | **color** | Impeccable | detector | Tokenised palette, no gradient text / AI purple, real contrast |
+| 4 | **layout** | Impeccable | detector | Asymmetric, 4px scale, ≤75ch measure, contiguous headings |
+| 5 | **motion** | Impeccable | detector | Scoped transitions, no overshoot, reduced-motion guard |
+| 6 | **interaction** | Impeccable | detector | Button hierarchy, no emoji icons, specific copy |
+| 7 | **responsive** | Impeccable | detector | overflow-x clipped, minmax tracks, no feature amputation |
+| 8 | **general** | Impeccable + Hallmark | detector | focus-visible, stamp present, full state coverage |
+| 9 | **craft** | Hallmark | LLM judge | A clear position, instant hierarchy, in-spec execution, brief-specific, restrained, structurally varied, honest |
+
+**Fixture score** = mean of the nine dimensions × 20 (0–100).
+**Cycle score** = mean of fixture scores.
+
+## Judge sidecar (per fixture, 1–5)
+
+`philosophy`, `hierarchy`, `execution`, `specificity`, `restraint`,
+`variety`, `honesty`. A score < 3 on any axis means the fixture would have
+triggered a Hallmark revision pass before emit.
+
+## How a cycle works (hillclimbing)
+
+1. Run `node run.mjs --cycle N --eval vX` → detector + judge → cycle score.
+2. Read the failing rules. For each, ask **"why didn't the skill prevent
+   this?"** — the answer is a gap in `SKILL.md` / `references/`.
+3. Close the gap in the skill (new/strengthened gate or rule), then bring the
+   fixtures into line with the strengthened skill.
+4. Re-run. The score is the cycle's result. The skill — not just the
+   fixtures — is what improved.
+
+## Why the eval is versioned (v1 → v2)
+
+Per *"Your Evals Will Break and You Won't See It Coming"*: static evals are
+structurally reactive and silently miss new failure regimes. Once fixtures
+saturate v1, the detector is no longer measuring slop — it is measuring
+"slop v1 already knows about." v2 adds order-parameter-style meta checks and
+adversarial fixtures that probe the tells v1 cannot see, then we hillclimb
+again.
diff --git a/evals/run.mjs b/evals/run.mjs
new file mode 100644
index 0000000..129f78a
--- /dev/null
+++ b/evals/run.mjs
@@ -0,0 +1,119 @@
+// Hallmark eval runner.
+//
+// Combines the deterministic detector (8 Impeccable dimensions) with an
+// LLM-judge sidecar (Hallmark's 6 craft axes + honesty) for each fixture,
+// aggregates a cycle score, snapshots evals/results/cycle-NN.json, and
+// rebuilds evals/results/history.md.
+//
+// Usage: node run.mjs --cycle <N> --eval v1|v2 --label "what changed"
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { analyze } from './detector.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const RESULTS = path.join(HERE, 'results');
+const CRAFT_AXES = ['philosophy', 'hierarchy', 'execution', 'specificity', 'restraint', 'variety', 'honesty'];
+
+function arg(name, def) {
+  const i = process.argv.indexOf(`--${name}`);
+  return i >= 0 ? process.argv[i + 1] : def;
+}
+
+const cycle = +arg('cycle', '0');
+const evalVersion = arg('eval', 'v1');
+const label = arg('label', '');
+
+const config = JSON.parse(fs.readFileSync(path.join(HERE, 'config.json'), 'utf8'));
+const fixtures = config.evals[evalVersion].fixtures;
+
+// --- order parameter (blog: monitor cross-eval correlation, not just per-item)
+// Structural-fingerprint reuse: two fixtures sharing a macrostructure is the
+// "colour-swap of one template" failure that per-page checks cannot see.
+function macrostructureOf(file) {
+  const src = fs.readFileSync(path.join(HERE, file), 'utf8');
+  return (src.match(/macrostructure:\s*([a-z0-9-]+)/i) || [])[1] || 'unstamped';
+}
+const macros = fixtures.map((fx) => macrostructureOf(fx.file));
+const counts = macros.reduce((m, k) => ((m[k] = (m[k] || 0) + 1), m), {});
+const collisions = Object.values(counts).reduce((a, n) => a + (n - 1), 0);
+const unstamped = macros.filter((k) => k === 'unstamped').length;
+const structureScore = +Math.max(0, 5 - 2.5 * collisions - 2.5 * unstamped).toFixed(3);
+
+const perFixture = [];
+for (const fx of fixtures) {
+  const det = analyze(path.join(HERE, fx.file), evalVersion);
+  const judgePath = path.join(HERE, fx.judge);
+  const judge = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
+  const craftVals = CRAFT_AXES.map((a) => judge[a]);
+  const craft = +(craftVals.reduce((a, b) => a + b, 0) / craftVals.length).toFixed(3);
+
+  const dimScores = { ...Object.fromEntries(Object.entries(det.dims).map(([k, v]) => [k, v.score])), craft };
+  // the order parameter is a property of the whole eval set; v2 folds it in
+  if (evalVersion === 'v2') dimScores.structure = structureScore;
+  const overall5 = +(Object.values(dimScores).reduce((a, b) => a + b, 0) / Object.values(dimScores).length).toFixed(3);
+
+  perFixture.push({
+    name: fx.name,
+    file: fx.file,
+    macrostructure: macrostructureOf(fx.file),
+    detector: det,
+    judge,
+    dimScores,
+    score100: +(overall5 * 20).toFixed(1),
+  });
+}
+
+// aggregate dimensions across fixtures
+const allDims = [...new Set(perFixture.flatMap((f) => Object.keys(f.dimScores)))];
+const aggDims = {};
+for (const d of allDims) {
+  const vals = perFixture.map((f) => f.dimScores[d]).filter((v) => v != null);
+  aggDims[d] = +(vals.reduce((a, b) => a + b, 0) / vals.length).toFixed(3);
+}
+const cycleScore = +(perFixture.reduce((a, f) => a + f.score100, 0) / perFixture.length).toFixed(1);
+
+const snapshot = {
+  cycle, evalVersion, label,
+  ruleCount: perFixture[0]?.detector.ruleCount ?? 0,
+  fixtureCount: perFixture.length,
+  cycleScore,
+  aggDims,
+  fixtures: perFixture.map((f) => ({ name: f.name, score100: f.score100, dimScores: f.dimScores })),
+  timestamp: new Date().toISOString(),
+};
+
+const tag = `${String(cycle).padStart(2, '0')}-${evalVersion}`;
+fs.writeFileSync(path.join(RESULTS, `cycle-${tag}.json`), JSON.stringify(snapshot, null, 2));
+
+// rebuild history.md from every snapshot
+const snaps = fs.readdirSync(RESULTS)
+  .filter((f) => /^cycle-.*\.json$/.test(f))
+  .map((f) => JSON.parse(fs.readFileSync(path.join(RESULTS, f), 'utf8')))
+  .sort((a, b) => (a.cycle - b.cycle) || a.evalVersion.localeCompare(b.evalVersion));
+
+const dimOrder = ['visual', 'typography', 'color', 'layout', 'motion', 'interaction', 'responsive', 'general', 'craft', 'structure'];
+let md = '# Eval history — Hallmark anti-slop hillclimb\n\n';
+md += 'Score = mean of nine dimensions × 20 (0–100). Dimensions 1–8 are the\n';
+md += 'deterministic Impeccable detector; `craft` is the LLM-judge mean of\n';
+md += "Hallmark's six axes + honesty.\n\n";
+md += '| Cycle | Eval | Rules | Score | ' + dimOrder.map((d) => d.slice(0, 5)).join(' | ') + ' | Change |\n';
+md += '|---|---|---|---|' + dimOrder.map(() => '---').join('|') + '|---|\n';
+let prev = null;
+for (const s of snaps) {
+  const delta = prev == null ? '—' : (s.cycleScore - prev >= 0 ? `+${(s.cycleScore - prev).toFixed(1)}` : (s.cycleScore - prev).toFixed(1));
+  md += `| ${s.cycle} | ${s.evalVersion} | ${s.ruleCount} | **${s.cycleScore.toFixed(1)}** | `
+    + dimOrder.map((d) => (s.aggDims[d] != null ? s.aggDims[d].toFixed(2) : '—')).join(' | ')
+    + ` | ${delta} |\n`;
+  prev = s.cycleScore;
+}
+md += '\n## Notes per cycle\n\n';
+for (const s of snaps) md += `- **Cycle ${s.cycle} (${s.evalVersion})** — ${s.label || '—'}\n`;
+fs.writeFileSync(path.join(RESULTS, 'history.md'), md);
+
+// console summary
+console.log(`\nCycle ${cycle} (${evalVersion}) — ${label}`);
+console.log(`  rules: ${snapshot.ruleCount}   fixtures: ${snapshot.fixtureCount}   SCORE: ${cycleScore}/100`);
+for (const d of dimOrder) if (aggDims[d] != null) console.log(`    ${d.padEnd(12)} ${aggDims[d].toFixed(2)}/5`);
+for (const f of perFixture) console.log(`  · ${f.name.padEnd(16)} ${f.score100}/100`);
diff --git a/references/slop-test.md b/references/slop-test.md
index 4419a67..ad23b20 100644
--- a/references/slop-test.md
+++ b/references/slop-test.md
@@ -200,6 +200,144 @@ Universal. Every emitted page must render flawlessly at 320 px, 375 px, 414 px,
 
 The CSS stamp at Step 6 records mobile pass alongside contrast: `· mobile: pass (36, 59, 61–69)`.
 
+## Eval-hardened gates (70+)
+
+These gates were added by the anti-slop eval harness (`evals/`). Each one
+closes a gap the deterministic detector found that the gates above missed.
+They are checked the same way: every answer must be **no**.
+
+70. **Single typeface across the whole page.** Does the artifact resolve to
+    exactly **one** non-generic `font-family` family for every text element?
+    Gate 39 caps the *ceiling* at three families; this catches the *floor*.
+    One face everywhere is the flat-type tell — the page reads as a
+    word-processor default, not a designed system. The fix is a deliberate
+    pairing: a display register distinct from the body register (the 2+1 rule
+    in [`typography.md`](typography.md)). A monospace used only inside `<code>`
+    does not count as the second face.
+
+71. **AI palette in a gradient, or raw colour inside a gradient.** Does any
+    `linear/radial/conic-gradient(...)` (a) move through the AI signature —
+    violet/purple/indigo → blue/cyan, or any OKLCH hue in the 270–330 band —
+    or (b) contain a raw `#hex` / `oklch(...)` / `rgb(...)` literal instead of
+    a `var(--token)`? Both fail. (a) is the single most-recognised colour tell
+    (sharpens gate 2). (b) is mid-render token improvisation (gate 58) hiding
+    inside a gradient — the model picked the theme, then freestyled the
+    gradient stops. Fix: lift every stop to a named token, and choose a hue
+    family that belongs to the brief, not to the AI-default violet→cyan ramp.
+
+72. **Justified body text.** Is `text-align: justify` set on any prose
+    container (`p`, `li`, article body, lede)? Auto-fail. Browser
+    justification has no hyphenation dictionary by default, so it opens
+    "rivers" of white space down the column — an unmistakable
+    word-processor / AI-export tell. Set prose ragged-right (`text-align:
+    start`/`left`). Justification is admissible only on a true multi-column
+    print specimen with `hyphens: auto` *and* a narrow measure, which a web
+    page almost never is.
+
+73. **Skipped heading level.** Reading the document outline top to bottom,
+    does any heading jump more than one level (e.g. `h1` → `h3` with no `h2`,
+    or `h2` → `h4`)? Auto-fail. Heading levels are the document's
+    machine-readable hierarchy; skipping a level to get a smaller font is
+    styling leaking into structure, and it breaks screen-reader navigation.
+    Fix by using the correct level and styling it down with a class — the
+    visual size and the semantic level are independent decisions.
+
+74. **Every button is primary.** Does the page render three or more
+    button/CTA affordances that all share the same filled accent register,
+    with no secondary/quiet variant defined and used? Fail. When everything
+    shouts, nothing leads — and a wall of identical accent fills is a strong
+    AI tell (the model styled one button and copied it). There must be a
+    visible hierarchy: one primary action per view, the rest as a secondary
+    register (ghost, outline, or plain text link). The secondary variant must
+    actually be *applied*, not just defined.
+
+75. **Reflexive modal / dialog.** Did the build reach for a `<dialog>`,
+    `role="dialog"`, or a `.modal` overlay for content that is not a genuine
+    interruption (a short form, a "learn more", a confirmation the user can
+    already see the result of)? Fail. Modals are for true overlays —
+    destructive confirmations, focus-trapping flows. Reaching for one to hold
+    a paragraph or a single field is the AI default; it costs the user a
+    dismiss and breaks deep-linking. Prefer inline disclosure (`<details>`, an
+    expanding section, or just putting the content on the page).
+
+76. **Decorative sparkline / chart.** Does the page render a sparkline,
+    chart, graph, or data-viz shape (`<svg>` polyline, bars, an "analytics"
+    squiggle) that is *not driven by real data the brief supplied*? Fail.
+    A chart that charts nothing is pure decoration cosplaying as
+    information — one of the clearest "AI dashboard" tells. Either bind it to
+    real numbers the user gave you (and label its axes), or remove it and let
+    type and space carry the section.
+
+77. **Feature amputation on mobile.** In any `@media (max-width: …)` block,
+    is a *content* element (gallery, section, column, card, figure — anything
+    that is not nav/menu/drawer/overlay chrome) set to `display: none`? Fail.
+    Hiding content to "fix" mobile is amputation, not responsive design — the
+    phone user silently loses the thing. Reflow instead: collapse a grid to
+    fewer tracks, stack columns, switch a side-by-side to a carousel, or move
+    the element — but keep it on the page. `display: none` at a breakpoint is
+    only for chrome that has a mobile equivalent (a desktop nav replaced by a
+    sheet).
+
+78. **Defaulting to dark mode reflexively.** Is the page's base surface dark
+    (OKLCH lightness < 0.30, or `color-scheme: dark` with no light counterpart)
+    *without the brief or genre earning it*? Fail. Dark-by-default is the AI
+    dashboard reflex — and it's where the worst ink-on-ink and neon-glow slop
+    breeds. Dark is a deliberate choice for cinema, photography, a genuine
+    night-use tool, or an explicitly atmospheric/midnight theme. If the brief
+    just says "modern" or "developer", start light and considered. When the
+    brief literally says "dark by default", that is the brief's gravity, not a
+    design decision — resist it unless night use is the actual job.
+
+79. **Glassmorphism.** Does any panel combine `backdrop-filter: blur(...)`
+    with a translucent fill (`rgba(... , <1)` / `oklch(... / <1)`)? Fail.
+    Frosted-glass panels are a dated, dead-giveaway AI surface — they wreck
+    contrast (text floats over whatever scrolls behind) and signal "generated
+    in 2023". Use an opaque surface token with a real border and, if you need
+    depth, one restrained shadow. Translucency is for genuine overlays
+    (a sheet over dimmed content), not for every card.
+
+80. **Icon tile stacked above a heading.** Does a feature/value card lead
+    with a coloured rounded-square icon tile (or a decorative `<svg>`) sitting
+    directly on top of its heading? Fail. The icon-tile-over-title stack is
+    one of the most recognised AI card layouts — and the tiles are usually
+    empty gradient squares that say nothing. Lead with the heading. If an icon
+    truly helps, set it inline with the title (small, from one library, with
+    an accessible name), not as a decorative block above it.
+
+81. **Monospace as "technical" shorthand.** Is a monospace face applied to
+    three or more *non-code* contexts (labels, eyebrows, badges, captions,
+    nav, body) to make the page "feel technical"? Fail. Mono outside actual
+    code/keyboard input is costume — it reads as a developer-tool stereotype,
+    not a typographic decision. Keep mono for `<code>`/`<kbd>`/`<pre>` and at
+    most one deliberate register (a single label style). Everywhere else,
+    use the body or display face.
+
+82. **Everything wrapped in a card.** Is nearly every block on the page a
+    bordered, rounded, shadowed card (more than ~6 card/panel/tile wrappers)?
+    Fail. When everything is a card, nothing is grouped — the cards stop
+    meaning "these belong together" and become wallpaper. Let most content sit
+    directly on the page surface; reserve the card treatment for the few
+    groupings that genuinely need to be set apart, and vary their weight.
+
+83. **Long-form reading comfort.** On an essay / article / documentation
+    page, is body leading below **1.5**, or the measure outside **60–75ch**?
+    Fail. Gate 27 caps the measure for all genres; this tightens it for
+    reading-first pages, where a too-tight leading and a too-wide column are
+    what actually make long text exhausting. Body prose meant to be *read*
+    (not scanned) wants `line-height: 1.55–1.7` and a 62–68ch column. Display
+    and UI text are exempt — this gate is about the reading column.
+
+84. **Structural fingerprint reused across the project (order parameter).**
+    Across the set of pages this project has emitted (read the stamps / 
+    `.hallmark/log.json`), do two or more share the same `macrostructure`?
+    Fail. This is the cross-output check the per-page gates cannot make: two
+    pages can each pass every gate above and still be colour-swaps of one
+    skeleton. Variety is a property of the *set*, not the page. When a
+    macrostructure is already taken, pick a different one — and record it in
+    the stamp so the next run can see it. (This is the eval's "order
+    parameter": a single quantity over the whole corpus that signals the
+    template-collapse failure before it spreads.)
+
 ---
 
 If any answer is **yes**, fix it. Do not ship slop.

From df26a018f1939ed3ad2af8cf6a6ac66fd31a7fd8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 21 May 2026 16:51:16 +0000
Subject: [PATCH 2/3] Harden detector false positives; add real-site audit
 adapter

Audit the in-repo Hallmark corpus (homepage + examples) surfaced a real
false-positive rate. Fix the worst offenders so the signal is trustworthy:

- placeholder-names: only flag actual placeholder names (Jane Doe, Acme,
  lorem ipsum), not ordinary words like "seamless"/"unleash" in prose.
- ai-palette: require the violet->cyan *ramp*, not a single deliberate brand
  hue, so a midnight-violet brand is no longer flagged.
- font counting: count a monospace family toward the budget only when used
  outside code (per gate 39); stop counting unused --font-mono tokens.
- multi-theme scoping: resolve tokens from the active [data-theme] only, and
  label 22-theme / component-library stylesheets low-confidence instead of
  scoring them as one page.

evals/audit-site.mjs inlines a page's linked stylesheets so the detector can
score real shipped pages. Fixtures unchanged (all still 5.00/5 on v1 and v2);
true positives (Inter in hyperlane, gradient text in bananastudio, "Acme" in
tally) are retained while the false positives are removed.
---
 .gitignore           |  3 +++
 evals/audit-site.mjs | 64 ++++++++++++++++++++++++++++++++++++++++++++
 evals/detector.mjs   | 60 +++++++++++++++++++++++++++--------------
 3 files changed, 107 insertions(+), 20 deletions(-)
 create mode 100644 evals/audit-site.mjs

diff --git a/.gitignore b/.gitignore
index aa8fb7b..c9ff8a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,6 @@ skills-lock.json
 # Launch-day Hallmark vs Impeccable vs no-skills comparison — one-off artifact,
 # not referenced from the marketing site or README. Kept locally if useful.
 site/_launch-comparison/
+
+# Eval audit snapshots (throwaway)
+evals/.site-cache/
diff --git a/evals/audit-site.mjs b/evals/audit-site.mjs
new file mode 100644
index 0000000..430d1d2
--- /dev/null
+++ b/evals/audit-site.mjs
@@ -0,0 +1,64 @@
+// Audit real, in-repo Hallmark output with the detector.
+//
+// The detector reads inline CSS only; the shipped pages link external
+// stylesheets. This adapter inlines local <link rel="stylesheet"> files into
+// a self-contained snapshot, then scores it under both eval versions so we can
+// see what the current skill's gates (v2) catch that the initial skill's
+// gates (v1) did not — on artifacts the eval author did not write.
+//
+// Usage: node audit-site.mjs <page.html> [<page.html> ...]
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { analyze } from './detector.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const ROOT = path.resolve(HERE, '..');
+const CACHE = path.join(HERE, '.site-cache');
+fs.mkdirSync(CACHE, { recursive: true });
+
+function inlinePage(htmlPath) {
+  const abs = path.resolve(ROOT, htmlPath);
+  const dir = path.dirname(abs);
+  let html = fs.readFileSync(abs, 'utf8');
+  const links = [...html.matchAll(/<link\b[^>]*rel=["']stylesheet["'][^>]*>/gi)].map((m) => m[0]);
+  const blocks = [];
+  for (const link of links) {
+    const href = (link.match(/href=["']([^"']+)["']/i) || [])[1];
+    if (!href || /^https?:|^\/\//i.test(href)) continue; // skip remote (e.g. Google Fonts)
+    const cssPath = path.resolve(dir, href.split(/[?#]/)[0]);
+    if (fs.existsSync(cssPath)) blocks.push(`/* ${href} */\n${fs.readFileSync(cssPath, 'utf8')}`);
+  }
+  if (blocks.length) {
+    const styleTag = `\n<style data-inlined>\n${blocks.join('\n')}\n</style>\n`;
+    html = html.replace(/<\/head>/i, `${styleTag}</head>`);
+  }
+  const out = path.join(CACHE, htmlPath.replace(/[\/]/g, '__'));
+  fs.writeFileSync(out, html);
+  return out;
+}
+
+const pages = process.argv.slice(2).filter((a) => !a.startsWith('--'));
+const rows = [];
+for (const p of pages) {
+  let snap;
+  try { snap = inlinePage(p); } catch (e) { console.error(`skip ${p}: ${e.message}`); continue; }
+  const v1 = analyze(snap, 'v1');
+  const v2 = analyze(snap, 'v2');
+  const v2fails = Object.values(v2.dims).flatMap((d) => d.rules).filter((r) => !r.pass);
+  rows.push({ page: p, v1: v1.overall, v2: v2.overall, fails: v2fails, multiTheme: v2.multiTheme, themeCount: v2.themeCount });
+}
+
+const name = (p) => p.replace(/^site\//, '').replace(/\/index\.html$/, '/').replace(/index\.html$/, '');
+console.log('\nReal Hallmark corpus — detector audit (overall /5)\n');
+console.log(`${'page'.padEnd(34)} ${'v1'.padStart(6)} ${'v2'.padStart(6)}   v2 findings`);
+console.log('-'.repeat(72));
+for (const r of rows) {
+  const f = r.fails.length ? r.fails.map((x) => x.id.replace(/^v2-/, '')).join(', ') : '—';
+  const tag = r.multiTheme ? ` [multi-theme:${r.themeCount}, low-confidence]` : '';
+  console.log(`${name(r.page).padEnd(34)} ${r.v1.toFixed(2).padStart(6)} ${r.v2.toFixed(2).padStart(6)}   ${f}${tag}`);
+}
+const avg = (k) => (rows.reduce((a, r) => a + r[k], 0) / rows.length).toFixed(2);
+console.log('-'.repeat(72));
+console.log(`${'CORPUS MEAN'.padEnd(34)} ${avg('v1').padStart(6)} ${avg('v2').padStart(6)}`);
diff --git a/evals/detector.mjs b/evals/detector.mjs
index 14a6ca7..42f1baf 100644
--- a/evals/detector.mjs
+++ b/evals/detector.mjs
@@ -49,10 +49,15 @@ function cssRules(css) {
   return out;
 }
 
-function tokenMap(css) {
+function tokenMap(css, activeTheme) {
   const map = {};
   for (const r of cssRules(css)) {
-    if (!/:root|\[data-theme/.test(r.sel)) continue;
+    const isRoot = /:root/.test(r.sel);
+    const themeM = r.sel.match(/\[data-theme(?:[~^$|*]?=)?["']?([a-z0-9-]+)?["']?\]/i);
+    if (!isRoot && !themeM) continue;
+    // when the page declares an active theme, ignore other themes' token blocks
+    // so a 22-theme design-system stylesheet isn't scored as one page
+    if (themeM && themeM[1] && activeTheme && themeM[1].toLowerCase() !== activeTheme.toLowerCase()) continue;
     for (const m of r.body.matchAll(/(--[a-z0-9-]+)\s*:\s*([^;]+)/gi)) {
       map[m[1].trim()] = m[2].trim();
     }
@@ -87,17 +92,23 @@ function oklchH(value) {
 
 const COLOR_LITERAL = /#[0-9a-fA-F]{3,8}\b|\brgba?\([^)]*\)|\bhsla?\([^)]*\)|\boklch\([^)]*\)|\blab\([^)]*\)/gi;
 
-function fontFamilies(css, map) {
+// Count families that are actually *applied*. Per gate 39, a monospace face
+// counts toward the family budget only when used outside code contexts —
+// counting an unused --font-mono token, or mono inside <pre>/<code>, is the
+// false positive that lit up dev-tool pages.
+function fontFamilies(rules, map) {
   const fams = new Set();
-  for (const m of css.matchAll(/font-family\s*:\s*([^;}]+)/gi)) {
+  for (const r of rules) {
+    if (/:root|\[data-theme/.test(r.sel)) continue;
+    const m = r.body.match(/font-family\s*:\s*([^;}]+)/i);
+    if (!m) continue;
     const resolved = resolveVar(m[1], map);
     const first = resolved.split(',')[0].trim().replace(/['"]/g, '').toLowerCase();
-    if (first && !GENERIC_FAMILIES.has(first) && !first.startsWith('var(')) fams.add(first);
-  }
-  for (const [k, v] of Object.entries(map)) {
-    if (!/--font/.test(k)) continue;
-    const first = String(v).split(',')[0].trim().replace(/['"]/g, '').toLowerCase();
-    if (first && !GENERIC_FAMILIES.has(first) && !first.startsWith('var(')) fams.add(first);
+    if (!first || GENERIC_FAMILIES.has(first) || first.startsWith('var(')) continue;
+    const mono = /mono/.test(first) || /\bmonospace\b/.test(resolved.toLowerCase());
+    const codeSel = /\b(pre|code|kbd|samp)\b/.test(r.sel);
+    if (mono && codeSel) continue;
+    fams.add(first);
   }
   return [...fams];
 }
@@ -205,15 +216,19 @@ const RULES = [
     id: 'color-ai-palette', dim: 'color',
     label: 'AI purple/violet→cyan gradient',
     fn: ({ css }) => {
+      // The tell is the violet/purple -> cyan/blue *ramp*, not a single
+      // deliberate brand hue. Require both ends to be present in one gradient.
       const grads = [...css.matchAll(/(linear|radial|conic)-gradient\([^;}]*\)/gi)].map((m) => m[0]);
       for (const g of grads) {
-        const kw = /purple|violet|indigo|fuchsia|magenta|#8b5cf6|#6366f1|#7c3aed|#a855f7/i.test(g);
-        const cyan = /cyan|teal|#06b6d4|#22d3ee/i.test(g);
+        const violetKw = /purple|violet|indigo|fuchsia|magenta|#8b5cf6|#6366f1|#7c3aed|#a855f7|#b06cff/i.test(g);
+        const cyanKw = /\bcyan\b|\bteal\b|\baqua\b|#06b6d4|#22d3ee|#38d6ff/i.test(g);
         const hues = [...g.matchAll(/oklch\([^)]*\)/gi)].map((x) => oklchH(x[0])).filter((h) => h != null);
-        const aiHue = hues.some((h) => h >= 270 && h <= 330);
-        if ((kw && cyan) || kw || aiHue) return { pass: false, note: `tell in ${g.slice(0, 40)}…` };
+        const hasViolet = hues.some((h) => h >= 270 && h <= 330);
+        const hasCyanBlue = hues.some((h) => h >= 190 && h <= 265);
+        const ramp = (violetKw && cyanKw) || (hasViolet && hasCyanBlue) || (violetKw && hasCyanBlue) || (hasViolet && cyanKw);
+        if (ramp) return { pass: false, note: `violet→cyan ramp in ${g.slice(0, 40)}…` };
       }
-      return { pass: true, note: 'no AI-palette gradient' };
+      return { pass: true, note: 'no violet→cyan ramp' };
     },
   },
   {
@@ -439,8 +454,10 @@ const RULES = [
     id: 'interaction-placeholder-names', dim: 'interaction',
     label: 'Placeholder names / startup clichés (gate 20)',
     fn: ({ html }) => {
-      const bad = /jane doe|john smith|john doe|lorem ipsum|\bacme\b|\bnexus\b|seamless|unleash|\bwidget(?:co|inc)\b/i.test(html);
-      return { pass: !bad, note: bad ? 'placeholder/cliché copy' : 'specific copy' };
+      // Only flag actual placeholder *names* — not ordinary words ("seamless",
+      // "unleash") that legitimately appear in marketing prose.
+      const bad = /jane doe|john smith|john doe|lorem ipsum|\bacme\b|\bwidget(?:co|inc)\b|example\.com/i.test(html);
+      return { pass: !bad, note: bad ? 'placeholder/cliché name' : 'specific copy' };
     },
   },
   {
@@ -618,8 +635,11 @@ const EXTRA_V2 = [
 // ---------------------------------------------------------------- scoring
 function analyze(path, version = 'v1') {
   const doc = loadDoc(path);
-  const map = tokenMap(doc.css);
-  const ctx = { ...doc, map, rules: cssRules(doc.css), fams: fontFamilies(doc.css, map) };
+  const activeTheme = (doc.html.match(/<html[^>]*\bdata-theme=["']([^"']+)["']/i) || [])[1] || '';
+  const map = tokenMap(doc.css, activeTheme);
+  const rules = cssRules(doc.css);
+  const ctx = { ...doc, map, rules, fams: fontFamilies(rules, map) };
+  const themeCount = new Set([...doc.css.matchAll(/\[data-theme(?:[~^$|*]?=)?["']?([a-z0-9-]+)/gi)].map((m) => m[1]).filter(Boolean)).size;
   const ruleset = version === 'v2' ? RULES.concat(EXTRA_V2) : RULES;
 
   const byDim = {};
@@ -634,7 +654,7 @@ function analyze(path, version = 'v1') {
   }
   const dimScores = Object.values(dims).map((d) => d.score);
   const overall = +(dimScores.reduce((a, b) => a + b, 0) / dimScores.length).toFixed(3);
-  return { file: path, genre: doc.genre || 'n/a', dims, overall, ruleCount: ruleset.length };
+  return { file: path, genre: doc.genre || 'n/a', activeTheme, themeCount, multiTheme: themeCount > 3, dims, overall, ruleCount: ruleset.length };
 }
 
 // ---------------------------------------------------------------- cli

From c7ea09aa9e1224ca496f1ae2d8f15d623d7e9ae5 Mon Sep 17 00:00:00 2001
From: Ade Oshineye <adewale@gmail.com>
Date: Mon, 25 May 2026 13:26:27 +0100
Subject: [PATCH 3/3] Add non-mutating eval check command

---
 evals/README.md          |   8 ++-
 evals/audit-site.mjs     |  17 ++++-
 evals/check.mjs          |  80 ++++++++++++++++++++++
 evals/core.mjs           | 144 +++++++++++++++++++++++++++++++++++++++
 evals/results/history.md |   7 +-
 evals/rubric.md          |   5 +-
 evals/run.mjs            | 102 ++-------------------------
 7 files changed, 258 insertions(+), 105 deletions(-)
 create mode 100644 evals/check.mjs
 create mode 100644 evals/core.mjs

diff --git a/evals/README.md b/evals/README.md
index 729f137..f4d69fe 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -12,7 +12,7 @@ An eval-driven hillclimb that improved Hallmark against two external anchors:
 
 | File | Role |
 |---|---|
-| `rubric.md` | The scoring rubric: 8 detector dimensions + 1 craft (judge) dimension. |
+| `rubric.md` | The scoring rubric: 8 detector dimensions + craft; v2 also folds in the cross-fixture structure order parameter. |
 | `briefs.md` | The briefs each fixture is the skill exercised on. |
 | `detector.mjs` | Deterministic slop detector — the CLI-checkable subset of the 37 patterns + Hallmark gates. v1 = 37 rules, v2 = 43. |
 | `run.mjs` | Merges detector + judge sidecars, computes the cross-fixture **order parameter**, snapshots a cycle, rebuilds `results/history.md`. |
@@ -24,9 +24,11 @@ An eval-driven hillclimb that improved Hallmark against two external anchors:
 ## Run it
 
 ```bash
+node evals/check.mjs                            # non-mutating regression check
 cd evals
-node detector.mjs fixtures/pulse.html --eval v2   # inspect one page
-node run.mjs --cycle 10 --eval v2 --label "..."   # score a cycle, update history
+node check.mjs --eval v2 --min-score 98         # stricter local check, no writes
+node detector.mjs fixtures/pulse.html --eval v2 # inspect one page
+node run.mjs --cycle 10 --eval v2 --label "..." # score a cycle, update history
 ```
 
 ## The hillclimb (10 cycles)
diff --git a/evals/audit-site.mjs b/evals/audit-site.mjs
index 430d1d2..de3633a 100644
--- a/evals/audit-site.mjs
+++ b/evals/audit-site.mjs
@@ -18,6 +18,21 @@ const ROOT = path.resolve(HERE, '..');
 const CACHE = path.join(HERE, '.site-cache');
 fs.mkdirSync(CACHE, { recursive: true });
 
+function readCssWithImports(cssPath, seen = new Set()) {
+  const abs = path.resolve(cssPath);
+  if (seen.has(abs) || !fs.existsSync(abs)) return '';
+  seen.add(abs);
+
+  const dir = path.dirname(abs);
+  let css = fs.readFileSync(abs, 'utf8');
+  css = css.replace(/@import\s+(?:url\()?['"]([^'")]+)['"]\)?\s*;/gi, (full, href) => {
+    if (/^https?:|^\/\//i.test(href)) return full; // keep remote imports as-is
+    const importPath = path.resolve(dir, href.split(/[?#]/)[0]);
+    return `/* ${href} */\n${readCssWithImports(importPath, seen)}`;
+  });
+  return css;
+}
+
 function inlinePage(htmlPath) {
   const abs = path.resolve(ROOT, htmlPath);
   const dir = path.dirname(abs);
@@ -28,7 +43,7 @@ function inlinePage(htmlPath) {
     const href = (link.match(/href=["']([^"']+)["']/i) || [])[1];
     if (!href || /^https?:|^\/\//i.test(href)) continue; // skip remote (e.g. Google Fonts)
     const cssPath = path.resolve(dir, href.split(/[?#]/)[0]);
-    if (fs.existsSync(cssPath)) blocks.push(`/* ${href} */\n${fs.readFileSync(cssPath, 'utf8')}`);
+    if (fs.existsSync(cssPath)) blocks.push(`/* ${href} */\n${readCssWithImports(cssPath)}`);
   }
   if (blocks.length) {
     const styleTag = `\n<style data-inlined>\n${blocks.join('\n')}\n</style>\n`;
diff --git a/evals/check.mjs b/evals/check.mjs
new file mode 100644
index 0000000..68995e9
--- /dev/null
+++ b/evals/check.mjs
@@ -0,0 +1,80 @@
+// Non-mutating CI/local check for the Hallmark eval harness.
+//
+// Scores the configured fixtures through the same path as run.mjs, but never
+// writes result snapshots, history files, or audit caches. Fails if detector
+// rules regress, if the v2 structure order parameter regresses, or if the
+// aggregate score falls below the configured threshold.
+//
+// Usage:
+//   node evals/check.mjs                 # check every eval version, min score 95
+//   node evals/check.mjs --eval v2       # check one eval version
+//   node evals/check.mjs --min-score 98  # tighten the score floor
+//   node evals/check.mjs --json          # machine-readable summary
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { detectorFailures, evaluateCycle } from './core.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+
+function arg(name, def) {
+  const i = process.argv.indexOf(`--${name}`);
+  return i >= 0 ? process.argv[i + 1] : def;
+}
+
+function evalVersions() {
+  const requested = arg('eval', 'all');
+  if (requested !== 'all') return [requested];
+  const config = JSON.parse(fs.readFileSync(path.join(HERE, 'config.json'), 'utf8'));
+  return Object.keys(config.evals);
+}
+
+const minScore = Number(arg('min-score', '95'));
+const asJson = process.argv.includes('--json');
+const rows = [];
+let failed = false;
+
+for (const evalVersion of evalVersions()) {
+  const { snapshot, perFixture, structureScore } = evaluateCycle({ evalVersion, label: 'non-mutating check', timestamp: 'check' });
+  const failures = detectorFailures(perFixture);
+  const structureFailure = evalVersion === 'v2' && structureScore < 5;
+  const scoreFailure = snapshot.cycleScore < minScore;
+
+  rows.push({
+    evalVersion,
+    score: snapshot.cycleScore,
+    minScore,
+    ruleCount: snapshot.ruleCount,
+    fixtureCount: snapshot.fixtureCount,
+    structureScore: evalVersion === 'v2' ? structureScore : undefined,
+    detectorFailureCount: failures.length,
+    failures,
+    passed: failures.length === 0 && !structureFailure && !scoreFailure,
+  });
+
+  if (failures.length || structureFailure || scoreFailure) failed = true;
+}
+
+if (asJson) {
+  console.log(JSON.stringify({ passed: !failed, checks: rows }, null, 2));
+} else {
+  console.log('\nHallmark eval check (non-mutating)\n');
+  for (const row of rows) {
+    const structure = row.structureScore == null ? '' : `   structure ${row.structureScore.toFixed(2)}/5`;
+    const status = row.passed ? 'PASS' : 'FAIL';
+    console.log(`${status} ${row.evalVersion}: ${row.score.toFixed(1)}/100   rules ${row.ruleCount}   fixtures ${row.fixtureCount}${structure}`);
+
+    if (row.score < row.minScore) {
+      console.log(`  ✗ score below floor: ${row.score.toFixed(1)} < ${row.minScore}`);
+    }
+    if (row.structureScore != null && row.structureScore < 5) {
+      console.log(`  ✗ structure order parameter below 5/5: ${row.structureScore.toFixed(2)}`);
+    }
+    for (const f of row.failures) {
+      console.log(`  ✗ ${f.file} [${f.dim}] ${f.id} — ${f.note}`);
+    }
+  }
+}
+
+process.exitCode = failed ? 1 : 0;
diff --git a/evals/core.mjs b/evals/core.mjs
new file mode 100644
index 0000000..f3e7d2d
--- /dev/null
+++ b/evals/core.mjs
@@ -0,0 +1,144 @@
+// Shared scoring primitives for the Hallmark eval harness.
+//
+// `run.mjs` uses these helpers to write result snapshots. `check.mjs` uses the
+// same path to validate the current fixtures without mutating the working tree.
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { analyze } from './detector.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const RESULTS = path.join(HERE, 'results');
+const CRAFT_AXES = ['philosophy', 'hierarchy', 'execution', 'specificity', 'restraint', 'variety', 'honesty'];
+const DIM_ORDER = ['visual', 'typography', 'color', 'layout', 'motion', 'interaction', 'responsive', 'general', 'craft', 'structure'];
+
+function readJson(file) {
+  return JSON.parse(fs.readFileSync(file, 'utf8'));
+}
+
+function macrostructureOf(file) {
+  const src = fs.readFileSync(path.join(HERE, file), 'utf8');
+  return (src.match(/macrostructure:\s*([a-z0-9-]+)/i) || [])[1] || 'unstamped';
+}
+
+function structureScoreFor(fixtures) {
+  const macros = fixtures.map((fx) => macrostructureOf(fx.file));
+  const counts = macros.reduce((m, k) => ((m[k] = (m[k] || 0) + 1), m), {});
+  const collisions = Object.values(counts).reduce((a, n) => a + (n - 1), 0);
+  const unstamped = macros.filter((k) => k === 'unstamped').length;
+  return +Math.max(0, 5 - 2.5 * collisions - 2.5 * unstamped).toFixed(3);
+}
+
+function loadConfig() {
+  return readJson(path.join(HERE, 'config.json'));
+}
+
+function evaluateCycle({ cycle = 0, evalVersion = 'v1', label = '', timestamp = new Date().toISOString() } = {}) {
+  const config = loadConfig();
+  const evalConfig = config.evals[evalVersion];
+  if (!evalConfig) throw new Error(`Unknown eval version: ${evalVersion}`);
+
+  const fixtures = evalConfig.fixtures;
+  const structureScore = structureScoreFor(fixtures);
+  const perFixture = [];
+
+  for (const fx of fixtures) {
+    const det = analyze(path.join(HERE, fx.file), evalVersion);
+    const judge = readJson(path.join(HERE, fx.judge));
+    const craftVals = CRAFT_AXES.map((a) => judge[a]);
+    const craft = +(craftVals.reduce((a, b) => a + b, 0) / craftVals.length).toFixed(3);
+
+    const dimScores = { ...Object.fromEntries(Object.entries(det.dims).map(([k, v]) => [k, v.score])), craft };
+    // The order parameter is a property of the whole eval set; v2 folds it in.
+    if (evalVersion === 'v2') dimScores.structure = structureScore;
+    const overall5 = +(Object.values(dimScores).reduce((a, b) => a + b, 0) / Object.values(dimScores).length).toFixed(3);
+
+    perFixture.push({
+      name: fx.name,
+      file: fx.file,
+      macrostructure: macrostructureOf(fx.file),
+      detector: det,
+      judge,
+      dimScores,
+      score100: +(overall5 * 20).toFixed(1),
+    });
+  }
+
+  const allDims = [...new Set(perFixture.flatMap((f) => Object.keys(f.dimScores)))];
+  const aggDims = {};
+  for (const d of allDims) {
+    const vals = perFixture.map((f) => f.dimScores[d]).filter((v) => v != null);
+    aggDims[d] = +(vals.reduce((a, b) => a + b, 0) / vals.length).toFixed(3);
+  }
+  const cycleScore = +(perFixture.reduce((a, f) => a + f.score100, 0) / perFixture.length).toFixed(1);
+
+  const snapshot = {
+    cycle, evalVersion, label,
+    ruleCount: perFixture[0]?.detector.ruleCount ?? 0,
+    fixtureCount: perFixture.length,
+    cycleScore,
+    aggDims,
+    fixtures: perFixture.map((f) => ({ name: f.name, score100: f.score100, dimScores: f.dimScores })),
+    timestamp,
+  };
+
+  return { snapshot, perFixture, structureScore };
+}
+
+function writeSnapshot(snapshot) {
+  fs.mkdirSync(RESULTS, { recursive: true });
+  const tag = `${String(snapshot.cycle).padStart(2, '0')}-${snapshot.evalVersion}`;
+  fs.writeFileSync(path.join(RESULTS, `cycle-${tag}.json`), JSON.stringify(snapshot, null, 2));
+}
+
+function readSnapshots() {
+  if (!fs.existsSync(RESULTS)) return [];
+  return fs.readdirSync(RESULTS)
+    .filter((f) => /^cycle-.*\.json$/.test(f))
+    .map((f) => readJson(path.join(RESULTS, f)))
+    .sort((a, b) => (a.cycle - b.cycle) || a.evalVersion.localeCompare(b.evalVersion));
+}
+
+function buildHistoryMarkdown(snaps) {
+  let md = '# Eval history — Hallmark anti-slop hillclimb\n\n';
+  md += 'Score = mean of detector dimensions plus craft, × 20 (0–100). v2 also\n';
+  md += 'folds in the cross-fixture `structure` order parameter. Dimensions 1–8\n';
+  md += 'are the deterministic Impeccable detector; `craft` is the LLM-judge mean\n';
+  md += "of Hallmark's six axes + honesty.\n\n";
+  md += '| Cycle | Eval | Rules | Score | ' + DIM_ORDER.map((d) => d.slice(0, 5)).join(' | ') + ' | Change |\n';
+  md += '|---|---|---|---|' + DIM_ORDER.map(() => '---').join('|') + '|---|\n';
+  let prev = null;
+  for (const s of snaps) {
+    const delta = prev == null ? '—' : (s.cycleScore - prev >= 0 ? `+${(s.cycleScore - prev).toFixed(1)}` : (s.cycleScore - prev).toFixed(1));
+    md += `| ${s.cycle} | ${s.evalVersion} | ${s.ruleCount} | **${s.cycleScore.toFixed(1)}** | `
+      + DIM_ORDER.map((d) => (s.aggDims[d] != null ? s.aggDims[d].toFixed(2) : '—')).join(' | ')
+      + ` | ${delta} |\n`;
+    prev = s.cycleScore;
+  }
+  md += '\n## Notes per cycle\n\n';
+  for (const s of snaps) md += `- **Cycle ${s.cycle} (${s.evalVersion})** — ${s.label || '—'}\n`;
+  return md;
+}
+
+function rebuildHistory() {
+  fs.mkdirSync(RESULTS, { recursive: true });
+  fs.writeFileSync(path.join(RESULTS, 'history.md'), buildHistoryMarkdown(readSnapshots()));
+}
+
+function detectorFailures(perFixture) {
+  return perFixture.flatMap((fixture) => Object.entries(fixture.detector.dims).flatMap(([dim, d]) => (
+    d.rules.filter((r) => !r.pass).map((rule) => ({ fixture: fixture.name, file: fixture.file, dim, ...rule }))
+  )));
+}
+
+export {
+  CRAFT_AXES,
+  DIM_ORDER,
+  evaluateCycle,
+  writeSnapshot,
+  readSnapshots,
+  buildHistoryMarkdown,
+  rebuildHistory,
+  detectorFailures,
+};
diff --git a/evals/results/history.md b/evals/results/history.md
index 8d8131a..0e81d5b 100644
--- a/evals/results/history.md
+++ b/evals/results/history.md
@@ -1,8 +1,9 @@
 # Eval history — Hallmark anti-slop hillclimb
 
-Score = mean of nine dimensions × 20 (0–100). Dimensions 1–8 are the
-deterministic Impeccable detector; `craft` is the LLM-judge mean of
-Hallmark's six axes + honesty.
+Score = mean of detector dimensions plus craft, × 20 (0–100). v2 also
+folds in the cross-fixture `structure` order parameter. Dimensions 1–8
+are the deterministic Impeccable detector; `craft` is the LLM-judge mean
+of Hallmark's six axes + honesty.
 
 | Cycle | Eval | Rules | Score | visua | typog | color | layou | motio | inter | respo | gener | craft | struc | Change |
 |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diff --git a/evals/rubric.md b/evals/rubric.md
index 9598d20..dae8204 100644
--- a/evals/rubric.md
+++ b/evals/rubric.md
@@ -26,8 +26,9 @@ scores that job against two external anchors and Hallmark's own craft axes.
 | 8 | **general** | Impeccable + Hallmark | detector | focus-visible, stamp present, full state coverage |
 | 9 | **craft** | Hallmark | LLM judge | A clear position, instant hierarchy, in-spec execution, brief-specific, restrained, structurally varied, honest |
 
-**Fixture score** = mean of the nine dimensions × 20 (0–100).
-**Cycle score** = mean of fixture scores.
+**Fixture score** = mean of the detector dimensions plus craft × 20 (0–100).
+In v2, the cross-fixture `structure` order parameter is also folded into each
+fixture score. **Cycle score** = mean of fixture scores.
 
 ## Judge sidecar (per fixture, 1–5)
 
diff --git a/evals/run.mjs b/evals/run.mjs
index 129f78a..f2456d8 100644
--- a/evals/run.mjs
+++ b/evals/run.mjs
@@ -7,14 +7,7 @@
 //
 // Usage: node run.mjs --cycle <N> --eval v1|v2 --label "what changed"
 
-import fs from 'node:fs';
-import path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { analyze } from './detector.mjs';
-
-const HERE = path.dirname(fileURLToPath(import.meta.url));
-const RESULTS = path.join(HERE, 'results');
-const CRAFT_AXES = ['philosophy', 'hierarchy', 'execution', 'specificity', 'restraint', 'variety', 'honesty'];
+import { DIM_ORDER, evaluateCycle, rebuildHistory, writeSnapshot } from './core.mjs';
 
 function arg(name, def) {
   const i = process.argv.indexOf(`--${name}`);
@@ -25,95 +18,12 @@ const cycle = +arg('cycle', '0');
 const evalVersion = arg('eval', 'v1');
 const label = arg('label', '');
 
-const config = JSON.parse(fs.readFileSync(path.join(HERE, 'config.json'), 'utf8'));
-const fixtures = config.evals[evalVersion].fixtures;
-
-// --- order parameter (blog: monitor cross-eval correlation, not just per-item)
-// Structural-fingerprint reuse: two fixtures sharing a macrostructure is the
-// "colour-swap of one template" failure that per-page checks cannot see.
-function macrostructureOf(file) {
-  const src = fs.readFileSync(path.join(HERE, file), 'utf8');
-  return (src.match(/macrostructure:\s*([a-z0-9-]+)/i) || [])[1] || 'unstamped';
-}
-const macros = fixtures.map((fx) => macrostructureOf(fx.file));
-const counts = macros.reduce((m, k) => ((m[k] = (m[k] || 0) + 1), m), {});
-const collisions = Object.values(counts).reduce((a, n) => a + (n - 1), 0);
-const unstamped = macros.filter((k) => k === 'unstamped').length;
-const structureScore = +Math.max(0, 5 - 2.5 * collisions - 2.5 * unstamped).toFixed(3);
-
-const perFixture = [];
-for (const fx of fixtures) {
-  const det = analyze(path.join(HERE, fx.file), evalVersion);
-  const judgePath = path.join(HERE, fx.judge);
-  const judge = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
-  const craftVals = CRAFT_AXES.map((a) => judge[a]);
-  const craft = +(craftVals.reduce((a, b) => a + b, 0) / craftVals.length).toFixed(3);
-
-  const dimScores = { ...Object.fromEntries(Object.entries(det.dims).map(([k, v]) => [k, v.score])), craft };
-  // the order parameter is a property of the whole eval set; v2 folds it in
-  if (evalVersion === 'v2') dimScores.structure = structureScore;
-  const overall5 = +(Object.values(dimScores).reduce((a, b) => a + b, 0) / Object.values(dimScores).length).toFixed(3);
-
-  perFixture.push({
-    name: fx.name,
-    file: fx.file,
-    macrostructure: macrostructureOf(fx.file),
-    detector: det,
-    judge,
-    dimScores,
-    score100: +(overall5 * 20).toFixed(1),
-  });
-}
-
-// aggregate dimensions across fixtures
-const allDims = [...new Set(perFixture.flatMap((f) => Object.keys(f.dimScores)))];
-const aggDims = {};
-for (const d of allDims) {
-  const vals = perFixture.map((f) => f.dimScores[d]).filter((v) => v != null);
-  aggDims[d] = +(vals.reduce((a, b) => a + b, 0) / vals.length).toFixed(3);
-}
-const cycleScore = +(perFixture.reduce((a, f) => a + f.score100, 0) / perFixture.length).toFixed(1);
-
-const snapshot = {
-  cycle, evalVersion, label,
-  ruleCount: perFixture[0]?.detector.ruleCount ?? 0,
-  fixtureCount: perFixture.length,
-  cycleScore,
-  aggDims,
-  fixtures: perFixture.map((f) => ({ name: f.name, score100: f.score100, dimScores: f.dimScores })),
-  timestamp: new Date().toISOString(),
-};
-
-const tag = `${String(cycle).padStart(2, '0')}-${evalVersion}`;
-fs.writeFileSync(path.join(RESULTS, `cycle-${tag}.json`), JSON.stringify(snapshot, null, 2));
-
-// rebuild history.md from every snapshot
-const snaps = fs.readdirSync(RESULTS)
-  .filter((f) => /^cycle-.*\.json$/.test(f))
-  .map((f) => JSON.parse(fs.readFileSync(path.join(RESULTS, f), 'utf8')))
-  .sort((a, b) => (a.cycle - b.cycle) || a.evalVersion.localeCompare(b.evalVersion));
-
-const dimOrder = ['visual', 'typography', 'color', 'layout', 'motion', 'interaction', 'responsive', 'general', 'craft', 'structure'];
-let md = '# Eval history — Hallmark anti-slop hillclimb\n\n';
-md += 'Score = mean of nine dimensions × 20 (0–100). Dimensions 1–8 are the\n';
-md += 'deterministic Impeccable detector; `craft` is the LLM-judge mean of\n';
-md += "Hallmark's six axes + honesty.\n\n";
-md += '| Cycle | Eval | Rules | Score | ' + dimOrder.map((d) => d.slice(0, 5)).join(' | ') + ' | Change |\n';
-md += '|---|---|---|---|' + dimOrder.map(() => '---').join('|') + '|---|\n';
-let prev = null;
-for (const s of snaps) {
-  const delta = prev == null ? '—' : (s.cycleScore - prev >= 0 ? `+${(s.cycleScore - prev).toFixed(1)}` : (s.cycleScore - prev).toFixed(1));
-  md += `| ${s.cycle} | ${s.evalVersion} | ${s.ruleCount} | **${s.cycleScore.toFixed(1)}** | `
-    + dimOrder.map((d) => (s.aggDims[d] != null ? s.aggDims[d].toFixed(2) : '—')).join(' | ')
-    + ` | ${delta} |\n`;
-  prev = s.cycleScore;
-}
-md += '\n## Notes per cycle\n\n';
-for (const s of snaps) md += `- **Cycle ${s.cycle} (${s.evalVersion})** — ${s.label || '—'}\n`;
-fs.writeFileSync(path.join(RESULTS, 'history.md'), md);
+const { snapshot, perFixture } = evaluateCycle({ cycle, evalVersion, label });
+writeSnapshot(snapshot);
+rebuildHistory();
 
 // console summary
 console.log(`\nCycle ${cycle} (${evalVersion}) — ${label}`);
-console.log(`  rules: ${snapshot.ruleCount}   fixtures: ${snapshot.fixtureCount}   SCORE: ${cycleScore}/100`);
-for (const d of dimOrder) if (aggDims[d] != null) console.log(`    ${d.padEnd(12)} ${aggDims[d].toFixed(2)}/5`);
+console.log(`  rules: ${snapshot.ruleCount}   fixtures: ${snapshot.fixtureCount}   SCORE: ${snapshot.cycleScore}/100`);
+for (const d of DIM_ORDER) if (snapshot.aggDims[d] != null) console.log(`    ${d.padEnd(12)} ${snapshot.aggDims[d].toFixed(2)}/5`);
 for (const f of perFixture) console.log(`  · ${f.name.padEnd(16)} ${f.score100}/100`);