Nutlope · adewale · May 21, 2026 · May 21, 2026 · May 25, 2026 · May 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,6 @@ skills-lock.json
 # Launch-day Hallmark vs Impeccable vs no-skills comparison — one-off artifact,
 # not referenced from the marketing site or README. Kept locally if useful.
 site/_launch-comparison/
+
+# Eval audit snapshots (throwaway)
+evals/.site-cache/
diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,51 @@
+# `evals/` — anti-slop eval harness
+
+An eval-driven hillclimb that improved Hallmark against two external anchors:
+
+- **Impeccable's slop standard** — "37 patterns that mark an interface as
+  AI-generated" across 8 dimensions ([impeccable.style/slop](https://impeccable.style/slop)).
+- **"Your Evals Will Break and You Won't See It Coming"** — why static evals
+  silently miss new failure regimes, and the case for self-evolving evals
+  ([wanglun1996.github.io](https://wanglun1996.github.io/blog/your-evals-will-break.html)).
+
+## What's here
+
+| File | Role |
+|---|---|
+| `rubric.md` | The scoring rubric: 8 detector dimensions + craft; v2 also folds in the cross-fixture structure order parameter. |
+| `briefs.md` | The briefs: each fixture is the skill exercised on one of them. |
+| `detector.mjs` | Deterministic slop detector — the CLI-checkable subset of the 37 patterns + Hallmark gates. v1 = 37 rules, v2 = 43. |
+| `run.mjs` | Merges detector + judge sidecars, computes the cross-fixture **order parameter**, snapshots a cycle, rebuilds `results/history.md`. |
+| `config.json` | Which fixtures belong to eval v1 vs v2. |
+| `fixtures/*.html` | Self-contained pages (what Hallmark emits). |
+| `fixtures/*.judge.json` | Per-fixture craft scores (philosophy, hierarchy, execution, specificity, restraint, variety, honesty). |
+| `results/` | One JSON snapshot per cycle + the running `history.md` table. |
+
+## Run it
+
+```bash
+node evals/check.mjs                            # non-mutating regression check
+cd evals
+node check.mjs --eval v2 --min-score 98         # stricter local check, no writes
+node detector.mjs fixtures/pulse.html --eval v2 # inspect one page
+node run.mjs --cycle 10 --eval v2 --label "..." # score a cycle, update history
+```
+
+## The hillclimb (10 cycles)
+
+**Phase 1 (v1, cycles 1–5)** drove the three originals from 74.2 → 98.3 by
+closing gaps the detector found — each cycle added a real gate to
+`references/slop-test.md` (gates **70–77**) and brought the fixtures into line.
+
+**The break (cycle 6)** upgraded the eval to **v2**: six new detector rules
+for failure modes v1 was blind to (notably hero-float / gate 54, which the
+v1-perfect fixtures had been violating the whole time), a cross-fixture
+**order parameter** (macrostructure reuse — variety is a property of the
+*set*, not the page), and two adversarial fixtures (`pulse`, `vellum`). Score
+fell 98.3 → 76.4, exactly as the blog predicts.
+
+**Phase 2 (v2, cycles 7–10)** climbed back to 98.7, adding gates **78–84**
+and resisting `pulse`'s dark/neon/metric-hero brief gravity.
+
+The skill is the artifact that improved: 15 new gates, motivated by what the
+eval could measure. See `results/history.md` for the full score table.
diff --git a/evals/audit-site.mjs b/evals/audit-site.mjs
@@ -0,0 +1,79 @@
+// Audit real, in-repo Hallmark output with the detector.
+//
+// The detector reads inline CSS only; the shipped pages link external
+// stylesheets. This adapter inlines local <link rel="stylesheet"> files into
+// a self-contained snapshot, then scores it under both eval versions so we can
+// see what the current skill's gates (v2) catch that the initial skill's
+// gates (v1) did not — on artifacts the eval author did not write.
+//
+// Usage: node audit-site.mjs <page.html> [<page.html> ...]   (page paths are resolved against the repo root)
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { analyze } from './detector.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const ROOT = path.resolve(HERE, '..');
+const CACHE = path.join(HERE, '.site-cache');
+fs.mkdirSync(CACHE, { recursive: true });
+
+function readCssWithImports(cssPath, seen = new Set()) {
+  const abs = path.resolve(cssPath);
+  if (seen.has(abs) || !fs.existsSync(abs)) return '';
+  seen.add(abs);
+
+  const dir = path.dirname(abs);
+  let css = fs.readFileSync(abs, 'utf8');
+  css = css.replace(/@import\s+(?:url\()?['"]([^'")]+)['"]\)?\s*;/gi, (full, href) => {
+    if (/^https?:|^\/\//i.test(href)) return full; // keep remote imports as-is
+    const importPath = path.resolve(dir, href.split(/[?#]/)[0]);
+    return `/* ${href} */\n${readCssWithImports(importPath, seen)}`;
+  });
+  return css;
+}
+
+function inlinePage(htmlPath) {
+  const abs = path.resolve(ROOT, htmlPath);
+  const dir = path.dirname(abs);
+  let html = fs.readFileSync(abs, 'utf8');
+  const links = [...html.matchAll(/<link\b[^>]*rel=["']stylesheet["'][^>]*>/gi)].map((m) => m[0]);
+  const blocks = [];
+  for (const link of links) {
+    const href = (link.match(/href=["']([^"']+)["']/i) || [])[1];
+    if (!href || /^https?:|^\/\//i.test(href)) continue; // skip remote (e.g. Google Fonts)
+    const cssPath = path.resolve(dir, href.split(/[?#]/)[0]);
+    if (fs.existsSync(cssPath)) blocks.push(`/* ${href} */\n${readCssWithImports(cssPath)}`);
+  }
+  if (blocks.length) {
+    const styleTag = `\n<style data-inlined>\n${blocks.join('\n')}\n</style>\n`;
+    html = html.replace(/<\/head>/i, `${styleTag}</head>`);
+  }
+  const out = path.join(CACHE, htmlPath.replace(/[\/]/g, '__'));
+  fs.writeFileSync(out, html);
+  return out;
+}
+
+const pages = process.argv.slice(2).filter((a) => !a.startsWith('--'));
+const rows = [];
+for (const p of pages) {
+  let snap;
+  try { snap = inlinePage(p); } catch (e) { console.error(`skip ${p}: ${e.message}`); continue; }
+  const v1 = analyze(snap, 'v1');
+  const v2 = analyze(snap, 'v2');
+  const v2fails = Object.values(v2.dims).flatMap((d) => d.rules).filter((r) => !r.pass);
+  rows.push({ page: p, v1: v1.overall, v2: v2.overall, fails: v2fails, multiTheme: v2.multiTheme, themeCount: v2.themeCount });
+}
+
+const name = (p) => p.replace(/^site\//, '').replace(/\/index\.html$/, '/').replace(/index\.html$/, '');
+console.log('\nReal Hallmark corpus — detector audit (overall /5)\n');
+console.log(`${'page'.padEnd(34)} ${'v1'.padStart(6)} ${'v2'.padStart(6)}   v2 findings`);
+console.log('-'.repeat(72));
+for (const r of rows) {
+  const f = r.fails.length ? r.fails.map((x) => x.id.replace(/^v2-/, '')).join(', ') : '—';
+  const tag = r.multiTheme ? ` [multi-theme:${r.themeCount}, low-confidence]` : '';
+  console.log(`${name(r.page).padEnd(34)} ${r.v1.toFixed(2).padStart(6)} ${r.v2.toFixed(2).padStart(6)}   ${f}${tag}`);
+}
+const avg = (k) => (rows.reduce((a, r) => a + r[k], 0) / rows.length).toFixed(2);
+console.log('-'.repeat(72));
+console.log(`${'CORPUS MEAN'.padEnd(34)} ${avg('v1').padStart(6)} ${avg('v2').padStart(6)}`);
diff --git a/evals/briefs.md b/evals/briefs.md
@@ -0,0 +1,26 @@
+# Eval briefs
+
+Each fixture is the skill exercised on one brief. Briefs span genres so the
+detector isn't fooled by a single safe house style. Fixtures live in
+`fixtures/` as self-contained HTML (exactly what Hallmark emits).
+
+## v1 briefs
+
+- **ledger** — landing page for *Ledger*, an open-source double-entry
+  bookkeeping CLI for indie developers. Genre: modern-minimal.
+  Macrostructure target: stat-led / workbench (no rote hero→3-features→CTA).
+- **fernweh** — homepage for *Fernweh*, a small-group slow-travel company
+  running 8-day walking trips. Genre: atmospheric / editorial.
+  Macrostructure target: photographic or narrative-workflow.
+- **kiln** — studio page for *Kiln & Co.*, a two-person ceramics workshop
+  selling a seasonal run of stoneware. Genre: editorial / specimen-adjacent
+  but must NOT default to Specimen.
+
+## v2 briefs (added when v1 saturates)
+
+- **pulse** (the "synthwave-trap" brief) — adversarial: a brief for *Pulse*, a "developer
+  analytics dashboard," whose own copy nudges toward dark-mode + neon +
+  metric-hero slop. The skill must resist the brief's gravity.
+- **vellum** — a long-form essay page for *Vellum*, a writing tool. Probes
+  reading-comfort tells v1 underweights (measure rhythm, widows, heading
+  cadence, real prose hierarchy).
diff --git a/evals/check.mjs b/evals/check.mjs
@@ -0,0 +1,80 @@
+// Non-mutating CI/local check for the Hallmark eval harness.
+//
+// Scores the configured fixtures through the same path as run.mjs, but never
+// writes result snapshots, history files, or audit caches. Fails if detector
+// rules regress, if the v2 structure order parameter regresses, or if the
+// aggregate score falls below the configured threshold.
+//
+// Usage:
+//   node evals/check.mjs                 # check every eval version, min score 95
+//   node evals/check.mjs --eval v2       # check one eval version
+//   node evals/check.mjs --min-score 98  # tighten the score floor
+//   node evals/check.mjs --json          # machine-readable summary
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { detectorFailures, evaluateCycle } from './core.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+
+function arg(name, def) {
+  const i = process.argv.indexOf(`--${name}`);
+  return i >= 0 ? process.argv[i + 1] : def;
+}
+
+function evalVersions() {
+  const requested = arg('eval', 'all');
+  if (requested !== 'all') return [requested];
+  const config = JSON.parse(fs.readFileSync(path.join(HERE, 'config.json'), 'utf8'));
+  return Object.keys(config.evals);
+}
+
+const minScore = Number(arg('min-score', '95'));
+const asJson = process.argv.includes('--json');
+const rows = [];
+let failed = false;
+
+for (const evalVersion of evalVersions()) {
+  const { snapshot, perFixture, structureScore } = evaluateCycle({ evalVersion, label: 'non-mutating check', timestamp: 'check' });
+  const failures = detectorFailures(perFixture);
+  const structureFailure = evalVersion === 'v2' && structureScore < 5;
+  const scoreFailure = snapshot.cycleScore < minScore;
+
+  rows.push({
+    evalVersion,
+    score: snapshot.cycleScore,
+    minScore,
+    ruleCount: snapshot.ruleCount,
+    fixtureCount: snapshot.fixtureCount,
+    structureScore: evalVersion === 'v2' ? structureScore : undefined,
+    detectorFailureCount: failures.length,
+    failures,
+    passed: failures.length === 0 && !structureFailure && !scoreFailure,
+  });
+
+  if (failures.length || structureFailure || scoreFailure) failed = true;
+}
+
+if (asJson) {
+  console.log(JSON.stringify({ passed: !failed, checks: rows }, null, 2));
+} else {
+  console.log('\nHallmark eval check (non-mutating)\n');
+  for (const row of rows) {
+    const structure = row.structureScore == null ? '' : `   structure ${row.structureScore.toFixed(2)}/5`;
+    const status = row.passed ? 'PASS' : 'FAIL';
+    console.log(`${status} ${row.evalVersion}: ${row.score.toFixed(1)}/100   rules ${row.ruleCount}   fixtures ${row.fixtureCount}${structure}`);
+
+    if (row.score < row.minScore) {
+      console.log(`  ✗ score below floor: ${row.score.toFixed(1)} < ${row.minScore}`);
+    }
+    if (row.structureScore != null && row.structureScore < 5) {
+      console.log(`  ✗ structure order parameter below 5/5: ${row.structureScore.toFixed(2)}`);
+    }
+    for (const f of row.failures) {
+      console.log(`  ✗ ${f.file} [${f.dim}] ${f.id} — ${f.note}`);
+    }
+  }
+}
+
+process.exitCode = failed ? 1 : 0;
diff --git a/evals/config.json b/evals/config.json
@@ -0,0 +1,20 @@
+{
+  "evals": {
+    "v1": {
+      "fixtures": [
+        { "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
+        { "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
+        { "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" }
+      ]
+    },
+    "v2": {
+      "fixtures": [
+        { "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
+        { "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
+        { "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" },
+        { "name": "pulse", "file": "fixtures/pulse.html", "judge": "fixtures/pulse.judge.json" },
+        { "name": "vellum", "file": "fixtures/vellum.html", "judge": "fixtures/vellum.judge.json" }
+      ]
+    }
+  }
+}