Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ skills-lock.json
# Launch-day Hallmark vs Impeccable vs no-skills comparison — one-off artifact,
# not referenced from the marketing site or README. Kept locally if useful.
site/_launch-comparison/

# Eval audit snapshots (throwaway)
evals/.site-cache/
51 changes: 51 additions & 0 deletions evals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# `evals/` — anti-slop eval harness

An eval-driven hillclimb that improved Hallmark against two external anchors:

- **Impeccable's slop standard** — "37 patterns that mark an interface as
AI-generated" across 8 dimensions ([impeccable.style/slop](https://impeccable.style/slop)).
- **"Your Evals Will Break and You Won't See It Coming"** — why static evals
silently miss new failure regimes, and the case for self-evolving evals
([wanglun1996.github.io](https://wanglun1996.github.io/blog/your-evals-will-break.html)).

## What's here

| File | Role |
|---|---|
| `rubric.md` | The scoring rubric: 8 detector dimensions + craft; v2 also folds in the cross-fixture structure order parameter. |
| `briefs.md` | The briefs; each fixture is the skill exercised on one of them. |
| `detector.mjs` | Deterministic slop detector — the CLI-checkable subset of the 37 patterns + Hallmark gates. v1 = 37 rules, v2 = 43. |
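| `run.mjs` | Merges detector + judge sidecars, computes the cross-fixture **order parameter**, snapshots a cycle, rebuilds `results/history.md`. |
| `check.mjs` | Non-mutating regression check: scores the configured fixtures like `run.mjs` but writes nothing; exits non-zero on a regression. |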
| `config.json` | Which fixtures belong to eval v1 vs v2. |
| `fixtures/*.html` | Self-contained pages (what Hallmark emits). |
| `fixtures/*.judge.json` | Per-fixture craft scores (philosophy, hierarchy, execution, specificity, restraint, variety, honesty). |
| `results/` | One JSON snapshot per cycle + the running `history.md` table. |

## Run it

```bash
node evals/check.mjs # non-mutating regression check
cd evals
node check.mjs --eval v2 --min-score 98 # stricter local check, no writes
node detector.mjs fixtures/pulse.html --eval v2 # inspect one page
node run.mjs --cycle 10 --eval v2 --label "..." # score a cycle, update history
```

## The hillclimb (10 cycles)

**Phase 1 (v1, cycles 1–5)** drove the three originals from 74.2 → 98.3 by
closing gaps the detector found — each cycle added a real gate to
`references/slop-test.md` (gates **70–77**) and brought the fixtures into line.

**The break (cycle 6)** upgraded the eval to **v2**: six new detector rules
for failure modes v1 was blind to (notably hero-float / gate 54, which the
v1-perfect fixtures had been violating the whole time), a cross-fixture
**order parameter** (macrostructure reuse — variety is a property of the
*set*, not the page), and two adversarial fixtures (`pulse`, `vellum`). Score
fell 98.3 → 76.4, exactly as the blog predicts.

**Phase 2 (v2, cycles 7–10)** climbed back to 98.7, adding gates **78–84**
and resisting `pulse`'s dark/neon/metric-hero brief gravity.

The skill is the artifact that improved: 15 new gates, motivated by what the
eval could measure. See `results/history.md` for the full score table.
79 changes: 79 additions & 0 deletions evals/audit-site.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Audit real, in-repo Hallmark output with the detector.
//
// The detector reads inline CSS only; the shipped pages link external
// stylesheets. This adapter inlines local <link rel="stylesheet"> files into
// a self-contained snapshot, then scores it under both eval versions so we can
// see what the current skill's gates (v2) catch that the initial skill's
// gates (v1) did not — on artifacts the eval author did not write.
//
// Usage: node audit-site.mjs <page.html> [<page.html> ...]

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { analyze } from './detector.mjs';

// Directory that contains this script.
const HERE = path.dirname(fileURLToPath(import.meta.url));
// Parent of HERE; page paths given on the command line are resolved against it.
const ROOT = path.resolve(HERE, '..');
// Where the self-contained snapshots are written. Created up front so the
// later writeFileSync calls never hit a missing directory.
const CACHE = path.join(HERE, '.site-cache');
fs.mkdirSync(CACHE, { recursive: true });

/**
 * Read a local stylesheet and recursively inline its local `@import` rules.
 *
 * Recognised forms: `@import "x.css";`, `@import 'x.css';`,
 * `@import url("x.css");` and the unquoted `@import url(x.css);`.
 * Imports that carry a trailing media/supports/layer condition are not
 * matched and are left in the output untouched.
 *
 * @param {string} cssPath - Path of the stylesheet to read.
 * @param {Set<string>} [seen] - Absolute paths already inlined during this
 *   walk; prevents import cycles and duplicate inlining.
 * @returns {string} The stylesheet text with local imports expanded, or `''`
 *   when the file does not exist or was already visited.
 */
function readCssWithImports(cssPath, seen = new Set()) {
  const abs = path.resolve(cssPath);
  if (seen.has(abs) || !fs.existsSync(abs)) return '';
  seen.add(abs);

  const dir = path.dirname(abs);
  // Fresh literal per call: the /g flag makes a shared regex stateful.
  // Groups: 2 = quoted url(), 3 = bare url(), 5 = plain quoted string.
  const importRule = /@import\s+(?:url\(\s*(?:(["'])([^"']+)\1|([^"')\s]+))\s*\)|(["'])([^"']+)\4)\s*;/gi;
  const source = fs.readFileSync(abs, 'utf8');

  return source.replace(importRule, (full, _q1, quotedUrl, bareUrl, _q2, quotedString) => {
    const href = quotedUrl || bareUrl || quotedString;
    // Keep anything that is not a local file reference as-is: protocol-relative
    // URLs and any scheme of two or more characters (http:, https:, data:, ...).
    // Single-letter "schemes" are left alone so a drive letter is not mistaken
    // for one.
    if (/^(?:[a-z][a-z0-9+.-]+:|\/\/)/i.test(href)) return full;
    const importPath = path.resolve(dir, href.split(/[?#]/)[0]);
    return `/* ${href} */\n${readCssWithImports(importPath, seen)}`;
  });
}

/**
 * Build a self-contained snapshot of a page by inlining its local stylesheets.
 *
 * Every `<link rel="stylesheet">` whose href points at an existing local file
 * is read (with its local imports expanded) and collected into one
 * `<style data-inlined>` element. That element is inserted just before
 * `</head>`, or appended to the document when there is no `</head>`.
 * Remote stylesheets are skipped.
 *
 * @param {string} htmlPath - Page path, resolved against ROOT.
 * @returns {string} Path of the snapshot written into CACHE.
 * @throws {Error} When the page cannot be read or the snapshot cannot be written.
 */
function inlinePage(htmlPath) {
  const abs = path.resolve(ROOT, htmlPath);
  const dir = path.dirname(abs);
  let html = fs.readFileSync(abs, 'utf8');
  const links = [...html.matchAll(/<link\b[^>]*rel=["']stylesheet["'][^>]*>/gi)].map((m) => m[0]);
  const blocks = [];
  for (const link of links) {
    const href = (link.match(/href=["']([^"']+)["']/i) || [])[1];
    if (!href || /^https?:|^\/\//i.test(href)) continue; // skip remote (e.g. Google Fonts)
    const cssPath = path.resolve(dir, href.split(/[?#]/)[0]);
    if (fs.existsSync(cssPath)) blocks.push(`/* ${href} */\n${readCssWithImports(cssPath)}`);
  }
  if (blocks.length) {
    const styleTag = `\n<style data-inlined>\n${blocks.join('\n')}\n</style>\n`;
    if (/<\/head>/i.test(html)) {
      // Function replacer: a replacement *string* would expand `$&`, `$1`, `$$`
      // sequences that happen to occur in the CSS and corrupt the snapshot.
      html = html.replace(/<\/head>/i, () => `${styleTag}</head>`);
    } else {
      // No </head> to anchor on; append so the inlined CSS is not lost.
      html += styleTag;
    }
  }
  // Flatten the relative path into one file name; both separator styles are
  // handled so a Windows-style path does not point into a missing subfolder.
  const out = path.join(CACHE, htmlPath.replace(/[\\/]/g, '__'));
  fs.writeFileSync(out, html);
  return out;
}

// CLI driver: snapshot each page given on the command line, score it under
// both eval versions, then print a per-page table and the corpus mean.
const pages = process.argv.slice(2).filter((a) => !a.startsWith('--'));
if (pages.length === 0) {
  console.error('Usage: node audit-site.mjs <page.html> [<page.html> ...]');
}
const rows = [];
for (const p of pages) {
  let snap;
  // A page that cannot be snapshotted is reported and skipped, not fatal.
  try { snap = inlinePage(p); } catch (e) { console.error(`skip ${p}: ${e.message}`); continue; }
  const v1 = analyze(snap, 'v1');
  const v2 = analyze(snap, 'v2');
  // Flatten every dimension's rules and keep only the v2 rules that failed.
  const v2fails = Object.values(v2.dims).flatMap((d) => d.rules).filter((r) => !r.pass);
  rows.push({ page: p, v1: v1.overall, v2: v2.overall, fails: v2fails, multiTheme: v2.multiTheme, themeCount: v2.themeCount });
}

// Shorten a page path for display: drop a leading "site/" and a trailing "index.html".
const name = (p) => p.replace(/^site\//, '').replace(/\/index\.html$/, '/').replace(/index\.html$/, '');
console.log('\nReal Hallmark corpus — detector audit (overall /5)\n');
console.log(`${'page'.padEnd(34)} ${'v1'.padStart(6)} ${'v2'.padStart(6)} v2 findings`);
console.log('-'.repeat(72));
for (const r of rows) {
  const f = r.fails.length ? r.fails.map((x) => x.id.replace(/^v2-/, '')).join(', ') : '—';
  const tag = r.multiTheme ? ` [multi-theme:${r.themeCount}, low-confidence]` : '';
  console.log(`${name(r.page).padEnd(34)} ${r.v1.toFixed(2).padStart(6)} ${r.v2.toFixed(2).padStart(6)} ${f}${tag}`);
}
// Mean of one numeric column. With no scored pages there is nothing to
// average, so show a dash instead of the "NaN" a 0/0 division would print.
const avg = (k) => (rows.length ? (rows.reduce((a, r) => a + r[k], 0) / rows.length).toFixed(2) : '—');
console.log('-'.repeat(72));
console.log(`${'CORPUS MEAN'.padEnd(34)} ${avg('v1').padStart(6)} ${avg('v2').padStart(6)}`);
26 changes: 26 additions & 0 deletions evals/briefs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Eval briefs

Each fixture is the skill exercised on one brief. Briefs span genres so the
detector isn't fooled by a single safe house style. Fixtures live in
`fixtures/` as self-contained HTML (exactly what Hallmark emits).

## v1 briefs

- **ledger** — landing page for *Ledger*, an open-source double-entry
bookkeeping CLI for indie developers. Genre: modern-minimal.
Macrostructure target: stat-led / workbench (no rote hero→3-features→CTA).
- **fernweh** — homepage for *Fernweh*, a small-group slow-travel company
running 8-day walking trips. Genre: atmospheric / editorial.
Macrostructure target: photographic or narrative-workflow.
- **kiln** — studio page for *Kiln & Co.*, a two-person ceramics workshop
selling a seasonal run of stoneware. Genre: editorial / specimen-adjacent
but must NOT default to Specimen.

## v2 briefs (added when v1 saturates)

- **pulse** (the "synthwave trap") — adversarial: a brief for *Pulse*, a "developer
analytics dashboard," whose own copy nudges toward dark-mode + neon +
metric-hero slop. The skill must resist the brief's gravity.
- **vellum** — a long-form essay page for *Vellum*, a writing tool. Probes
reading-comfort tells v1 underweights (measure rhythm, widows, heading
cadence, real prose hierarchy).
80 changes: 80 additions & 0 deletions evals/check.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Non-mutating CI/local check for the Hallmark eval harness.
//
// Scores the configured fixtures through the same path as run.mjs, but never
// writes result snapshots, history files, or audit caches. Fails if detector
// rules regress, if the v2 structure order parameter regresses, or if the
// aggregate score falls below the configured threshold.
//
// Usage:
// node evals/check.mjs # check every eval version, min score 95
// node evals/check.mjs --eval v2 # check one eval version
// node evals/check.mjs --min-score 98 # tighten the score floor
// node evals/check.mjs --json # machine-readable summary

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { detectorFailures, evaluateCycle } from './core.mjs';

// Directory that contains this script; config.json is read relative to it.
const HERE = path.dirname(fileURLToPath(import.meta.url));

/**
 * Read the value of a `--name <value>` command-line option from process.argv.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {*} def - Value returned when the option is absent or has no value.
 * @returns {*} The token following `--name`, or `def`.
 */
function arg(name, def) {
  const i = process.argv.indexOf(`--${name}`);
  if (i < 0) return def;
  const value = process.argv[i + 1];
  // A flag in last position, or one immediately followed by another flag, has
  // no value. Returning the default avoids handing back `undefined` or
  // swallowing the next flag (e.g. `--min-score --json`).
  if (value === undefined || value.startsWith('--')) return def;
  return value;
}

/**
 * Decide which eval versions this run should check.
 *
 * @returns {string[]} The single version passed via `--eval`, or every key of
 *   `evals` in config.json when `--eval` is absent or set to `all`.
 */
function evalVersions() {
  const requested = arg('eval', 'all');
  if (requested === 'all') {
    const configFile = path.join(HERE, 'config.json');
    const { evals } = JSON.parse(fs.readFileSync(configFile, 'utf8'));
    return Object.keys(evals);
  }
  return [requested];
}

// CLI driver: score every requested eval version, collect one summary row per
// version, print the report (JSON or text) and set the exit code.
const minScore = Number(arg('min-score', '95'));
if (!Number.isFinite(minScore)) {
  // A NaN floor would make every `score < minScore` comparison false and
  // silently disable the score check, so refuse to run instead.
  console.error('check: --min-score must be a finite number');
  process.exit(2);
}
const asJson = process.argv.includes('--json');
const rows = [];
let failed = false;

for (const evalVersion of evalVersions()) {
  const { snapshot, perFixture, structureScore } = evaluateCycle({ evalVersion, label: 'non-mutating check', timestamp: 'check' });
  const failures = detectorFailures(perFixture);
  // The structure order parameter is only enforced for v2.
  const structureFailure = evalVersion === 'v2' && structureScore < 5;
  const scoreFailure = snapshot.cycleScore < minScore;

  rows.push({
    evalVersion,
    score: snapshot.cycleScore,
    minScore,
    ruleCount: snapshot.ruleCount,
    fixtureCount: snapshot.fixtureCount,
    structureScore: evalVersion === 'v2' ? structureScore : undefined,
    detectorFailureCount: failures.length,
    failures,
    passed: failures.length === 0 && !structureFailure && !scoreFailure,
  });

  if (failures.length || structureFailure || scoreFailure) failed = true;
}

if (asJson) {
  console.log(JSON.stringify({ passed: !failed, checks: rows }, null, 2));
} else {
  console.log('\nHallmark eval check (non-mutating)\n');
  for (const row of rows) {
    // structureScore is undefined for versions that do not enforce it.
    const structure = row.structureScore == null ? '' : ` structure ${row.structureScore.toFixed(2)}/5`;
    const status = row.passed ? 'PASS' : 'FAIL';
    console.log(`${status} ${row.evalVersion}: ${row.score.toFixed(1)}/100 rules ${row.ruleCount} fixtures ${row.fixtureCount}${structure}`);

    if (row.score < row.minScore) {
      console.log(` ✗ score below floor: ${row.score.toFixed(1)} < ${row.minScore}`);
    }
    if (row.structureScore != null && row.structureScore < 5) {
      console.log(` ✗ structure order parameter below 5/5: ${row.structureScore.toFixed(2)}`);
    }
    for (const f of row.failures) {
      console.log(` ✗ ${f.file} [${f.dim}] ${f.id} — ${f.note}`);
    }
  }
}

// exitCode rather than process.exit() so pending stdout writes are flushed.
process.exitCode = failed ? 1 : 0;
20 changes: 20 additions & 0 deletions evals/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"evals": {
"v1": {
"fixtures": [
{ "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
{ "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
{ "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" }
]
},
"v2": {
"fixtures": [
{ "name": "ledger", "file": "fixtures/ledger.html", "judge": "fixtures/ledger.judge.json" },
{ "name": "fernweh", "file": "fixtures/fernweh.html", "judge": "fixtures/fernweh.judge.json" },
{ "name": "kiln", "file": "fixtures/kiln.html", "judge": "fixtures/kiln.judge.json" },
{ "name": "pulse", "file": "fixtures/pulse.html", "judge": "fixtures/pulse.judge.json" },
{ "name": "vellum", "file": "fixtures/vellum.html", "judge": "fixtures/vellum.judge.json" }
]
}
}
}
Loading