From 5a7b7ae8581ef7be0351df5fc600976b5e714816 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:04:34 +0100 Subject: [PATCH 1/5] feat(eval): E1 longitudinal lifecycle harness (generator + driver + invariants) The pre-registered E1 protocol (hippo-paper PREREGISTERED-DESIGN.md v0.2+A1): - scripts/e1-lifecycle/generate.mjs: deterministic protocol generator (mulberry32; same seed = byte-identical). 300 version-chained facts, exact 40% updates / 10% contradictions / 15% traps via shuffled-slice draws, 5 hard-negative families (incl. the pilot-1 both-token rescale), scheduled retrievals (hot = top quartile), outcome schedule on EXPLICIT memory refs. - scripts/e1-lifecycle/run.mjs: hermetic driver. Fresh temp store per (arm x seed); simulated weekly clock via HIPPO_FAKE_NOW + cache reset; CLI-parity mutation block (hybridSearch -> markRetrieved -> persistence gated by isRecallBoostAblated, identical to cmdRecall); outcomes applied to explicit protocol-mapped ids; READ-ONLY probes per epoch (explicit now, no write-back) computing current-version R@5, MRR, stale-intrusion, trap-persistence, contradiction-intrusion, hot-fact R@5. 7 arms wired. No sleep (E3's dimension), no embeddings (orthogonal to the mechanisms). - tests/e1-harness.test.ts: 7 invariant gates - generator determinism + exact fractions, simulated-time stamping (no real-clock leakage), strengthen-off isolation WITH outcome attribution intact, probe read-only-ness, driver env hygiene, baseline arms. - benchmarks/e1-lifecycle/PILOT-NOTES.md: pilot 1 (saturation guard fired on all-off, >0.90 on three primaries) + the recorded rescale. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/e1-lifecycle/PILOT-NOTES.md | 37 +++ scripts/e1-lifecycle/generate.mjs | 323 +++++++++++++++++++++++++ scripts/e1-lifecycle/run.mjs | 288 ++++++++++++++++++++++ tests/e1-harness.test.ts | 128 ++++++++++ 4 files changed, 776 insertions(+) create mode 100644 benchmarks/e1-lifecycle/PILOT-NOTES.md create mode 100644 scripts/e1-lifecycle/generate.mjs create mode 100644 scripts/e1-lifecycle/run.mjs create mode 100644 tests/e1-harness.test.ts diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md new file mode 100644 index 0000000..aeccd2a --- /dev/null +++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md @@ -0,0 +1,37 @@ +# E1 pilot notes (pre-freeze; design v0.2 saturation-guard clause) + +Per PREREGISTERED-DESIGN.md section 3 E1: pilot-stage rescales are allowed BEFORE the +`e1-generator-freeze` tag, must be recorded here with their trigger, and stop being legal +the moment the tag exists. + +## Pilot 1 — 2026-06-11, generator 1.0.0 (4 negative families), seed 1, 300 facts / 20 sessions / 10x distractors + +| final epoch | full | all-off | guard [0.10, 0.90] | +|---|---|---|---| +| current R@5 | 0.333 | 0.943 | all-off VIOLATES (>0.90) | +| stale-intrusion | 0.600 | 0.975 | all-off VIOLATES | +| trap-persistence | 0.089 | 0.933 | all-off VIOLATES | +| hot R@5 | 0.435 | 0.930 | all-off VIOLATES | + +Wall-clock: full 206.8s, all-off 104.8s per (arm x seed). Full registered matrix +(7 arms x 20 seeds) projects to ~6-7h serial, local, $0. + +**Diagnosis:** the probe query ("entity attribute") collides on BOTH tokens with only +~4-6 documents per fact (versions + contradiction + trap + paraphrase-negative). With +top-5 slots, a static BM25 ranking can hold essentially all of them - the all-off +baseline saturates (everyone gets an A; the exam differentiates nothing). 
+ +**Directional preview (not registered results):** mechanisms differentiate strongly and +in OPPOSITE directions - outcome feedback buries traps (0.089 vs 0.933); decay appears +to sacrifice old-but-current facts to fresher distractors (full current R@5 0.333 vs +0.943; 7-day default half-life across a 20-week horizon floors every old memory's +strength multiplier at 0.5x while fresh noise keeps ~1.0x). If this holds under the +rescaled protocol and the registered arms, the decomposition (which mechanism helps, +which hurts, at what horizon) is the paper's core result. H2's pivot criteria adjudicate. + +**Rescale applied (generator 1.0.0 -> same version pre-freeze, family count 4 -> 5):** +added a BOTH-token hard-negative family (same entity AND attribute in a meta/process +sentence carrying its own NEG token), so every fact now has more both-token colliders +than top-5 slots and the static baseline must actually rank. No other change. + +## Pilot 2 — pending (rescaled generator) diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs new file mode 100644 index 0000000..f951c86 --- /dev/null +++ b/scripts/e1-lifecycle/generate.mjs @@ -0,0 +1,323 @@ +#!/usr/bin/env node +/** + * E1 longitudinal lifecycle protocol — deterministic GENERATOR. 
+ * + * Emits the full pre-registered protocol for one seed (PREREGISTERED-DESIGN.md + * v0.2 + A1, hippo-paper repo): K simulated sessions over calendar time, N + * version-chained facts (40% receive v2/v3 updates, 10% receive + * contradictions), HARD negatives only (same-entity/different-attribute, + * paraphrases of superseded versions, temporal near-misses, + * contradiction-lookalikes, both-token meta/process mentions), a scheduled-retrieval plan (drives + * strengthening; hot facts = top quartile by scheduled recalls), and an + * outcome schedule with EXPLICIT memory references (never last_retrieval_ids + * - the strengthen-off arm must keep outcome attribution working) including + * plausible-but-wrong TRAP memories that receive --bad. + * + * Determinism contract (inherited from scripts/lifecycle-stress/inject.mjs): + * a seeded mulberry32 PRNG drives every choice; NO Math.random, NO Date.now + * in the content path. Same seed => byte-identical protocol JSON. This file + * is tagged `e1-generator-freeze` BEFORE any ablation arm is run (anti-bias + * commitment, design rev #4); changes after the tag = amendment + full re-run. + * + * Value tokens are opaque (VAL{factIdx}X{version}D{4 seeded digits}) so scoring is by + * token containment, never by memory id, and a probe can detect WHICH + * version surfaced (current vs superseded vs contradiction vs trap). + * + * Run standalone: node scripts/e1-lifecycle/generate.mjs --seed 1 [--facts 300] [--sessions 20] + */ + +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { mulberry32 } from '../lifecycle-stress/inject.mjs'; + +export const GENERATOR_VERSION = '1.0.0'; + +// -------------------------------------------------------------------------- +// Vocabulary pools. Compound entities = HEAD x UNIT product (>= 288 distinct), +// each fact draws a UNIQUE (entity, attribute) pair; cross-fact token overlap +// stays low by construction (distinct entity tokens dominate each content). 
+// -------------------------------------------------------------------------- + +const ENTITY_HEADS = [ + 'Falcon', 'Atlas', 'Nova', 'Orion', 'Pegasus', 'Vega', 'Lyra', 'Draco', + 'Cygnus', 'Hydra', 'Phoenix', 'Cobra', 'Tucana', 'Mensa', 'Carina', 'Lynx', + 'Aquila', 'Corvus', 'Dorado', 'Fornax', 'Gemini', 'Indus', 'Pavo', 'Volans', +]; +const ENTITY_UNITS = [ + 'pipeline', 'gateway', 'cluster', 'registry', 'scheduler', 'archive', + 'console', 'exporter', 'replicator', 'balancer', 'notifier', 'indexer', +]; +const ATTRIBUTES = [ + 'deadline', 'budget cap', 'owner', 'vendor contract', 'rollout window', + 'staffing plan', 'compliance review date', 'migration target', + 'pricing tier', 'incident contact', 'launch gate', 'security sign-off', + 'retention period', 'latency budget', 'backup cadence', 'capacity ceiling', +]; +// Connectives vary surface form between versions/paraphrases (low-signal words). +const CONNECTIVES = ['is now', 'has been set to', 'was confirmed as', 'stands at', 'moved to']; +const PARAPHRASE_LEADS = [ + 'For the record,', 'As noted earlier,', 'Per the old thread,', 'Reminder:', +]; +const NEARMISS_QUALIFIERS = [ + 'tentatively pencilled near', 'rumoured to be around', 'once floated as', + 'informally guessed at', +]; + +/** Deterministic opaque value token: VAL + factIdx + version + 4 seeded digits. */ +function valueToken(rand, factIdx, version) { + const digits = String(Math.floor(rand() * 10000)).padStart(4, '0'); + return `VAL${factIdx}X${version}D${digits}`; +} + +function pick(rand, arr) { + return arr[Math.floor(rand() * arr.length)]; +} + +/** Fisher-Yates with the seeded PRNG. 
*/ +function shuffle(rand, arr) { + const a = arr.slice(); + for (let i = a.length - 1; i > 0; i--) { + const j = Math.floor(rand() * (i + 1)); + [a[i], a[j]] = [a[j], a[i]]; + } + return a; +} + +// -------------------------------------------------------------------------- +// Generator +// -------------------------------------------------------------------------- + +/** + * @param {object} opts + * @param {number} opts.seed + * @param {number} [opts.numFacts=300] + * @param {number} [opts.numSessions=20] + * @param {number} [opts.distractorMultiple=10] hard negatives >= multiple * facts + * @param {string} [opts.baseDate='2025-01-06T09:00:00.000Z'] + * @param {number} [opts.sessionIntervalDays=7] + */ +export function generateProtocol(opts) { + const seed = opts.seed >>> 0; + const numFacts = opts.numFacts ?? 300; + const numSessions = opts.numSessions ?? 20; + const distractorMultiple = opts.distractorMultiple ?? 10; + const baseDate = opts.baseDate ?? '2025-01-06T09:00:00.000Z'; + const sessionIntervalDays = opts.sessionIntervalDays ?? 7; + const rand = mulberry32(seed); + + // Sessions with simulated dates (weekly cadence by default). + const baseMs = Date.parse(baseDate); + const sessions = Array.from({ length: numSessions }, (_, i) => ({ + index: i, + date: new Date(baseMs + i * sessionIntervalDays * 24 * 60 * 60 * 1000).toISOString(), + })); + + // Unique (entity, attribute) per fact. 
+ const pairs = []; + for (const h of ENTITY_HEADS) for (const u of ENTITY_UNITS) for (const a of ATTRIBUTES) { + pairs.push({ entity: `${h} ${u}`, attribute: a }); + } + if (pairs.length < numFacts) throw new Error(`vocab too small: ${pairs.length} < ${numFacts}`); + const chosen = shuffle(rand, pairs).slice(0, numFacts); + + const memories = []; // { id, session, kind, factId, version, content, token } + const probes = []; // { factId, query, versionTimeline, tokens, trapTokens, contraTokens, hot } + const retrievalSchedule = []; // { session, query, factId } + const outcomeSchedule = []; // { session, memoryRef, good } + let memSeq = 0; + const mid = () => `pm${memSeq++}`; + + // Which facts get updates (40%) and contradictions (10%) - separate draws, each the + // leading slice of its own seeded reshuffle (sets may overlap), so fractions are exact, not stochastic. + const order = shuffle(rand, Array.from({ length: numFacts }, (_, i) => i)); + const updateSet = new Set(order.slice(0, Math.round(numFacts * 0.4))); + const contraSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.1))); + // Traps: 15% of facts get one plausible-but-wrong memory that receives --bad. + const trapSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.15))); + + // Recall frequency per fact (0..6, seeded). Hot = top quartile. + const recallCounts = chosen.map(() => Math.floor(rand() * 7)); + const sortedCounts = recallCounts.slice().sort((a, b) => b - a); + const hotThreshold = sortedCounts[Math.max(0, Math.floor(numFacts / 4) - 1)]; + + for (let f = 0; f < numFacts; f++) { + const { entity, attribute } = chosen[f]; + const factId = `F${f}`; + // v1 lands in the first 60% of sessions so updates have room afterwards. 
+ const s1 = Math.floor(rand() * Math.max(1, Math.floor(numSessions * 0.6))); + const tok1 = valueToken(rand, f, 1); + const tokens = { 1: tok1 }; + const versionTimeline = [{ session: s1, version: 1 }]; + memories.push({ + id: mid(), session: s1, kind: 'fact', factId, version: 1, + content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok1}.`, + token: tok1, + }); + + // Version chain: v2 (and possibly v3) at strictly later sessions. + if (updateSet.has(f)) { + const s2 = s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 2)); + const tok2 = valueToken(rand, f, 2); + tokens[2] = tok2; + versionTimeline.push({ session: s2, version: 2 }); + memories.push({ + id: mid(), session: s2, kind: 'update', factId, version: 2, + content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok2}.`, + token: tok2, + }); + if (rand() < 0.3 && s2 + 1 < numSessions) { + const s3 = s2 + 1 + Math.floor(rand() * Math.max(1, numSessions - s2 - 2)); + const tok3 = valueToken(rand, f, 3); + tokens[3] = tok3; + versionTimeline.push({ session: Math.min(s3, numSessions - 1), version: 3 }); + memories.push({ + id: mid(), session: Math.min(s3, numSessions - 1), kind: 'update', factId, version: 3, + content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok3}.`, + token: tok3, + }); + } + } + + // Contradiction: asserts a WRONG value at a later session; never becomes + // current truth (adversarial noise; secondary intrusion metric). 
+ const contraTokens = []; + if (contraSet.has(f)) { + const sc = Math.min(numSessions - 1, s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 1))); + const ctok = `CONTRA${f}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`; + contraTokens.push(ctok); + memories.push({ + id: mid(), session: sc, kind: 'contradiction', factId, version: null, + content: `Heard in passing that ${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ctok}.`, + token: ctok, + }); + } + + // Trap: plausible-but-wrong memory that the outcome schedule marks --bad + // (twice). Trap-persistence = its token still in top-5 afterwards. + const trapTokens = []; + if (trapSet.has(f)) { + const st = Math.min(numSessions - 1, s1 + Math.floor(rand() * Math.max(1, numSessions - s1))); + const ttok = `TRAP${f}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`; + trapTokens.push(ttok); + const trapId = mid(); + memories.push({ + id: trapId, session: st, kind: 'trap', factId, version: null, + content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ttok}.`, + token: ttok, + }); + for (let b = 0; b < 2; b++) { + const so = Math.min(numSessions - 1, st + 1 + b); + outcomeSchedule.push({ session: so, memoryRef: trapId, good: false }); + } + } + + // Scheduled mutating recalls (strengthening drivers), spread over sessions + // AFTER the fact first exists. + const query = `${entity} ${attribute}`; + const hot = recallCounts[f] >= hotThreshold && recallCounts[f] > 0; + for (let r = 0; r < recallCounts[f]; r++) { + const sr = Math.min(numSessions - 1, s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 1))); + retrievalSchedule.push({ session: sr, query, factId }); + } + + // Positive outcomes: 30% of facts' v1 memory gets one --good. 
+ if (rand() < 0.3) { + const v1Mem = memories.find((m) => m.factId === factId && m.version === 1); + const so = Math.min(numSessions - 1, s1 + 1); + outcomeSchedule.push({ session: so, memoryRef: v1Mem.id, good: true }); + } + + probes.push({ factId, query, versionTimeline, tokens, contraTokens, trapTokens, hot }); + } + + // ------------------------------------------------------------------------ + // Hard negatives (>= distractorMultiple x facts), five template families. + // Each carries its own opaque NEG token, except family 1, which reuses the fact's v1 token. + // ------------------------------------------------------------------------ + const negPerFact = distractorMultiple; + for (let f = 0; f < numFacts; f++) { + const { entity, attribute } = chosen[f]; + const probe = probes[f]; + for (let n = 0; n < negPerFact; n++) { + const family = n % 5; + const sn = Math.floor(rand() * numSessions); + const ntok = `NEG${f}N${n}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`; + let content; + if (family === 0) { + // same-entity / DIFFERENT-attribute + const otherAttr = pick(rand, ATTRIBUTES.filter((a) => a !== attribute)); + content = `${entity} ${otherAttr} ${pick(rand, CONNECTIVES)} ${ntok}.`; + } else if (family === 1 && probe.tokens[2]) { + // paraphrase of a SUPERSEDED version (carries the OLD token: surfacing + // it after the update counts as stale-intrusion - intentional) + content = `${pick(rand, PARAPHRASE_LEADS)} ${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${probe.tokens[1]}.`; + } else if (family === 2) { + // temporal near-miss + content = `${entity} ${attribute} ${pick(rand, NEARMISS_QUALIFIERS)} ${ntok}.`; + } else if (family === 3) { + // contradiction-lookalike (hedged phrasing, own token) + content = `Unconfirmed: ${entity} ${attribute} might be ${ntok}, pending review.`; + } else { + // BOTH-token mention (pilot rescale, 2026-06-11): same entity AND + // attribute in a process/meta sentence with its own token. 
These + // collide with the probe query on both tokens, so a static top-5 + // cannot hold every same-fact document - de-saturates the all-off + // baseline (pilot showed currentR5 0.943 > 0.90 guard). + content = `Review of ${entity} ${attribute} noted in minutes ${ntok}; decision log pending.`; + } + memories.push({ + id: mid(), session: sn, kind: 'distractor', factId: null, version: null, + content, token: content.includes(probe.tokens[1]) ? probe.tokens[1] : ntok, + }); + } + } + + // Stable ordering within each session: by id sequence (already insertion- + // ordered); the driver ingests session-by-session. + retrievalSchedule.sort((a, b) => a.session - b.session || a.factId.localeCompare(b.factId)); + outcomeSchedule.sort((a, b) => a.session - b.session || a.memoryRef.localeCompare(b.memoryRef)); + + return { + meta: { + generatorVersion: GENERATOR_VERSION, seed, numFacts, numSessions, + distractorMultiple, baseDate, sessionIntervalDays, + counts: { + memories: memories.length, + updates: memories.filter((m) => m.kind === 'update').length, + contradictions: memories.filter((m) => m.kind === 'contradiction').length, + traps: memories.filter((m) => m.kind === 'trap').length, + distractors: memories.filter((m) => m.kind === 'distractor').length, + scheduledRecalls: retrievalSchedule.length, + outcomes: outcomeSchedule.length, + hotFacts: probes.filter((p) => p.hot).length, + }, + }, + sessions, memories, retrievalSchedule, outcomeSchedule, probes, + }; +} + +// -------------------------------------------------------------------------- +// CLI +// -------------------------------------------------------------------------- +const isMain = process.argv[1] && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url); +if (isMain) { + const args = process.argv.slice(2); + const getArg = (name, dflt) => { + const i = args.indexOf(`--${name}`); + return i >= 0 ? 
args[i + 1] : dflt; + }; + const seed = Number(getArg('seed', '1')); + const protocol = generateProtocol({ + seed, + numFacts: Number(getArg('facts', '300')), + numSessions: Number(getArg('sessions', '20')), + distractorMultiple: Number(getArg('distractors', '10')), + }); + const outDir = getArg('out', path.join(path.dirname(fileURLToPath(import.meta.url)), 'protocols')); + fs.mkdirSync(outDir, { recursive: true }); + const outFile = path.join(outDir, `protocol-seed${seed}.json`); + fs.writeFileSync(outFile, JSON.stringify(protocol, null, 1), 'utf8'); + console.log(`wrote ${outFile}`); + console.log(JSON.stringify(protocol.meta, null, 2)); +} diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs new file mode 100644 index 0000000..4dca5ae --- /dev/null +++ b/scripts/e1-lifecycle/run.mjs @@ -0,0 +1,288 @@ +#!/usr/bin/env node +/** + * E1 longitudinal lifecycle protocol — the DRIVER. + * + * Executes the pre-registered protocol (PREREGISTERED-DESIGN.md v0.2 + A1) + * for each requested (arm x seed): builds a HERMETIC fresh store (temp + * HIPPO_HOME, cleaned up after), walks the K simulated sessions in order, + * and probes READ-ONLY after every session. + * + * Mutation/measurement split (design rev #3): + * - MUTATORS (the only state writers): session ingestion + * (createMemory+writeEntry under HIPPO_FAKE_NOW), scheduled retrievals + * (hybridSearch -> markRetrieved -> persistence gated EXACTLY like the + * CLI: skipped when isRecallBoostAblated()), and outcome applications + * (applyOutcome+writeEntry on EXPLICIT protocol-mapped ids - never + * last_retrieval_ids). + * - PROBES are in-process hybridSearch calls with an explicit `now`; no + * markRetrieved, no writes. (retrieve_inprocess.mjs pattern.) + * + * Simulated time: HIPPO_FAKE_NOW env + _resetAblationCacheForTests() per + * session (in-process equivalent of one process per session). createMemory / + * markRetrieved / scoring all honor it via evalNow(). 
+ * + * Arms (env per design section 3; baselines rank differently in the prober): + * full no flags + * decay-off HIPPO_ABLATE_DECAY=1 (A1: co-ablates outcome-slow + read-side boost) + * strengthen-off HIPPO_ABLATE_RECALL_BOOST=1 + * outcome-off HIPPO_ABLATE_OUTCOME=1 + * all-off all three + * bm25-static all three + probe ranks by raw BM25 component only + * recency-window all three + probe returns the 5 newest entries + * + * NO sleep/consolidation in E1 (that is E3's dimension); no embeddings (the + * lexical+lifecycle composite exercises every mechanism under test; the + * embedding blend is orthogonal to the ablations). + * + * Run (pilot): node scripts/e1-lifecycle/run.mjs --arms full,all-off --seeds 1 --facts 300 + */ + +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; +import { createHash } from 'node:crypto'; +import { fileURLToPath } from 'node:url'; + +import { createMemory, applyOutcome } from '../../dist/memory.js'; +import { writeEntry, loadAllEntries, initStore } from '../../dist/store.js'; +import { hybridSearch, markRetrieved } from '../../dist/search.js'; +import { isRecallBoostAblated, _resetAblationCacheForTests } from '../../dist/ablation.js'; +import { generateProtocol, GENERATOR_VERSION } from './generate.mjs'; + +const HERE = path.dirname(fileURLToPath(import.meta.url)); +const REPO = path.resolve(HERE, '..', '..'); +const OUT_DIR = path.join(REPO, 'benchmarks', 'e1-lifecycle', 'raw'); + +const ARM_ENV = { + 'full': {}, + 'decay-off': { HIPPO_ABLATE_DECAY: '1' }, + 'strengthen-off': { HIPPO_ABLATE_RECALL_BOOST: '1' }, + 'outcome-off': { HIPPO_ABLATE_OUTCOME: '1' }, + 'all-off': { HIPPO_ABLATE_DECAY: '1', HIPPO_ABLATE_RECALL_BOOST: '1', HIPPO_ABLATE_OUTCOME: '1' }, + 'bm25-static': { HIPPO_ABLATE_DECAY: '1', HIPPO_ABLATE_RECALL_BOOST: '1', HIPPO_ABLATE_OUTCOME: '1' }, + 'recency-window': { HIPPO_ABLATE_DECAY: '1', HIPPO_ABLATE_RECALL_BOOST: '1', HIPPO_ABLATE_OUTCOME: '1' }, +}; +const 
ABLATION_VARS = ['HIPPO_ABLATE_DECAY', 'HIPPO_ABLATE_RECALL_BOOST', 'HIPPO_ABLATE_OUTCOME', 'HIPPO_ABLATE_OUTCOME_SLOW', 'HIPPO_ABLATE_OUTCOME_FAST', 'HIPPO_FAKE_NOW']; + +function setArmEnv(arm) { + for (const v of ABLATION_VARS) delete process.env[v]; + for (const [k, val] of Object.entries(ARM_ENV[arm])) process.env[k] = val; + _resetAblationCacheForTests(); +} + +function setSimulatedNow(iso) { + process.env.HIPPO_FAKE_NOW = iso; + _resetAblationCacheForTests(); +} + +/** Latest versionTimeline step (session + version) live at `epoch`, or null if fact not yet live. */ +function currentAt(probe, epoch) { + let cur = null; + for (const step of probe.versionTimeline) { + if (step.session <= epoch) cur = step; + } + return cur; // { session, version } | null +} + +const PROBE_TOP_K = 5; +const PROBE_BUDGET = 100000; // token budget never binds at top-5 granularity + +async function probeEpoch(protocol, entries, epoch, epochDate, arm) { + const probeNow = new Date(Date.parse(epochDate) + 60 * 60 * 1000); // +1h after session + let active = 0, current5 = 0, staleEligible = 0, staleHit = 0; + let trapEligible = 0, trapHit = 0, contraEligible = 0, contraHit = 0; + let hotActive = 0, hotCurrent5 = 0, mrrSum = 0; + + for (const probe of protocol.probes) { + const cur = currentAt(probe, epoch); + if (!cur) continue; + active++; + if (probe.hot) hotActive++; + + let top; + if (arm === 'recency-window') { + top = entries.slice().sort((a, b) => Date.parse(b.created) - Date.parse(a.created)).slice(0, PROBE_TOP_K); + } else { + const results = await hybridSearch(probe.query, entries, { budget: PROBE_BUDGET, now: probeNow, minResults: PROBE_TOP_K }); + const ranked = arm === 'bm25-static' + ? 
results.slice().sort((a, b) => b.bm25 - a.bm25) + : results; + top = ranked.slice(0, PROBE_TOP_K).map((r) => r.entry); + } + const texts = top.map((e) => e.content); + const curTok = probe.tokens[cur.version]; + const rank = texts.findIndex((t) => t.includes(curTok)); + if (rank >= 0) { + current5++; + if (probe.hot) hotCurrent5++; + mrrSum += 1 / (rank + 1); + } + // Stale intrusion: only meaningful once an update has superseded v1. + if (cur.version >= 2) { + staleEligible++; + const staleToks = Object.entries(probe.tokens) + .filter(([v]) => Number(v) < cur.version).map(([, t]) => t); + if (texts.some((t) => staleToks.some((s) => t.includes(s)))) staleHit++; + } + if (probe.trapTokens.length > 0) { + // Eligible once the trap memory exists in the store. + const trapLive = entries.some((e) => probe.trapTokens.some((t) => e.content.includes(t))); + if (trapLive) { + trapEligible++; + if (texts.some((t) => probe.trapTokens.some((tt) => t.includes(tt)))) trapHit++; + } + } + if (probe.contraTokens.length > 0) { + const contraLive = entries.some((e) => probe.contraTokens.some((t) => e.content.includes(t))); + if (contraLive) { + contraEligible++; + if (texts.some((t) => probe.contraTokens.some((ct) => t.includes(ct)))) contraHit++; + } + } + } + + return { + epoch, activeProbes: active, + currentR5: active > 0 ? current5 / active : null, + mrr: active > 0 ? mrrSum / active : null, + staleEligible, staleIntrusionRate: staleEligible > 0 ? staleHit / staleEligible : null, + trapEligible, trapPersistenceRate: trapEligible > 0 ? trapHit / trapEligible : null, + contraEligible, contraIntrusionRate: contraEligible > 0 ? contraHit / contraEligible : null, + hotActive, hotR5: hotActive > 0 ? 
hotCurrent5 / hotActive : null, + }; +} + +/** + * @param {string} arm + * @param {number} seed + * @param {object} [genOpts] forwarded to generateProtocol + * @param {(hippoRoot: string, idMap: Map) => Promise|void} [inspect] + * test hook: called with the live store path AFTER the last epoch, BEFORE + * cleanup (invariant tests assert on real DB state). + */ +export async function runArmSeed(arm, seed, genOpts = {}, inspect = undefined) { + const protocol = generateProtocol({ seed, ...genOpts }); + const protocolHash = createHash('sha256').update(JSON.stringify(protocol)).digest('hex').slice(0, 16); + + const hippoRoot = fs.mkdtempSync(path.join(os.tmpdir(), `hippo-e1-${arm}-s${seed}-`)); + const epochs = []; + try { + setArmEnv(arm); + setSimulatedNow(protocol.sessions[0].date); + initStore(hippoRoot); + + const idMap = new Map(); // protocol memory id -> hippo entry id + const bySession = new Map(); + for (const m of protocol.memories) { + if (!bySession.has(m.session)) bySession.set(m.session, []); + bySession.get(m.session).push(m); + } + const retrievalsBySession = new Map(); + for (const r of protocol.retrievalSchedule) { + if (!retrievalsBySession.has(r.session)) retrievalsBySession.set(r.session, []); + retrievalsBySession.get(r.session).push(r); + } + const outcomesBySession = new Map(); + for (const o of protocol.outcomeSchedule) { + if (!outcomesBySession.has(o.session)) outcomesBySession.set(o.session, []); + outcomesBySession.get(o.session).push(o); + } + + for (const session of protocol.sessions) { + setSimulatedNow(session.date); // mutators stamp simulated time + + // 1. Ingest this session's memories (created/last_retrieved = fake now). + for (const m of bySession.get(session.index) ?? []) { + const entry = createMemory(m.content); + writeEntry(hippoRoot, entry); + idMap.set(m.id, entry.id); + } + + // 2. 
Scheduled mutating retrievals - CLI-parity block: hybridSearch -> + // markRetrieved -> persistence gated exactly like cli.ts cmdRecall. + const entriesNow = () => loadAllEntries(hippoRoot); + for (const r of retrievalsBySession.get(session.index) ?? []) { + const entries = entriesNow(); + const results = await hybridSearch(r.query, entries, { budget: PROBE_BUDGET, minResults: PROBE_TOP_K }); + const topEntries = results.slice(0, PROBE_TOP_K).map((x) => x.entry); + const updated = markRetrieved(topEntries); // default now = evalNow() (fake) + if (!isRecallBoostAblated()) { + for (const u of updated) writeEntry(hippoRoot, u); + } + } + + // 3. Scheduled outcomes on EXPLICIT ids (never last_retrieval_ids). + for (const o of outcomesBySession.get(session.index) ?? []) { + const hippoId = idMap.get(o.memoryRef); + if (!hippoId) throw new Error(`outcome before ingestion: ${o.memoryRef} at session ${session.index}`); + const entry = loadAllEntries(hippoRoot).find((e) => e.id === hippoId); + if (!entry) throw new Error(`outcome target missing from store: ${hippoId}`); + const updated = applyOutcome(entry, o.good); + writeEntry(hippoRoot, updated); + } + + // 4. READ-ONLY probes (explicit now; no markRetrieved; no writes). 
+ const entries = loadAllEntries(hippoRoot); + epochs.push(await probeEpoch(protocol, entries, session.index, session.date, arm)); + } + + if (inspect) await inspect(hippoRoot, idMap); + } finally { + for (const v of ABLATION_VARS) delete process.env[v]; + _resetAblationCacheForTests(); + fs.rmSync(hippoRoot, { recursive: true, force: true }); + } + + return { + meta: { + arm, seed, generatorVersion: GENERATOR_VERSION, protocolHash, + protocolCounts: protocol.meta.counts, ranAt: new Date().toISOString(), + }, + epochs, + }; +} + +// -------------------------------------------------------------------------- +// CLI +// -------------------------------------------------------------------------- +const isMain = process.argv[1] && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url); +if (isMain) { + const args = process.argv.slice(2); + const getArg = (name, dflt) => { + const i = args.indexOf(`--${name}`); + return i >= 0 ? args[i + 1] : dflt; + }; + const arms = getArg('arms', 'full,all-off').split(',').map((s) => s.trim()).filter(Boolean); + const seeds = getArg('seeds', '1').split(',').flatMap((s) => { + const m = s.match(/^(\d+)-(\d+)$/); + return m ? 
Array.from({ length: Number(m[2]) - Number(m[1]) + 1 }, (_, i) => Number(m[1]) + i) : [Number(s)]; + }); + const genOpts = { + numFacts: Number(getArg('facts', '300')), + numSessions: Number(getArg('sessions', '20')), + distractorMultiple: Number(getArg('distractors', '10')), + }; + for (const arm of arms) { + if (!ARM_ENV[arm]) { + console.error(`unknown arm: ${arm}`); + process.exit(1); + } + } + fs.mkdirSync(OUT_DIR, { recursive: true }); + (async () => { + for (const arm of arms) { + for (const seed of seeds) { + const t0 = Date.now(); + const result = await runArmSeed(arm, seed, genOpts); + const outFile = path.join(OUT_DIR, `${arm}-seed${seed}.json`); + fs.writeFileSync(outFile, JSON.stringify(result, null, 1), 'utf8'); + const last = result.epochs[result.epochs.length - 1]; + console.log( + `${arm} seed=${seed} done in ${((Date.now() - t0) / 1000).toFixed(1)}s | final epoch: ` + + `R@5=${last.currentR5?.toFixed(3)} stale=${last.staleIntrusionRate?.toFixed(3)} ` + + `trap=${last.trapPersistenceRate?.toFixed(3)} hot=${last.hotR5?.toFixed(3)}` + ); + } + } + })().catch((e) => { console.error(e); process.exit(1); }); +} diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts new file mode 100644 index 0000000..48cb626 --- /dev/null +++ b/tests/e1-harness.test.ts @@ -0,0 +1,128 @@ +/** + * E1 longitudinal harness — invariant tests (pre-run gates from the frozen + * design: rev #3 mutation/measurement split, rev #8 timestamp invariants, + * rev #10 ablation wiring; outcome targeting via explicit ids). + * + * Real stores (house rule), tiny protocol sizes so the whole file stays fast. + * Env isolation: the driver owns ablation env vars during runArmSeed and + * clears them in its finally; tests also clear in beforeEach/afterEach. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +// @ts-expect-error - .mjs harness modules have no type declarations +import { generateProtocol } from '../scripts/e1-lifecycle/generate.mjs'; +// @ts-expect-error - .mjs harness modules have no type declarations +import { runArmSeed } from '../scripts/e1-lifecycle/run.mjs'; +import { loadAllEntries } from '../src/store.js'; +import { _resetAblationCacheForTests } from '../src/ablation.js'; + +const ABLATION_ENV_VARS = [ + 'HIPPO_ABLATE_DECAY', + 'HIPPO_ABLATE_RECALL_BOOST', + 'HIPPO_ABLATE_OUTCOME', + 'HIPPO_ABLATE_OUTCOME_SLOW', + 'HIPPO_ABLATE_OUTCOME_FAST', + 'HIPPO_FAKE_NOW', +] as const; + +function clearAblationEnv(): void { + for (const v of ABLATION_ENV_VARS) delete process.env[v]; + _resetAblationCacheForTests(); +} +beforeEach(clearAblationEnv); +afterEach(clearAblationEnv); + +const TINY = { numFacts: 12, numSessions: 6, distractorMultiple: 4 }; + +describe('E1 generator', () => { + it('is deterministic: same seed => byte-identical protocol', () => { + const a = JSON.stringify(generateProtocol({ seed: 7, ...TINY })); + const b = JSON.stringify(generateProtocol({ seed: 7, ...TINY })); + expect(a).toBe(b); + const c = JSON.stringify(generateProtocol({ seed: 8, ...TINY })); + expect(a).not.toBe(c); + }); + + it('honors the registered fractions and structural invariants', () => { + const p = generateProtocol({ seed: 1, numFacts: 50, numSessions: 10, distractorMultiple: 10 }); + // 40% updates, 10% contradictions (exact via shuffled-slice draws). + const updatedFacts = new Set(p.memories.filter((m: any) => m.kind === 'update').map((m: any) => m.factId)); + expect(updatedFacts.size).toBe(20); + expect(p.memories.filter((m: any) => m.kind === 'contradiction').length).toBe(5); + // Hard-negative pool >= 10x facts. + expect(p.meta.counts.distractors).toBeGreaterThanOrEqual(500); + // Version timelines strictly ordered; updates strictly after v1. 
+ for (const probe of p.probes) { + const sessions = probe.versionTimeline.map((s: any) => s.session); + for (let i = 1; i < sessions.length; i++) expect(sessions[i]).toBeGreaterThan(sessions[i - 1]); + } + // Every outcome target exists and is scheduled at/after its ingestion session. + const memById = new Map(p.memories.map((m: any) => [m.id, m])); + for (const o of p.outcomeSchedule) { + const m = memById.get(o.memoryRef) as any; + expect(m).toBeDefined(); + expect(o.session).toBeGreaterThanOrEqual(m.session); + } + // Traps are bad-marked; some positive outcomes exist. + const trapIds = new Set(p.memories.filter((m: any) => m.kind === 'trap').map((m: any) => m.id)); + expect(p.outcomeSchedule.filter((o: any) => !o.good).every((o: any) => trapIds.has(o.memoryRef))).toBe(true); + expect(p.outcomeSchedule.some((o: any) => o.good)).toBe(true); + }); +}); + +describe('E1 driver', () => { + it('full arm: simulated time stamps the store; metrics emitted per epoch', async () => { + let storeSeen = false; + const result = await runArmSeed('full', 3, TINY, (hippoRoot: string) => { + storeSeen = true; + const entries = loadAllEntries(hippoRoot); + expect(entries.length).toBeGreaterThan(0); + // Every created timestamp lies within the protocol's simulated range - + // nothing stamped with the real 2026 clock (rev #8 invariant). + for (const e of entries) { + expect(Date.parse(e.created)).toBeGreaterThanOrEqual(Date.parse('2025-01-06T00:00:00.000Z')); + expect(Date.parse(e.created)).toBeLessThan(Date.parse('2025-06-01T00:00:00.000Z')); + } + // Scheduled retrievals strengthened SOMETHING (full arm: writes live). 
+ expect(entries.some((e) => e.retrieval_count > 0)).toBe(true); + }); + expect(storeSeen).toBe(true); + expect(result.epochs.length).toBe(TINY.numSessions); + const last = result.epochs[TINY.numSessions - 1]; + expect(last.activeProbes).toBe(TINY.numFacts); + expect(last.currentR5).not.toBeNull(); + expect(result.meta.protocolHash).toMatch(/^[0-9a-f]{16}$/); + }); + + it('strengthen-off arm: zero retrieval writes, outcome attribution still lands', async () => { + await runArmSeed('strengthen-off', 3, TINY, (hippoRoot: string) => { + const entries = loadAllEntries(hippoRoot); + // No strengthening writes anywhere (rev #10: the arm isolates one mechanism). + expect(entries.every((e) => e.retrieval_count === 0)).toBe(true); + // Outcomes still applied via explicit ids (codex round-7 coupling fix): + // traps received --bad despite strengthening being off. + expect(entries.some((e) => (e.outcome_negative ?? 0) > 0)).toBe(true); + }); + }); + + it('probes are read-only: an all-off arm leaves zero retrieval state', async () => { + await runArmSeed('all-off', 5, TINY, (hippoRoot: string) => { + const entries = loadAllEntries(hippoRoot); + // Probes ran every epoch over these entries; none of them wrote back. 
+ expect(entries.every((e) => e.retrieval_count === 0)).toBe(true); + expect(entries.every((e) => e.half_life_days <= 90)).toBe(true); // no +2 accumulation beyond derive caps + }); + }); + + it('driver cleans its env: no ablation vars leak after a run', async () => { + await runArmSeed('decay-off', 2, TINY); + for (const v of ABLATION_ENV_VARS) expect(process.env[v]).toBeUndefined(); + }); + + it('baseline arms produce rankings (bm25-static + recency-window)', async () => { + const bm25 = await runArmSeed('bm25-static', 4, TINY); + const rec = await runArmSeed('recency-window', 4, TINY); + expect(bm25.epochs[TINY.numSessions - 1].currentR5).not.toBeNull(); + expect(rec.epochs[TINY.numSessions - 1].currentR5).not.toBeNull(); + }); +}); From 625bc67ef5df17e7902455ff2f0e54b18a7401b4 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:12:44 +0100 Subject: [PATCH 2/5] feat(eval): pilot-3 rescale - value-claim lookalike negatives pass saturation guard Pilot 2 (both-token meta-sentences) failed to de-saturate the all-off baseline (0.967: BM25 length normalization ranks long sentences below tight fact docs). Pilot 3 replaces family 4 with VALUE-CLAIM lookalikes - the exact fact template carrying their own token - so every fact has more identical-form competitors than top-5 slots and a static ranker must genuinely tie-break. Realistic adversary: long-lived stores accumulate multiple tight value claims per key. Pilot 3 final-epoch all-off: currentR5 0.727, stale 0.875, trap 0.733, hot 0.757 - saturation guard PASSES on every primary metric (prereg amendment A2: final-epoch endpoints; early epochs ceiling by construction in an accumulating store). Full pilot trail in PILOT-NOTES.md. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/e1-lifecycle/PILOT-NOTES.md | 27 +++++++++++++++++++++++++- scripts/e1-lifecycle/generate.mjs | 16 +++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md index aeccd2a..c1e918f 100644 --- a/benchmarks/e1-lifecycle/PILOT-NOTES.md +++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md @@ -34,4 +34,29 @@ added a BOTH-token hard-negative family (same entity AND attribute in a meta/pro sentence carrying its own NEG token), so every fact now has more both-token colliders than top-5 slots and the static baseline must actually rank. No other change. -## Pilot 2 — pending (rescaled generator) +## Pilot 2 — 2026-06-11, both-token meta-sentence family, seed 1 + +FAILED to de-saturate: all-off currentR5 0.967 (worse than pilot 1). Root cause: the new +negatives were LONGER sentences; BM25 length normalization ranks them below the tight fact +docs, so they never displace anything from top-5. + +## Pilot 3 — 2026-06-11, value-claim lookalike family (exact fact template, own token), seed 1 + +| final epoch | full | all-off | guard [0.10, 0.90] | +|---|---|---|---| +| current R@5 | 0.257 | 0.727 | PASS | +| stale-intrusion | 0.475 | 0.875 | PASS | +| trap-persistence | 0.111 | 0.733 | PASS | +| hot R@5 | 0.365 | 0.757 | PASS | + +Wall-clock: all-off 102.0s, full 167.5s. Registered matrix (7 arms x 20 seeds) projects ~6h. + +**Trajectory note -> amendment A2:** epochs 0-9 are ceiling-saturated by construction (an +accumulating store starts with 1-3 docs per fact; a top-5 metric cannot miss). Primary +endpoints clarified to FINAL-EPOCH values (prereg amendment A2, recorded pre-freeze); +trajectories reported descriptively. Final-epoch all-off passes the guard on all metrics. 
+ +Directional preview persists (not registered results): outcome feedback buries traps +(0.111 vs 0.733); full-lifecycle current R@5 BELOW all-off (0.257 vs 0.727 - decay +sacrifices old-but-current facts to fresh noise at this horizon); stale-suppression helps +(0.475 vs 0.875). The per-mechanism arms will attribute these causally. diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs index f951c86..ada1f0c 100644 --- a/scripts/e1-lifecycle/generate.mjs +++ b/scripts/e1-lifecycle/generate.mjs @@ -259,12 +259,16 @@ export function generateProtocol(opts) { // contradiction-lookalike (hedged phrasing, own token) content = `Unconfirmed: ${entity} ${attribute} might be ${ntok}, pending review.`; } else { - // BOTH-token mention (pilot rescale, 2026-06-11): same entity AND - // attribute in a process/meta sentence with its own token. These - // collide with the probe query on both tokens, so a static top-5 - // cannot hold every same-fact document - de-saturates the all-off - // baseline (pilot showed currentR5 0.943 > 0.90 guard). - content = `Review of ${entity} ${attribute} noted in minutes ${ntok}; decision log pending.`; + // VALUE-CLAIM lookalike (pilot-2 rescale, 2026-06-11): the EXACT fact + // template with its own token. Pilot 2 showed longer meta-sentences + // never displace tight fact docs (BM25 length normalization), so the + // all-off ceiling held at 0.967. Identical-form claims are also the + // realistic adversary: a long-lived store accumulates multiple tight + // value claims per key (old notes, speculation, misheard values). + // With > top-k identical-form docs per fact, a static ranker must + // tie-break arbitrarily; lifecycle signals (update recency, + // strengthening, outcomes) are precisely the tiebreakers under test. 
+ content = `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ntok}.`; } memories.push({ id: mid(), session: sn, kind: 'distractor', factId: null, version: null, From 2149cb18ef99daf559db61154159f16e824d98e8 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:25:34 +0100 Subject: [PATCH 3/5] fix(eval): deterministic entry ids + disjoint contradiction sampling (codex round 1) P1: the protocol intentionally creates score ties (identical-form value-claim negatives), and same-timestamp rows order by id, so random createMemory UUIDs made identical (arm, seed) runs produce different top-5 metrics under the same protocol hash. Entry ids are now sha256-derived from (seed, protocol memory id), keeping the mem_<12hex> format. New invariant test: two complete runs of the same (arm, seed) agree to the byte. P2: contraSet re-sampled the full fact list, so contradiction facts could overlap updated facts despite the disjointness comment. Contradictions now sample from the NON-updated remainder; disjointness asserted in tests. Pilot 4 reconfirms the saturation guard on every primary metric (all-off final epoch: 0.727 / 0.825 / 0.778 / 0.693). 8 invariant tests green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/e1-lifecycle/PILOT-NOTES.md | 15 +++++++++++++++ scripts/e1-lifecycle/generate.mjs | 10 +++++++--- scripts/e1-lifecycle/run.mjs | 6 ++++++ tests/e1-harness.test.ts | 15 +++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md index c1e918f..bce00fa 100644 --- a/benchmarks/e1-lifecycle/PILOT-NOTES.md +++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md @@ -60,3 +60,18 @@ Directional preview persists (not registered results): outcome feedback buries t (0.111 vs 0.733); full-lifecycle current R@5 BELOW all-off (0.257 vs 0.727 - decay sacrifices old-but-current facts to fresh noise at this horizon); stale-suppression helps (0.475 vs 0.875). The per-mechanism arms will attribute these causally. + +## Pilot 4 - 2026-06-11, post codex-round-1 fixes (deterministic ids, disjoint contradictions) + +| final epoch | full | all-off | guard [0.10, 0.90] | +|---|---|---|---| +| current R@5 | 0.320 | 0.727 | PASS | +| stale-intrusion | 0.475 | 0.825 | PASS | +| trap-persistence | 0.067 | 0.778 | PASS | +| hot R@5 | 0.480 | 0.693 | PASS | + +Wall-clock: all-off 87.2s, full 161.1s. Reproducibility now proven by invariant test +(two identical (arm, seed) runs produce byte-identical metrics; entry ids derived from +(seed, protocol id) instead of random UUIDs). Contradiction facts disjoint from updated +facts (attribution unconfounded). Guard passes; protocol ready to freeze pending codex +convergence. 
diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs index ada1f0c..bd3f5bb 100644 --- a/scripts/e1-lifecycle/generate.mjs +++ b/scripts/e1-lifecycle/generate.mjs @@ -128,11 +128,15 @@ export function generateProtocol(opts) { let memSeq = 0; const mid = () => `pm${memSeq++}`; - // Which facts get updates (40%) and contradictions (10%) - disjoint draws - // from a shuffled index list so fractions are exact, not stochastic. + // Which facts get updates (40%) and contradictions (10%). Fractions are + // exact (shuffled-slice draws, not stochastic) and the two sets are + // DISJOINT: contradictions sample from the NON-updated remainder, so + // contradiction-intrusion is never confounded with version chains + // (codex P2: the earlier draw re-sampled the full list and could overlap). const order = shuffle(rand, Array.from({ length: numFacts }, (_, i) => i)); const updateSet = new Set(order.slice(0, Math.round(numFacts * 0.4))); - const contraSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.1))); + const nonUpdated = order.filter((i) => !updateSet.has(i)); + const contraSet = new Set(shuffle(rand, nonUpdated).slice(0, Math.round(numFacts * 0.1))); // Traps: 15% of facts get one plausible-but-wrong memory that receives --bad. const trapSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.15))); diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs index 4dca5ae..cda1d76 100644 --- a/scripts/e1-lifecycle/run.mjs +++ b/scripts/e1-lifecycle/run.mjs @@ -192,8 +192,14 @@ export async function runArmSeed(arm, seed, genOpts = {}, inspect = undefined) { setSimulatedNow(session.date); // mutators stamp simulated time // 1. Ingest this session's memories (created/last_retrieved = fake now). 
+ // Entry ids are DERIVED from (seed, protocol id), not random UUIDs: + // the protocol intentionally creates score TIES (identical-form + // negatives), and same-timestamp rows order by id - random ids would + // make identical (arm, seed) runs produce different top-5 metrics + // (codex P1). sha256 prefix keeps the mem_<12 hex> format. for (const m of bySession.get(session.index) ?? []) { const entry = createMemory(m.content); + entry.id = `mem_${createHash('sha256').update(`e1:${seed}:${m.id}`).digest('hex').slice(0, 12)}`; writeEntry(hippoRoot, entry); idMap.set(m.id, entry.id); } diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts index 48cb626..018b53c 100644 --- a/tests/e1-harness.test.ts +++ b/tests/e1-harness.test.ts @@ -49,6 +49,10 @@ describe('E1 generator', () => { const updatedFacts = new Set(p.memories.filter((m: any) => m.kind === 'update').map((m: any) => m.factId)); expect(updatedFacts.size).toBe(20); expect(p.memories.filter((m: any) => m.kind === 'contradiction').length).toBe(5); + // Updates and contradictions are DISJOINT (codex P2): a contradicted fact + // never has a version chain, so contradiction-intrusion is unconfounded. + const contraFacts = new Set(p.memories.filter((m: any) => m.kind === 'contradiction').map((m: any) => m.factId)); + for (const cf of contraFacts) expect(updatedFacts.has(cf)).toBe(false); // Hard-negative pool >= 10x facts. expect(p.meta.counts.distractors).toBeGreaterThanOrEqual(500); // Version timelines strictly ordered; updates strictly after v1. @@ -119,6 +123,17 @@ describe('E1 driver', () => { for (const v of ABLATION_ENV_VARS) expect(process.env[v]).toBeUndefined(); }); + it('is REPRODUCIBLE: identical (arm, seed) runs produce identical metrics (codex P1)', async () => { + // The protocol intentionally creates score ties (identical-form + // negatives); with random entry UUIDs, tie order differed across runs of + // the same seed. 
Entry ids are now derived from (seed, protocol id) - + // two full runs must agree to the byte. + const a = await runArmSeed('full', 11, TINY); + const b = await runArmSeed('full', 11, TINY); + expect(JSON.stringify(a.epochs)).toBe(JSON.stringify(b.epochs)); + expect(a.meta.protocolHash).toBe(b.meta.protocolHash); + }); + it('baseline arms produce rankings (bm25-static + recency-window)', async () => { const bm25 = await runArmSeed('bm25-static', 4, TINY); const rec = await runArmSeed('recency-window', 4, TINY); From de285f3e5fba26f7e05ae15ab3e5920a3ec220de Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:32:36 +0100 Subject: [PATCH 4/5] fix(eval): content-blind bm25-static tie-break + pretest build prerequisite codex round 2 on the E1 harness: 1. The bm25-static baseline re-sorted hybridSearch results by raw BM25 with a bare stable sort, so identical-BM25 candidates (which this protocol deliberately mass-produces) kept the composite lifecycle/recency-tinged ordering - the baseline was not actually BM25-only at ties. Tie-break now by entry id (seed-derived, deterministic, content-blind). 2. tests/e1-harness.test.ts imports the harness, which imports ../../dist/ - a clean-checkout `npm test` (vitest without build) failed before tests ran. Added "pretest": "npm run build" so npm test always tests built code; CI already builds first (ci.yml). 8 invariant tests green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- package.json | 1 + scripts/e1-lifecycle/run.mjs | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index fc1a2a2..d9c088c 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "baseline": "npm run build && npm test", "build": "tsc && tsc -p tsconfig.benchmarks.json", "dev": "tsc --watch", + "pretest": "npm run build", "test": "vitest run", "test:watch": "vitest", "postinstall": "node scripts/postinstall.cjs", diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs index cda1d76..ec4e871 100644 --- a/scripts/e1-lifecycle/run.mjs +++ b/scripts/e1-lifecycle/run.mjs @@ -104,8 +104,12 @@ async function probeEpoch(protocol, entries, epoch, epochDate, arm) { top = entries.slice().sort((a, b) => Date.parse(b.created) - Date.parse(a.created)).slice(0, PROBE_TOP_K); } else { const results = await hybridSearch(probe.query, entries, { budget: PROBE_BUDGET, now: probeNow, minResults: PROBE_TOP_K }); + // bm25-static tie-break MUST be independent of the composite order: a + // bare stable sort would let identical-BM25 candidates (this protocol + // creates many) keep hybridSearch's lifecycle/recency-tinged ordering + // (codex P2). Entry id is deterministic (seed-derived) and content-blind. const ranked = arm === 'bm25-static' - ? results.slice().sort((a, b) => b.bm25 - a.bm25) + ? 
results.slice().sort((a, b) => (b.bm25 - a.bm25) || a.entry.id.localeCompare(b.entry.id)) : results; top = ranked.slice(0, PROBE_TOP_K).map((r) => r.entry); } From a6eafcbe50199eaee729f74066fc0449fa03e34c Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:42:02 +0100 Subject: [PATCH 5/5] fix(eval): reproducibility-test timeout + gitignore E1 raw outputs (codex round 3) - 60s explicit timeout on the two-full-run reproducibility test (two real SQLite store runs sit at ~4.8s, brushing vitest's 5s default - a cold-CI flake waiting to happen). - benchmarks/e1-lifecycle/raw/ gitignored: harness outputs are generated; the REGISTERED results are committed to the hippo-paper artifact repo per the prereg, not to this code repo. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 1 + tests/e1-harness.test.ts | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c34da01..41a7900 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,4 @@ tmp-card4-20seed/ trajectories/ benchmarks/lifecycle-stress/results-2*.json benchmarks/lifecycle-stress/labels/ +benchmarks/e1-lifecycle/raw/ diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts index 018b53c..f92cf35 100644 --- a/tests/e1-harness.test.ts +++ b/tests/e1-harness.test.ts @@ -132,7 +132,7 @@ describe('E1 driver', () => { const b = await runArmSeed('full', 11, TINY); expect(JSON.stringify(a.epochs)).toBe(JSON.stringify(b.epochs)); expect(a.meta.protocolHash).toBe(b.meta.protocolHash); - }); + }, 60_000); // two full real-store runs; default 5s timeout is a CI flake (codex P2) it('baseline arms produce rankings (bm25-static + recency-window)', async () => { const bm25 = await runArmSeed('bm25-static', 4, TINY);