From 5a7b7ae8581ef7be0351df5fc600976b5e714816 Mon Sep 17 00:00:00 2001
From: Keith So <68618199+kitfunso@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:04:34 +0100
Subject: [PATCH 1/5] feat(eval): E1 longitudinal lifecycle harness (generator
 + driver + invariants)

The pre-registered E1 protocol (hippo-paper PREREGISTERED-DESIGN.md v0.2+A1):

- scripts/e1-lifecycle/generate.mjs: deterministic protocol generator
  (mulberry32; same seed = byte-identical). 300 version-chained facts, exact
  40% updates / 10% contradictions / 15% traps via shuffled-slice draws,
  5 hard-negative families (incl. the pilot-1 both-token rescale), scheduled
  retrievals (hot = top quartile), outcome schedule on EXPLICIT memory refs.
- scripts/e1-lifecycle/run.mjs: hermetic driver. Fresh temp store per
  (arm x seed); simulated weekly clock via HIPPO_FAKE_NOW + cache reset;
  CLI-parity mutation block (hybridSearch -> markRetrieved -> persistence
  gated by isRecallBoostAblated, identical to cmdRecall); outcomes applied to
  explicit protocol-mapped ids; READ-ONLY probes per epoch (explicit now, no
  write-back) computing current-version R@5, MRR, stale-intrusion,
  trap-persistence, contradiction-intrusion, hot-fact R@5. 7 arms wired.
  No sleep (E3's dimension), no embeddings (orthogonal to the mechanisms).
- tests/e1-harness.test.ts: 7 invariant gates - generator determinism +
  exact fractions, simulated-time stamping (no real-clock leakage),
  strengthen-off isolation WITH outcome attribution intact, probe
  read-only-ness, driver env hygiene, baseline arms.
- benchmarks/e1-lifecycle/PILOT-NOTES.md: pilot 1 (saturation guard fired on
  all-off, >0.90 on three primaries) + the recorded rescale.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/e1-lifecycle/PILOT-NOTES.md |  37 +++
 scripts/e1-lifecycle/generate.mjs      | 323 +++++++++++++++++++++++++
 scripts/e1-lifecycle/run.mjs           | 288 ++++++++++++++++++++++
 tests/e1-harness.test.ts               | 128 ++++++++++
 4 files changed, 776 insertions(+)
 create mode 100644 benchmarks/e1-lifecycle/PILOT-NOTES.md
 create mode 100644 scripts/e1-lifecycle/generate.mjs
 create mode 100644 scripts/e1-lifecycle/run.mjs
 create mode 100644 tests/e1-harness.test.ts

diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md
new file mode 100644
index 0000000..aeccd2a
--- /dev/null
+++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md
@@ -0,0 +1,37 @@
+# E1 pilot notes (pre-freeze; design v0.2 saturation-guard clause)
+
+Per PREREGISTERED-DESIGN.md section 3 E1: pilot-stage rescales are allowed BEFORE the
+`e1-generator-freeze` tag, must be recorded here with their trigger, and stop being legal
+the moment the tag exists.
+
+## Pilot 1 — 2026-06-11, generator 1.0.0 (4 negative families), seed 1, 300 facts / 20 sessions / 10x distractors
+
+| final epoch | full | all-off | guard [0.10, 0.90] |
+|---|---|---|---|
+| current R@5 | 0.333 | 0.943 | all-off VIOLATES (>0.90) |
+| stale-intrusion | 0.600 | 0.975 | all-off VIOLATES |
+| trap-persistence | 0.089 | 0.933 | all-off VIOLATES |
+| hot R@5 | 0.435 | 0.930 | all-off VIOLATES |
+
+Wall-clock: full 206.8s, all-off 104.8s per (arm x seed). Full registered matrix
+(7 arms x 20 seeds) projects to ~6-7h serial, local, $0.
+
+**Diagnosis:** the probe query ("entity attribute") collides on BOTH tokens with only
+~4-6 documents per fact (versions + contradiction + trap + paraphrase-negative). With
+top-5 slots, a static BM25 ranking can hold essentially all of them - the all-off
+baseline saturates (everyone gets an A; the exam differentiates nothing).
+
+**Directional preview (not registered results):** mechanisms differentiate strongly and
+in OPPOSITE directions - outcome feedback buries traps (0.089 vs 0.933); decay appears
+to sacrifice old-but-current facts to fresher distractors (full current R@5 0.333 vs
+0.943; 7-day default half-life across a 20-week horizon floors every old memory's
+strength multiplier at 0.5x while fresh noise keeps ~1.0x). If this holds under the
+rescaled protocol and the registered arms, the decomposition (which mechanism helps,
+which hurts, at what horizon) is the paper's core result. H2's pivot criteria adjudicate.
+
+**Rescale applied (generator 1.0.0 -> same version pre-freeze, family count 4 -> 5):**
+added a BOTH-token hard-negative family (same entity AND attribute in a meta/process
+sentence carrying its own NEG token), so every fact now has more both-token colliders
+than top-5 slots and the static baseline must actually rank. No other change.
+
+## Pilot 2 — pending (rescaled generator)
diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs
new file mode 100644
index 0000000..f951c86
--- /dev/null
+++ b/scripts/e1-lifecycle/generate.mjs
@@ -0,0 +1,323 @@
+#!/usr/bin/env node
+/**
+ * E1 longitudinal lifecycle protocol — deterministic GENERATOR.
+ *
+ * Emits the full pre-registered protocol for one seed (PREREGISTERED-DESIGN.md
+ * v0.2 + A1, hippo-paper repo): K simulated sessions over calendar time, N
+ * version-chained facts (40% receive v2/v3 updates, 10% receive
+ * contradictions), HARD negatives only (same-entity/different-attribute,
+ * paraphrases of superseded versions, temporal near-misses,
+ * contradiction-lookalikes, both-token meta mentions), a scheduled-retrieval
+ * plan (drives strengthening; hot = top quartile by scheduled recalls), and an
+ * outcome schedule with EXPLICIT memory references (never last_retrieval_ids
+ * - the strengthen-off arm must keep outcome attribution working) including
+ * plausible-but-wrong TRAP memories that receive --bad.
+ *
+ * Determinism contract (inherited from scripts/lifecycle-stress/inject.mjs):
+ * a seeded mulberry32 PRNG drives every choice; NO Math.random, NO Date.now
+ * in the content path. Same seed => byte-identical protocol JSON. This file
+ * is tagged `e1-generator-freeze` BEFORE any ablation arm is run (anti-bias
+ * commitment, design rev #4); changes after the tag = amendment + full re-run.
+ *
+ * Value tokens are opaque (VAL<fact>X<version>D<digits>) so scoring is by
+ * token containment, never by memory id, and a probe can detect WHICH
+ * version surfaced (current vs superseded vs contradiction vs trap).
+ *
+ * Run standalone:  node scripts/e1-lifecycle/generate.mjs --seed 1 [--facts 300] [--sessions 20]
+ */
+
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { mulberry32 } from '../lifecycle-stress/inject.mjs';
+
+export const GENERATOR_VERSION = '1.0.0';
+
+// --------------------------------------------------------------------------
+// Vocabulary pools. Entities are HEAD x UNIT compounds (24 x 12 = 288); with
+// the 16 attributes that gives 4608 candidate (entity, attribute) pairs. Each
+// fact draws a UNIQUE pair, though an entity may recur under another attribute.
+// --------------------------------------------------------------------------
+
+const ENTITY_HEADS = [
+  'Falcon', 'Atlas', 'Nova', 'Orion', 'Pegasus', 'Vega', 'Lyra', 'Draco',
+  'Cygnus', 'Hydra', 'Phoenix', 'Cobra', 'Tucana', 'Mensa', 'Carina', 'Lynx',
+  'Aquila', 'Corvus', 'Dorado', 'Fornax', 'Gemini', 'Indus', 'Pavo', 'Volans',
+];
+const ENTITY_UNITS = [
+  'pipeline', 'gateway', 'cluster', 'registry', 'scheduler', 'archive',
+  'console', 'exporter', 'replicator', 'balancer', 'notifier', 'indexer',
+];
+const ATTRIBUTES = [
+  'deadline', 'budget cap', 'owner', 'vendor contract', 'rollout window',
+  'staffing plan', 'compliance review date', 'migration target',
+  'pricing tier', 'incident contact', 'launch gate', 'security sign-off',
+  'retention period', 'latency budget', 'backup cadence', 'capacity ceiling',
+];
+// Connectives vary surface form between versions/paraphrases (low-signal words).
+const CONNECTIVES = ['is now', 'has been set to', 'was confirmed as', 'stands at', 'moved to'];
+const PARAPHRASE_LEADS = [
+  'For the record,', 'As noted earlier,', 'Per the old thread,', 'Reminder:',
+];
+const NEARMISS_QUALIFIERS = [
+  'tentatively pencilled near', 'rumoured to be around', 'once floated as',
+  'informally guessed at',
+];
+
+/** Opaque value token `VAL<factIdx>X<version>D<dddd>`; the 4 digits consume one PRNG draw. */
+function valueToken(rand, factIdx, version) {
+  const drawn = Math.floor(rand() * 10000);
+  return `VAL${factIdx}X${version}D${String(drawn).padStart(4, '0')}`;
+}
+
+function pick(rand, arr) { // uniform draw from arr; consumes exactly one PRNG value
+  return arr[Math.floor(arr.length * rand())];
+}
+
+/** Seeded Fisher-Yates: returns a shuffled copy and leaves `arr` untouched. */
+function shuffle(rand, arr) {
+  const out = arr.slice();
+  for (let hi = out.length - 1; hi >= 1; hi -= 1) {
+    const lo = Math.floor(rand() * (hi + 1));
+    [out[lo], out[hi]] = [out[hi], out[lo]];
+  }
+  return out;
+}
+
+// --------------------------------------------------------------------------
+// Generator
+// --------------------------------------------------------------------------
+
+/** Build the full E1 protocol for one seed; every random choice is drawn from one mulberry32(seed) stream.
+ * @param {object} opts
+ * @param {number} opts.seed
+ * @param {number} [opts.numFacts=300]
+ * @param {number} [opts.numSessions=20]
+ * @param {number} [opts.distractorMultiple=10]  hard negatives >= multiple * facts
+ * @param {string} [opts.baseDate='2025-01-06T09:00:00.000Z']
+ * @param {number} [opts.sessionIntervalDays=7]
+ * @returns {object} { meta, sessions, memories, retrievalSchedule, outcomeSchedule, probes } */
+export function generateProtocol(opts) {
+  const seed = opts.seed >>> 0;
+  const numFacts = opts.numFacts ?? 300;
+  const numSessions = opts.numSessions ?? 20;
+  const distractorMultiple = opts.distractorMultiple ?? 10;
+  const baseDate = opts.baseDate ?? '2025-01-06T09:00:00.000Z';
+  const sessionIntervalDays = opts.sessionIntervalDays ?? 7;
+  const rand = mulberry32(seed);
+
+  // Sessions with simulated dates (weekly cadence by default).
+  const baseMs = Date.parse(baseDate);
+  const sessions = Array.from({ length: numSessions }, (_, i) => ({
+    index: i,
+    date: new Date(baseMs + i * sessionIntervalDays * 24 * 60 * 60 * 1000).toISOString(),
+  }));
+
+  // Unique (entity, attribute) per fact.
+  const pairs = [];
+  for (const h of ENTITY_HEADS) for (const u of ENTITY_UNITS) for (const a of ATTRIBUTES) {
+    pairs.push({ entity: `${h} ${u}`, attribute: a });
+  }
+  if (pairs.length < numFacts) throw new Error(`vocab too small: ${pairs.length} < ${numFacts}`);
+  const chosen = shuffle(rand, pairs).slice(0, numFacts);
+
+  const memories = []; // { id, session, kind, factId, version, content, token }
+  const probes = [];   // { factId, query, versionTimeline, tokens, trapTokens, contraTokens, hot }
+  const retrievalSchedule = []; // { session, query, factId }
+  const outcomeSchedule = [];   // { session, memoryRef, good }
+  let memSeq = 0;
+  const mid = () => `pm${memSeq++}`;
+
+  // Which facts get updates (40%) and contradictions (10%): each set is a fixed-size
+  // slice of a freshly shuffled index list, so sizes are exact but the sets may overlap.
+  const order = shuffle(rand, Array.from({ length: numFacts }, (_, i) => i));
+  const updateSet = new Set(order.slice(0, Math.round(numFacts * 0.4)));
+  const contraSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.1)));
+  // Traps: 15% of facts get one plausible-but-wrong memory that receives --bad.
+  const trapSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.15)));
+
+  // Recall frequency per fact (0..6, seeded). Hot = at/above the top-quartile cutoff (ties included).
+  const recallCounts = chosen.map(() => Math.floor(rand() * 7));
+  const sortedCounts = recallCounts.slice().sort((a, b) => b - a);
+  const hotThreshold = sortedCounts[Math.max(0, Math.floor(numFacts / 4) - 1)];
+
+  for (let f = 0; f < numFacts; f++) {
+    const { entity, attribute } = chosen[f];
+    const factId = `F${f}`;
+    // v1 lands in the first 60% of sessions so updates have room afterwards.
+    const s1 = Math.floor(rand() * Math.max(1, Math.floor(numSessions * 0.6)));
+    const tok1 = valueToken(rand, f, 1);
+    const tokens = { 1: tok1 };
+    const versionTimeline = [{ session: s1, version: 1 }];
+    memories.push({
+      id: mid(), session: s1, kind: 'fact', factId, version: 1,
+      content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok1}.`,
+      token: tok1,
+    });
+
+    // Version chain: v2 (and possibly v3) at strictly later sessions (s2 is unclamped; in range for numSessions >= 2).
+    if (updateSet.has(f)) {
+      const s2 = s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 2));
+      const tok2 = valueToken(rand, f, 2);
+      tokens[2] = tok2;
+      versionTimeline.push({ session: s2, version: 2 });
+      memories.push({
+        id: mid(), session: s2, kind: 'update', factId, version: 2,
+        content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok2}.`,
+        token: tok2,
+      });
+      if (rand() < 0.3 && s2 + 1 < numSessions) {
+        const s3 = s2 + 1 + Math.floor(rand() * Math.max(1, numSessions - s2 - 2));
+        const tok3 = valueToken(rand, f, 3);
+        tokens[3] = tok3;
+        versionTimeline.push({ session: Math.min(s3, numSessions - 1), version: 3 });
+        memories.push({
+          id: mid(), session: Math.min(s3, numSessions - 1), kind: 'update', factId, version: 3,
+          content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${tok3}.`,
+          token: tok3,
+        });
+      }
+    }
+
+    // Contradiction: asserts a WRONG value at a later session; never becomes
+    // current truth (adversarial noise; secondary intrusion metric).
+    const contraTokens = [];
+    if (contraSet.has(f)) {
+      const sc = Math.min(numSessions - 1, s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 1)));
+      const ctok = `CONTRA${f}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`;
+      contraTokens.push(ctok);
+      memories.push({
+        id: mid(), session: sc, kind: 'contradiction', factId, version: null,
+        content: `Heard in passing that ${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ctok}.`,
+        token: ctok,
+      });
+    }
+
+    // Trap: plausible-but-wrong memory that the outcome schedule marks --bad
+    // (twice). Trap-persistence = its token still in top-5 afterwards.
+    const trapTokens = [];
+    if (trapSet.has(f)) {
+      const st = Math.min(numSessions - 1, s1 + Math.floor(rand() * Math.max(1, numSessions - s1)));
+      const ttok = `TRAP${f}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`;
+      trapTokens.push(ttok);
+      const trapId = mid();
+      memories.push({
+        id: trapId, session: st, kind: 'trap', factId, version: null,
+        content: `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ttok}.`,
+        token: ttok,
+      });
+      for (let b = 0; b < 2; b++) {
+        const so = Math.min(numSessions - 1, st + 1 + b);
+        outcomeSchedule.push({ session: so, memoryRef: trapId, good: false });
+      }
+    }
+
+    // Scheduled mutating recalls (strengthening drivers), spread over sessions
+    // AFTER the fact first exists.
+    const query = `${entity} ${attribute}`;
+    const hot = recallCounts[f] >= hotThreshold && recallCounts[f] > 0;
+    for (let r = 0; r < recallCounts[f]; r++) {
+      const sr = Math.min(numSessions - 1, s1 + 1 + Math.floor(rand() * Math.max(1, numSessions - s1 - 1)));
+      retrievalSchedule.push({ session: sr, query, factId });
+    }
+
+    // Positive outcomes: each fact's v1 memory gets one --good with probability 0.3 (stochastic, not exact).
+    if (rand() < 0.3) {
+      const v1Mem = memories.find((m) => m.factId === factId && m.version === 1);
+      const so = Math.min(numSessions - 1, s1 + 1);
+      outcomeSchedule.push({ session: so, memoryRef: v1Mem.id, good: true });
+    }
+
+    probes.push({ factId, query, versionTimeline, tokens, contraTokens, trapTokens, hot });
+  }
+
+  // ------------------------------------------------------------------------
+  // Hard negatives (>= distractorMultiple x facts), five template families.
+  // All but the stale-paraphrase family carry their own opaque NEG token.
+  // ------------------------------------------------------------------------
+  const negPerFact = distractorMultiple;
+  for (let f = 0; f < numFacts; f++) {
+    const { entity, attribute } = chosen[f];
+    const probe = probes[f];
+    for (let n = 0; n < negPerFact; n++) {
+      const family = n % 5;
+      const sn = Math.floor(rand() * numSessions); // any session; independent of the fact's own timeline
+      const ntok = `NEG${f}N${n}D${String(Math.floor(rand() * 10000)).padStart(4, '0')}`; // drawn for every family
+      let content;
+      if (family === 0) {
+        // same-entity / DIFFERENT-attribute
+        const otherAttr = pick(rand, ATTRIBUTES.filter((a) => a !== attribute));
+        content = `${entity} ${otherAttr} ${pick(rand, CONNECTIVES)} ${ntok}.`;
+      } else if (family === 1 && probe.tokens[2]) {
+        // paraphrase of a SUPERSEDED version, updated facts only (others fall to the
+        // final else). Carries the OLD v1 token: post-update it is stale-intrusion.
+        content = `${pick(rand, PARAPHRASE_LEADS)} ${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${probe.tokens[1]}.`;
+      } else if (family === 2) {
+        // temporal near-miss
+        content = `${entity} ${attribute} ${pick(rand, NEARMISS_QUALIFIERS)} ${ntok}.`;
+      } else if (family === 3) {
+        // contradiction-lookalike (hedged phrasing, own token)
+        content = `Unconfirmed: ${entity} ${attribute} might be ${ntok}, pending review.`;
+      } else {
+        // BOTH-token mention (pilot rescale, 2026-06-11): same entity AND
+        // attribute in a process/meta sentence with its own token. These
+        // collide with the probe query on both tokens, so a static top-5
+        // cannot hold every same-fact document - de-saturates the all-off
+        // baseline (pilot showed currentR5 0.943 > 0.90 guard).
+        content = `Review of ${entity} ${attribute} noted in minutes ${ntok}; decision log pending.`;
+      }
+      memories.push({
+        id: mid(), session: sn, kind: 'distractor', factId: null, version: null,
+        content, token: content.includes(probe.tokens[1]) ? probe.tokens[1] : ntok,
+      });
+    }
+  }
+
+  // Stable ordering within each session: by id sequence (already insertion-
+  // ordered); the driver ingests session-by-session.
+  retrievalSchedule.sort((a, b) => a.session - b.session || a.factId.localeCompare(b.factId));
+  outcomeSchedule.sort((a, b) => a.session - b.session || a.memoryRef.localeCompare(b.memoryRef));
+
+  return {
+    meta: {
+      generatorVersion: GENERATOR_VERSION, seed, numFacts, numSessions,
+      distractorMultiple, baseDate, sessionIntervalDays,
+      counts: {
+        memories: memories.length,
+        updates: memories.filter((m) => m.kind === 'update').length,
+        contradictions: memories.filter((m) => m.kind === 'contradiction').length,
+        traps: memories.filter((m) => m.kind === 'trap').length,
+        distractors: memories.filter((m) => m.kind === 'distractor').length,
+        scheduledRecalls: retrievalSchedule.length,
+        outcomes: outcomeSchedule.length,
+        hotFacts: probes.filter((p) => p.hot).length,
+      },
+    },
+    sessions, memories, retrievalSchedule, outcomeSchedule, probes,
+  };
+}
+
+// --------------------------------------------------------------------------
+// CLI (numeric flags are validated: a NaN seed would otherwise coerce to 0)
+// --------------------------------------------------------------------------
+const isMain = process.argv[1] && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);
+if (isMain) {
+  const args = process.argv.slice(2);
+  const getArg = (name, dflt) => {
+    const i = args.indexOf(`--${name}`);
+    return i >= 0 ? args[i + 1] : dflt;
+  };
+  const getInt = (name, dflt) => {
+    const raw = String(getArg(name, dflt)).trim();
+    if (!/^\d+$/.test(raw)) throw new Error(`--${name} expects a non-negative integer, got "${raw}"`);
+    return Number(raw);
+  };
+  const seed = getInt('seed', '1');
+  const sizes = { numFacts: getInt('facts', '300'), numSessions: getInt('sessions', '20') };
+  const protocol = generateProtocol({ seed, ...sizes, distractorMultiple: getInt('distractors', '10') });
+  const outDir = getArg('out', path.join(path.dirname(fileURLToPath(import.meta.url)), 'protocols'));
+  fs.mkdirSync(outDir, { recursive: true });
+  const outFile = path.join(outDir, `protocol-seed${seed}.json`);
+  fs.writeFileSync(outFile, JSON.stringify(protocol, null, 1), 'utf8');
+  console.log(`wrote ${outFile}\n${JSON.stringify(protocol.meta, null, 2)}`);
+}
diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs
new file mode 100644
index 0000000..4dca5ae
--- /dev/null
+++ b/scripts/e1-lifecycle/run.mjs
@@ -0,0 +1,288 @@
+#!/usr/bin/env node
+/**
+ * E1 longitudinal lifecycle protocol — the DRIVER.
+ *
+ * Executes the pre-registered protocol (PREREGISTERED-DESIGN.md v0.2 + A1)
+ * for each requested (arm x seed): builds a HERMETIC fresh store (temp
+ * HIPPO_HOME, cleaned up after), walks the K simulated sessions in order,
+ * and probes READ-ONLY after every session.
+ *
+ * Mutation/measurement split (design rev #3):
+ *   - MUTATORS (the only state writers): session ingestion
+ *     (createMemory+writeEntry under HIPPO_FAKE_NOW), scheduled retrievals
+ *     (hybridSearch -> markRetrieved -> persistence gated EXACTLY like the
+ *     CLI: skipped when isRecallBoostAblated()), and outcome applications
+ *     (applyOutcome+writeEntry on EXPLICIT protocol-mapped ids - never
+ *     last_retrieval_ids).
+ *   - PROBES are in-process hybridSearch calls with an explicit `now`; no
+ *     markRetrieved, no writes. (retrieve_inprocess.mjs pattern.)
+ *
+ * Simulated time: HIPPO_FAKE_NOW env + _resetAblationCacheForTests() per
+ * session (in-process equivalent of one process per session). createMemory /
+ * markRetrieved / scoring all honor it via evalNow().
+ *
+ * Arms (env per design section 3; baselines rank differently in the prober):
+ *   full            no flags
+ *   decay-off       HIPPO_ABLATE_DECAY=1            (A1: co-ablates outcome-slow + read-side boost)
+ *   strengthen-off  HIPPO_ABLATE_RECALL_BOOST=1
+ *   outcome-off     HIPPO_ABLATE_OUTCOME=1
+ *   all-off         all three
+ *   bm25-static     all three + probe ranks by raw BM25 component only
+ *   recency-window  all three + probe returns the 5 newest entries
+ *
+ * NO sleep/consolidation in E1 (that is E3's dimension); no embeddings (the
+ * lexical+lifecycle composite exercises every mechanism under test; the
+ * embedding blend is orthogonal to the ablations).
+ *
+ * Run (pilot):  node scripts/e1-lifecycle/run.mjs --arms full,all-off --seeds 1 --facts 300
+ */
+
+import * as fs from 'node:fs';
+import * as os from 'node:os';
+import * as path from 'node:path';
+import { createHash } from 'node:crypto';
+import { fileURLToPath } from 'node:url';
+
+import { createMemory, applyOutcome } from '../../dist/memory.js';
+import { writeEntry, loadAllEntries, initStore } from '../../dist/store.js';
+import { hybridSearch, markRetrieved } from '../../dist/search.js';
+import { isRecallBoostAblated, _resetAblationCacheForTests } from '../../dist/ablation.js';
+import { generateProtocol, GENERATOR_VERSION } from './generate.mjs';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url));
+const REPO = path.resolve(HERE, '..', '..');
+const OUT_DIR = path.join(REPO, 'benchmarks', 'e1-lifecycle', 'raw');
+
+// Env flags per arm; both baseline arms reuse the all-off flags (they differ only in how probes rank).
+const ALL_OFF = { HIPPO_ABLATE_DECAY: '1', HIPPO_ABLATE_RECALL_BOOST: '1', HIPPO_ABLATE_OUTCOME: '1' };
+const ARM_ENV = {
+  'full': {},
+  'decay-off': { HIPPO_ABLATE_DECAY: '1' },
+  'strengthen-off': { HIPPO_ABLATE_RECALL_BOOST: '1' },
+  'outcome-off': { HIPPO_ABLATE_OUTCOME: '1' },
+  'all-off': { ...ALL_OFF }, 'bm25-static': { ...ALL_OFF }, 'recency-window': { ...ALL_OFF },
+};
+const ABLATION_VARS = ['HIPPO_ABLATE_DECAY', 'HIPPO_ABLATE_RECALL_BOOST', 'HIPPO_ABLATE_OUTCOME', 'HIPPO_ABLATE_OUTCOME_SLOW', 'HIPPO_ABLATE_OUTCOME_FAST', 'HIPPO_FAKE_NOW'];
+
+function setArmEnv(arm) { // clear every ablation/clock var, apply the arm's flags, reset the ablation cache
+  ABLATION_VARS.forEach((name) => { delete process.env[name]; });
+  Object.entries(ARM_ENV[arm]).forEach(([key, value]) => { process.env[key] = value; });
+  _resetAblationCacheForTests();
+}
+
+function setSimulatedNow(iso) { // sets HIPPO_FAKE_NOW, then resets the ablation cache
+  Object.assign(process.env, { HIPPO_FAKE_NOW: iso });
+  _resetAblationCacheForTests();
+}
+
+/**
+ * Latest timeline step ({ session, version }) whose session is <= epoch, or
+ * null when the fact is not live yet. Returns the step itself, not a token.
+ */
+function currentAt(probe, epoch) {
+  const live = probe.versionTimeline.filter((step) => step.session <= epoch);
+  return live.length > 0 ? live[live.length - 1] : null;
+}
+
+const PROBE_TOP_K = 5;
+const PROBE_BUDGET = 100000; // token budget never binds at top-5 granularity
+
+/** Read-only probe pass for one epoch; resolves to that epoch's metrics row (a rate is null when nothing is eligible). */
+async function probeEpoch(protocol, entries, epoch, epochDate, arm) {
+  const probeNow = new Date(Date.parse(epochDate) + 60 * 60 * 1000); // +1h after session
+  let active = 0, current5 = 0, staleEligible = 0, staleHit = 0;
+  let trapEligible = 0, trapHit = 0, contraEligible = 0, contraHit = 0;
+  let hotActive = 0, hotCurrent5 = 0, mrrSum = 0;
+  // The recency baseline ignores the query, so its top-K is identical for every probe: sort once per epoch.
+  const recencyTop = arm !== 'recency-window' ? null
+    : entries.slice().sort((a, b) => Date.parse(b.created) - Date.parse(a.created)).slice(0, PROBE_TOP_K);
+  for (const probe of protocol.probes) {
+    const cur = currentAt(probe, epoch);
+    if (!cur) continue;
+    active++;
+    if (probe.hot) hotActive++;
+    let top = recencyTop;
+    if (top === null) {
+      const results = await hybridSearch(probe.query, entries, { budget: PROBE_BUDGET, now: probeNow, minResults: PROBE_TOP_K });
+      const ranked = arm === 'bm25-static'
+        ? results.slice().sort((a, b) => b.bm25 - a.bm25)
+        : results;
+      top = ranked.slice(0, PROBE_TOP_K).map((r) => r.entry);
+    }
+    const texts = top.map((e) => e.content);
+    const curTok = probe.tokens[cur.version];
+    const rank = texts.findIndex((t) => t.includes(curTok));
+    if (rank >= 0) {
+      current5++;
+      if (probe.hot) hotCurrent5++;
+      mrrSum += 1 / (rank + 1);
+    }
+    // Stale intrusion: only meaningful once an update has superseded v1.
+    if (cur.version >= 2) {
+      staleEligible++;
+      const staleToks = Object.entries(probe.tokens)
+        .filter(([v]) => Number(v) < cur.version).map(([, t]) => t);
+      if (texts.some((t) => staleToks.some((s) => t.includes(s)))) staleHit++;
+    }
+    if (probe.trapTokens.length > 0) {
+      // Eligible once the trap memory exists in the store.
+      const trapLive = entries.some((e) => probe.trapTokens.some((t) => e.content.includes(t)));
+      if (trapLive) {
+        trapEligible++;
+        if (texts.some((t) => probe.trapTokens.some((tt) => t.includes(tt)))) trapHit++;
+      }
+    }
+    if (probe.contraTokens.length > 0) {
+      const contraLive = entries.some((e) => probe.contraTokens.some((t) => e.content.includes(t)));
+      if (contraLive) {
+        contraEligible++;
+        if (texts.some((t) => probe.contraTokens.some((ct) => t.includes(ct)))) contraHit++;
+      }
+    }
+  }
+
+  return {
+    epoch, activeProbes: active,
+    currentR5: active > 0 ? current5 / active : null,
+    mrr: active > 0 ? mrrSum / active : null,
+    staleEligible, staleIntrusionRate: staleEligible > 0 ? staleHit / staleEligible : null,
+    trapEligible, trapPersistenceRate: trapEligible > 0 ? trapHit / trapEligible : null,
+    contraEligible, contraIntrusionRate: contraEligible > 0 ? contraHit / contraEligible : null,
+    hotActive, hotR5: hotActive > 0 ? hotCurrent5 / hotActive : null,
+  };
+}
+
+/** Bucket schedule items by their `session` index, preserving input order. */
+function groupBySession(items) {
+  const buckets = new Map();
+  for (const item of items) {
+    const bucket = buckets.get(item.session);
+    if (bucket) bucket.push(item);
+    else buckets.set(item.session, [item]);
+  }
+  return buckets;
+}
+
+/**
+ * Run one (arm x seed) cell against a throwaway store and return its per-epoch metrics.
+ * Ablation env vars and the temp store are scrubbed in `finally`, whether or not the run throws.
+ *
+ * @param {string} arm
+ * @param {number} seed
+ * @param {object} [genOpts]  forwarded to generateProtocol
+ * @param {(hippoRoot: string, idMap: Map<string,string>) => Promise<void>|void} [inspect]
+ *   test hook: called with the live store path AFTER the last epoch, BEFORE
+ *   cleanup (invariant tests assert on real DB state).
+ * @returns {Promise<{meta: object, epochs: object[]}>}
+ */
+export async function runArmSeed(arm, seed, genOpts = {}, inspect = undefined) {
+  const protocol = generateProtocol({ seed, ...genOpts });
+  const digest = createHash('sha256').update(JSON.stringify(protocol)).digest('hex');
+  const protocolHash = digest.slice(0, 16);
+
+  const hippoRoot = fs.mkdtempSync(path.join(os.tmpdir(), `hippo-e1-${arm}-s${seed}-`));
+  const epochs = [];
+  try {
+    setArmEnv(arm);
+    setSimulatedNow(protocol.sessions[0].date);
+    initStore(hippoRoot);
+
+    const idMap = new Map(); // protocol memory id -> hippo entry id
+    const memoriesBySession = groupBySession(protocol.memories);
+    const retrievalsBySession = groupBySession(protocol.retrievalSchedule);
+    const outcomesBySession = groupBySession(protocol.outcomeSchedule);
+
+    for (const { index, date } of protocol.sessions) {
+      setSimulatedNow(date); // mutators stamp simulated time
+
+      // 1. Ingest this session's memories (created/last_retrieved = fake now).
+      for (const memory of memoriesBySession.get(index) ?? []) {
+        const stored = createMemory(memory.content);
+        writeEntry(hippoRoot, stored);
+        idMap.set(memory.id, stored.id);
+      }
+
+      // 2. Scheduled mutating retrievals - CLI-parity block: hybridSearch ->
+      //    markRetrieved -> persistence gated exactly like cli.ts cmdRecall.
+      //    The store is reloaded for every retrieval.
+      for (const { query } of retrievalsBySession.get(index) ?? []) {
+        const hits = await hybridSearch(query, loadAllEntries(hippoRoot), { budget: PROBE_BUDGET, minResults: PROBE_TOP_K });
+        const touched = markRetrieved(hits.slice(0, PROBE_TOP_K).map((hit) => hit.entry)); // default now = evalNow() (fake)
+        if (isRecallBoostAblated()) continue;
+        for (const entry of touched) writeEntry(hippoRoot, entry);
+      }
+
+      // 3. Scheduled outcomes on EXPLICIT ids (never last_retrieval_ids).
+      for (const { memoryRef, good } of outcomesBySession.get(index) ?? []) {
+        const hippoId = idMap.get(memoryRef);
+        if (!hippoId) throw new Error(`outcome before ingestion: ${memoryRef} at session ${index}`);
+        const target = loadAllEntries(hippoRoot).find((e) => e.id === hippoId);
+        if (!target) throw new Error(`outcome target missing from store: ${hippoId}`);
+        writeEntry(hippoRoot, applyOutcome(target, good));
+      }
+
+      // 4. READ-ONLY probes (explicit now; no markRetrieved; no writes).
+      const snapshot = loadAllEntries(hippoRoot);
+      epochs.push(await probeEpoch(protocol, snapshot, index, date, arm));
+    }
+
+    if (inspect) await inspect(hippoRoot, idMap);
+  } finally {
+    // Leave no ablation flags, fake clock, or temp store behind.
+    ABLATION_VARS.forEach((name) => { delete process.env[name]; });
+    _resetAblationCacheForTests();
+    fs.rmSync(hippoRoot, { recursive: true, force: true });
+  }
+
+  const meta = {
+    arm, seed, generatorVersion: GENERATOR_VERSION, protocolHash,
+    protocolCounts: protocol.meta.counts, ranAt: new Date().toISOString(),
+  };
+  return { meta, epochs };
+}
+
+// --------------------------------------------------------------------------
+// CLI (arms, seeds and sizes are validated before any store is created)
+// --------------------------------------------------------------------------
+const isMain = process.argv[1] && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);
+if (isMain) {
+  const args = process.argv.slice(2);
+  const getArg = (name, dflt) => {
+    const i = args.indexOf(`--${name}`);
+    return i >= 0 ? args[i + 1] : dflt;
+  };
+  const fail = (msg) => { console.error(msg); process.exit(1); };
+  // A NaN seed would run as seed 0 (the generator applies `>>> 0`) yet be saved as `<arm>-seedNaN.json`.
+  const toInt = (name, raw) => (/^\d+$/.test(raw) ? Number(raw) : fail(`--${name}: expected a non-negative integer, got "${raw}"`));
+  const num = (name, dflt) => toInt(name, String(getArg(name, dflt)).trim());
+  const arms = String(getArg('arms', 'full,all-off')).split(',').map((s) => s.trim()).filter(Boolean);
+  const unknown = arms.find((arm) => !Object.keys(ARM_ENV).includes(arm)); // own keys only
+  if (unknown !== undefined) fail(`unknown arm: ${unknown}`);
+  const seeds = String(getArg('seeds', '1')).split(',').flatMap((part) => {
+    const m = part.trim().match(/^(\d+)-(\d+)$/);
+    if (!m) return [toInt('seeds', part.trim())];
+    const [lo, hi] = [Number(m[1]), Number(m[2])];
+    if (hi < lo) fail(`--seeds: empty range "${part.trim()}"`);
+    return Array.from({ length: hi - lo + 1 }, (_, i) => lo + i);
+  });
+  const sizes = { numFacts: num('facts', '300'), numSessions: num('sessions', '20') };
+  const genOpts = { ...sizes, distractorMultiple: num('distractors', '10') };
+  fs.mkdirSync(OUT_DIR, { recursive: true });
+  (async () => {
+    for (const arm of arms) {
+      for (const seed of seeds) {
+        const t0 = Date.now();
+        const result = await runArmSeed(arm, seed, genOpts);
+        const outFile = path.join(OUT_DIR, `${arm}-seed${seed}.json`);
+        fs.writeFileSync(outFile, JSON.stringify(result, null, 1), 'utf8');
+        const last = result.epochs[result.epochs.length - 1];
+        console.log(
+          `${arm} seed=${seed} done in ${((Date.now() - t0) / 1000).toFixed(1)}s | final epoch: ` +
+          `R@5=${last.currentR5?.toFixed(3)} stale=${last.staleIntrusionRate?.toFixed(3)} ` +
+          `trap=${last.trapPersistenceRate?.toFixed(3)} hot=${last.hotR5?.toFixed(3)}`
+        );
+      }
+    }
+  })().catch((e) => { console.error(e); process.exit(1); });
+}
diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts
new file mode 100644
index 0000000..48cb626
--- /dev/null
+++ b/tests/e1-harness.test.ts
@@ -0,0 +1,128 @@
+/**
+ * E1 longitudinal harness — invariant tests (pre-run gates from the frozen
+ * design: rev #3 mutation/measurement split, rev #8 timestamp invariants,
+ * rev #10 ablation wiring; outcome targeting via explicit ids).
+ *
+ * Real stores (house rule), tiny protocol sizes so the whole file stays fast.
+ * Env isolation: the driver owns ablation env vars during runArmSeed and
+ * clears them in its finally; tests also clear in beforeEach/afterEach.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+// @ts-expect-error - .mjs harness modules have no type declarations
+import { generateProtocol } from '../scripts/e1-lifecycle/generate.mjs';
+// @ts-expect-error - .mjs harness modules have no type declarations
+import { runArmSeed } from '../scripts/e1-lifecycle/run.mjs';
+import { loadAllEntries } from '../src/store.js';
+import { _resetAblationCacheForTests } from '../src/ablation.js';
+
+const ABLATION_ENV_VARS = [
+  'HIPPO_ABLATE_DECAY',
+  'HIPPO_ABLATE_RECALL_BOOST',
+  'HIPPO_ABLATE_OUTCOME',
+  'HIPPO_ABLATE_OUTCOME_SLOW',
+  'HIPPO_ABLATE_OUTCOME_FAST',
+  'HIPPO_FAKE_NOW',
+] as const;
+
+function clearAblationEnv(): void {
+  for (const v of ABLATION_ENV_VARS) delete process.env[v];
+  _resetAblationCacheForTests();
+}
+beforeEach(clearAblationEnv);
+afterEach(clearAblationEnv);
+
+const TINY = { numFacts: 12, numSessions: 6, distractorMultiple: 4 };
+
+describe('E1 generator', () => {
+  it('is deterministic: same seed => byte-identical protocol', () => {
+    const a = JSON.stringify(generateProtocol({ seed: 7, ...TINY }));
+    const b = JSON.stringify(generateProtocol({ seed: 7, ...TINY }));
+    expect(a).toBe(b);
+    const c = JSON.stringify(generateProtocol({ seed: 8, ...TINY }));
+    expect(a).not.toBe(c);
+  });
+
+  it('honors the registered fractions and structural invariants', () => {
+    const p = generateProtocol({ seed: 1, numFacts: 50, numSessions: 10, distractorMultiple: 10 });
+    // 40% updates, 10% contradictions (exact via shuffled-slice draws).
+    const updatedFacts = new Set(p.memories.filter((m: any) => m.kind === 'update').map((m: any) => m.factId));
+    expect(updatedFacts.size).toBe(20);
+    expect(p.memories.filter((m: any) => m.kind === 'contradiction').length).toBe(5);
+    // Hard-negative pool >= 10x facts.
+    expect(p.meta.counts.distractors).toBeGreaterThanOrEqual(500);
+    // Version timelines strictly ordered; updates strictly after v1.
+    for (const probe of p.probes) {
+      const sessions = probe.versionTimeline.map((s: any) => s.session);
+      for (let i = 1; i < sessions.length; i++) expect(sessions[i]).toBeGreaterThan(sessions[i - 1]);
+    }
+    // Every outcome target exists and is scheduled at/after its ingestion session.
+    const memById = new Map(p.memories.map((m: any) => [m.id, m]));
+    for (const o of p.outcomeSchedule) {
+      const m = memById.get(o.memoryRef) as any;
+      expect(m).toBeDefined();
+      expect(o.session).toBeGreaterThanOrEqual(m.session);
+    }
+    // Traps are bad-marked; some positive outcomes exist.
+    const trapIds = new Set(p.memories.filter((m: any) => m.kind === 'trap').map((m: any) => m.id));
+    expect(p.outcomeSchedule.filter((o: any) => !o.good).every((o: any) => trapIds.has(o.memoryRef))).toBe(true);
+    expect(p.outcomeSchedule.some((o: any) => o.good)).toBe(true);
+  });
+});
+
+describe('E1 driver', () => {
+  it('full arm: simulated time stamps the store; metrics emitted per epoch', async () => {
+    let storeSeen = false;
+    const result = await runArmSeed('full', 3, TINY, (hippoRoot: string) => {
+      storeSeen = true;
+      const entries = loadAllEntries(hippoRoot);
+      expect(entries.length).toBeGreaterThan(0);
+      // Every created timestamp lies within the protocol's simulated range -
+      // nothing stamped with the real 2026 clock (rev #8 invariant).
+      for (const e of entries) {
+        expect(Date.parse(e.created)).toBeGreaterThanOrEqual(Date.parse('2025-01-06T00:00:00.000Z'));
+        expect(Date.parse(e.created)).toBeLessThan(Date.parse('2025-06-01T00:00:00.000Z'));
+      }
+      // Scheduled retrievals strengthened SOMETHING (full arm: writes live).
+      expect(entries.some((e) => e.retrieval_count > 0)).toBe(true);
+    });
+    expect(storeSeen).toBe(true);
+    expect(result.epochs.length).toBe(TINY.numSessions);
+    const last = result.epochs[TINY.numSessions - 1];
+    expect(last.activeProbes).toBe(TINY.numFacts);
+    expect(last.currentR5).not.toBeNull();
+    expect(result.meta.protocolHash).toMatch(/^[0-9a-f]{16}$/);
+  });
+
+  it('strengthen-off arm: zero retrieval writes, outcome attribution still lands', async () => {
+    await runArmSeed('strengthen-off', 3, TINY, (hippoRoot: string) => {
+      const entries = loadAllEntries(hippoRoot);
+      // No strengthening writes anywhere (rev #10: the arm isolates one mechanism).
+      expect(entries.every((e) => e.retrieval_count === 0)).toBe(true);
+      // Outcomes still applied via explicit ids (codex round-7 coupling fix):
+      // traps received --bad despite strengthening being off.
+      expect(entries.some((e) => (e.outcome_negative ?? 0) > 0)).toBe(true);
+    });
+  });
+
+  it('probes are read-only: an all-off arm leaves zero retrieval state', async () => {
+    await runArmSeed('all-off', 5, TINY, (hippoRoot: string) => {
+      const entries = loadAllEntries(hippoRoot);
+      // Probes ran every epoch over these entries; none of them wrote back.
+      expect(entries.every((e) => e.retrieval_count === 0)).toBe(true);
+      expect(entries.every((e) => e.half_life_days <= 90)).toBe(true); // no +2 accumulation beyond derive caps
+    });
+  });
+
+  it('driver cleans its env: no ablation vars leak after a run', async () => {
+    await runArmSeed('decay-off', 2, TINY);
+    for (const v of ABLATION_ENV_VARS) expect(process.env[v]).toBeUndefined();
+  });
+
+  it('baseline arms produce rankings (bm25-static + recency-window)', async () => {
+    const bm25 = await runArmSeed('bm25-static', 4, TINY);
+    const rec = await runArmSeed('recency-window', 4, TINY);
+    expect(bm25.epochs[TINY.numSessions - 1].currentR5).not.toBeNull();
+    expect(rec.epochs[TINY.numSessions - 1].currentR5).not.toBeNull();
+  }, 60_000); // two real-store runs back to back; vitest's 5s default is too tight on a cold runner
+});

From 625bc67ef5df17e7902455ff2f0e54b18a7401b4 Mon Sep 17 00:00:00 2001
From: Keith So <68618199+kitfunso@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:12:44 +0100
Subject: [PATCH 2/5] feat(eval): pilot-3 rescale - value-claim lookalike
 negatives pass saturation guard

Pilot 2 (both-token meta-sentences) failed to de-saturate the all-off baseline
(0.967: BM25 length normalization ranks long sentences below tight fact docs).
Pilot 3 replaces family 4 with VALUE-CLAIM lookalikes - the exact fact template
carrying their own token - so every fact has more identical-form competitors
than top-5 slots and a static ranker must genuinely tie-break. Realistic
adversary: long-lived stores accumulate multiple tight value claims per key.

Pilot 3 final-epoch all-off: currentR5 0.727, stale 0.875, trap 0.733,
hot 0.757 - saturation guard PASSES on every primary metric (prereg amendment
A2: primary endpoints are final-epoch values; early epochs sit at ceiling by
construction in an accumulating store). Full pilot trail in PILOT-NOTES.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/e1-lifecycle/PILOT-NOTES.md | 27 +++++++++++++++++++++++++-
 scripts/e1-lifecycle/generate.mjs      | 16 +++++++++------
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md
index aeccd2a..c1e918f 100644
--- a/benchmarks/e1-lifecycle/PILOT-NOTES.md
+++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md
@@ -34,4 +34,29 @@ added a BOTH-token hard-negative family (same entity AND attribute in a meta/pro
 sentence carrying its own NEG token), so every fact now has more both-token colliders
 than top-5 slots and the static baseline must actually rank. No other change.
 
-## Pilot 2 — pending (rescaled generator)
+## Pilot 2 — 2026-06-11, both-token meta-sentence family, seed 1
+
+FAILED to de-saturate: all-off currentR5 0.967 (worse than pilot 1's 0.943). Root cause: the new
+negatives were LONGER sentences; BM25 length normalization ranks them below the tight fact
+docs, so they never displace anything from top-5.
+
+## Pilot 3 — 2026-06-11, value-claim lookalike family (exact fact template, own token), seed 1
+
+| final epoch | full | all-off | guard [0.10, 0.90] |
+|---|---|---|---|
+| current R@5 | 0.257 | 0.727 | PASS |
+| stale-intrusion | 0.475 | 0.875 | PASS |
+| trap-persistence | 0.111 | 0.733 | PASS |
+| hot R@5 | 0.365 | 0.757 | PASS |
+
+Wall-clock: all-off 102.0s, full 167.5s. Registered matrix (7 arms x 20 seeds) projects ~6h.
+
+**Trajectory note -> amendment A2:** epochs 0-9 are ceiling-saturated by construction (an
+accumulating store starts with 1-3 docs per fact; a top-5 metric cannot miss). Primary
+endpoints clarified to FINAL-EPOCH values (prereg amendment A2, recorded pre-freeze);
+trajectories reported descriptively. Final-epoch all-off passes the guard on all metrics.
+
+Directional preview persists (not registered results): outcome feedback buries traps
+(0.111 vs 0.733); full-lifecycle current R@5 BELOW all-off (0.257 vs 0.727 - decay
+sacrifices old-but-current facts to fresh noise at this horizon); stale-suppression helps
+(0.475 vs 0.875). The per-mechanism arms will attribute these causally.
diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs
index f951c86..ada1f0c 100644
--- a/scripts/e1-lifecycle/generate.mjs
+++ b/scripts/e1-lifecycle/generate.mjs
@@ -259,12 +259,16 @@ export function generateProtocol(opts) {
         // contradiction-lookalike (hedged phrasing, own token)
         content = `Unconfirmed: ${entity} ${attribute} might be ${ntok}, pending review.`;
       } else {
-        // BOTH-token mention (pilot rescale, 2026-06-11): same entity AND
-        // attribute in a process/meta sentence with its own token. These
-        // collide with the probe query on both tokens, so a static top-5
-        // cannot hold every same-fact document - de-saturates the all-off
-        // baseline (pilot showed currentR5 0.943 > 0.90 guard).
-        content = `Review of ${entity} ${attribute} noted in minutes ${ntok}; decision log pending.`;
+        // VALUE-CLAIM lookalike (rescale after pilot 2, run as pilot 3; 2026-06-11): the EXACT fact
+        // template with its own token. Pilot 2 showed longer meta-sentences
+        // never displace tight fact docs (BM25 length normalization), so the
+        // all-off ceiling held at 0.967. Identical-form claims are also the
+        // realistic adversary: a long-lived store accumulates multiple tight
+        // value claims per key (old notes, speculation, misheard values).
+        // With > top-k identical-form docs per fact, a static ranker must
+        // tie-break arbitrarily; lifecycle signals (update recency,
+        // strengthening, outcomes) are precisely the tiebreakers under test.
+        content = `${entity} ${attribute} ${pick(rand, CONNECTIVES)} ${ntok}.`;
       }
       memories.push({
         id: mid(), session: sn, kind: 'distractor', factId: null, version: null,

From 2149cb18ef99daf559db61154159f16e824d98e8 Mon Sep 17 00:00:00 2001
From: Keith So <68618199+kitfunso@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:25:34 +0100
Subject: [PATCH 3/5] fix(eval): deterministic entry ids + disjoint
 contradiction sampling (codex round 1)

P1: the protocol intentionally creates score ties (identical-form value-claim
negatives), and same-timestamp rows order by id, so random createMemory UUIDs
made identical (arm, seed) runs produce different top-5 metrics under the same
protocol hash. Entry ids are now sha256-derived from (seed, protocol memory
id), keeping the mem_<12hex> format. New invariant test: two complete runs of
the same (arm, seed) agree to the byte.

P2: contraSet re-sampled the full fact list, so contradiction facts could
overlap updated facts despite the disjointness comment. Contradictions now
sample from the NON-updated remainder; disjointness asserted in tests.

Pilot 4 reconfirms the saturation guard on every primary metric (all-off
final epoch: 0.727 / 0.825 / 0.778 / 0.693). 8 invariant tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/e1-lifecycle/PILOT-NOTES.md | 15 +++++++++++++++
 scripts/e1-lifecycle/generate.mjs      | 10 +++++++---
 scripts/e1-lifecycle/run.mjs           |  6 ++++++
 tests/e1-harness.test.ts               | 15 +++++++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/benchmarks/e1-lifecycle/PILOT-NOTES.md b/benchmarks/e1-lifecycle/PILOT-NOTES.md
index c1e918f..bce00fa 100644
--- a/benchmarks/e1-lifecycle/PILOT-NOTES.md
+++ b/benchmarks/e1-lifecycle/PILOT-NOTES.md
@@ -60,3 +60,18 @@ Directional preview persists (not registered results): outcome feedback buries t
 (0.111 vs 0.733); full-lifecycle current R@5 BELOW all-off (0.257 vs 0.727 - decay
 sacrifices old-but-current facts to fresh noise at this horizon); stale-suppression helps
 (0.475 vs 0.875). The per-mechanism arms will attribute these causally.
+
+## Pilot 4 — 2026-06-11, post codex-round-1 fixes (deterministic ids, disjoint contradictions)
+
+| final epoch | full | all-off | guard [0.10, 0.90] |
+|---|---|---|---|
+| current R@5 | 0.320 | 0.727 | PASS |
+| stale-intrusion | 0.475 | 0.825 | PASS |
+| trap-persistence | 0.067 | 0.778 | PASS |
+| hot R@5 | 0.480 | 0.693 | PASS |
+
+Wall-clock: all-off 87.2s, full 161.1s. Reproducibility now proven by invariant test
+(two identical (arm, seed) runs produce byte-identical metrics; entry ids derived from
+(seed, protocol id) instead of random UUIDs). Contradiction facts disjoint from updated
+facts (attribution unconfounded). Guard passes; protocol ready to freeze pending codex
+convergence.
diff --git a/scripts/e1-lifecycle/generate.mjs b/scripts/e1-lifecycle/generate.mjs
index ada1f0c..bd3f5bb 100644
--- a/scripts/e1-lifecycle/generate.mjs
+++ b/scripts/e1-lifecycle/generate.mjs
@@ -128,11 +128,15 @@ export function generateProtocol(opts) {
   let memSeq = 0;
   const mid = () => `pm${memSeq++}`;
 
-  // Which facts get updates (40%) and contradictions (10%) - disjoint draws
-  // from a shuffled index list so fractions are exact, not stochastic.
+  // Which facts get updates (40%) and contradictions (10%). Fractions are
+  // exact (shuffled-slice draws, not stochastic) and the two sets are
+  // DISJOINT: contradictions sample from the NON-updated remainder, so
+  // contradiction-intrusion is never confounded with version chains
+  // (codex P2: the earlier draw re-sampled the full list and could overlap).
   const order = shuffle(rand, Array.from({ length: numFacts }, (_, i) => i));
   const updateSet = new Set(order.slice(0, Math.round(numFacts * 0.4)));
-  const contraSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.1)));
+  const nonUpdated = order.filter((i) => !updateSet.has(i));
+  const contraSet = new Set(shuffle(rand, nonUpdated).slice(0, Math.round(numFacts * 0.1)));
   // Traps: 15% of facts get one plausible-but-wrong memory that receives --bad.
   const trapSet = new Set(shuffle(rand, order).slice(0, Math.round(numFacts * 0.15)));
 
diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs
index 4dca5ae..cda1d76 100644
--- a/scripts/e1-lifecycle/run.mjs
+++ b/scripts/e1-lifecycle/run.mjs
@@ -192,8 +192,14 @@ export async function runArmSeed(arm, seed, genOpts = {}, inspect = undefined) {
       setSimulatedNow(session.date); // mutators stamp simulated time
 
       // 1. Ingest this session's memories (created/last_retrieved = fake now).
+      //    Entry ids are DERIVED from (seed, protocol id), not random UUIDs:
+      //    the protocol intentionally creates score TIES (identical-form
+      //    negatives), and same-timestamp rows order by id - random ids would
+      //    make identical (arm, seed) runs produce different top-5 metrics
+      //    (codex P1). sha256 prefix keeps the mem_<12 hex> format.
       for (const m of bySession.get(session.index) ?? []) {
         const entry = createMemory(m.content);
+        entry.id = `mem_${createHash('sha256').update(`e1:${seed}:${m.id}`).digest('hex').slice(0, 12)}`;
         writeEntry(hippoRoot, entry);
         idMap.set(m.id, entry.id);
       }
diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts
index 48cb626..018b53c 100644
--- a/tests/e1-harness.test.ts
+++ b/tests/e1-harness.test.ts
@@ -49,6 +49,10 @@ describe('E1 generator', () => {
     const updatedFacts = new Set(p.memories.filter((m: any) => m.kind === 'update').map((m: any) => m.factId));
     expect(updatedFacts.size).toBe(20);
     expect(p.memories.filter((m: any) => m.kind === 'contradiction').length).toBe(5);
+    // Updates and contradictions are DISJOINT (codex P2): a contradicted fact
+    // never has a version chain, so contradiction-intrusion is unconfounded.
+    const contraFacts = new Set(p.memories.filter((m: any) => m.kind === 'contradiction').map((m: any) => m.factId));
+    for (const cf of contraFacts) expect(updatedFacts.has(cf)).toBe(false);
     // Hard-negative pool >= 10x facts.
     expect(p.meta.counts.distractors).toBeGreaterThanOrEqual(500);
     // Version timelines strictly ordered; updates strictly after v1.
@@ -119,6 +123,17 @@ describe('E1 driver', () => {
     for (const v of ABLATION_ENV_VARS) expect(process.env[v]).toBeUndefined();
   });
 
+  it('is REPRODUCIBLE: identical (arm, seed) runs produce identical metrics (codex P1)', async () => {
+    // The protocol intentionally creates score ties (identical-form
+    // negatives); with random entry UUIDs, tie order differed across runs of
+    // the same seed. Entry ids are now derived from (seed, protocol id) -
+    // two full runs must agree to the byte.
+    const a = await runArmSeed('full', 11, TINY);
+    const b = await runArmSeed('full', 11, TINY);
+    expect(JSON.stringify(a.epochs)).toBe(JSON.stringify(b.epochs));
+    expect(a.meta.protocolHash).toBe(b.meta.protocolHash);
+  });
+
   it('baseline arms produce rankings (bm25-static + recency-window)', async () => {
     const bm25 = await runArmSeed('bm25-static', 4, TINY);
     const rec = await runArmSeed('recency-window', 4, TINY);

From de285f3e5fba26f7e05ae15ab3e5920a3ec220de Mon Sep 17 00:00:00 2001
From: Keith So <68618199+kitfunso@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:32:36 +0100
Subject: [PATCH 4/5] fix(eval): content-blind bm25-static tie-break + pretest
 build prerequisite

codex round 2 on the E1 harness:

1. The bm25-static baseline re-sorted hybridSearch results by raw BM25 with a
   bare stable sort, so identical-BM25 candidates (which this protocol
   deliberately mass-produces) kept the composite lifecycle/recency-tinged
   ordering - the baseline was not actually BM25-only at ties. Tie-break now
   by entry id (seed-derived, deterministic, content-blind).

2. tests/e1-harness.test.ts imports the harness, which imports ../../dist/ -
   a clean-checkout `npm test` (vitest without build) failed before tests ran.
   Added "pretest": "npm run build" so npm test always tests built code; CI
   already builds first (ci.yml).

8 invariant tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 package.json                 | 1 +
 scripts/e1-lifecycle/run.mjs | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/package.json b/package.json
index fc1a2a2..d9c088c 100644
--- a/package.json
+++ b/package.json
@@ -27,6 +27,7 @@
     "baseline": "npm run build && npm test",
     "build": "tsc && tsc -p tsconfig.benchmarks.json",
     "dev": "tsc --watch",
+    "pretest": "npm run build",
     "test": "vitest run",
     "test:watch": "vitest",
     "postinstall": "node scripts/postinstall.cjs",
diff --git a/scripts/e1-lifecycle/run.mjs b/scripts/e1-lifecycle/run.mjs
index cda1d76..ec4e871 100644
--- a/scripts/e1-lifecycle/run.mjs
+++ b/scripts/e1-lifecycle/run.mjs
@@ -104,8 +104,12 @@ async function probeEpoch(protocol, entries, epoch, epochDate, arm) {
       top = entries.slice().sort((a, b) => Date.parse(b.created) - Date.parse(a.created)).slice(0, PROBE_TOP_K);
     } else {
       const results = await hybridSearch(probe.query, entries, { budget: PROBE_BUDGET, now: probeNow, minResults: PROBE_TOP_K });
+      // bm25-static tie-break MUST be independent of the composite order: a
+      // bare stable sort would let identical-BM25 candidates (this protocol
+      // creates many) keep hybridSearch's lifecycle/recency-tinged ordering
+      // (codex P2). Entry id is deterministic (seed-derived) and content-blind.
       const ranked = arm === 'bm25-static'
-        ? results.slice().sort((a, b) => b.bm25 - a.bm25)
+        ? results.slice().sort((a, b) => (b.bm25 - a.bm25) || a.entry.id.localeCompare(b.entry.id))
         : results;
       top = ranked.slice(0, PROBE_TOP_K).map((r) => r.entry);
     }

From a6eafcbe50199eaee729f74066fc0449fa03e34c Mon Sep 17 00:00:00 2001
From: Keith So <68618199+kitfunso@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:42:02 +0100
Subject: [PATCH 5/5] fix(eval): reproducibility-test timeout + gitignore E1
 raw outputs (codex round 3)

- 60s explicit timeout on the two-full-run reproducibility test (two real
  SQLite store runs sit at ~4.8s, brushing vitest's 5s default - a cold-CI
  flake waiting to happen).
- benchmarks/e1-lifecycle/raw/ gitignored: harness outputs are generated;
  the REGISTERED results are committed to the hippo-paper artifact repo per
  the prereg, not to this code repo.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore               | 1 +
 tests/e1-harness.test.ts | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index c34da01..41a7900 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,4 @@ tmp-card4-20seed/
 trajectories/
 benchmarks/lifecycle-stress/results-2*.json
 benchmarks/lifecycle-stress/labels/
+benchmarks/e1-lifecycle/raw/
diff --git a/tests/e1-harness.test.ts b/tests/e1-harness.test.ts
index 018b53c..f92cf35 100644
--- a/tests/e1-harness.test.ts
+++ b/tests/e1-harness.test.ts
@@ -132,7 +132,7 @@ describe('E1 driver', () => {
     const b = await runArmSeed('full', 11, TINY);
     expect(JSON.stringify(a.epochs)).toBe(JSON.stringify(b.epochs));
     expect(a.meta.protocolHash).toBe(b.meta.protocolHash);
-  });
+  }, 60_000); // two full real-store runs; default 5s timeout is a CI flake (codex P2)
 
   it('baseline arms produce rankings (bm25-static + recency-window)', async () => {
     const bm25 = await runArmSeed('bm25-static', 4, TINY);