From f28ed16b854659509795925d7020b6cf433df5c6 Mon Sep 17 00:00:00 2001
From: Willi Budzinski <w.budzinski@telekom.de>
Date: Fri, 19 Jun 2026 19:04:44 +0200
Subject: [PATCH 1/4] feat: add hermetic longmemeval harness foundation

---
 .gitignore                                    |   2 +
 README.md                                     |   2 +
 benchmark/README.md                           |   5 +
 benchmark/longmemeval/Makefile                |  21 +
 benchmark/longmemeval/README.md               |  41 ++
 benchmark/longmemeval/data/smoke-ids.txt      |  50 +++
 benchmark/longmemeval/prompts/judge.md        |   1 +
 benchmark/longmemeval/prompts/reader.md       |   1 +
 benchmark/longmemeval/src/check.ts            | 116 +++++
 benchmark/longmemeval/src/data.ts             | 128 ++++++
 benchmark/longmemeval/src/manifest.ts         | 103 +++++
 benchmark/longmemeval/src/render-tables.ts    |  39 ++
 benchmark/longmemeval/src/stats.ts            | 130 ++++++
 benchmark/longmemeval/src/systems.ts          | 101 +++++
 benchmark/longmemeval/src/types.ts            | 155 +++++++
 .../plan.md                                   | 311 ++++++++++++++
 .../todo.md                                   | 141 ++++++
 package.json                                  |   1 +
 test/fixtures/longmemeval/invalid-turn.json   |  14 +
 test/fixtures/longmemeval/mini.json           |  48 +++
 .../longmemeval/mismatched-sessions.json      |  14 +
 .../longmemeval/missing-question-id.json      |  13 +
 test/fixtures/longmemeval/non-array.json      |   3 +
 test/longmemeval-harness.test.ts              | 404 ++++++++++++++++++
 test/quality-gates.test.ts                    |   3 +
 25 files changed, 1847 insertions(+)
 create mode 100644 benchmark/longmemeval/Makefile
 create mode 100644 benchmark/longmemeval/README.md
 create mode 100644 benchmark/longmemeval/data/smoke-ids.txt
 create mode 100644 benchmark/longmemeval/prompts/judge.md
 create mode 100644 benchmark/longmemeval/prompts/reader.md
 create mode 100644 benchmark/longmemeval/src/check.ts
 create mode 100644 benchmark/longmemeval/src/data.ts
 create mode 100644 benchmark/longmemeval/src/manifest.ts
 create mode 100644 benchmark/longmemeval/src/render-tables.ts
 create mode 100644 benchmark/longmemeval/src/stats.ts
 create mode 100644 benchmark/longmemeval/src/systems.ts
 create mode 100644 benchmark/longmemeval/src/types.ts
 create mode 100644 docs/todos/2026-06-19-issue-313-longmemeval-harness/plan.md
 create mode 100644 docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
 create mode 100644 test/fixtures/longmemeval/invalid-turn.json
 create mode 100644 test/fixtures/longmemeval/mini.json
 create mode 100644 test/fixtures/longmemeval/mismatched-sessions.json
 create mode 100644 test/fixtures/longmemeval/missing-question-id.json
 create mode 100644 test/fixtures/longmemeval/non-array.json
 create mode 100644 test/longmemeval-harness.test.ts

diff --git a/.gitignore b/.gitignore
index bb91374cd..90b4d68bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@ dist/
 plugin/scripts/*.map
 plugin/scripts/*.d.mts
 data/
+!benchmark/longmemeval/data/
+!benchmark/longmemeval/data/smoke-ids.txt
 !eval/data/
 !eval/data/**
 data-*/
diff --git a/README.md b/README.md
index 5a0cf2032..058a83f64 100644
--- a/README.md
+++ b/README.md
@@ -286,6 +286,8 @@ Latest release notes: [CHANGELOG.md](CHANGELOG.md).
 
 **Reproduce locally:** [`eval/README.md`](eval/README.md) — adapter-pluggable harness for LongMemEval `_s` (public 500-Q) + `coding-agent-life-v1` (in-house 15-session corpus). Grep / vector / agentmemory adapters score side-by-side, NDJSON output, published scorecards land in [`docs/benchmarks/`](docs/benchmarks/).
 
+**QA harness foundation:** [`benchmark/longmemeval/`](benchmark/longmemeval/) contains the hermetic issue #313 harness scaffold for future end-to-end LongMemEval-S reader/judge runs, statistical tests, manifests, and reproducibility checks. The published numbers above remain retrieval-only until an approved provider-backed run publishes QA results.
+
 **Pairs with [codegraph](https://github.com/colbymchenry/codegraph), [Understand Anything](https://github.com/Lum1104/Understand-Anything), and [Graphify](https://github.com/safishamsi/graphify).** Code-graph indexing, multi-agent build pipelines, and broader knowledge graphs across docs / PDFs / images / videos. agentmemory remembers the work; those three projects light up the rest of the context layer. Recipes + question-routing table: [`docs/recipes/pairings.md`](docs/recipes/pairings.md).
 
 ---
diff --git a/benchmark/README.md b/benchmark/README.md
index 23d228522..fb2fd9ab9 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -11,6 +11,11 @@ Two kinds of numbers live in this directory:
    throughput against a running daemon. This is the file you want when
    somebody asks "what's p99 at 100k memories under concurrency 100?".
 
+`longmemeval/` is the hermetic foundation for the issue #313
+LongMemEval-S QA/statistics harness. It does not run provider-backed
+reader or judge models yet; use `corepack pnpm run bench:longmemeval:check`
+to validate the local harness structure.
+
 ## load-100k.ts
 
 Hand-rolled, dependency-free load harness. Issues real HTTP against a
diff --git a/benchmark/longmemeval/Makefile b/benchmark/longmemeval/Makefile
new file mode 100644
index 000000000..ac9cecdd6
--- /dev/null
+++ b/benchmark/longmemeval/Makefile
@@ -0,0 +1,21 @@
+ROOT := ../..
+
+.PHONY: check smoke-ids data reproduce reproduce-full
+
+check:  ## Run the hermetic harness check via the pnpm script, from $(ROOT) (two levels up).
+	cd $(ROOT) && corepack pnpm run bench:longmemeval:check
+
+# Smoke-id validation already runs inside the hermetic check, so alias it instead of duplicating the recipe.
+smoke-ids: check
+
+data:  ## Placeholder: prints an approval-required notice, then exits 2 so the target fails.
+	@echo "LongMemEval data download is approval-required and intentionally not part of the hermetic check."
+	@exit 2
+
+reproduce:  ## Placeholder: prints an approval-required notice, then exits 2 so the target fails.
+	@echo "Provider-backed LongMemEval smoke reproduction requires explicit approval, dataset availability, and model credentials."
+	@exit 2
+
+reproduce-full:  ## Placeholder: prints an approval-required notice, then exits 2 so the target fails.
+	@echo "Full LongMemEval-S reproduction requires explicit approval, dataset availability, model credentials, and runtime budget."
+	@exit 2
diff --git a/benchmark/longmemeval/README.md b/benchmark/longmemeval/README.md
new file mode 100644
index 000000000..0dfb5b2a8
--- /dev/null
+++ b/benchmark/longmemeval/README.md
@@ -0,0 +1,41 @@
+# LongMemEval-S Harness
+
+This directory is the foundation for the issue #313 LongMemEval-S QA harness.
+It is intentionally hermetic in this first slice: no dataset download, no
+submodule, no provider calls, no new dependencies, and no historical baseline
+claim.
+
+The existing LongMemEval numbers in `benchmark/LONGMEMEVAL.md` are
+retrieval-only diagnostics. This harness adds the reproducibility and
+statistics structure needed for a later approved end-to-end run with reader and
+judge models.
+
+## Local Check
+
+```sh
+corepack pnpm run bench:longmemeval:check
+make -C benchmark/longmemeval check
+```
+
+The check validates:
+
+- the tiny checked-in fixture shape,
+- the 50 smoke IDs (count, uniqueness, and presence in the committed hybrid retrieval results),
+- the six issue #313 system definitions,
+- prompt hashing and manifest redaction,
+- markdown table rendering.
+
+## Deferred Full-Scope Work
+
+These targets are present to document the intended workflow, but they fail
+closed until maintainers approve the needed boundaries:
+
+```sh
+make -C benchmark/longmemeval data
+make -C benchmark/longmemeval reproduce
+make -C benchmark/longmemeval reproduce-full
+```
+
+Approval is required before adding a LongMemEval submodule, downloading the
+real dataset in automation, calling reader or judge models, wiring a real PR CI
+benchmark gate, or publishing a `v0.9.24` QA baseline.
diff --git a/benchmark/longmemeval/data/smoke-ids.txt b/benchmark/longmemeval/data/smoke-ids.txt
new file mode 100644
index 000000000..5287368f4
--- /dev/null
+++ b/benchmark/longmemeval/data/smoke-ids.txt
@@ -0,0 +1,50 @@
+e47becba
+118b2229
+51a45a95
+58bf7951
+1e043500
+c5e8278d
+6ade9755
+6f9b354f
+58ef2f1c
+f8c5f88b
+5d3d2817
+7527f7e2
+c960da58
+3b6f954b
+726462e0
+94f70d80
+66f24dbb
+ad7109d1
+af8d2e46
+dccbc061
+c8c3f81d
+8ebdbe50
+6b168ec8
+75499fd8
+21436231
+95bcc1c8
+0862e8bf
+853b0a1d
+a06e4cfe
+37d43f65
+b86304ba
+d52b4f67
+25e5aa4f
+caf9ead2
+8550ddae
+60d45044
+3f1e9474
+86b68151
+577d4d32
+ec81a493
+15745da0
+e01b8e2f
+bc8a6e93
+ccb36322
+001be529
+b320f3f8
+19b5f2b3
+4fd1909e
+545bd2b5
+8a137a7f
diff --git a/benchmark/longmemeval/prompts/judge.md b/benchmark/longmemeval/prompts/judge.md
new file mode 100644
index 000000000..edc2a15f7
--- /dev/null
+++ b/benchmark/longmemeval/prompts/judge.md
@@ -0,0 +1 @@
+You are the LongMemEval-S judge. Score whether the answer matches the reference answer.
diff --git a/benchmark/longmemeval/prompts/reader.md b/benchmark/longmemeval/prompts/reader.md
new file mode 100644
index 000000000..79677366d
--- /dev/null
+++ b/benchmark/longmemeval/prompts/reader.md
@@ -0,0 +1 @@
+You are the LongMemEval-S reader. Answer the question using only the retrieved context.
diff --git a/benchmark/longmemeval/src/check.ts b/benchmark/longmemeval/src/check.ts
new file mode 100644
index 000000000..b6c72d600
--- /dev/null
+++ b/benchmark/longmemeval/src/check.ts
@@ -0,0 +1,116 @@
+import { readFileSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { loadLongMemEvalRows, selectSmokeRows } from "./data.js";
+import { buildManifest } from "./manifest.js";
+import { renderResultsTables } from "./render-tables.js";
+import { getLongMemEvalSystems } from "./systems.js";
+
+export interface LongMemEvalCheckResult { // summary returned by runLongMemEvalCheck() on success
+  ok: true; // literal `true`: failures are reported by throwing, never as ok=false
+  fixtureRows: number; // number of rows loaded from the mini fixture
+  systems: number; // number of system definitions returned by getLongMemEvalSystems()
+  smokeIds: number; // number of ids read from smoke-ids.txt
+}
+
+const fixturePath = fileURLToPath(
+  new URL("../../../test/fixtures/longmemeval/mini.json", import.meta.url), // resolved relative to this module, not the cwd
+);
+const smokeIdsPath = fileURLToPath(new URL("../data/smoke-ids.txt", import.meta.url)); // committed smoke id list
+const hybridResultsPath = fileURLToPath(
+  new URL("../../data/longmemeval_results_hybrid.json", import.meta.url), // results file the smoke ids are checked against
+);
+
+function readSmokeIds(): string[] {
+  // One id per line; surrounding whitespace is trimmed and blank lines are discarded.
+  const text = readFileSync(smokeIdsPath, "utf8");
+  const entries = text.split(/\r?\n/).map((entry) => entry.trim());
+  return entries.filter((entry) => entry.length > 0);
+}
+
+function validateSmokeIds(smokeIds: string[]): void {
+  // Checks run in order: exactly 50 ids, no duplicates, every id present in the hybrid results.
+  if (smokeIds.length !== 50) {
+    throw new Error(`expected 50 smoke ids, got ${smokeIds.length}`);
+  }
+  const distinct = new Set(smokeIds);
+  if (distinct.size !== smokeIds.length) {
+    throw new Error("smoke ids must be unique");
+  }
+  type HybridResults = { per_question?: Array<{ question_id?: string }> };
+  const parsed = JSON.parse(readFileSync(hybridResultsPath, "utf8")) as HybridResults;
+  // Rows without a usable question_id are left out of the lookup set.
+  const reportedIds = (parsed.per_question ?? []).map((row) => row.question_id);
+  const knownIds = new Set(reportedIds.filter(Boolean));
+  const missing = smokeIds.filter((id) => !knownIds.has(id));
+  if (missing.length === 0) return;
+  throw new Error(`smoke ids missing from checked-in hybrid results: ${missing.join(", ")}`);
+}
+
+export async function runLongMemEvalCheck(): Promise<LongMemEvalCheckResult> { // resolves with counts on success; any failed check throws
+  const fixtureRows = loadLongMemEvalRows(fixturePath);
+  selectSmokeRows(fixtureRows, ["q1", "q2", "q3"]); // result discarded: called because it throws when an id is missing
+
+  const smokeIds = readSmokeIds();
+  validateSmokeIds(smokeIds);
+
+  const systems = getLongMemEvalSystems();
+  const manifest = buildManifest({ // every input below is a fixed placeholder value
+    runId: "check",
+    createdAt: "2026-06-19T00:00:00.000Z",
+    commitSha: "local-check",
+    packageVersion: "0.0.0-check",
+    dataset: { name: "fixture", sha256: "fixture" },
+    prompts: { reader: "reader", judge: "judge" }, // literal strings, not the contents of the prompts/*.md files
+    models: {
+      reader: { provider: "mock", model: "mock-reader", temperature: 0 },
+      judge: { provider: "mock", model: "mock-judge", temperature: 0 },
+    },
+    systems,
+  });
+  if (manifest.systems.length !== 6) {
+    throw new Error(`expected 6 systems in manifest, got ${manifest.systems.length}`);
+  }
+
+  const markdown = renderResultsTables({ // minimal one-row summary used only to exercise the renderer
+    categories: {
+      fixture: {
+        "agentmemory-baseline": {
+          n: 1,
+          correct: 1,
+          accuracy: 1,
+          ci: { low: 1, high: 1 },
+        },
+      },
+    },
+    hypotheses: [
+      {
+        id: "check",
+        comparison: "fixture",
+        rawPValue: 1,
+        adjustedPValue: 1,
+        claimed: false, // an unclaimed hypothesis is what makes the renderer emit "directional"
+      },
+    ],
+  });
+  if (!markdown.includes("directional")) {
+    throw new Error("rendered table should include directional claim label");
+  }
+
+  return {
+    ok: true,
+    fixtureRows: fixtureRows.length,
+    systems: systems.length,
+    smokeIds: smokeIds.length,
+  };
+}
+
+if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) { // run only when this module's own path equals argv[1]
+  runLongMemEvalCheck()
+    .then((result) => {
+      console.log(JSON.stringify(result, null, 2)); // success: print the summary as indented JSON
+    })
+    .catch((err: unknown) => {
+      console.error(err instanceof Error ? err.message : String(err)); // failure: message only
+      process.exit(1); // non-zero exit after printing the failure
+    });
+}
diff --git a/benchmark/longmemeval/src/data.ts b/benchmark/longmemeval/src/data.ts
new file mode 100644
index 000000000..d3013c503
--- /dev/null
+++ b/benchmark/longmemeval/src/data.ts
@@ -0,0 +1,128 @@
+import { createHash } from "node:crypto";
+import { readFileSync } from "node:fs";
+import type {
+  ChecksumResult,
+  LongMemEvalRawRow,
+  LongMemEvalRow,
+  LongMemEvalTurn,
+} from "./types.js";
+
+/** Returns `value` when it is a primitive string; otherwise throws an Error naming `field`. */
+function requireString(value: unknown, field: string): string {
+  if (typeof value === "string") return value;
+  // Anything that is not a string primitive is rejected.
+  throw new Error(`${field} must be a string`);
+}
+
+/** Returns `value` when it is an array containing only strings; otherwise throws naming `field`. */
+function requireStringArray(value: unknown, field: string): string[] {
+  const allStrings = Array.isArray(value) && value.every((item) => typeof item === "string");
+  if (!allStrings) throw new Error(`${field} must be an array of strings`);
+  return value as string[]; // cast only: the check above already established the shape
+}
+
+function validateTurn(value: unknown, index: number): LongMemEvalTurn {
+  // Reject null and non-object values before reading any field.
+  if (value === null || typeof value !== "object") {
+    throw new Error(`turn ${index} must be an object`);
+  }
+  const fields = value as Record<string, unknown>;
+  // role is checked before content, so a turn missing both reports role first.
+  const role = requireString(fields.role, `turn ${index} role`);
+  return { role, content: requireString(fields.content, `turn ${index} content`) };
+}
+
+function validateRawRow(value: unknown): LongMemEvalRawRow { // validates one parsed JSON row; throws Error on the first violation found
+  if (typeof value !== "object" || value === null) {
+    throw new Error("LongMemEval row must be an object");
+  }
+  const record = value as Record<string, unknown>;
+  const questionId = requireString(record.question_id, "question_id"); // checked first so the length-mismatch error can name the row
+  const haystackSessionIds = requireStringArray(
+    record.haystack_session_ids,
+    "haystack_session_ids",
+  );
+  if (!Array.isArray(record.haystack_sessions)) {
+    throw new Error("haystack_sessions must be an array");
+  }
+  if (haystackSessionIds.length !== record.haystack_sessions.length) { // ids and sessions are parallel arrays
+    throw new Error(
+      `LongMemEval row ${questionId}: haystack_session_ids (${haystackSessionIds.length}) and haystack_sessions (${record.haystack_sessions.length}) length mismatch`,
+    );
+  }
+  const haystackSessions = record.haystack_sessions.map((session, sessionIndex) => {
+    if (!Array.isArray(session)) {
+      throw new Error(`haystack_sessions ${sessionIndex} must be an array`);
+    }
+    return session.map((turn, turnIndex) => validateTurn(turn, turnIndex)); // turn errors carry the turn index only, not the session index
+  });
+
+  const raw: LongMemEvalRawRow = {
+    question_id: questionId,
+    question_type: requireString(record.question_type, "question_type"),
+    question: requireString(record.question, "question"),
+    answer_session_ids: requireStringArray(record.answer_session_ids, "answer_session_ids"),
+    haystack_session_ids: haystackSessionIds,
+    haystack_sessions: haystackSessions,
+  };
+  if (record.answer !== undefined) raw.answer = requireString(record.answer, "answer"); // answer is optional but must be a string when present
+  return raw;
+}
+
+function flattenSession(turns: LongMemEvalTurn[]): string {
+  return turns.map(({ role, content }) => `[${role}] ${content}`).join("\n\n"); // "[role] content", blank line between turns
+}
+
+export function loadLongMemEvalRows(path: string): LongMemEvalRow[] {
+  const parsed: unknown = JSON.parse(readFileSync(path, "utf8"));
+  if (!Array.isArray(parsed)) throw new Error("expected LongMemEval JSON array");
+  const rows: LongMemEvalRow[] = [];
+  for (const item of parsed) {
+    const raw = validateRawRow(item);
+    const haystack = raw.haystack_session_ids.map((id, position) => { // ids and sessions are paired by index
+      const turns = raw.haystack_sessions[position];
+      return { id, turns, content: flattenSession(turns) };
+    });
+    rows.push({
+      id: raw.question_id,
+      type: raw.question_type,
+      question: raw.question,
+      answer: raw.answer,
+      answerSessionIds: raw.answer_session_ids,
+      haystack,
+    });
+  }
+  return rows;
+}
+
+export function selectSmokeRows(
+  rows: LongMemEvalRow[],
+  questionIds: string[],
+): LongMemEvalRow[] {
+  const index = new Map<string, LongMemEvalRow>();
+  for (const row of rows) index.set(row.id, row);
+  const absent = questionIds.filter((id) => !index.has(id));
+  if (absent.length > 0) throw new Error(`missing smoke question ids: ${absent.join(", ")}`);
+  // Output follows the order of questionIds, not the order of rows.
+  return questionIds.map((id) => index.get(id) as LongMemEvalRow);
+}
+
+/** Hex-encoded SHA-256 digest of the raw bytes of the file at `path`. */
+export function sha256File(path: string): string {
+  return createHash("sha256").update(readFileSync(path)).digest("hex");
+}
+
+/**
+ * Verifies one `sha256sum`-style line: `<sha256>  <path>`, or `<sha256> *<path>` (binary marker).
+ * Throws on a malformed line; a digest mismatch is reported through `ok: false`, not thrown.
+ * Only the line's shape is validated; the length of the hex digest is not checked.
+ */
+export function verifyChecksumLine(line: string): ChecksumResult {
+  // `\s\*` is tried before `\s+` so the binary-mode marker is not captured as part of the path.
+  const match = line.match(/^([a-fA-F0-9]+)(?:\s\*|\s+)(.+)$/);
+  if (!match) throw new Error("checksum line must be '<sha256>  <path>'");
+  const expected = match[1].toLowerCase(); // hex digests compare case-insensitively
+  const path = match[2];
+  const actual = sha256File(path);
+  return { ok: actual === expected, expected, actual, path };
+}
diff --git a/benchmark/longmemeval/src/manifest.ts b/benchmark/longmemeval/src/manifest.ts
new file mode 100644
index 000000000..e222cec78
--- /dev/null
+++ b/benchmark/longmemeval/src/manifest.ts
@@ -0,0 +1,103 @@
+import { createHash } from "node:crypto";
+import type {
+  LongMemEvalManifest,
+  LongMemEvalManifestInput,
+  LongMemEvalModelConfig,
+  LongMemEvalSystemDefinition,
+} from "./types.js";
+
+const SECRET_KEY_PATTERN = /(?:api[-_]?key|authorization|auth|secret|token|password|credential)/i; // tested against key names
+const SECRET_VALUE_PATTERNS = [ // tested against string values; one match is enough to redact
+  /(?:api[_-]?key|secret|token|password|credential|auth)[\s]*[=:]\s*["']?[A-Za-z0-9_\-/.+]{20,}["']?/i,
+  /Bearer\s+[A-Za-z0-9._\-+/=]{20,}/i,
+  /sk-proj-[A-Za-z0-9\-_]{20,}/,
+  /(?:sk|pk|rk|ak)-[A-Za-z0-9][A-Za-z0-9\-_]{19,}/,
+  /sk-ant-[A-Za-z0-9\-_]{20,}/,
+  /gh[pousr]_[A-Za-z0-9]{36,}/, // GitHub ghp_/gho_/ghu_/ghs_/ghr_ (gho_ and ghr_ were previously missed)
+  /github_pat_[A-Za-z0-9_]{22,}/,
+  /xox[abeprs]-[A-Za-z0-9\-]+/, // Slack xox?- tokens (previously only xoxb- bot tokens)
+  /AKIA[0-9A-Z]{16}/,
+  /AIza[A-Za-z0-9\-_]{35}/,
+  /eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}/,
+  /npm_[A-Za-z0-9]{36}/,
+  /glpat-[A-Za-z0-9\-_]{20,}/,
+  /dop_v1_[A-Za-z0-9]{64}/,
+];
+
+function sha256Text(text: string): string {
+  return createHash("sha256").update(text, "utf8").digest("hex"); // hex digest of the UTF-8 encoded text
+}
+
+function looksSecret(value: string): boolean {
+  return SECRET_VALUE_PATTERNS.findIndex((pattern) => pattern.test(value)) !== -1; // true if any pattern matches
+}
+
+function sanitizeMetadata(metadata?: Record<string, unknown>): Record<string, unknown> | undefined {
+  if (!metadata) return undefined;
+  const sanitized: Record<string, unknown> = {};
+  Object.entries(metadata).forEach(([key, value]) => {
+    // Dropped when the key names a secret or a string value matches a secret pattern.
+    const secret = SECRET_KEY_PATTERN.test(key) || (typeof value === "string" && looksSecret(value));
+    if (secret) return;
+    // Objects, arrays, and other non-primitive values are dropped; null is kept.
+    if (value === null || ["string", "number", "boolean"].includes(typeof value)) sanitized[key] = value;
+  });
+  return Object.keys(sanitized).length === 0 ? undefined : sanitized;
+}
+
+function sanitizeEnv(env: Record<string, string>): Record<string, string> {
+  // A variable survives only when neither its name nor its value looks like a secret.
+  const sanitized: Record<string, string> = {};
+  for (const key of Object.keys(env)) {
+    const value = env[key];
+    if (!SECRET_KEY_PATTERN.test(key) && !looksSecret(value)) sanitized[key] = value;
+  }
+  return sanitized;
+}
+
+function modelConfig(model: LongMemEvalModelConfig): LongMemEvalModelConfig {
+  // Copy only the allow-listed fields so stray properties on `model` never reach the manifest.
+  const { provider, model: name, temperature, maxTokens } = model;
+  const safe: LongMemEvalModelConfig = { provider, model: name, temperature };
+  if (maxTokens !== undefined) {
+    safe.maxTokens = maxTokens;
+  }
+  return safe;
+}
+
+function systemDefinition(system: LongMemEvalSystemDefinition): LongMemEvalSystemDefinition {
+  // Rebuild the definition from its known fields only.
+  // `env` is the one field that passes through redaction.
+  const { id, label, description, endpoint, retrievalBudgetTokens } = system;
+  return {
+    id, label, description,
+    endpoint, retrievalBudgetTokens,
+    env: sanitizeEnv(system.env),
+  };
+}
+
+export function buildManifest(input: LongMemEvalManifestInput): LongMemEvalManifest {
+  const { dataset, prompts, models } = input;
+  const manifest: LongMemEvalManifest = {
+    schemaVersion: 1,
+    runId: input.runId,
+    createdAt: input.createdAt ?? new Date().toISOString(),
+    commitSha: input.commitSha,
+    packageVersion: input.packageVersion,
+    dataset: {
+      name: dataset.name,
+      sha256: dataset.sha256,
+    },
+    // Only SHA-256 digests of the prompt texts are recorded, never the texts themselves.
+    promptHashes: { reader: sha256Text(prompts.reader), judge: sha256Text(prompts.judge) },
+    // Model configs are reduced to their allow-listed fields before being stored.
+    models: { reader: modelConfig(models.reader), judge: modelConfig(models.judge) },
+    // Each system definition is copied with its env sanitized.
+    systems: input.systems.map((entry) => systemDefinition(entry)),
+  };
+  // metadata is attached only when at least one entry survives sanitization.
+  const safeMetadata = sanitizeMetadata(input.metadata);
+  if (safeMetadata !== undefined) {
+    manifest.metadata = safeMetadata;
+  }
+  return manifest;
+}
diff --git a/benchmark/longmemeval/src/render-tables.ts b/benchmark/longmemeval/src/render-tables.ts
new file mode 100644
index 000000000..f860a0d4b
--- /dev/null
+++ b/benchmark/longmemeval/src/render-tables.ts
@@ -0,0 +1,39 @@
+import type { ResultsTableSummary } from "./types.js";
+
+function pct(value: number): string {
+  return (value * 100).toFixed(1) + "%"; // fraction -> percentage text with one decimal place
+}
+
+function pValue(value: number): string {
+  return value.toPrecision(3); // three significant digits, e.g. 1 -> "1.00"; tiny values use exponent notation
+}
+
+export function renderResultsTables(summary: ResultsTableSummary): string {
+  // One accuracy row per (category, system) pair, in object insertion order.
+  const accuracyRows = Object.entries(summary.categories).flatMap(([category, systems]) =>
+    Object.entries(systems).map(([system, row]) => {
+      const head = `| ${category} | ${system} | ${row.n} | ${row.correct} | ${pct(row.accuracy)}`;
+      return `${head} | ${pct(row.ci.low)} - ${pct(row.ci.high)} |`;
+    }),
+  );
+  const claimRows = summary.hypotheses.map((hypothesis) => {
+    const verdict = hypothesis.claimed ? "claimed" : "directional";
+    return `| ${hypothesis.comparison} | ${pValue(hypothesis.rawPValue)} | ${pValue(hypothesis.adjustedPValue)} | ${verdict} |`;
+  });
+  const lines: string[] = [
+    "# LongMemEval-S Results",
+    "",
+    "## Accuracy By Category",
+    "",
+    "| Category | System | n | Correct | Accuracy | 95% CI |",
+    "|---|---|---:|---:|---:|---|",
+    ...accuracyRows,
+    "",
+    "## Statistical Claims",
+    "",
+    "| Comparison | raw p | adjusted p | Result |",
+    "|---|---:|---:|---|",
+    ...claimRows,
+  ];
+  return `${lines.join("\n")}\n`;
+}
diff --git a/benchmark/longmemeval/src/stats.ts b/benchmark/longmemeval/src/stats.ts
new file mode 100644
index 000000000..0ff936b4e
--- /dev/null
+++ b/benchmark/longmemeval/src/stats.ts
@@ -0,0 +1,130 @@
+import type {
+  AccuracyCi,
+  CategorySystemSummary,
+  CorrectnessRow,
+  HypothesisPValue,
+  McNemarResult,
+  PairedOutcome,
+} from "./types.js";
+
+function binomialCoefficient(n: number, k: number): number {
+  const terms = Math.min(k, n - k); // C(n, k) equals C(n, n - k); use the shorter product
+  let product = 1;
+  for (let step = 1; step <= terms; step += 1) {
+    product = (product * (n - terms + step)) / step;
+  }
+  return product;
+}
+
+function binomialProbability(n: number, k: number): number {
+  return Math.pow(0.5, n) * binomialCoefficient(n, k); // P(X = k) for X ~ Binomial(n, 0.5)
+}
+
+export function mcnemarExact(pairs: PairedOutcome[]): McNemarResult {
+  // b counts pairs where only `a` is true; c counts pairs where only `b` is true.
+  const b = pairs.filter((pair) => pair.a && !pair.b).length;
+  const c = pairs.filter((pair) => !pair.a && pair.b).length;
+  const nDiscordant = b + c;
+  // With no discordant pairs the p-value stays at 1.
+  let pValue = 1;
+  if (nDiscordant > 0) {
+    // Two-sided exact test: double the Binomial(n, 0.5) tail up to the smaller count.
+    const smaller = b < c ? b : c;
+    let oneTail = 0;
+    let i = 0;
+    while (i <= smaller) {
+      oneTail += binomialProbability(nDiscordant, i);
+      i += 1;
+    }
+    const doubled = oneTail * 2;
+    pValue = doubled > 1 ? 1 : doubled;
+  }
+  return {
+    b, c, nDiscordant, pValue,
+  };
+}
+
+export function applyBonferroni<T extends HypothesisPValue>(
+  hypotheses: T[],
+  familySize: number,
+  alpha = 0.05,
+): Array<T & { adjustedPValue: number; claimed: boolean }> {
+  const corrected: Array<T & { adjustedPValue: number; claimed: boolean }> = [];
+  for (const hypothesis of hypotheses) {
+    // Bonferroni: scale the raw p-value by the family size, capped at 1.
+    const adjustedPValue = Math.min(1, hypothesis.pValue * familySize);
+    const claimed = adjustedPValue < alpha;
+    corrected.push({ ...hypothesis, adjustedPValue, claimed });
+  }
+  return corrected;
+}
+
+function mulberry32(seed: number): () => number { // seeded PRNG: returns a generator of floats in [0, 1)
+  let state = seed >>> 0; // seed coerced to an unsigned 32-bit integer
+  return () => {
+    state = (state + 0x6d2b79f5) >>> 0; // wrap to uint32 so long runs never pass 2^53 and lose low bits
+    let t = state;
+    t = Math.imul(t ^ (t >>> 15), t | 1);
+    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
+  };
+}
+
+function percentile(sorted: number[], p: number): number {
+  const count = sorted.length;
+  if (count === 0) return 0;
+  // Position is floor(p% of count), clamped into the valid index range.
+  const raw = Math.floor((p / 100) * count);
+  const clamped = raw < 0 ? 0 : raw > count - 1 ? count - 1 : raw;
+  return sorted[clamped];
+}
+
+export function bootstrapAccuracyCi(
+  rows: CorrectnessRow[],
+  options: { iterations: number; seed: number },
+): AccuracyCi {
+  const { iterations, seed } = options;
+  const total = rows.length;
+  // With no rows there is nothing to resample; report an all-zero interval.
+  if (total === 0) return { mean: 0, low: 0, high: 0, iterations, seed };
+  let observedCorrect = 0;
+  for (const row of rows) {
+    if (row.correct) observedCorrect += 1;
+  }
+  // The same seed always produces the same sequence of draws.
+  const random = mulberry32(seed);
+  const samples: number[] = [];
+  for (let round = 0; round < iterations; round += 1) {
+    // Resample `total` rows with replacement and record the resampled accuracy.
+    let hits = 0;
+    for (let draw = 0; draw < total; draw += 1) {
+      if (rows[Math.floor(random() * total)].correct) hits += 1;
+    }
+    samples.push(hits / total);
+  }
+  // Ascending numeric order, as percentile() expects.
+  samples.sort((left, right) => left - right);
+  return {
+    mean: observedCorrect / total,
+    low: percentile(samples, 2.5),
+    high: percentile(samples, 97.5),
+    iterations,
+    seed,
+  };
+}
+
+export function categoryBreakdown(
+  rows: CorrectnessRow[],
+): Record<string, Record<string, CategorySystemSummary>> {
+  const summaries: Record<string, Record<string, CategorySystemSummary>> = {};
+  for (const row of rows) {
+    const categoryKey = row.category ?? "uncategorized";
+    const systemKey = row.systemId ?? "default";
+    const bucket = summaries[categoryKey] ?? (summaries[categoryKey] = {});
+    const tally = bucket[systemKey] ?? (bucket[systemKey] = { n: 0, correct: 0, accuracy: 0 });
+    tally.n += 1;
+    if (row.correct) tally.correct += 1;
+    tally.accuracy = tally.correct / tally.n; // kept current after every row
+  }
+  return summaries;
+}
diff --git a/benchmark/longmemeval/src/systems.ts b/benchmark/longmemeval/src/systems.ts
new file mode 100644
index 000000000..cddfd635f
--- /dev/null
+++ b/benchmark/longmemeval/src/systems.ts
@@ -0,0 +1,101 @@
+import type { LongMemEvalSystemDefinition } from "./types.js";
+
+const SYSTEMS: readonly LongMemEvalSystemDefinition[] = [ // six definitions; all env values are strings
+  {
+    id: "agentmemory-baseline",
+    label: "agentmemory baseline",
+    description: "Default agentmemory installation with consolidation enabled and default retrieval weights.",
+    endpoint: "smart-search",
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "true",
+      BM25_WEIGHT: "0.4",
+      VECTOR_WEIGHT: "0.6",
+      GRAPH_EXTRACTION_ENABLED: "true",
+      RERANK_ENABLED: "false",
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "false",
+    },
+  },
+  {
+    id: "agentmemory-consolidation-off",
+    label: "agentmemory consolidation off",
+    description: "Default retrieval with consolidation disabled for regression comparison.",
+    endpoint: "smart-search",
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "false", // the only value that differs from agentmemory-baseline
+      BM25_WEIGHT: "0.4",
+      VECTOR_WEIGHT: "0.6",
+      GRAPH_EXTRACTION_ENABLED: "true",
+      RERANK_ENABLED: "false",
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "false",
+    },
+  },
+  {
+    id: "agentmemory-bm25-only",
+    label: "agentmemory BM25 only",
+    description: "Keyword-only retrieval channel for isolating lexical recall.",
+    endpoint: "smart-search",
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "true",
+      BM25_WEIGHT: "1",
+      VECTOR_WEIGHT: "0",
+      GRAPH_EXTRACTION_ENABLED: "false",
+      RERANK_ENABLED: "false",
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "false",
+    },
+  },
+  {
+    id: "agentmemory-vector-only",
+    label: "agentmemory vector only",
+    description: "Vector-only retrieval channel for isolating embedding recall.",
+    endpoint: "smart-search",
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "true",
+      BM25_WEIGHT: "0",
+      VECTOR_WEIGHT: "1",
+      GRAPH_EXTRACTION_ENABLED: "false",
+      RERANK_ENABLED: "false",
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "false",
+    },
+  },
+  {
+    id: "agentmemory-hybrid-no-rerank",
+    label: "agentmemory hybrid no rerank",
+    description: "BM25 plus vector retrieval without graph or reranker contribution.",
+    endpoint: "smart-search",
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "true",
+      BM25_WEIGHT: "0.4",
+      VECTOR_WEIGHT: "0.6",
+      GRAPH_EXTRACTION_ENABLED: "false", // the only value that differs from agentmemory-baseline
+      RERANK_ENABLED: "false",
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "false",
+    },
+  },
+  {
+    id: "agentmemory-full",
+    label: "agentmemory full",
+    description: "Full retrieval and context regime for approved provider-backed benchmark runs.",
+    endpoint: "context", // the only definition that does not use "smart-search"
+    retrievalBudgetTokens: 2500,
+    env: {
+      CONSOLIDATION_ENABLED: "true",
+      BM25_WEIGHT: "0.4",
+      VECTOR_WEIGHT: "0.6",
+      GRAPH_EXTRACTION_ENABLED: "true",
+      RERANK_ENABLED: "true", // the only definition with reranking enabled
+      AGENTMEMORY_HIGH_ORDER_CONTEXT: "true", // the only definition with high-order context enabled
+    },
+  },
+];
+
+export function getLongMemEvalSystems(): LongMemEvalSystemDefinition[] {
+  // Hand out fresh copies (including `env`) so callers cannot mutate the shared table.
+  const copies: LongMemEvalSystemDefinition[] = [];
+  for (const system of SYSTEMS) copies.push({ ...system, env: { ...system.env } });
+  return copies;
+}
diff --git a/benchmark/longmemeval/src/types.ts b/benchmark/longmemeval/src/types.ts
new file mode 100644
index 000000000..981b9bf2a
--- /dev/null
+++ b/benchmark/longmemeval/src/types.ts
@@ -0,0 +1,155 @@
+export interface LongMemEvalTurn { // one message inside a haystack session
+  role: string; // any string is accepted; the loader does not restrict role values
+  content: string;
+}
+
+export interface LongMemEvalRawRow { // one dataset row as it appears in the JSON file (snake_case keys)
+  question_id: string;
+  question_type: string;
+  question: string;
+  answer?: string; // optional; validated as a string only when present
+  answer_session_ids: string[];
+  haystack_session_ids: string[]; // parallel to haystack_sessions; the loader requires equal lengths
+  haystack_sessions: LongMemEvalTurn[][]; // outer array: sessions, inner array: turns
+}
+
+export interface LongMemEvalSession { // one haystack session after loading
+  id: string; // taken from haystack_session_ids at the same index
+  content: string; // turns flattened to "[role] content" blocks joined by blank lines
+  turns: LongMemEvalTurn[];
+}
+
+export interface LongMemEvalRow { // normalized row produced by loadLongMemEvalRows()
+  id: string; // from question_id
+  type: string; // from question_type
+  question: string;
+  answer?: string; // undefined when the raw row has no answer
+  answerSessionIds: string[]; // from answer_session_ids
+  haystack: LongMemEvalSession[]; // one entry per haystack session id, in file order
+}
+
+export interface ChecksumResult { // outcome of verifying one checksum line
+  ok: boolean; // true when actual === expected
+  expected: string; // digest from the checksum line, lower-cased
+  actual: string; // digest computed from the file at `path`
+  path: string; // path taken from the checksum line
+}
+
+export type LongMemEvalEndpoint = "smart-search" | "context"; // the two endpoint names a system definition may use
+
+export interface LongMemEvalSystemDefinition { // one benchmark system configuration, expressed purely as data
+  id: string;
+  label: string;
+  description: string;
+  endpoint: LongMemEvalEndpoint;
+  retrievalBudgetTokens: number;
+  env: Record<string, string>; // environment settings; sanitized before being written to a manifest
+}
+
+export interface PairedOutcome { // one paired observation fed to mcnemarExact()
+  a: boolean; // outcome for the first side of the comparison
+  b: boolean; // outcome for the second side of the comparison
+}
+
+export interface McNemarResult {
+  b: number; // pairs where `a` is true and `b` is false
+  c: number; // pairs where `a` is false and `b` is true
+  nDiscordant: number; // b + c
+  pValue: number; // two-sided exact p-value, capped at 1; 1 when nDiscordant is 0
+}
+
+export interface HypothesisPValue { // minimal shape accepted by applyBonferroni()
+  id: string;
+  pValue: number; // raw, uncorrected p-value
+}
+
+export interface CorrectnessRow { // one correct/incorrect outcome consumed by the statistics helpers
+  questionId: string;
+  category?: string; // grouped as "uncategorized" by categoryBreakdown() when absent
+  systemId?: string; // grouped as "default" by categoryBreakdown() when absent
+  correct: boolean;
+}
+
+export interface AccuracyCi { // result of bootstrapAccuracyCi()
+  mean: number; // observed accuracy: correct rows / all rows (0 for empty input)
+  low: number; // 2.5th percentile of the resampled accuracies
+  high: number; // 97.5th percentile of the resampled accuracies
+  iterations: number; // echoed from the options
+  seed: number; // echoed from the options
+}
+
+export interface CategorySystemSummary { // tally for one (category, system) pair
+  n: number; // rows counted
+  correct: number; // rows with correct === true
+  accuracy: number; // correct / n
+}
+
+export interface LongMemEvalModelConfig { // the only model fields copied into a manifest
+  provider: string;
+  model: string;
+  temperature: number;
+  maxTokens?: number; // copied into the manifest only when defined
+}
+
+export interface LongMemEvalManifestInput { // input to buildManifest()
+  runId: string;
+  commitSha: string;
+  packageVersion: string;
+  createdAt?: string; // buildManifest() falls back to the current ISO timestamp when omitted
+  dataset: {
+    name: string;
+    sha256: string;
+  };
+  prompts: { // full prompt texts; the manifest stores only their SHA-256 digests
+    reader: string;
+    judge: string;
+  };
+  models: {
+    reader: LongMemEvalModelConfig;
+    judge: LongMemEvalModelConfig;
+  };
+  systems: LongMemEvalSystemDefinition[];
+  metadata?: Record<string, unknown>; // sanitized: secret-looking and non-primitive entries are dropped
+}
+
+export interface LongMemEvalManifest { // output of buildManifest()
+  schemaVersion: 1; // literal type; buildManifest() always writes 1
+  runId: string;
+  createdAt: string;
+  commitSha: string;
+  packageVersion: string;
+  dataset: {
+    name: string;
+    sha256: string;
+  };
+  promptHashes: { // hex SHA-256 digests of the prompt texts
+    reader: string;
+    judge: string;
+  };
+  models: {
+    reader: LongMemEvalModelConfig;
+    judge: LongMemEvalModelConfig;
+  };
+  systems: LongMemEvalSystemDefinition[]; // env of each system has been sanitized
+  metadata?: Record<string, unknown>; // present only when at least one entry survived sanitization
+}
+
+export interface RenderCategorySummary extends CategorySystemSummary { // a tally plus the interval shown in the "95% CI" column
+  ci: {
+    low: number; // fraction, rendered as a percentage
+    high: number; // fraction, rendered as a percentage
+  };
+}
+
+export interface RenderHypothesisSummary { // one row of the "Statistical Claims" table
+  id: string; // not rendered by renderResultsTables()
+  comparison: string;
+  rawPValue: number;
+  adjustedPValue: number;
+  claimed: boolean; // rendered as "claimed" when true, "directional" when false
+}
+
+export interface ResultsTableSummary { // input to renderResultsTables()
+  categories: Record<string, Record<string, RenderCategorySummary>>; // category name -> system name -> summary
+  hypotheses: RenderHypothesisSummary[];
+}
diff --git a/docs/todos/2026-06-19-issue-313-longmemeval-harness/plan.md b/docs/todos/2026-06-19-issue-313-longmemeval-harness/plan.md
new file mode 100644
index 000000000..00be0570e
--- /dev/null
+++ b/docs/todos/2026-06-19-issue-313-longmemeval-harness/plan.md
@@ -0,0 +1,311 @@
+# Issue 313 LongMemEval Harness Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Build the first hermetic LongMemEval-S harness foundation for issue #313 without crossing provider, dataset, dependency, submodule, product API, or CI policy boundaries.
+
+**Architecture:** Add a standalone `benchmark/longmemeval/` harness that owns data validation, system-mode definitions, statistics, manifests, table rendering, docs, and a local check command. Existing retrieval-only evals remain unchanged and can be reused later only through explicit, narrow imports. Real dataset/model execution is documented as a later approved phase.
+
+**Tech Stack:** TypeScript ESM, Node built-ins (`crypto`, `fs`, `path`, `child_process`), Vitest, existing pnpm scripts, no new dependencies.
+
+---
+
+## File Structure
+
+- Create `benchmark/longmemeval/README.md`: user-facing harness contract, bounded first-slice limits, commands, output layout, deferred approvals.
+- Create `benchmark/longmemeval/Makefile`: local `check`, `smoke-ids`, `data`, `reproduce`, and `reproduce-full` targets, where the dataset/provider targets (`data`, `reproduce`, `reproduce-full`) fail with clear approval-required messages.
+- Create `benchmark/longmemeval/data/smoke-ids.txt`: deterministic 50-id smoke list for future real dataset selection, sourced from checked-in `benchmark/data/longmemeval_results_hybrid.json` `per_question` IDs so no dataset download is needed.
+- Create `benchmark/longmemeval/prompts/reader.md`: prompt text used for prompt hashing.
+- Create `benchmark/longmemeval/prompts/judge.md`: prompt text used for prompt hashing.
+- Create `benchmark/longmemeval/src/types.ts`: shared harness types.
+- Create `benchmark/longmemeval/src/data.ts`: LongMemEval row validation, checksum helpers, smoke-id selection.
+- Create `benchmark/longmemeval/src/systems.ts`: six requested system definitions as data, with no runtime side effects.
+- Create `benchmark/longmemeval/src/stats.ts`: McNemar exact, Bonferroni, seeded bootstrap, category summaries.
+- Create `benchmark/longmemeval/src/manifest.ts`: prompt hashing and manifest builder.
+- Create `benchmark/longmemeval/src/render-tables.ts`: `results_tables.md` renderer.
+- Create `benchmark/longmemeval/src/check.ts`: hermetic check entrypoint using fixture data and no provider calls.
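+- Create `test/fixtures/longmemeval/mini.json`: tiny LongMemEval-shaped fixture, alongside the malformed fixtures `invalid-turn.json`, `mismatched-sessions.json`, `missing-question-id.json`, and `non-array.json` used by the validation tests.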
+- Create `test/longmemeval-harness.test.ts`: deterministic unit tests for all new harness utilities.
+- Modify `package.json`: add `bench:longmemeval:check`.
+- Modify `README.md`: link the new harness and clarify retrieval-only versus QA/statistical harness.
+- Modify `benchmark/README.md`: mention the new harness as a reproducibility/statistics surface.
+- Modify `docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md`: record progress and verification evidence.
+
+## Task 1: Data Validation And Fixture
+
+**Files:**
+- Create: `benchmark/longmemeval/src/types.ts`
+- Create: `benchmark/longmemeval/src/data.ts`
+- Create: `test/fixtures/longmemeval/mini.json`
+- Create: `test/longmemeval-harness.test.ts`
+
+- [x] **Step 1: Write failing data tests**
+
+Add tests that import `loadLongMemEvalRows`, `selectSmokeRows`, `sha256File`, and `verifyChecksumLine`. The tests should assert:
+
+```ts
+const rows = loadLongMemEvalRows(fixturePath);
+expect(rows).toHaveLength(3);
+expect(rows[0].haystack).toHaveLength(2);
+expect(rows[0].answerSessionIds).toEqual(["s1"]);
+expect(selectSmokeRows(rows, ["q3", "q1"]).map((row) => row.id)).toEqual(["q3", "q1"]);
+expect(() => selectSmokeRows(rows, ["missing"])).toThrow(/missing smoke question ids: missing/);
+expect(verifyChecksumLine(`${actualHash}  ${fixturePath}`)).toEqual({ ok: true, expected: actualHash, actual: actualHash, path: fixturePath });
+expect(verifyChecksumLine(`0000  ${fixturePath}`).ok).toBe(false);
+```
+
+Also add validation tests that assert:
+
+```ts
+expect(() => loadLongMemEvalRows(nonArrayJsonPath)).toThrow(/expected LongMemEval JSON array/);
+expect(() => loadLongMemEvalRows(missingQuestionIdPath)).toThrow(/question_id must be a string/);
+expect(() => loadLongMemEvalRows(invalidTurnPath)).toThrow(/turn 0 role must be a string/);
+expect(() => loadLongMemEvalRows(mismatchedSessionsPath)).toThrow(/haystack_session_ids .* haystack_sessions .* length mismatch/);
+```
+
+- [x] **Step 2: Verify the tests fail**
+
+Run:
+
+```sh
+corepack pnpm exec vitest run test/longmemeval-harness.test.ts
+```
+
+Expected: fail because `benchmark/longmemeval/src/data.ts` does not exist.
+
+- [x] **Step 3: Implement data utilities**
+
+Create `types.ts` with `LongMemEvalRow`, `LongMemEvalTurn`, and `LongMemEvalRawRow`. Create `data.ts` that:
+
+- Parses JSON arrays.
+- Validates `question_id`, `question_type`, `question`, `answer_session_ids`, `haystack_session_ids`, and `haystack_sessions`.
+- Throws `LongMemEval row <id>: haystack_session_ids (...) and haystack_sessions (...) length mismatch` for mismatched session arrays.
+- Flattens turns into `[role] content` strings.
+- Computes SHA-256 with `createHash("sha256")`.
+- Verifies checksum lines in the form `<sha256><spaces><path>`.
+- Selects smoke rows in the caller-provided order and throws once with all missing ids.
+
+- [x] **Step 4: Verify the tests pass**
+
+Run the same Vitest command and expect the data-section tests in `test/longmemeval-harness.test.ts` to pass.
+
+## Task 2: System Definitions
+
+**Files:**
+- Create: `benchmark/longmemeval/src/systems.ts`
+- Modify: `test/longmemeval-harness.test.ts`
+
+- [x] **Step 1: Write failing system tests**
+
+Assert that `getLongMemEvalSystems()` returns exactly these ids in order:
+
+```ts
+[
+  "agentmemory-baseline",
+  "agentmemory-consolidation-off",
+  "agentmemory-bm25-only",
+  "agentmemory-vector-only",
+  "agentmemory-hybrid-no-rerank",
+  "agentmemory-full",
+]
+```
+
+Also assert names are unique, `agentmemory-bm25-only.env.BM25_WEIGHT === "1"`, `agentmemory-vector-only.env.VECTOR_WEIGHT === "1"`, `agentmemory-consolidation-off.env.CONSOLIDATION_ENABLED === "false"`, and no env key or value contains `KEY`, `TOKEN`, or `SECRET`.
+
+- [x] **Step 2: Verify the tests fail**
+
+Run:
+
+```sh
+corepack pnpm exec vitest run test/longmemeval-harness.test.ts
+```
+
+Expected: fail because `systems.ts` does not exist.
+
+- [x] **Step 3: Implement system definitions**
+
+Create immutable system definitions with fields `id`, `label`, `description`, `endpoint`, `retrievalBudgetTokens`, and `env`. Use explicit strings only; do not read current process env.
+
+- [x] **Step 4: Verify the tests pass**
+
+Run the same Vitest command. Expected: system tests pass with no credential-like keys or values.
+
+## Task 3: Statistical Rigor Utilities
+
+**Files:**
+- Create: `benchmark/longmemeval/src/stats.ts`
+- Modify: `test/longmemeval-harness.test.ts`
+
+- [x] **Step 1: Write failing stats tests**
+
+Add tests for:
+
+```ts
+expect(mcnemarExact([{ a: true, b: false }, { a: false, b: true }])).toMatchObject({ b: 1, c: 1, nDiscordant: 2, pValue: 1 });
+expect(mcnemarExact([{ a: true, b: true }]).pValue).toBe(1);
+expect(mcnemarExact([{ a: true, b: false }, { a: true, b: false }, { a: true, b: false }, { a: true, b: false }])).toMatchObject({ b: 4, c: 0, nDiscordant: 4, pValue: 0.125 });
+expect(applyBonferroni([{ id: "x", pValue: 0.02 }], 6)[0]).toMatchObject({ adjustedPValue: 0.12, claimed: false });
+const ci = bootstrapAccuracyCi([{ questionId: "q1", correct: true }, { questionId: "q2", correct: false }], { iterations: 200, seed: 42 });
+expect(ci.mean).toBe(0.5);
+expect(ci.low).toBeGreaterThanOrEqual(0);
+expect(ci.high).toBeLessThanOrEqual(1);
+expect(bootstrapAccuracyCi(rows, { iterations: 200, seed: 42 })).toEqual(bootstrapAccuracyCi(rows, { iterations: 200, seed: 42 }));
+expect(bootstrapAccuracyCi(rows, { iterations: 200, seed: 7 })).not.toEqual(bootstrapAccuracyCi(rows, { iterations: 200, seed: 42 }));
+expect(categoryBreakdown(rows)).toMatchObject({ "multi-session": { n: 2, correct: 1, accuracy: 0.5 } });
+```
+
+- [x] **Step 2: Verify the tests fail**
+
+Run targeted Vitest. Expected: fail because `stats.ts` does not exist.
+
+- [x] **Step 3: Implement stats utilities**
+
+Implement:
+
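+- Two-sided exact McNemar over the discordant pairs: twice the smaller binomial tail under `p=0.5`, capped at 1 (so `b=1,c=1` gives `pValue: 1` and `b=4,c=0` gives `0.125`), returning `pValue: 1` when there are zero discordant pairs.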
+- Bonferroni correction with `Math.min(1, pValue * familySize)` and `claimed = adjustedPValue < alpha`.
+- Seeded `mulberry32` bootstrap over question rows.
+- Category breakdown with `n`, `correct`, and `accuracy`.
+
+- [x] **Step 4: Verify the tests pass**
+
+Run targeted Vitest. Expected: stats tests pass deterministically.
+
+## Task 4: Manifest And Table Rendering
+
+**Files:**
+- Create: `benchmark/longmemeval/src/manifest.ts`
+- Create: `benchmark/longmemeval/src/render-tables.ts`
+- Create: `benchmark/longmemeval/prompts/reader.md`
+- Create: `benchmark/longmemeval/prompts/judge.md`
+- Modify: `test/longmemeval-harness.test.ts`
+
+- [x] **Step 1: Write failing manifest/render tests**
+
+Assert:
+
+```ts
+const manifest = buildManifest({ runId: "run_test", commitSha: "abc123", packageVersion: "0.9.28", dataset: { name: "fixture", sha256: "hash" }, prompts: { reader: "reader text", judge: "judge text" }, models: { reader: { provider: "mock", model: "mock-reader", temperature: 0 }, judge: { provider: "mock", model: "mock-judge", temperature: 0 } }, systems: getLongMemEvalSystems().slice(0, 1) });
+expect(manifest.promptHashes.reader).toMatch(/^[a-f0-9]{64}$/);
+expect(manifest.models.reader.model).toBe("mock-reader");
+expect(JSON.stringify(manifest)).not.toContain("reader text");
+expect(JSON.stringify(manifest)).not.toContain("judge text");
+expect(JSON.stringify(buildManifest({ ...safeInput, metadata: { apiKey: "sk-test", token: "secret-token", authorization: "Bearer x", harmless: "kept" } }))).not.toMatch(/sk-test|secret-token|Bearer x|apiKey|authorization/i);
+expect(renderResultsTables(summary)).toContain("| Category | System | n | Correct | Accuracy | 95% CI |");
+expect(renderResultsTables(summary)).toContain("claimed");
+expect(renderResultsTables(summary)).toContain("directional");
+```
+
+- [x] **Step 2: Verify the tests fail**
+
+Run targeted Vitest. Expected: fail because manifest/render modules do not exist.
+
+- [x] **Step 3: Implement manifest and renderer**
+
+Implement prompt hashing with SHA-256, manifest schema version `1`, ISO timestamp injection from `new Date().toISOString()` unless provided, and markdown table rendering with sample sizes, confidence intervals, and claim labels. `buildManifest` must whitelist fields, must not read `process.env`, must not serialize raw prompt text, and must redact or drop secret-like metadata keys and values.
+
+- [x] **Step 4: Verify the tests pass**
+
+Run targeted Vitest. Expected: manifest/render tests pass.
+
+## Task 5: Harness Docs And Check Command
+
+**Files:**
+- Create: `benchmark/longmemeval/README.md`
+- Create: `benchmark/longmemeval/Makefile`
+- Create: `benchmark/longmemeval/data/smoke-ids.txt`
+- Create: `benchmark/longmemeval/src/check.ts`
+- Modify: `package.json`
+- Modify: `benchmark/README.md`
+- Modify: `README.md`
+- Modify: `test/quality-gates.test.ts`
+
+- [x] **Step 1: Write failing check/docs tests**
+
+Extend `test/longmemeval-harness.test.ts` to read `smoke-ids.txt` and assert it has 50 unique ids, all ids are present in checked-in `benchmark/data/longmemeval_results_hybrid.json` `per_question`, and no network/provider environment variables are needed. Add a static import guard that reads `benchmark/longmemeval/src/check.ts` and rejects imports from `eval/runner/adapters`, `src/providers`, `src/index`, and `benchmark/longmemeval-bench.ts`. Stub `globalThis.fetch` to throw, set provider/dataset env vars to sentinel values, invoke the check entrypoint, and assert it still succeeds with fixtures only.
+
+Extend `test/quality-gates.test.ts` to assert `pkg.scripts?.["bench:longmemeval:check"] === "tsx benchmark/longmemeval/src/check.ts"` and root lint still covers `benchmark/**/*.ts`.
+
+- [x] **Step 2: Verify the tests fail**
+
+Run:
+
+```sh
+corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/quality-gates.test.ts
+```
+
+Expected: fail because the script and smoke ids do not exist.
+
+- [x] **Step 3: Add docs and command**
+
+Add:
+
+- `bench:longmemeval:check` script.
+- `Makefile` target `check` calling `cd $(ROOT) && corepack pnpm run bench:longmemeval:check`, with `ROOT := ../..` from `benchmark/longmemeval`.
+- `Makefile` target `smoke-ids` that verifies `data/smoke-ids.txt` against the checked-in retrieval result ids without downloading the dataset.
+- `Makefile` targets `data`, `reproduce`, and `reproduce-full` that print approval-required messages and exit nonzero until full scope is approved.
+- `README.md` and `benchmark/README.md` links that say the new harness foundation is for QA/statistical reproducibility and that current published LongMemEval numbers remain retrieval-only.
+- `check.ts` that validates smoke ids, fixture loading, system definitions, manifest hashing, and table rendering with no provider calls and no imports from provider/runtime surfaces.
+
+- [x] **Step 4: Verify the tests and check command pass**
+
+Run:
+
+```sh
+corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/quality-gates.test.ts
+corepack pnpm run bench:longmemeval:check
+```
+
+Expected: both pass.
+
+## Task 6: Cleanup, Final Verification, And GitHub Prep Inputs
+
+**Files:**
+- Modify: `docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md`
+
+- [x] **Step 1: Focused simplification pass**
+
+Inspect the new `benchmark/longmemeval/src/*.ts` files for duplicated local helpers, unclear names, unnecessary comments, and avoidable mutable state. Preserve public types and test contracts.
+
+- [x] **Step 2: Run targeted verification**
+
+Run:
+
+```sh
+corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts
+corepack pnpm run bench:longmemeval:check
+corepack pnpm run lint
+```
+
+Expected: all pass.
+
+- [x] **Step 3: Run broader tests when dependency state allows**
+
+Run:
+
+```sh
+corepack pnpm test
+```
+
+Expected: pass. If blocked by pnpm ignored-build hardening, follow repo instruction with `corepack pnpm install --frozen-lockfile --ignore-scripts`, then rerun.
+
+- [x] **Step 4: Run required security gates before commit if staging**
+
+Because this touches benchmark tooling, docs, and package scripts, run Semgrep and staged Gitleaks before committing. OSV is required only if dependency, lockfile, submodule, container, vendored, or third-party package surfaces changed.
+
+```sh
+semgrep scan --config p/default --error --metrics=off .
+gitleaks protect --staged --redact
+```
+
+- [x] **Step 5: Update task state and prepare GitHub handoff**
+
+Record verification evidence, residual risks, and Feature / Verification Matrix results in `todo.md`. Pass task state, plan, changed files, verification, and boundary notes into GitHub push preparation.
+
+## Self-Review
+
+Spec coverage: This plan covers the bounded Arena recommendation: separate harness foundation, no provider/data/submodule/dependency/product API/true CI boundary changes, deterministic stats and manifest tests, docs, and a local check command. It intentionally does not complete the full #313 acceptance items that require Human Checkpoints.
+
+Placeholder scan: No unresolved placeholder tokens are used as implementation instructions. Deferred full-scope items are explicitly non-goals or approval-required Make targets.
+
+Type consistency: The plan uses stable names across tasks: `LongMemEvalRow`, `getLongMemEvalSystems`, `mcnemarExact`, `applyBonferroni`, `bootstrapAccuracyCi`, `categoryBreakdown`, `buildManifest`, and `renderResultsTables`.
diff --git a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
new file mode 100644
index 000000000..7c0cd24d4
--- /dev/null
+++ b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
@@ -0,0 +1,141 @@
+# Issue 313 LongMemEval Harness Task State
+
+Task id: `2026-06-19-issue-313-longmemeval-harness`
+Scope: `agentmemory` repository, branch `issue/313-longmemeval-harness`
+GitHub issue: `https://github.com/wbugitlab1/agentmemory/issues/313`
+Plan: `docs/todos/2026-06-19-issue-313-longmemeval-harness/plan.md`
+Spec: none; source of truth is issue #313 plus the Arena synthesis for the bounded first slice.
+
+## Sprint Contract
+
+Goal: add the first, hermetic LongMemEval-S harness foundation under `benchmark/longmemeval/` so future approved work can run real LongMemEval-S QA/judge benchmarks with statistical rigor.
+
+Scope:
+
+- Add a separate `benchmark/longmemeval/` harness surface.
+- Keep existing retrieval-only `eval/` and `benchmark/longmemeval-bench.ts` behavior intact.
+- Add dependency-free TypeScript utilities for dataset validation, six system definitions, manifests, McNemar, Bonferroni, bootstrap confidence intervals, and markdown table rendering.
+- Add deterministic fixture tests and a local check target.
+- Add docs that distinguish this bounded foundation from real provider-backed/full-dataset benchmark runs.
+
+Non-goals:
+
+- No `.gitmodules` or submodule pin.
+- No new npm dependencies or lockfile changes.
+- No real LongMemEval dataset download.
+- No calls to Anthropic, OpenAI, Hugging Face, or any other provider.
+- No generated `v0.9.24` QA baseline.
+- No product REST, auth, schema, persistence, MCP, or iii-engine surface changes.
+- No real model-backed CI benchmark gate.
+
+Acceptance criteria:
+
+- Harness modules are covered by local deterministic tests.
+- Six named system definitions exist and are serializable without credentials.
+- Statistical utilities produce known outputs for small fixtures.
+- Manifest utilities record commit/package/dataset/prompt/model inputs without leaking secret values.
+- `benchmark/longmemeval/README.md` and `README.md` describe the new harness and clearly label deferred provider/data/baseline work.
+- A local check command exists for maintainers to wire into CI later after policy approval.
+
+Intended verification:
+
+- Red/green targeted Vitest for new harness tests.
+- `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts`
+- `corepack pnpm run lint`
+- `corepack pnpm test` if dependency state allows.
+- Required security gates for tooling/docs/benchmark changes before commit if staging occurs: Semgrep and staged Gitleaks; OSV only if dependency, lockfile, submodule, container, vendored, or third-party package surfaces change.
+
+Known boundaries:
+
+- Full issue acceptance crosses Human Checkpoints for CI policy, dataset/submodule policy, provider credentials/cost/data handling, and historical baseline generation.
+- This task implements the Arena-recommended bounded first slice only.
+- The initial delegation for this issue described routine branch push, PR creation, clean PR merge to `origin/main`, and post-success thread archival as in-scope after green verification. This task record is not itself authorization for later remote effects; before any remote write, re-check the current active user/developer/repo instructions, target only `origin`, and stop for Human Checkpoints, non-routine remote/project/account state changes, force-push, destructive actions, or failed/skipped/flaky/incomplete verification.
+
+Stop conditions:
+
+- Implementation requires a new dependency, submodule, provider call, dataset download, lockfile change, product API change, auth/security behavior change, persistence/schema change, or true CI gate.
+- Required verification is blocked and no targeted substitute covers the changed surface.
+- A reviewer finds an unresolved High/Medium issue that implies changed scope or boundary behavior.
+
+## Arena Synthesis
+
+Base: Candidate 3. It gives the safest implementation shape: separate `benchmark/longmemeval/` harness, existing retrieval evals left intact, hermetic CI-ready checks by default, and explicit Human Checkpoints for real dataset downloads, submodules, provider-backed model calls, and CI benchmark gates.
+
+Grafts:
+
+- Candidate 1: issue #313 is open; #65 is closed and retrieval-only; do not synthesize `v0.9.24` QA baseline from current code; apply security gates if dependencies, submodules, CI, or package-manager metadata change.
+- Candidate 2: McNemar paired counts `b`/`c`, `b+c === 0` returns `p=1`, explicit Bonferroni family size, seeded bootstrap resampling by question ID, and separate retrieval diagnostics from judged QA correctness.
+
+Rejected:
+
+- Closing #313 as duplicate of #65.
+- Rewriting existing retrieval-only evals into the new QA/statistics harness.
+- Adding REST/API controls, dependencies, submodules, real provider calls, or model-backed CI without a Human Checkpoint.
+
+Arena reports:
+
+- `/tmp/arena-longmemeval-313/candidate-1/report.md`
+- `/tmp/arena-longmemeval-313/candidate-2/report.md`
+- `/tmp/arena-longmemeval-313/candidate-3/report.md`
+- `/tmp/arena-longmemeval-313/judge/report.md`
+- `/tmp/arena-longmemeval-313/synthesis.md`
+
+## Feature / Verification Matrix
+
+| Change | Verification method | Status | Evidence |
+|---|---|---:|---|
+| Task state and plan | File inspection | Done | `todo.md` and `plan.md` created and updated after review |
+| Harness data validation | New Vitest tests | Done | RED missing module, GREEN in `test/longmemeval-harness.test.ts` |
+| Six system definitions | New Vitest tests | Done | RED missing module, GREEN in `test/longmemeval-harness.test.ts` |
+| Statistical utilities | New Vitest tests with known small cases | Done | McNemar `b=4,c=0 => p=0.125`, Bonferroni, exact bootstrap output, category tests |
+| Manifest and prompt hashing | New Vitest tests | Done | Prompt hashes, no raw prompts, metadata/model/system secret-leak tests |
+| Markdown results renderer | New Vitest tests | Done | Exact category and hypothesis row assertions |
+| Harness docs and README link | Diff review and lint where applicable | Done | `README.md`, `benchmark/README.md`, `benchmark/longmemeval/README.md`; lint passed |
+| Final verification | Targeted tests, lint, full tests when feasible | Done | After manifest redaction hardening: targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; full `corepack pnpm test` passed 208 files / 2867 tests |
+| Security gates | Semgrep and staged Gitleaks | Done | Semgrep passed with 0 findings on 944 tracked files; staged Gitleaks initially caught two synthetic test tokens, test literals were split into runtime fakes, then staged Gitleaks passed with no leaks |
+
+## Subagent Ledger
+
+| Workstream | Scope | Edits allowed | Expected output | Result | Residual risk |
+|---|---|---:|---|---|---|
+| Arena candidates | Read-only validity and feasibility reports | No | Candidate reports | Completed | Reports are advisory; main agent owns synthesis |
+| Arena judge | Read-only scoring of candidate reports | No | Cross-judge report | Completed | Main agent owns final decision |
+| Plan review | Plan/spec-risk review before implementation | No | High/Medium findings or ACCEPT | Completed | Findings triaged and plan updated before code edits |
+| Implementation | Task-owned harness files and tests | Yes, after plan review | Bounded harness implementation | Completed | Main agent integrated changes and ran verification |
+| Final security review | Current diff | No | ACCEPT or High/Medium findings | Completed | No High/Medium security findings; smoke-id ignored file fixed |
+| Final test coverage review | Current diff and tests | No | ACCEPT or High/Medium findings | Completed | High manifest leak and Medium test gaps fixed |
+| Final maintainability review | Current diff and docs | No | ACCEPT or High/Medium findings | Completed | Smoke-id tracking, manifest whitelist, and task-state staleness fixed |
+
+## Progress Notes
+
+- 2026-06-19: Created branch `issue/313-longmemeval-harness` from detached HEAD `8bf46f74b5e114938c45b4cf3a7bfcedd790a176`.
+- 2026-06-19: Arena completed. Consensus: issue #313 is valid and not duplicate/stale, but full acceptance crosses Human Checkpoints.
+- 2026-06-19: User requested `$github-feature-loop`; proceeding with the bounded Arena recommendation.
+- 2026-06-19: Pre-implementation review found weak data/stat/manifest/check tests and Makefile target mismatch. Plan updated before code edits.
+- 2026-06-19: Implemented bounded harness foundation with TDD red/green cycles for data validation, systems, stats, manifests, renderer, check command, smoke IDs, docs, and package script.
+- 2026-06-19: Verification before final review: targeted `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 42 tests; `corepack pnpm run bench:longmemeval:check` passed; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2866 tests.
+- 2026-06-19: Final reviewers found ignored smoke IDs, manifest spread leak risk, and weak bootstrap/render assertions. Added `.gitignore` exception, field-by-field manifest serialization, and stronger tests. Harness test passed 18 tests after fixes.
+- 2026-06-19: Post-fix verification: `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
+- 2026-06-19: Test coverage reviewer found manifest redaction did not cover `password`/`credential` keys or common standalone token formats. Broadened harness manifest redaction and tests using repo privacy patterns; targeted `corepack pnpm exec vitest run test/longmemeval-harness.test.ts` passed 1 file / 18 tests.
+- 2026-06-19: Final post-redaction verification: `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
+- 2026-06-19: Staged security gates: Semgrep passed with 0 findings on 944 tracked files. `gitleaks protect --staged --redact` initially flagged two synthetic redaction-test literals in `test/longmemeval-harness.test.ts`; split those fakes into runtime-composed values, reran targeted harness test successfully, then staged Gitleaks passed with no leaks.
+- 2026-06-19: Final post-Gitleaks-fix verification: Semgrep passed with 0 findings; `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
+
+## Review Triage
+
+| Finding | Classification | Action |
+|---|---|---|
+| Remote push/PR/merge/archive authorization conflicts with generic approval gate | Clarified | The original issue delegation is historical context, not a standing authorization token in task state. Any remote write or archive action must be checked against the current active user/developer/repo instructions and must stop for Human Checkpoints, non-routine remote effects, force-push, destructive actions, or failed/skipped/flaky/incomplete verification. |
+| Data validation tests too happy-path heavy | Fixed in plan | Added malformed JSON, non-array JSON, missing field, invalid turn, and session-length mismatch tests. |
+| Stats assertions do not catch always-`1` McNemar bug | Fixed in plan | Added asymmetric `b=4,c=0 => p=0.125` and repeatable seeded bootstrap assertions. |
+| Manifest tests do not prove no prompt/secret leakage | Fixed in plan | Added whitelist-only manifest behavior, secret-like field/value redaction assertions, no raw prompt serialization, and no `process.env` reads. |
+| Makefile target mismatch and root command resolution | Fixed in plan | Added explicit `smoke-ids` target and `ROOT := ../..` root-script invocation. |
+| Smoke IDs lack hermetic provenance | Fixed in plan | Use checked-in `benchmark/data/longmemeval_results_hybrid.json` `per_question` IDs, with fallback to clearly labeled fixture-only placeholders if that file is unavailable during implementation. |
+| Hermetic check command lacks proof | Fixed in plan | Added tests stubbing `globalThis.fetch`, setting provider/dataset env vars, and static import guards against provider/runtime modules. |
+| `smoke-ids.txt` ignored by root `data/` ignore | Fixed | Added narrow `.gitignore` exception and confirmed `git status` shows the file as untracked/trackable. |
+| Manifest model/system spread can serialize extra secret fields | Fixed | Added failing test, changed manifest building to whitelist model/system fields and sanitize secret-like env entries. |
+| Bootstrap and render-table tests too weak | Fixed | Added exact bootstrap output and exact markdown row assertions. |
+| Task-state remote authorization wording too broad | Fixed | Reworded the delegation note so task state records context but is not a standing authorization token for later remote effects. |
+| Task-state final verification stale after fixes | Fixed | Updated matrix and progress notes with the fresh post-fix targeted, check, lint, and full-test evidence. |
+| Manifest redaction too narrow for stated no-secret contract | Fixed | Mirrored repo privacy patterns for sensitive keys and common standalone secret values, and added tests for `password`, `credential`, GitHub PAT, AWS key, JWT, npm token, GitLab token, and long token assignment forms. |
+| Synthetic redaction-test tokens tripped staged Gitleaks | Fixed | Runtime-compose fake token values in tests so redaction contracts still execute without committing scanner-matching literals. |
diff --git a/package.json b/package.json
index 7639aaf19..aedbf909a 100644
--- a/package.json
+++ b/package.json
@@ -35,6 +35,7 @@
     "skills:gen": "tsx scripts/skills/generate.ts",
     "skills:check": "tsx scripts/skills/generate.ts --check && tsx scripts/skills/check.ts",
     "bench:load": "node --import tsx benchmark/load-100k.ts",
+    "bench:longmemeval:check": "tsx benchmark/longmemeval/src/check.ts",
     "eval:longmemeval": "tsx eval/runner/longmemeval.ts",
     "eval:coding-life": "tsx eval/runner/coding-life.ts"
   },
diff --git a/test/fixtures/longmemeval/invalid-turn.json b/test/fixtures/longmemeval/invalid-turn.json
new file mode 100644
index 000000000..49a8b5152
--- /dev/null
+++ b/test/fixtures/longmemeval/invalid-turn.json
@@ -0,0 +1,14 @@
+[
+  {
+    "question_id": "q_bad_turn",
+    "question_type": "single-session-user",
+    "question": "Bad turn?",
+    "answer_session_ids": ["s1"],
+    "haystack_session_ids": ["s1"],
+    "haystack_sessions": [
+      [
+        { "role": 12, "content": "content" }
+      ]
+    ]
+  }
+]
diff --git a/test/fixtures/longmemeval/mini.json b/test/fixtures/longmemeval/mini.json
new file mode 100644
index 000000000..7dff4a1a1
--- /dev/null
+++ b/test/fixtures/longmemeval/mini.json
@@ -0,0 +1,48 @@
+[
+  {
+    "question_id": "q1",
+    "question_type": "single-session-user",
+    "question": "Which project did Mara choose for the launch demo?",
+    "answer": "Mara chose Atlas.",
+    "answer_session_ids": ["s1"],
+    "haystack_session_ids": ["s1", "s2"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "For the launch demo, I want to use Atlas." },
+        { "role": "assistant", "content": "Noted: Atlas is the launch demo project." }
+      ],
+      [
+        { "role": "user", "content": "The billing refactor can wait." }
+      ]
+    ]
+  },
+  {
+    "question_id": "q2",
+    "question_type": "multi-session",
+    "question": "What snack should be ordered for the workshop?",
+    "answer": "Order pistachios.",
+    "answer_session_ids": ["s3"],
+    "haystack_session_ids": ["s3"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "For Friday's workshop, order pistachios." }
+      ]
+    ]
+  },
+  {
+    "question_id": "q3",
+    "question_type": "temporal-reasoning",
+    "question": "Which color was chosen after the redesign?",
+    "answer": "Green.",
+    "answer_session_ids": ["s5"],
+    "haystack_session_ids": ["s4", "s5"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "The old dashboard color was blue." }
+      ],
+      [
+        { "role": "user", "content": "After the redesign, switch the dashboard to green." }
+      ]
+    ]
+  }
+]
diff --git a/test/fixtures/longmemeval/mismatched-sessions.json b/test/fixtures/longmemeval/mismatched-sessions.json
new file mode 100644
index 000000000..ba370ce46
--- /dev/null
+++ b/test/fixtures/longmemeval/mismatched-sessions.json
@@ -0,0 +1,14 @@
+[
+  {
+    "question_id": "q_mismatch",
+    "question_type": "single-session-user",
+    "question": "Mismatch?",
+    "answer_session_ids": ["s1"],
+    "haystack_session_ids": ["s1", "s2"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "content" }
+      ]
+    ]
+  }
+]
diff --git a/test/fixtures/longmemeval/missing-question-id.json b/test/fixtures/longmemeval/missing-question-id.json
new file mode 100644
index 000000000..01eb0e923
--- /dev/null
+++ b/test/fixtures/longmemeval/missing-question-id.json
@@ -0,0 +1,13 @@
+[
+  {
+    "question_type": "single-session-user",
+    "question": "Missing id?",
+    "answer_session_ids": ["s1"],
+    "haystack_session_ids": ["s1"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "content" }
+      ]
+    ]
+  }
+]
diff --git a/test/fixtures/longmemeval/non-array.json b/test/fixtures/longmemeval/non-array.json
new file mode 100644
index 000000000..360a58755
--- /dev/null
+++ b/test/fixtures/longmemeval/non-array.json
@@ -0,0 +1,3 @@
+{
+  "question_id": "q_bad"
+}
diff --git a/test/longmemeval-harness.test.ts b/test/longmemeval-harness.test.ts
new file mode 100644
index 000000000..7bc9deeb8
--- /dev/null
+++ b/test/longmemeval-harness.test.ts
@@ -0,0 +1,404 @@
+import { readFileSync } from "node:fs";
+import { resolve } from "node:path";
+import { describe, expect, it } from "vitest";
+import {
+  loadLongMemEvalRows,
+  selectSmokeRows,
+  sha256File,
+  verifyChecksumLine,
+} from "../benchmark/longmemeval/src/data.js";
+import {
+  applyBonferroni,
+  bootstrapAccuracyCi,
+  categoryBreakdown,
+  mcnemarExact,
+} from "../benchmark/longmemeval/src/stats.js";
+import { buildManifest } from "../benchmark/longmemeval/src/manifest.js";
+import { renderResultsTables } from "../benchmark/longmemeval/src/render-tables.js";
+import { runLongMemEvalCheck } from "../benchmark/longmemeval/src/check.js";
+import { getLongMemEvalSystems } from "../benchmark/longmemeval/src/systems.js";
+
+const FIXTURE_DIR = resolve(__dirname, "fixtures", "longmemeval");
+const fixturePath = resolve(FIXTURE_DIR, "mini.json");
+const smokeIdsPath = resolve(__dirname, "..", "benchmark", "longmemeval", "data", "smoke-ids.txt");
+const hybridResultsPath = resolve(
+  __dirname,
+  "..",
+  "benchmark",
+  "data",
+  "longmemeval_results_hybrid.json",
+);
+
+describe("LongMemEval harness data utilities", () => {
+  it("loads LongMemEval rows and flattens sessions", () => {
+    const rows = loadLongMemEvalRows(fixturePath);
+
+    expect(rows).toHaveLength(3);
+    expect(rows[0].haystack).toHaveLength(2);
+    expect(rows[0].answerSessionIds).toEqual(["s1"]);
+    expect(rows[0].haystack[0].content).toContain("[user] For the launch demo");
+  });
+
+  it("selects smoke rows in requested order and reports missing ids together", () => {
+    const rows = loadLongMemEvalRows(fixturePath);
+    // Ids are requested out of fixture order: the result must follow the request.
+    const pickedIds = selectSmokeRows(rows, ["q3", "q1"]).map(({ id }) => id);
+    const pickUnknown = () => selectSmokeRows(rows, ["missing", "also-missing"]);
+
+    expect(pickedIds).toEqual(["q3", "q1"]);
+    expect(pickUnknown).toThrow(/missing smoke question ids: missing, also-missing/);
+  });
+
+  it("computes and verifies checksum lines", () => {
+    // Checksum lines here are "<hash>  <path>", with two spaces between the fields.
+    const checksumLine = (hash: string) => `${hash}  ${fixturePath}`;
+    const digest = sha256File(fixturePath);
+
+    const matching = verifyChecksumLine(checksumLine(digest));
+    expect(matching).toEqual({ ok: true, expected: digest, actual: digest, path: fixturePath });
+
+    const mismatching = verifyChecksumLine(checksumLine("0000"));
+    expect(mismatching.ok).toBe(false);
+  });
+
+  it("rejects malformed LongMemEval inputs", () => {
+    // Each broken fixture is paired with the validation message it is expected to raise.
+    const rejections: Array<[string, RegExp]> = [
+      ["non-array.json", /expected LongMemEval JSON array/],
+      ["missing-question-id.json", /question_id must be a string/],
+      ["invalid-turn.json", /turn 0 role must be a string/],
+      ["mismatched-sessions.json", /haystack_session_ids .* haystack_sessions .* length mismatch/],
+    ];
+
+    for (const [fixture, message] of rejections) {
+      const load = () => loadLongMemEvalRows(resolve(FIXTURE_DIR, fixture));
+      expect(load).toThrow(message);
+    }
+  });
+
+  it("keeps fixture JSON readable for static test diagnostics", () => {
+    expect(readFileSync(fixturePath, "utf8")).toContain("\"question_id\": \"q1\"");
+  });
+});
+
+describe("LongMemEval harness system definitions", () => {
+  it("defines the six issue 313 systems in stable order", () => {
+    expect(getLongMemEvalSystems().map((system) => system.id)).toEqual([
+      "agentmemory-baseline",
+      "agentmemory-consolidation-off",
+      "agentmemory-bm25-only",
+      "agentmemory-vector-only",
+      "agentmemory-hybrid-no-rerank",
+      "agentmemory-full",
+    ]);
+  });
+
+  it("uses unique labels and side-effect-free environment overrides", () => {
+    const systems = getLongMemEvalSystems();
+    // An unknown id yields undefined, which the expectations below then reject.
+    const envOf = (id: string) => systems.find((system) => system.id === id)?.env;
+    const bm25Env = envOf("agentmemory-bm25-only");
+    const vectorEnv = envOf("agentmemory-vector-only");
+    const consolidationOffEnv = envOf("agentmemory-consolidation-off");
+    const distinctLabels = new Set(systems.map(({ label }) => label));
+
+    expect(distinctLabels.size).toBe(systems.length);
+    expect(bm25Env?.BM25_WEIGHT).toBe("1");
+    expect(bm25Env?.VECTOR_WEIGHT).toBe("0");
+    expect(vectorEnv?.BM25_WEIGHT).toBe("0");
+    expect(vectorEnv?.VECTOR_WEIGHT).toBe("1");
+    expect(consolidationOffEnv?.CONSOLIDATION_ENABLED).toBe("false");
+
+    for (const { retrievalBudgetTokens, env } of systems) {
+      expect(retrievalBudgetTokens).toBeGreaterThan(0);
+      for (const [name, setting] of Object.entries(env)) {
+        expect(`${name}=${setting}`).not.toMatch(/KEY|TOKEN|SECRET/i);
+      }
+    }
+  });
+});
+
+describe("LongMemEval harness statistics", () => {
+  const judgedRows = [
+    { questionId: "q1", category: "multi-session", systemId: "a", correct: true },
+    { questionId: "q2", category: "multi-session", systemId: "a", correct: false },
+    { questionId: "q3", category: "single-session-user", systemId: "a", correct: true },
+    { questionId: "q4", category: "single-session-user", systemId: "a", correct: true },
+  ];
+
+  it("computes exact McNemar p-values from paired discordant outcomes", () => {
+    const onlyA = { a: true, b: false };
+    const onlyB = { a: false, b: true };
+    const bothTrue = { a: true, b: true };
+
+    // One discordant pair in each direction.
+    const balanced = mcnemarExact([onlyA, onlyB]);
+    expect(balanced).toMatchObject({ b: 1, c: 1, nDiscordant: 2, pValue: 1 });
+
+    // A single concordant pair, so nothing is discordant.
+    const concordant = mcnemarExact([bothTrue]);
+    expect(concordant.pValue).toBe(1);
+
+    // Four pairs that all disagree in the same direction (a true, b false).
+    const oneSided = mcnemarExact(Array.from({ length: 4 }, () => ({ ...onlyA })));
+    expect(oneSided).toMatchObject({ b: 4, c: 0, nDiscordant: 4, pValue: 0.125 });
+  });
+
+  it("applies Bonferroni correction and claim labels", () => {
+    expect(applyBonferroni([{ id: "x", pValue: 0.02 }], 6)[0]).toMatchObject({
+      adjustedPValue: 0.12,
+      claimed: false,
+    });
+    expect(applyBonferroni([{ id: "x", pValue: 0.001 }], 6)[0]).toMatchObject({
+      adjustedPValue: 0.006,
+      claimed: true,
+    });
+  });
+
+  it("bootstraps accuracy deterministically with a seed", () => {
+    const ci = bootstrapAccuracyCi(
+      [
+        { questionId: "q1", correct: true },
+        { questionId: "q2", correct: false },
+      ],
+      { iterations: 200, seed: 42 },
+    );
+
+    expect(ci.mean).toBe(0.5);
+    expect(ci.low).toBeGreaterThanOrEqual(0);
+    expect(ci.high).toBeLessThanOrEqual(1);
+    expect(bootstrapAccuracyCi(judgedRows, { iterations: 200, seed: 42 })).toEqual({
+      mean: 0.75,
+      low: 0.25,
+      high: 1,
+      iterations: 200,
+      seed: 42,
+    });
+  });
+
+  it("summarizes judged rows by category and system", () => {
+    expect(categoryBreakdown(judgedRows)).toMatchObject({
+      "multi-session": {
+        a: { n: 2, correct: 1, accuracy: 0.5 },
+      },
+      "single-session-user": {
+        a: { n: 2, correct: 2, accuracy: 1 },
+      },
+    });
+  });
+});
+
+describe("LongMemEval harness manifest and table rendering", () => {
+  const fakeGithubPat = ["ghp", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJ"].join("_");
+  const fakeAwsKey = ["AKIA", "1234567890ABCDEF"].join("");
+  const fakeJwt = ["eyJaaaaaaaaaa", "bbbbbbbbbb", "cccccccccc"].join(".");
+  const fakeNpmToken = ["npm", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJ"].join("_");
+  const fakeGitLabToken = ["glpat", "abcdefghijklmnopqrst"].join("-");
+  const fakeGithubFineGrainedPat = ["github", "pat", "abcdefghijklmnopqrstuvwxyz"].join("_");
+  const fakeLongAssignment = ["token", "abcdefghijklmnopqrstuvwxyz1234567890"].join("=");
+
+  const safeManifestInput = {
+    runId: "run_test",
+    commitSha: "abc123",
+    packageVersion: "0.9.28",
+    createdAt: "2026-06-19T00:00:00.000Z",
+    dataset: { name: "fixture", sha256: "hash" },
+    prompts: { reader: "reader text", judge: "judge text" },
+    models: {
+      reader: { provider: "mock", model: "mock-reader", temperature: 0 },
+      judge: { provider: "mock", model: "mock-judge", temperature: 0 },
+    },
+    systems: getLongMemEvalSystems().slice(0, 1),
+  };
+
+  it("builds a manifest from whitelisted fields and prompt hashes", () => {
+    const manifest = buildManifest(safeManifestInput);
+
+    expect(manifest.schemaVersion).toBe(1);
+    expect(manifest.runId).toBe("run_test");
+    expect(manifest.commitSha).toBe("abc123");
+    expect(manifest.promptHashes.reader).toMatch(/^[a-f0-9]{64}$/);
+    expect(manifest.models.reader.model).toBe("mock-reader");
+    expect(JSON.stringify(manifest)).not.toContain("reader text");
+    expect(JSON.stringify(manifest)).not.toContain("judge text");
+  });
+
+  it("drops secret-like metadata keys and values from manifests", () => {
+    const serialized = JSON.stringify(
+      buildManifest({
+        ...safeManifestInput,
+        metadata: {
+          apiKey: "sk-test",
+          token: "secret-token",
+          authorization: "Bearer x",
+          password: "plain-password",
+          credential: "plain-credential",
+          harmlessGithubPat: fakeGithubPat,
+          harmlessAwsKey: fakeAwsKey,
+          harmlessJwt: fakeJwt,
+          harmlessNpmToken: fakeNpmToken,
+          harmlessGitLabToken: fakeGitLabToken,
+          harmlessAssignment: fakeLongAssignment,
+          harmless: "kept",
+        },
+      }),
+    );
+
+    expect(serialized).toContain("kept");
+    for (const forbidden of [
+      "sk-test",
+      "secret-token",
+      "Bearer x",
+      "apiKey",
+      "authorization",
+      "password",
+      "credential",
+      fakeGithubPat,
+      fakeAwsKey,
+      fakeJwt,
+      fakeNpmToken,
+      fakeGitLabToken,
+      fakeLongAssignment,
+    ]) {
+      expect(serialized).not.toContain(forbidden);
+    }
+  });
+
+  it("whitelists model and system fields before serializing manifests", () => {
+    const leakyInput = {
+      ...safeManifestInput,
+      models: {
+        reader: { ...safeManifestInput.models.reader, apiKey: "sk-model" },
+        judge: { ...safeManifestInput.models.judge, authorization: "Bearer model" },
+      },
+      systems: [
+        {
+          ...getLongMemEvalSystems()[0],
+          apiKey: "sk-system",
+          env: {
+            ...getLongMemEvalSystems()[0].env,
+            OPENAI_API_KEY: "sk-env",
+            RANDOM_VALUE: fakeGithubFineGrainedPat,
+            PASSWORD: "plain-password",
+            HARMLESS_FLAG: "kept",
+          },
+        },
+      ],
+    };
+
+    const serialized = JSON.stringify(buildManifest(leakyInput));
+
+    expect(serialized).toContain("HARMLESS_FLAG");
+    expect(serialized).toContain("kept");
+    for (const forbidden of [
+      "sk-model",
+      "Bearer model",
+      "sk-system",
+      "sk-env",
+      "apiKey",
+      "authorization",
+      "OPENAI_API_KEY",
+      fakeGithubFineGrainedPat,
+      "PASSWORD",
+      "plain-password",
+    ]) {
+      expect(serialized).not.toContain(forbidden);
+    }
+  });
+
+  it("renders accuracy and hypothesis tables with claim labels", () => {
+    const markdown = renderResultsTables({
+      categories: {
+        "multi-session": {
+          "agentmemory-baseline": {
+            n: 2,
+            correct: 1,
+            accuracy: 0.5,
+            ci: { low: 0.25, high: 0.75 },
+          },
+        },
+      },
+      hypotheses: [
+        {
+          id: "baseline-vs-full",
+          comparison: "baseline vs full",
+          rawPValue: 0.001,
+          adjustedPValue: 0.006,
+          claimed: true,
+        },
+        {
+          id: "baseline-vs-hybrid",
+          comparison: "baseline vs hybrid",
+          rawPValue: 0.04,
+          adjustedPValue: 0.24,
+          claimed: false,
+        },
+      ],
+    });
+
+    expect(markdown).toContain("| Category | System | n | Correct | Accuracy | 95% CI |");
+    expect(markdown).toContain(
+      "| multi-session | agentmemory-baseline | 2 | 1 | 50.0% | 25.0% - 75.0% |",
+    );
+    expect(markdown).toContain("| baseline vs full | 0.00100 | 0.00600 | claimed |");
+    expect(markdown).toContain("| baseline vs hybrid | 0.0400 | 0.240 | directional |");
+    expect(markdown).toContain("claimed");
+    expect(markdown).toContain("directional");
+  });
+});
+
+describe("LongMemEval harness local check", () => {
+  it("keeps smoke ids unique and sourced from checked-in retrieval results", () => {
+    type HybridResults = { per_question: Array<{ question_id: string }> };
+    const readText = (path: string) => readFileSync(path, "utf8");
+
+    // One id per line; surrounding whitespace and blank lines are ignored.
+    const idLines = readText(smokeIdsPath).split(/\r?\n/);
+    const smokeIds = idLines.map((line) => line.trim()).filter((line) => line.length > 0);
+    const { per_question: perQuestion } = JSON.parse(readText(hybridResultsPath)) as HybridResults;
+    const knownIds = new Set(perQuestion.map(({ question_id }) => question_id));
+
+    expect(smokeIds).toHaveLength(50);
+    expect(new Set(smokeIds).size).toBe(50);
+    smokeIds.forEach((id) => {
+      expect(knownIds.has(id), `${id} should come from checked-in hybrid results`).toBe(true);
+    });
+  });
+
+  it("does not import provider, runtime, or legacy benchmark entrypoints", () => {
+    // Static guard: inspect check.ts as plain text for forbidden import paths.
+    const checkPath = resolve(__dirname, "..", "benchmark", "longmemeval", "src", "check.ts");
+    const checkSource = readFileSync(checkPath, "utf8");
+    const forbiddenPaths =
+      /eval\/runner\/adapters|src\/providers|src\/index|longmemeval-bench/;
+    expect(checkSource).not.toMatch(forbiddenPaths);
+  });
+
+  it("runs the local check without network or provider environment", async () => {
+    // Decoy provider keys and dataset path; the check is expected to pass regardless.
+    const decoyEnv: Record<string, string> = {
+      OPENAI_API_KEY: "sk-test",
+      ANTHROPIC_API_KEY: "secret-token",
+      LONGMEMEVAL_PATH: "/should/not/be/read",
+    };
+    const savedFetch = globalThis.fetch;
+    const savedEnv = Object.keys(decoyEnv).map((name) => [name, process.env[name]] as const);
+    globalThis.fetch = async () => {
+      throw new Error("network must not be used");
+    };
+    Object.assign(process.env, decoyEnv);
+    try {
+      await expect(runLongMemEvalCheck()).resolves.toMatchObject({
+        ok: true,
+        fixtureRows: 3,
+        systems: 6,
+        smokeIds: 50,
+      });
+    } finally {
+      globalThis.fetch = savedFetch;
+      for (const [name, previous] of savedEnv) {
+        if (previous === undefined) delete process.env[name];
+        else process.env[name] = previous;
+      }
+    }
+  });
+});
diff --git a/test/quality-gates.test.ts b/test/quality-gates.test.ts
index 32fdbd11b..1de6e8cc4 100644
--- a/test/quality-gates.test.ts
+++ b/test/quality-gates.test.ts
@@ -108,6 +108,9 @@ describe("root quality gates", () => {
     expect(pkg.scripts?.["coverage:cli-hooks"]).toBe(
       "vitest run --coverage --config vitest.cli-hooks.config.ts",
     );
+    expect(pkg.scripts?.["bench:longmemeval:check"]).toBe(
+      "tsx benchmark/longmemeval/src/check.ts",
+    );
   });
 
   it("pins the root lint and coverage dev tools", () => {

From e2d04de666522245b54838cb48992ab31375736b Mon Sep 17 00:00:00 2001
From: Willi Budzinski <w.budzinski@telekom.de>
Date: Fri, 19 Jun 2026 19:08:16 +0200
Subject: [PATCH 2/4] docs: record longmemeval verification evidence

---
 docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
index 7c0cd24d4..aa11d7a95 100644
--- a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
+++ b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
@@ -93,6 +93,7 @@ Arena reports:
 | Harness docs and README link | Diff review and lint where applicable | Done | `README.md`, `benchmark/README.md`, `benchmark/longmemeval/README.md`; lint passed |
 | Final verification | Targeted tests, lint, full tests when feasible | Done | After manifest redaction hardening: targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; full `corepack pnpm test` passed 208 files / 2867 tests |
 | Security gates | Semgrep and staged Gitleaks | Done | Semgrep passed with 0 findings on 944 tracked files; staged Gitleaks initially caught two synthetic test tokens, test literals were split into runtime fakes, then staged Gitleaks passed with no leaks |
+| Post-base-merge verification | Targeted tests, lint, full tests | Done | After merging captured `origin/main` base `67bb438b4158d74771ed285e06c9ac078985d603`, targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; isolated `test/codex-sdk-provider.test.ts` passed after one full-suite timeout; full rerun passed 212 files / 2920 tests |
 
 ## Subagent Ledger
 
@@ -120,6 +121,8 @@ Arena reports:
 - 2026-06-19: Final post-redaction verification: `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
 - 2026-06-19: Staged security gates: Semgrep passed with 0 findings on 944 tracked files. `gitleaks protect --staged --redact` initially flagged two synthetic redaction-test literals in `test/longmemeval-harness.test.ts`; split those fakes into runtime-composed values, reran targeted harness test successfully, then staged Gitleaks passed with no leaks.
 - 2026-06-19: Final post-Gitleaks-fix verification: Semgrep passed with 0 findings; `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
+- 2026-06-19: Merged captured PR base `67bb438b4158d74771ed285e06c9ac078985d603` into the issue branch. Merge was conflict-free and the PR diff against that base still contains only the 25 issue #313 files.
+- 2026-06-19: Post-base-merge verification: targeted `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed. First `corepack pnpm test` run after base merge failed once in `test/codex-sdk-provider.test.ts` on a 2000ms Codex CLI timeout; isolated `corepack pnpm exec vitest run test/codex-sdk-provider.test.ts` then passed 1 file / 3 tests, and a full `corepack pnpm test` rerun passed 212 files / 2920 tests.
 
 ## Review Triage
 

From c46c96730c9943894d858f9d217003c09fc99ef6 Mon Sep 17 00:00:00 2001
From: Willi Budzinski <w.budzinski@telekom.de>
Date: Sat, 20 Jun 2026 04:01:48 +0200
Subject: [PATCH 3/4] test: relax codex sdk fake cli timeout

---
 .../todo.md                                       |  5 +++++
 test/codex-sdk-provider.test.ts                   | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
index aa11d7a95..93f3878c1 100644
--- a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
+++ b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
@@ -94,6 +94,7 @@ Arena reports:
 | Final verification | Targeted tests, lint, full tests when feasible | Done | After manifest redaction hardening: targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; full `corepack pnpm test` passed 208 files / 2867 tests |
 | Security gates | Semgrep and staged Gitleaks | Done | Semgrep passed with 0 findings on 944 tracked files; staged Gitleaks initially caught two synthetic test tokens, test literals were split into runtime fakes, then staged Gitleaks passed with no leaks |
 | Post-base-merge verification | Targeted tests, lint, full tests | Done | After merging captured `origin/main` base `67bb438b4158d74771ed285e06c9ac078985d603`, targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; isolated `test/codex-sdk-provider.test.ts` passed after one full-suite timeout; full rerun passed 212 files / 2920 tests |
+| Latest-base suite robustness | Full-suite reproduction and approved narrow test fix | Done | After merging latest `origin/main` base `24ff6779f5618d0f07039161be4f750252747f31`, full suite reproduced a 2000ms fake Codex CLI timeout in `test/codex-sdk-provider.test.ts`; user approved a narrow test-only robustness fix; isolated Codex SDK test, issue-targeted Vitest, harness check, lint, and full `corepack pnpm test` passed |
 
 ## Subagent Ledger
 
@@ -123,6 +124,9 @@ Arena reports:
 - 2026-06-19: Final post-Gitleaks-fix verification: Semgrep passed with 0 findings; `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 208 files / 2867 tests.
 - 2026-06-19: Merged captured PR base `67bb438b4158d74771ed285e06c9ac078985d603` into the issue branch. Merge was conflict-free and the PR diff against that base still contains only the 25 issue #313 files.
 - 2026-06-19: Post-base-merge verification: targeted `corepack pnpm exec vitest run test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed. First `corepack pnpm test` run after base merge failed once in `test/codex-sdk-provider.test.ts` on a 2000ms Codex CLI timeout; isolated `corepack pnpm exec vitest run test/codex-sdk-provider.test.ts` then passed 1 file / 3 tests, and a full `corepack pnpm test` rerun passed 212 files / 2920 tests.
+- 2026-06-20: PR #1024 became behind `origin/main`; merged latest base `24ff6779f5618d0f07039161be4f750252747f31` into the issue branch. The PR diff against that base still contains only issue #313 files before the approved test robustness fix.
+- 2026-06-20: Full `corepack pnpm test` after latest base merge reproduced the Codex SDK provider fake-CLI timeout under full-suite load, while isolated `corepack pnpm exec vitest run test/codex-sdk-provider.test.ts` passed. Human Checkpoint: user approved option 1, a narrow out-of-scope test robustness fix.
+- 2026-06-20: Raised the fake Codex CLI test timeout from 2000ms to a named 10000ms constant without changing provider code. Verification passed: isolated Codex SDK provider test 1 file / 3 tests; issue-targeted Vitest 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check`; `corepack pnpm run lint`; full `corepack pnpm test` 212 files / 2922 tests.
 
 ## Review Triage
 
@@ -142,3 +146,4 @@ Arena reports:
 | Task-state final verification stale after fixes | Fixed | Updated matrix and progress notes with the fresh post-fix targeted, check, lint, and full-test evidence. |
 | Manifest redaction too narrow for stated no-secret contract | Fixed | Mirrored repo privacy patterns for sensitive keys and common standalone secret values, and added tests for `password`, `credential`, GitHub PAT, AWS key, JWT, npm token, GitLab token, and long token assignment forms. |
 | Synthetic redaction-test tokens tripped staged Gitleaks | Fixed | Runtime-compose fake token values in tests so redaction contracts still execute without committing scanner-matching literals. |
+| Latest-base full-suite timeout in Codex SDK provider tests | Fixed after Human Checkpoint | User approved a narrow test-only robustness fix; kept product behavior unchanged and raised only the fake CLI test timeout to avoid load-sensitive full-suite failures. |
diff --git a/test/codex-sdk-provider.test.ts b/test/codex-sdk-provider.test.ts
index c8102c47f..38c013302 100644
--- a/test/codex-sdk-provider.test.ts
+++ b/test/codex-sdk-provider.test.ts
@@ -7,6 +7,7 @@ import { afterEach, describe, expect, it } from "vitest";
 import { CodexSDKProvider } from "../src/providers/codex-sdk.js";
 
 let tempDirs: string[] = [];
+const FAKE_CODEX_TIMEOUT_MS = 10_000; // ms; was 2_000, which timed out under full-suite load
 
 function makeFakeCodex(body: string): string {
   const dir = mkdtempSync(join(tmpdir(), "agentmemory-codex-"));
@@ -55,7 +56,12 @@ process.stdin.on("end", () => {
 `);
     process.env.OPENAI_API_KEY = "sk-test-secret";
 
-    const provider = new CodexSDKProvider("codex-default", 128, command, 2_000);
+    const provider = new CodexSDKProvider(
+      "codex-default",
+      128,
+      command,
+      FAKE_CODEX_TIMEOUT_MS,
+    );
 
     await expect(provider.compress("system text", "user text")).resolves.toBe("ok");
   });
@@ -80,7 +86,12 @@ process.stdin.on("end", () => {
   process.exit(4);
 });
 `);
-    const provider = new CodexSDKProvider("codex-default", 128, command, 2_000);
+    const provider = new CodexSDKProvider(
+      "codex-default",
+      128,
+      command,
+      FAKE_CODEX_TIMEOUT_MS,
+    );
 
     await expect(provider.compress("system", "user")).rejects.toThrow(
       /Codex CLI exited with code 4: boom/,

From 8d083e4b9dcc841dc16472eb6dfa1a9021f1b3dc Mon Sep 17 00:00:00 2001
From: Willi Budzinski <w.budzinski@telekom.de>
Date: Sat, 20 Jun 2026 04:06:08 +0200
Subject: [PATCH 4/4] docs: record latest longmemeval verification

---
 .../todos/2026-06-19-issue-313-longmemeval-harness/todo.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
index 93f3878c1..861ceba72 100644
--- a/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
+++ b/docs/todos/2026-06-19-issue-313-longmemeval-harness/todo.md
@@ -92,9 +92,9 @@ Arena reports:
 | Markdown results renderer | New Vitest tests | Done | Exact category and hypothesis row assertions |
 | Harness docs and README link | Diff review and lint where applicable | Done | `README.md`, `benchmark/README.md`, `benchmark/longmemeval/README.md`; lint passed |
 | Final verification | Targeted tests, lint, full tests when feasible | Done | After manifest redaction hardening: targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; full `corepack pnpm test` passed 208 files / 2867 tests |
-| Security gates | Semgrep and staged Gitleaks | Done | Semgrep passed with 0 findings on 944 tracked files; staged Gitleaks initially caught two synthetic test tokens, test literals were split into runtime fakes, then staged Gitleaks passed with no leaks |
+| Security gates | Semgrep and staged Gitleaks | Done | Semgrep passed with 0 findings on 971 tracked files after the current-base merge; staged Gitleaks initially caught two synthetic test tokens, so the test literals were split into runtime-composed fakes, after which staged Gitleaks passed with no leaks before the task-owned commits |
 | Post-base-merge verification | Targeted tests, lint, full tests | Done | After merging captured `origin/main` base `67bb438b4158d74771ed285e06c9ac078985d603`, targeted Vitest passed 3 files / 43 tests; harness check passed; lint passed; isolated `test/codex-sdk-provider.test.ts` passed after one full-suite timeout; full rerun passed 212 files / 2920 tests |
-| Latest-base suite robustness | Full-suite reproduction and approved narrow test fix | Done | After merging latest `origin/main` base `24ff6779f5618d0f07039161be4f750252747f31`, full suite reproduced a 2000ms fake Codex CLI timeout in `test/codex-sdk-provider.test.ts`; user approved a narrow test-only robustness fix; isolated Codex SDK test, issue-targeted Vitest, harness check, lint, and full `corepack pnpm test` passed |
+| Latest-base suite robustness | Full-suite reproduction and approved narrow test fix | Done | Full suite reproduced a 2000ms fake Codex CLI timeout in `test/codex-sdk-provider.test.ts`; user approved a narrow test-only robustness fix; after merging current `origin/main` base `682a133e66fa1650d62ae460d089b8ec19aaa92b`, targeted Vitest passed 4 files / 46 tests; harness check passed; lint passed; full `corepack pnpm test` passed 212 files / 2944 tests |
 
 ## Subagent Ledger
 
@@ -127,6 +127,9 @@ Arena reports:
 - 2026-06-20: PR #1024 became behind `origin/main`; merged latest base `24ff6779f5618d0f07039161be4f750252747f31` into the issue branch. The PR diff against that base still contains only issue #313 files before the approved test robustness fix.
 - 2026-06-20: Full `corepack pnpm test` after latest base merge reproduced the Codex SDK provider fake-CLI timeout under full-suite load, while isolated `corepack pnpm exec vitest run test/codex-sdk-provider.test.ts` passed. Human Checkpoint: user approved option 1, a narrow out-of-scope test robustness fix.
 - 2026-06-20: Raised the fake Codex CLI test timeout from 2000ms to a named 10000ms constant without changing provider code. Verification passed: isolated Codex SDK provider test 1 file / 3 tests; issue-targeted Vitest 3 files / 43 tests; `corepack pnpm run bench:longmemeval:check`; `corepack pnpm run lint`; full `corepack pnpm test` 212 files / 2922 tests.
+- 2026-06-20: Fetched `origin/main` again while preparing the push; it had advanced to `682a133e66fa1650d62ae460d089b8ec19aaa92b`. Merged it into the issue branch without conflicts.
+- 2026-06-20: Post-current-base verification passed: `corepack pnpm exec vitest run test/codex-sdk-provider.test.ts test/longmemeval-harness.test.ts test/eval-adapters.test.ts test/quality-gates.test.ts` passed 4 files / 46 tests; `corepack pnpm run bench:longmemeval:check` returned `{ ok: true, fixtureRows: 3, systems: 6, smokeIds: 50 }`; `corepack pnpm run lint` passed; full `corepack pnpm test` passed 212 files / 2944 tests.
+- 2026-06-20: Final post-current-base Semgrep rerun passed with 0 findings on 971 tracked files.
 
 ## Review Triage