From 061fd0d3f0523f91de204b503e43d3ba3b8a0005 Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:34:06 -0400 Subject: [PATCH 1/3] feat(provenance): add Record Provenance as automatic write-path metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration 8->9 adds a nullable provenance column (CHECK-constrained to verbatim/user_authored/extracted/derived) to messages, decisions, learnings, breadcrumbs, and loa_entries. Legacy rows stay NULL (unknown) per ADR-0001 — never guessed, never laundered. Write paths stamp provenance automatically: - CLI add + MCP memory_add -> user_authored (no public override) - raw conversation capture (import, dump, PreCompact flush) -> verbatim - extraction writers (hooks, structured extraction, LoA, import-legacy) -> extracted - derived reserved for future internal paths recall provenance backfill classifies legacy rows on deterministic evidence only, dry-run by default with --execute. CLI search flags unknown provenance by default; --show-provenance shows all values. MCP search/hybrid/recall payloads carry provenance for every record type. 
Refs #42, ADR-0001 --- hooks/RecallPreCompact.ts | 9 +- hooks/lib/sqlite-writers.ts | 38 +++++--- src/commands/add.ts | 10 ++- src/commands/dump.ts | 8 +- src/commands/import-legacy.ts | 5 +- src/commands/loa.ts | 6 +- src/commands/provenance.ts | 148 +++++++++++++++++++++++++++++++ src/commands/search.ts | 12 ++- src/db/migrations.ts | 19 ++++ src/db/schema.ts | 5 ++ src/index.ts | 28 +++++- src/lib/conversation-import.ts | 4 +- src/lib/import.ts | 4 +- src/lib/memory.ts | 56 +++++++----- src/lib/structured-extraction.ts | 4 + src/mcp-server.ts | 38 ++++++-- src/types/index.ts | 15 ++++ tests/db/migrations.test.ts | 77 +++++++++++++++- 18 files changed, 424 insertions(+), 62 deletions(-) create mode 100644 src/commands/provenance.ts diff --git a/hooks/RecallPreCompact.ts b/hooks/RecallPreCompact.ts index 4460306..a1ae8de 100644 --- a/hooks/RecallPreCompact.ts +++ b/hooks/RecallPreCompact.ts @@ -358,10 +358,13 @@ export function flushConversation(convPath: string, cwd: string): FlushResult { // Insert messages. importance defaults to 5 — these are mid-session // captures, not curated, and the Stop hook may later promote a subset - // to LoA at importance 8. + // to LoA at importance 8. Raw transcript capture is verbatim + // (ADR-0001); the column guard keeps pre-provenance DBs working. + const hasProvenance = (db.prepare('PRAGMA table_info(messages)').all() as Array<{ name: string }>) + .some((c) => c.name === 'provenance'); const insertMessage = db.prepare(` - INSERT INTO messages (session_id, timestamp, role, content, project, importance) - VALUES (?, ?, ?, ?, ?, 5) + INSERT INTO messages (session_id, timestamp, role, content, project, importance${hasProvenance ? ', provenance' : ''}) + VALUES (?, ?, ?, ?, ?, 5${hasProvenance ? 
", 'verbatim'" : ''}) `); const tx = db.transaction((rows: ParsedMessage[]) => { diff --git a/hooks/lib/sqlite-writers.ts b/hooks/lib/sqlite-writers.ts index afc5880..3802911 100644 --- a/hooks/lib/sqlite-writers.ts +++ b/hooks/lib/sqlite-writers.ts @@ -42,6 +42,16 @@ function columnExists(db: Database, table: string, column: string): boolean { } } +// ADR-0001: every writer in this file is an extraction path, so records are +// stamped provenance = 'extracted'. The value is a SQL literal (not a bind +// param) so the legacy-DB column guard stays a simple string switch — older +// databases without the provenance column keep working unchanged. +function provenanceFragment(db: Database, table: string): { col: string; val: string } { + return columnExists(db, table, 'provenance') + ? { col: ', provenance', val: ", 'extracted'" } + : { col: '', val: '' }; +} + // --------------------------------------------------------------------------- // extraction_sessions // --------------------------------------------------------------------------- @@ -105,11 +115,12 @@ export function writeDecisionsBatch(dbPath: string, items: DecisionInput[]): num try { if (!tableExists(db, 'decisions')) return 0; const hasConfidence = columnExists(db, 'decisions', 'confidence'); + const provenance = provenanceFragment(db, 'decisions'); const sql = hasConfidence - ? `INSERT INTO decisions (session_id, category, project, decision, status, confidence, importance) - VALUES (?, ?, ?, ?, 'active', ?, ?)` - : `INSERT INTO decisions (session_id, category, project, decision, status, importance) - VALUES (?, ?, ?, ?, 'active', ?)`; + ? 
`INSERT INTO decisions (session_id, category, project, decision, status, confidence, importance${provenance.col}) + VALUES (?, ?, ?, ?, 'active', ?, ?${provenance.val})` + : `INSERT INTO decisions (session_id, category, project, decision, status, importance${provenance.col}) + VALUES (?, ?, ?, ?, 'active', ?${provenance.val})`; const stmt = db.prepare(sql); const insertMany = db.transaction((batch: DecisionInput[]) => { let n = 0; @@ -165,11 +176,12 @@ export function writeLearningsBatch(dbPath: string, items: LearningInput[]): num try { if (!tableExists(db, 'learnings')) return 0; const hasConfidence = columnExists(db, 'learnings', 'confidence'); + const provenance = provenanceFragment(db, 'learnings'); const sql = hasConfidence - ? `INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, confidence, importance) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)` - : `INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, importance) - VALUES (?, ?, ?, ?, ?, ?, ?, ?)`; + ? 
`INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, confidence, importance${provenance.col}) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?${provenance.val})` + : `INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, importance${provenance.col}) + VALUES (?, ?, ?, ?, ?, ?, ?, ?${provenance.val})`; const stmt = db.prepare(sql); const insertMany = db.transaction((batch: LearningInput[]) => { let n = 0; @@ -227,9 +239,10 @@ export function writeBreadcrumbsBatch(dbPath: string, items: BreadcrumbInput[]): const db = openDb(dbPath); try { if (!tableExists(db, 'breadcrumbs')) return 0; + const provenance = provenanceFragment(db, 'breadcrumbs'); const stmt = db.prepare( - `INSERT INTO breadcrumbs (session_id, content, category, project, importance, expires_at) - VALUES (?, ?, ?, ?, ?, ?)` + `INSERT INTO breadcrumbs (session_id, content, category, project, importance, expires_at${provenance.col}) + VALUES (?, ?, ?, ?, ?, ?${provenance.val})` ); const insertMany = db.transaction((batch: BreadcrumbInput[]) => { let n = 0; @@ -273,11 +286,12 @@ export function writeLoaEntryFromExtraction(dbPath: string, entry: LoaInput): nu if (!tableExists(db, 'loa_entries')) return 0; // LoA importance is floored at 5 (curated tier guardrail). 
const importance = Math.max(5, clampImportance(entry.importance, 8)); + const provenance = provenanceFragment(db, 'loa_entries'); const result = db .prepare( `INSERT INTO loa_entries - (title, description, fabric_extract, session_id, project, tags, message_count, importance) - VALUES (?, ?, ?, ?, ?, ?, ?, ?)` + (title, description, fabric_extract, session_id, project, tags, message_count, importance${provenance.col}) + VALUES (?, ?, ?, ?, ?, ?, ?, ?${provenance.val})` ) .run( entry.title, diff --git a/src/commands/add.ts b/src/commands/add.ts index c4befb9..62e65c3 100644 --- a/src/commands/add.ts +++ b/src/commands/add.ts @@ -21,7 +21,9 @@ export function runAddBreadcrumb(content: string, options: AddBreadcrumbOptions) content, project, category: options.category, - importance: options.importance ?? 5 + importance: options.importance ?? 5, + // ADR-0001: provenance is stamped from the write path, never a CLI flag. + provenance: 'user_authored' }); console.log(`✓ Added breadcrumb #${id}${project ? ` [${project}]` : ''}`); @@ -51,7 +53,8 @@ export function runAddDecision(decision: string, options: AddDecisionOptions): v reasoning: options.why, alternatives: options.alternatives, status: 'active', - confidence + confidence, + provenance: 'user_authored' }); console.log(`✓ Added decision #${id}${project ? ` [${project}]` : ''} (${confidence})`); @@ -78,7 +81,8 @@ export function runAddLearning(problem: string, solution: string, options: AddLe project, category: options.category, prevention: options.prevention, - tags: options.tags + tags: options.tags, + provenance: 'user_authored' }); console.log(`✓ Added learning #${id}${project ? 
` [${project}]` : ''}`); diff --git a/src/commands/dump.ts b/src/commands/dump.ts index d87caec..dae54bb 100644 --- a/src/commands/dump.ts +++ b/src/commands/dump.ts @@ -387,7 +387,8 @@ export async function coreDump(title: string, options: DumpOptions & { session?: summary: `Dumped: ${title}` }); - const importedCount = addMessagesBatch(session.messages); + // Raw conversation capture is verbatim (ADR-0001). + const importedCount = addMessagesBatch(session.messages.map(m => ({ ...m, provenance: 'verbatim' as const }))); // Get imported message IDs for LoA const db = getDb(); @@ -429,7 +430,10 @@ export async function coreDump(title: string, options: DumpOptions & { session?: parent_loa_id: options.continues, project: options.project || session.project, tags: options.tags, - message_count: importedMessages.length + message_count: importedMessages.length, + // Fabric output and the basic-summary fallback are both generated from + // the session messages — extracted either way (ADR-0001). + provenance: 'extracted' }); await autoEmbedLoaEntry(loaId, title, fabricExtract); diff --git a/src/commands/import-legacy.ts b/src/commands/import-legacy.ts index 6d8b5ac..fc8cf98 100644 --- a/src/commands/import-legacy.ts +++ b/src/commands/import-legacy.ts @@ -164,7 +164,10 @@ export function runImportLegacy(options: ImportLegacyOptions): void { message_range_start: undefined, message_range_end: undefined, message_count: undefined, - tags: 'legacy,imported' + tags: 'legacy,imported', + // DISTILLED.md / HOT_RECALL.md content is prior extraction output — + // the record stays honest as extracted (ADR-0001). 
+ provenance: 'extracted' }); // Update the created_at to match the original date diff --git a/src/commands/loa.ts b/src/commands/loa.ts index 94d0f36..932920a 100644 --- a/src/commands/loa.ts +++ b/src/commands/loa.ts @@ -115,7 +115,8 @@ export async function runLoa(title: string, options: LoaOptions): Promise process.exit(1); } - // Create LoA entry + // Create LoA entry — Fabric extract_wisdom output is generated from the + // session messages, so the record is extracted (ADR-0001). const id = createLoaEntry({ title, description: `Captured ${messages.length} messages`, @@ -125,7 +126,8 @@ export async function runLoa(title: string, options: LoaOptions): Promise parent_loa_id: options.continues, project, tags: options.tags, - message_count: messages.length + message_count: messages.length, + provenance: 'extracted' }); console.log(`\n✓ LoA #${id} captured: "${title}"`); diff --git a/src/commands/provenance.ts b/src/commands/provenance.ts new file mode 100644 index 0000000..db0cde1 --- /dev/null +++ b/src/commands/provenance.ts @@ -0,0 +1,148 @@ +// recall provenance — conservative backfill for the Record Provenance column. +// +// Background (ADR-0001, CONTEXT.md, issue #42): +// Migration 8→9 added a nullable `provenance` column to messages/decisions/ +// learnings/breadcrumbs/loa_entries. Write paths stamp provenance going +// forward; legacy rows are NULL ("unknown"). This command classifies legacy +// rows — and ONLY where the source table or a write-path marker gives +// deterministic evidence. +// +// Binding rules: +// - NEVER guess. A row with no deterministic evidence stays NULL and is +// reported as unknown. +// - NEVER overwrite. Only rows with provenance IS NULL are touched. +// - `user_authored` is never assigned by backfill: nothing in the data +// distinguishes a CLI/MCP-authored row from an extraction row that was +// given a custom category. 
+// +// Evidence table: +// - messages → 'verbatim' — every message writer that has ever existed +// (JSONL import, conversation import, dump, PreCompact flush) +// captures raw transcript text without semantic rewriting. +// - loa_entries → 'extracted' — every LoA writer stores machine-generated +// content (Fabric/Haiku extracts, basic-summary fallback, or +// prior DISTILLED.md extraction output via import-legacy). +// - decisions → 'extracted' iff category = 'auto-extracted' (the marker the +// extraction writers stamp). Other rows: unknown. +// - learnings → 'extracted' iff category = 'auto-extracted'. Else unknown. +// - breadcrumbs → 'extracted' iff category = 'extracted-idea'. Else unknown. +// +// Bind-count note (see src/lib/chunk.ts): every statement here binds zero +// variables — bulk UPDATEs with literal predicates — so no chunking applies. + +import { getDb } from '../db/connection.js'; + +const BACKFILL_TABLES = ['messages', 'decisions', 'learnings', 'breadcrumbs', 'loa_entries'] as const; +type BackfillTable = typeof BACKFILL_TABLES[number]; + +export interface ProvenanceBackfillOptions { + dryRun?: boolean; + table?: BackfillTable | 'all'; +} + +interface TableRule { + table: BackfillTable; + value: 'verbatim' | 'extracted'; + // SQL predicate (beyond provenance IS NULL) that constitutes the + // deterministic evidence; undefined = the whole table qualifies. 
+ evidenceWhere?: string; + evidence: string; +} + +const RULES: TableRule[] = [ + { + table: 'messages', + value: 'verbatim', + evidence: 'raw conversation capture is the only historical write path', + }, + { + table: 'loa_entries', + value: 'extracted', + evidence: 'all historical LoA writers store machine-generated extracts', + }, + { + table: 'decisions', + value: 'extracted', + evidenceWhere: "category = 'auto-extracted'", + evidence: "category = 'auto-extracted' (extraction-writer marker)", + }, + { + table: 'learnings', + value: 'extracted', + evidenceWhere: "category = 'auto-extracted'", + evidence: "category = 'auto-extracted' (extraction-writer marker)", + }, + { + table: 'breadcrumbs', + value: 'extracted', + evidenceWhere: "category = 'extracted-idea'", + evidence: "category = 'extracted-idea' (extraction-writer marker)", + }, +]; + +export interface ProvenanceBackfillResult { + table: string; + value: string; + unknownBefore: number; + classified: number; + remainingUnknown: number; + evidence: string; +} + +export function runProvenanceBackfill(options: ProvenanceBackfillOptions = {}): ProvenanceBackfillResult[] { + const dryRun = options.dryRun ?? true; + const target = options.table ?? 'all'; + + if (target !== 'all' && !(BACKFILL_TABLES as readonly string[]).includes(target)) { + console.error(`Unknown table: ${target}. Use one of: ${BACKFILL_TABLES.join(', ')}, all`); + process.exitCode = 1; + return []; + } + + const db = getDb(); + const results: ProvenanceBackfillResult[] = []; + + for (const rule of RULES) { + if (target !== 'all' && target !== rule.table) continue; + + const count = (where: string) => + (db.prepare(`SELECT COUNT(*) as count FROM ${rule.table} WHERE ${where}`).get() as { count: number }).count; + + const unknownBefore = count('provenance IS NULL'); + const evidenceClause = rule.evidenceWhere + ? 
`provenance IS NULL AND ${rule.evidenceWhere}` + : 'provenance IS NULL'; + const classified = count(evidenceClause); + + if (!dryRun && classified > 0) { + db.prepare(`UPDATE ${rule.table} SET provenance = '${rule.value}' WHERE ${evidenceClause}`).run(); + } + + results.push({ + table: rule.table, + value: rule.value, + unknownBefore, + classified, + remainingUnknown: unknownBefore - classified, + evidence: rule.evidence, + }); + } + + // Report + console.log(dryRun ? '[DRY RUN — no changes written]\n' : '[LIVE — changes written]\n'); + for (const r of results) { + const verb = dryRun ? 'would set' : 'set'; + console.log(`${r.table}: ${r.unknownBefore} unknown — ${verb} ${r.classified} to ${r.value}`); + console.log(` evidence: ${r.evidence}`); + if (r.remainingUnknown > 0) { + console.log(` ${r.remainingUnknown} left unknown (no deterministic evidence — staying NULL)`); + } + console.log(''); + } + + if (dryRun) { + console.log('Re-run with --execute to apply changes.'); + } + + return results; +} diff --git a/src/commands/search.ts b/src/commands/search.ts index 6dc5092..2c13579 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -7,6 +7,7 @@ interface SearchOptions { table?: string; biasType?: string; limit?: number; + showProvenance?: boolean; } export function runSearch(query: string, options: SearchOptions): void { @@ -40,7 +41,16 @@ export function runSearch(query: string, options: SearchOptions): void { const projectTag = result.project ? ` [${result.project}]` : ''; const date = result.created_at.split('T')[0]; - console.log(`[${result.table}#${result.id}]${projectTag} ${date}`); + // Display contract (issue #42): known provenance stays quiet by default; + // unknown (NULL) is always flagged. --show-provenance shows every value. + let provenanceTag = ''; + if (options.showProvenance) { + provenanceTag = ` [provenance: ${result.provenance ?? 
'unknown'}]`; + } else if (!result.provenance) { + provenanceTag = ' ⚠ [provenance: unknown]'; + } + + console.log(`[${result.table}#${result.id}]${projectTag} ${date}${provenanceTag}`); console.log(` ${preview.replace(/\n/g, ' ')}`); console.log(''); } diff --git a/src/db/migrations.ts b/src/db/migrations.ts index f567c03..415e1a4 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -179,6 +179,25 @@ export const MIGRATIONS: Migration[] = [ db.prepare('CREATE INDEX IF NOT EXISTS idx_learnings_importance ON learnings(importance)').run(); db.prepare('CREATE INDEX IF NOT EXISTS idx_loa_importance ON loa_entries(importance)').run(); }, + + // Migration 8 → 9: Record Provenance (ADR-0001, issue #42). + // Additive nullable column on all memory tables. Provenance is automatic + // write-path metadata; legacy rows stay NULL ("unknown") until explicitly + // backfilled via `recall provenance backfill` — never guessed, no default. + // The CHECK constraint passes for NULL (IN() evaluates to NULL → allowed), + // so unknown remains representable. 
+ (db) => { + const tables = ['messages', 'decisions', 'learnings', 'breadcrumbs', 'loa_entries']; + for (const table of tables) { + try { + db.prepare( + `ALTER TABLE ${table} ADD COLUMN provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived'))` + ).run(); + } catch { + // Column already exists — safe to ignore (fresh install case) + } + } + }, ]; // --------------------------------------------------------------------------- diff --git a/src/db/schema.ts b/src/db/schema.ts index 3da02e8..7f60912 100644 --- a/src/db/schema.ts +++ b/src/db/schema.ts @@ -25,6 +25,7 @@ CREATE TABLE IF NOT EXISTS messages ( content TEXT NOT NULL, project TEXT, importance INTEGER DEFAULT 5 CHECK (importance BETWEEN 1 AND 10), + provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived')), FOREIGN KEY (session_id) REFERENCES sessions(session_id) ); @@ -40,6 +41,7 @@ CREATE TABLE IF NOT EXISTS decisions ( alternatives TEXT, status TEXT DEFAULT 'active' CHECK (status IN ('active', 'superseded', 'reverted')), importance INTEGER DEFAULT 5 CHECK (importance BETWEEN 1 AND 10), + provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived')), FOREIGN KEY (session_id) REFERENCES sessions(session_id) ); @@ -55,6 +57,7 @@ CREATE TABLE IF NOT EXISTS learnings ( prevention TEXT, tags TEXT, importance INTEGER DEFAULT 5 CHECK (importance BETWEEN 1 AND 10), + provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived')), FOREIGN KEY (session_id) REFERENCES sessions(session_id) ); @@ -67,6 +70,7 @@ CREATE TABLE IF NOT EXISTS breadcrumbs ( category TEXT, project TEXT, importance INTEGER DEFAULT 5 CHECK (importance BETWEEN 1 AND 10), + provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived')), expires_at DATETIME, FOREIGN KEY (session_id) REFERENCES sessions(session_id) ); @@ -92,6 +96,7 @@ CREATE TABLE IF NOT EXISTS loa_entries ( tags TEXT, 
message_count INTEGER, importance INTEGER DEFAULT 8 CHECK (importance BETWEEN 1 AND 10), + provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived')), FOREIGN KEY (parent_loa_id) REFERENCES loa_entries(id), FOREIGN KEY (message_range_start) REFERENCES messages(id), FOREIGN KEY (message_range_end) REFERENCES messages(id) diff --git a/src/index.ts b/src/index.ts index 2a0ab42..e36e94b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -24,6 +24,7 @@ import { runCluster } from './commands/cluster.js'; import { runEmbedBackfill, runSemanticSearch, runEmbedStats, runHybridSearch } from './commands/embed.js'; import { runDoctor } from './commands/doctor.js'; import { runImportanceBackfill, runPin, runUnpin } from './commands/importance.js'; +import { runProvenanceBackfill } from './commands/provenance.js'; import { runBenchmark, listBenchmarks, reportLatestBenchmark } from './commands/benchmark.js'; import { runOnboard } from './commands/onboard.js'; import { runMigrate } from './commands/migrate.js'; @@ -177,12 +178,14 @@ program .option('-t, --table ', 'Hard-filter to one table (messages, loa, decisions, learnings, breadcrumbs)') .option('--bias-type
', 'Softly boost one table without filtering others (messages, loa, decisions, learnings, breadcrumbs)') .option('-l, --limit ', 'Max results', '20') + .option('--show-provenance', 'Show provenance for every result (default: only unknown provenance is flagged)') .action((query, options) => { runSearch(query, { project: options.project, table: options.table, biasType: options.biasType, - limit: parseInt(options.limit, 10) + limit: parseInt(options.limit, 10), + showProvenance: options.showProvenance }); closeDb(); }); @@ -535,6 +538,27 @@ importanceCmd closeDb(); }); +// recall provenance — conservative backfill for Record Provenance (ADR-0001). +// Provenance is automatic write-path metadata: there is intentionally no +// flag to set it on add commands; this maintenance path only classifies +// legacy NULL rows where deterministic evidence exists. +const provenanceCmd = program + .command('provenance') + .description('Manage Record Provenance metadata on memory records'); + +provenanceCmd + .command('backfill') + .description('Classify legacy rows with unknown provenance using deterministic write-path evidence (dry-run by default; never guesses)') + .option('--execute', 'Apply changes (default is dry-run)') + .option('-t, --table
', 'Target table: messages, decisions, learnings, breadcrumbs, loa_entries, all', 'all') + .action((options) => { + runProvenanceBackfill({ + dryRun: !options.execute, + table: options.table + }); + closeDb(); + }); + // recall pin
[importance] — force a record to a high importance (default 10) program .command('pin
[importance]') @@ -620,7 +644,7 @@ program .option('-k, --keyword', 'Use keyword search only (FTS5)') .option('-v, --vector', 'Use vector search only (semantic)') .action(async (query, options) => { - if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path'].includes(query)) { + if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path'].includes(query)) { if (options.keyword) { // FTS5 only runSearch(query, { diff --git a/src/lib/conversation-import.ts b/src/lib/conversation-import.ts index 382b790..8e1c70c 100644 --- a/src/lib/conversation-import.ts +++ b/src/lib/conversation-import.ts @@ -567,7 +567,9 @@ export async function importConversations( source: session.source, }); - const count = addMessagesBatch(session.messages); + // Raw conversation capture is verbatim (ADR-0001); the structured + // extraction below stamps its own records as extracted. 
+ const count = addMessagesBatch(session.messages.map(m => ({ ...m, provenance: 'verbatim' as const }))); result.sessionsImported++; result.messagesImported += count; diff --git a/src/lib/import.ts b/src/lib/import.ts index 66d9785..dd7c930 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -184,8 +184,8 @@ export function importAllSessions(options?: { dryRun?: boolean; verbose?: boolea summary: `Imported from ${basename(file)}` }); - // Insert messages in batch - const count = addMessagesBatch(parsed.messages); + // Insert messages in batch — raw conversation capture is verbatim (ADR-0001) + const count = addMessagesBatch(parsed.messages.map(m => ({ ...m, provenance: 'verbatim' as const }))); result.sessionsImported++; result.messagesImported += count; diff --git a/src/lib/memory.ts b/src/lib/memory.ts index f3695a5..eb89ce3 100644 --- a/src/lib/memory.ts +++ b/src/lib/memory.ts @@ -2,7 +2,7 @@ import { getDb, getDbPath } from '../db/connection.js'; import { existsSync, statSync } from 'fs'; -import type { Session, Message, Decision, Learning, Breadcrumb, LoaEntry, Stats, SearchResult } from '../types/index.js'; +import type { Session, Message, Decision, Learning, Breadcrumb, LoaEntry, Stats, SearchResult, Provenance } from '../types/index.js'; // ============ Sessions ============ @@ -48,8 +48,8 @@ export function endSession(sessionId: string, summary?: string): void { export function addMessage(message: Omit): number { const db = getDb(); const stmt = db.prepare(` - INSERT INTO messages (session_id, timestamp, role, content, project, importance) - VALUES ($session_id, $timestamp, $role, $content, $project, $importance) + INSERT INTO messages (session_id, timestamp, role, content, project, importance, provenance) + VALUES ($session_id, $timestamp, $role, $content, $project, $importance, $provenance) `); const result = stmt.run({ $session_id: message.session_id, @@ -57,7 +57,8 @@ export function addMessage(message: Omit): number { $role: message.role, 
$content: message.content, $project: message.project || null, - $importance: clampImportance(message.importance, 5) + $importance: clampImportance(message.importance, 5), + $provenance: message.provenance ?? null }); return result.lastInsertRowid as number; } @@ -65,8 +66,8 @@ export function addMessage(message: Omit): number { export function addMessagesBatch(messages: Omit[]): number { const db = getDb(); const stmt = db.prepare(` - INSERT INTO messages (session_id, timestamp, role, content, project, importance) - VALUES ($session_id, $timestamp, $role, $content, $project, $importance) + INSERT INTO messages (session_id, timestamp, role, content, project, importance, provenance) + VALUES ($session_id, $timestamp, $role, $content, $project, $importance, $provenance) `); const insertMany = db.transaction((msgs: Omit[]) => { @@ -78,7 +79,8 @@ export function addMessagesBatch(messages: Omit[]): number { $role: msg.role, $content: msg.content, $project: msg.project || null, - $importance: clampImportance(msg.importance, 5) + $importance: clampImportance(msg.importance, 5), + $provenance: msg.provenance ?? 
null }); count++; } @@ -113,8 +115,8 @@ export function pinRecord(table: 'decisions' | 'learnings' | 'breadcrumbs' | 'lo export function addDecision(decision: Omit): number { const db = getDb(); const stmt = db.prepare(` - INSERT INTO decisions (session_id, category, project, decision, reasoning, alternatives, status, confidence, importance) - VALUES ($session_id, $category, $project, $decision, $reasoning, $alternatives, $status, $confidence, $importance) + INSERT INTO decisions (session_id, category, project, decision, reasoning, alternatives, status, confidence, importance, provenance) + VALUES ($session_id, $category, $project, $decision, $reasoning, $alternatives, $status, $confidence, $importance, $provenance) `); const result = stmt.run({ $session_id: decision.session_id || null, @@ -125,7 +127,8 @@ export function addDecision(decision: Omit): numb $alternatives: decision.alternatives || null, $status: decision.status || 'active', $confidence: decision.confidence || 'medium', - $importance: clampImportance(decision.importance, 5) + $importance: clampImportance(decision.importance, 5), + $provenance: decision.provenance ?? 
null }); return result.lastInsertRowid as number; } @@ -207,8 +210,8 @@ export function findSimilarDecisions(text: string, limit = 3): Decision[] { export function addLearning(learning: Omit): number { const db = getDb(); const stmt = db.prepare(` - INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, confidence, importance) - VALUES ($session_id, $category, $project, $problem, $solution, $prevention, $tags, $confidence, $importance) + INSERT INTO learnings (session_id, category, project, problem, solution, prevention, tags, confidence, importance, provenance) + VALUES ($session_id, $category, $project, $problem, $solution, $prevention, $tags, $confidence, $importance, $provenance) `); const result = stmt.run({ $session_id: learning.session_id || null, @@ -219,7 +222,8 @@ export function addLearning(learning: Omit): numb $prevention: learning.prevention || null, $tags: learning.tags || null, $confidence: learning.confidence || 'medium', - $importance: clampImportance(learning.importance, 5) + $importance: clampImportance(learning.importance, 5), + $provenance: learning.provenance ?? 
null }); return result.lastInsertRowid as number; } @@ -234,8 +238,8 @@ export function getLearning(id: number): Learning | undefined { export function addBreadcrumb(breadcrumb: Omit): number { const db = getDb(); const stmt = db.prepare(` - INSERT INTO breadcrumbs (session_id, content, category, project, importance, expires_at) - VALUES ($session_id, $content, $category, $project, $importance, $expires_at) + INSERT INTO breadcrumbs (session_id, content, category, project, importance, expires_at, provenance) + VALUES ($session_id, $content, $category, $project, $importance, $expires_at, $provenance) `); const result = stmt.run({ $session_id: breadcrumb.session_id || null, @@ -243,7 +247,8 @@ export function addBreadcrumb(breadcrumb: Omit) $category: breadcrumb.category || null, $project: breadcrumb.project || null, $importance: breadcrumb.importance ?? 5, - $expires_at: breadcrumb.expires_at || null + $expires_at: breadcrumb.expires_at || null, + $provenance: breadcrumb.provenance ?? null }); return result.lastInsertRowid as number; } @@ -304,7 +309,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu switch (table) { case 'messages': sql = ` - SELECT m.id, m.content, m.project, m.timestamp as created_at, f.rank + SELECT m.id, m.content, m.project, m.timestamp as created_at, m.provenance, f.rank FROM messages_fts f JOIN messages m ON m.id = f.rowid WHERE messages_fts MATCH ? @@ -315,7 +320,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu break; case 'decisions': sql = ` - SELECT d.id, d.decision as content, d.project, d.created_at, f.rank + SELECT d.id, d.decision as content, d.project, d.created_at, d.provenance, f.rank FROM decisions_fts f JOIN decisions d ON d.id = f.rowid WHERE decisions_fts MATCH ? 
@@ -327,7 +332,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu break; case 'learnings': sql = ` - SELECT l.id, l.problem as content, l.project, l.created_at, f.rank + SELECT l.id, l.problem as content, l.project, l.created_at, l.provenance, f.rank FROM learnings_fts f JOIN learnings l ON l.id = f.rowid WHERE learnings_fts MATCH ? @@ -338,7 +343,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu break; case 'breadcrumbs': sql = ` - SELECT b.id, b.content, b.project, b.created_at, f.rank + SELECT b.id, b.content, b.project, b.created_at, b.provenance, f.rank FROM breadcrumbs_fts f JOIN breadcrumbs b ON b.id = f.rowid WHERE breadcrumbs_fts MATCH ? @@ -349,7 +354,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu break; case 'loa': sql = ` - SELECT l.id, l.title || ': ' || SUBSTR(l.fabric_extract, 1, 200) as content, l.project, l.created_at, f.rank + SELECT l.id, l.title || ': ' || SUBSTR(l.fabric_extract, 1, 200) as content, l.project, l.created_at, l.provenance, f.rank FROM loa_fts f JOIN loa_entries l ON l.id = f.rowid WHERE loa_fts MATCH ? @@ -373,6 +378,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu content: string; project: string | null; created_at: string; + provenance: Provenance | null; rank: number; }>; @@ -383,6 +389,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu content: row.content, project: row.project || undefined, created_at: row.created_at, + provenance: row.provenance ?? null, rank: row.rank }); } @@ -453,8 +460,8 @@ export function createLoaEntry(entry: Omit): numb // so a careless caller cannot demote curated knowledge below neutral. 
const importance = Math.max(5, clampImportance(entry.importance, 8)); const stmt = db.prepare(` - INSERT INTO loa_entries (title, description, fabric_extract, message_range_start, message_range_end, parent_loa_id, session_id, project, tags, message_count, importance) - VALUES ($title, $description, $fabric_extract, $message_range_start, $message_range_end, $parent_loa_id, $session_id, $project, $tags, $message_count, $importance) + INSERT INTO loa_entries (title, description, fabric_extract, message_range_start, message_range_end, parent_loa_id, session_id, project, tags, message_count, importance, provenance) + VALUES ($title, $description, $fabric_extract, $message_range_start, $message_range_end, $parent_loa_id, $session_id, $project, $tags, $message_count, $importance, $provenance) `); const result = stmt.run({ $title: entry.title, @@ -467,7 +474,8 @@ export function createLoaEntry(entry: Omit): numb $project: entry.project || null, $tags: entry.tags || null, $message_count: entry.message_count || null, - $importance: importance + $importance: importance, + $provenance: entry.provenance ?? null }); return result.lastInsertRowid as number; } diff --git a/src/lib/structured-extraction.ts b/src/lib/structured-extraction.ts index f262870..c314dfe 100644 --- a/src/lib/structured-extraction.ts +++ b/src/lib/structured-extraction.ts @@ -164,6 +164,7 @@ function writeLoa(ctx: StructuredExtractionContext): number { project: ctx.project, tags: ctx.topics.join(','), message_count: ctx.messageCount ?? 
range.count, + provenance: 'extracted', }); } @@ -194,6 +195,7 @@ export function writeStructuredExtraction(ctx: StructuredExtractionContext): Str decision: item.decision, status: 'active', confidence: item.confidence, + provenance: 'extracted', }); result.decisions++; } @@ -211,6 +213,7 @@ export function writeStructuredExtraction(ctx: StructuredExtractionContext): Str solution: item.solution, tags: ctx.sessionLabel, confidence: 'medium', + provenance: 'extracted', }); result.learnings++; } @@ -226,6 +229,7 @@ export function writeStructuredExtraction(ctx: StructuredExtractionContext): Str project: ctx.project, content, importance: 5, + provenance: 'extracted', }); result.breadcrumbs++; } diff --git a/src/mcp-server.ts b/src/mcp-server.ts index 60d53a2..958d001 100644 --- a/src/mcp-server.ts +++ b/src/mcp-server.ts @@ -66,8 +66,15 @@ import { reciprocalRankFusion, checkEmbeddingService, } from "./lib/embeddings.js"; +import type { Provenance } from "./types/index.js"; import { existsSync } from "fs"; +// Record Provenance display (ADR-0001): structured results always carry +// provenance; legacy NULL is reported as "unknown", never guessed. +function provenanceLabel(provenance: Provenance | null | undefined): string { + return `provenance: ${provenance ?? "unknown"}`; +} + /** * Hybrid search combining FTS5 + vector embeddings with RRF fusion * Used by context_for_agent and memory_hybrid_search @@ -82,6 +89,7 @@ async function hybridSearch( content: string; score: number; source: "fts" | "vec" | "both"; + provenance: Provenance | null; }>; embeddingsAvailable: boolean; }> { @@ -159,6 +167,7 @@ async function hybridSearch( content: string; score: number; source: "fts" | "vec" | "both"; + provenance: Provenance | null; } >(); @@ -171,6 +180,7 @@ async function hybridSearch( content: r.content, score, source: "fts", + provenance: r.provenance ?? 
null, }); } @@ -182,25 +192,29 @@ async function hybridSearch( } else { // Need to fetch content let content = ""; + let provenance: Provenance | null = null; if (r.source_table === "loa_entries") { const loa = db .prepare( - "SELECT title, fabric_extract FROM loa_entries WHERE id = ?", + "SELECT title, fabric_extract, provenance FROM loa_entries WHERE id = ?", ) .get(r.source_id) as any; content = loa ? `${loa.title}: ${loa.fabric_extract?.slice(0, 200)}` : ""; + provenance = loa?.provenance ?? null; } else if (r.source_table === "decisions") { const dec = db - .prepare("SELECT decision FROM decisions WHERE id = ?") + .prepare("SELECT decision, provenance FROM decisions WHERE id = ?") .get(r.source_id) as any; content = dec?.decision || ""; + provenance = dec?.provenance ?? null; } else if (r.source_table === "messages") { const msg = db - .prepare("SELECT content FROM messages WHERE id = ?") + .prepare("SELECT content, provenance FROM messages WHERE id = ?") .get(r.source_id) as any; content = msg?.content?.slice(0, 200) || ""; + provenance = msg?.provenance ?? null; } resultMap.set(key, { @@ -209,6 +223,7 @@ async function hybridSearch( content, score: fusedScores.get(key) || 0, source: "vec", + provenance, }); } } @@ -229,6 +244,7 @@ async function hybridSearch( content: r.content, score: r.rank || 0, source: "fts" as const, + provenance: r.provenance ?? null, })) .slice(0, limit), embeddingsAvailable: false, @@ -288,7 +304,7 @@ server.tool( r.content.length > 200 ? r.content.slice(0, 200) + "..." : r.content; - return `[${r.table}#${r.id}] ${r.project || "no-project"} | ${r.created_at}\n${preview}`; + return `[${r.table}#${r.id}] ${r.project || "no-project"} | ${r.created_at} | ${provenanceLabel(r.provenance)}\n${preview}`; }) .join("\n\n---\n\n"); @@ -356,7 +372,7 @@ server.tool( ? r.content.slice(0, 200) + "..." 
: r.content; const score = (r.score * 100).toFixed(1); - return `${score}% ${sourceTag} [${r.table}#${r.id}]\n${preview}`; + return `${score}% ${sourceTag} [${r.table}#${r.id}] | ${provenanceLabel(r.provenance)}\n${preview}`; }) .join("\n\n---\n\n"); @@ -405,7 +421,7 @@ server.tool( output += "### Library of Alexandria (Curated Knowledge)\n"; for (const e of loa) { const preview = e.fabric_extract.slice(0, 300).replace(/\n/g, " "); - output += `- **LoA #${e.id}** [${e.project || "no-project"}] ${e.created_at?.split("T")[0]}: ${e.title}\n ${preview}...\n`; + output += `- **LoA #${e.id}** [${e.project || "no-project"}] ${e.created_at?.split("T")[0]} (${provenanceLabel(e.provenance)}): ${e.title}\n ${preview}...\n`; } output += "\n"; } @@ -413,7 +429,7 @@ server.tool( if (decisions.length > 0) { output += "### Recent Decisions\n"; for (const d of decisions) { - output += `- **#${d.id}** [${d.project || "no-project"}]: ${d.decision}${d.reasoning ? ` (${d.reasoning})` : ""}\n`; + output += `- **#${d.id}** [${d.project || "no-project"}] (${provenanceLabel(d.provenance)}): ${d.decision}${d.reasoning ? ` (${d.reasoning})` : ""}\n`; } output += "\n"; } @@ -421,7 +437,7 @@ server.tool( if (breadcrumbs.length > 0) { output += "### Breadcrumbs\n"; for (const b of breadcrumbs) { - output += `- **#${b.id}** [${b.project || "no-project"}]: ${b.content}\n`; + output += `- **#${b.id}** [${b.project || "no-project"}] (${provenanceLabel(b.provenance)}): ${b.content}\n`; } output += "\n"; } @@ -555,6 +571,9 @@ server.tool( } } + // ADR-0001: provenance is stamped from the write path. memory_add + // deliberately exposes no provenance parameter — agents must not + // be able to launder extracted content as something else. 
id = addDecision({ decision: content, reasoning: detail, @@ -562,6 +581,7 @@ server.tool( status: "active", confidence: confidence || "medium", importance, + provenance: "user_authored", }); let resultText = `Added decision #${id}: ${content}`; @@ -583,6 +603,7 @@ server.tool( tags, confidence: confidence || "medium", importance, + provenance: "user_authored", }); return { content: [ @@ -595,6 +616,7 @@ server.tool( content, project, importance: importance ?? 5, + provenance: "user_authored", }); return { content: [ diff --git a/src/types/index.ts b/src/types/index.ts index 3601818..e7635b3 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -1,5 +1,14 @@ // Core types for RECALL +// Record Provenance (ADR-0001, CONTEXT.md): the declared origin and +// transformation level of a memory record. Automatic write-path metadata — +// never a public MCP parameter or CLI classification input. Survivor-order +// vocabulary: user_authored > verbatim > extracted > derived. Legacy unknown +// is NULL/absent, never guessed. `derived` is reserved for future paths that +// mechanically produce records from existing memory records. 
+export const PROVENANCE_VALUES = ['user_authored', 'verbatim', 'extracted', 'derived'] as const; +export type Provenance = typeof PROVENANCE_VALUES[number]; + export interface Session { id?: number; session_id: string; @@ -21,6 +30,7 @@ export interface Message { content: string; project?: string; importance?: number; + provenance?: Provenance | null; } export interface Decision { @@ -35,6 +45,7 @@ export interface Decision { status: 'active' | 'superseded' | 'reverted'; confidence?: 'high' | 'medium' | 'low'; importance?: number; + provenance?: Provenance | null; } export interface Learning { @@ -49,6 +60,7 @@ export interface Learning { tags?: string; confidence?: 'high' | 'medium' | 'low'; importance?: number; + provenance?: Provenance | null; } export interface Breadcrumb { @@ -60,6 +72,7 @@ export interface Breadcrumb { project?: string; importance: number; expires_at?: string; + provenance?: Provenance | null; } export interface LoaEntry { @@ -76,6 +89,7 @@ export interface LoaEntry { tags?: string; message_count?: number; importance?: number; + provenance?: Provenance | null; } export interface SearchResult { @@ -85,6 +99,7 @@ export interface SearchResult { project?: string; created_at: string; rank?: number; + provenance?: Provenance | null; } export interface Stats { diff --git a/tests/db/migrations.test.ts b/tests/db/migrations.test.ts index 43020e0..8a52d55 100644 --- a/tests/db/migrations.test.ts +++ b/tests/db/migrations.test.ts @@ -110,10 +110,85 @@ describe('migration failure handling', () => { }); }); +describe('provenance migration (8 to 9)', () => { + const PROVENANCE_TABLES = ['messages', 'decisions', 'learnings', 'breadcrumbs', 'loa_entries']; + + test('all memory tables have provenance column after migrations', () => { + applyMigrations(db); + for (const table of PROVENANCE_TABLES) { + const cols = db.prepare(`PRAGMA table_info(${table})`).all() as any[]; + expect(cols.map((c: any) => c.name)).toContain('provenance'); + } + }); + + 
test('upgrade path: ALTER adds provenance to a legacy table without it', () => { + // Simulate a pre-provenance install: legacy table shape, version 8. + const legacyDir = mkdtempSync(join(tmpdir(), 'recall-legacy-test-')); + const legacyDb = new Database(join(legacyDir, 'legacy.db')); + try { + legacyDb.exec(` + CREATE TABLE messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + timestamp DATETIME NOT NULL, + role TEXT NOT NULL, + content TEXT NOT NULL, + project TEXT, + importance INTEGER DEFAULT 5 + ); + CREATE TABLE decisions (id INTEGER PRIMARY KEY AUTOINCREMENT, decision TEXT NOT NULL); + CREATE TABLE learnings (id INTEGER PRIMARY KEY AUTOINCREMENT, problem TEXT NOT NULL); + CREATE TABLE breadcrumbs (id INTEGER PRIMARY KEY AUTOINCREMENT, content TEXT NOT NULL); + CREATE TABLE loa_entries (id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT NOT NULL, fabric_extract TEXT NOT NULL); + `); + legacyDb.prepare('INSERT INTO messages (session_id, timestamp, role, content) VALUES (?, ?, ?, ?)') + .run('s1', '2026-01-01T00:00:00Z', 'user', 'legacy row'); + legacyDb.prepare('PRAGMA user_version = 8').run(); + + const result = applyMigrations(legacyDb); + expect(result.from).toBe(8); + expect(getMigrationVersion(legacyDb)).toBe(MIGRATIONS.length); + + for (const table of PROVENANCE_TABLES) { + const cols = legacyDb.prepare(`PRAGMA table_info(${table})`).all() as any[]; + expect(cols.map((c: any) => c.name)).toContain('provenance'); + } + + // Legacy rows stay NULL — unknown is never laundered into a value. + const row = legacyDb.prepare('SELECT provenance FROM messages WHERE session_id = ?').get('s1') as any; + expect(row.provenance).toBeNull(); + + // CHECK on the ALTERed column enforces the vocabulary but allows NULL. 
+ expect(() => { + legacyDb.prepare('INSERT INTO messages (session_id, timestamp, role, content, provenance) VALUES (?, ?, ?, ?, ?)') + .run('s1', '2026-01-01T00:00:01Z', 'user', 'bad', 'guessed'); + }).toThrow(); + legacyDb.prepare('INSERT INTO messages (session_id, timestamp, role, content, provenance) VALUES (?, ?, ?, ?, ?)') + .run('s1', '2026-01-01T00:00:02Z', 'user', 'ok', 'verbatim'); + } finally { + legacyDb.close(); + rmSync(legacyDir, { recursive: true, force: true }); + } + }); + + test('CHECK constraint enforces vocabulary on fresh-install DDL', () => { + applyMigrations(db); + const insert = (provenance: string | null) => + db.prepare('INSERT INTO breadcrumbs (content, provenance) VALUES (?, ?)').run('x', provenance); + + for (const valid of ['verbatim', 'user_authored', 'extracted', 'derived', null]) { + expect(() => insert(valid)).not.toThrow(); + } + expect(() => insert('unknown')).toThrow(); + expect(() => insert('VERBATIM')).toThrow(); + }); +}); + describe('MIGRATIONS array', () => { test('has expected number of migrations', () => { // 7 → 8: importance column on messages/decisions/learnings/loa_entries (Sprint #4) - expect(MIGRATIONS.length).toBe(8); + // 8 → 9: provenance column on all five memory tables (issue #42) + expect(MIGRATIONS.length).toBe(9); }); test('all entries are functions', () => { From 7a938426355e069f2f42e9ad11179a9197e99270 Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:41:39 -0400 Subject: [PATCH 2/3] test(provenance): cover write paths, backfill, search display, and result payloads - backfill: dry-run default writes nothing, --execute classifies only evidence-backed rows, never overwrites, idempotent, table filter, unknown-table rejection - write paths: CLI add stamps user_authored, structured extraction stamps extracted, batch message capture persists verbatim, unstamped writes stay NULL - hooks: sqlite-writers stamp extracted + legacy-DB column guard, 
PreCompact flush stamps verbatim + pre-provenance DB guard - conversation import: raw messages stamped verbatim - search(): provenance present for all five record types, NULL as null - CLI display contract: quiet for known, flags unknown, --show-provenance - ADR-0001 contract pins: MCP memory_add schema and CLI expose no provenance override Refs #42 --- tests/commands/provenance.test.ts | 169 +++++++++++++++++++++++ tests/commands/search.test.ts | 53 ++++++- tests/hooks/recall-precompact.test.ts | 32 +++++ tests/hooks/sqlite-writers.test.ts | 45 ++++++ tests/lib/conversation-import.test.ts | 18 +++ tests/lib/provenance-write-paths.test.ts | 168 ++++++++++++++++++++++ 6 files changed, 484 insertions(+), 1 deletion(-) create mode 100644 tests/commands/provenance.test.ts create mode 100644 tests/lib/provenance-write-paths.test.ts diff --git a/tests/commands/provenance.test.ts b/tests/commands/provenance.test.ts new file mode 100644 index 0000000..d98bbc5 --- /dev/null +++ b/tests/commands/provenance.test.ts @@ -0,0 +1,169 @@ +// recall provenance backfill — conservative legacy classification (issue #42, ADR-0001). 
+// +// Binding rules under test: +// - dry-run is the default and writes nothing +// - --execute only sets provenance where deterministic evidence exists +// - rows without evidence stay NULL (unknown), never guessed +// - rows that already have provenance are never overwritten +// - user_authored is never assigned by backfill + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { Database } from 'bun:sqlite'; +import { setupTestDb, teardownTestDb } from '../helpers/setup'; +import { runProvenanceBackfill } from '../../src/commands/provenance'; +import { + createSession, + addMessage, + addDecision, + addLearning, + addBreadcrumb, + createLoaEntry, +} from '../../src/lib/memory'; + +let dbPath: string; +const originalLog = console.log; + +beforeEach(() => { + dbPath = setupTestDb(); + console.log = () => {}; // backfill prints a report; keep test output clean +}); + +afterEach(() => { + console.log = originalLog; + teardownTestDb(); +}); + +function readDb(): Database { + return new Database(dbPath, { readonly: true }); +} + +/** Seeds one legacy (NULL-provenance) landscape across all five tables. 
*/ +function seedLegacyRows(): void { + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z', project: 'demo' }); + + // messages: all legacy rows are deterministic 'verbatim' + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: 'legacy message' }); + + // decisions: evidence marker is category = 'auto-extracted' + addDecision({ session_id: 's1', decision: 'extracted decision', category: 'auto-extracted', status: 'active' }); + addDecision({ session_id: 's1', decision: 'unmarked decision', category: 'manual', status: 'active' }); + addDecision({ session_id: 's1', decision: 'already stamped', status: 'active', provenance: 'user_authored' }); + + // learnings: evidence marker is category = 'auto-extracted' + addLearning({ session_id: 's1', problem: 'extracted problem', solution: 'fix', category: 'auto-extracted' }); + addLearning({ session_id: 's1', problem: 'unmarked problem', solution: 'fix', category: 'other' }); + + // breadcrumbs: evidence marker is category = 'extracted-idea' + addBreadcrumb({ session_id: 's1', content: 'extracted idea', category: 'extracted-idea', importance: 5 }); + addBreadcrumb({ session_id: 's1', content: 'unmarked note', category: 'note', importance: 5 }); + + // loa_entries: all legacy rows are deterministic 'extracted' + createLoaEntry({ title: 'legacy loa', fabric_extract: 'extract body', session_id: 's1' }); +} + +describe('runProvenanceBackfill — dry run (default)', () => { + test('reports classifications without writing anything', () => { + seedLegacyRows(); + + const results = runProvenanceBackfill({}); + + expect(results.length).toBe(5); + const byTable = Object.fromEntries(results.map(r => [r.table, r])); + expect(byTable.messages.classified).toBe(1); + expect(byTable.messages.value).toBe('verbatim'); + expect(byTable.loa_entries.classified).toBe(1); + expect(byTable.loa_entries.value).toBe('extracted'); + // only the evidence-marked rows qualify; pre-stamped row is not 
"unknown" + expect(byTable.decisions.unknownBefore).toBe(2); + expect(byTable.decisions.classified).toBe(1); + expect(byTable.decisions.remainingUnknown).toBe(1); + expect(byTable.learnings.classified).toBe(1); + expect(byTable.breadcrumbs.classified).toBe(1); + + // Nothing was written + const db = readDb(); + const nullCount = (table: string) => + (db.prepare(`SELECT COUNT(*) AS c FROM ${table} WHERE provenance IS NULL`).get() as { c: number }).c; + expect(nullCount('messages')).toBe(1); + expect(nullCount('decisions')).toBe(2); + expect(nullCount('learnings')).toBe(2); + expect(nullCount('breadcrumbs')).toBe(2); + expect(nullCount('loa_entries')).toBe(1); + db.close(); + }); +}); + +describe('runProvenanceBackfill — execute', () => { + test('classifies only evidence-backed rows; the rest stay NULL', () => { + seedLegacyRows(); + + runProvenanceBackfill({ dryRun: false }); + + const db = readDb(); + const provenanceOf = (table: string, where: string) => + (db.prepare(`SELECT provenance FROM ${table} WHERE ${where}`).get() as any)?.provenance; + + expect(provenanceOf('messages', "content = 'legacy message'")).toBe('verbatim'); + expect(provenanceOf('loa_entries', "title = 'legacy loa'")).toBe('extracted'); + + expect(provenanceOf('decisions', "decision = 'extracted decision'")).toBe('extracted'); + expect(provenanceOf('decisions', "decision = 'unmarked decision'")).toBeNull(); + // never overwritten, and user_authored is never assigned by backfill + expect(provenanceOf('decisions', "decision = 'already stamped'")).toBe('user_authored'); + + expect(provenanceOf('learnings', "problem = 'extracted problem'")).toBe('extracted'); + expect(provenanceOf('learnings', "problem = 'unmarked problem'")).toBeNull(); + + expect(provenanceOf('breadcrumbs', "content = 'extracted idea'")).toBe('extracted'); + expect(provenanceOf('breadcrumbs', "content = 'unmarked note'")).toBeNull(); + db.close(); + }); + + test('is idempotent: a second execute classifies nothing new', () => { + 
seedLegacyRows(); + runProvenanceBackfill({ dryRun: false }); + + const second = runProvenanceBackfill({ dryRun: false }); + for (const r of second) { + expect(r.classified).toBe(0); + } + }); + + test('table filter limits the run to one table', () => { + seedLegacyRows(); + + const results = runProvenanceBackfill({ dryRun: false, table: 'decisions' }); + + expect(results.length).toBe(1); + expect(results[0].table).toBe('decisions'); + + const db = readDb(); + // messages untouched by a decisions-only run + const msg = db.prepare("SELECT provenance FROM messages WHERE content = 'legacy message'").get() as any; + expect(msg.provenance).toBeNull(); + db.close(); + }); +}); + +describe('runProvenanceBackfill — input validation', () => { + const originalExitCode = process.exitCode; + const originalError = console.error; + + afterEach(() => { + process.exitCode = originalExitCode ?? 0; + console.error = originalError; + }); + + test('rejects an unknown table', () => { + let errorOutput = ''; + console.error = (msg?: unknown) => { + errorOutput += String(msg); + }; + + const results = runProvenanceBackfill({ table: 'sessions' as any }); + + expect(results).toEqual([]); + expect(errorOutput).toContain('Unknown table: sessions'); + expect(process.exitCode).toBe(1); + }); +}); diff --git a/tests/commands/search.test.ts b/tests/commands/search.test.ts index 9a02b0e..226fe8c 100644 --- a/tests/commands/search.test.ts +++ b/tests/commands/search.test.ts @@ -1,4 +1,4 @@ -import { describe, test, expect, afterEach } from 'bun:test'; +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; import { runSearch } from '../../src/commands/search.js'; describe('runSearch --bias-type guard', () => { @@ -23,3 +23,54 @@ describe('runSearch --bias-type guard', () => { expect(process.exitCode).toBe(1); }); }); + +import { setupTestDb, teardownTestDb } from '../helpers/setup'; +import { createSession, addDecision, addBreadcrumb } from '../../src/lib/memory'; + 
+describe('runSearch provenance display contract (issue #42)', () => { + const originalLog = console.log; + let output: string; + + beforeEach(() => { + setupTestDb(); + output = ''; + console.log = (msg?: unknown) => { + output += `${String(msg ?? '')}\n`; + }; + + createSession({ session_id: 'disp-1', started_at: '2026-01-01T00:00:00Z', project: 'demo' }); + addDecision({ session_id: 'disp-1', decision: 'quizzacious known decision', status: 'active', provenance: 'user_authored' }); + addBreadcrumb({ session_id: 'disp-1', content: 'quizzacious legacy crumb', importance: 5 }); // provenance NULL + }); + + afterEach(() => { + console.log = originalLog; + teardownTestDb(); + }); + + test('default display stays quiet for known provenance and flags unknown', () => { + runSearch('quizzacious', {}); + + const lines = output.split('\n'); + const knownLine = lines.find(l => l.includes('decisions#')); + const unknownLine = lines.find(l => l.includes('breadcrumbs#')); + + expect(knownLine).toBeDefined(); + expect(knownLine).not.toContain('provenance'); + + expect(unknownLine).toBeDefined(); + expect(unknownLine).toContain('⚠'); + expect(unknownLine).toContain('provenance: unknown'); + }); + + test('--show-provenance shows every provenance value', () => { + runSearch('quizzacious', { showProvenance: true }); + + const lines = output.split('\n'); + const knownLine = lines.find(l => l.includes('decisions#')); + const unknownLine = lines.find(l => l.includes('breadcrumbs#')); + + expect(knownLine).toContain('provenance: user_authored'); + expect(unknownLine).toContain('provenance: unknown'); + }); +}); diff --git a/tests/hooks/recall-precompact.test.ts b/tests/hooks/recall-precompact.test.ts index 13052de..4921d58 100644 --- a/tests/hooks/recall-precompact.test.ts +++ b/tests/hooks/recall-precompact.test.ts @@ -274,3 +274,35 @@ describe('RecallPreCompact — flushConversation', () => { expect(hookSource).not.toMatch(/extractWithClaude|extractWithOllama|fetch\(|http\.request/); }); 
}); + +describe('RecallPreCompact — Record Provenance (ADR-0001, issue #42)', () => { + test('stamps flushed messages verbatim when the DB has the provenance column', async () => { + // Migrated DB shape: messages carries the provenance column. + const db = new Database(dbPath); + db.exec(`ALTER TABLE messages ADD COLUMN provenance TEXT CHECK (provenance IN ('verbatim', 'user_authored', 'extracted', 'derived'))`); + db.close(); + + writeJsonlMessages([ + { role: 'user', text: 'a message captured mid-session' }, + { role: 'assistant', text: 'a reply captured mid-session' }, + ]); + + const { flushConversation } = await import('../../hooks/RecallPreCompact'); + const result = flushConversation(convPath, '/tmp/proj'); + expect(result.imported).toBe(2); + + const readDb = new Database(dbPath, { readonly: true }); + const rows = readDb.prepare('SELECT provenance FROM messages ORDER BY id').all() as Array<{ provenance: string }>; + readDb.close(); + expect(rows.map(r => r.provenance)).toEqual(['verbatim', 'verbatim']); + }); + + test('keeps working against a pre-provenance DB (column guard)', async () => { + // CORE_SCHEMA above has no provenance column — the flush must not fail. 
+ writeJsonlMessages([{ role: 'user', text: 'legacy database flush message' }]); + + const { flushConversation } = await import('../../hooks/RecallPreCompact'); + const result = flushConversation(convPath, '/tmp/proj'); + expect(result.imported).toBe(1); + }); +}); diff --git a/tests/hooks/sqlite-writers.test.ts b/tests/hooks/sqlite-writers.test.ts index 866f7eb..4c744f4 100644 --- a/tests/hooks/sqlite-writers.test.ts +++ b/tests/hooks/sqlite-writers.test.ts @@ -177,3 +177,48 @@ describe('writeExtractionErrors', () => { expect(rows[0].fix).toBe('chmod +x'); }); }); + +describe('Record Provenance stamping (ADR-0001, issue #42)', () => { + test('every extraction writer stamps provenance = extracted', () => { + writeDecisionsBatch(dbPath, [{ decision: 'stamped decision' }]); + writeLearningsBatch(dbPath, [{ problem: 'stamped problem', solution: 'fix' }]); + writeBreadcrumbsBatch(dbPath, [{ content: 'stamped crumb' }]); + writeLoaEntryFromExtraction(dbPath, { + title: 'stamped loa', + fabricExtract: '## ONE SENTENCE SUMMARY\ntext', + sessionId: 's1', + }); + + const db = openRead(); + for (const table of ['decisions', 'learnings', 'breadcrumbs', 'loa_entries']) { + const row = db.prepare(`SELECT provenance FROM ${table} LIMIT 1`).get() as any; + expect(row.provenance).toBe('extracted'); + } + db.close(); + }); + + test('still writes into a legacy DB whose tables have no provenance column', () => { + const legacyPath = dbPath.replace('test.db', 'legacy-writers.db'); + const legacy = new Database(legacyPath); + legacy.exec(` + CREATE TABLE decisions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT, + category TEXT, + project TEXT, + decision TEXT NOT NULL, + status TEXT DEFAULT 'active', + importance INTEGER DEFAULT 5 + ); + `); + legacy.close(); + + const n = writeDecisionsBatch(legacyPath, [{ decision: 'legacy write' }]); + expect(n).toBe(1); + + const db = new Database(legacyPath, { readonly: true }); + const row = db.prepare('SELECT decision FROM 
decisions').get() as any; + db.close(); + expect(row.decision).toBe('legacy write'); + }); +}); diff --git a/tests/lib/conversation-import.test.ts b/tests/lib/conversation-import.test.ts index b51c7c4..82f427d 100644 --- a/tests/lib/conversation-import.test.ts +++ b/tests/lib/conversation-import.test.ts @@ -289,3 +289,21 @@ describe('conversationSourceAdapters', () => { } }); }); + +describe('Record Provenance (ADR-0001, issue #42)', () => { + test('raw imported messages are stamped verbatim', async () => { + const file = join(tempDir, 'slack-export.json'); + writeFileSync(file, JSON.stringify([ + { ts: '1710000000.000100', user: 'U1', text: 'hello from slack history' }, + { ts: '1710000001.000200', user: 'U2', text: 'a reply worth remembering' }, + ])); + + const result = await importConversations(file, { format: 'slack', noExtract: true }); + expect(result.messagesImported).toBe(2); + + const db = readDb(); + const rows = db.prepare('SELECT provenance FROM messages ORDER BY timestamp').all() as any[]; + db.close(); + expect(rows.map(r => r.provenance)).toEqual(['verbatim', 'verbatim']); + }); +}); diff --git a/tests/lib/provenance-write-paths.test.ts b/tests/lib/provenance-write-paths.test.ts new file mode 100644 index 0000000..f7251ae --- /dev/null +++ b/tests/lib/provenance-write-paths.test.ts @@ -0,0 +1,168 @@ +// Record Provenance write-path stamping (issue #42, ADR-0001). +// +// Provenance is automatic write-path metadata. Each capture surface stamps +// the value its write-path semantics dictate; no public surface accepts a +// provenance override. 
These tests pin the stamp per path: +// - CLI `recall add` → user_authored +// - structured extraction (Haiku/Fabric output) → extracted +// - raw message capture (import/dump batch writer) → verbatim +// - search() structured results carry provenance for every record type + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { readFileSync } from 'fs'; +import { join } from 'path'; +import { Database } from 'bun:sqlite'; +import { setupTestDb, teardownTestDb } from '../helpers/setup'; +import { runAddBreadcrumb, runAddDecision, runAddLearning } from '../../src/commands/add'; +import { writeStructuredExtraction } from '../../src/lib/structured-extraction'; +import { + createSession, + addMessage, + addMessagesBatch, + addDecision, + addLearning, + addBreadcrumb, + createLoaEntry, + search, +} from '../../src/lib/memory'; + +let dbPath: string; +const originalLog = console.log; + +beforeEach(() => { + dbPath = setupTestDb(); + console.log = () => {}; // add commands print confirmations; keep output clean +}); + +afterEach(() => { + console.log = originalLog; + teardownTestDb(); +}); + +function readDb(): Database { + return new Database(dbPath, { readonly: true }); +} + +describe('CLI add commands stamp user_authored', () => { + test('breadcrumb, decision, and learning all land as user_authored', () => { + runAddBreadcrumb('a crumb worth keeping', { project: 'demo' }); + runAddDecision('we choose sqlite', { project: 'demo' }); + runAddLearning('it was broken', 'we fixed it', { project: 'demo' }); + + const db = readDb(); + for (const table of ['breadcrumbs', 'decisions', 'learnings']) { + const row = db.prepare(`SELECT provenance FROM ${table} LIMIT 1`).get() as any; + expect(row.provenance).toBe('user_authored'); + } + db.close(); + }); +}); + +describe('structured extraction stamps extracted', () => { + test('decisions and LoA entry from an extract are marked extracted', () => { + createSession({ session_id: 'ext-1', started_at: 
'2026-01-01T00:00:00Z', project: 'demo' }); + + const result = writeStructuredExtraction({ + sessionId: 'ext-1', + sessionLabel: 'extraction test', + project: 'demo', + timestamp: '2026-01-01', + conversationPath: '/tmp/conv.jsonl', + topics: ['testing'], + summary: 'a one sentence summary', + extracted: [ + '## ONE SENTENCE SUMMARY', + 'a one sentence summary', + '', + '## DECISIONS MADE', + '- Adopt write-path provenance stamping (confidence: HIGH)', + ].join('\n'), + }); + + expect(result.decisions).toBe(1); + expect(result.loa).toBe(1); + + const db = readDb(); + const decision = db.prepare('SELECT provenance FROM decisions LIMIT 1').get() as any; + const loa = db.prepare('SELECT provenance FROM loa_entries LIMIT 1').get() as any; + db.close(); + expect(decision.provenance).toBe('extracted'); + expect(loa.provenance).toBe('extracted'); + }); +}); + +describe('raw message capture', () => { + test('batch writer persists verbatim when the import path stamps it', () => { + createSession({ session_id: 'imp-1', started_at: '2026-01-01T00:00:00Z', project: 'demo' }); + + // import.ts / conversation-import.ts / dump.ts all map messages through + // addMessagesBatch with provenance: 'verbatim' + addMessagesBatch([ + { session_id: 'imp-1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: 'raw text', provenance: 'verbatim' }, + { session_id: 'imp-1', timestamp: '2026-01-01T00:00:02Z', role: 'assistant', content: 'raw reply', provenance: 'verbatim' }, + ]); + + const db = readDb(); + const rows = db.prepare('SELECT provenance FROM messages ORDER BY id').all() as any[]; + db.close(); + expect(rows.map(r => r.provenance)).toEqual(['verbatim', 'verbatim']); + }); + + test('a write without provenance stays NULL — unknown is representable, never defaulted', () => { + createSession({ session_id: 'imp-2', started_at: '2026-01-01T00:00:00Z', project: 'demo' }); + addMessage({ session_id: 'imp-2', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: 'unstamped' }); + 
+ const db = readDb(); + const row = db.prepare('SELECT provenance FROM messages LIMIT 1').get() as any; + db.close(); + expect(row.provenance).toBeNull(); + }); +}); + +describe('search() structured results carry provenance', () => { + test('every record type returns its provenance; NULL surfaces as null', () => { + createSession({ session_id: 'srch-1', started_at: '2026-01-01T00:00:00Z', project: 'demo' }); + + addMessage({ session_id: 'srch-1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: 'xylocarp message', provenance: 'verbatim' }); + addDecision({ session_id: 'srch-1', decision: 'xylocarp decision', status: 'active', provenance: 'user_authored' }); + addLearning({ session_id: 'srch-1', problem: 'xylocarp problem', solution: 'fix', provenance: 'extracted' }); + addBreadcrumb({ session_id: 'srch-1', content: 'xylocarp crumb', importance: 5, provenance: 'user_authored' }); + createLoaEntry({ title: 'xylocarp loa', fabric_extract: 'xylocarp extract body', session_id: 'srch-1', provenance: 'extracted' }); + // legacy row with unknown provenance + addBreadcrumb({ session_id: 'srch-1', content: 'xylocarp legacy crumb', importance: 5 }); + + const results = search('xylocarp', { limit: 20 }); + const byKey = new Map(results.map(r => [`${r.table}:${r.content}`, r])); + + expect(byKey.get('messages:xylocarp message')?.provenance).toBe('verbatim'); + expect(byKey.get('decisions:xylocarp decision')?.provenance).toBe('user_authored'); + expect(byKey.get('learnings:xylocarp problem')?.provenance).toBe('extracted'); + expect(byKey.get('breadcrumbs:xylocarp crumb')?.provenance).toBe('user_authored'); + expect(byKey.get('breadcrumbs:xylocarp legacy crumb')?.provenance).toBeNull(); + + const loaResult = results.find(r => r.table === 'loa'); + expect(loaResult?.provenance).toBe('extracted'); + }); +}); + +describe('no public provenance override (ADR-0001 contract)', () => { + const repoRoot = join(import.meta.dir, '..', '..'); + + test('MCP memory_add input schema 
exposes no provenance parameter', () => { + const source = readFileSync(join(repoRoot, 'src', 'mcp-server.ts'), 'utf-8'); + const toolStart = source.indexOf('"memory_add"'); + expect(toolStart).toBeGreaterThan(-1); + // The zod input schema sits between the tool name and the handler callback. + const handlerStart = source.indexOf('async (', toolStart); + const schemaBlock = source.slice(toolStart, handlerStart); + expect(schemaBlock).not.toContain('provenance'); + // The handler stamps it instead. + const handlerBlock = source.slice(handlerStart, source.indexOf('server.tool', handlerStart)); + expect(handlerBlock).toContain('provenance: "user_authored"'); + }); + + test('CLI exposes no --provenance flag anywhere', () => { + const source = readFileSync(join(repoRoot, 'src', 'index.ts'), 'utf-8'); + expect(source).not.toContain('--provenance'); + }); +}); From 49c18f438fbb296310dcc73aa15b8be5679498d7 Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:44:16 -0400 Subject: [PATCH 3/3] docs(provenance): document --show-provenance and recall provenance backfill - cli-reference: search flag, display contract, Record Provenance section - mcp-tools: provenance in search/hybrid/recall payloads; memory_add stamps user_authored with no provenance parameter (ADR-0001) - architecture: provenance column + migration 8->9 note - slash-commands + /Recall:search: --show-provenance flag - FOR_CLAUDE/FOR_PI/FOR_OPENCODE: CLI examples kept in sync Refs #42 --- FOR_CLAUDE.md | 2 ++ FOR_OPENCODE.md | 2 ++ FOR_PI.md | 2 ++ commands/Recall/search.md | 1 + docs/architecture.md | 10 ++++++++++ docs/cli-reference.md | 27 +++++++++++++++++++++++++++ docs/mcp-tools.md | 8 +++++--- docs/slash-commands.md | 1 + 8 files changed, 50 insertions(+), 3 deletions(-) diff --git a/FOR_CLAUDE.md b/FOR_CLAUDE.md index 891534f..da25e44 100644 --- a/FOR_CLAUDE.md +++ b/FOR_CLAUDE.md @@ -141,6 +141,8 @@ You can also use the `recall` CLI directly via 
Bash: ```bash recall search "deployment pipeline" # Search memory recall search "database choice" --bias-type decisions # Prefer decisions, keep other matches +recall search "auth" --show-provenance # Show Record Provenance for every result +recall provenance backfill # Classify legacy unknown-provenance rows (dry-run; --execute to apply) recall stats # Database statistics recall loa list # Browse curated knowledge recall dump "Session title" # Capture current session diff --git a/FOR_OPENCODE.md b/FOR_OPENCODE.md index af48a50..02e6164 100644 --- a/FOR_OPENCODE.md +++ b/FOR_OPENCODE.md @@ -86,6 +86,8 @@ You can also use the `recall` CLI directly via Bash tool: ```bash recall search "deployment pipeline" # Search memory recall search "database choice" --bias-type decisions # Prefer decisions, keep other matches +recall search "auth" --show-provenance # Show Record Provenance for every result +recall provenance backfill # Classify legacy unknown-provenance rows (dry-run; --execute to apply) recall stats # Database statistics recall loa list # Browse curated knowledge recall onboard # Interactive L0 identity setup (run once per user) diff --git a/FOR_PI.md b/FOR_PI.md index 58bddaa..2bf23ab 100644 --- a/FOR_PI.md +++ b/FOR_PI.md @@ -86,6 +86,8 @@ You can also use the `recall` CLI directly via shell commands: ```bash recall search "deployment pipeline" # Search memory recall search "database choice" --bias-type decisions # Prefer decisions, keep other matches +recall search "auth" --show-provenance # Show Record Provenance for every result +recall provenance backfill # Classify legacy unknown-provenance rows (dry-run; --execute to apply) recall stats # Database statistics recall loa list # Browse curated knowledge recall onboard # Interactive L0 identity setup (run once per user) diff --git a/commands/Recall/search.md b/commands/Recall/search.md index 8c9c774..5a1882b 100644 --- a/commands/Recall/search.md +++ b/commands/Recall/search.md @@ -18,6 +18,7 @@ recall 
search "$1" - `-t <table>` — Hard-filter to one table: messages, loa, decisions, learnings, breadcrumbs - `--bias-type <table>` — Softly boost one table without filtering other matches. Same values as `-t`. - `-l <n>` — Max results (default: 20) +- `--show-provenance` — Show Record Provenance for every result (by default only unknown provenance is flagged) ## Examples diff --git a/docs/architecture.md b/docs/architecture.md index 93d241b..a7d859e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -78,6 +78,16 @@ tables (`messages`, `decisions`, `learnings`, `loa_entries`). It controls L1 tier ranking at session start. Manage manually with `recall pin` / `recall unpin` or backfill from confidence signals with `recall importance backfill`. +The `provenance` column was added in schema migration 8→9 on all five memory +tables (`messages`, `decisions`, `learnings`, `breadcrumbs`, `loa_entries`). +It declares how each record was created — `verbatim`, `user_authored`, +`extracted`, or `derived` — and is stamped automatically by every write path, +never accepted from callers (see +`docs/adr/0001-record-provenance-automatic-write-path-metadata.md`). Legacy +rows stay `NULL` (unknown) until classified with +`recall provenance backfill`, which only acts on deterministic write-path +evidence and never guesses. 
+ ## Tiered RecallStart (v0.7.0+) The `RecallStart` hook injects two tiers at the top of every session: diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 2742894..43b47da 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -16,6 +16,7 @@ recall search "query" # FTS5 search with options recall search "query" -t decisions # Hard-filter to decisions only recall search "query" --bias-type decisions # Prefer decisions, still show other matching tables recall search "query" -p myproject # Filter by project +recall search "query" --show-provenance # Show provenance for every result recall semantic "query" # Semantic search (explicit) recall hybrid "query" # Hybrid search (explicit) ``` @@ -43,6 +44,8 @@ FTS5 supports boolean operators and prefix matching: - `auth*` — prefix match (authz, authentication, etc.) - `"vpn config"` — exact phrase +By default, search output stays quiet about [Record Provenance](#record-provenance) when a record carries a known value, and visibly flags records whose provenance is unknown (legacy rows that predate the provenance column). Pass `--show-provenance` to display the provenance of every result. + --- ## Capture @@ -209,6 +212,30 @@ recall unpin decisions 42 # Reset to table default (5, or 8 for Lo LoA entries have a write-time floor of 5; `recall pin` will not drop them below that. +## Record Provenance + +The `provenance` column on `messages`, `decisions`, `learnings`, `breadcrumbs`, +and `loa_entries` declares how each record was created: `verbatim` (exact source +text), `user_authored` (directly authored via a user or agent command), +`extracted` (generated from source material, possibly lossy), or `derived` +(mechanically produced from existing memory records). Provenance is **automatic +write-path metadata** — every write path stamps it; there is no flag or MCP +parameter to set it (see `docs/adr/0001-record-provenance-automatic-write-path-metadata.md`). 
+ +Legacy rows that predate the column have no declared provenance (`NULL`, +reported as `unknown`). The backfill classifies them conservatively — only +where the source table or a write-path marker gives deterministic evidence; +everything else stays unknown rather than being guessed: + +```bash +recall provenance backfill # Dry-run report (default) +recall provenance backfill --execute # Apply the classification +recall provenance backfill --execute -t loa_entries # Limit to one table +``` + +Allowed `-t/--table` values: `messages`, `decisions`, `learnings`, +`breadcrumbs`, `loa_entries`, `all` (default). + ## Benchmarks Phase 2 benchmark harness for measuring context efficiency. diff --git a/docs/mcp-tools.md b/docs/mcp-tools.md index f36e1c3..31eab0d 100644 --- a/docs/mcp-tools.md +++ b/docs/mcp-tools.md @@ -22,7 +22,7 @@ Use `table` when you need a **hard filter** to one record type. Use `bias_type` | bias_type | string | no | — | Softly boost one table type in ranking without filtering other matches. Same allowed values as `table`; prefer `table` when you need only one type. | | limit | number | no | 10 | Maximum number of results to return | -**Returns:** Array of matching records with table name, id, content, project, and snippet highlighting. +**Returns:** Array of matching records with table name, id, content, project, snippet highlighting, and Record Provenance (`verbatim`, `user_authored`, `extracted`, `derived`, or `unknown` for legacy rows that predate provenance). ```js // Only decisions @@ -48,7 +48,7 @@ Combined keyword + semantic search using Reciprocal Rank Fusion. Best for natura | project | string | no | — | Filter results to a specific project name | | limit | number | no | 10 | Maximum number of results to return | -**Returns:** Array of matching records ranked by fused keyword and semantic relevance scores. +**Returns:** Array of matching records ranked by fused keyword and semantic relevance scores, each with its Record Provenance. 
```js memory_hybrid_search({ query: "how did we handle rate limiting", project: "my-app" }) @@ -67,7 +67,7 @@ Get recent context — LoA entries, decisions, and breadcrumbs. Good for orienti | limit | number | no | 5 | Number of recent entries to return per category | | project | string | no | — | Filter results to a specific project name | -**Returns:** Recent records grouped by category: Library of Alexandria entries, decisions, and breadcrumbs. +**Returns:** Recent records grouped by category: Library of Alexandria entries, decisions, and breadcrumbs — each annotated with its Record Provenance. ```js memory_recall({ limit: 5, project: "my-app" }) @@ -112,6 +112,8 @@ Add structured records during a session. Use this to capture decisions, learning **Returns:** Confirmation with the new record's id and table. +Records created through `memory_add` are automatically stamped with Record Provenance `user_authored`. There is intentionally no provenance parameter — provenance is write-path metadata, not a caller claim (see `docs/adr/0001-record-provenance-automatic-write-path-metadata.md`). + ```js memory_add({ type: "decision", content: "Use PostgreSQL over MySQL", detail: "Better JSON support and JSONB indexing" }) memory_add({ type: "learning", content: "bun:sqlite uses $param syntax", detail: "Not :param like better-sqlite3", tags: "bun,sqlite" }) diff --git a/docs/slash-commands.md b/docs/slash-commands.md index 9213e55..50343ab 100644 --- a/docs/slash-commands.md +++ b/docs/slash-commands.md @@ -27,6 +27,7 @@ Searches messages, LoA entries, decisions, learnings, and breadcrumbs. 
The slash - `/Recall:search database choice -t decisions` — hard-filter to decisions only - `/Recall:search database choice --bias-type decisions` — prefer decisions first, while still returning matching learnings/messages/LoA/breadcrumbs +- `/Recall:search database choice --show-provenance` — show Record Provenance for every result (by default only unknown provenance is flagged) Rule of thumb: use `-t` when you want only one table; use `--bias-type` when you want one table first without hiding other context.