From 779be252a6c9512bb9bfd9304fb530aba479726b Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Thu, 11 Jun 2026 04:35:44 -0400 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20add=20recall=20repair=20=E2=80=94?= =?UTF-8?q?=20FTS5=20index=20rebuild,=20re-embedding,=20and=20report-only?= =?UTF-8?q?=20integrity=20checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicit data/index maintenance separate from doctor --fix (issue #46). Dry-run by default; --execute applies. Rebuilds drifted FTS5 indexes from source tables, recreates missing indexes/sync triggers from canonical per-table schema DDL (the un-migrated-DB silently-empty-search gap), re-embeds rows missing embeddings when Ollama is available, and reports orphan/invariant problems without mutating them. Never hard-deletes rows, never touches Record Provenance. Doctor gains a read-only FTS health check that recommends repair; --fix remains symlinks-only. Closes #46 --- docs/cli-reference.md | 49 ++- docs/troubleshooting.md | 20 ++ src/commands/doctor.ts | 34 ++ src/commands/embed.ts | 18 +- src/commands/repair.ts | 183 +++++++++++ src/db/schema.ts | 193 +++++++----- src/index.ts | 21 +- src/lib/repair.ts | 565 ++++++++++++++++++++++++++++++++++ tests/commands/repair.test.ts | 337 ++++++++++++++++++++ tests/lib/repair.test.ts | 372 ++++++++++++++++++++++ 10 files changed, 1705 insertions(+), 87 deletions(-) create mode 100644 src/commands/repair.ts create mode 100644 src/lib/repair.ts create mode 100644 tests/commands/repair.test.ts create mode 100644 tests/lib/repair.test.ts diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 671de42..f0a265f 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -343,6 +343,53 @@ Marked duplicates are hidden from all search paths by default; see `recall search --include-duplicates`. Lineage is included in `recall export`, so the audit trail is portable. +## Repair + +Explicit data/index maintenance — deliberately separate from +`recall doctor --fix`, which only repairs install-layout symlinks and never +touches data. + +```bash +recall repair # Dry-run report (default — writes nothing) +recall repair --execute # Apply the planned repairs +recall repair -t messages # Scope to one table +recall repair --no-embed # Skip the embedding pass +``` + +What repair covers: + +- **FTS5 index rebuild.** Each search index (`messages`, `decisions`, + `learnings`, `breadcrumbs`, `loa_entries`, `telos`, `documents`) is checked + against its source table — indexed row count (via the docsize shadow + table), sync-trigger presence, and the FTS5 `integrity-check` command. A + drifted index is rebuilt from its source table; a missing index or missing + sync triggers (the classic symptom: search silently returns nothing on an + un-migrated database) are recreated from the canonical schema DDL and then + rebuilt. +- **Re-embedding.** Rows expected to carry embeddings (`loa_entries`, + `decisions`, `learnings`, assistant `messages`) that have none are + re-embedded when the Ollama embedding service is available and the row has + enough source text. If the service is unavailable, repair reports the + missing embeddings and still exits successfully — unless another requested + repair failed. Partial results are never hidden: embedded, skipped + (too short), and failed counts are all reported. +- **Orphan/invariant reporting.** Named, unambiguous integrity checks — + orphaned embeddings, dedup lineage pointing at missing rows, messages + without a session, broken LoA message ranges and parent links, pending + schema migrations — are **report-only**. Repair never attempts heuristic + data mutation; pending migrations are fixed by `recall init`. + +Safety model: + +- **Dry-run by default.** Mutations require `--execute`. Run + `recall export --backup` before applying repairs. +- **Repair never hard-deletes rows.** +- **Repair never changes [Record Provenance](#record-provenance).** FTS + rebuild regenerates index shadow tables; re-embedding only inserts into + the `embeddings` table. No source-table column is written. +- `recall doctor` may recommend `recall repair`, but `doctor --fix` never + runs data repair implicitly. + ## Admin ```bash @@ -359,7 +406,7 @@ recall onboard # Interactive L0 identity interview (see `recall init` creates the database schema if it does not exist, and applies any pending migrations. It is safe to run on an existing database. -`recall doctor` checks the database connection, schema integrity, FTS5 index health, MCP server registration, Ollama availability, and the per-platform symlinks under `~/.agents/Recall/`. Run this first when troubleshooting. Pass `--fix` to repair drift: missing symlinks are re-created; user-modified files at symlink targets are backed up under `~/.agents/Recall/backups//doctor-fix/` before being replaced. +`recall doctor` checks the database connection, schema integrity, FTS5 index health, MCP server registration, Ollama availability, and the per-platform symlinks under `~/.agents/Recall/`. Run this first when troubleshooting. Pass `--fix` to repair drift: missing symlinks are re-created; user-modified files at symlink targets are backed up under `~/.agents/Recall/backups//doctor-fix/` before being replaced. `--fix` only ever repairs symlinks — data and index maintenance is the explicit job of [`recall repair`](#repair), which doctor recommends when an FTS index is out of sync. `recall stats` reports row counts per table and total database size. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 3a26838..ed7999f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -119,6 +119,26 @@ symlinks resolve to readable files after linking — so a silent Extraction hooks fire on the `Stop` event. If the hook isn't registered in `settings.json`, re-run `./install.sh`. +### "Search returns nothing, but the data is there" + +`recall show` / `recall recent` find records that `recall search` never +returns. The usual cause is an FTS5 index out of sync with its source table — +classically a database created by an older version where the index or its +sync triggers were never created, so every write since has been silently +unindexed. + +```bash +recall doctor # the FTS index check reports which indexes drifted +recall repair # dry-run: shows the planned rebuilds, writes nothing +recall export --backup # recommended before any repair +recall repair --execute # rebuild the indexes from the source tables +``` + +See [Repair in the CLI reference](cli-reference.md#repair) for the full +safety model. `recall doctor --fix` does **not** fix this — doctor's `--fix` +only repairs symlinks; data and index maintenance always goes through +`recall repair`. + ### "Embedding service unavailable" Embeddings are optional. Hybrid search falls back to FTS5-only automatically. diff --git a/src/commands/doctor.ts b/src/commands/doctor.ts index 9e00fb2..f445d75 100644 --- a/src/commands/doctor.ts +++ b/src/commands/doctor.ts @@ -6,6 +6,7 @@ import { execSync, spawnSync } from 'child_process'; import { createHash } from 'crypto'; import { homedir } from 'os'; import { getDb, getDbPath } from '../db/connection.js'; +import { checkAllFts } from '../lib/repair.js'; import { VERSION } from '../version.js'; export interface DoctorOptions { @@ -138,6 +139,38 @@ function checkStructuredData(): CheckResult { } } +// ───────────────────────────────────────── +// Check: FTS5 index health +// ───────────────────────────────────────── +// +// Read-only by design (issue #46): doctor only RECOMMENDS `recall repair`. +// This check is exported as a plain CheckResult with no repair fn, so the +// --fix loop (symlinks only) can never run data repair implicitly. +export function checkFtsIndexes(): CheckResult { + const label = 'FTS5 search indexes in sync'; + + try { + const reports = checkAllFts(getDb()); + const problems = reports.filter(r => r.status !== 'ok'); + + if (problems.length === 0) { + return { label, status: 'PASS', message: `All ${reports.length} FTS indexes in sync with source tables` }; + } + + const summary = problems + .map(p => `${p.ftsTable}: ${p.status} (${p.detail})`) + .join('; '); + return { + label, + status: 'WARN', + message: `${summary} — run 'recall repair' to inspect, 'recall repair --execute' to fix`, + }; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return { label, status: 'FAIL', message: `Check failed: ${msg}` }; + } +} + // ───────────────────────────────────────── // Check 4: Extraction output files // ───────────────────────────────────────── @@ -601,6 +634,7 @@ export async function runDoctor(opts: DoctorOptions = {}): Promise { results.push(checkDatabase()); results.push(checkMessages()); results.push(checkStructuredData()); + results.push(checkFtsIndexes()); results.push(checkExtractionFiles()); results.push(checkExtractLog()); results.push(checkTrackerLockouts()); diff --git a/src/commands/embed.ts b/src/commands/embed.ts index 66bd623..6273d93 100644 --- a/src/commands/embed.ts +++ b/src/commands/embed.ts @@ -3,6 +3,7 @@ import { getDb } from '../db/connection.js'; import { embed, embeddingToBlob, blobToEmbedding, cosineSimilarity, checkEmbeddingService, reciprocalRankFusion, EMBEDDING_MODEL } from '../lib/embeddings.js'; import { notMarkedDuplicateSql } from '../lib/dedup.js'; +import { embeddingTextFor } from '../lib/repair.js'; import { search as ftsSearch } from '../lib/memory.js'; // Marked duplicates (recall dedup, issue #45) keep their embeddings but are @@ -20,21 +21,12 @@ interface EmbedOptions { } /** - * Get content to embed for a given table + * Get content to embed for a given table. The per-table composition rules + * live in src/lib/repair.ts (shared with recall repair); this only maps the + * CLI's short 'loa' alias to the real table name. */ function getContentForTable(table: string, row: any): string { - switch (table) { - case 'loa': - return `${row.title}\n\n${row.fabric_extract}`; - case 'decisions': - return `${row.decision}\n\nReasoning: ${row.reasoning || 'N/A'}`; - case 'learnings': - return `${row.problem}\n\nSolution: ${row.solution || 'N/A'}`; - case 'messages': - return row.content; - default: - return row.content || row.text || ''; - } + return embeddingTextFor(table === 'loa' ? 'loa_entries' : table, row); } /** diff --git a/src/commands/repair.ts b/src/commands/repair.ts new file mode 100644 index 0000000..57cfdbb --- /dev/null +++ b/src/commands/repair.ts @@ -0,0 +1,183 @@ +// recall repair command (issue #46). +// +// Explicit data/index maintenance: rebuild FTS5 indexes from their source +// tables (recreating missing indexes from the canonical schema DDL) and +// re-embed rows missing embeddings when the Ollama service is available. +// Orphan/invariant problems that cannot be repaired safely are report-only. +// +// Dry-run by default; --execute applies. Deliberately separate from +// `recall doctor --fix`, which only repairs install-layout symlinks and +// never runs data repair. Core logic lives in src/lib/repair.ts. + +import { getDb } from '../db/connection.js'; +import { checkEmbeddingService, embed } from '../lib/embeddings.js'; +import { + applyEmbedRepair, + applyFtsRepair, + FTS_SOURCES, + planRepair, + type EmbedFn, + type EmbedRepairResult, + type FtsRepairResult, + type RepairPlan, +} from '../lib/repair.js'; + +export interface RepairOptions { + execute?: boolean; + table?: string; + /** Commander maps --no-embed to embed: false. */ + embed?: boolean; +} + +/** + * Injectable service/embedding clients so tests run deterministically + * offline. The CLI always uses the real Ollama-backed defaults. + */ +export interface RepairDeps { + checkService: () => Promise<{ available: boolean; model: string; url: string }>; + embedFn: EmbedFn; +} + +export interface RepairRunResult { + plan: RepairPlan; + fts: FtsRepairResult | null; + embeddings: EmbedRepairResult | null; + /** Why the embedding pass did not run, or null if it ran. */ + embedSkipped: string | null; +} + +const DEFAULT_DEPS: RepairDeps = { checkService: checkEmbeddingService, embedFn: embed }; + +export async function runRepair( + options: RepairOptions = {}, + deps: RepairDeps = DEFAULT_DEPS +): Promise { + const execute = options.execute ?? false; + const embedPass = options.embed ?? true; + + const target = options.table ?? 'all'; + if (target !== 'all' && !FTS_SOURCES.includes(target)) { + console.error(`Invalid --table "${target}". Valid tables: ${FTS_SOURCES.join(', ')}, all.`); + process.exitCode = 1; + return undefined; + } + + const db = getDb(); + const plan = planRepair(db, { + table: target === 'all' ? undefined : target, + embed: embedPass, + }); + + console.log(execute ? '[EXECUTE — applying repairs]\n' : '[DRY RUN — no changes written]\n'); + if (execute) { + console.log("Recommended: run 'recall export --backup' before applying repairs.\n"); + } + + // ── FTS indexes ────────────────────────────────────────────── + console.log('FTS indexes:'); + for (const report of plan.fts) { + const rows = report.sourceRows !== null ? `${report.sourceRows} source row(s)` : 'source missing'; + const planned = + report.action === 'rebuild' ? (execute ? 'rebuilding' : 'would rebuild') + : report.action === 'create-and-rebuild' ? (execute ? 'creating + rebuilding' : 'would create + rebuild') + : report.action === 'report-only' ? 'unrepairable here' + : 'no action'; + console.log(` ${report.ftsTable}: ${report.status} (${rows}) — ${report.detail} [${planned}]`); + } + + let ftsResult: FtsRepairResult | null = null; + if (execute) { + ftsResult = applyFtsRepair(db, plan); + if (ftsResult.created.length > 0) { + console.log(` Created from canonical schema: ${ftsResult.created.join(', ')}`); + } + if (ftsResult.rebuilt.length > 0) { + console.log(` Rebuilt from source tables: ${ftsResult.rebuilt.join(', ')}`); + } + for (const failure of ftsResult.failed) { + console.error(` FAILED ${failure.ftsTable}: ${failure.error}`); + process.exitCode = 1; + } + } + console.log(''); + + // ── Embeddings ─────────────────────────────────────────────── + console.log('Embeddings:'); + let embedResult: EmbedRepairResult | null = null; + let embedSkipped: string | null = null; + + if (!embedPass) { + embedSkipped = 'disabled (--no-embed)'; + console.log(` Skipped — ${embedSkipped}`); + } else if (plan.embedGaps.length === 0) { + console.log(' No embeddable tables in scope.'); + } else { + for (const gap of plan.embedGaps) { + const shortNote = gap.tooShort > 0 ? ` (${gap.tooShort} too short to embed)` : ''; + console.log(` ${gap.table}: ${gap.missing} missing${shortNote}`); + } + const embeddable = plan.embedGaps.reduce((sum, g) => sum + (g.missing - g.tooShort), 0); + + if (execute && embeddable > 0) { + const service = await deps.checkService(); + if (!service.available) { + // Diagnostic path: report and stay successful — an unreachable + // embedding service is an environment state, not a repair failure. + embedSkipped = `embedding service unavailable at ${service.url} (model ${service.model})`; + console.log(` Skipped re-embedding — ${embedSkipped}.`); + console.log(` ${embeddable} row(s) still missing embeddings; re-run when Ollama is up.`); + } else { + embedResult = await applyEmbedRepair(db, plan, deps.embedFn); + console.log(` Embedded ${embedResult.embedded} row(s), skipped ${embedResult.skippedTooShort} too-short row(s), ${embedResult.failed.length} failure(s).`); + for (const failure of embedResult.failed.slice(0, 5)) { + console.error(` FAILED ${failure.table}#${failure.id}: ${failure.error}`); + } + if (embedResult.failed.length > 5) { + console.error(` ...and ${embedResult.failed.length - 5} more failures`); + } + } + } else if (execute && embeddable === 0) { + console.log(' Nothing to embed.'); + } + } + console.log(''); + + // ── Orphans / invariants (report-only) ─────────────────────── + console.log('Orphans / invariants (report-only, never repaired automatically):'); + if (plan.orphans.length === 0) { + console.log(' None found.'); + } else { + for (const orphan of plan.orphans) { + if (orphan.error) { + console.log(` ${orphan.check}: check failed — ${orphan.error}`); + } else { + const sample = orphan.sample.length > 0 ? ` — ${orphan.sample.join(', ')}` : ''; + console.log(` ${orphan.check}: ${orphan.count} (${orphan.description})${sample}`); + } + } + } + console.log(''); + + // ── Schema state ───────────────────────────────────────────── + if (plan.migrations.pending > 0) { + console.log( + `Schema: ${plan.migrations.pending} migration(s) pending ` + + `(version ${plan.migrations.current}, target ${plan.migrations.target}) — run 'recall init' to apply.` + ); + } else { + console.log(`Schema: migrations up to date (version ${plan.migrations.current}).`); + } + console.log(''); + + if (!execute) { + const ftsWork = plan.fts.filter(f => f.action === 'rebuild' || f.action === 'create-and-rebuild').length; + const embedWork = plan.embedGaps.reduce((sum, g) => sum + (g.missing - g.tooShort), 0); + if (ftsWork > 0 || embedWork > 0) { + console.log("Re-run with --execute to apply repairs. Recommended: 'recall export --backup' first."); + } else { + console.log('Nothing to repair.'); + } + } + + return { plan, fts: ftsResult, embeddings: embedResult, embedSkipped }; +} diff --git a/src/db/schema.ts b/src/db/schema.ts index 269e0b9..567aeb8 100644 --- a/src/db/schema.ts +++ b/src/db/schema.ts @@ -259,7 +259,21 @@ CREATE INDEX IF NOT EXISTS idx_dedup_lineage_survivor ON dedup_lineage(survivor_table, survivor_id); `; -export const CREATE_FTS = ` +// Per-source-table FTS5 DDL. Single source of truth: the CREATE_FTS / +// CREATE_FTS_TRIGGERS strings consumed by initDb are derived from this map, +// and `recall repair` uses individual entries to recreate one missing index +// without touching the others. All statements are idempotent (IF NOT EXISTS). +export interface FtsTableSchema { + /** FTS5 virtual table name (e.g. messages_fts). */ + ftsTable: string; + createTable: string; + createTriggers: string; +} + +export const FTS_SCHEMA: Record = { + messages: { + ftsTable: 'messages_fts', + createTable: ` -- FTS5 virtual table for messages (conversation search) CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5( content, @@ -267,70 +281,8 @@ CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5( content='messages', content_rowid='id' ); - --- FTS5 virtual table for decisions -CREATE VIRTUAL TABLE IF NOT EXISTS decisions_fts USING fts5( - decision, - reasoning, - project, - content='decisions', - content_rowid='id' -); - --- FTS5 virtual table for learnings -CREATE VIRTUAL TABLE IF NOT EXISTS learnings_fts USING fts5( - problem, - solution, - tags, - project, - content='learnings', - content_rowid='id' -); - --- FTS5 virtual table for breadcrumbs -CREATE VIRTUAL TABLE IF NOT EXISTS breadcrumbs_fts USING fts5( - content, - category, - project, - content='breadcrumbs', - content_rowid='id' -); - --- FTS5 virtual table for LoA entries -CREATE VIRTUAL TABLE IF NOT EXISTS loa_fts USING fts5( - title, - description, - fabric_extract, - tags, - project, - content='loa_entries', - content_rowid='id' -); - --- FTS5 virtual table for TELOS -CREATE VIRTUAL TABLE IF NOT EXISTS telos_fts USING fts5( - code, - type, - title, - content, - category, - content='telos', - content_rowid='id' -); - --- FTS5 virtual table for documents -CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( - title, - type, - content, - summary, - path, - content='documents', - content_rowid='id' -); -`; - -export const CREATE_FTS_TRIGGERS = ` +`, + createTriggers: ` -- Messages FTS triggers CREATE TRIGGER IF NOT EXISTS messages_ai AFTER INSERT ON messages BEGIN INSERT INTO messages_fts(rowid, content, project) VALUES (new.id, new.content, new.project); @@ -342,7 +294,21 @@ CREATE TRIGGER IF NOT EXISTS messages_au AFTER UPDATE ON messages BEGIN INSERT INTO messages_fts(messages_fts, rowid, content, project) VALUES('delete', old.id, old.content, old.project); INSERT INTO messages_fts(rowid, content, project) VALUES (new.id, new.content, new.project); END; - +`, + }, + decisions: { + ftsTable: 'decisions_fts', + createTable: ` +-- FTS5 virtual table for decisions +CREATE VIRTUAL TABLE IF NOT EXISTS decisions_fts USING fts5( + decision, + reasoning, + project, + content='decisions', + content_rowid='id' +); +`, + createTriggers: ` -- Decisions FTS triggers CREATE TRIGGER IF NOT EXISTS decisions_ai AFTER INSERT ON decisions BEGIN INSERT INTO decisions_fts(rowid, decision, reasoning, project) VALUES (new.id, new.decision, new.reasoning, new.project); @@ -354,7 +320,22 @@ CREATE TRIGGER IF NOT EXISTS decisions_au AFTER UPDATE ON decisions BEGIN INSERT INTO decisions_fts(decisions_fts, rowid, decision, reasoning, project) VALUES('delete', old.id, old.decision, old.reasoning, old.project); INSERT INTO decisions_fts(rowid, decision, reasoning, project) VALUES (new.id, new.decision, new.reasoning, new.project); END; - +`, + }, + learnings: { + ftsTable: 'learnings_fts', + createTable: ` +-- FTS5 virtual table for learnings +CREATE VIRTUAL TABLE IF NOT EXISTS learnings_fts USING fts5( + problem, + solution, + tags, + project, + content='learnings', + content_rowid='id' +); +`, + createTriggers: ` -- Learnings FTS triggers CREATE TRIGGER IF NOT EXISTS learnings_ai AFTER INSERT ON learnings BEGIN INSERT INTO learnings_fts(rowid, problem, solution, tags, project) VALUES (new.id, new.problem, new.solution, new.tags, new.project); @@ -366,7 +347,21 @@ CREATE TRIGGER IF NOT EXISTS learnings_au AFTER UPDATE ON learnings BEGIN INSERT INTO learnings_fts(learnings_fts, rowid, problem, solution, tags, project) VALUES('delete', old.id, old.problem, old.solution, old.tags, old.project); INSERT INTO learnings_fts(rowid, problem, solution, tags, project) VALUES (new.id, new.problem, new.solution, new.tags, new.project); END; - +`, + }, + breadcrumbs: { + ftsTable: 'breadcrumbs_fts', + createTable: ` +-- FTS5 virtual table for breadcrumbs +CREATE VIRTUAL TABLE IF NOT EXISTS breadcrumbs_fts USING fts5( + content, + category, + project, + content='breadcrumbs', + content_rowid='id' +); +`, + createTriggers: ` -- Breadcrumbs FTS triggers CREATE TRIGGER IF NOT EXISTS breadcrumbs_ai AFTER INSERT ON breadcrumbs BEGIN INSERT INTO breadcrumbs_fts(rowid, content, category, project) VALUES (new.id, new.content, new.category, new.project); @@ -378,7 +373,23 @@ CREATE TRIGGER IF NOT EXISTS breadcrumbs_au AFTER UPDATE ON breadcrumbs BEGIN INSERT INTO breadcrumbs_fts(breadcrumbs_fts, rowid, content, category, project) VALUES('delete', old.id, old.content, old.category, old.project); INSERT INTO breadcrumbs_fts(rowid, content, category, project) VALUES (new.id, new.content, new.category, new.project); END; - +`, + }, + loa_entries: { + ftsTable: 'loa_fts', + createTable: ` +-- FTS5 virtual table for LoA entries +CREATE VIRTUAL TABLE IF NOT EXISTS loa_fts USING fts5( + title, + description, + fabric_extract, + tags, + project, + content='loa_entries', + content_rowid='id' +); +`, + createTriggers: ` -- LoA FTS triggers CREATE TRIGGER IF NOT EXISTS loa_ai AFTER INSERT ON loa_entries BEGIN INSERT INTO loa_fts(rowid, title, description, fabric_extract, tags, project) VALUES (new.id, new.title, new.description, new.fabric_extract, new.tags, new.project); @@ -390,7 +401,23 @@ CREATE TRIGGER IF NOT EXISTS loa_au AFTER UPDATE ON loa_entries BEGIN INSERT INTO loa_fts(loa_fts, rowid, title, description, fabric_extract, tags, project) VALUES('delete', old.id, old.title, old.description, old.fabric_extract, old.tags, old.project); INSERT INTO loa_fts(rowid, title, description, fabric_extract, tags, project) VALUES (new.id, new.title, new.description, new.fabric_extract, new.tags, new.project); END; - +`, + }, + telos: { + ftsTable: 'telos_fts', + createTable: ` +-- FTS5 virtual table for TELOS +CREATE VIRTUAL TABLE IF NOT EXISTS telos_fts USING fts5( + code, + type, + title, + content, + category, + content='telos', + content_rowid='id' +); +`, + createTriggers: ` -- TELOS FTS triggers CREATE TRIGGER IF NOT EXISTS telos_ai AFTER INSERT ON telos BEGIN INSERT INTO telos_fts(rowid, code, type, title, content, category) VALUES (new.id, new.code, new.type, new.title, new.content, new.category); @@ -402,7 +429,23 @@ CREATE TRIGGER IF NOT EXISTS telos_au AFTER UPDATE ON telos BEGIN INSERT INTO telos_fts(telos_fts, rowid, code, type, title, content, category) VALUES('delete', old.id, old.code, old.type, old.title, old.content, old.category); INSERT INTO telos_fts(rowid, code, type, title, content, category) VALUES (new.id, new.code, new.type, new.title, new.content, new.category); END; - +`, + }, + documents: { + ftsTable: 'documents_fts', + createTable: ` +-- FTS5 virtual table for documents +CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( + title, + type, + content, + summary, + path, + content='documents', + content_rowid='id' +); +`, + createTriggers: ` -- Documents FTS triggers CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN INSERT INTO documents_fts(rowid, title, type, content, summary, path) VALUES (new.id, new.title, new.type, new.content, new.summary, new.path); @@ -414,7 +457,13 @@ CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN INSERT INTO documents_fts(documents_fts, rowid, title, type, content, summary, path) VALUES('delete', old.id, old.title, old.type, old.content, old.summary, old.path); INSERT INTO documents_fts(rowid, title, type, content, summary, path) VALUES (new.id, new.title, new.type, new.content, new.summary, new.path); END; -`; +`, + }, +}; + +export const CREATE_FTS = Object.values(FTS_SCHEMA).map(s => s.createTable).join(''); + +export const CREATE_FTS_TRIGGERS = Object.values(FTS_SCHEMA).map(s => s.createTriggers).join(''); // Vector embeddings tables (requires sqlite-vec extension) export const CREATE_VECTOR_TABLES = ` diff --git a/src/index.ts b/src/index.ts index a2e2aaf..821adc1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -31,6 +31,7 @@ import { runMigrate } from './commands/migrate.js'; import { runPath } from './commands/path.js'; import { runExport } from './commands/export.js'; import { runDedup } from './commands/dedup.js'; +import { runRepair } from './commands/repair.js'; import { DEFAULT_SEMANTIC_THRESHOLD } from './lib/dedup.js'; import { closeDb } from './db/connection.js'; @@ -686,6 +687,24 @@ program closeDb(); }); +// recall repair — data/index maintenance (issue #46) +// Dry-run by default; --execute rebuilds FTS5 indexes and re-embeds rows +// missing embeddings. Separate from `recall doctor --fix` (symlinks only). +program + .command('repair') + .description('Rebuild FTS5 indexes and re-embed missing embeddings (dry-run by default)') + .option('--execute', 'Apply the planned repairs (default is dry-run)') + .option('-t, --table ', 'Target table: messages, decisions, learnings, breadcrumbs, loa_entries, telos, documents, all', 'all') + .option('--no-embed', 'Skip the embedding pass') + .action(async (options) => { + await runRepair({ + execute: options.execute, + table: options.table, + embed: options.embed + }); + closeDb(); + }); + // Default command: recall → hybrid search (Phase 3: best of both worlds) program .arguments('[query]') @@ -696,7 +715,7 @@ program .option('-k, --keyword', 'Use keyword search only (FTS5)') .option('-v, --vector', 'Use vector search only (semantic)') .action(async (query, options) => { - if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export', 'dedup'].includes(query)) { + if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export', 'dedup', 'repair'].includes(query)) { if (options.keyword) { // FTS5 only runSearch(query, { diff --git a/src/lib/repair.ts b/src/lib/repair.ts new file mode 100644 index 0000000..f8109cf --- /dev/null +++ b/src/lib/repair.ts @@ -0,0 +1,565 @@ +// recall repair — core logic (issue #46). +// +// Explicit data/index maintenance, separate from `recall doctor --fix` +// (doctor --fix only repairs install-layout symlinks and never runs data +// repair; doctor may *recommend* repair). +// +// Safety model: +// - Dry-run by default; mutations require an explicit execute step. +// - Repair never changes Record Provenance — no repair statement writes a +// source-table column. FTS rebuild regenerates index shadow tables from +// the source rows; re-embed only inserts into the embeddings table. +// - Repair never hard-deletes rows. Orphan/invariant findings are +// report-only — every check is named in code and covered by tests. +// - Initial repair scope is limited to unambiguous maintenance: FTS5 +// rebuild (including recreating a missing index from the canonical +// schema DDL) and re-embedding rows missing embeddings. No heuristic +// data mutation. +// +// Bind-count note (see src/lib/chunk.ts): all SQL here binds a fixed number +// of parameters — aggregate counts, EXISTS anti-joins with constant table +// names, and keyset pagination (`WHERE id > ? LIMIT ?`) for the embed +// candidate walk. Nothing scales with input size. + +import { Database } from 'bun:sqlite'; +import { SQLITE_SAFE_CHUNK_SIZE } from './chunk.js'; +import { FTS_SCHEMA } from '../db/schema.js'; +import { MIGRATIONS, getMigrationVersion } from '../db/migrations.js'; +import { notMarkedDuplicateSql } from './dedup.js'; +import { embeddingToBlob, type EmbeddingResult } from './embeddings.js'; +import { PROVENANCE_TABLES } from '../types/index.js'; + +/** Source tables carrying an FTS5 index — derived from the schema map. */ +export const FTS_SOURCES = Object.keys(FTS_SCHEMA); + +/** + * Minimum trimmed text length a row needs to be embedded. Matches the + * eligibility rule `recall embed backfill` has always applied. + */ +export const MIN_EMBED_TEXT_LENGTH = 10; + +// --------------------------------------------------------------------------- +// Embedding sources — which rows are expected to carry embeddings, and what +// text gets embedded. Single source of truth shared with `recall embed`. +// --------------------------------------------------------------------------- + +export interface EmbedSourceConfig { + table: 'messages' | 'decisions' | 'learnings' | 'loa_entries'; + /** Columns selected to build the embedding text. */ + columns: string[]; + /** Restricts which rows are expected to be embedded (constant SQL). */ + extraWhere?: string; + text: (row: Record) => string; +} + +export const EMBED_SOURCES: EmbedSourceConfig[] = [ + { + table: 'loa_entries', + columns: ['title', 'fabric_extract'], + text: r => `${r.title}\n\n${r.fabric_extract}`, + }, + { + table: 'decisions', + columns: ['decision', 'reasoning'], + text: r => `${r.decision}\n\nReasoning: ${r.reasoning || 'N/A'}`, + }, + { + table: 'learnings', + columns: ['problem', 'solution'], + text: r => `${r.problem}\n\nSolution: ${r.solution || 'N/A'}`, + }, + { + table: 'messages', + columns: ['content'], + extraWhere: "t.role = 'assistant'", + text: r => String(r.content ?? ''), + }, +]; + +/** + * Canonical embedding text for a source table row. Unknown tables fall back + * to a generic content/text lookup (legacy `recall embed` behavior). + */ +export function embeddingTextFor(table: string, row: Record): string { + const source = EMBED_SOURCES.find(s => s.table === table); + if (source) return source.text(row); + return String((row.content as string) || (row.text as string) || ''); +} + +// --------------------------------------------------------------------------- +// FTS5 index health +// --------------------------------------------------------------------------- + +export type FtsStatus = 'ok' | 'drift' | 'missing-fts' | 'missing-source'; +export type FtsAction = 'none' | 'rebuild' | 'create-and-rebuild' | 'report-only'; + +export interface FtsReport { + source: string; + ftsTable: string; + sourceRows: number | null; + indexedRows: number | null; + status: FtsStatus; + action: FtsAction; + detail: string; +} + +function tableExists(db: Database, name: string): boolean { + // FTS5 virtual tables are recorded in sqlite_master with type 'table'. + return !!db.prepare( + "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?" + ).get(name); +} + +/** + * FTS sync trigger names derive from the FTS table name: messages_fts → + * messages_ai / messages_ad / messages_au (matches the schema DDL). + */ +function missingFtsTriggers(db: Database, ftsTable: string): string[] { + const prefix = ftsTable.replace(/_fts$/, ''); + const stmt = db.prepare("SELECT 1 FROM sqlite_master WHERE type = 'trigger' AND name = ?"); + return ['ai', 'ad', 'au'] + .map(suffix => `${prefix}_${suffix}`) + .filter(name => !stmt.get(name)); +} + +function countRows(db: Database, table: string): number { + return (db.prepare(`SELECT COUNT(*) AS c FROM ${table}`).get() as { c: number }).c; +} + +/** + * Rows actually present in an FTS5 index. `SELECT COUNT(*)` on an + * external-content FTS table reads through to the content table, so the + * docsize shadow table (one row per indexed document) is the truthful + * count. Returns null when it cannot be read. + */ +function countIndexedRows(db: Database, ftsTable: string): number | null { + try { + return (db.prepare(`SELECT COUNT(*) AS c FROM ${ftsTable}_docsize`).get() as { c: number }).c; + } catch { + return null; + } +} + +/** + * FTS5 external-content integrity check. The `rank` form (SQLite >= 3.42) + * validates the index against its content table and raises SQLITE_CORRUPT + * on mismatch. Any non-corruption error (e.g. the command form is + * unsupported on an older SQLite) is treated as "no evidence of drift" so + * the row-count comparison stands alone. + */ +function ftsIndexConsistent(db: Database, ftsTable: string): boolean { + try { + db.prepare(`INSERT INTO ${ftsTable}(${ftsTable}, rank) VALUES('integrity-check', 1)`).run(); + return true; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return !/malformed|corrupt/i.test(msg); + } +} + +export function checkFts(db: Database, source: string): FtsReport { + const schema = FTS_SCHEMA[source]; + if (!schema) { + throw new Error(`Unknown FTS source table: ${source}`); + } + const ftsTable = schema.ftsTable; + + if (!tableExists(db, source)) { + return { + source, ftsTable, sourceRows: null, indexedRows: null, + status: 'missing-source', action: 'report-only', + detail: `source table ${source} is missing — run 'recall init' to recreate the schema`, + }; + } + const sourceRows = countRows(db, source); + + if (!tableExists(db, ftsTable)) { + return { + source, ftsTable, sourceRows, indexedRows: null, + status: 'missing-fts', action: 'create-and-rebuild', + detail: `FTS index ${ftsTable} is missing — searches over ${source} fail or return nothing`, + }; + } + + const indexedRows = countIndexedRows(db, ftsTable); + + const lostTriggers = missingFtsTriggers(db, ftsTable); + if (lostTriggers.length > 0) { + return { + source, ftsTable, sourceRows, indexedRows, + status: 'drift', action: 'rebuild', + detail: `sync trigger(s) missing (${lostTriggers.join(', ')}) — writes to ${source} are not being indexed`, + }; + } + if (indexedRows !== null && indexedRows !== sourceRows) { + return { + source, ftsTable, sourceRows, indexedRows, + status: 'drift', action: 'rebuild', + detail: `index has ${indexedRows} row(s), source has ${sourceRows}`, + }; + } + if (!ftsIndexConsistent(db, ftsTable)) { + return { + source, ftsTable, sourceRows, indexedRows, + status: 'drift', action: 'rebuild', + detail: 'integrity-check failed (index content out of sync with source)', + }; + } + return { source, ftsTable, sourceRows, indexedRows, status: 'ok', action: 'none', detail: 'in sync' }; +} + +/** FTS health for every indexed table — shared by repair and doctor. */ +export function checkAllFts(db: Database): FtsReport[] { + return FTS_SOURCES.map(s => checkFts(db, s)); +} + +// --------------------------------------------------------------------------- +// Embedding gaps +// --------------------------------------------------------------------------- + +export interface EmbedGapReport { + table: string; + /** Rows expected to carry an embedding that have none. */ + missing: number; + /** Subset of `missing` whose source text is too short to embed. */ + tooShort: number; +} + +function embedGapQuery(config: EmbedSourceConfig): string { + const where = [ + 't.id > ?', + ...(config.extraWhere ? [config.extraWhere] : []), + 'e.id IS NULL', + // Marked duplicates are hidden from every search path; embedding them + // would index records search can never return. + notMarkedDuplicateSql(`'${config.table}'`, 't.id'), + ].join(' AND '); + return ` + SELECT t.id, ${config.columns.map(c => `t.${c}`).join(', ')} + FROM ${config.table} t + LEFT JOIN embeddings e ON e.source_table = '${config.table}' AND e.source_id = t.id + WHERE ${where} + ORDER BY t.id + LIMIT ? + `; +} + +/** Keyset-paginated walk over rows missing embeddings (fixed binds). */ +function* iterateEmbedGapRows( + db: Database, + config: EmbedSourceConfig, + batchSize: number = SQLITE_SAFE_CHUNK_SIZE +): Generator> { + const stmt = db.prepare(embedGapQuery(config)); + let lastId = 0; + for (;;) { + const batch = stmt.all(lastId, batchSize) as Array>; + if (batch.length === 0) return; + for (const row of batch) yield row; + lastId = batch[batch.length - 1].id as number; + } +} + +/** + * "Enough source text to embed" is measured on the raw column values, not + * the composed embedding text — boilerplate like "Reasoning: N/A" must not + * make an empty record look embeddable. + */ +function sourceTextLength(config: EmbedSourceConfig, row: Record): number { + return config.columns + .map(c => row[c]) + .filter(v => v !== null && v !== undefined && String(v).length > 0) + .join('\n') + .trim() + .length; +} + +export function countEmbedGaps(db: Database, config: EmbedSourceConfig): EmbedGapReport { + let missing = 0; + let tooShort = 0; + for (const row of iterateEmbedGapRows(db, config)) { + missing++; + if (sourceTextLength(config, row) < MIN_EMBED_TEXT_LENGTH) tooShort++; + } + return { table: config.table, missing, tooShort }; +} + +// --------------------------------------------------------------------------- +// Orphan / invariant checks — report-only, never repaired automatically +// --------------------------------------------------------------------------- + +export interface OrphanReport { + /** Stable check id — named in code and tests. */ + check: string; + description: string; + count: number; + /** Up to 5 sample references for triage. */ + sample: string[]; + /** Set when the check itself could not run (e.g. table missing). */ + error?: string; +} + +const SAMPLE_LIMIT = 5; + +interface OrphanCheckDef { + check: string; + description: string; + countSql: string; + sampleSql: string; + sampleLabel: (row: Record) => string; +} + +function orphanCheckDefs(): OrphanCheckDef[] { + const defs: OrphanCheckDef[] = []; + + // Embeddings whose source row no longer exists. + for (const table of FTS_SOURCES) { + const where = `e.source_table = '${table}' + AND NOT EXISTS (SELECT 1 FROM ${table} t WHERE t.id = e.source_id)`; + defs.push({ + check: `orphaned-embeddings:${table}`, + description: `embeddings rows pointing at deleted ${table} rows`, + countSql: `SELECT COUNT(*) AS c FROM embeddings e WHERE ${where}`, + sampleSql: `SELECT e.source_id AS id FROM embeddings e WHERE ${where} ORDER BY e.source_id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `${table}#${r.id}`, + }); + } + + // Embeddings with a source_table value no known table resolves. + const knownList = FTS_SOURCES.map(t => `'${t}'`).join(', '); + defs.push({ + check: 'unknown-embedding-source', + description: 'embeddings rows with an unrecognized source_table', + countSql: `SELECT COUNT(*) AS c FROM embeddings WHERE source_table NOT IN (${knownList})`, + sampleSql: `SELECT source_table, source_id FROM embeddings WHERE source_table NOT IN (${knownList}) ORDER BY id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `${r.source_table}#${r.source_id}`, + }); + + // Actively marked dedup lineage whose duplicate or survivor row is gone. + // 'marked' means hidden-but-intact, so both sides must still exist. + for (const side of ['duplicate', 'survivor'] as const) { + for (const table of PROVENANCE_TABLES) { + const where = `dl.status = 'marked' AND dl.${side}_table = '${table}' + AND NOT EXISTS (SELECT 1 FROM ${table} t WHERE t.id = dl.${side}_id)`; + defs.push({ + check: `lineage-missing-${side}:${table}`, + description: `dedup_lineage 'marked' rows whose ${side} ${table} row no longer exists`, + countSql: `SELECT COUNT(*) AS c FROM dedup_lineage dl WHERE ${where}`, + sampleSql: `SELECT dl.id FROM dedup_lineage dl WHERE ${where} ORDER BY dl.id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `dedup_lineage#${r.id}`, + }); + } + } + + // Messages referencing a session that does not exist. + defs.push({ + check: 'messages-without-session', + description: 'messages whose session_id has no sessions row', + countSql: `SELECT COUNT(*) AS c FROM messages m + WHERE NOT EXISTS (SELECT 1 FROM sessions s WHERE s.session_id = m.session_id)`, + sampleSql: `SELECT m.id FROM messages m + WHERE NOT EXISTS (SELECT 1 FROM sessions s WHERE s.session_id = m.session_id) + ORDER BY m.id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `messages#${r.id}`, + }); + + // LoA entries whose message range points at deleted messages. + const loaRangeWhere = `(l.message_range_start IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM messages m WHERE m.id = l.message_range_start)) + OR (l.message_range_end IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM messages m WHERE m.id = l.message_range_end))`; + defs.push({ + check: 'loa-broken-message-range', + description: 'loa_entries whose message_range_start/end points at a missing message', + countSql: `SELECT COUNT(*) AS c FROM loa_entries l WHERE ${loaRangeWhere}`, + sampleSql: `SELECT l.id FROM loa_entries l WHERE ${loaRangeWhere} ORDER BY l.id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `loa_entries#${r.id}`, + }); + + // LoA entries whose parent link is broken. + const loaParentWhere = `l.parent_loa_id IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM loa_entries p WHERE p.id = l.parent_loa_id)`; + defs.push({ + check: 'loa-broken-parent', + description: 'loa_entries whose parent_loa_id points at a missing entry', + countSql: `SELECT COUNT(*) AS c FROM loa_entries l WHERE ${loaParentWhere}`, + sampleSql: `SELECT l.id FROM loa_entries l WHERE ${loaParentWhere} ORDER BY l.id LIMIT ${SAMPLE_LIMIT}`, + sampleLabel: r => `loa_entries#${r.id}`, + }); + + return defs; +} + +export function checkOrphans(db: Database): OrphanReport[] { + const reports: OrphanReport[] = []; + for (const def of orphanCheckDefs()) { + try { + const count = (db.prepare(def.countSql).get() as { c: number }).c; + if (count === 0) continue; + const rows = db.prepare(def.sampleSql).all() as Array>; + reports.push({ + check: def.check, + description: def.description, + count, + sample: rows.map(def.sampleLabel), + }); + } catch (err) { + reports.push({ + check: def.check, + description: def.description, + count: 0, + sample: [], + error: err instanceof Error ? err.message : String(err), + }); + } + } + return reports; +} + +// --------------------------------------------------------------------------- +// Schema/migration state — report-only +// --------------------------------------------------------------------------- + +export interface MigrationReport { + current: number; + target: number; + pending: number; +} + +export function checkMigrations(db: Database): MigrationReport { + const current = getMigrationVersion(db); + const target = MIGRATIONS.length; + return { current, target, pending: Math.max(0, target - current) }; +} + +// --------------------------------------------------------------------------- +// Plan and apply +// --------------------------------------------------------------------------- + +export interface RepairPlan { + fts: FtsReport[]; + embedGaps: EmbedGapReport[]; + orphans: OrphanReport[]; + migrations: MigrationReport; +} + +export interface PlanRepairOptions { + /** Restrict the FTS and embedding passes to one source table. */ + table?: string; + /** Set false to skip the embedding pass entirely (--no-embed). */ + embed?: boolean; +} + +export function planRepair(db: Database, options: PlanRepairOptions = {}): RepairPlan { + const ftsSources = options.table ? [options.table] : FTS_SOURCES; + const fts = ftsSources.map(s => checkFts(db, s)); + + const missingSources = new Set(fts.filter(f => f.status === 'missing-source').map(f => f.source)); + const embedConfigs = (options.embed ?? true) + ? EMBED_SOURCES.filter(c => + (!options.table || c.table === options.table) && + !missingSources.has(c.table) && + tableExists(db, c.table) && + tableExists(db, 'embeddings')) + : []; + const embedGaps = embedConfigs.map(c => countEmbedGaps(db, c)); + + return { + fts, + embedGaps, + orphans: checkOrphans(db), + migrations: checkMigrations(db), + }; +} + +export interface FtsRepairResult { + /** FTS tables recreated from the canonical schema DDL before rebuilding. */ + created: string[]; + rebuilt: string[]; + failed: Array<{ ftsTable: string; error: string }>; +} + +/** + * Apply the planned FTS repairs: recreate missing indexes from the canonical + * per-table DDL (idempotent IF NOT EXISTS), then rebuild each planned index + * from its source table via the FTS5 'rebuild' command. + */ +export function applyFtsRepair(db: Database, plan: RepairPlan): FtsRepairResult { + const result: FtsRepairResult = { created: [], rebuilt: [], failed: [] }; + for (const report of plan.fts) { + if (report.action !== 'rebuild' && report.action !== 'create-and-rebuild') continue; + const schema = FTS_SCHEMA[report.source]; + try { + // The canonical DDL is idempotent (IF NOT EXISTS): recreating covers a + // missing virtual table and any missing sync triggers in one pass. + db.exec(schema.createTable); + db.exec(schema.createTriggers); + if (report.action === 'create-and-rebuild') { + result.created.push(report.ftsTable); + } + db.prepare(`INSERT INTO ${report.ftsTable}(${report.ftsTable}) VALUES('rebuild')`).run(); + result.rebuilt.push(report.ftsTable); + } catch (err) { + result.failed.push({ + ftsTable: report.ftsTable, + error: err instanceof Error ? err.message : String(err), + }); + } + } + return result; +} + +export interface EmbedRepairResult { + embedded: number; + /** Rows skipped because their source text is too short to embed. */ + skippedTooShort: number; + /** Per-row embedding failures — reported, never hidden. */ + failed: Array<{ table: string; id: number; error: string }>; +} + +export type EmbedFn = (text: string) => Promise; + +/** + * Re-embed rows missing embeddings. `embedFn` is injected so the command can + * wire the real Ollama client while tests run deterministically offline. + * Failed rows stay missing and are retried on the next run. + */ +export async function applyEmbedRepair( + db: Database, + plan: RepairPlan, + embedFn: EmbedFn, + onProgress?: (table: string, done: number, total: number) => void +): Promise { + const insert = db.prepare(` + INSERT OR REPLACE INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES (?, ?, ?, ?, ?) + `); + const result: EmbedRepairResult = { embedded: 0, skippedTooShort: 0, failed: [] }; + + for (const gap of plan.embedGaps) { + if (gap.missing === 0) continue; + const config = EMBED_SOURCES.find(c => c.table === gap.table); + if (!config) continue; + let done = 0; + for (const row of iterateEmbedGapRows(db, config)) { + done++; + if (sourceTextLength(config, row) < MIN_EMBED_TEXT_LENGTH) { + result.skippedTooShort++; + continue; + } + try { + const res = await embedFn(config.text(row).trim()); + insert.run(config.table, row.id as number, res.model, res.dimensions, embeddingToBlob(res.embedding)); + result.embedded++; + } catch (err) { + result.failed.push({ + table: config.table, + id: row.id as number, + error: err instanceof Error ? err.message : String(err), + }); + } + onProgress?.(gap.table, done, gap.missing); + } + } + return result; +} diff --git a/tests/commands/repair.test.ts b/tests/commands/repair.test.ts new file mode 100644 index 0000000..dd77342 --- /dev/null +++ b/tests/commands/repair.test.ts @@ -0,0 +1,337 @@ +// recall repair — issue #46 acceptance criteria. +// +// Behavior under test: +// - dry-run by default: reports the plan, writes nothing +// - --execute rebuilds FTS5 indexes from source tables (drift + missing +// index recreated from canonical DDL) and re-embeds missing embeddings +// - planned/applied actions reported with counts by table and repair type +// - embedding service unavailable: missing embeddings reported, command +// still exits successfully (diagnostic path) +// - partial embedding repair reports skipped rows and failures +// - orphan/invariant problems are report-only — never auto-repaired +// - repair never hard-deletes rows, never changes Record Provenance +// - --table scopes the run; invalid --table is rejected +// - doctor recommends repair but doctor --fix never runs data repair + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { setupTestDb, teardownTestDb } from '../helpers/setup'; +import { getDb } from '../../src/db/connection'; +import { runRepair, type RepairDeps } from '../../src/commands/repair'; +import { checkFtsIndexes } from '../../src/commands/doctor'; +import { checkFts } from '../../src/lib/repair'; +import { embeddingToBlob, type EmbeddingResult } from '../../src/lib/embeddings'; +import { + addBreadcrumb, + addDecision, + addLearning, + addMessage, + createLoaEntry, + createSession, + search, +} from '../../src/lib/memory'; + +const originalLog = console.log; +const originalError = console.error; +const originalExitCode = process.exitCode; + +beforeEach(() => { + setupTestDb(); + console.log = () => {}; + console.error = () => {}; +}); + +afterEach(() => { + console.log = originalLog; + console.error = originalError; + process.exitCode = originalExitCode; + teardownTestDb(); +}); + +const CRUMB = 'Always use bun for every script in this repository, never npm.'; + +const okEmbed = async (): Promise => + ({ embedding: [1, 0, 0], model: 'test', dimensions: 3 }); + +const upDeps: RepairDeps = { + checkService: async () => ({ available: true, model: 'test', url: 'http://test' }), + embedFn: okEmbed, +}; + +const downDeps: RepairDeps = { + checkService: async () => ({ available: false, model: 'test', url: 'http://test' }), + embedFn: async () => { throw new Error('service is down — embedFn must never be called'); }, +}; + +function wipeIndex(ftsTable: string): void { + getDb().prepare(`INSERT INTO ${ftsTable}(${ftsTable}) VALUES('delete-all')`).run(); +} + +function dropTriggers(prefix: string): void { + const db = getDb(); + for (const suffix of ['ai', 'ad', 'au']) { + db.exec(`DROP TRIGGER IF EXISTS ${prefix}_${suffix}`); + } +} + +function embeddingsCount(): number { + return (getDb().prepare('SELECT COUNT(*) AS c FROM embeddings').get() as { c: number }).c; +} + +describe('dry-run vs execute', () => { + test('dry-run reports the plan and writes nothing', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + wipeIndex('breadcrumbs_fts'); + + const result = (await runRepair({}, upDeps))!; + + // The plan reports the drift and the embedding gap... + const crumbs = result.plan.fts.find(f => f.source === 'breadcrumbs')!; + expect(crumbs.status).toBe('drift'); + expect(crumbs.action).toBe('rebuild'); + const decisionGap = result.plan.embedGaps.find(g => g.table === 'decisions')!; + expect(decisionGap.missing).toBe(1); + + // ...but nothing was repaired. + expect(result.fts).toBeNull(); + expect(result.embeddings).toBeNull(); + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('drift'); + expect(embeddingsCount()).toBe(0); + expect(search('bun').length).toBe(0); // index still broken + }); + + test('--execute rebuilds a drifted index and search works again', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + wipeIndex('breadcrumbs_fts'); + expect(search('bun').length).toBe(0); + + const result = (await runRepair({ execute: true, embed: false }))!; + expect(result.fts?.rebuilt).toContain('breadcrumbs_fts'); + expect(result.fts?.failed.length).toBe(0); + expect(search('bun').length).toBe(1); + expect(process.exitCode).not.toBe(1); + }); + + test('--execute recreates a missing index (un-migrated DB gap) and restores sync triggers', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + dropTriggers('breadcrumbs'); + getDb().exec('DROP TABLE breadcrumbs_fts'); + + const result = (await runRepair({ execute: true, embed: false }))!; + expect(result.fts?.created).toContain('breadcrumbs_fts'); + expect(result.fts?.rebuilt).toContain('breadcrumbs_fts'); + + // Old rows searchable again, and new writes are indexed via the + // recreated triggers. + expect(search('bun').length).toBe(1); + addBreadcrumb({ content: 'Another breadcrumb about bun written after repair.', importance: 5 }); + expect(search('bun').length).toBe(2); + }); + + test('reports planned and applied actions with counts by table and repair type', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + wipeIndex('breadcrumbs_fts'); + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + + const result = (await runRepair({ execute: true }, upDeps))!; + + // Per-table FTS reports carry status, action, and row counts. + for (const report of result.plan.fts) { + expect(typeof report.status).toBe('string'); + expect(typeof report.action).toBe('string'); + expect(report.sourceRows).not.toBeNull(); + } + // Per-table embedding gaps + applied counts by repair type. + expect(result.plan.embedGaps.find(g => g.table === 'decisions')?.missing).toBe(1); + expect(result.fts?.rebuilt).toEqual(['breadcrumbs_fts']); + expect(result.embeddings?.embedded).toBe(1); + }); +}); + +describe('embedding repair', () => { + test('embedding service unavailable: reports missing embeddings, exits successfully', async () => { + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + + const result = (await runRepair({ execute: true }, downDeps))!; + expect(result.embedSkipped).toContain('unavailable'); + expect(result.embeddings).toBeNull(); + expect(embeddingsCount()).toBe(0); + // Diagnostic path: not a failure. + expect(process.exitCode).not.toBe(1); + }); + + test('embedding service unavailable does not mask an FTS repair that succeeded', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + wipeIndex('breadcrumbs_fts'); + + const result = (await runRepair({ execute: true }, downDeps))!; + expect(result.fts?.rebuilt).toContain('breadcrumbs_fts'); + expect(search('bun').length).toBe(1); + expect(process.exitCode).not.toBe(1); + }); + + test('partial success: failures and skipped rows are reported, successes persist', async () => { + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + addDecision({ decision: 'POISON row that the embedding service rejects every time.', status: 'active' }); + addDecision({ decision: 'no', status: 'active' }); // too short to embed + + const flakyDeps: RepairDeps = { + ...upDeps, + embedFn: async (text) => { + if (text.includes('POISON')) throw new Error('boom'); + return okEmbed(); + }, + }; + + const result = (await runRepair({ execute: true }, flakyDeps))!; + expect(result.embeddings?.embedded).toBe(1); + expect(result.embeddings?.skippedTooShort).toBe(1); + expect(result.embeddings?.failed.length).toBe(1); + expect(result.embeddings?.failed[0].error).toContain('boom'); + expect(embeddingsCount()).toBe(1); + }); + + test('--no-embed skips the embedding pass entirely', async () => { + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + + const result = (await runRepair({ execute: true, embed: false }))!; + expect(result.embedSkipped).toContain('--no-embed'); + expect(result.plan.embedGaps.length).toBe(0); + expect(embeddingsCount()).toBe(0); + }); + + test('covers loa_entries, learnings, and assistant messages', async () => { + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'assistant', content: CRUMB }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:02Z', role: 'user', content: CRUMB }); + addLearning({ problem: 'FTS index drifted after manual surgery on the database.', solution: 'Run recall repair.' }); + createLoaEntry({ title: 'Entry', fabric_extract: 'A fabric extract long enough to embed.' }); + + const result = (await runRepair({ execute: true }, upDeps))!; + expect(result.embeddings?.embedded).toBe(3); // assistant message + learning + loa + const tables = getDb().prepare('SELECT DISTINCT source_table FROM embeddings ORDER BY source_table').all() as Array<{ source_table: string }>; + expect(tables.map(t => t.source_table)).toEqual(['learnings', 'loa_entries', 'messages']); + }); +}); + +describe('safety invariants', () => { + test('repair never hard-deletes rows', async () => { + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'assistant', content: CRUMB }); + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + addLearning({ problem: 'A problem statement long enough to be embedded.', solution: 'A fix.' }); + addBreadcrumb({ content: CRUMB, importance: 5 }); + createLoaEntry({ title: 'Entry', fabric_extract: 'A fabric extract long enough to embed.' }); + const db = getDb(); + db.prepare(`INSERT INTO telos (code, type, title, content) VALUES ('G0', 'goal', 'Goal', 'Telos content')`).run(); + db.prepare(`INSERT INTO documents (path, title, type, content) VALUES ('/tmp/d.md', 'Doc', 'reference', 'Document body')`).run(); + // Orphaned embedding — reported, must never be deleted. + db.prepare( + `INSERT INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES ('decisions', 9999, 'test', 3, ?)` + ).run(embeddingToBlob([1, 0, 0])); + wipeIndex('messages_fts'); + + const tables = ['sessions', 'messages', 'decisions', 'learnings', 'breadcrumbs', 'loa_entries', 'telos', 'documents', 'dedup_lineage']; + const before = tables.map(t => (db.prepare(`SELECT COUNT(*) AS c FROM ${t}`).get() as { c: number }).c); + const embeddingsBefore = embeddingsCount(); + + await runRepair({ execute: true }, upDeps); + + const after = tables.map(t => (db.prepare(`SELECT COUNT(*) AS c FROM ${t}`).get() as { c: number }).c); + expect(after).toEqual(before); + // Embeddings may only grow (re-embed), never shrink. + expect(embeddingsCount()).toBeGreaterThanOrEqual(embeddingsBefore); + }); + + test('repair never changes Record Provenance', async () => { + addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored' }); + addBreadcrumb({ content: `${CRUMB} (second)`, importance: 5, provenance: 'extracted' }); + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active', provenance: 'verbatim' }); + addLearning({ problem: 'A problem statement long enough to be embedded.', solution: 'A fix.' }); // NULL provenance + wipeIndex('breadcrumbs_fts'); + wipeIndex('decisions_fts'); + + const db = getDb(); + const snapshot = () => ({ + breadcrumbs: db.prepare('SELECT id, provenance FROM breadcrumbs ORDER BY id').all(), + decisions: db.prepare('SELECT id, provenance FROM decisions ORDER BY id').all(), + learnings: db.prepare('SELECT id, provenance FROM learnings ORDER BY id').all(), + }); + const before = snapshot(); + + await runRepair({ execute: true }, upDeps); + + expect(snapshot()).toEqual(before); + }); + + test('orphan problems are reported but never auto-repaired', async () => { + getDb().prepare( + `INSERT INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES ('decisions', 9999, 'test', 3, ?)` + ).run(embeddingToBlob([1, 0, 0])); + + const result = (await runRepair({ execute: true }, upDeps))!; + const orphan = result.plan.orphans.find(r => r.check === 'orphaned-embeddings:decisions'); + expect(orphan?.count).toBe(1); + + // Still there after execute — repair only reports it. + const remaining = getDb().prepare( + `SELECT COUNT(*) AS c FROM embeddings WHERE source_table = 'decisions' AND source_id = 9999` + ).get() as { c: number }; + expect(remaining.c).toBe(1); + }); + + test('an un-migrated database is reported with a recall init recommendation', async () => { + getDb().exec('PRAGMA user_version = 5'); + const result = (await runRepair({}, upDeps))!; + expect(result.plan.migrations.pending).toBeGreaterThan(0); + }); +}); + +describe('scoping and validation', () => { + test('--table scopes the FTS and embedding passes', async () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'assistant', content: CRUMB }); + wipeIndex('breadcrumbs_fts'); + wipeIndex('messages_fts'); + + const result = (await runRepair({ execute: true, table: 'breadcrumbs' }, upDeps))!; + expect(result.plan.fts.length).toBe(1); + expect(result.plan.fts[0].source).toBe('breadcrumbs'); + expect(result.plan.embedGaps.length).toBe(0); // breadcrumbs carry no embeddings + + // The scoped table is repaired; the out-of-scope one is untouched. + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('ok'); + expect(checkFts(getDb(), 'messages').status).toBe('drift'); + }); + + test('invalid --table is rejected', async () => { + expect(await runRepair({ table: 'sessions' })).toBeUndefined(); + expect(process.exitCode).toBe(1); + }); +}); + +describe('doctor separation (issue #46)', () => { + test('doctor detects drift and recommends recall repair without touching data', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + wipeIndex('breadcrumbs_fts'); + + const check = checkFtsIndexes(); + expect(check.status).toBe('WARN'); + expect(check.message).toContain('recall repair'); + + // The check is read-only: the drift is still there afterwards. Doctor's + // --fix loop only iterates symlink probes, so data repair can never run + // implicitly — repairing requires an explicit `recall repair --execute`. + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('drift'); + }); + + test('doctor reports PASS when indexes are in sync', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + const check = checkFtsIndexes(); + expect(check.status).toBe('PASS'); + }); +}); diff --git a/tests/lib/repair.test.ts b/tests/lib/repair.test.ts new file mode 100644 index 0000000..37ce3ee --- /dev/null +++ b/tests/lib/repair.test.ts @@ -0,0 +1,372 @@ +// src/lib/repair.ts — unit coverage (issue #46). +// +// Behavior under test: +// - FTS health detection: ok / drift (count mismatch, missing triggers, +// stale content) / missing-fts / missing-source +// - applyFtsRepair recreates missing indexes + triggers from canonical DDL +// and rebuilds from source tables +// - embedding gap detection: missing rows, too-short eligibility, assistant +// role filter, marked-duplicate exclusion +// - applyEmbedRepair: injectable embed fn, partial failure reporting +// - orphan/invariant checks are named and report-only +// - migration state report + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { setupTestDb, teardownTestDb } from '../helpers/setup'; +import { getDb } from '../../src/db/connection'; +import { MIGRATIONS } from '../../src/db/migrations'; +import { FTS_SCHEMA } from '../../src/db/schema'; +import { embeddingToBlob, type EmbeddingResult } from '../../src/lib/embeddings'; +import { + addBreadcrumb, + addDecision, + addMessage, + createLoaEntry, + createSession, +} from '../../src/lib/memory'; +import { + applyEmbedRepair, + applyFtsRepair, + checkAllFts, + checkFts, + checkMigrations, + checkOrphans, + countEmbedGaps, + embeddingTextFor, + EMBED_SOURCES, + FTS_SOURCES, + planRepair, +} from '../../src/lib/repair'; + +beforeEach(() => { + setupTestDb(); +}); + +afterEach(() => { + teardownTestDb(); +}); + +const CRUMB = 'Always use bun for every script in this repository, never npm.'; + +const okEmbed = async (): Promise => + ({ embedding: [1, 0, 0], model: 'test', dimensions: 3 }); + +function wipeIndex(ftsTable: string): void { + getDb().prepare(`INSERT INTO ${ftsTable}(${ftsTable}) VALUES('delete-all')`).run(); +} + +function dropTriggers(prefix: string): void { + const db = getDb(); + for (const suffix of ['ai', 'ad', 'au']) { + db.exec(`DROP TRIGGER IF EXISTS ${prefix}_${suffix}`); + } +} + +function decisionConfig() { + return EMBED_SOURCES.find(c => c.table === 'decisions')!; +} + +describe('FTS health detection', () => { + test('a fresh database reports every index ok', () => { + const reports = checkAllFts(getDb()); + expect(reports.length).toBe(FTS_SOURCES.length); + for (const report of reports) { + expect(report.status).toBe('ok'); + expect(report.action).toBe('none'); + } + }); + + test('row-count mismatch is drift → rebuild', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + wipeIndex('breadcrumbs_fts'); + + const report = checkFts(getDb(), 'breadcrumbs'); + expect(report.status).toBe('drift'); + expect(report.action).toBe('rebuild'); + expect(report.sourceRows).toBe(1); + expect(report.indexedRows).toBe(0); + }); + + test('missing sync triggers are drift even when counts match', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + dropTriggers('breadcrumbs'); + + const report = checkFts(getDb(), 'breadcrumbs'); + expect(report.status).toBe('drift'); + expect(report.detail).toContain('trigger'); + }); + + test('missing FTS table → create-and-rebuild', () => { + dropTriggers('breadcrumbs'); + getDb().exec('DROP TABLE breadcrumbs_fts'); + + const report = checkFts(getDb(), 'breadcrumbs'); + expect(report.status).toBe('missing-fts'); + expect(report.action).toBe('create-and-rebuild'); + }); + + test('missing source table is report-only', () => { + const db = getDb(); + dropTriggers('telos'); + db.exec('DROP TABLE telos_fts'); + db.exec('DROP TABLE telos'); + + const report = checkFts(db, 'telos'); + expect(report.status).toBe('missing-source'); + expect(report.action).toBe('report-only'); + expect(report.detail).toContain('recall init'); + }); + + test('stale index content with matching counts is detected via integrity-check', () => { + const version = (getDb().prepare('SELECT sqlite_version() AS v').get() as { v: string }).v; + const [major, minor] = version.split('.').map(Number); + if (major < 3 || (major === 3 && minor < 42)) { + // External-content integrity-check needs SQLite >= 3.42; on older + // builds detection degrades to count comparison by design. + return; + } + + const id = addBreadcrumb({ content: CRUMB, importance: 5 }); + dropTriggers('breadcrumbs'); + // Trigger-bypassing update: counts still match, index content is stale. + getDb().prepare('UPDATE breadcrumbs SET content = ? WHERE id = ?') + .run('Completely different content after the triggers were lost.', id); + // Restore triggers so only the stale content (not trigger absence) drives + // the verdict. + getDb().exec(FTS_SCHEMA.breadcrumbs.createTriggers); + + const report = checkFts(getDb(), 'breadcrumbs'); + expect(report.status).toBe('drift'); + }); +}); + +describe('applyFtsRepair', () => { + test('rebuilds a wiped index from the source table', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + wipeIndex('breadcrumbs_fts'); + + const plan = planRepair(getDb(), { embed: false }); + const result = applyFtsRepair(getDb(), plan); + expect(result.rebuilt).toContain('breadcrumbs_fts'); + expect(result.failed.length).toBe(0); + + const indexed = getDb().prepare('SELECT COUNT(*) AS c FROM breadcrumbs_fts').get() as { c: number }; + expect(indexed.c).toBe(1); + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('ok'); + }); + + test('recreates a missing index and its triggers, then rebuilds', () => { + addBreadcrumb({ content: CRUMB, importance: 5 }); + dropTriggers('breadcrumbs'); + getDb().exec('DROP TABLE breadcrumbs_fts'); + + const plan = planRepair(getDb(), { embed: false }); + const result = applyFtsRepair(getDb(), plan); + expect(result.created).toContain('breadcrumbs_fts'); + expect(result.rebuilt).toContain('breadcrumbs_fts'); + + // Pre-existing rows are searchable again and new writes are indexed + // (triggers restored). + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('ok'); + addBreadcrumb({ content: 'A brand new breadcrumb written after the repair ran.', importance: 5 }); + expect(checkFts(getDb(), 'breadcrumbs').status).toBe('ok'); + const indexed = getDb().prepare('SELECT COUNT(*) AS c FROM breadcrumbs_fts').get() as { c: number }; + expect(indexed.c).toBe(2); + }); + + test('never touches source tables', () => { + addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored' }); + wipeIndex('breadcrumbs_fts'); + const before = getDb().prepare('SELECT * FROM breadcrumbs ORDER BY id').all(); + + const plan = planRepair(getDb(), { embed: false }); + applyFtsRepair(getDb(), plan); + + const after = getDb().prepare('SELECT * FROM breadcrumbs ORDER BY id').all(); + expect(after).toEqual(before); + }); +}); + +describe('embedding gap detection', () => { + test('counts rows missing embeddings and flags too-short text', () => { + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + addDecision({ decision: 'no', status: 'active' }); // too short to embed + + const report = countEmbedGaps(getDb(), decisionConfig()); + expect(report.missing).toBe(2); + expect(report.tooShort).toBe(1); + }); + + test('already-embedded rows are not gaps', () => { + const id = addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + getDb().prepare( + `INSERT INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES ('decisions', ?, 'test', 3, ?)` + ).run(id, embeddingToBlob([1, 0, 0])); + + const report = countEmbedGaps(getDb(), decisionConfig()); + expect(report.missing).toBe(0); + }); + + test('only assistant messages are embedding candidates', () => { + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB }); + addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:02Z', role: 'assistant', content: CRUMB }); + + const config = EMBED_SOURCES.find(c => c.table === 'messages')!; + const report = countEmbedGaps(getDb(), config); + expect(report.missing).toBe(1); + }); + + test('marked duplicates are excluded from gaps', () => { + const survivor = addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + const duplicate = addDecision({ decision: 'Adopt SQLite WAL mode for every DB connection always.', status: 'active' }); + getDb().prepare(` + INSERT INTO dedup_lineage + (survivor_table, survivor_id, duplicate_table, duplicate_id, reason, similarity, status, detail) + VALUES ('decisions', ?, 'decisions', ?, 'semantic', 0.99, 'marked', '{}') + `).run(survivor, duplicate); + + const report = countEmbedGaps(getDb(), decisionConfig()); + expect(report.missing).toBe(1); + }); +}); + +describe('applyEmbedRepair', () => { + test('embeds missing rows via the injected embed fn', async () => { + const id = addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + + const plan = planRepair(getDb()); + const result = await applyEmbedRepair(getDb(), plan, okEmbed); + expect(result.embedded).toBe(1); + expect(result.failed.length).toBe(0); + + const row = getDb().prepare( + `SELECT model, dimensions FROM embeddings WHERE source_table = 'decisions' AND source_id = ?` + ).get(id) as { model: string; dimensions: number }; + expect(row.model).toBe('test'); + expect(row.dimensions).toBe(3); + }); + + test('partial failure: failures are reported, successes persist, nothing is hidden', async () => { + addDecision({ decision: 'Adopt SQLite WAL mode for every database connection.', status: 'active' }); + addDecision({ decision: 'POISON row that the embedding service rejects every time.', status: 'active' }); + addDecision({ decision: 'no', status: 'active' }); // too short — skipped + + const flaky = async (text: string): Promise => { + if (text.includes('POISON')) throw new Error('boom'); + return okEmbed(); + }; + + const plan = planRepair(getDb()); + const result = await applyEmbedRepair(getDb(), plan, flaky); + expect(result.embedded).toBe(1); + expect(result.skippedTooShort).toBe(1); + expect(result.failed.length).toBe(1); + expect(result.failed[0].table).toBe('decisions'); + expect(result.failed[0].error).toContain('boom'); + + const count = getDb().prepare('SELECT COUNT(*) AS c FROM embeddings').get() as { c: number }; + expect(count.c).toBe(1); + }); +}); + +describe('embeddingTextFor', () => { + test('matches the per-table composition rules', () => { + expect(embeddingTextFor('loa_entries', { title: 'T', fabric_extract: 'F' })).toBe('T\n\nF'); + expect(embeddingTextFor('decisions', { decision: 'D', reasoning: null })).toBe('D\n\nReasoning: N/A'); + expect(embeddingTextFor('learnings', { problem: 'P', solution: 'S' })).toBe('P\n\nSolution: S'); + expect(embeddingTextFor('messages', { content: 'C' })).toBe('C'); + expect(embeddingTextFor('unknown_table', { content: 'fallback' })).toBe('fallback'); + }); +}); + +describe('orphan/invariant checks (report-only)', () => { + test('clean database reports nothing', () => { + expect(checkOrphans(getDb())).toEqual([]); + }); + + test('orphaned embeddings are reported with samples', () => { + getDb().prepare( + `INSERT INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES ('decisions', 9999, 'test', 3, ?)` + ).run(embeddingToBlob([1, 0, 0])); + + const reports = checkOrphans(getDb()); + const orphan = reports.find(r => r.check === 'orphaned-embeddings:decisions'); + expect(orphan).toBeDefined(); + expect(orphan!.count).toBe(1); + expect(orphan!.sample).toEqual(['decisions#9999']); + }); + + test('unknown embedding source tables are reported', () => { + getDb().prepare( + `INSERT INTO embeddings (source_table, source_id, model, dimensions, embedding) + VALUES ('bogus', 1, 'test', 3, ?)` + ).run(embeddingToBlob([1, 0, 0])); + + const reports = checkOrphans(getDb()); + const orphan = reports.find(r => r.check === 'unknown-embedding-source'); + expect(orphan?.count).toBe(1); + expect(orphan?.sample).toEqual(['bogus#1']); + }); + + test('marked lineage with a missing duplicate row is reported', () => { + const survivor = addBreadcrumb({ content: CRUMB, importance: 5 }); + getDb().prepare(` + INSERT INTO dedup_lineage + (survivor_table, survivor_id, duplicate_table, duplicate_id, reason, similarity, status, detail) + VALUES ('breadcrumbs', ?, 'breadcrumbs', 4242, 'exact', 1.0, 'marked', '{}') + `).run(survivor); + + const reports = checkOrphans(getDb()); + const orphan = reports.find(r => r.check === 'lineage-missing-duplicate:breadcrumbs'); + expect(orphan?.count).toBe(1); + }); + + test('messages without a session and broken LoA links are reported', () => { + const db = getDb(); + db.exec('PRAGMA foreign_keys = OFF'); + db.prepare(` + INSERT INTO messages (session_id, timestamp, role, content) + VALUES ('ghost-session', '2026-01-01T00:00:00Z', 'user', 'orphaned message') + `).run(); + db.prepare(` + INSERT INTO loa_entries (title, fabric_extract, message_range_start, message_range_end, parent_loa_id) + VALUES ('broken', 'body', 9991, 9992, 7777) + `).run(); + db.exec('PRAGMA foreign_keys = ON'); + + const checks = new Map(checkOrphans(db).map(r => [r.check, r])); + expect(checks.get('messages-without-session')?.count).toBe(1); + expect(checks.get('loa-broken-message-range')?.count).toBe(1); + expect(checks.get('loa-broken-parent')?.count).toBe(1); + }); + + test('a healthy LoA lineage is not reported', () => { + createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' }); + const m1 = addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB }); + const parent = createLoaEntry({ title: 'parent', fabric_extract: 'body' }); + createLoaEntry({ + title: 'child', fabric_extract: 'body', + message_range_start: m1, message_range_end: m1, parent_loa_id: parent, + }); + + expect(checkOrphans(getDb())).toEqual([]); + }); +}); + +describe('migration state', () => { + test('up-to-date database reports zero pending', () => { + const report = checkMigrations(getDb()); + expect(report.current).toBe(MIGRATIONS.length); + expect(report.pending).toBe(0); + }); + + test('an un-migrated database reports pending migrations', () => { + getDb().exec('PRAGMA user_version = 5'); + const report = checkMigrations(getDb()); + expect(report.current).toBe(5); + expect(report.pending).toBe(MIGRATIONS.length - 5); + }); +}); From 4e65c6eb46331c499f36877a9300fa920a1444ec Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Thu, 11 Jun 2026 05:12:41 -0400 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20PR=20#62=20review=20blocke?= =?UTF-8?q?rs=20=E2=80=94=20exit-code=20restore=20and=20pre-dedup=20legacy?= =?UTF-8?q?=20DB=20crash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blocker 1: restore process.exitCode with the repo's ?? 0 convention in repair.test.ts — Bun silently ignores assigning undefined, so the invalid --table test's exit code poisoned the suite (CI-red with 0 failures). Blocker 2: drop the marked-duplicates exclusion clause from the embed gap query when dedup_lineage does not exist. A pre-dedup legacy DB (the command's headline use case) crashed with a raw SQLiteError inside planRepair before any output; it now completes the plan, still reports embed gaps, and prints the 'recall init' recommendation. The legacy-DB test now builds a genuine pre-dedup schema (full DDL, dedup_lineage dropped, user_version 9) instead of faking it with PRAGMA on a current schema. --- src/lib/repair.ts | 11 ++++++--- tests/commands/repair.test.ts | 46 ++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/src/lib/repair.ts b/src/lib/repair.ts index f8109cf..cc89186 100644 --- a/src/lib/repair.ts +++ b/src/lib/repair.ts @@ -225,14 +225,17 @@ export interface EmbedGapReport { tooShort: number; } -function embedGapQuery(config: EmbedSourceConfig): string { +function embedGapQuery(config: EmbedSourceConfig, excludeMarkedDuplicates: boolean): string { const where = [ 't.id > ?', ...(config.extraWhere ? [config.extraWhere] : []), 'e.id IS NULL', // Marked duplicates are hidden from every search path; embedding them - // would index records search can never return. - notMarkedDuplicateSql(`'${config.table}'`, 't.id'), + // would index records search can never return. On a pre-dedup legacy DB + // (no dedup_lineage table yet — its DDL ships via recall init) nothing + // can be marked, so the clause is dropped instead of referencing a + // missing table and crashing the plan. + ...(excludeMarkedDuplicates ? [notMarkedDuplicateSql(`'${config.table}'`, 't.id')] : []), ].join(' AND '); return ` SELECT t.id, ${config.columns.map(c => `t.${c}`).join(', ')} @@ -250,7 +253,7 @@ function* iterateEmbedGapRows( config: EmbedSourceConfig, batchSize: number = SQLITE_SAFE_CHUNK_SIZE ): Generator> { - const stmt = db.prepare(embedGapQuery(config)); + const stmt = db.prepare(embedGapQuery(config, tableExists(db, 'dedup_lineage'))); let lastId = 0; for (;;) { const batch = stmt.all(lastId, batchSize) as Array>; diff --git a/tests/commands/repair.test.ts b/tests/commands/repair.test.ts index dd77342..3e9e72a 100644 --- a/tests/commands/repair.test.ts +++ b/tests/commands/repair.test.ts @@ -14,8 +14,17 @@ // - doctor recommends repair but doctor --fix never runs data repair import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { Database } from 'bun:sqlite'; +import { dirname, join } from 'path'; import { setupTestDb, teardownTestDb } from '../helpers/setup'; -import { getDb } from '../../src/db/connection'; +import { closeDb, getDb } from '../../src/db/connection'; +import { + CREATE_TABLES, + CREATE_INDEXES, + CREATE_FTS, + CREATE_FTS_TRIGGERS, + CREATE_VECTOR_TABLES, +} from '../../src/db/schema'; import { runRepair, type RepairDeps } from '../../src/commands/repair'; import { checkFtsIndexes } from '../../src/commands/doctor'; import { checkFts } from '../../src/lib/repair'; @@ -43,7 +52,9 @@ beforeEach(() => { afterEach(() => { console.log = originalLog; console.error = originalError; - process.exitCode = originalExitCode; + // Bun silently ignores assigning undefined to process.exitCode, so a bare + // restore would leave a poisoned exit code behind (CI-red with 0 failures). + process.exitCode = originalExitCode ?? 0; teardownTestDb(); }); @@ -283,10 +294,37 @@ describe('safety invariants', () => { expect(remaining.c).toBe(1); }); - test('an un-migrated database is reported with a recall init recommendation', async () => { - getDb().exec('PRAGMA user_version = 5'); + test('a genuine pre-dedup legacy database does not crash and reports pending migrations', async () => { + // Build the schema an install older than the dedup migration actually + // has: full DDL of its era but no dedup_lineage table (that DDL ships + // via `recall init` on current versions; getDb() runs no DDL at all). + // PRAGMA user_version on a current schema would not reproduce this. + const legacyPath = join(dirname(process.env.RECALL_DB_PATH!), 'legacy.db'); + closeDb(); + const legacy = new Database(legacyPath); + legacy.exec(CREATE_TABLES); + legacy.exec(CREATE_INDEXES); + legacy.exec(CREATE_FTS); + legacy.exec(CREATE_FTS_TRIGGERS); + legacy.exec(CREATE_VECTOR_TABLES); + legacy.exec('DROP TABLE dedup_lineage'); + legacy.exec('PRAGMA user_version = 9'); + legacy.prepare( + `INSERT INTO decisions (decision, status) VALUES ('Adopt SQLite WAL mode for every database connection.', 'active')` + ).run(); + legacy.close(); + process.env.RECALL_DB_PATH = legacyPath; + + // Plain dry-run — the path that crashed with a raw SQLiteError when the + // embed gap query referenced the missing dedup_lineage table. const result = (await runRepair({}, upDeps))!; + + // The plan completes: the recall init recommendation can actually print. expect(result.plan.migrations.pending).toBeGreaterThan(0); + // The embed pass still reports gaps — without the exclusion clause, + // since nothing can be marked as a duplicate on this schema. + expect(result.plan.embedGaps.find(g => g.table === 'decisions')?.missing).toBe(1); + expect(process.exitCode).not.toBe(1); }); });