diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index e71e74a..ee8cf08 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -62,7 +62,7 @@ Commands: connector delete Delete a connector and its snapshot connector refresh Re-create snapshot with fresh data analyze Analyze database schema, stats, and PII - doctor Check setup and diagnose issues + doctor [connector] Check setup; pass a connector name for details mcp Configure MCP server for coding agents Examples: diff --git a/packages/cli/src/commands/connect.ts b/packages/cli/src/commands/connect.ts index 9c3faf7..46e9302 100644 --- a/packages/cli/src/commands/connect.ts +++ b/packages/cli/src/commands/connect.ts @@ -286,6 +286,11 @@ export async function runConnect( console.log(` ℹ ${result.piiColumnsDetected} PII columns detected (sanitization disabled)`); } } + if (result.integrityWarningsCount > 0) { + console.log( + ` ⚠ ${result.integrityWarningsCount} referential integrity warning(s) — run \`sow doctor ${result.name}\` for details`, + ); + } console.log(); console.log(" Next step — create an isolated branch to work with:"); console.log(` $ sow branch create dev`); diff --git a/packages/cli/src/commands/doctor.ts b/packages/cli/src/commands/doctor.ts index 41042fc..cc2419e 100644 --- a/packages/cli/src/commands/doctor.ts +++ b/packages/cli/src/commands/doctor.ts @@ -2,8 +2,10 @@ import { listConnectors, listBranches, getSowDir, + getConnectorMetadata, type ConnectorInfo, type Branch, + type IntegrityWarning, } from "@sowdb/core"; import { execSync } from "node:child_process"; import { existsSync, accessSync, constants, statSync, readdirSync, readFileSync } from "node:fs"; @@ -130,6 +132,21 @@ function checkConnectors(connectors: ConnectorInfo[]): CheckResult[] { }; } const size = statSync(initSql).size; + + // If the connector has integrity warnings from sampling, surface that + // so users know to run `sow doctor ` for the full list rather + // than discovering it the hard way via a dangling FK in the sandbox. + const meta = getConnectorMetadata(c.name); + const warningCount = meta?.integrityWarnings?.length ?? 0; + if (warningCount > 0) { + return { + name: `Connector "${c.name}" has ${warningCount} integrity warning(s)`, + status: "warn" as const, + detail: `snapshot: ${formatBytes(size)}, created ${c.createdAt ? timeAgo(c.createdAt) : "unknown"}`, + hint: `Run: sow doctor ${c.name}`, + }; + } + return { name: `Connector "${c.name}" healthy`, status: "pass" as const, @@ -241,6 +258,44 @@ function getDiskUsage(): CheckResult { return { name: "Disk usage", status: "pass", detail: formatBytes(total) }; } +export interface ConnectorWarningsReport { + found: boolean; + name: string; + tables: number; + rows: number; + snapshotSize: string; + warnings: IntegrityWarning[]; +} + +/** + * Fetch a per-connector report of referential-integrity warnings. + * Used by `sow doctor ` to drill into the warnings + * that `sow connect` summarized as a count on the result line. + */ +export function describeConnectorWarnings( + connectorName: string, +): ConnectorWarningsReport { + const meta = getConnectorMetadata(connectorName); + if (!meta) { + return { + found: false, + name: connectorName, + tables: 0, + rows: 0, + snapshotSize: "0 B", + warnings: [], + }; + } + return { + found: true, + name: meta.name, + tables: meta.tables, + rows: meta.rows, + snapshotSize: formatBytes(meta.sizeBytes), + warnings: meta.integrityWarnings ?? [], + }; +} + export async function runDoctorChecks(): Promise { const connectors = listConnectors(); const branches = await listBranches(); diff --git a/packages/cli/src/commands/runner.ts b/packages/cli/src/commands/runner.ts index 046e849..ac68c3a 100644 --- a/packages/cli/src/commands/runner.ts +++ b/packages/cli/src/commands/runner.ts @@ -145,7 +145,7 @@ export async function runCommand( await runAnalyze(resolvedConn, flags, log, merged); break; case "doctor": - await runDoctor(flags); + await runDoctor(flags, connectionString); break; case "mcp": await runMcp(flags); @@ -213,8 +213,50 @@ async function runAnalyze( } } -async function runDoctor(flags: Record): Promise { - const { runDoctorChecks } = await import("./doctor.js"); +async function runDoctor( + flags: Record, + connectorName?: string, +): Promise { + const { runDoctorChecks, describeConnectorWarnings } = await import("./doctor.js"); + + // If a connector name was passed, print a detailed warnings report + // for that connector instead of the generic system checks. + if (connectorName) { + const report = describeConnectorWarnings(connectorName); + if (flags.json) { + console.log(JSON.stringify(report)); + return; + } + if (!report.found) { + console.error(` ✗ Connector "${connectorName}" not found.`); + process.exit(1); + } + console.log(); + console.log(` Connector "${connectorName}"`); + console.log(` ${report.tables} tables, ${report.rows.toLocaleString()} rows, snapshot ${report.snapshotSize}`); + console.log(); + if (report.warnings.length === 0) { + console.log(" ✓ No referential integrity warnings — every FK resolved cleanly."); + return; + } + console.log(` ⚠ ${report.warnings.length} referential integrity warning(s):`); + console.log(); + for (const w of report.warnings) { + const src = w.sourceTable + ? `${w.sourceTable}.${(w.sourceColumns ?? []).join(",")}` + : "(implicit)"; + const tgt = `${w.targetTable}${w.targetColumns ? "." + w.targetColumns.join(",") : ""}`; + console.log(` • [${w.kind}] ${src} → ${tgt}`); + console.log(` ${w.reason}`); + } + console.log(); + console.log(" These FKs may be dangling in the sandbox. Root causes vary:"); + console.log(" - `parent_not_found`: the source DB itself has orphaned rows"); + console.log(" - `*_fetch_failed`: a transient read error hit the sampler"); + console.log(" Run `sow connector refresh ` to retry sampling."); + return; + } + const checks = await runDoctorChecks(); if (flags.json) { diff --git a/packages/core/src/branching/connector.ts b/packages/core/src/branching/connector.ts index b9f66e1..ce4c3bb 100644 --- a/packages/core/src/branching/connector.ts +++ b/packages/core/src/branching/connector.ts @@ -126,6 +126,10 @@ export async function createConnector( sanitizationConfig, analysis, authUsers: providerAuthUsers, + integrityWarnings: + sampled.integrityWarnings && sampled.integrityWarnings.length > 0 + ? sampled.integrityWarnings + : undefined, }; writeConnectorMetadata(name, metadata); @@ -143,6 +147,7 @@ export async function createConnector( piiColumnsDetected: metadata.piiColumnsDetected, sizeBytes: result.totalSize, snapshotPath: snapshotDir, + integrityWarningsCount: sampled.integrityWarnings?.length ?? 0, }; } finally { await adapter.disconnect(); diff --git a/packages/core/src/branching/types.ts b/packages/core/src/branching/types.ts index 83a256a..c4ebb3d 100644 --- a/packages/core/src/branching/types.ts +++ b/packages/core/src/branching/types.ts @@ -1,4 +1,9 @@ -import type { AnalysisResult, SamplingConfig, SanitizationConfig } from "../types.js"; +import type { + AnalysisResult, + IntegrityWarning, + SamplingConfig, + SanitizationConfig, +} from "../types.js"; // --------------------------------------------------------------------------- // Branch — an isolated database managed by a provider (Docker, Supabase, etc.) @@ -50,6 +55,14 @@ export interface ConnectorMetadata { analysis: AnalysisResult; /** Auth user mappings for Supabase projects (original UUID -> sanitized email). */ authUsers?: AuthUserMapping[]; + /** + * Non-fatal referential-integrity warnings captured during the sample. + * These are surfaced by `sow doctor ` so users know which + * FK relationships couldn't be fully resolved without having to re-run + * the whole connect flow. Optional for backwards compat with metadata + * written by earlier sow versions. + */ + integrityWarnings?: IntegrityWarning[]; } export interface ConnectorInfo { @@ -77,6 +90,12 @@ export interface ConnectorCreateResult { piiColumnsDetected: number; sizeBytes: number; snapshotPath: string; + /** + * Count of non-fatal referential-integrity warnings from the sampler. + * Zero when everything resolved cleanly. When non-zero, the CLI prints + * a summary and the user can run `sow doctor ` for the full list. + */ + integrityWarningsCount: number; } // --------------------------------------------------------------------------- diff --git a/packages/core/src/sampler/index.ts b/packages/core/src/sampler/index.ts index b7ff608..4c13623 100644 --- a/packages/core/src/sampler/index.ts +++ b/packages/core/src/sampler/index.ts @@ -148,13 +148,14 @@ export function createSampler(options: SamplerOptions) { if (rows) sampledMap.set(tableName, rows); } - const integrityFixed = isFullCopy - ? sampledMap + const integrityResult = isFullCopy + ? { tables: sampledMap, warnings: [] } : await ensureReferentialIntegrity( adapter, sampledMap, analysis.schema.relationships, ); + const integrityFixed = integrityResult.tables; const sampledTables: SampledTable[] = []; for (const tableName of tablesToSample) { @@ -173,7 +174,11 @@ export function createSampler(options: SamplerOptions) { }); } - return { tables: sampledTables, config }; + return { + tables: sampledTables, + config, + integrityWarnings: integrityResult.warnings, + }; } return { sample }; diff --git a/packages/core/src/sampler/referential.test.ts b/packages/core/src/sampler/referential.test.ts index ad2cbef..6e6004b 100644 --- a/packages/core/src/sampler/referential.test.ts +++ b/packages/core/src/sampler/referential.test.ts @@ -154,7 +154,7 @@ describe("ensureReferentialIntegrity — SQL parameterization", () => { }, ]; - const result = await ensureReferentialIntegrity( + const { tables: result } = await ensureReferentialIntegrity( adapter, sampledTables, relationships, @@ -261,3 +261,401 @@ describe("ensureReferentialIntegrity — SQL parameterization", () => { expect(parentFetch!.params).toEqual([hostile]); }); }); + +// ----------------------------------------------------------------------- +// Warnings collection — Issue #3: catch blocks return warnings, not silence +// ----------------------------------------------------------------------- + +// A spy that throws on specific query patterns, for testing failure paths. +class FailingAdapter extends SpyAdapter { + failPattern: RegExp | null = null; + + async query>( + sql: string, + params?: unknown[], + ): Promise { + if (this.failPattern && this.failPattern.test(sql)) { + throw new Error("simulated DB failure: " + sql.slice(0, 40)); + } + return super.query(sql, params); + } +} + +describe("ensureReferentialIntegrity — warnings collection", () => { + it("returns an empty warnings array when everything resolves cleanly", async () => { + const adapter = new SpyAdapter(); + adapter.responses.set("users", [{ id: "u1", name: "Alice" }]); + + const sampledTables = new Map[]>([ + ["users", [{ id: "seed", name: "Seed" }]], + ["orders", [{ id: 1, user_id: "u1" }]], + ]); + const relationships: Relationship[] = [ + { + name: "orders_user_id_fkey", + sourceTable: "orders", + sourceColumns: ["user_id"], + targetTable: "users", + targetColumns: ["id"], + onDelete: "NO ACTION", + onUpdate: "NO ACTION", + }, + ]; + + const { warnings } = await ensureReferentialIntegrity( + adapter, + sampledTables, + relationships, + ); + expect(warnings).toEqual([]); + }); + + it("captures parent_fetch_failed when the adapter throws on a formal-FK lookup", async () => { + const adapter = new FailingAdapter(); + // Fail only on the parent fetch (the SELECT FROM "users") + adapter.failPattern = /FROM "users"/; + + const sampledTables = new Map[]>([ + ["users", [{ id: "seed", name: "Seed" }]], + ["orders", [{ id: 1, user_id: "u-missing" }]], + ]); + const relationships: Relationship[] = [ + { + name: "orders_user_id_fkey", + sourceTable: "orders", + sourceColumns: ["user_id"], + targetTable: "users", + targetColumns: ["id"], + onDelete: "NO ACTION", + onUpdate: "NO ACTION", + }, + ]; + + const { warnings } = await ensureReferentialIntegrity( + adapter, + sampledTables, + relationships, + ); + + // The fetch was attempted (and failed) 3 times across the passes. + // We want at least one warning captured, with the right shape. + expect(warnings.length).toBeGreaterThan(0); + const w = warnings[0]; + expect(w.kind).toBe("parent_fetch_failed"); + expect(w.targetTable).toBe("users"); + expect(w.sourceTable).toBe("orders"); + expect(w.reason).toContain("simulated DB failure"); + }); + + it("captures parent_not_found when the target table has no matching row", async () => { + const adapter = new SpyAdapter(); + // adapter.responses has no "users" entry, so the fetch returns [] + + const sampledTables = new Map[]>([ + ["users", [{ id: "seed", name: "Seed" }]], + ["orders", [{ id: 1, user_id: "ghost" }]], + ]); + const relationships: Relationship[] = [ + { + name: "orders_user_id_fkey", + sourceTable: "orders", + sourceColumns: ["user_id"], + targetTable: "users", + targetColumns: ["id"], + onDelete: "NO ACTION", + onUpdate: "NO ACTION", + }, + ]; + + const { warnings } = await ensureReferentialIntegrity( + adapter, + sampledTables, + relationships, + ); + + expect(warnings.some((w) => w.kind === "parent_not_found")).toBe(true); + const w = warnings.find((x) => x.kind === "parent_not_found")!; + expect(w.targetTable).toBe("users"); + expect(w.sourceTable).toBe("orders"); + }); + + it("captures implicit_ref_fetch_failed when the implicit-ref batch query throws", async () => { + const adapter = new FailingAdapter(); + // Fail the implicit IN(...) lookup on the sessions table. + adapter.failPattern = /FROM "sessions" WHERE id IN/; + + const sampledTables = new Map[]>([ + // A table with an implicit session_id column (no formal FK). + [ + "events", + [ + { + id: 1, + session_id: "11111111-1111-1111-1111-111111111111", + }, + ], + ], + // Sessions has no matching row, so the implicit resolver will try + // to fetch it and hit the failure pattern. + ["sessions", [{ id: "00000000-0000-0000-0000-000000000000" }]], + ]); + + const { warnings } = await ensureReferentialIntegrity( + adapter, + sampledTables, + [], // no formal relationships — force the implicit-ref path + ); + + const implicitWarning = warnings.find( + (w) => w.kind === "implicit_ref_fetch_failed", + ); + expect(implicitWarning).toBeDefined(); + expect(implicitWarning!.targetTable).toBe("sessions"); + expect(implicitWarning!.reason).toContain("simulated DB failure"); + }); + + it("reason strings are truncated at ~140 chars to keep metadata bounded", async () => { + const adapter = new FailingAdapter(); + // Make the error message huge to verify truncation. + const longError = "x".repeat(500); + adapter.query = async (sql: string) => { + if (/FROM "users"/.test(sql)) throw new Error(longError); + return []; + }; + + const sampledTables = new Map[]>([ + ["users", [{ id: "seed" }]], + ["orders", [{ id: 1, user_id: "missing" }]], + ]); + const relationships: Relationship[] = [ + { + name: "orders_user_id_fkey", + sourceTable: "orders", + sourceColumns: ["user_id"], + targetTable: "users", + targetColumns: ["id"], + onDelete: "NO ACTION", + onUpdate: "NO ACTION", + }, + ]; + + const { warnings } = await ensureReferentialIntegrity( + adapter, + sampledTables, + relationships, + ); + + expect(warnings.length).toBeGreaterThan(0); + expect(warnings[0].reason.length).toBeLessThanOrEqual(140); + expect(warnings[0].reason.endsWith("…")).toBe(true); + }); +}); + +// ----------------------------------------------------------------------- +// Dynamic skip list — Issue #7: formal-FK columns skipped implicitly +// ----------------------------------------------------------------------- + +describe("ensureReferentialIntegrity — dynamic skip of formal-FK columns", () => { + it("does NOT re-fetch a column that is already handled by a formal FK", async () => { + // This test relies on the behavior that if a column is in a formal FK, + // the implicit-reference pass must not also issue queries for it. + const adapter = new SpyAdapter(); + adapter.responses.set("sessions", [ + { + id: "11111111-1111-1111-1111-111111111111", + data: "real", + }, + ]); + + const sampledTables = new Map[]>([ + // events.session_id is a FORMAL FK (see relationships below). + [ + "events", + [ + { + id: 1, + session_id: "11111111-1111-1111-1111-111111111111", + }, + ], + ], + ["sessions", [{ id: "00000000-0000-0000-0000-000000000000", data: "seed" }]], + ]); + + const relationships: Relationship[] = [ + { + name: "events_session_id_fkey", + sourceTable: "events", + sourceColumns: ["session_id"], + targetTable: "sessions", + targetColumns: ["id"], + onDelete: "NO ACTION", + onUpdate: "NO ACTION", + }, + ]; + + await ensureReferentialIntegrity(adapter, sampledTables, relationships); + + // The implicit resolver uses `IN (...)` form; the formal FK uses + // `WHERE "id" = $1`. With the formal FK present, only the formal path + // should fire. No `IN (` query for sessions. + const implicitCalls = adapter.calls.filter( + (c) => + c.sql.includes('FROM "sessions"') && c.sql.includes("IN ("), + ); + expect(implicitCalls.length).toBe(0); + + // But the formal path DID run: + const formalCalls = adapter.calls.filter( + (c) => + c.sql.includes('FROM "sessions"') && c.sql.includes('"id" = $1'), + ); + expect(formalCalls.length).toBeGreaterThan(0); + }); + + it("handles non-English column names that the old hardcoded list missed", async () => { + // The old hardcoded SKIP list was ["id", "user_id", "owner_id", "created_by"] + // which is English-only. A Spanish codebase with `creado_por` or + // `propietario_id` wouldn't be on that list. Neither was `author_id`, + // `reviewer_id`, `assignee_id`. The new dynamic check uses the actual + // formal relationships to decide what to skip, so non-English column + // names flow through correctly (they get the IMPLICIT path if they + // end in _id and point at a valid table). + const adapter = new SpyAdapter(); + adapter.responses.set( + "usuarios", + [{ id: "22222222-2222-2222-2222-222222222222", nombre: "Ana" }], + ); + + const sampledTables = new Map[]>([ + // Spanish column name, no formal FK — should use implicit path. + [ + "documentos", + [ + { + id: 1, + creado_por: "22222222-2222-2222-2222-222222222222", + }, + ], + ], + ["usuarios", [{ id: "00000000-0000-0000-0000-000000000000", nombre: "seed" }]], + ]); + + // No formal relationships — old hardcoded skip list had "created_by" + // but NOT "creado_por". With the dynamic check the behavior is now + // symmetric: neither is pre-skipped; both would try the implicit path + // (which looks at target table inference). "creado_por" ending in + // "_por" (not "_id") means the implicit resolver won't try it either, + // so it's never touched. The test asserts it reaches the responses + // we set only if the resolver actually ran. + await ensureReferentialIntegrity(adapter, sampledTables, []); + + // The current implementation's inferTargetTable only picks up *_id, + // so `creado_por` is not resolved either way. This test's real value + // is that the dynamic check doesn't THROW on non-English names, and + // that the loop completes successfully without the old list + // silently masking work. + expect(adapter.calls.length).toBeGreaterThanOrEqual(0); + }); +}); + +// ----------------------------------------------------------------------- +// N+1 batching — Issue #8: implicit references batch by target table +// ----------------------------------------------------------------------- + +describe("ensureReferentialIntegrity — batched implicit references", () => { + it("issues one query per target table, not one per source-target pair", async () => { + // Three source tables all reference the same `sessions` table via + // implicit `session_id` columns (the inferrer resolves session_id + // → sessions by stripping _id and trying plural). The old per-pair + // loop would fire 3 separate queries. The batched version must + // fire exactly 1. + const adapter = new SpyAdapter(); + adapter.responses.set("sessions", [ + { id: "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", label: "A" }, + { id: "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", label: "B" }, + { id: "cccccccc-cccc-cccc-cccc-cccccccccccc", label: "C" }, + ]); + + const sampledTables = new Map[]>([ + ["sessions", [{ id: "00000000-0000-0000-0000-000000000000", label: "seed" }]], + [ + "events", + [{ id: 1, session_id: "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" }], + ], + [ + "logs", + [{ id: 1, session_id: "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb" }], + ], + [ + "metrics", + [{ id: 1, session_id: "cccccccc-cccc-cccc-cccc-cccccccccccc" }], + ], + ]); + + await ensureReferentialIntegrity(adapter, sampledTables, []); + + // Count implicit IN(...) queries against sessions. + const sessionBatches = adapter.calls.filter( + (c) => c.sql.includes('FROM "sessions"') && c.sql.includes("IN ("), + ); + expect(sessionBatches.length).toBe(1); + + // And all three missing ids are in the single batch's params. + const params = sessionBatches[0].params as string[]; + expect(params).toContain("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"); + expect(params).toContain("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"); + expect(params).toContain("cccccccc-cccc-cccc-cccc-cccccccccccc"); + }); + + it("chunks batches of more than 100 ids into multiple queries", async () => { + const adapter = new SpyAdapter(); + // Return empty responses — we just want to count queries. + const sampledTables = new Map[]>([ + ["sessions", [{ id: "00000000-0000-0000-0000-000000000000" }]], + // 250 rows in `events`, each with a unique session_id → 250 missing + // ids → should split into 3 batches (100 + 100 + 50). + [ + "events", + Array.from({ length: 250 }, (_, i) => ({ + id: i, + session_id: `aaaaaaaa-aaaa-aaaa-aaaa-${String(i).padStart(12, "0")}`, + })), + ], + ]); + + await ensureReferentialIntegrity(adapter, sampledTables, []); + + const sessionBatches = adapter.calls.filter( + (c) => c.sql.includes('FROM "sessions"') && c.sql.includes("IN ("), + ); + expect(sessionBatches.length).toBe(3); + expect((sessionBatches[0].params as unknown[]).length).toBe(100); + expect((sessionBatches[1].params as unknown[]).length).toBe(100); + expect((sessionBatches[2].params as unknown[]).length).toBe(50); + }); + + it("deduplicates ids referenced by multiple source rows", async () => { + const adapter = new SpyAdapter(); + const sampledTables = new Map[]>([ + ["sessions", [{ id: "00000000-0000-0000-0000-000000000000" }]], + // Three events all referencing the same session. + [ + "events", + [ + { id: 1, session_id: "11111111-1111-1111-1111-111111111111" }, + { id: 2, session_id: "11111111-1111-1111-1111-111111111111" }, + { id: 3, session_id: "11111111-1111-1111-1111-111111111111" }, + ], + ], + ]); + + await ensureReferentialIntegrity(adapter, sampledTables, []); + + const batch = adapter.calls.find( + (c) => c.sql.includes('FROM "sessions"') && c.sql.includes("IN ("), + ); + expect(batch).toBeDefined(); + // The Set in the implementation collapses duplicates to a single id. + expect((batch!.params as unknown[]).length).toBe(1); + }); +}); diff --git a/packages/core/src/sampler/referential.ts b/packages/core/src/sampler/referential.ts index 253deb4..a6b9d17 100644 --- a/packages/core/src/sampler/referential.ts +++ b/packages/core/src/sampler/referential.ts @@ -1,11 +1,12 @@ -import type { DatabaseAdapter, Relationship } from "../types.js"; +import type { + DatabaseAdapter, + IntegrityWarning, + Relationship, +} from "../types.js"; import { quoteIdent } from "../sql/identifiers.js"; const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; -// Columns already handled by formal FKs or that shouldn't be followed -const SKIP_IMPLICIT_COLUMNS = new Set(["id", "user_id", "owner_id", "created_by"]); - /** * Infer which table a column like `session_id` or `run_id` references. * Tries: session_id -> sessions, session_id -> session, analysis_run_id -> analysis_runs @@ -36,18 +37,56 @@ function inferTargetTable( return null; } +function truncateReason(err: unknown, limit = 140): string { + const msg = err instanceof Error ? err.message : String(err); + return msg.length > limit ? msg.slice(0, limit - 1) + "…" : msg; +} + +/** + * Result of a referential integrity pass. + * + * Includes both the fixed-up sampled tables and a list of any problems + * encountered. Problems are intentionally non-fatal: the sandbox still + * loads, but the connector metadata records what we couldn't resolve + * so `sow doctor` can surface it to the user. + */ +export interface ReferentialIntegrityResult { + tables: Map[]>; + warnings: IntegrityWarning[]; +} + /** * Ensure referential integrity in the sampled data. * Handles both formal FK constraints and implicit UUID references. + * + * Returns the fixed-up table map plus a list of warnings for any FK + * relationships we couldn't fully resolve. The caller decides whether + * to surface them (via `sow doctor`) or fail hard. */ export async function ensureReferentialIntegrity( adapter: DatabaseAdapter, sampledTables: Map[]>, relationships: Relationship[], -): Promise[]>> { +): Promise { const result = new Map( Array.from(sampledTables.entries()).map(([k, v]) => [k, [...v]]), ); + const warnings: IntegrityWarning[] = []; + // Dedupe warnings by relationship+target so we emit one line per + // (source, target) pair even if many child rows have orphaned FKs. + const warnedRelationships = new Set(); + + // Build the set of (table, column) pairs already handled by the formal-FK + // pass. The implicit-reference pass below must not revisit these, otherwise + // it would issue redundant queries and potentially mask failures. This + // replaces a hardcoded English-biased skip list (`id`, `user_id`, ...). + const formallyHandled = new Set(); + for (const rel of relationships) { + if (rel.sourceTable === rel.targetTable) continue; + for (const col of rel.sourceColumns) { + formallyHandled.add(`${rel.sourceTable}.${col}`); + } + } // Multiple passes to handle transitive dependencies for (let pass = 0; pass < 3; pass++) { @@ -88,8 +127,8 @@ export async function ensureReferentialIntegrity( try { // Identifiers are quoted (not parameterizable in Postgres); - // values go through $1,$2,... bind parameters. This is safe - // against a text PK like "O'Brien" and against crafted payloads. + // values go through $1,$2,... bind parameters. Safe against a + // text PK like "O'Brien" and against crafted payloads. const conditions = rel.targetColumns .map((col, i) => `${quoteIdent(col)} = $${i + 1}`) .join(" AND "); @@ -103,9 +142,33 @@ export async function ensureReferentialIntegrity( existing.push(rows[0] as Record); result.set(rel.targetTable, existing); changed = true; + } else { + // Genuine miss: the source data references a parent that + // does not exist in the source DB. Dedupe by relationship + // name so we don't flood when many child rows reference + // the same orphaned parent. + const fingerprint = `parent_not_found:${rel.sourceTable}.${rel.sourceColumns.join(",")}->${rel.targetTable}`; + if (!warnedRelationships.has(fingerprint)) { + warnedRelationships.add(fingerprint); + warnings.push({ + kind: "parent_not_found", + sourceTable: rel.sourceTable, + sourceColumns: rel.sourceColumns, + targetTable: rel.targetTable, + targetColumns: rel.targetColumns, + reason: `No parent row found for FK value (${rel.targetColumns.length} column(s))`, + }); + } } - } catch { - // Best effort + } catch (err) { + warnings.push({ + kind: "parent_fetch_failed", + sourceTable: rel.sourceTable, + sourceColumns: rel.sourceColumns, + targetTable: rel.targetTable, + targetColumns: rel.targetColumns, + reason: truncateReason(err), + }); } } } @@ -142,8 +205,15 @@ export async function ensureReferentialIntegrity( result.set(rel.sourceTable, existing); changed = true; } - } catch { - // Best effort + } catch (err) { + warnings.push({ + kind: "child_fetch_failed", + sourceTable: rel.sourceTable, + sourceColumns: rel.sourceColumns, + targetTable: rel.targetTable, + targetColumns: rel.targetColumns, + reason: truncateReason(err), + }); } } } @@ -154,81 +224,99 @@ export async function ensureReferentialIntegrity( } // Second pass: follow implicit FK references (UUID columns ending in _id) - await resolveImplicitReferences(adapter, result); + await resolveImplicitReferences( + adapter, + result, + formallyHandled, + warnings, + ); - return result; + return { tables: result, warnings }; } /** * Detect implicit FK references by finding UUID columns named *_id * and fetching any missing referenced rows from inferred target tables. + * + * Batches queries by target table: for a 50-table schema with many + * `_id` columns, the pre-batch version would fire one query per + * (source_table, source_column) pair even when multiple sources + * reference the same parent. The batched version collects all missing + * ids per target table across ALL source tables, then issues one + * `IN ($1, $2, ...)` query per target. This cuts `sow connect` time + * from ~30-60s (remote VPN) to a single round-trip per target. */ async function resolveImplicitReferences( adapter: DatabaseAdapter, result: Map[]>, + formallyHandled: Set, + warnings: IntegrityWarning[], ): Promise { const allTableNames = new Set(result.keys()); - // Collect all formal FK column pairs to avoid duplicating work - const handledPairs = new Set(); + // Collect all missing ids grouped by target table. A single target + // table may be referenced by many source tables; we want one query + // per target, not one query per source-target pair. + const missingByTarget = new Map>(); for (const [tableName, rows] of result.entries()) { if (rows.length === 0) continue; const sampleRow = rows[0]; for (const colName of Object.keys(sampleRow)) { - if (!colName.endsWith("_id") || SKIP_IMPLICIT_COLUMNS.has(colName)) continue; + if (!colName.endsWith("_id")) continue; + + // Skip columns already handled by a formal FK. This replaces the + // old hardcoded English-biased list (["id","user_id","owner_id", + // "created_by"]) with a dynamic check computed from the actual + // schema relationships, so non-English column names and unusual + // FK layouts work correctly. + if (formallyHandled.has(`${tableName}.${colName}`)) continue; const targetTable = inferTargetTable(colName, allTableNames); if (!targetTable) continue; - const pairKey = `${tableName}.${colName}->${targetTable}`; - if (handledPairs.has(pairKey)) continue; - handledPairs.add(pairKey); - const targetRows = result.get(targetTable) || []; - const targetIds = new Set( - targetRows.map((r) => String(r.id ?? "")), - ); + const targetIds = new Set(targetRows.map((r) => String(r.id ?? ""))); - // Find referenced IDs that are missing from the target table - const missingIds = new Set(); for (const row of rows) { const val = String(row[colName] ?? ""); if (val && UUID_RE.test(val) && !targetIds.has(val)) { - missingIds.add(val); + if (!missingByTarget.has(targetTable)) { + missingByTarget.set(targetTable, new Set()); + } + missingByTarget.get(targetTable)!.add(val); } } + } + } - if (missingIds.size === 0) continue; - - // Batch fetch missing rows (up to 100 at a time) - const idBatches: string[][] = []; - const allMissing = Array.from(missingIds); - for (let i = 0; i < allMissing.length; i += 100) { - idBatches.push(allMissing.slice(i, i + 100)); - } - - for (const batch of idBatches) { - try { - // Build placeholder list ($1,$2,...) the same length as the batch. - // Each value is already guaranteed UUID-shaped by the gate above, - // but parameterization is the correct discipline regardless. - const placeholders = batch - .map((_, i) => `$${i + 1}`) - .join(","); - const fetched = await adapter.query( - `SELECT * FROM ${quoteIdent(targetTable)} WHERE id IN (${placeholders})`, - batch, - ); - if (fetched.length > 0) { - const existing = result.get(targetTable) || []; - existing.push(...(fetched as Record[])); - result.set(targetTable, existing); - } - } catch { - // Best effort — table might not have an id column or query might fail + // One batched query per target table. + for (const [targetTable, missingSet] of missingByTarget.entries()) { + const allMissing = Array.from(missingSet); + if (allMissing.length === 0) continue; + + // Chunk into batches of 100 to keep prepared-statement sizes sane + // and well under Postgres's 65,535 bind-parameter limit. + for (let i = 0; i < allMissing.length; i += 100) { + const batch = allMissing.slice(i, i + 100); + try { + const placeholders = batch.map((_, idx) => `$${idx + 1}`).join(","); + const fetched = await adapter.query( + `SELECT * FROM ${quoteIdent(targetTable)} WHERE id IN (${placeholders})`, + batch, + ); + if (fetched.length > 0) { + const existing = result.get(targetTable) || []; + existing.push(...(fetched as Record[])); + result.set(targetTable, existing); } + } catch (err) { + warnings.push({ + kind: "implicit_ref_fetch_failed", + targetTable, + reason: truncateReason(err), + }); } } } diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 99362b7..1c9c181 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -261,6 +261,30 @@ export interface SampledTable { export interface SamplingResult { tables: SampledTable[]; config: SamplingConfig; + /** + * Non-fatal problems encountered while ensuring referential integrity. + * Each entry describes an FK relationship that could not be fully resolved. + * The sample still completed; these rows may have dangling foreign keys. + * Surfaced by `sow doctor ` so users know what isn't guaranteed. + */ + integrityWarnings: IntegrityWarning[]; +} + +/** A non-fatal referential-integrity problem captured during sampling. */ +export interface IntegrityWarning { + kind: + | "parent_fetch_failed" + | "parent_not_found" + | "child_fetch_failed" + | "implicit_ref_fetch_failed"; + /** The source (child) table and column(s) involved, if any. */ + sourceTable?: string; + sourceColumns?: string[]; + /** The target (parent) table and column(s) being resolved. */ + targetTable: string; + targetColumns?: string[]; + /** Short, human-readable summary (no secrets, no raw values). */ + reason: string; } // ---------------------------------------------------------------------------