diff --git a/README.md b/README.md index e917cf5d9..ec04f505e 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,8 @@ msgvault can search your archive semantically using vector embeddings in additio A separate MCP tool, `find_similar_messages`, returns nearest neighbors for a seed message. See the [Vector Search guide](https://msgvault.io/usage/vector-search/) for setup, backfill, and troubleshooting. +> **Run only one embedding process at a time.** Don't run `msgvault embeddings build`/`resume` or `repair-encoding` concurrently with a `msgvault serve` daemon — they write the same embedding state, and concurrent writers are not coordinated across processes. + ## Importing from MBOX or Apple Mail Import email from providers that offer MBOX exports or from a local Apple Mail data directory: diff --git a/cmd/msgvault/cmd/embed.go b/cmd/msgvault/cmd/embed.go index feaaf0125..a921b116a 100644 --- a/cmd/msgvault/cmd/embed.go +++ b/cmd/msgvault/cmd/embed.go @@ -9,6 +9,7 @@ import ( var ( embedFullRebuild bool embedYes bool + embedBackstop bool embeddingsRetireYes bool embeddingsRetireForceActive bool embeddingsActivateForce bool @@ -25,9 +26,11 @@ var embeddingsResumeCmd = &cobra.Command{ Use: "resume", Short: "Resume or top up the current vector embedding generation", Long: `Resume or top up the current vector embedding generation. -If a matching generation is building, this drains its pending queue and -activates it when complete. Otherwise it embeds pending rows for the -active generation.`, +If a matching generation is building, this embeds any messages still +needing embedding for it and activates it when complete. Otherwise it +embeds any messages still needing embedding for the active generation. 
+Pass --backstop for a full-scan pass that ignores the per-generation +watermark, catching any straggler messages the incremental scan skipped.`, RunE: runEmbeddingsResume, } var embeddingsListCmd = &cobra.Command{ @@ -55,9 +58,10 @@ func newEmbeddingsBuildCmd(use string) *cobra.Command { Short: "Build or update the vector embedding index (incremental by default; --full-rebuild for a new generation)", Long: `Build or update the vector embedding index for hybrid search. Writes vectors to the co-located vectors.db. In the default incremental -mode, the command drains any pending rows in the active generation. With ---full-rebuild, it creates a new building generation, embeds the entire -corpus, and (on a clean completion) atomically activates it. +mode, the command embeds any messages still needing embedding for the +active generation. With --full-rebuild, it creates a new building +generation, embeds the entire corpus, and (on a clean completion) +atomically activates it. Requires [vector] to be enabled in config.toml and [vector.embeddings] to point at a running OpenAI-compatible endpoint.`, @@ -65,6 +69,8 @@ to point at a running OpenAI-compatible endpoint.`, } cmd.Flags().BoolVar(&embedFullRebuild, "full-rebuild", false, "Create a new generation and rebuild from scratch") cmd.Flags().BoolVar(&embedYes, "yes", false, "Skip confirmation prompts") + cmd.Flags().BoolVar(&embedBackstop, "backstop", false, + "Full-scan pass that ignores the per-generation watermark, catching any straggler messages the incremental scan skipped (idempotent)") return cmd } @@ -92,10 +98,12 @@ func runEmbeddingsResume(cmd *cobra.Command, args []string) error { func init() { embedCmd.Deprecated = "use 'msgvault embeddings build' instead" + embeddingsResumeCmd.Flags().BoolVar(&embedBackstop, "backstop", false, + "Full-scan pass that ignores the per-generation watermark, catching any straggler messages the incremental scan skipped (idempotent)") 
embeddingsRetireCmd.Flags().BoolVar(&embeddingsRetireYes, "yes", false, "Skip confirmation prompt") embeddingsRetireCmd.Flags().BoolVar(&embeddingsRetireForceActive, "force-active", false, "Allow retiring the active generation") embeddingsActivateCmd.Flags().BoolVar(&embeddingsActivateYes, "yes", false, "Skip confirmation prompt") - embeddingsActivateCmd.Flags().BoolVar(&embeddingsActivateForce, "force", false, "Allow activation with pending rows or a fingerprint mismatch") + embeddingsActivateCmd.Flags().BoolVar(&embeddingsActivateForce, "force", false, "Allow activation while messages still need embedding, or with a fingerprint mismatch") embeddingsCmd.AddCommand(embeddingsBuildCmd) embeddingsCmd.AddCommand(embeddingsResumeCmd) embeddingsCmd.AddCommand(embeddingsListCmd) diff --git a/cmd/msgvault/cmd/embed_pg_test.go b/cmd/msgvault/cmd/embed_pg_test.go index b765dd0df..eb35787e7 100644 --- a/cmd/msgvault/cmd/embed_pg_test.go +++ b/cmd/msgvault/cmd/embed_pg_test.go @@ -34,17 +34,19 @@ func countEmbeddingRowsPG(t *testing.T, db *sql.DB, gen vector.GenerationID) int return n } -// seedGenWithEmbeddingsPG creates a building generation, upserts one chunk per -// supplied message id (dim 4), and clears its pending queue so the management -// commands treat it as a finished generation. Returns the generation id. +// seedGenWithEmbeddingsPG creates a building generation and upserts one chunk +// per supplied message id (dim 4). The consuming tests force-activate/retire +// (force=true), bypassing the coverage gate, so no embed_gen stamping is +// needed to make the management commands treat it as finished. Returns the +// generation id. 
func seedGenWithEmbeddingsPG(t *testing.T, pgb *pgvector.Backend, ids ...int64) vector.GenerationID { t.Helper() ctx := context.Background() - for _, id := range ids { - _, err := pgb.DB().ExecContext(ctx, - `INSERT INTO messages (id) VALUES ($1) ON CONFLICT DO NOTHING`, id) - require.NoErrorf(t, err, "seed message %d", id) - } + // No messages rows are inserted: embeddings.message_id has no FK to + // messages, and the consuming tests force-activate/retire (force=true), + // bypassing the coverage gate, so no live messages are needed. (The full + // schema's messages.id is GENERATED ALWAYS AS IDENTITY, so an explicit-id + // insert would be rejected anyway.) gen, err := pgb.CreateGeneration(ctx, "test-model", 4, "test-model:4") require.NoError(t, err, "CreateGeneration") chunks := make([]vector.Chunk, 0, len(ids)) @@ -54,9 +56,8 @@ func seedGenWithEmbeddingsPG(t *testing.T, pgb *pgvector.Backend, ids ...int64) chunks = append(chunks, vector.Chunk{MessageID: id, ChunkIndex: 0, Vector: v}) } require.NoError(t, pgb.Upsert(ctx, gen, chunks), "Upsert") - _, err = pgb.DB().ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - require.NoError(t, err, "clear pending") + // The consuming tests force-activate/retire (force=true), bypassing the + // coverage gate, so no embed_gen stamping is needed here. return gen } @@ -157,6 +158,16 @@ func openEmbedManagePGDB(t *testing.T) (*pgvector.Backend, func(string) string, require.NoError(t, err, "store.Open") t.Cleanup(func() { _ = st.Close() }) + // Mirror production: the CLI embeddings commands (runEmbed, + // runEmbeddingsRetire/Activate/List) all run store.InitSchema BEFORE + // opening the pgvector backend. InitSchema creates the full messages table + // (with embed_gen) and applied_migrations, so the backend's open-time reset/ + // backfill have the columns they touch. 
A minimal hand-rolled messages + // table omitting embed_gen diverges from production and breaks both the + // open-time reset and the activate/retire coverage gates (which reference + // embed_gen even under --force). + require.NoError(t, st.InitSchema(), "InitSchema") + pgb, err := pgvector.Open(ctx, pgvector.Options{ DB: st.DB(), Dimension: 4, @@ -164,15 +175,6 @@ func openEmbedManagePGDB(t *testing.T) (*pgvector.Backend, func(string) string, require.NoError(t, err, "pgvector.Open") t.Cleanup(func() { _ = pgb.Close() }) - // Create a minimal messages table so CreateGeneration's seed query works. - _, err = st.DB().ExecContext(ctx, ` - CREATE TABLE IF NOT EXISTS messages ( - id BIGINT PRIMARY KEY, - deleted_at TIMESTAMPTZ, - deleted_from_source_at TIMESTAMPTZ - )`) - require.NoError(t, err, "create messages scaffold") - return pgb, (&store.PostgreSQLDialect{}).Rebind, dsn } diff --git a/cmd/msgvault/cmd/embed_test.go b/cmd/msgvault/cmd/embed_test.go index 597b52a09..e9af2674c 100644 --- a/cmd/msgvault/cmd/embed_test.go +++ b/cmd/msgvault/cmd/embed_test.go @@ -22,11 +22,15 @@ func TestEmbeddingsCommandRegistration(t *testing.T) { require.Equal("build", buildCmd.Name()) require.NotNil(buildCmd.Flags().Lookup("full-rebuild")) require.NotNil(buildCmd.Flags().Lookup("yes")) + require.NotNil(buildCmd.Flags().Lookup("backstop")) resumeCmd, _, err := rootCmd.Find([]string{"embeddings", "resume"}) require.NoError(err) require.Equal("resume", resumeCmd.Name()) require.Nil(resumeCmd.Flags().Lookup("full-rebuild")) + // --backstop is also available on resume, so operators + // can do a watermark-ignoring straggler sweep without --full-rebuild. 
+ require.NotNil(resumeCmd.Flags().Lookup("backstop")) listCmd, _, err := rootCmd.Find([]string{"embeddings", "list"}) require.NoError(err) @@ -52,11 +56,49 @@ func TestEmbeddingsCommandRegistration(t *testing.T) { require.NotNil(legacyCmd.Flags().Lookup("yes")) } +// TestRunEmbeddingsResume_PreservesBackstopFlag pins the resume behavior: +// resume forces incremental mode (saves/restores embedFullRebuild + embedYes) but +// must leave embedBackstop exactly as the operator set it, so +// `embeddings resume --backstop` actually runs a backstop pass. +func TestRunEmbeddingsResume_PreservesBackstopFlag(t *testing.T) { + assert := assertpkg.New(t) + + // Save and restore all three globals so the test is hermetic. + oldFull, oldYes, oldBackstop := embedFullRebuild, embedYes, embedBackstop + t.Cleanup(func() { embedFullRebuild, embedYes, embedBackstop = oldFull, oldYes, oldBackstop }) + + // Operator state: full-rebuild on (resume must clear it), backstop on + // (resume must NOT touch it). Point at an empty config so the run errors + // out early (vector disabled) without needing a real backend. + embedFullRebuild = true + embedYes = false + embedBackstop = true + oldCfg := cfg + cfg = &config.Config{} + t.Cleanup(func() { cfg = oldCfg }) + + cmd := embeddingsResumeCmd + oldCtx := cmd.Context() + cmd.SetContext(context.Background()) + t.Cleanup(func() { cmd.SetContext(oldCtx) }) + + // Errors because vector is not enabled — that's fine; we only assert the + // flag-preservation contract of runEmbeddingsResume. 
+ _ = runEmbeddingsResume(cmd, nil) + + assert.True(embedBackstop, "resume must NOT clobber embedBackstop") + assert.True(embedFullRebuild, "resume must restore embedFullRebuild to its prior value") + assert.False(embedYes, "resume must restore embedYes to its prior value") +} + func TestListEmbeddingGenerationsIncludesActiveAndBuilding(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) db := newEmbeddingMetadataTestDB(t) + // listEmbeddingGenerations reads only the generation metadata now; + // coverage (missing count) is filled separately from the main DB via + // fillCoverage, so it is not asserted here. rows, err := listEmbeddingGenerations(t.Context(), db, sqliteRebind) require.NoError(err) require.Len(rows, 2) @@ -64,18 +106,24 @@ func TestListEmbeddingGenerationsIncludesActiveAndBuilding(t *testing.T) { assert.Equal(vector.GenerationID(1), rows[0].ID) assert.Equal(vector.GenerationActive, rows[0].State) assert.Equal(int64(2), rows[0].MessageCount) - assert.Equal(int64(0), rows[0].PendingCount) assert.Equal(vector.GenerationID(2), rows[1].ID) assert.Equal(vector.GenerationBuilding, rows[1].State) - assert.Equal(int64(1), rows[1].PendingCount) } -func TestRunEmbeddingsActivateRefusesPendingWithoutForce(t *testing.T) { +// TestRunEmbeddingsActivateRefusesMissingWithoutForce verifies the CLI +// pre-flight coverage gate: activating a building generation that still +// has live messages needing embedding (embed_gen <> gen in the main DB) +// must fail without --force. +func TestRunEmbeddingsActivateRefusesMissingWithoutForce(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) - dbPath := newEmbeddingMetadataTestDBFile(t) - withEmbeddingCommandConfig(t, dbPath) + dataDir := t.TempDir() + dbPath := newEmbeddingMetadataTestDBFileAt(t, filepath.Join(dataDir, "vectors.db")) + // Main DB with one live, unembedded message -> coverage reports + // missing=1 for generation 2. 
+ seedMainDBWithLiveMessage(t, dataDir) + withEmbeddingCommandConfigDataDir(t, dbPath, dataDir) oldYes := embeddingsActivateYes embeddingsActivateYes = true @@ -87,7 +135,8 @@ func TestRunEmbeddingsActivateRefusesPendingWithoutForce(t *testing.T) { err := runEmbeddingsActivate(cmd, []string{"2"}) require.Error(err) - assert.Contains(err.Error(), "pending embedding rows") + assert.Contains(err.Error(), "needing embedding") + assert.Contains(err.Error(), "msgvault embeddings resume --backstop") } // TestRetireEmbeddingGenerationRefusesActiveWithoutForce_PreCheck pins the @@ -134,7 +183,14 @@ func newEmbeddingMetadataTestDB(t *testing.T) *sql.DB { func newEmbeddingMetadataTestDBFile(t *testing.T) string { t.Helper() - path := filepath.Join(t.TempDir(), "vectors.db") + return newEmbeddingMetadataTestDBFileAt(t, filepath.Join(t.TempDir(), "vectors.db")) +} + +// newEmbeddingMetadataTestDBFileAt creates a vectors.db with just the +// index_generations metadata (no pending_embeddings — coverage now lives +// in the main DB) at the given path. 
+func newEmbeddingMetadataTestDBFileAt(t *testing.T, path string) string { + t.Helper() db, err := sql.Open("sqlite3", path) requirepkg.NoError(t, err) defer func() { requirepkg.NoError(t, db.Close()) }() @@ -152,14 +208,6 @@ CREATE TABLE index_generations ( state TEXT NOT NULL, message_count INTEGER NOT NULL DEFAULT 0 ); -CREATE TABLE pending_embeddings ( - generation_id INTEGER NOT NULL, - message_id INTEGER NOT NULL, - enqueued_at INTEGER NOT NULL, - claimed_at INTEGER, - claim_token TEXT, - PRIMARY KEY (generation_id, message_id) -); `) requirepkg.NoError(t, err) @@ -170,12 +218,28 @@ INSERT INTO index_generations VALUES (1, 'model', 4, ?, 100, 101, 110, 111, 'active', 2), (2, 'model', 4, ?, 120, 121, NULL, NULL, 'building', 1); -INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES (2, 42, 120); `, fp, fp) requirepkg.NoError(t, err) return path } +// seedMainDBWithLiveMessage creates a main msgvault.db in dataDir with one +// live message whose embed_gen is NULL — i.e. it reads as "missing" for +// every generation, so the coverage gate refuses activation. 
+func seedMainDBWithLiveMessage(t *testing.T, dataDir string) { + t.Helper() + s, err := store.Open(filepath.Join(dataDir, "msgvault.db")) + requirepkg.NoError(t, err) + defer func() { requirepkg.NoError(t, s.Close()) }() + requirepkg.NoError(t, s.InitSchema()) + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type, embed_gen) VALUES (1, 1, 1, 'm1', 'email', NULL); +`) + requirepkg.NoError(t, err) +} + func withEmbeddingCommandConfig(t *testing.T, vecPath string) { t.Helper() oldCfg := cfg @@ -183,6 +247,18 @@ func withEmbeddingCommandConfig(t *testing.T, vecPath string) { t.Cleanup(func() { cfg = oldCfg }) } +// withEmbeddingCommandConfigDataDir is like withEmbeddingCommandConfig but +// also sets Data.DataDir so DatabaseDSN() resolves to a real main DB (used +// by the coverage gate). +func withEmbeddingCommandConfigDataDir(t *testing.T, vecPath, dataDir string) { + t.Helper() + oldCfg := cfg + c := newTestConfigForFingerprint(vecPath) + c.Data.DataDir = dataDir + cfg = c + t.Cleanup(func() { cfg = oldCfg }) +} + func newTestConfigForFingerprint(vecPath string) *config.Config { return &config.Config{ Vector: vector.Config{ diff --git a/cmd/msgvault/cmd/embed_vector.go b/cmd/msgvault/cmd/embed_vector.go index 7bbd195ab..8df24cd03 100644 --- a/cmd/msgvault/cmd/embed_vector.go +++ b/cmd/msgvault/cmd/embed_vector.go @@ -29,11 +29,25 @@ func runEmbed(cmd *cobra.Command) error { } defer func() { _ = s.Close() }() + // Auto-migrate the main schema before any embed_gen access. On an + // upgraded SQLite DB whose messages table predates the embed_gen + // column, InitSchema's LegacyColumnMigrations adds it; without this + // the backfill UPDATE and CoverageCounts below fail with "no such + // column: embed_gen". 
serve.go does the same before setupVectorFeatures. + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + var ( backend vector.Backend vectorsDB *sql.DB closeFn func() error rebind func(string) string + // lastModifiedExpr is the dialect-correct SELECT expression for the + // embed worker's last_modified CAS token. SQLite needs CAST(... AS + // TEXT) to defeat go-sqlite3's DATETIME→time.Time coercion (which + // would break round-trip equality); PG uses the bare column. + lastModifiedExpr = "CAST(m.last_modified AS TEXT)" ) if s.IsPostgreSQL() { // pgvector embeddings live in the same Postgres database as @@ -52,6 +66,7 @@ func runEmbed(cmd *cobra.Command) error { vectorsDB = pgb.DB() closeFn = pgb.Close rebind = (&store.PostgreSQLDialect{}).Rebind + lastModifiedExpr = "m.last_modified" } else { if err := sqlitevec.RegisterExtension(); err != nil { return fmt.Errorf("register sqlite-vec: %w", err) @@ -98,15 +113,20 @@ func runEmbed(cmd *cobra.Command) error { Timeout: cfg.Vector.Embeddings.Timeout, MaxRetries: cfg.Vector.Embeddings.MaxRetries, }) - totalPending, err := pendingCount(ctx, vectorsDB, rebind, gen) + // "Pending" is now the count of live messages still needing work for + // this generation (embed_gen <> gen), read from the main DB coverage + // rather than a queue table. 
+ missing, err := s.MissingCount(ctx, int64(gen)) if err != nil { - return fmt.Errorf("count pending: %w", err) + return fmt.Errorf("coverage counts: %w", err) } + totalPending := int(missing) worker := embed.NewWorker(embed.WorkerDeps{ Backend: backend, VectorsDB: vectorsDB, MainDB: s.DB(), + Store: s, Client: client, Preprocess: embed.PreprocessConfig{ StripQuotes: cfg.Vector.Preprocess.StripQuotesEnabled(), @@ -116,48 +136,44 @@ func runEmbed(cmd *cobra.Command) error { StripURLTracking: cfg.Vector.Preprocess.StripURLTrackingEnabled(), CollapseWhitespace: cfg.Vector.Preprocess.CollapseWhitespaceEnabled(), }, - MaxInputChars: cfg.Vector.Embeddings.MaxInputChars, - BatchSize: cfg.Vector.Embeddings.BatchSize, - EmbedTimeout: cfg.Vector.Embeddings.Timeout, - EmbedMaxRetries: cfg.Vector.Embeddings.MaxRetries, - Rebind: rebind, - TotalPending: totalPending, - Progress: newProgressPrinter(errOut, totalPending, cfg.Vector.Embeddings.ETAWindow), + MaxInputChars: cfg.Vector.Embeddings.MaxInputChars, + BatchSize: cfg.Vector.Embeddings.BatchSize, + Rebind: rebind, + LastModifiedExpr: lastModifiedExpr, + TotalPending: totalPending, + Progress: newProgressPrinter(errOut, totalPending, cfg.Vector.Embeddings.ETAWindow), }) - if n, err := worker.ReclaimStale(ctx); err != nil { - return fmt.Errorf("reclaim stale: %w", err) - } else if n > 0 { - _, _ = fmt.Fprintf(errOut, "Reclaimed %d stale claims.\n", n) + var res embed.RunResult + if embedBackstop { + res, err = worker.RunBackstop(ctx, gen) + } else { + res, err = worker.RunOnce(ctx, gen) } - - res, err := worker.RunOnce(ctx, gen) if err != nil { return fmt.Errorf("embed run: %w", err) } - _, _ = fmt.Fprintf(out, "Claimed: %d, succeeded: %d, failed: %d, truncated: %d\n", + _, _ = fmt.Fprintf(out, "Scanned: %d, succeeded: %d, failed: %d, truncated: %d\n", res.Claimed, res.Succeeded, res.Failed, res.Truncated) - // Activation is a function of the generation's final state, not + // Activation is a function of the generation's 
final coverage, not // of the cumulative retry counter — transient failures that the // worker later recovers from must not block activation, and an // active generation must not be re-activated. if rebuildInProgress { - remaining, err := pendingCount(ctx, vectorsDB, rebind, gen) + _, _, _, remaining, err := s.CoverageCounts(ctx, int64(gen)) if err != nil { - return fmt.Errorf("count pending: %w", err) + return fmt.Errorf("coverage counts: %w", err) } if remaining == 0 { // force=false: we already gated on remaining==0 above, and the - // backend re-asserts the seeded/no-pending gate atomically. + // backend re-asserts the no-missing gate atomically. if err := backend.ActivateGeneration(ctx, gen, false); err != nil { return fmt.Errorf("activate generation: %w", err) } _, _ = fmt.Fprintf(out, "Generation %d activated.\n", gen) } else { - _, _ = fmt.Fprintf(errOut, - "Generation %d still has %d pending rows; run `msgvault embeddings resume` again to finish, then it will activate automatically.\n", - gen, remaining) + _, _ = fmt.Fprint(errOut, remainingCoverageHint(gen, remaining)) } } return nil @@ -234,34 +250,19 @@ func pickEmbedGeneration(ctx context.Context, backend vector.Backend, opts embed } if building != nil { if building.Fingerprint == opts.Fingerprint { - // Re-run the initial seed if the prior CreateGeneration - // crashed between inserting the building row and committing - // the seed. Without this, a resume could "drain" zero - // pending rows and activate an unseeded generation. - err := backend.EnsureSeeded(ctx, building.ID) - switch { - case err == nil: - _, _ = fmt.Fprintf(opts.Stderr, "Resuming building generation %d (%s).\n", - building.ID, building.Fingerprint) - return building.ID, true, nil - case errors.Is(err, vector.ErrGenerationNotBuilding): - // Race: another actor (daemon, concurrent CLI, retire - // call) moved the generation out of 'building' between - // BuildingGeneration and EnsureSeeded. 
Fall through to - // the active-generation lookup rather than aborting - // with a fatal error — if the flip was an activation - // matching our fingerprint, that's exactly the - // generation we want to top up. - _, _ = fmt.Fprintf(opts.Stderr, - "Building generation %d changed state while resuming; re-resolving.\n", - building.ID) - default: - return 0, false, fmt.Errorf("ensure seeded: %w", err) - } - } else { - return 0, false, fmt.Errorf("in-progress rebuild has fingerprint=%q, config has %q — activate or retire it before running with a different model", - building.Fingerprint, opts.Fingerprint) + // Resume the matching build. Under scan-and-fill there is no + // seed pass to re-run on resume — the worker discovers work by + // scanning messages.embed_gen, so a crash before any embedding + // simply leaves the whole corpus needing work. If the build was + // already activated by a concurrent actor, the worker's scan is + // still harmless (covered rows are skipped) and the subsequent + // activation gate will report it is not building. + _, _ = fmt.Fprintf(opts.Stderr, "Resuming building generation %d (%s).\n", + building.ID, building.Fingerprint) + return building.ID, true, nil } + return 0, false, fmt.Errorf("in-progress rebuild has fingerprint=%q, config has %q — activate or retire it before running with a different model", + building.Fingerprint, opts.Fingerprint) } active, err := vector.ResolveActiveForFingerprint(ctx, backend, opts.Fingerprint) @@ -280,21 +281,6 @@ func pickEmbedGeneration(ctx context.Context, backend vector.Backend, opts embed } } -// pendingCount counts queue rows for gen. rebind translates the -// ?-placeholder to the driver's native form; nil is treated as the -// identity so the SQLite path is unchanged. 
-func pendingCount(ctx context.Context, db *sql.DB, rebind func(string) string, gen vector.GenerationID) (int, error) { - if rebind == nil { - rebind = func(q string) string { return q } - } - var n int - if err := db.QueryRowContext(ctx, - rebind(`SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`), int64(gen)).Scan(&n); err != nil { - return 0, fmt.Errorf("query pending: %w", err) - } - return n, nil -} - // newProgressPrinter returns an embed.Worker Progress callback that // emits a rate-limited one-line summary to w. Rate limit is ~2s to // keep stderr quiet on fast backends (ANE sustains ~500 msg/s at diff --git a/cmd/msgvault/cmd/embed_vector_initschema_test.go b/cmd/msgvault/cmd/embed_vector_initschema_test.go new file mode 100644 index 000000000..324a1ff61 --- /dev/null +++ b/cmd/msgvault/cmd/embed_vector_initschema_test.go @@ -0,0 +1,159 @@ +//go:build sqlite_vec + +package cmd + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// makeUpgradedMainDB writes a real msgvault.db and then DROPs the embed_gen +// column from messages, reproducing the exact shape a v0.14/v0.15 archive +// has after upgrading to a binary that introduced embed_gen but before any +// InitSchema has run against it: a full messages table that is missing only +// embed_gen. Any read/write of embed_gen fails with "no such column". +// +// It seeds one live message so coverage/backfill have something to act on. 
+func makeUpgradedMainDB(t *testing.T, mainPath string) { + t.Helper() + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open for fixture") + require.NoError(t, s.InitSchema(), "InitSchema for fixture") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'); +`) + require.NoError(t, err, "seed message") + // Drop the backfill ledger row (none expected yet) and the embed_gen + // column so the DB looks like a pre-embed_gen upgrade. + _, err = s.DB().Exec(`ALTER TABLE messages DROP COLUMN embed_gen`) + require.NoError(t, err, "drop embed_gen to simulate pre-upgrade schema") + require.NoError(t, s.Close(), "close fixture store") +} + +// dropEmbedGenColumn removes the embed_gen column from an existing main DB, +// simulating the post-seed pre-upgrade shape used by the failure-mode arm of +// the test. Kept separate from makeUpgradedMainDB so the seed (which needs a +// real active generation) can run while embed_gen still exists, then drop it +// just before the embed_gen-touching reopen. +func dropEmbedGenColumn(t *testing.T, db *sql.DB) { + t.Helper() + _, err := db.Exec(`ALTER TABLE messages DROP COLUMN embed_gen`) + require.NoError(t, err, "drop embed_gen to simulate pre-upgrade schema") +} + +// TestEmbed_UpgradedDBMissingEmbedGen_NeedsInitSchema is the regression +// guard for Codex #2: the embeddings build/resume path (runEmbed) opened +// the store but never called InitSchema, so on an upgraded DB whose +// messages table lacked embed_gen the upgrade backfill and CoverageCounts +// failed with "no such column: embed_gen" before InitSchema would have +// added it. 
+// +// The test first asserts the failure mode is real: against an upgraded DB +// with an active generation but the column-less messages table, opening the +// backend (which runs BackfillEmbedGenForUpgrade) errors on the missing +// column. Then it asserts the fix: running the runEmbed ordering +// (store.Open -> s.InitSchema() -> sqlitevec.Open -> s.CoverageCounts) +// succeeds because InitSchema adds embed_gen before any embed_gen access. +func TestEmbed_UpgradedDBMissingEmbedGen_NeedsInitSchema(t *testing.T) { + require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + // --- 1. Failure mode without InitSchema ------------------------------- + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + // Seed with embed_gen still PRESENT: store.Open + InitSchema leave the + // column in place. The seed backend Open below (which runs the orphaned- + // stamp reset + the upgrade backfill, both of which touch embed_gen) must + // succeed here — we drop the column only afterwards to reproduce the + // upgraded-but-not-reinitialized shape for the failing reopen. + sSeed, err := store.Open(mainPath) + require.NoError(t, err, "store.Open for fixture (column present)") + require.NoError(t, sSeed.InitSchema(), "InitSchema for fixture") + _, err = sSeed.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'); +`) + require.NoError(t, err, "seed message") + + mainRaw := sSeed.DB() + + // Seed an active generation that already embedded msg 1, then clear the + // backfill ledger so the next Open runs the real embed_gen-stamping + // backfill. embed_gen is present here, so the reset + backfill succeed. 
+ seed, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: mainRaw, + }) + require.NoError(t, err, "seed backend Open") + gen, err := seed.CreateGeneration(ctx, "model", 4, "model:4") + require.NoError(t, err, "seed CreateGeneration") + require.NoError(t, seed.Upsert(ctx, gen, []vector.Chunk{{ + MessageID: 1, ChunkIndex: 0, Vector: []float32{0, 0, 0, 1}, + }}), "seed Upsert") + require.NoError(t, seed.ActivateGeneration(ctx, gen, true), "seed Activate") + require.NoError(t, seed.Close(), "close seed backend") + _, err = mainRaw.Exec(`DELETE FROM applied_migrations WHERE name = 'embed_gen_backfill_active_v1'`) + require.NoError(t, err, "clear backfill ledger") + + // Now reproduce the pre-upgrade shape: drop embed_gen. Msg 1 is stamped + // for the active gen, but the column no longer exists. + dropEmbedGenColumn(t, mainRaw) + + // Reopen: the orphaned-stamp reset (which runs first) tries to read/clear + // messages.embed_gen, which does not exist on this upgraded schema → + // "no such column: embed_gen". (Were the reset to somehow pass, the + // backfill's embed_gen stamp would fail identically.) Either way the Open + // fails until runEmbed's InitSchema adds the column back. + reopen, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: mainRaw, + }) + if err == nil { + _ = reopen.Close() + } + require.Error(t, err, "open must fail on a messages table lacking embed_gen") + assert.Contains(t, err.Error(), "embed_gen", "failure should be the missing embed_gen column") + require.NoError(t, sSeed.Close(), "close seed store") + + // --- 2. The fix: InitSchema first, then the embed path succeeds -------- + dir3 := t.TempDir() + mainPath3 := filepath.Join(dir3, "msgvault.db") + makeUpgradedMainDB(t, mainPath3) + + // This is what runEmbed now does: Open + InitSchema BEFORE touching the + // vector backend / embed_gen. 
+ s, err := store.Open(mainPath3) + require.NoError(t, err, "store.Open") + defer func() { _ = s.Close() }() + require.NoError(t, s.InitSchema(), "InitSchema must add the embed_gen column") + + // Backend Open now runs BackfillEmbedGenForUpgrade against a schema that + // HAS embed_gen — no error. + b3, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: filepath.Join(dir3, "vectors.db"), + MainPath: mainPath3, + Dimension: 4, + MainDB: s.DB(), + }) + require.NoError(t, err, "backend Open after InitSchema must succeed") + defer func() { _ = b3.Close() }() + + // CoverageCounts (the other embed_gen reader in runEmbed) also succeeds. + live, _, _, missing, err := s.CoverageCounts(ctx, 1) + require.NoError(t, err, "CoverageCounts after InitSchema must succeed") + assert.Equal(t, int64(1), live, "one live message") + assert.Equal(t, int64(1), missing, "unstamped message reads as missing for gen 1") +} diff --git a/cmd/msgvault/cmd/embed_vector_pg_test.go b/cmd/msgvault/cmd/embed_vector_pg_test.go index 06d5cf0db..d7a37016b 100644 --- a/cmd/msgvault/cmd/embed_vector_pg_test.go +++ b/cmd/msgvault/cmd/embed_vector_pg_test.go @@ -15,9 +15,10 @@ import ( ) // TestRunEmbed_PG_OpenAndZeroPending exercises the command-level PG wiring -// in the runEmbed path: opening the pgvector backend, seeding a generation, -// and confirming that pendingCount returns 0 on an empty messages table (clean -// exit path). Skips when MSGVAULT_TEST_DB is unset or not a postgres DSN. +// in the runEmbed path: opening the pgvector backend, creating a +// generation, and confirming that coverage reports 0 missing on an empty +// messages table (clean exit path). Skips when MSGVAULT_TEST_DB is unset +// or not a postgres DSN. 
func TestRunEmbed_PG_OpenAndZeroPending(t *testing.T) { _, dsn := openServePGSchema(t) ctx := context.Background() @@ -29,7 +30,7 @@ func TestRunEmbed_PG_OpenAndZeroPending(t *testing.T) { require.True(t, st.IsPostgreSQL(), "expected PG-backed store") // Open the pgvector backend — this runs the schema migration so that - // index_generations and pending_embeddings exist. + // index_generations and the embedding tables exist. pgb, err := pgvector.Open(ctx, pgvector.Options{ DB: st.DB(), Dimension: 4, @@ -37,14 +38,15 @@ func TestRunEmbed_PG_OpenAndZeroPending(t *testing.T) { require.NoError(t, err, "pgvector.Open must succeed and migrate the schema") t.Cleanup(func() { _ = pgb.Close() }) - // A fresh database has no messages table yet in this isolated schema. - // Create a minimal messages table so CreateGeneration's seed query - // succeeds (it counts embeddable messages). + // A fresh isolated schema has no messages table yet. Create a minimal + // one (with embed_gen) so the coverage gate query succeeds. Empty, so + // coverage reports 0 missing. _, err = st.DB().ExecContext(ctx, ` CREATE TABLE IF NOT EXISTS messages ( id BIGINT PRIMARY KEY, deleted_at TIMESTAMPTZ, - deleted_from_source_at TIMESTAMPTZ + deleted_from_source_at TIMESTAMPTZ, + embed_gen BIGINT )`) require.NoError(t, err, "create messages scaffold") @@ -70,12 +72,11 @@ func TestRunEmbed_PG_OpenAndZeroPending(t *testing.T) { assert.NotZero(t, gen, "generation ID must be non-zero") assert.True(t, rebuildInProgress, "full-rebuild path must report rebuildInProgress=true") - // pendingCount is the same helper runEmbed calls to decide whether to - // activate the generation. With an empty messages table it must return 0. 
- rebind := (&store.PostgreSQLDialect{}).Rebind - n, err := pendingCount(ctx, pgb.DB(), rebind, gen) - require.NoError(t, err, "pendingCount on PG must succeed") - assert.Equal(t, 0, n, "empty messages table → 0 pending embeddings") + // MissingCount is what runEmbed uses to decide whether to activate + // the generation. With an empty messages table missing must be 0. + missing, err := st.MissingCount(ctx, int64(gen)) + require.NoError(t, err, "MissingCount on PG must succeed") + assert.Equal(t, int64(0), missing, "empty messages table → 0 missing") // Confirm the generation state: still building (no activation yet). building, err := pgb.BuildingGeneration(ctx) diff --git a/cmd/msgvault/cmd/embed_vector_test.go b/cmd/msgvault/cmd/embed_vector_test.go index 7079a5fec..c368606f0 100644 --- a/cmd/msgvault/cmd/embed_vector_test.go +++ b/cmd/msgvault/cmd/embed_vector_test.go @@ -20,7 +20,8 @@ import ( ) // openTestBackend opens a fresh in-memory-ish sqlitevec backend with a -// single pre-seeded message so CreateGeneration has something to enqueue. +// single pre-seeded message so the scan-and-fill worker has a message to +// discover and embed. func openTestBackend(t *testing.T) *sqlitevec.Backend { t.Helper() ctx := context.Background() @@ -272,130 +273,12 @@ func TestPickEmbedGeneration_FullRebuildAbortsWhenDeclined(t *testing.T) { requirepkg.Error(t, err, "expected abort error") } -// TestPickEmbedGeneration_ResumeReseedsUnseededBuilding regression- -// guards the crash-window bug where a process that died between -// inserting the building row and committing the initial seed would -// leave the queue empty; a later `msgvault embeddings build` would then "drain" -// zero rows and silently activate an unseeded generation. The resume -// path must call EnsureSeeded on the matched build before returning, -// reseeding pending_embeddings so the activation gate sees real work -// (or the absence of any) instead of a vacuous empty queue. 
-func TestPickEmbedGeneration_ResumeReseedsUnseededBuilding(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - b := openTestBackend(t) - - // Step 1: create a building gen the normal way (which seeds + marks - // seeded_at). - gen, err := b.CreateGeneration(ctx, "fake", 4, "") - require.NoError(err, "CreateGeneration") - - // Step 2: simulate the crash window — clear pending_embeddings and - // blank seeded_at so the next resume must reseed. This mirrors the - // state after a process dies between the building-row insert and - // the seedPending commit. - _, err = b.DB().ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ?`, int64(gen)) - require.NoError(err, "clear pending") - _, err = b.DB().ExecContext(ctx, - `UPDATE index_generations SET seeded_at = NULL WHERE id = ?`, int64(gen)) - require.NoError(err, "clear seeded_at") - - // Sanity: pending really is empty before the resume. - var pendingBefore int - require.NoError(b.DB().QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, int64(gen)).Scan(&pendingBefore), - "count pending before") - require.Equal(0, pendingBefore, "pending count before resume = %d, want 0 (test setup wrong)", pendingBefore) - - // Step 3: run pickEmbedGeneration on the resume path. - gotGen, rebuildInProgress, err := pickEmbedGeneration(ctx, b, embedGenerationOpts{ - FullRebuild: false, - Model: "fake", - Dimension: 4, - Fingerprint: "fake:4", - Stderr: openStderrSink(t), - }) - require.NoError(err, "pickEmbedGeneration") - assert.Equal(gen, gotGen, "gotGen mismatch") - assert.True(rebuildInProgress, "rebuildInProgress=false, want true") - - // Step 4: pending_embeddings should now contain the message we - // seeded in openTestBackend (id=1). Without EnsureSeeded on the - // resume path, this would still be 0. 
- var pendingAfter int - require.NoError(b.DB().QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, int64(gen)).Scan(&pendingAfter), - "count pending after") - assert.Equal(1, pendingAfter, "pending count after resume = %d, want 1 (EnsureSeeded should have reseeded)", pendingAfter) - // And seeded_at should be set so a subsequent resume skips the work. - var seededAt sql.NullInt64 - require.NoError(b.DB().QueryRowContext(ctx, - `SELECT seeded_at FROM index_generations WHERE id = ?`, int64(gen)).Scan(&seededAt), - "read seeded_at") - assert.True(seededAt.Valid, "seeded_at still NULL after resume, want set") -} - -// TestPickEmbedGeneration_ResumeRacesActivation regresses the case -// where the `building` row flips to `active` between the -// BuildingGeneration read and EnsureSeeded. Before the fix this -// surfaced a fatal `ensure seeded: ... state="active"` error even -// though a legitimate active generation (matching the configured -// fingerprint) now existed. After the fix we fall through to the -// active-generation lookup and top it up as a normal incremental -// pass. -func TestPickEmbedGeneration_ResumeRacesActivation(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - b := openTestBackend(t) - - // Create the building generation as if the operator had just run - // `msgvault embeddings build --full-rebuild`. CreateGeneration seeds pending - // rows for id=1 via openTestBackend's seed message. - gen, err := b.CreateGeneration(ctx, "fake", 4, "") - require.NoError(err, "CreateGeneration") - // Simulate the race: another actor (the daemon, or a concurrent - // `msgvault embeddings build` run that finished first) activated the - // generation. From this actor's perspective BuildingGeneration - // returned non-nil a moment ago, but the state has since flipped. 
- require.NoError(b.ActivateGeneration(ctx, gen, true), "ActivateGeneration") - - // Intercepting the race is hard to do in a single-threaded test, - // but we can drive the same code path by calling - // pickEmbedGeneration with a backend that reports the now-active - // generation when BuildingGeneration is queried. We use a - // shim that wraps the real backend and overrides only - // BuildingGeneration. - shim := &buildingShim{Backend: b, forceBuilding: &vector.Generation{ - ID: gen, Fingerprint: "fake:4", State: vector.GenerationBuilding, - }} - - gotGen, rebuildInProgress, err := pickEmbedGeneration(ctx, shim, embedGenerationOpts{ - FullRebuild: false, - Model: "fake", - Dimension: 4, - Fingerprint: "fake:4", - Stderr: openStderrSink(t), - }) - require.NoError(err, "pickEmbedGeneration (race must be retryable, not fatal)") - assert.Equal(gen, gotGen, "same generation, but now active") - assert.False(rebuildInProgress, "rebuildInProgress=true, want false (now on the active path)") -} - -// buildingShim wraps a real backend, overriding only BuildingGeneration -// to return a forced value. Used by TestPickEmbedGeneration_ResumeRacesActivation -// to simulate a stale read where the generation flipped to active -// underneath us after BuildingGeneration returned. 
-type buildingShim struct { - vector.Backend - - forceBuilding *vector.Generation -} +func TestRemainingCoverageHintMentionsBackstop(t *testing.T) { + got := remainingCoverageHint(7, 3) -func (s *buildingShim) BuildingGeneration(ctx context.Context) (*vector.Generation, error) { - return s.forceBuilding, nil + assertpkg.Contains(t, got, "Generation 7 still has 3 message(s) needing embedding") + assertpkg.Contains(t, got, "msgvault embeddings resume --backstop") + assertpkg.NotContains(t, got, "resume` again") } func TestNewProgressPrinter_UsesWindowedRate(t *testing.T) { diff --git a/cmd/msgvault/cmd/embeddings_manage.go b/cmd/msgvault/cmd/embeddings_manage.go index 96ac83a5f..b3e0f04fa 100644 --- a/cmd/msgvault/cmd/embeddings_manage.go +++ b/cmd/msgvault/cmd/embeddings_manage.go @@ -32,10 +32,98 @@ type embeddingGenerationRow struct { CompletedAt *time.Time ActivatedAt *time.Time MessageCount int64 - PendingCount int64 + // Coverage counts for this generation over the live-message universe, + // computed from the main DB (live/stamped/missing) plus the vector + // backend (embedded). Filled by fillCoverage / fillFullCoverage. The + // invariant LiveCount == EmbeddedCount + BlankCount + MissingCount holds. + // + // - LiveCount: total live messages (the embedding universe). + // - EmbeddedCount: live messages that actually have >=1 vector for + // this generation (COUNT(DISTINCT message_id) in the embeddings + // table). Only filled by fillFullCoverage (needs the backend). + // - BlankCount: stamped-but-empty messages (stamped embed_gen=id + // but no vector) — the body-extraction-regression detector. + // Only filled by fillFullCoverage. + // - MissingCount: live messages not yet stamped for this generation. 
+ LiveCount int64 + EmbeddedCount int64 + BlankCount int64 + MissingCount int64 +} + +// fillCoverage populates row.LiveCount and row.MissingCount from the main +// DB so the management commands can gate on how many live messages still +// need embedding for the generation. This is the cheap, backend-free path +// used by the activation gate (which only needs MissingCount). It leaves +// EmbeddedCount/BlankCount at zero — use fillFullCoverage for the display +// table where the embedded/blank split is wanted. A failure is surfaced to +// the caller. +func fillCoverage(ctx context.Context, row *embeddingGenerationRow) error { + s, err := store.Open(cfg.DatabaseDSN()) + if err != nil { + return fmt.Errorf("open main db for coverage: %w", err) + } + defer func() { _ = s.Close() }() + live, _, _, missing, err := s.CoverageCounts(ctx, int64(row.ID)) + if err != nil { + return err + } + row.LiveCount = live + row.MissingCount = missing + return nil +} + +// fillFullCoverage populates the complete live/embedded/blank/missing split +// for the generation. The main DB supplies live, stamped (embed_gen=id), +// and missing; the vector backend supplies embedded (COUNT(DISTINCT +// message_id) in the embeddings table for this generation). blank is the +// remainder, stamped - embedded, clamped >= 0 — messages stamped terminal +// DONE but with no vector (the empty/unembeddable case). The invariant +// live == embedded + blank + missing holds. The backend handle is passed +// in by the caller (which already opened it for the generation listing). 
+func fillFullCoverage(ctx context.Context, backend vector.Backend, row *embeddingGenerationRow) error { + s, err := store.Open(cfg.DatabaseDSN()) + if err != nil { + return fmt.Errorf("open main db for coverage: %w", err) + } + defer func() { _ = s.Close() }() + live, stamped, _, missing, err := s.CoverageCounts(ctx, int64(row.ID)) + if err != nil { + return err + } + embedded, err := backend.EmbeddedMessageCount(ctx, row.ID) + if err != nil { + return fmt.Errorf("count embedded messages for generation %d: %w", row.ID, err) + } + blank := max(stamped-embedded, 0) + row.LiveCount = live + row.EmbeddedCount = embedded + row.BlankCount = blank + row.MissingCount = missing + return nil +} + +// ensureMainSchema opens the main DB and runs InitSchema so that an +// upgraded SQLite archive (whose messages table predates the embed_gen +// column) gets the column added before any management command reads +// embed_gen via CoverageCounts. Mirrors the serve.go / runEmbed pattern. +// Cheap and idempotent on an already-current schema; harmless on PG. +func ensureMainSchema() error { + s, err := store.Open(cfg.DatabaseDSN()) + if err != nil { + return fmt.Errorf("open main db: %w", err) + } + defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + return nil } func runEmbeddingsList(cmd *cobra.Command, _ []string) error { + if err := ensureMainSchema(); err != nil { + return err + } db, rebind, closeDB, err := openEmbeddingsMetadataDB(cmd.Context()) if err != nil { return err @@ -51,16 +139,46 @@ func runEmbeddingsList(cmd *cobra.Command, _ []string) error { return nil } + // Fill per-generation coverage (live/embedded/blank/missing) for + // non-retired generations — the interesting numbers. The embedded leg + // comes from the vector backend (the embeddings table), so open it once + // and thread it down. Retired generations are immutable; leave their + // coverage at zero and skip the backend scan. 
+ needCoverage := false + for i := range rows { + if rows[i].State != vector.GenerationRetired { + needCoverage = true + break + } + } + if needCoverage { + backend, closeBackend, err := openEmbeddingsBackend(cmd.Context()) + if err != nil { + return err + } + defer closeBackend() + for i := range rows { + if rows[i].State == vector.GenerationRetired { + continue + } + if err := fillFullCoverage(cmd.Context(), backend, &rows[i]); err != nil { + return err + } + } + } + w := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 2, ' ', 0) - _, _ = fmt.Fprintln(w, "ID\tSTATE\tMODEL\tDIM\tMESSAGES\tPENDING\tFINGERPRINT\tSTARTED\tCOMPLETED\tACTIVATED") + _, _ = fmt.Fprintln(w, "ID\tSTATE\tMODEL\tDIM\tLIVE\tEMBEDDED\tBLANK\tMISSING\tFINGERPRINT\tSTARTED\tCOMPLETED\tACTIVATED") for _, row := range rows { - _, _ = fmt.Fprintf(w, "%d\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n", + _, _ = fmt.Fprintf(w, "%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n", row.ID, row.State, row.Model, row.Dimension, - row.MessageCount, - row.PendingCount, + row.LiveCount, + row.EmbeddedCount, + row.BlankCount, + row.MissingCount, row.Fingerprint, formatGenerationTime(row.StartedAt), formatGenerationTimePtr(row.CompletedAt), @@ -78,6 +196,9 @@ func runEmbeddingsRetire(cmd *cobra.Command, args []string) error { if err != nil { return err } + if err := ensureMainSchema(); err != nil { + return err + } db, rebind, closeDB, err := openEmbeddingsMetadataDB(cmd.Context()) if err != nil { @@ -134,6 +255,9 @@ func runEmbeddingsActivate(cmd *cobra.Command, args []string) error { if err != nil { return err } + if err := ensureMainSchema(); err != nil { + return err + } db, rebind, closeDB, err := openEmbeddingsMetadataDB(cmd.Context()) if err != nil { @@ -153,20 +277,19 @@ func runEmbeddingsActivate(cmd *cobra.Command, args []string) error { return fmt.Errorf("generation %d fingerprint=%q does not match config=%q; pass --force to activate anyway", gen, row.Fingerprint, expected) } - // The pending/seeded gate is 
enforced atomically inside - // backend.ActivateGeneration (see below) so a concurrent sync that - // dual-enqueues a pending row cannot slip it in between this read and - // the state flip. We still surface a friendly pre-flight error here - // (against the committed metadata read) so the common case fails fast - // before opening a backend connection and before prompting — but the - // backend's transactional gate is the authoritative guarantee. - if row.PendingCount > 0 && !embeddingsActivateForce { - return fmt.Errorf("generation %d still has %d pending embedding rows; run `msgvault embeddings resume` or pass --force", - gen, row.PendingCount) - } - if row.SeededAt == nil && !embeddingsActivateForce { - return fmt.Errorf("generation %d has not finished seeding; run `msgvault embeddings resume` or pass --force", - gen) + // The coverage gate is enforced inside backend.ActivateGeneration + // (atomically on PG; via a Go pre-check on SQLite). We still surface a + // friendly pre-flight error here (against the main-DB coverage) so the + // common case fails fast before opening a backend connection and before + // prompting — but the backend's gate is the authoritative guarantee. + if !embeddingsActivateForce { + if err := fillCoverage(cmd.Context(), &row); err != nil { + return err + } + if row.MissingCount > 0 { + return fmt.Errorf("generation %d still has %d message(s) needing embedding; run `msgvault embeddings resume --backstop` to recover any below-watermark stragglers, or pass --force", + gen, row.MissingCount) + } } active, hasActive, err := activeEmbeddingGeneration(cmd.Context(), db, rebind) @@ -187,11 +310,11 @@ func runEmbeddingsActivate(cmd *cobra.Command, args []string) error { // Route through the vector backend so the auto-retire of the previously // active generation deletes its embeddings on PG (the same delete-on-retire // invariant as the retire path). 
The backend's ActivateGeneration requires - // the target to be in 'building' state, enforces the seeded/no-pending gate - // ATOMICALLY with the state flip (unless force), and auto-retires the prior - // active generation in one transaction. The fingerprint check above is the - // only gate the backend cannot make (it does not know the config - // fingerprint); the pending/seeded gate is owned by the backend. + // the target to be in 'building' state, enforces the coverage (no-missing) + // gate ATOMICALLY with the state flip (unless force), and auto-retires the + // prior active generation in one transaction. The fingerprint check above + // is the only gate the backend cannot make (it does not know the config + // fingerprint); the coverage gate is owned by the backend. backend, closeBackend, err := openEmbeddingsBackend(cmd.Context()) if err != nil { return err @@ -204,6 +327,12 @@ func runEmbeddingsActivate(cmd *cobra.Command, args []string) error { return nil } +func remainingCoverageHint(gen vector.GenerationID, remaining int64) string { + return fmt.Sprintf( + "Generation %d still has %d message(s) needing embedding; run `msgvault embeddings resume --backstop` to recover any below-watermark stragglers, then it will activate automatically.\n", + gen, remaining) +} + // openEmbeddingsMetadataDB opens the database that holds embedding generation // metadata and returns a handle, a rebind function for SQL placeholders, a // close callback, and any error. @@ -282,9 +411,17 @@ func openEmbeddingsBackend(ctx context.Context) (vector.Backend, func(), error) if err != nil { return nil, nil, fmt.Errorf("open postgres for embeddings backend: %w", err) } - // SkipMigrate: the metadata tables already exist (the caller's - // openEmbeddingsMetadataDB pre-checks index_generations), and a - // management command must not run migrations as a side effect. 
+ // SkipMigrate skips only the privileged CREATE EXTENSION + full + // migrate: the extension + metadata tables already exist (the caller's + // openEmbeddingsMetadataDB pre-checks index_generations), so a + // management command must not attempt the privileged extension step. + // This open is WRITABLE management, NOT read-only — ReadOnly stays + // false so Open still applies the extension-less schema (bringing up + // embed_watermark etc. if missing) and runs the one-time embed_gen + // upgrade backfill, matching the SQLite management path (which always + // migrates vectors.db + backfills). Without this, a post-upgrade PG + // archive would report its whole corpus as missing on the first + // writable management command. b, err := pgvector.Open(ctx, pgvector.Options{ DB: db, Dimension: cfg.Vector.Embeddings.Dimension, @@ -311,15 +448,28 @@ func openEmbeddingsBackend(ctx context.Context) (vector.Backend, func(), error) } return nil, nil, fmt.Errorf("stat vectors.db: %w", err) } + // On SQLite the messages table (and embed_gen) lives in the main DB, + // in a SEPARATE file from vectors.db. Backend methods that gate on + // live-message coverage — ActivateGeneration's hasMissingForGen, and + // the live-intersected EmbeddedMessageCount — dereference b.mainDB, so + // the management path must open and pass a main-DB handle just like + // embed_vector.go does. Omitting it leaves b.mainDB nil and panics on + // `msgvault embeddings activate`. Close it in the returned cleanup. 
+ mainStore, err := store.Open(dsn) + if err != nil { + return nil, nil, fmt.Errorf("open main db for embeddings backend: %w", err) + } b, err := sqlitevec.Open(ctx, sqlitevec.Options{ Path: vecPath, MainPath: dsn, Dimension: cfg.Vector.Embeddings.Dimension, + MainDB: mainStore.DB(), }) if err != nil { + _ = mainStore.Close() return nil, nil, fmt.Errorf("open vectors.db backend: %w", err) } - return b, func() { _ = b.Close() }, nil + return b, func() { _ = b.Close(); _ = mainStore.Close() }, nil } func sqliteDSNWithBusyTimeout(path string) string { @@ -345,10 +495,8 @@ func listEmbeddingGenerations(ctx context.Context, db *sql.DB, rebind func(strin rows, err := db.QueryContext(ctx, ` SELECT g.id, g.model, g.dimension, g.fingerprint, g.state, g.started_at, g.completed_at, g.activated_at, g.message_count, - g.seeded_at, COUNT(p.message_id) AS pending_count + g.seeded_at FROM index_generations g - LEFT JOIN pending_embeddings p ON p.generation_id = g.id - GROUP BY g.id ORDER BY g.id`) if err != nil { return nil, fmt.Errorf("list embedding generations: %w", err) @@ -373,11 +521,9 @@ func getEmbeddingGeneration(ctx context.Context, db *sql.DB, rebind func(string) row := db.QueryRowContext(ctx, rebind(` SELECT g.id, g.model, g.dimension, g.fingerprint, g.state, g.started_at, g.completed_at, g.activated_at, g.message_count, - g.seeded_at, COUNT(p.message_id) AS pending_count + g.seeded_at FROM index_generations g - LEFT JOIN pending_embeddings p ON p.generation_id = g.id - WHERE g.id = ? 
- GROUP BY g.id`), int64(gen)) + WHERE g.id = ?`), int64(gen)) g, err := scanEmbeddingGeneration(row) if errors.Is(err, sql.ErrNoRows) { return embeddingGenerationRow{}, fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) @@ -392,11 +538,9 @@ func activeEmbeddingGeneration(ctx context.Context, db *sql.DB, rebind func(stri row := db.QueryRowContext(ctx, rebind(` SELECT g.id, g.model, g.dimension, g.fingerprint, g.state, g.started_at, g.completed_at, g.activated_at, g.message_count, - g.seeded_at, COUNT(p.message_id) AS pending_count + g.seeded_at FROM index_generations g - LEFT JOIN pending_embeddings p ON p.generation_id = g.id - WHERE g.state = ? - GROUP BY g.id`), string(vector.GenerationActive)) + WHERE g.state = ?`), string(vector.GenerationActive)) g, err := scanEmbeddingGeneration(row) if errors.Is(err, sql.ErrNoRows) { return embeddingGenerationRow{}, false, nil @@ -426,7 +570,6 @@ func scanEmbeddingGeneration(s generationScanner) (embeddingGenerationRow, error &activatedAt, &row.MessageCount, &seededAt, - &row.PendingCount, ); err != nil { return embeddingGenerationRow{}, err } diff --git a/cmd/msgvault/cmd/mcp.go b/cmd/msgvault/cmd/mcp.go index b762e540d..161295a57 100644 --- a/cmd/msgvault/cmd/mcp.go +++ b/cmd/msgvault/cmd/mcp.go @@ -104,10 +104,9 @@ Add to Claude Desktop config: defer cancel() // Build optional vector-search components. MCP runs as a - // query-only server, so the worker and enqueuer fields go - // unused — only Backend, HybridEngine, and VectorCfg reach - // the MCP layer. - vf, err := setupVectorFeatures(ctx, s.DB(), dbPath, true) + // query-only server, so the Worker field goes unused — only + // Backend, HybridEngine, and Cfg reach the MCP layer. 
+ vf, err := setupVectorFeatures(ctx, s, dbPath, true) if err != nil { return fmt.Errorf("vector features: %w", err) } diff --git a/cmd/msgvault/cmd/repair_encoding.go b/cmd/msgvault/cmd/repair_encoding.go index c207b76ea..153618095 100644 --- a/cmd/msgvault/cmd/repair_encoding.go +++ b/cmd/msgvault/cmd/repair_encoding.go @@ -2,6 +2,7 @@ package cmd import ( "compress/zlib" + "context" "database/sql" "fmt" "io" @@ -57,29 +58,15 @@ charset detection issues in the MIME parser.`, return err } - // Re-enqueue repaired messages for re-embedding so semantic - // results reflect the corrected text. setupVectorFeatures - // returns (nil, nil) when vector search is disabled or the - // binary was built without sqlite_vec; in those cases the - // pending_embeddings table is irrelevant and there's nothing - // to do. + // Reset embed_gen = NULL on repaired messages so the scan-and-fill + // embed worker re-embeds them with the corrected text on its next + // run (msgvault embeddings build / the serve daemon). This is the + // scan-and-fill replacement for the old re-enqueue step: clearing + // embed_gen makes the message read as "needs embedding" again. + // No-op when vector search is disabled — the column is harmless. 
if len(reembedNeededIDs) > 0 { - vf, err := setupVectorFeatures(ctx, s.DB(), dbPath, false) - if err != nil { - fmt.Fprintf(os.Stderr, - "Warning: failed to open vectors.db for re-enqueue: %v\n", err) - } else if vf != nil { - if err := vf.Enqueuer.EnqueueMessages(ctx, reembedNeededIDs); err != nil { - fmt.Fprintf(os.Stderr, - "Warning: failed to re-enqueue %d messages for re-embedding: %v\n", - len(reembedNeededIDs), err) - } else { - fmt.Printf("Re-enqueued %d message(s) for re-embedding.\n", - len(reembedNeededIDs)) - } - if closeErr := vf.Close(); closeErr != nil { - logger.Warn("closing vectors.db failed", "error", closeErr) - } + if err := repairResetEmbeddings(ctx, s, reembedNeededIDs); err != nil { + fmt.Fprintf(os.Stderr, "Warning: %v\n", err) } } @@ -96,6 +83,60 @@ charset detection issues in the MIME parser.`, }, } +// repairResetEmbeddings marks the repaired messages for re-embedding and lowers +// the scan-and-fill watermark so the next incremental embed run re-finds them. +// +// Ordering is load-bearing. The vector backend is opened FIRST, before +// s.ResetEmbedGen clears embed_gen. Opening a writable backend runs the +// one-time upgrade backfill as a side effect when its ledger is still unmarked; +// that backfill stamps embed_gen=active on every already-embedded message. If +// the reset ran first, a first-run backfill would re-stamp the just-NULLed +// (previously-embedded) repaired messages back to active, silently undoing the +// re-embed request. Opening first lets the backfill land and mark its ledger so +// the subsequent reset sticks. +// +// When vector search is disabled, openVectorBackendForRepair returns a nil +// backend and this still resets embed_gen (a harmless no-op on the main DB +// column) while the watermark step short-circuits. +func repairResetEmbeddings(ctx context.Context, s *store.Store, reembedNeededIDs []int64) error { + // 1. Open the vector backend up front. 
This triggers (and marks the + // ledger for) the one-time upgrade backfill BEFORE we clear embed_gen. + // nil backend + nil closeFn when vector search is disabled. + backend, closeFn, err := openVectorBackendForRepair(ctx, s) + if err != nil { + return fmt.Errorf("failed to open vector backend for re-embedding: %w", err) + } + if closeFn != nil { + defer func() { _ = closeFn() }() + } + + // 2. Clear embed_gen on the repaired messages — now post-backfill, so the + // (already-marked) ledger cannot re-run and re-cover them. + if err := s.ResetEmbedGen(ctx, reembedNeededIDs); err != nil { + return fmt.Errorf("failed to mark %d repaired message(s) for re-embedding: %w", + len(reembedNeededIDs), err) + } + fmt.Printf("Marked %d message(s) for re-embedding.\n", len(reembedNeededIDs)) + + // 3. Lower the watermark below the smallest repaired id. Clearing embed_gen + // is not enough on its own: an incremental embed run resumes from the + // per-generation watermark and only scans ids ABOVE it, so a repaired + // message whose id sits below the current watermark would never be + // re-found (it would wait for a full-scan backstop, which the CLI + // defaults off and serve can have disabled). No-op (nil backend) when + // vector search is not configured. + minID := reembedNeededIDs[0] + for _, id := range reembedNeededIDs[1:] { + if id < minID { + minID = id + } + } + if err := lowerEmbedWatermarkForRepair(ctx, backend, minID); err != nil { + return fmt.Errorf("failed to lower embed watermark for repaired messages: %w", err) + } + return nil +} + // repairStats tracks repair statistics. type repairStats struct { subjects int @@ -114,10 +155,10 @@ type repairStats struct { // repairEncoding runs all repair passes over s and returns the IDs of // messages whose embedding inputs (subject, body_text, or body_html) -// were modified. 
Callers use that list to re-enqueue affected messages -// for re-embedding so semantic search results don't stay stale against -// the repaired text. Snippet-only repairs are NOT included because the -// embedder doesn't read snippet. +// were modified. Callers reset embed_gen to NULL (via s.ResetEmbedGen) on +// those ids so the scan-and-fill worker re-embeds them and semantic search +// results don't stay stale against the repaired text. Snippet-only repairs +// are NOT included because the embedder doesn't read snippet. func repairEncoding(s *store.Store) (reembedNeededIDs []int64, err error) { stats := &repairStats{} @@ -273,9 +314,10 @@ func repairMessageFields(s *store.Store, stats *repairStats) (reembedNeededIDs [ } // Any change to fields that feed the embedder (subject, - // body_text, body_html) invalidates prior embeddings and - // must trigger a re-enqueue. Snippet is not embedded, so - // snippet-only repairs are excluded. + // body_text, body_html) invalidates prior embeddings, so we + // reset embed_gen to NULL (via ResetEmbedGen) on these ids so + // the scan-and-fill worker re-embeds them. Snippet is not + // embedded, so snippet-only repairs are excluded. if r.newSubject.Valid || r.newBody.Valid || r.newHTML.Valid { reembedNeededIDs = append(reembedNeededIDs, r.id) } diff --git a/cmd/msgvault/cmd/repair_encoding_test.go b/cmd/msgvault/cmd/repair_encoding_test.go index 9b7287756..13e7eced8 100644 --- a/cmd/msgvault/cmd/repair_encoding_test.go +++ b/cmd/msgvault/cmd/repair_encoding_test.go @@ -100,9 +100,10 @@ func TestRepairEncoding_NoScanErrors(t *testing.T) { // TestRepairMessageFields_ReturnsReembedNeededIDs guards the re-embedding // hook: when any field that feeds the embedder (subject, body_text, // body_html) is repaired, the affected message id must appear in the -// returned slice so the caller can re-enqueue it against -// pending_embeddings. Snippet-only repairs must NOT appear because the -// embedder doesn't read snippet. 
+// returned slice so the caller can mark it for re-embedding via +// ResetEmbedGen (embed_gen -> NULL), which the scan-and-fill worker then +// re-picks up. Snippet-only repairs must NOT appear because the embedder +// doesn't read snippet. func TestRepairMessageFields_ReturnsReembedNeededIDs(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) diff --git a/cmd/msgvault/cmd/repair_encoding_vector.go b/cmd/msgvault/cmd/repair_encoding_vector.go new file mode 100644 index 000000000..3ba42c563 --- /dev/null +++ b/cmd/msgvault/cmd/repair_encoding_vector.go @@ -0,0 +1,97 @@ +//go:build sqlite_vec || pgvector + +package cmd + +import ( + "context" + "fmt" + "path/filepath" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/pgvector" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// openVectorBackendForRepair opens the dialect-selected vector backend the same +// way the embed/serve commands do and returns it together with a close func. +// +// Opening a writable backend runs the one-time upgrade backfill +// (BackfillEmbedGenForUpgrade) as a side effect when the upgrade ledger is +// still unmarked. repair-encoding MUST open the backend BEFORE it clears +// embed_gen (s.ResetEmbedGen): if the reset ran first, a first-run backfill +// would re-stamp the just-NULLed (previously-embedded) messages back to +// embed_gen=active, silently undoing the re-embed request. Opening first lets +// the backfill land and mark its ledger, so the subsequent reset sticks. +// +// Returns (nil, nil, nil) and is a silent no-op when vector search is not +// configured (cfg.Vector.Enabled == false): a user without embeddings has no +// watermark to fix and no backfill to run. +// +// This file is compiled only with a vector backend build tag; the no-tag build +// uses the stub in repair_encoding_vector_stub.go. 
+func openVectorBackendForRepair(ctx context.Context, s *store.Store) (vector.Backend, func() error, error) { + if !cfg.Vector.Enabled { + // Vector search disabled: nothing to open. No-op. + return nil, nil, nil + } + + if s.IsPostgreSQL() { + pgb, err := pgvector.Open(ctx, pgvector.Options{ + DB: s.DB(), + Dimension: cfg.Vector.Embeddings.Dimension, + SkipExtension: cfg.Vector.SkipExtensionCreate, + }) + if err != nil { + return nil, nil, fmt.Errorf("open pgvector backend: %w", err) + } + return pgb, pgb.Close, nil + } + + if err := sqlitevec.RegisterExtension(); err != nil { + return nil, nil, fmt.Errorf("register sqlite-vec: %w", err) + } + vecPath := cfg.Vector.DBPath + if vecPath == "" { + vecPath = filepath.Join(cfg.Data.DataDir, "vectors.db") + } + sb, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, + MainPath: cfg.DatabaseDSN(), + Dimension: cfg.Vector.Embeddings.Dimension, + MainDB: s.DB(), + }) + if err != nil { + return nil, nil, fmt.Errorf("open vectors.db: %w", err) + } + return sb, sb.Close, nil +} + +// lowerEmbedWatermarkForRepair lowers the scan-and-fill embed watermark below +// the minimum repaired message id so the next incremental embed run re-finds +// the repaired messages, even when their ids sit BELOW the current watermark. +// +// repair-encoding already reset embed_gen=NULL on these ids (s.ResetEmbedGen), +// but ScanForEmbedding only returns rows with `id > watermark`, so a repaired +// row below the watermark would otherwise wait for a full-scan backstop (which +// the CLI defaults off and serve can have disabled). Lowering the watermark +// closes that gap. +// +// It operates on an already-open backend handle (see +// openVectorBackendForRepair) so the one-time upgrade backfill has already run +// and marked its ledger; the watermark reset itself is idempotent and never +// raises a generation's cursor. backend may be nil when vector search is +// disabled, in which case this is a no-op. 
+// +// This file is compiled only with a vector backend build tag; the no-tag build +// uses the stub in repair_encoding_vector_stub.go. +func lowerEmbedWatermarkForRepair(ctx context.Context, backend vector.Backend, minRepairedID int64) error { + if backend == nil { + // Vector search disabled: no watermark exists to lower. No-op. + return nil + } + if err := backend.ResetWatermarkBelow(ctx, minRepairedID); err != nil { + return fmt.Errorf("lower embed watermark: %w", err) + } + return nil +} diff --git a/cmd/msgvault/cmd/repair_encoding_vector_stub.go b/cmd/msgvault/cmd/repair_encoding_vector_stub.go new file mode 100644 index 000000000..aa1fefbd8 --- /dev/null +++ b/cmd/msgvault/cmd/repair_encoding_vector_stub.go @@ -0,0 +1,27 @@ +//go:build !sqlite_vec && !pgvector + +package cmd + +import ( + "context" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" +) + +// openVectorBackendForRepair is a no-op for builds without a vector backend +// build tag: there is no embeddings store to open and no upgrade backfill to +// run. The real implementation lives in repair_encoding_vector.go (built with +// sqlite_vec or pgvector). repair-encoding still resets embed_gen on the main +// DB column, which is harmless when vector search is unavailable. +func openVectorBackendForRepair(_ context.Context, _ *store.Store) (vector.Backend, func() error, error) { + return nil, nil, nil +} + +// lowerEmbedWatermarkForRepair is a no-op for builds without a vector backend +// build tag: there is no embeddings store and no watermark to lower. The real +// implementation lives in repair_encoding_vector.go (built with sqlite_vec or +// pgvector). 
+func lowerEmbedWatermarkForRepair(_ context.Context, _ vector.Backend, _ int64) error { + return nil +} diff --git a/cmd/msgvault/cmd/repair_encoding_vector_test.go b/cmd/msgvault/cmd/repair_encoding_vector_test.go new file mode 100644 index 000000000..fb69f152e --- /dev/null +++ b/cmd/msgvault/cmd/repair_encoding_vector_test.go @@ -0,0 +1,161 @@ +//go:build sqlite_vec + +package cmd + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// embedGenBackfillMigrationKey duplicates the (unexported) sqlitevec ledger key +// that guards the one-time upgrade backfill. Kept in sync deliberately: the +// test reconstructs the fresh-upgrade precondition by clearing this exact row. +const embedGenBackfillMigrationKey = "embed_gen_backfill_active_v1" + +// readEmbedGen reports embed_gen for a message and whether it is NULL. +func readEmbedGen(t *testing.T, db *sql.DB, id int64) (val int64, isNull bool) { + t.Helper() + var v sql.NullInt64 + require.NoError(t, db.QueryRow(`SELECT embed_gen FROM messages WHERE id = ?`, id).Scan(&v)) + return v.Int64, !v.Valid +} + +// TestRepairResetEmbeddings_OpensBackendBeforeResettingEmbedGen is the FIX A +// regression guard: repair-encoding must OPEN the vector backend +// (which runs the one-time upgrade backfill as a side effect) BEFORE it clears +// embed_gen on the repaired messages. +// +// Precondition reproduced: a freshly-upgraded archive from a pre-embed_gen +// build — an ACTIVE generation with existing active-gen embeddings, embed_gen +// NULL everywhere, and the backfill ledger UNMARKED. The first writable open of +// the vector backend runs BackfillEmbedGenForUpgrade, which stamps +// embed_gen=active on every already-embedded message. 
+// +// The bug: with the OLD ordering (ResetEmbedGen first, then open the backend to +// lower the watermark), the very first backend open during repair runs the +// backfill AFTER the reset, re-stamping the just-NULLed (previously-embedded) +// repaired message back to embed_gen=active — silently undoing the re-embed +// request. The fix opens the backend first so the backfill lands and marks its +// ledger BEFORE the reset, so the NULL sticks. +// +// Assert: the repaired message ends embed_gen IS NULL (it will be re-embedded). +// This FAILS with the old reset-before-open ordering (the message ends stamped +// =active) and PASSES with the open-before-reset fix. +func TestRepairResetEmbeddings_OpensBackendBeforeResettingEmbedGen(t *testing.T) { + require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + + // Real main DB with two live messages, both previously embedded. + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open") + // Close the store LAST (registered first → LIFO) so the backend that + // borrows s.DB() closes before it, and the open msgvault.db handle does not + // block t.TempDir() cleanup on Windows. + t.Cleanup(func() { _ = s.Close() }) + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'), (2, 1, 1, 'm2', 'email'); +`) + require.NoError(t, err, "seed messages") + + // Create + activate a generation with an embedding for BOTH messages, so + // they are genuinely "previously embedded" (the upgrade-backfill target). 
+ rw, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "rw backend Open") + gen, err := rw.CreateGeneration(ctx, "model", 4, "model:4") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, rw.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, ChunkIndex: 0, Vector: []float32{0, 0, 0, 1}}, + {MessageID: 2, ChunkIndex: 0, Vector: []float32{0, 0, 1, 0}}, + }), "Upsert") + require.NoError(t, rw.ActivateGeneration(ctx, gen, true), "Activate") + require.NoError(t, rw.Close(), "close rw backend") + + // Reconstruct the fresh-upgrade precondition: embed_gen NULL everywhere and + // the backfill ledger UNMARKED, so the NEXT writable open runs the backfill. + _, err = s.DB().Exec(`UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = s.DB().Exec(`DELETE FROM applied_migrations WHERE name = ?`, embedGenBackfillMigrationKey) + require.NoError(t, err, "clear backfill ledger") + + // Wire cfg so repairResetEmbeddings opens the SAME vector backend the real + // repair command would (this open triggers the one-time backfill). + savedCfg := cfg + t.Cleanup(func() { cfg = savedCfg }) + cfg = &config.Config{} + cfg.Data.DataDir = dir + cfg.Vector.Enabled = true + cfg.Vector.DBPath = vecPath + cfg.Vector.Embeddings.Dimension = 4 + + // Drive the real repair embed-fixup flow on message 1: open backend (runs + + // marks the backfill) → ResetEmbedGen([1]) → lower watermark → close. + require.NoError(t, repairResetEmbeddings(ctx, s, []int64{1}), + "repairResetEmbeddings") + + // FIX A assertion: the repaired message must end embed_gen IS NULL — the + // re-embed request survives. With the OLD reset-before-open ordering, the + // backfill (running on the open that lowers the watermark) would re-stamp + // it back to =active here. 
+ _, isNull1 := readEmbedGen(t, s.DB(), 1) + assert.True(t, isNull1, + "repaired message 1 must end embed_gen=NULL so scan-and-fill re-embeds it") + + // Sanity: message 2 was NOT repaired, so the backfill legitimately stamps it + // back to the active generation (it stays "covered"). + v2, isNull2 := readEmbedGen(t, s.DB(), 2) + assert.False(t, isNull2, "unrepaired message 2 must be re-stamped by the backfill") + assert.Equal(t, int64(gen), v2, "message 2 embed_gen = active generation") +} + +// TestRepairResetEmbeddings_VectorDisabledStillResets pins the !Enabled branch: +// with vector search disabled, repairResetEmbeddings opens no backend (no +// backfill) but still clears embed_gen so the column is consistent. No error. +func TestRepairResetEmbeddings_VectorDisabledStillResets(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open") + t.Cleanup(func() { _ = s.Close() }) + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type, embed_gen) +VALUES (1, 1, 1, 'm1', 'email', 7); +`) + require.NoError(t, err, "seed message with embed_gen set") + + savedCfg := cfg + t.Cleanup(func() { cfg = savedCfg }) + cfg = &config.Config{} + cfg.Data.DataDir = dir + cfg.Vector.Enabled = false + + require.NoError(t, repairResetEmbeddings(ctx, s, []int64{1}), + "repairResetEmbeddings (vector disabled)") + + _, isNull := readEmbedGen(t, s.DB(), 1) + assert.True(t, isNull, "embed_gen must be cleared even when vector search is disabled") +} diff --git a/cmd/msgvault/cmd/search_vector.go b/cmd/msgvault/cmd/search_vector.go index 98ab51af1..96cf957bb 100644 --- 
a/cmd/msgvault/cmd/search_vector.go +++ b/cmd/msgvault/cmd/search_vector.go @@ -71,6 +71,11 @@ func runHybridSearch(cmd *cobra.Command, queryStr, mode string, explain bool, sc return fmt.Errorf("open main db: %w", err) } mainDB = st.DB() + // store.Open returns a WRITABLE connection (not OpenReadOnly), so + // ReadOnly stays false: Open runs the full migrate + one-time upgrade + // backfill, exactly as before. This CLI search path is user-facing + // read-only in intent but the connection can write, so it correctly + // behaves like serve's writable open. pgb, err := pgvector.Open(ctx, pgvector.Options{ DB: mainDB, Dimension: cfg.Vector.Embeddings.Dimension, diff --git a/cmd/msgvault/cmd/serve.go b/cmd/msgvault/cmd/serve.go index 33b8ccafb..fa1a41d8d 100644 --- a/cmd/msgvault/cmd/serve.go +++ b/cmd/msgvault/cmd/serve.go @@ -104,7 +104,7 @@ func runServe(cmd *cobra.Command, args []string) error { // Build optional vector-search components. Returns (nil, nil) when // cfg.Vector.Enabled is false, or an error when enabled but the // binary was built without -tags sqlite_vec. - vf, err := setupVectorFeatures(ctx, s.DB(), dbPath, false) + vf, err := setupVectorFeatures(ctx, s, dbPath, false) if err != nil { return fmt.Errorf("vector features: %w", err) } @@ -158,11 +158,13 @@ func runServe(cmd *cobra.Command, args []string) error { getOAuthMgr := oauthManagerCache() - // Create sync function for the scheduler. vf is captured and used - // inside runScheduledSync to wire the embed enqueuer into each - // per-run Syncer; it is nil when vector search is disabled. + // Create sync function for the scheduler. Under scan-and-fill the + // Syncer no longer needs an enqueuer — newly-ingested messages get + // embed_gen = NULL by column default and the embed worker (wired + // separately below from vf) discovers them on its next run, so the + // sync path no longer threads vf. 
syncFunc := func(ctx context.Context, email string) error { - return runScheduledSync(ctx, email, s, getOAuthMgr, vf) + return runScheduledSync(ctx, email, s, getOAuthMgr) } // Create and configure scheduler @@ -199,12 +201,12 @@ func runServe(cmd *cobra.Command, args []string) error { // Only when vector search is enabled and wired. if vf != nil { embedJob := &scheduler.EmbedJob{ - Worker: vf.Worker, - Backend: vf.Backend, - VectorsDB: vf.VectorsDB, - Rebind: vf.Rebind, - Fingerprint: vf.Cfg.GenerationFingerprint(), - Log: logger, + Worker: vf.Worker, + Backend: vf.Backend, + Store: s, + Fingerprint: vf.Cfg.GenerationFingerprint(), + BackstopInterval: vf.Cfg.Embed.BackstopInterval, + Log: logger, } schedule := cfg.Vector.Embed.Schedule.Cron if err := sched.SetEmbedJob( @@ -392,14 +394,15 @@ func (a *schedulerAdapter) Status() []api.AccountStatus { // dispatch is by source_type: Gmail accounts run an incremental sync // using the Gmail History API; IMAP accounts run a full sync (already // deduplicated by message-id at the store layer, since IMAP has no -// equivalent history API). When vf is non-nil (vector search enabled), -// the Syncer is configured to enqueue newly-ingested message IDs into -// the embedding pipeline so subsequent embed runs pick them up. +// equivalent history API). Under scan-and-fill there is no enqueue step +// — newly-ingested messages get embed_gen = NULL by column default, so +// subsequent embed runs discover and pick them up by scanning; the sync +// path therefore needs no vector-feature wiring. // // The identifier passed in is whatever the scheduler holds — for // Gmail this is the email address, for IMAP it's the full // `imaps://user@host:port` URL recorded by `add-imap`. 
-func runScheduledSync(ctx context.Context, identifier string, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), vf *vectorFeatures) error { +func runScheduledSync(ctx context.Context, identifier string, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error)) error { logger.Info("starting scheduled sync", "identifier", identifier) startTime := time.Now() @@ -425,9 +428,9 @@ func runScheduledSync(ctx context.Context, identifier string, s *store.Store, ge ) switch sourceType { case sourceTypeGmail: - summary, err = runScheduledGmailSync(ctx, identifier, src, s, getOAuthMgr, vf) + summary, err = runScheduledGmailSync(ctx, identifier, src, s, getOAuthMgr) case sourceTypeIMAP: - summary, err = runScheduledIMAPSync(ctx, src, s, vf) + summary, err = runScheduledIMAPSync(ctx, src, s) default: return fmt.Errorf("source %q has type %q which is not supported by the daemon scheduler (only gmail and imap)", identifier, sourceType) } @@ -484,7 +487,7 @@ func findScheduledSyncSource(s *store.Store, identifier string) (*store.Source, // getTokenSourceWithReauth) because serve runs as a daemon and cannot // open a browser for OAuth — the error path tells the user how to // re-authorize from a terminal. 
-func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), vf *vectorFeatures) (*gmail.SyncSummary, error) { +func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error)) (*gmail.SyncSummary, error) { appName := "" if src != nil { appName = sourceOAuthApp(src) @@ -534,9 +537,6 @@ func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, opts.AttachmentsDir = cfg.AttachmentsDir() syncer := sync.New(client, s, opts).WithLogger(logger) - if vf != nil { - syncer.SetEmbedEnqueuer(vf.Enqueuer) - } source, err := s.GetOrCreateSource(sourceTypeGmail, email) if err != nil { @@ -563,7 +563,7 @@ func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, // the store to dedupe by message-id. NoResume is forced on because // IMAP page tokens are numeric offsets that don't survive across // processes (see syncfull.go). 
-func runScheduledIMAPSync(ctx context.Context, src *store.Source, s *store.Store, vf *vectorFeatures) (*gmail.SyncSummary, error) { +func runScheduledIMAPSync(ctx context.Context, src *store.Source, s *store.Store) (*gmail.SyncSummary, error) { apiClient, err := buildAPIClient(ctx, src, nil, nil) if err != nil { return nil, fmt.Errorf("build IMAP client: %w", err) @@ -576,9 +576,6 @@ func runScheduledIMAPSync(ctx context.Context, src *store.Source, s *store.Store opts.NoResume = true syncer := sync.New(apiClient, s, opts).WithLogger(logger) - if vf != nil { - syncer.SetEmbedEnqueuer(vf.Enqueuer) - } // runPostSourceCreateMigrations is keyed off Gmail-only legacy // state, so it's a no-op for fresh IMAP installs; we still call it diff --git a/cmd/msgvault/cmd/serve_test.go b/cmd/msgvault/cmd/serve_test.go index b48e7d797..98f94ab9f 100644 --- a/cmd/msgvault/cmd/serve_test.go +++ b/cmd/msgvault/cmd/serve_test.go @@ -284,7 +284,7 @@ func TestRunScheduledIMAPSync_NoCredentials(t *testing.T) { return nil, nil //nolint:nilnil // unreachable guard, see comment above } - err = runScheduledSync(context.Background(), imapID, s, getOAuthMgr, nil) + err = runScheduledSync(context.Background(), imapID, s, getOAuthMgr) require.Error(err, "runScheduledSync(imap, no creds) want credentials error") msg := err.Error() assert.False(strings.Contains(msg, "refresh token") || strings.Contains(msg, "token may be expired"), @@ -330,7 +330,7 @@ func TestRunScheduledIMAPSync_DispatchByDisplayName(t *testing.T) { // Pass the email (as config.toml `email = "..."` would supply it), // not the imaps:// identifier. Dispatch must still land on the // IMAP path; absence of credentials produces an IMAP-shaped error. 
- err = runScheduledSync(context.Background(), imapEmail, s, getOAuthMgr, nil) + err = runScheduledSync(context.Background(), imapEmail, s, getOAuthMgr) require.Error(err, "runScheduledSync(email, no creds) want IMAP credentials error") msg := err.Error() assert.False(strings.Contains(msg, "refresh token") || strings.Contains(msg, "token may be expired"), @@ -381,7 +381,7 @@ func TestRunScheduledIMAPSync_DefaultIdentityIsDisplayName(t *testing.T) { // Expected to fail at the IMAP connection; what matters is that // confirmDefaultIdentity ran first with the display_name. - _ = runScheduledSync(context.Background(), imapID, s, getOAuthMgr, nil) + _ = runScheduledSync(context.Background(), imapID, s, getOAuthMgr) identities, err := s.ListAccountIdentities(src.ID) require.NoError(err, "ListAccountIdentities") diff --git a/cmd/msgvault/cmd/serve_vector.go b/cmd/msgvault/cmd/serve_vector.go index a6a217085..6be2e5e8c 100644 --- a/cmd/msgvault/cmd/serve_vector.go +++ b/cmd/msgvault/cmd/serve_vector.go @@ -16,40 +16,53 @@ import ( "go.kenn.io/msgvault/internal/vector/sqlitevec" ) -// setupVectorFeatures builds the vector backend, hybrid engine, embed -// worker, and enqueuer used by the serve daemon and the MCP command. The -// backend is dialect-selected from mainPath: a postgres:// DSN uses the -// pgvector backend sharing mainDB (no separate vectors.db, no ATTACH); +// setupVectorFeatures builds the vector backend, hybrid engine, and embed +// worker used by the serve daemon and the MCP command. The backend is +// dialect-selected from mainPath: a postgres:// DSN uses the pgvector +// backend sharing mainStore's DB (no separate vectors.db, no ATTACH); // otherwise the sqlitevec backend opens/attaches vectors.db. Returns // (nil, nil) when cfg.Vector.Enabled is false. The returned Close function // must be called on shutdown. // -// mainDB is the already-opened handle to the main database. 
On SQLite, -// mainPath is the msgvault.db filesystem path FusedSearch uses to ATTACH +// mainStore is the already-opened main-database store. On SQLite, mainPath +// is the msgvault.db filesystem path FusedSearch uses to ATTACH // vectors.db; on PostgreSQL it is the DSN, used only for dialect detection // (store.IsPostgresURL). // -// readOnly skips schema migration on the PostgreSQL backend -// (pgvector.Options.SkipMigrate); set it true when mainDB is a read-only -// connection — e.g. the MCP server — so CREATE EXTENSION / DDL are not -// attempted (PostgreSQL rejects them with SQLSTATE 25006). Ignored on -// SQLite. -func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, readOnly bool) (*vectorFeatures, error) { +// readOnly marks mainStore's DB as a read-only connection — e.g. the MCP server's +// store.OpenReadOnly. On PostgreSQL it sets BOTH pgvector.Options.SkipMigrate +// and pgvector.Options.ReadOnly: SkipMigrate suppresses the privileged +// CREATE EXTENSION + full migrate, and ReadOnly suppresses ALL remaining +// writes — the extension-less schema apply, the orphan reset, and the +// embed_gen backfill — because PG vector tables share the (read-only) main +// connection and any DDL/UPDATE would be rejected with SQLSTATE 25006. On +// SQLite it sets sqlitevec.Options.ReadOnly so only the one-time embed_gen +// upgrade backfill — which WRITES messages.embed_gen + applied_migrations +// through the main handle — is skipped (the query-only handle would reject +// those writes); Migrate still runs there because it only touches the +// separate vectors.db, which is read-write regardless.
+func setupVectorFeatures(ctx context.Context, mainStore *store.Store, mainPath string, readOnly bool) (*vectorFeatures, error) { if !cfg.Vector.Enabled { return nil, nil //nolint:nilnil // vector disabled: callers nil-check vf; (nil, nil) means "no features, no error" } if err := cfg.Vector.Validate(); err != nil { return nil, fmt.Errorf("vector config: %w", err) } + mainDB := mainStore.DB() - // Resolve the dialect once from the main DSN. The queue, worker, and - // enqueuer are dialect-portable via Rebind / InsertOrIgnore, so the - // serve daemon and MCP run vector features on PostgreSQL the same way - // `msgvault embed` does. SQLite's Rebind / InsertOrIgnore are identity - // so the SQLite path is unchanged. + // Resolve the dialect once from the main DSN. The worker is + // dialect-portable via Rebind, so the serve daemon and MCP run vector + // features on PostgreSQL the same way `msgvault embed` does. SQLite's + // Rebind is identity so the SQLite path is unchanged. var dialect store.Dialect = &store.SQLiteDialect{} + // lastModifiedExpr is the dialect-correct SELECT expression for the embed + // worker's last_modified CAS token. SQLite needs CAST(... AS TEXT) to + // defeat go-sqlite3's DATETIME→time.Time coercion (which would break + // round-trip equality); PG uses the bare column. + lastModifiedExpr := "CAST(m.last_modified AS TEXT)" if store.IsPostgresURL(mainPath) { dialect = &store.PostgreSQLDialect{} + lastModifiedExpr = "m.last_modified" } var ( @@ -64,6 +77,11 @@ func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, r DB: mainDB, Dimension: cfg.Vector.Embeddings.Dimension, SkipMigrate: readOnly, + // ReadOnly MUST track readOnly here: this is the MCP read-only + // path (store.OpenReadOnly). When set, Open performs no writes — + // no schema apply, no orphan reset, no upgrade backfill — so the + // query-only connection never attempts DDL/UPDATE (SQLSTATE 25006). 
+ ReadOnly: readOnly, // On a managed/locked-down PG the `vector` extension is // pre-installed by an admin and CREATE EXTENSION would fail // for the msgvault role; SkipExtensionCreate lets schema + @@ -89,6 +107,11 @@ func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, r MainPath: mainPath, Dimension: cfg.Vector.Embeddings.Dimension, MainDB: mainDB, + // Honor the read-only signal on SQLite too: when mainDB is a + // query-only handle (MCP), skip the embed_gen upgrade backfill, + // which would write through it. Migrate still runs (vectors.db + // is read-write). + ReadOnly: readOnly, }) if err != nil { return nil, fmt.Errorf("open vectors.db: %w", err) @@ -111,6 +134,7 @@ func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, r Backend: backend, VectorsDB: vectorsDB, MainDB: mainDB, + Store: mainStore, Client: client, Preprocess: embed.PreprocessConfig{ StripQuotes: cfg.Vector.Preprocess.StripQuotesEnabled(), @@ -120,14 +144,13 @@ func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, r StripURLTracking: cfg.Vector.Preprocess.StripURLTrackingEnabled(), CollapseWhitespace: cfg.Vector.Preprocess.CollapseWhitespaceEnabled(), }, - MaxInputChars: cfg.Vector.Embeddings.MaxInputChars, - BatchSize: cfg.Vector.Embeddings.BatchSize, - EmbedTimeout: cfg.Vector.Embeddings.Timeout, - EmbedMaxRetries: cfg.Vector.Embeddings.MaxRetries, - // Rebind makes the worker's queue + body-fetch SQL run on pgx. + MaxInputChars: cfg.Vector.Embeddings.MaxInputChars, + BatchSize: cfg.Vector.Embeddings.BatchSize, + // Rebind makes the worker's body-fetch + watermark SQL run on pgx. // SQLiteDialect.Rebind is identity, so the SQLite path is unchanged. 
- Rebind: dialect.Rebind, - Log: logger, + Rebind: dialect.Rebind, + LastModifiedExpr: lastModifiedExpr, + Log: logger, }) engine := hybrid.NewEngine(backend, mainDB, client, hybrid.Config{ @@ -142,19 +165,14 @@ func setupVectorFeatures(ctx context.Context, mainDB *sql.DB, mainPath string, r Rebind: dialect.Rebind, }) - // The enqueuer drives sync-time enqueueing into pending_embeddings. - // On PG it must run on pgx (rebind ? → $N) and use ON CONFLICT DO - // NOTHING (insertOrIgnore) instead of SQLite's INSERT OR IGNORE. - enqueuer := embed.NewEnqueuer(vectorsDB, dialect.Rebind, dialect.InsertOrIgnore) + // No sync-time enqueue: newly-persisted messages get embed_gen = NULL + // by column default and the scan-and-fill worker picks them up. return &vectorFeatures{ Backend: backend, HybridEngine: engine, - Enqueuer: enqueuer, Worker: worker, Cfg: cfg.Vector, - VectorsDB: vectorsDB, - Rebind: dialect.Rebind, Close: closeFn, }, nil } diff --git a/cmd/msgvault/cmd/serve_vector_nopg_test.go b/cmd/msgvault/cmd/serve_vector_nopg_test.go index 109f15f58..e74ca7c77 100644 --- a/cmd/msgvault/cmd/serve_vector_nopg_test.go +++ b/cmd/msgvault/cmd/serve_vector_nopg_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/store" ) // TestSetupVectorFeatures_PostgresWithoutPgvectorTag verifies that when @@ -30,7 +31,14 @@ func TestSetupVectorFeatures_PostgresWithoutPgvectorTag(t *testing.T) { cfg.Vector.Embeddings.Dimension = 768 cfg.Vector.Embeddings.BatchSize = 32 - _, err := setupVectorFeatures(context.Background(), nil, "postgres://user@host/db", false) + // setupVectorFeatures needs a non-nil *store.Store (it reads + // store.DB()); an in-memory SQLite store suffices — the PG branch is + // selected from the DSN and fails at the pgvector stub. 
+ st, err := store.Open(":memory:") + require.NoError(t, err, "store.Open") + t.Cleanup(func() { _ = st.Close() }) + + _, err = setupVectorFeatures(context.Background(), st, "postgres://user@host/db", false) require.Error(t, err, "setupVectorFeatures with postgres DSN and no pgvector tag") // Must come from the stub, not the removed up-front refusal. assert.Contains(t, err.Error(), "pgvector support not compiled in", diff --git a/cmd/msgvault/cmd/serve_vector_pg_test.go b/cmd/msgvault/cmd/serve_vector_pg_test.go index 151d10316..e08bc2cee 100644 --- a/cmd/msgvault/cmd/serve_vector_pg_test.go +++ b/cmd/msgvault/cmd/serve_vector_pg_test.go @@ -15,6 +15,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/store" ) // openServePGSchema creates an isolated per-test schema on @@ -64,12 +65,20 @@ func openServePGSchema(t *testing.T) (*sql.DB, string) { // TestSetupVectorFeatures_SucceedsOnPostgres is the inverse of the old // refusal test: with the pgvector backend compiled in, setupVectorFeatures // must succeed against a postgres:// DSN and wire up the backend, hybrid -// engine, and enqueuer. Runs only with a live PG (MSGVAULT_TEST_DB). +// engine, and worker. Runs only with a live PG (MSGVAULT_TEST_DB). func TestSetupVectorFeatures_SucceedsOnPostgres(t *testing.T) { savedCfg := cfg defer func() { cfg = savedCfg }() - db, dsn := openServePGSchema(t) + _, dsn := openServePGSchema(t) + + // setupVectorFeatures now takes a *store.Store; open one over the + // schema-scoped DSN (which also runs the main-schema init). 
+ st, err := store.Open(dsn) + require.NoError(t, err, "store.Open") + t.Cleanup(func() { _ = st.Close() }) + require.NoError(t, st.InitSchema(), "InitSchema") + db := st.DB() cfg = &config.Config{} cfg.Vector.Enabled = true @@ -79,7 +88,7 @@ func TestSetupVectorFeatures_SucceedsOnPostgres(t *testing.T) { cfg.Vector.Embeddings.Dimension = 768 cfg.Vector.Embeddings.BatchSize = 32 - vf, err := setupVectorFeatures(context.Background(), db, dsn, false) + vf, err := setupVectorFeatures(context.Background(), st, dsn, false) require.NoError(t, err, "setupVectorFeatures on postgres DSN must succeed with pgvector built in") require.NotNil(t, vf, "vectorFeatures") t.Cleanup(func() { @@ -90,15 +99,13 @@ func TestSetupVectorFeatures_SucceedsOnPostgres(t *testing.T) { assert.NotNil(t, vf.Backend, "Backend wired") assert.NotNil(t, vf.HybridEngine, "HybridEngine wired") - assert.NotNil(t, vf.Enqueuer, "Enqueuer wired") assert.NotNil(t, vf.Worker, "Worker wired") - assert.Same(t, db, vf.VectorsDB, "PG shares the main DB handle as the vectors DB") - // The pgvector schema was migrated into the isolated schema, so the - // enqueuer/worker can run against it. Smoke-test that the tables exist. + // The pgvector schema was migrated into the isolated schema. Smoke-test + // that the embedding tables exist. 
var n int require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM index_generations`).Scan(&n), "index_generations must exist after setupVectorFeatures migrated the pgvector schema") - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&n), - "pending_embeddings must exist after setupVectorFeatures migrated the pgvector schema") + require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM embed_watermark`).Scan(&n), + "embed_watermark must exist after setupVectorFeatures migrated the pgvector schema") } diff --git a/cmd/msgvault/cmd/serve_vector_stub.go b/cmd/msgvault/cmd/serve_vector_stub.go index 5f881df4f..90dc4cd02 100644 --- a/cmd/msgvault/cmd/serve_vector_stub.go +++ b/cmd/msgvault/cmd/serve_vector_stub.go @@ -4,7 +4,6 @@ package cmd import ( "context" - "database/sql" "errors" "go.kenn.io/msgvault/internal/store" @@ -14,7 +13,7 @@ import ( // (nil, nil) when vector search is disabled, and a descriptive error // when the user enabled vector search in config but built the binary // without -tags sqlite_vec. -func setupVectorFeatures(_ context.Context, _ *sql.DB, mainPath string, _ bool) (*vectorFeatures, error) { +func setupVectorFeatures(_ context.Context, _ *store.Store, mainPath string, _ bool) (*vectorFeatures, error) { if !cfg.Vector.Enabled { return nil, nil //nolint:nilnil // vector disabled: callers nil-check vf; (nil, nil) means "no features, no error" } diff --git a/cmd/msgvault/cmd/sync.go b/cmd/msgvault/cmd/sync.go index 460b8ff03..eb028dc83 100644 --- a/cmd/msgvault/cmd/sync.go +++ b/cmd/msgvault/cmd/sync.go @@ -69,19 +69,10 @@ Examples: cancel() }() - // Open vector backend (optional) so newly-ingested messages - // are enqueued for embedding. 
- vf, err := setupVectorFeatures(ctx, s.DB(), dbPath, false) - if err != nil { - return fmt.Errorf("vector features: %w", err) - } - defer func() { - if vf != nil && vf.Close != nil { - if closeErr := vf.Close(); closeErr != nil { - logger.Warn("closing vectors.db failed", "error", closeErr) - } - } - }() + // Embedding is no longer driven by sync: newly-ingested messages + // get embed_gen = NULL by column default and the scan-and-fill + // embed worker (msgvault embeddings build / the serve daemon) + // picks them up. getOAuthMgr := oauthManagerCache() @@ -180,7 +171,7 @@ Examples: break } fmt.Printf("Note: IMAP account %s does not support incremental sync. Running full sync.\n\n", src.Identifier) - if err := runFullSync(ctx, s, getOAuthMgr, src, vf); err != nil { + if err := runFullSync(ctx, s, getOAuthMgr, src); err != nil { syncErrors = append(syncErrors, fmt.Sprintf("%s: %v", src.Identifier, err)) } } @@ -194,7 +185,7 @@ Examples: syncErrors = append(syncErrors, target.email+": no source found - run 'sync-full' first") continue } - if err := runIncrementalSync(ctx, s, getOAuthMgr, target.source, vf); err != nil { + if err := runIncrementalSync(ctx, s, getOAuthMgr, target.source); err != nil { syncErrors = append(syncErrors, fmt.Sprintf("%s: %v", target.email, err)) continue } @@ -217,7 +208,7 @@ Examples: }, } -func runIncrementalSync(ctx context.Context, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), source *store.Source, vf *vectorFeatures) error { +func runIncrementalSync(ctx context.Context, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), source *store.Source) error { if !source.SyncCursor.Valid || source.SyncCursor.String == "" { return errors.New("no history ID - run 'sync-full' first") } @@ -265,9 +256,6 @@ func runIncrementalSync(ctx context.Context, s *store.Store, getOAuthMgr func(st syncer := sync.New(client, s, opts). WithLogger(logger). 
WithProgress(&CLIProgress{}) - if vf != nil { - syncer.SetEmbedEnqueuer(vf.Enqueuer) - } // Run incremental sync startTime := time.Now() diff --git a/cmd/msgvault/cmd/syncfull.go b/cmd/msgvault/cmd/syncfull.go index 5ab78fd75..e5909299d 100644 --- a/cmd/msgvault/cmd/syncfull.go +++ b/cmd/msgvault/cmd/syncfull.go @@ -173,19 +173,10 @@ Examples: cancel() }() - // Open vector backend (optional) so newly-ingested messages - // are enqueued for embedding. - vf, err := setupVectorFeatures(ctx, s.DB(), dbPath, false) - if err != nil { - return fmt.Errorf("vector features: %w", err) - } - defer func() { - if vf != nil && vf.Close != nil { - if closeErr := vf.Close(); closeErr != nil { - logger.Warn("closing vectors.db failed", "error", closeErr) - } - } - }() + // Embedding is no longer driven by sync: newly-ingested messages + // get embed_gen = NULL by column default and the scan-and-fill + // embed worker (msgvault embeddings build / the serve daemon) + // picks them up. for _, src := range sources { if ctx.Err() != nil { @@ -203,7 +194,7 @@ Examples: } } - if err := runFullSync(ctx, s, getOAuthMgr, src, vf); err != nil { + if err := runFullSync(ctx, s, getOAuthMgr, src); err != nil { syncErrors = append(syncErrors, fmt.Sprintf("%s: %v", src.Identifier, err)) continue } @@ -331,7 +322,7 @@ func buildAPIClient(ctx context.Context, src *store.Source, getOAuthMgr func(str } } -func runFullSync(ctx context.Context, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), src *store.Source, vf *vectorFeatures) error { +func runFullSync(ctx context.Context, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error), src *store.Source) error { apiClient, err := buildAPIClient(ctx, src, getOAuthMgr, nil) if err != nil { return err @@ -371,9 +362,6 @@ func runFullSync(ctx context.Context, s *store.Store, getOAuthMgr func(string) ( syncer := sync.New(apiClient, s, opts). WithLogger(logger). 
WithProgress(&CLIProgress{}) - if vf != nil { - syncer.SetEmbedEnqueuer(vf.Enqueuer) - } // Run sync startTime := time.Now() diff --git a/cmd/msgvault/cmd/vector_features.go b/cmd/msgvault/cmd/vector_features.go index 561a2f200..67ec6525b 100644 --- a/cmd/msgvault/cmd/vector_features.go +++ b/cmd/msgvault/cmd/vector_features.go @@ -1,9 +1,6 @@ package cmd import ( - "database/sql" - - "go.kenn.io/msgvault/internal/sync" "go.kenn.io/msgvault/internal/vector" "go.kenn.io/msgvault/internal/vector/embed" "go.kenn.io/msgvault/internal/vector/hybrid" @@ -12,29 +9,21 @@ import ( // vectorFeatures carries the optional vector-search components that the // serve, mcp, sync, and sync-full commands wire into their servers and // sync pipelines. It is populated only when cfg.Vector.Enabled is true -// AND the binary is built with -tags sqlite_vec; otherwise -// setupVectorFeatures returns (nil, nil) or a clear error. +// AND the binary is built with a vector backend tag (sqlite_vec or +// pgvector); otherwise setupVectorFeatures returns (nil, nil) or a clear +// error. // // When non-nil, all fields are populated (invariant enforced by // setupVectorFeatures). Callers only need to nil-check vf itself. type vectorFeatures struct { Backend vector.Backend HybridEngine *hybrid.Engine - Enqueuer sync.EmbedEnqueuer Worker *embed.Worker Cfg vector.Config - // VectorsDB is the underlying vectors.db handle. The daemon's - // EmbedJob uses it to count pending_embeddings for the - // activation gate; other consumers should prefer the higher- - // level Backend abstraction. - VectorsDB *sql.DB - // Rebind translates ?-placeholders to the driver's native form for - // raw queries run directly against VectorsDB (the daemon's EmbedJob - // activation-gate count). Identity on SQLite, PostgreSQLDialect.Rebind - // on PG. Callers that issue raw SQL against VectorsDB must apply it. - Rebind func(string) string - // Close releases the underlying vectors.db handle. 
Every caller - // that receives a non-nil vectorFeatures must invoke Close during - // shutdown so WAL checkpoints complete. + // Close releases the backend's resources: on SQLite it closes the + // vectors.db handle (so WAL checkpoints complete); on PostgreSQL it is + // a no-op because the pgvector backend shares the main store's handle, + // which is owned and closed elsewhere. Every caller that receives a + // non-nil vectorFeatures must invoke Close during shutdown. Close func() error } diff --git a/docs/api-server.md b/docs/api-server.md index 16d020ebe..5c0b3ba2b 100644 --- a/docs/api-server.md +++ b/docs/api-server.md @@ -109,15 +109,19 @@ the state of the index. "started_at": "2026-04-19T09:02:10Z", "progress": { "done": 8200, "total": 142857 } }, - "pending_embeddings_total": 134657 + "missing_embeddings_total": 134657 } } ``` `active_generation` is always present in the object (null until the first build completes). `building_generation` is omitted when no -rebuild is in flight. `pending_embeddings_total` is the sum of rows -still pending across the active and building generations. See +rebuild is in flight. `missing_embeddings_total` reports live messages +still needing embedding for the generation the worker will target next: +the building generation while a rebuild is in flight, otherwise the active +generation. During a rebuild the old active generation keeps serving vector +and hybrid search, but active-generation top-ups are frozen until the +building generation activates. See [Vector Search](/usage/vector-search/) for the end-to-end workflow. --- diff --git a/docs/changelog.md b/docs/changelog.md index d5a9e3500..90a5bdcce 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -123,9 +123,9 @@ All notable changes to msgvault, grouped by release. **New features** - **Vector search (semantic and hybrid).** msgvault can now embed your archive using a configured OpenAI-compatible embedding endpoint (Ollama, llama.cpp `server`, LM Studio, etc.) 
and search it by meaning, not just keywords. `msgvault search --mode vector` runs pure semantic search; `--mode hybrid` fuses BM25 and vector similarity via Reciprocal Rank Fusion. Exposed through local CLI search (`msgvault search`), the HTTP API (`GET /api/v1/search?mode=vector|hybrid`), and the MCP server (`search_messages` mode argument plus a new `find_similar_messages` tool). See [Vector Search](/usage/vector-search/). -- `msgvault build-embeddings` command to generate and maintain the local vector index. Incremental by default; `--full-rebuild` creates a new generation and atomically activates it once pending work drains. Same-model rebuilds keep answering against the previous active generation while the new one is built; model or dimension changes return `index_stale` until activation. +- `msgvault build-embeddings` command to generate and maintain the local vector index. Incremental by default; `--full-rebuild` creates a new generation and atomically activates it once coverage is complete. Same-model rebuilds keep answering against the previous active generation while the new one is built, with active-generation top-ups frozen until activation; model or dimension changes return `index_stale` until activation. - Background embedding via the daemon scheduler. A new `[vector.embed.schedule]` config block drives the embed worker on cron and/or after every successful scheduled sync, so `msgvault serve` can keep the vector index current without manual intervention. -- `/api/v1/stats` gains a `vector_search` sub-object reporting the active generation, any in-flight rebuild, and the total pending embedding queue depth. +- `/api/v1/stats` gains a `vector_search` sub-object reporting the active generation, any in-flight rebuild, and the actionable missing embedding count for the generation the worker will target next. - `msgvault rebuild-fts` command to rebuild the SQLite FTS5 shadow table after corruption. 
**Improvements** diff --git a/docs/cli-reference.md b/docs/cli-reference.md index dd9f692fb..9601869be 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -659,7 +659,7 @@ msgvault embeddings [flags] | Subcommand | Description | |---|---| | `build` | Build or update the index. Incremental by default; `--full-rebuild` starts a new generation. | -| `resume` | Drain the pending queue for the building or active generation. Always incremental. | +| `resume` | Continue scan-and-fill embedding for the building or active generation. Always incremental. | | `list` | List index generations with their state, model, dimension, and pending count. | | `activate ` | Activate a completed building generation, retiring the current active one. | | `retire ` | Retire a generation. | @@ -672,10 +672,10 @@ msgvault embeddings build [flags] | Flag | Description | |---|---| -| `--full-rebuild` | Create a new index generation and rebuild from scratch. The new generation is activated atomically once pending work drains. Same-model rebuilds keep serving the previous active generation in the meantime; model or dimension changes return `index_stale` for vector/hybrid search until the new generation activates. | +| `--full-rebuild` | Create a new index generation and rebuild from scratch. The new generation is activated atomically once coverage is complete. Same-model rebuilds keep serving the previous active generation in the meantime, but active-generation top-ups are frozen until activation; model or dimension changes return `index_stale` for vector/hybrid search until the new generation activates. | | `--yes` | Skip the confirmation prompt that `--full-rebuild` otherwise requires. | -Without `--full-rebuild`, the command is incremental: it resumes any in-flight rebuild that matches the configured model, otherwise drains the pending queue for the active generation, then exits. Safe to schedule via cron (or let `msgvault serve` do it via `[vector.embed.schedule]`). 
+Without `--full-rebuild`, the command is incremental: it resumes any in-flight rebuild that matches the configured model, otherwise scans for live messages still missing coverage in the active generation, then exits. Safe to schedule via cron (or let `msgvault serve` do it via `[vector.embed.schedule]`). ### embeddings resume @@ -683,7 +683,7 @@ Without `--full-rebuild`, the command is incremental: it resumes any in-flight r msgvault embeddings resume ``` -Drain the pending queue and finish the current generation. If a generation matching the configured model is building, this embeds its remaining rows and activates it once the queue reaches zero; otherwise it tops up the active generation. Equivalent to `msgvault embeddings build` with no flags, but never starts a full rebuild. +Continue embedding work and finish the current generation. If a generation matching the configured model is building, this embeds its remaining rows and activates it once coverage is complete; otherwise it tops up the active generation. Equivalent to `msgvault embeddings build` with no flags, but never starts a full rebuild. ### embeddings list @@ -699,12 +699,12 @@ Print one row per index generation: ID, state (`building`, `active`, or `retired msgvault embeddings activate [flags] ``` -Activate a completed building generation and retire the currently active one. By default this refuses to activate a generation that still has pending rows, has not finished seeding, or whose fingerprint does not match the current config. +Activate a completed building generation and retire the currently active one. By default this refuses to activate a generation that still has messages missing coverage or whose fingerprint does not match the current config. | Flag | Description | |---|---| | `--yes` | Skip the confirmation prompt. | -| `--force` | Activate even with pending rows or a fingerprint mismatch. | +| `--force` | Activate even with missing coverage or a fingerprint mismatch. 
| ### embeddings retire diff --git a/docs/usage/vector-search.md b/docs/usage/vector-search.md index 1801e00e4..3a3d64608 100644 --- a/docs/usage/vector-search.md +++ b/docs/usage/vector-search.md @@ -28,8 +28,8 @@ on your own machine or network. !!! note Vector indexing operates over msgvault's shared `messages` table. A full rebuild embeds every non-deleted message row, including imported - chat messages. Chat import commands do not auto-enqueue new rows into - an existing vector generation, so run + chat messages. Chat import commands do not run the embed worker after + import, so run `msgvault embeddings build --full-rebuild --yes` after importing local files or chat/text data if you want those messages in the vector index. Chat-specific preprocessing and ranking are not @@ -200,18 +200,18 @@ messages, embed it: msgvault embeddings build --full-rebuild --yes ``` -This creates a new **building generation**, seeds the pending queue -with every non-deleted message in your archive, drains the queue in -batches through your configured embedder, and atomically activates -the generation once every pending row has been embedded. During the +This creates a new **building generation**, scans every non-deleted +message in your archive, embeds missing rows in batches through your +configured embedder, and atomically activates the generation once +coverage is complete. During the first build, when no active generation exists yet, HTTP and MCP vector/hybrid search return `index_building`; use `mode=fts` for the interim. !!! tip You can interrupt and resume. Each invocation of `msgvault embeddings build` - processes whatever is currently pending and activates the generation - when the queue reaches zero. `Ctrl+C` is safe; run `msgvault embeddings build` + scans for messages still missing coverage and activates the generation + when coverage is complete. `Ctrl+C` is safe; run `msgvault embeddings build` again and it picks up from where it left off. 
The initial embed is the largest and longest operation. Runtime is @@ -231,26 +231,28 @@ on how you run it. ### CLI workflow (manual syncs) If you run `msgvault sync-full` or `msgvault sync` (alias: -`sync-incremental`) by hand, new Gmail and IMAP messages are -**auto-enqueued** into every -non-retired generation during the sync. In steady state that means the -active generation; during a rebuild it means both the old active -generation and the new building generation. Run -`msgvault embeddings build` (no `--full-rebuild`) to drain the queue: +`sync-incremental`) by hand, new Gmail and IMAP messages persist with +`embed_gen = NULL`. In steady state, `msgvault embeddings build` scans +those rows and tops up the active generation. During a rebuild, the +worker targets the building generation first so it can activate; the old +active generation keeps serving vector and hybrid search, but is frozen +and will not receive top-ups until the build activates. Run +`msgvault embeddings build` (no `--full-rebuild`) to continue the scan: ```bash -# Sync new messages (auto-enqueues them for embedding) +# Sync new messages (marks them as needing embedding) msgvault sync you@gmail.com -# Drain the embedding queue into the active generation +# Scan and embed missing rows for the active or building generation msgvault embeddings build ``` `msgvault embeddings build` without `--full-rebuild` is a short, incremental -operation: it picks up the configured active generation, drains any -pending rows, and exits. `msgvault embeddings resume` is a synonym for this -drain that never starts a full rebuild. You can schedule either via cron, -run it after every sync, or chain it (`msgvault sync && msgvault embeddings build`). +operation: it resumes a matching building generation if one exists, +otherwise it tops up the configured active generation, and exits. +`msgvault embeddings resume` is a synonym for this incremental pass that never starts a +full rebuild. 
You can schedule either via cron, run it after every sync, or +chain it (`msgvault sync && msgvault embeddings build`). ### Daemon workflow (`msgvault serve`) @@ -265,30 +267,54 @@ run_after_sync = true # and opportunistically after every scheduled sync ``` With `run_after_sync = true`, every successful scheduled sync -triggers an immediate embed pass against the queue it just -populated. The standalone cron ensures the queue drains even when -syncs are quiet (e.g. overnight). An empty `cron = ""` disables the +triggers an immediate embed pass over messages still missing coverage. +The standalone cron ensures embedding catches up even when syncs are +quiet (e.g. overnight). An empty `cron = ""` disables the standalone schedule (useful if you only want the post-sync trigger). -### What auto-enqueues +### What Triggers Embedding -| Ingest path | Auto-enqueues? | +| Ingest path | Runs the embed worker? | |---|---| -| `sync-full` / `sync` (Gmail, IMAP) | Yes | -| Scheduled syncs in `msgvault serve` | Yes | +| Manual `sync-full` / `sync` (Gmail, IMAP) | No. Run `msgvault embeddings build` afterward | +| Scheduled syncs in `msgvault serve` | Yes, when `[vector.embed.schedule].run_after_sync = true` | | `import-pst`, `import-emlx`, `import-mbox` | No. Re-run `--full-rebuild` after large imports | | Chat/text imports (iMessage, WhatsApp, Google Voice, Messenger, SyncTech SMS) | No. Run a full rebuild after importing if you want chats included | -For ingest paths that do not auto-enqueue, running +For ingest paths that do not immediately schedule embedding work, running `msgvault embeddings build --full-rebuild --yes` rebuilds the index over the full archive including the newly-imported messages. A same-model full rebuild is atomic from the searcher's perspective: vector and hybrid queries keep answering from the previous active generation until the -new one is ready. If the rebuild changes the configured model or +new one is ready. 
That previous active generation is intentionally frozen +during the rebuild, so messages synced after the rebuild starts may not appear +in vector or hybrid results until the building generation activates. If the +rebuild changes the configured model or dimension, vector and hybrid queries return `index_stale` until the new generation activates. +### CAS resolution (accepted single-user residual) + +When a message's text changes during embedding (for example +`msgvault repair-encoding` rewriting a body while an embed run is in flight), +the embed worker uses an optimistic compare-and-set on the message's +`last_modified` timestamp to avoid stamping an embedding built from stale +text: if `last_modified` moved between the worker reading the content and +writing the coverage stamp, the stamp is skipped and the message is re-embedded +on a later run. + +`last_modified` has **1-second resolution** (it is a `CURRENT_TIMESTAMP` +default/trigger). A concurrent edit that lands in the *same whole second* as +the worker's content read leaves `last_modified` unchanged, so the CAS can +mark an embedding current even though it was built from the now-stale text. +This sub-second window is an accepted residual for this single-user tool — an +edit and an embed of the *same* message within the *same second* is rare. It +self-recovers: the next edit to that message bumps `last_modified` (and +`repair-encoding` clears its coverage stamp outright), and a full rebuild +(`embeddings build --full-rebuild`) or the periodic full-scan backstop +re-embeds it regardless. + ## Search **CLI:** @@ -399,9 +425,11 @@ curl -H "X-API-Key: ..." http://localhost:8080/api/v1/stats | jq .vector_search ``` The `active_generation.message_count` should roughly match -`total_messages`. `pending_embeddings_total` shows how many rows -still need embedding (either because a rebuild is in flight or -because recent syncs have not yet been drained). +`total_messages` when no rebuild is in flight. 
During a rebuild it reports +the frozen serving index, while `building_generation.progress` reports the +replacement index. `missing_embeddings_total` shows how many live messages +still need embedding for the generation the worker will target next: the +building generation during a rebuild, otherwise the active generation. ## What Gets Embedded diff --git a/internal/api/handlers_test.go b/internal/api/handlers_test.go index 7c84a3ebc..bfe0331bb 100644 --- a/internal/api/handlers_test.go +++ b/internal/api/handlers_test.go @@ -1917,10 +1917,13 @@ func (f *fakeVectorBackend) Stats(_ context.Context, _ vector.GenerationID) (vec func (f *fakeVectorBackend) LoadVector(_ context.Context, _ int64) ([]float32, error) { return nil, errors.New("not implemented") } -func (f *fakeVectorBackend) Close() error { return nil } -func (f *fakeVectorBackend) EnsureSeeded(_ context.Context, _ vector.GenerationID) error { - return errors.New("not implemented") +func (f *fakeVectorBackend) ResetWatermarkBelow(_ context.Context, _ int64) error { + return nil } +func (f *fakeVectorBackend) EmbeddedMessageCount(_ context.Context, _ vector.GenerationID) (int64, error) { + return 0, errors.New("not implemented") +} +func (f *fakeVectorBackend) Close() error { return nil } func TestHandleStats_VectorDisabled(t *testing.T) { srv, _ := newTestServerWithMockStore(t) @@ -1987,7 +1990,7 @@ func TestHandleStats_VectorEnabledWithActive(t *testing.T) { require.True(ok, "expected 'vector_search' object, got %T: %v", resp["vector_search"], resp["vector_search"]) assert.Equal(true, vs["enabled"], "enabled") - assert.InDelta(float64(7), vs["pending_embeddings_total"], 1e-9, "pending_embeddings_total") + assert.InDelta(float64(7), vs["missing_embeddings_total"], 1e-9, "missing_embeddings_total") active, ok := vs["active_generation"].(map[string]any) require.True(ok, "expected 'vector_search.active_generation' object, got %T", vs["active_generation"]) diff --git a/internal/mcp/server_test.go 
b/internal/mcp/server_test.go index 53f786c33..e11cc76e4 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -684,7 +684,7 @@ func TestGetStats_VectorEnabled(t *testing.T) { assert.Equal(vector.GenerationID(5), ag.ID, "active_generation.id") assert.Equal("nomic-embed", ag.Model, "active_generation.model") assert.Equal(int64(100), ag.MessageCount, "active_generation.message_count") - assert.Equal(int64(3), resp.VectorSearch.PendingEmbeddingsTotal, "pending_embeddings_total") + assert.Equal(int64(3), resp.VectorSearch.MissingEmbeddingsTotal, "missing_embeddings_total") assert.Nil(resp.VectorSearch.BuildingGeneration, "building_generation") } @@ -1480,6 +1480,12 @@ type fakeBackend struct { func (f *fakeBackend) LoadVector(_ context.Context, _ int64) ([]float32, error) { return f.loadVec, f.loadErr } +func (f *fakeBackend) ResetWatermarkBelow(_ context.Context, _ int64) error { + return nil +} +func (f *fakeBackend) EmbeddedMessageCount(_ context.Context, _ vector.GenerationID) (int64, error) { + return 0, errors.New("not implemented") +} func (f *fakeBackend) ActiveGeneration(_ context.Context) (vector.Generation, error) { return f.active, f.activeErr } @@ -1511,9 +1517,6 @@ func (f *fakeBackend) Stats(_ context.Context, gen vector.GenerationID) (vector. 
return f.stats[gen], nil } func (f *fakeBackend) Close() error { return nil } -func (f *fakeBackend) EnsureSeeded(_ context.Context, _ vector.GenerationID) error { - return nil -} var _ vector.Backend = (*fakeBackend)(nil) diff --git a/internal/scheduler/embed_job.go b/internal/scheduler/embed_job.go index 4170acbdb..cecea3622 100644 --- a/internal/scheduler/embed_job.go +++ b/internal/scheduler/embed_job.go @@ -2,29 +2,46 @@ package scheduler import ( "context" - "database/sql" "errors" "log/slog" "sync" + "time" "go.kenn.io/msgvault/internal/vector" "go.kenn.io/msgvault/internal/vector/embed" ) +// defaultBackstopInterval is how often the daemon embed job runs a full +// watermark-ignoring backstop pass when BackstopInterval is left zero. +const defaultBackstopInterval = 24 * time.Hour + // EmbedRunner is the subset of *embed.Worker that EmbedJob needs. // Tests satisfy it with a fake. type EmbedRunner interface { RunOnce(ctx context.Context, gen vector.GenerationID) (embed.RunResult, error) + // RunBackstop performs a full-scan pass that ignores the per-generation + // watermark, recovering below-watermark stragglers (repair-encoding + // resets, transient errors, crashes). Idempotent: already-covered rows + // are skipped by the scan predicate. + RunBackstop(ctx context.Context, gen vector.GenerationID) (embed.RunResult, error) ReclaimStale(ctx context.Context) (int, error) } +// EmbedCoverage is the subset of *store.Store the activation gate needs: +// the count of live messages still needing embedding for a generation, +// read from the main DB. Tests satisfy it with a fake. +type EmbedCoverage interface { + MissingCount(ctx context.Context, activeGen int64) (int64, error) +} + // Compile-time check that the production worker satisfies EmbedRunner. var _ EmbedRunner = (*embed.Worker)(nil) // EmbedJob runs the vector-embedding worker. 
Each invocation prefers // an in-flight rebuild for the configured fingerprint over the -// existing active generation, drains its queue via RunOnce, and -// activates it once pending hits zero. This mirrors the CLI +// existing active generation, embeds its outstanding messages via +// RunOnce, and activates once coverage is complete (no live message +// still needs embedding). This mirrors the CLI // (cmd/msgvault/cmd/embed_vector.go pickEmbedGeneration) so a // daemon-only deployment can complete a `--full-rebuild` started by // the operator. Without the building-first preference, a daemon @@ -40,18 +57,11 @@ type EmbedJob struct { Backend vector.Backend Log *slog.Logger - // VectorsDB is the vectors.db handle, used to count remaining - // pending_embeddings for activation gating. May be nil; in that - // case the daemon will not auto-activate building generations. - VectorsDB *sql.DB - - // Rebind translates ?-placeholders to the driver's native form for - // queries this job issues directly against VectorsDB (pendingCount). - // nil is treated as the identity (used by SQLite); the PostgreSQL - // serve path must wire in (&store.PostgreSQLDialect{}).Rebind so the - // activation-gate count runs on pgx — a bare ? is rejected by the - // pgx driver. - Rebind func(string) string + // Store provides the main-DB coverage count used for activation + // gating (how many live messages still need embedding for the + // building generation). May be nil; in that case the daemon will not + // auto-activate building generations. + Store EmbedCoverage // Fingerprint is the configured generation fingerprint (typically // vector.Config.GenerationFingerprint() — "model:dim:preprocess"). @@ -64,6 +74,33 @@ type EmbedJob struct { // is still refused. Fingerprint string + // BackstopInterval controls how often Run also performs a full + // watermark-ignoring backstop pass (RunBackstop) in addition to the + // per-tick RunOnce. 
The backstop recovers below-watermark stragglers + // (repair-encoding NULL resets, transient errors, crashes) that the + // incremental scan skips. Zero uses defaultBackstopInterval (24h). + // A negative value disables the auto-backstop entirely. + BackstopInterval time.Duration + + // Now returns the current time; overridable in tests to drive the + // backstop interval deterministically. nil uses time.Now. + Now func() time.Time + + // lastBackstop maps each generation to the time its most recent backstop + // ran, used to gate the next one by BackstopInterval. Keyed per generation + // so that switching the target (e.g. the active gen recently backstopped, + // then a building gen selected) does not let one generation's recent + // backstop throttle a different generation's first backstop — which would + // otherwise delay recovery of a below-watermark straggler and block + // auto-activation for up to BackstopInterval. In-memory (not persisted): a + // daemon restart resets it, so the first tick after a restart runs one extra + // backstop per generation — harmless because RunBackstop is idempotent. + // Read/written only while the running lock is held, so it needs no separate + // guard. Lazily allocated in maybeRunBackstop so the zero value stays usable. + // Growth is negligible (a handful of generations over the tool's life), so + // no pruning is needed. + lastBackstop map[vector.GenerationID]time.Time + // running guards against overlapping Run calls (cron fires while a // post-sync hook is still draining, etc). sync.Mutex.TryLock gives // us "skip if busy" without serializing a queue of waiters. @@ -98,20 +135,6 @@ func (j *EmbedJob) Run(ctx context.Context) { return } - // Guard against the CreateGeneration crash window: if a prior - // rebuild inserted the building row but died before committing - // the initial seed, the pending queue is empty and the daemon - // would happily "drain" it and activate an unseeded generation. 
- // EnsureSeeded is idempotent on already-seeded generations, so - // calling it on every resume is cheap and safe. - if isBuilding { - if err := j.Backend.EnsureSeeded(ctx, target); err != nil { - log.Warn("embed: ensure seeded failed; leaving building generation for CLI to resolve", - "gen", target, "error", err) - return - } - } - res, err := j.Worker.RunOnce(ctx, target) if err != nil { log.Warn("embed run failed", "gen", target, "error", err) @@ -120,48 +143,57 @@ func (j *EmbedJob) Run(ctx context.Context) { log.Info("embed run complete", "gen", target, "building", isBuilding, - "claimed", res.Claimed, + "scanned", res.Claimed, "succeeded", res.Succeeded, "failed", res.Failed, "truncated", res.Truncated, ) + // Periodic full backstop (~once per BackstopInterval). RunOnce only + // scans forward from the per-gen watermark, so below-watermark + // stragglers (repair-encoding NULL resets, transient errors, crashes) + // are otherwise only recovered by the manual `embeddings build + // --backstop`. Weaving it into this existing job gives `msgvault serve` + // users that recovery for free. The backstop reuses the same + // scan/embed/stamp path with the cursor pinned at 0, in modest + // non-locking batches, and is idempotent (already-covered rows are + // skipped) so it never re-embeds stamped messages. + j.maybeRunBackstop(ctx, target, log) + if !isBuilding { return } - // Activation gate: only flip the building generation to active - // when the queue has fully drained for it. Transient embed - // failures that the worker later recovers from must not block - // activation, but a generation with pending rows must not - // auto-activate either (it would expose an incomplete index). + // Activation gate: only flip the building generation to active when + // coverage is complete (no live message still needs embedding for it). 
+ // Transient embed failures that the worker later recovers from must + // not block activation, but an incompletely-covered generation must + // not auto-activate either (it would expose an incomplete index). // - // This check + ActivateGeneration is intentionally non-atomic. - // If sync.EnqueueMessages commits a new pending row between the - // pendingCount read and the activation call, activation still - // succeeds and the new row stays bound to the (now-active) - // generation. The next worker tick picks it up via the active- - // generation top-up path, so the system reaches consistency on - // the next run rather than blocking activation forever on a - // moving target. This is by design — at steady state every - // active generation has incremental rows showing up between - // runs, so the activation gate must not require a snapshot. - if j.VectorsDB == nil { - log.Debug("embed: building drained but VectorsDB not wired; skipping auto-activation", + // This check + ActivateGeneration is intentionally non-atomic on + // SQLite (cross-DB): a message synced between the coverage read and + // the activation call leaves embed_gen NULL on the now-active + // generation. The next worker tick (and the full-scan backstop) picks + // it up via the active-generation scan, so the system reaches + // consistency on the next run rather than blocking activation forever + // on a moving target. The backend re-asserts the gate (atomically on + // PG; via a Go pre-check on SQLite) inside ActivateGeneration. 
+ if j.Store == nil { + log.Debug("embed: building covered but Store not wired; skipping auto-activation", "gen", target) return } - remaining, err := j.pendingCount(ctx, target) + missing, err := j.Store.MissingCount(ctx, int64(target)) if err != nil { - log.Warn("embed: count pending after run failed", "gen", target, "error", err) + log.Warn("embed: coverage count after run failed", "gen", target, "error", err) return } - if remaining > 0 { - log.Info("embed: building generation still has pending rows; will retry next tick", - "gen", target, "remaining", remaining) + if missing > 0 { + log.Info("embed: building generation still has messages needing embedding; will retry next tick", + "gen", target, "remaining", missing) return } - // force=false: the pendingCount==0 check above is the scheduler's gate, - // and the backend re-asserts it atomically inside the activation tx. + // force=false: the missing==0 check above is the scheduler's gate, + // and the backend re-asserts it inside ActivateGeneration. if err := j.Backend.ActivateGeneration(ctx, target, false); err != nil { log.Warn("embed: activation failed", "gen", target, "error", err) return @@ -169,6 +201,50 @@ func (j *EmbedJob) Run(ctx context.Context) { log.Info("embed: building generation activated", "gen", target) } +// maybeRunBackstop runs a full watermark-ignoring backstop pass on gen when +// BackstopInterval has elapsed since this generation's last one, then records +// the time. The throttle is keyed per generation so a recent backstop of one +// generation cannot suppress a different generation's first backstop. Called +// with the running lock held (from Run), so lastBackstop needs no separate +// guard. A negative BackstopInterval disables it; zero defaults to 24h. A +// backstop failure is logged, not fatal — the next interval retries. 
+func (j *EmbedJob) maybeRunBackstop(ctx context.Context, gen vector.GenerationID, log *slog.Logger) { + interval := j.BackstopInterval + if interval < 0 { + return // explicitly disabled + } + if interval == 0 { + interval = defaultBackstopInterval + } + now := time.Now + if j.Now != nil { + now = j.Now + } + t := now() + // First run for this generation (no recorded time) always runs a backstop; + // thereafter gate by the interval against this generation's own last run. + if last, ok := j.lastBackstop[gen]; ok && t.Sub(last) < interval { + return + } + res, err := j.Worker.RunBackstop(ctx, gen) + if err != nil { + log.Warn("embed backstop failed", "gen", gen, "error", err) + // Do not advance lastBackstop on failure so the next tick retries. + return + } + if j.lastBackstop == nil { + j.lastBackstop = make(map[vector.GenerationID]time.Time) + } + j.lastBackstop[gen] = t + log.Info("embed backstop complete", + "gen", gen, + "scanned", res.Claimed, + "succeeded", res.Succeeded, + "failed", res.Failed, + "truncated", res.Truncated, + ) +} + // pickTarget returns the generation to drain plus an isBuilding flag // for the activation gate. Order: // @@ -231,18 +307,3 @@ func (j *EmbedJob) pickTarget(ctx context.Context, log *slog.Logger) (vector.Gen return 0, false, false } } - -// pendingCount returns the number of pending_embeddings rows for gen. -// Used by the activation gate. 
-func (j *EmbedJob) pendingCount(ctx context.Context, gen vector.GenerationID) (int, error) { - rebind := j.Rebind - if rebind == nil { - rebind = func(q string) string { return q } - } - var n int - if err := j.VectorsDB.QueryRowContext(ctx, - rebind(`SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`), int64(gen)).Scan(&n); err != nil { - return 0, err - } - return n, nil -} diff --git a/internal/scheduler/embed_job_backstop_test.go b/internal/scheduler/embed_job_backstop_test.go new file mode 100644 index 000000000..eb33a4dfb --- /dev/null +++ b/internal/scheduler/embed_job_backstop_test.go @@ -0,0 +1,232 @@ +//go:build sqlite_vec + +package scheduler + +import ( + "context" + "database/sql" + "fmt" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/embed" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// e2eWorkStore is a minimal embed.WorkStore over the test main DB, +// mirroring store.ScanForEmbedding / store.SetEmbedGen. +type e2eWorkStore struct{ db *sql.DB } + +func (s *e2eWorkStore) ScanForEmbedding(ctx context.Context, target, afterID int64, limit int) ([]int64, error) { + rows, err := s.db.QueryContext(ctx, + `SELECT id FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL + AND id > ? 
+ ORDER BY id LIMIT ?`, target, afterID, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + var out []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, err + } + out = append(out, id) + } + return out, rows.Err() +} + +func (s *e2eWorkStore) SetEmbedGen(ctx context.Context, ids []int64, target int64) error { + if len(ids) == 0 { + return nil + } + ph := make([]string, len(ids)) + args := make([]any, 0, 1+len(ids)) + args = append(args, target) + for i, id := range ids { + ph[i] = "?" + args = append(args, id) + } + _, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = ? WHERE id IN (`+strings.Join(ph, ",")+`)`, args...) + return err +} + +func (s *e2eWorkStore) SetEmbedGenIfUnchanged(ctx context.Context, items []store.EmbedGenStamp, target int64) (missed []int64, err error) { + for _, it := range items { + res, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = ? WHERE id = ? AND last_modified = ?`, + target, it.ID, it.LastModified) + if err != nil { + return missed, err + } + n, err := res.RowsAffected() + if err != nil { + return missed, err + } + if n == 0 { + missed = append(missed, it.ID) + } + } + return missed, nil +} + +// e2eCoverage satisfies EmbedCoverage from the live main DB so the +// EmbedJob's activation gate reflects real coverage. +type e2eCoverage struct{ db *sql.DB } + +func (c *e2eCoverage) MissingCount(ctx context.Context, activeGen int64) (int64, error) { + var missing int64 + if err := c.db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL`, activeGen).Scan(&missing); err != nil { + return 0, err + } + return missing, nil +} + +// e2eClient returns one deterministic non-zero vector per input. 
+type e2eClient struct{ dim int } + +func (c *e2eClient) Embed(_ context.Context, inputs []string) ([][]float32, error) { + out := make([][]float32, len(inputs)) + for i := range inputs { + v := make([]float32, c.dim) + v[0] = float32(len(inputs[i])%c.dim + 1) + out[i] = v + } + return out, nil +} + +func countMissingE2E(t *testing.T, db *sql.DB, gen int64) int { + t.Helper() + var n int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL`, gen).Scan(&n)) + return n +} + +// TestEmbedJob_Backstop_RecoversSubWatermarkStraggler is the end-to-end +// backstop test: a real EmbedJob (real Worker + sqlitevec backend) whose +// backstop interval has elapsed runs a backstop that re-embeds a +// below-watermark straggler WITHOUT re-embedding already-stamped messages; +// and when the interval has NOT elapsed it only runs RunOnce (which misses +// the sub-watermark straggler). +func TestEmbedJob_Backstop_RecoversSubWatermarkStraggler(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + mainPath := filepath.Join(dir, "main.db") + require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") + mainDB, err := sql.Open(sqlitevec.DriverName(), mainPath) + require.NoError(t, err, "open main") + t.Cleanup(func() { _ = mainDB.Close() }) + + _, err = mainDB.Exec(` +CREATE TABLE messages ( + id INTEGER PRIMARY KEY, subject TEXT, + deleted_at DATETIME, deleted_from_source_at DATETIME, embed_gen INTEGER, + last_modified DATETIME DEFAULT CURRENT_TIMESTAMP); +CREATE TABLE message_bodies ( + message_id INTEGER PRIMARY KEY, body_text TEXT, body_html TEXT); +CREATE TABLE applied_migrations (name TEXT PRIMARY KEY, applied_at DATETIME); +CREATE TRIGGER trg_messages_last_modified +AFTER UPDATE ON messages FOR EACH ROW +WHEN OLD.last_modified = NEW.last_modified +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.id; +END; 
+CREATE TRIGGER trg_message_bodies_last_modified_upd +AFTER UPDATE ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END; +CREATE TRIGGER trg_message_bodies_last_modified_ins +AFTER INSERT ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END;`) + require.NoError(t, err, "schema") + const n = 4 + for i := 1; i <= n; i++ { + _, err = mainDB.Exec(`INSERT INTO messages (id, subject) VALUES (?, ?)`, i, fmt.Sprintf("msg %d", i)) + require.NoError(t, err, "insert message") + _, err = mainDB.Exec(`INSERT INTO message_bodies (message_id, body_text) VALUES (?, ?)`, i, fmt.Sprintf("body %d", i)) + require.NoError(t, err, "insert body") + } + + vecPath := filepath.Join(dir, "vectors.db") + backend, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: mainDB, + }) + require.NoError(t, err, "sqlitevec.Open") + t.Cleanup(func() { _ = backend.Close() }) + + gen, err := backend.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + vecDB, err := sql.Open(sqlitevec.DriverName(), vecPath) + require.NoError(t, err, "open vectors handle") + t.Cleanup(func() { _ = vecDB.Close() }) + + ws := &e2eWorkStore{db: mainDB} + worker := embed.NewWorker(embed.WorkerDeps{ + Backend: backend, VectorsDB: vecDB, MainDB: mainDB, + Store: ws, Client: &e2eClient{dim: 4}, BatchSize: 8, + LastModifiedExpr: "CAST(m.last_modified AS TEXT)", + }) + + // Drain the corpus fully via the worker so every message is embedded + + // stamped and the per-gen watermark advances to the max id. 
+ _, err = worker.RunOnce(ctx, gen) + require.NoError(t, err, "initial drain") + require.NoError(t, backend.ActivateGeneration(ctx, gen, false), "activate (coverage complete)") + require.Equal(t, 0, countMissingE2E(t, mainDB, int64(gen)), "all embedded after drain") + + // Create a sub-watermark straggler: un-stamp message 2 (its id is below + // the watermark, so a plain RunOnce will skip it). This models a + // repair-encoding NULL reset. + _, err = mainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL WHERE id = 2`) + require.NoError(t, err, "un-stamp straggler") + require.Equal(t, 1, countMissingE2E(t, mainDB, int64(gen)), "straggler now missing") + + now := time.Now() + clock := &now + job := &EmbedJob{ + Worker: worker, + Backend: backend, + Store: &e2eCoverage{db: mainDB}, + Fingerprint: "fake:4", + BackstopInterval: 24 * time.Hour, + Now: func() time.Time { return *clock }, + // Seed this generation's last backstop to "just now" so the first Run + // is WITHIN the interval and exercises the RunOnce-only path before we + // elapse it. + lastBackstop: map[vector.GenerationID]time.Time{gen: now}, + } + + // Tick A (within interval): RunOnce only. The sub-watermark straggler is + // NOT recovered because RunOnce resumes from the watermark. + job.Run(ctx) + assert.Equal(t, 1, countMissingE2E(t, mainDB, int64(gen)), + "within interval: RunOnce alone misses the sub-watermark straggler") + + // Tick B (interval elapsed): the backstop runs and recovers the + // straggler without re-embedding the already-stamped messages. 
+ *clock = now.Add(25 * time.Hour) + job.Run(ctx) + assert.Equal(t, 0, countMissingE2E(t, mainDB, int64(gen)), + "after interval: backstop recovers the sub-watermark straggler") +} diff --git a/internal/scheduler/embed_job_pg_test.go b/internal/scheduler/embed_job_pg_test.go deleted file mode 100644 index a5ccd63e8..000000000 --- a/internal/scheduler/embed_job_pg_test.go +++ /dev/null @@ -1,131 +0,0 @@ -//go:build pgvector - -package scheduler - -import ( - "context" - "crypto/rand" - "database/sql" - "encoding/hex" - "os" - "strings" - "testing" - - _ "github.com/jackc/pgx/v5/stdlib" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.kenn.io/msgvault/internal/store" - "go.kenn.io/msgvault/internal/vector" -) - -// openPGPendingDB stands up an isolated per-test schema on -// MSGVAULT_TEST_DB containing just the pending_embeddings table the -// activation gate's pendingCount queries. Returns a pgx *sql.DB scoped to -// the schema; the schema is dropped on cleanup. Skips when -// MSGVAULT_TEST_DB is unset or not a postgres DSN. -func openPGPendingDB(t *testing.T) *sql.DB { - t.Helper() - url := os.Getenv("MSGVAULT_TEST_DB") - if !strings.HasPrefix(url, "postgres://") && !strings.HasPrefix(url, "postgresql://") { - t.Skip("pgvector scheduler tests require MSGVAULT_TEST_DB to point at a PostgreSQL DSN") - } - - buf := make([]byte, 8) - _, err := rand.Read(buf) - require.NoError(t, err, "random schema name") - schemaName := "sched_pending_test_" + hex.EncodeToString(buf) - - setup, err := sql.Open("pgx", url) - require.NoError(t, err, "open setup") - defer func() { _ = setup.Close() }() - _, err = setup.Exec("CREATE SCHEMA " + schemaName) - require.NoError(t, err, "create schema") - - sep := "?" 
- if strings.Contains(url, "?") { - sep = "&" - } - testURL := url + sep + "search_path=" + schemaName + ",public" - - db, err := sql.Open("pgx", testURL) - require.NoError(t, err, "open") - t.Cleanup(func() { - _ = db.Close() - cleanup, err := sql.Open("pgx", url) - if err != nil { - return - } - defer func() { _ = cleanup.Close() }() - _, _ = cleanup.Exec("DROP SCHEMA " + schemaName + " CASCADE") - }) - - _, err = db.Exec(` - CREATE TABLE pending_embeddings ( - generation_id BIGINT NOT NULL, - message_id BIGINT NOT NULL, - PRIMARY KEY (generation_id, message_id) - )`) - require.NoError(t, err, "create pending_embeddings") - return db -} - -// TestEmbedJobPG_ActivatesBuildingWhenDrained drives the activation gate -// against live PG: with the pgvector VectorsDB handle, pendingCount must -// rebind its ? placeholder to $N so pgx accepts the count. The building -// generation has zero pending rows, so the gate passes and the daemon -// activates it. -// -// Before EmbedJob.Rebind was wired, pendingCount issued a bare `?` against -// the pgx handle and failed with SQLSTATE 42601 ("syntax error"), which -// the activation path swallows as a warn — the building generation would -// never auto-activate on PG. This test fails-without / passes-with the fix. 
-func TestEmbedJobPG_ActivatesBuildingWhenDrained(t *testing.T) { - db := openPGPendingDB(t) - building := &vector.Generation{ID: 77, State: vector.GenerationBuilding, Fingerprint: "m:768"} - backend := &fakeBackend{ - activeErr: vector.ErrNoActiveGeneration, - building: building, - } - runner := &fakeRunner{} - job := &EmbedJob{ - Worker: runner, - Backend: backend, - VectorsDB: db, - Rebind: (&store.PostgreSQLDialect{}).Rebind, - Fingerprint: "m:768", - } - - job.Run(context.Background()) - - assert.Equal(t, []vector.GenerationID{77}, backend.activations(), - "building generation must auto-activate when its PG pending queue is drained") -} - -// TestEmbedJobPG_DoesNotActivateWhilePending is the inverse: with a -// pending row present, pendingCount (rebound to $N) returns > 0 and the -// gate must NOT activate. This also exercises the rebound query on pgx — -// a non-rebinding pendingCount errors out before it can read the count. -func TestEmbedJobPG_DoesNotActivateWhilePending(t *testing.T) { - db := openPGPendingDB(t) - _, err := db.Exec(`INSERT INTO pending_embeddings (generation_id, message_id) VALUES (77, 1)`) - require.NoError(t, err, "seed pending") - - building := &vector.Generation{ID: 77, State: vector.GenerationBuilding, Fingerprint: "m:768"} - backend := &fakeBackend{ - activeErr: vector.ErrNoActiveGeneration, - building: building, - } - runner := &fakeRunner{} - job := &EmbedJob{ - Worker: runner, - Backend: backend, - VectorsDB: db, - Rebind: (&store.PostgreSQLDialect{}).Rebind, - Fingerprint: "m:768", - } - - job.Run(context.Background()) - - assert.Empty(t, backend.activations(), - "building generation must not activate while PG pending rows remain") -} diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index ba960b87d..7913afdc8 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -2,14 +2,12 @@ package scheduler import ( "context" - "database/sql" "errors" "sync" 
"sync/atomic" "testing" "time" - _ "github.com/mattn/go-sqlite3" assertpkg "github.com/stretchr/testify/assert" requirepkg "github.com/stretchr/testify/require" @@ -403,8 +401,8 @@ func TestTriggerSyncAfterStop(t *testing.T) { // ---------- fakes for EmbedJob tests ---------- // fakeBackend implements vector.Backend. Only ActiveGeneration, -// BuildingGeneration, ActivateGeneration, and EnsureSeeded are -// meaningfully populated; the rest panic to catch accidental usage. +// BuildingGeneration, and ActivateGeneration are meaningfully populated; +// the rest panic to catch accidental usage. type fakeBackend struct { active vector.Generation activeErr error @@ -415,10 +413,6 @@ type fakeBackend struct { activateErr error mu sync.Mutex activateCallIDs []vector.GenerationID - // ensureSeededErr is what EnsureSeeded returns; ensureSeededIDs - // records the gen IDs the EmbedJob passed to EnsureSeeded. - ensureSeededErr error - ensureSeededIDs []vector.GenerationID activeCalls atomic.Int32 buildingCalls atomic.Int32 @@ -466,30 +460,33 @@ func (f *fakeBackend) Stats(ctx context.Context, gen vector.GenerationID) (vecto func (f *fakeBackend) LoadVector(ctx context.Context, messageID int64) ([]float32, error) { panic("unexpected: LoadVector") } -func (f *fakeBackend) Close() error { return nil } -func (f *fakeBackend) EnsureSeeded(_ context.Context, gen vector.GenerationID) error { - f.mu.Lock() - f.ensureSeededIDs = append(f.ensureSeededIDs, gen) - f.mu.Unlock() - return f.ensureSeededErr +func (f *fakeBackend) ResetWatermarkBelow(ctx context.Context, minID int64) error { + panic("unexpected: ResetWatermarkBelow") } -func (f *fakeBackend) ensureSeededCalls() []vector.GenerationID { - f.mu.Lock() - defer f.mu.Unlock() - return append([]vector.GenerationID(nil), f.ensureSeededIDs...) 
+func (f *fakeBackend) EmbeddedMessageCount(ctx context.Context, gen vector.GenerationID) (int64, error) { + panic("unexpected: EmbeddedMessageCount") } +func (f *fakeBackend) Close() error { return nil } // fakeRunner records calls to satisfy EmbedRunner. type fakeRunner struct { - mu sync.Mutex - reclaimErr error - reclaimCalls int - runErr error - runCalls int - lastRunGen vector.GenerationID - runOnceResult embed.RunResult - runDoneOnce sync.Once - runDone chan struct{} // optional: closed after first RunOnce + mu sync.Mutex + reclaimErr error + reclaimCalls int + runErr error + runCalls int + lastRunGen vector.GenerationID + runOnceResult embed.RunResult + backstopErr error + backstopCalls int + lastBackstop vector.GenerationID + backstopResult embed.RunResult + runDoneOnce sync.Once + runDone chan struct{} // optional: closed after first RunOnce + // onBackstop, if set, is invoked from RunBackstop (after recording the + // call) to let tests model a side effect of the backstop pass, e.g. a + // straggler becoming covered. Called while r.mu is held. 
+ onBackstop func() } func (r *fakeRunner) ReclaimStale(ctx context.Context) (int, error) { @@ -513,12 +510,29 @@ func (r *fakeRunner) RunOnce(ctx context.Context, gen vector.GenerationID) (embe return res, err } +func (r *fakeRunner) RunBackstop(ctx context.Context, gen vector.GenerationID) (embed.RunResult, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.backstopCalls++ + r.lastBackstop = gen + if r.onBackstop != nil && r.backstopErr == nil { + r.onBackstop() + } + return r.backstopResult, r.backstopErr +} + func (r *fakeRunner) calls() (reclaim, run int, lastGen vector.GenerationID) { r.mu.Lock() defer r.mu.Unlock() return r.reclaimCalls, r.runCalls, r.lastRunGen } +func (r *fakeRunner) backstops() (n int, lastGen vector.GenerationID) { + r.mu.Lock() + defer r.mu.Unlock() + return r.backstopCalls, r.lastBackstop +} + // ---------- EmbedJob tests ---------- func TestEmbedJob_Run_ActiveGeneration(t *testing.T) { @@ -650,9 +664,8 @@ func TestEmbedJob_Run_PrefersBuildingOverActive(t *testing.T) { active: vector.Generation{ID: 5, State: vector.GenerationActive, Fingerprint: "m:768"}, building: building, } - // Pending count = 0 (no VectorsDB wired), so the activation gate - // will skip auto-activation; we're only asserting the target - // selection here. + // No Store wired, so the activation gate skips auto-activation; we're + // only asserting target selection here. runner := &fakeRunner{} job := &EmbedJob{Worker: runner, Backend: backend, Fingerprint: "m:768"} @@ -665,20 +678,20 @@ func TestEmbedJob_Run_PrefersBuildingOverActive(t *testing.T) { } // TestEmbedJob_Run_ActivatesBuildingWhenDrained verifies the -// activation gate: after RunOnce on a building generation, if -// pending_embeddings is empty for that gen, the daemon must call -// ActivateGeneration so the new index actually starts serving. 
-// Without this, a daemon-only deployment can never complete a +// activation gate: after RunOnce on a building generation, if coverage +// is complete for that gen (no live message still needs embedding), the +// daemon must call ActivateGeneration so the new index actually starts +// serving. Without this, a daemon-only deployment can never complete a // `--full-rebuild` started by the CLI. func TestEmbedJob_Run_ActivatesBuildingWhenDrained(t *testing.T) { - db := newPendingDB(t) building := &vector.Generation{ID: 77, State: vector.GenerationBuilding, Fingerprint: "m:768"} backend := &fakeBackend{ activeErr: vector.ErrNoActiveGeneration, building: building, } runner := &fakeRunner{} - job := &EmbedJob{Worker: runner, Backend: backend, VectorsDB: db, Fingerprint: "m:768"} + cov := &fakeCoverage{missing: 0} + job := &EmbedJob{Worker: runner, Backend: backend, Store: cov, Fingerprint: "m:768"} job.Run(context.Background()) @@ -686,23 +699,21 @@ func TestEmbedJob_Run_ActivatesBuildingWhenDrained(t *testing.T) { } // TestEmbedJob_Run_DoesNotActivateWhilePending guards the inverse -// case: pending_embeddings still has rows, so the building must NOT -// be activated yet (its index is incomplete). +// case: coverage still reports missing messages, so the building must +// NOT be activated yet (its index is incomplete). 
func TestEmbedJob_Run_DoesNotActivateWhilePending(t *testing.T) { - db := newPendingDB(t) - _, err := db.Exec(`INSERT INTO pending_embeddings (generation_id, message_id) VALUES (77, 1)`) - requirepkg.NoError(t, err, "seed pending") building := &vector.Generation{ID: 77, State: vector.GenerationBuilding, Fingerprint: "m:768"} backend := &fakeBackend{ activeErr: vector.ErrNoActiveGeneration, building: building, } runner := &fakeRunner{} - job := &EmbedJob{Worker: runner, Backend: backend, VectorsDB: db, Fingerprint: "m:768"} + cov := &fakeCoverage{missing: 1} + job := &EmbedJob{Worker: runner, Backend: backend, Store: cov, Fingerprint: "m:768"} job.Run(context.Background()) - assertpkg.Empty(t, backend.activations(), "activations (pending still > 0)") + assertpkg.Empty(t, backend.activations(), "activations (missing still > 0)") } // TestEmbedJob_Run_LeavesMismatchedBuildingForCLI guards against the @@ -726,82 +737,34 @@ func TestEmbedJob_Run_LeavesMismatchedBuildingForCLI(t *testing.T) { assertpkg.Empty(t, backend.activations(), "activations") } -// TestEmbedJob_Run_EnsuresSeededBeforeRunOnce regresses the crash -// window where CreateGeneration inserted a `building` row but died -// before committing the initial seed. Without EnsureSeeded on the -// resume path, RunOnce would see an empty queue, pendingCount would -// be 0, and the daemon would activate an unseeded generation — a -// silent, catastrophic data loss for semantic search. EnsureSeeded -// must be called BEFORE RunOnce so the seed commits first. 
-func TestEmbedJob_Run_EnsuresSeededBeforeRunOnce(t *testing.T) { - db := newPendingDB(t) - building := &vector.Generation{ID: 99, State: vector.GenerationBuilding, Fingerprint: "m:768"} - backend := &fakeBackend{ - activeErr: vector.ErrNoActiveGeneration, - building: building, - } - runner := &fakeRunner{} - job := &EmbedJob{Worker: runner, Backend: backend, VectorsDB: db, Fingerprint: "m:768"} - - job.Run(context.Background()) - - assertpkg.Equal(t, []vector.GenerationID{99}, backend.ensureSeededCalls(), "EnsureSeeded calls") - _, run, _ := runner.calls() - assertpkg.Equal(t, 1, run, "RunOnce calls (should run after seeding)") -} - -// TestEmbedJob_Run_EnsureSeededErrorBailsOut guards the error path: -// if EnsureSeeded returns an error (e.g. the generation was already -// activated or retired between BuildingGeneration and EnsureSeeded), -// the daemon must NOT call RunOnce or ActivateGeneration — the -// generation is not in a state the daemon can safely drive. -func TestEmbedJob_Run_EnsureSeededErrorBailsOut(t *testing.T) { - db := newPendingDB(t) - building := &vector.Generation{ID: 55, State: vector.GenerationBuilding, Fingerprint: "m:768"} - backend := &fakeBackend{ - activeErr: vector.ErrNoActiveGeneration, - building: building, - ensureSeededErr: errors.New("generation state=active, want building"), - } - runner := &fakeRunner{} - job := &EmbedJob{Worker: runner, Backend: backend, VectorsDB: db, Fingerprint: "m:768"} - - job.Run(context.Background()) - - _, run, _ := runner.calls() - assertpkg.Equal(t, 0, run, "RunOnce calls (EnsureSeeded failed — must not proceed)") - assertpkg.Empty(t, backend.activations(), "activations (EnsureSeeded failed)") -} - // TestEmbedJob_Run_PostActivationEnqueueDrainsOnNextRun is the // eventual-consistency check that pairs with the comment in // embed_job.go's activation gate. 
It simulates the race the gate is -// designed to tolerate: pendingCount reads 0, activation flips -// state to active, then a new pending row appears (as if a sync -// committed between the read and the activate). The next worker -// run must pick the now-active generation as its target — proving -// the post-activation top-up path runs and the system converges. +// designed to tolerate: coverage reads 0 missing, activation flips +// state to active, then a new message appears (as if a sync committed +// between the read and the activate). The next worker run must pick the +// now-active generation as its target — proving the post-activation +// top-up path runs and the system converges. func TestEmbedJob_Run_PostActivationEnqueueDrainsOnNextRun(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) - db := newPendingDB(t) gen := vector.Generation{ID: 88, State: vector.GenerationBuilding, Fingerprint: "m:768"} backend := &fakeBackend{ activeErr: vector.ErrNoActiveGeneration, building: &gen, } runner := &fakeRunner{} - job := &EmbedJob{Worker: runner, Backend: backend, VectorsDB: db, Fingerprint: "m:768"} + cov := &fakeCoverage{missing: 0} + job := &EmbedJob{Worker: runner, Backend: backend, Store: cov, Fingerprint: "m:768"} - // Tick 1: building drained, activation flips to active. + // Tick 1: building covered, activation flips to active. job.Run(context.Background()) require.Equal([]vector.GenerationID{88}, backend.activations(), "tick 1 activations") - // Simulate the race: a sync.EnqueueMessages commit lands AFTER - // activation, adding a pending row bound to the (now-active) - // generation. The fakeBackend reflects the post-activation state. - _, err := db.Exec(`INSERT INTO pending_embeddings (generation_id, message_id) VALUES (88, 1)`) - require.NoError(err, "enqueue") + // Simulate the race: a sync commit lands AFTER activation, adding a + // message that reads as missing for the (now-active) generation. 
The + // fakeBackend reflects the post-activation state. + cov.missing = 1 backend.building = nil backend.active = vector.Generation{ID: 88, State: vector.GenerationActive, Fingerprint: "m:768"} backend.activeErr = nil @@ -816,21 +779,184 @@ func TestEmbedJob_Run_PostActivationEnqueueDrainsOnNextRun(t *testing.T) { assert.Len(backend.activations(), 1, "activations (only first activation)") } -// newPendingDB returns an in-memory SQLite handle with just the -// pending_embeddings table the activation gate counts against. -func newPendingDB(t *testing.T) *sql.DB { - t.Helper() - db, err := sql.Open("sqlite3", ":memory:") - requirepkg.NoError(t, err, "open") - t.Cleanup(func() { _ = db.Close() }) - _, err = db.Exec(` -CREATE TABLE pending_embeddings ( - generation_id INTEGER NOT NULL, - message_id INTEGER NOT NULL, - PRIMARY KEY (generation_id, message_id) -);`) - requirepkg.NoError(t, err, "schema") - return db +// TestEmbedJob_Run_BackstopRunsOnFirstTick verifies the auto-backstop is +// woven into the existing embed job: on the first tick (lastBackstop zero) +// it runs a full backstop on the same target as RunOnce. +func TestEmbedJob_Run_BackstopRunsOnFirstTick(t *testing.T) { + backend := &fakeBackend{active: vector.Generation{ID: 5, State: vector.GenerationActive}} + runner := &fakeRunner{} + job := &EmbedJob{Worker: runner, Backend: backend} + + job.Run(context.Background()) + + _, run, runGen := runner.calls() + assertpkg.Equal(t, 1, run, "RunOnce calls") + n, bsGen := runner.backstops() + assertpkg.Equal(t, 1, n, "RunBackstop calls on first tick") + assertpkg.Equal(t, runGen, bsGen, "backstop targets the same generation as RunOnce") +} + +// TestEmbedJob_Run_BackstopGatedByInterval verifies the ~daily gating: a +// second tick within BackstopInterval does NOT run another backstop (only +// RunOnce), and a tick after the interval elapses runs one again. 
+func TestEmbedJob_Run_BackstopGatedByInterval(t *testing.T) { + assert := assertpkg.New(t) + backend := &fakeBackend{active: vector.Generation{ID: 5, State: vector.GenerationActive}} + runner := &fakeRunner{} + now := time.Now() + clock := &now + job := &EmbedJob{ + Worker: runner, + Backend: backend, + BackstopInterval: 24 * time.Hour, + Now: func() time.Time { return *clock }, + } + + // Tick 1: backstop runs (first tick). + job.Run(context.Background()) + n, _ := runner.backstops() + assert.Equal(1, n, "tick 1: backstop runs") + + // Tick 2, only 1h later: within interval -> only RunOnce, no backstop. + *clock = now.Add(1 * time.Hour) + job.Run(context.Background()) + n, _ = runner.backstops() + assert.Equal(1, n, "tick 2 (within interval): no extra backstop") + _, run, _ := runner.calls() + assert.Equal(2, run, "tick 2: RunOnce still runs") + + // Tick 3, 25h after the last backstop: interval elapsed -> backstop runs. + *clock = now.Add(25 * time.Hour) + job.Run(context.Background()) + n, _ = runner.backstops() + assert.Equal(2, n, "tick 3 (interval elapsed): backstop runs again") +} + +// TestEmbedJob_Run_BackstopDisabled verifies a negative BackstopInterval +// disables the auto-backstop entirely (only RunOnce runs). +func TestEmbedJob_Run_BackstopDisabled(t *testing.T) { + backend := &fakeBackend{active: vector.Generation{ID: 5, State: vector.GenerationActive}} + runner := &fakeRunner{} + job := &EmbedJob{Worker: runner, Backend: backend, BackstopInterval: -1} + + job.Run(context.Background()) + + n, _ := runner.backstops() + assertpkg.Equal(t, 0, n, "backstop disabled: no RunBackstop") + _, run, _ := runner.calls() + assertpkg.Equal(t, 1, run, "RunOnce still runs") +} + +// TestEmbedJob_Run_BackstopFailureRetries verifies a backstop error is +// logged but does not block the rest of the cycle, and lastBackstop is not +// advanced (so the next tick retries).
+func TestEmbedJob_Run_BackstopFailureRetries(t *testing.T) { + backend := &fakeBackend{active: vector.Generation{ID: 5, State: vector.GenerationActive}} + runner := &fakeRunner{backstopErr: errors.New("boom")} + now := time.Now() + clock := &now + job := &EmbedJob{ + Worker: runner, + Backend: backend, + Now: func() time.Time { return *clock }, + } + + // Tick 1: backstop attempted, fails. + job.Run(context.Background()) + n, _ := runner.backstops() + assertpkg.Equal(t, 1, n, "tick 1: backstop attempted") + + // Tick 2 immediately after: because the failure did not advance + // lastBackstop, the backstop is retried (lastBackstop still zero). + runner.backstopErr = nil + job.Run(context.Background()) + n, _ = runner.backstops() + assertpkg.Equal(t, 2, n, "tick 2: backstop retried after prior failure") +} + +// TestEmbedJob_Run_BackstopThrottleIsPerGeneration reproduces the compound +// precondition the per-generation throttle fixes: the throttle was recently +// set for the ACTIVE generation (so a single job-global throttle WOULD skip +// the next backstop), then pickTarget switches to a DIFFERENT building +// generation that has a below-watermark straggler. With a global time.Time +// throttle the building generation's first backstop would be suppressed for up +// to BackstopInterval — leaving the straggler unrecovered and blocking +// auto-activation (MissingCount stays > 0). With the per-generation map the +// building generation has no recorded backstop, so it runs on this tick, +// recovers the straggler, and the generation activates. +// +// This FAILS with the old global throttle (no backstop for gen 99 -> straggler +// remains, no activation) and PASSES with the per-gen map. 
+func TestEmbedJob_Run_BackstopThrottleIsPerGeneration(t *testing.T) { + assert := assertpkg.New(t) + building := &vector.Generation{ID: 99, State: vector.GenerationBuilding, Fingerprint: "m:768"} + backend := &fakeBackend{ + active: vector.Generation{ID: 5, State: vector.GenerationActive, Fingerprint: "m:768"}, + building: building, + } + // The straggler is recovered by the backstop pass: coverage reports it + // missing UNTIL RunBackstop runs, then reports complete. This mirrors the + // production recovery path (RunBackstop re-embeds the sub-watermark + // straggler, after which MissingCount drops to 0 and the gen can activate). + cov := &recoverOnBackstopCoverage{} + runner := &fakeRunner{onBackstop: cov.markRecovered} + now := time.Now() + clock := &now + job := &EmbedJob{ + Worker: runner, + Backend: backend, + Store: cov, + Fingerprint: "m:768", + BackstopInterval: 24 * time.Hour, + Now: func() time.Time { return *clock }, + // Seed only the ACTIVE generation's last backstop to "just now". A + // job-global throttle would read this and skip the building gen's + // backstop; the per-gen map must not, because gen 99 has no entry. + lastBackstop: map[vector.GenerationID]time.Time{5: now}, + } + + // Single tick: pickTarget prefers the building generation. Its backstop + // must run despite the active generation's recent (seeded) backstop. + job.Run(context.Background()) + + n, bsGen := runner.backstops() + assert.Equal(1, n, "building generation backstop must run despite active gen's recent backstop") + assert.Equal(vector.GenerationID(99), bsGen, "backstop targets the building generation") + assert.Equal([]vector.GenerationID{99}, backend.activations(), + "building generation activates after backstop recovers its straggler") +} + +// recoverOnBackstopCoverage models the activation gate's view of a building +// generation that has one below-watermark straggler: MissingCount reports 1 +// until the backstop pass recovers it (markRecovered), then 0. 
+type recoverOnBackstopCoverage struct { + mu sync.Mutex + recovered bool +} + +func (c *recoverOnBackstopCoverage) markRecovered() { + c.mu.Lock() + defer c.mu.Unlock() + c.recovered = true +} + +func (c *recoverOnBackstopCoverage) MissingCount(context.Context, int64) (int64, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.recovered { + return 0, nil + } + return 1, nil +} + +// fakeCoverage satisfies EmbedCoverage for the activation-gate tests: +// it reports a fixed number of live messages still needing embedding. +type fakeCoverage struct { + missing int64 +} + +func (c *fakeCoverage) MissingCount(_ context.Context, _ int64) (int64, error) { + return c.missing, nil } // slowRunner blocks RunOnce on `release` so tests can control when it @@ -859,6 +985,10 @@ func (r *slowRunner) RunOnce(context.Context, vector.GenerationID) (embed.RunRes return embed.RunResult{}, nil } +func (r *slowRunner) RunBackstop(context.Context, vector.GenerationID) (embed.RunResult, error) { + return embed.RunResult{}, nil +} + func (r *slowRunner) calls() int { r.mu.Lock() defer r.mu.Unlock() diff --git a/internal/store/dialect.go b/internal/store/dialect.go index a22e02e30..6b90adafb 100644 --- a/internal/store/dialect.go +++ b/internal/store/dialect.go @@ -129,6 +129,19 @@ type Dialect interface { // 30s timeout on a large archive (finding S1). EnsureFTSIndex(q querier) error + // EnsureTriggers idempotently creates the database-maintained triggers + // that bump messages.last_modified on any change to a message or its + // body row. Called by InitSchema after LegacyColumnMigrations (which add + // the last_modified column on legacy DBs), so the column is guaranteed + // present. SQLite is a no-op: its triggers are `CREATE TRIGGER IF NOT + // EXISTS` in schema.sql, re-exec'd idempotently by InitSchema. PostgreSQL + // creates them here because CREATE TRIGGER is not idempotent before PG14, + // so the impl wraps each in `DROP TRIGGER IF EXISTS ...; CREATE TRIGGER`. 
+ // + // Takes a querier (not *sql.DB) so InitSchema can run it on the + // maintenance transaction (consistent with EnsureFTSIndex). + EnsureTriggers(q querier) error + // LegacyColumnMigrations returns ALTER TABLE ADD COLUMN statements to // bring older databases up to date with schema columns added over time. // Both dialects return the same logical list, translated to the diff --git a/internal/store/dialect_pg.go b/internal/store/dialect_pg.go index 3d87c1562..a0738cd78 100644 --- a/internal/store/dialect_pg.go +++ b/internal/store/dialect_pg.go @@ -320,6 +320,16 @@ func (d *PostgreSQLDialect) LegacyColumnMigrations() []ColumnMigration { // column and FTS stays unavailable. Its GIN index is created // separately by EnsureFTSIndex AFTER this migration runs. [cr2-10] {`ALTER TABLE messages ADD COLUMN IF NOT EXISTS search_fts TSVECTOR`, "search_fts"}, + // embed_gen: per-message vector-embedding watermark. NULL default + // means every legacy row reads as "needs embedding", which is + // correct — the scan-and-fill worker (and backstop) will embed and + // stamp them. No backfill. + {`ALTER TABLE messages ADD COLUMN IF NOT EXISTS embed_gen BIGINT`, "embed_gen"}, + // last_modified: row-level last-modified watermark, the embed + // worker's optimistic-CAS token. Existing rows get the default + // (CURRENT_TIMESTAMP at the time the column is added); the triggers + // created by EnsureTriggers keep it current thereafter. + {`ALTER TABLE messages ADD COLUMN IF NOT EXISTS last_modified TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP`, "last_modified"}, } } @@ -347,6 +357,54 @@ func (d *PostgreSQLDialect) EnsureFTSIndex(q querier) error { return nil } +// EnsureTriggers creates the last_modified maintenance triggers idempotently. +// Two triggers feed messages.last_modified: +// +// - trg_messages_last_modified (BEFORE UPDATE on messages): sets +// NEW.last_modified in-row. BEFORE → no secondary write → no recursion. 
+// The WHEN guard (OLD.last_modified IS NOT DISTINCT FROM NEW.last_modified) +// yields to an explicit last_modified write in the UPDATE rather than +// overriding it, mirroring the SQLite trigger's guard. +// - trg_message_bodies_last_modified (AFTER INSERT OR UPDATE on +// message_bodies): bumps the parent message's last_modified so body +// edits move the worker's CAS token too. +// +// CREATE TRIGGER is not idempotent before PG14, so each trigger is dropped +// (IF EXISTS) and recreated; the functions use CREATE OR REPLACE. Re-running +// InitSchema is therefore safe. Runs on the querier so InitSchema can route +// it through the maintenance transaction (consistent with EnsureFTSIndex). +func (d *PostgreSQLDialect) EnsureTriggers(q querier) error { + stmts := []string{ + `CREATE OR REPLACE FUNCTION set_messages_last_modified() RETURNS trigger AS $$ + BEGIN + NEW.last_modified := CURRENT_TIMESTAMP; + RETURN NEW; + END; + $$ LANGUAGE plpgsql`, + `DROP TRIGGER IF EXISTS trg_messages_last_modified ON messages`, + `CREATE TRIGGER trg_messages_last_modified + BEFORE UPDATE ON messages FOR EACH ROW + WHEN (OLD.last_modified IS NOT DISTINCT FROM NEW.last_modified) + EXECUTE FUNCTION set_messages_last_modified()`, + `CREATE OR REPLACE FUNCTION bump_message_last_modified() RETURNS trigger AS $$ + BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; + RETURN NEW; + END; + $$ LANGUAGE plpgsql`, + `DROP TRIGGER IF EXISTS trg_message_bodies_last_modified ON message_bodies`, + `CREATE TRIGGER trg_message_bodies_last_modified + AFTER INSERT OR UPDATE ON message_bodies FOR EACH ROW + EXECUTE FUNCTION bump_message_last_modified()`, + } + for _, stmt := range stmts { + if _, err := q.Exec(stmt); err != nil { + return fmt.Errorf("ensure last_modified triggers: %w", err) + } + } + return nil +} + // DatabaseSize queries pg_database_size() for the current database. 
func (d *PostgreSQLDialect) DatabaseSize(db *sql.DB, _ string) (int64, error) { var size int64 @@ -374,7 +432,7 @@ func (d *PostgreSQLDialect) CheckpointWAL(db *sql.DB) error { return nil } // SchemaStaleCheck returns the SQL to check whether migrations are needed. // PostgreSQL uses information_schema instead of pragma_table_info. func (d *PostgreSQLDialect) SchemaStaleCheck() string { - return postgresColumnExistsSQL("conversations", "conversation_type") + return postgresColumnExistsSQL("messages", "embed_gen") } // IsDuplicateColumnError returns true if the error is a "column already exists" error. diff --git a/internal/store/dialect_sqlite.go b/internal/store/dialect_sqlite.go index c5d5b922d..122d08a2e 100644 --- a/internal/store/dialect_sqlite.go +++ b/internal/store/dialect_sqlite.go @@ -225,6 +225,11 @@ func (d *SQLiteDialect) FTSRebuildSchema(q querier) error { // not a post-migration step (cr2-10). func (d *SQLiteDialect) EnsureFTSIndex(querier) error { return nil } +// EnsureTriggers is a no-op for SQLite: the last_modified triggers are +// `CREATE TRIGGER IF NOT EXISTS` in schema.sql, which InitSchema re-execs +// idempotently on every open (fresh and existing DBs alike). +func (d *SQLiteDialect) EnsureTriggers(querier) error { return nil } + // LegacyColumnMigrations returns the ALTER TABLE ADD COLUMN statements that // bring older SQLite databases up to the current schema. IsDuplicateColumnError // silences these when the column already exists (idempotent migrations). @@ -243,6 +248,21 @@ func (d *SQLiteDialect) LegacyColumnMigrations() []ColumnMigration { {`ALTER TABLE messages ADD COLUMN delete_batch_id TEXT`, "delete_batch_id"}, {`ALTER TABLE conversations ADD COLUMN title TEXT`, "title"}, {`ALTER TABLE conversations ADD COLUMN conversation_type TEXT NOT NULL DEFAULT 'email_thread'`, "conversation_type"}, + // embed_gen: per-message vector-embedding watermark. 
NULL default + // means every legacy row reads as "needs embedding", which is + // correct — the scan-and-fill worker (and backstop) will embed and + // stamp them. No backfill. + {`ALTER TABLE messages ADD COLUMN embed_gen INTEGER`, "embed_gen"}, + // last_modified: row-level last-modified watermark, the embed + // worker's optimistic-CAS token. SQLite rejects a non-constant + // DEFAULT in ADD COLUMN ("Cannot add a column with non-constant + // default"), so the column is added with no default (existing rows + // get NULL) and InitSchema's backfillLastModified follows up with a + // one-shot `UPDATE ... SET last_modified = CURRENT_TIMESTAMP WHERE + // last_modified IS NULL` so the CAS token is a comparable value + // (NULL would never match `last_modified = ?`). Fresh DBs keep the + // CREATE TABLE default in schema.sql, which IS allowed. + {`ALTER TABLE messages ADD COLUMN last_modified DATETIME`, "last_modified"}, } } @@ -285,7 +305,7 @@ func (d *SQLiteDialect) CheckpointWAL(db *sql.DB) error { // SchemaStaleCheck returns the SQL to check whether the most recent migration column exists. func (d *SQLiteDialect) SchemaStaleCheck() string { - return "SELECT COUNT(*) FROM pragma_table_info('conversations') WHERE name = 'conversation_type'" + return "SELECT COUNT(*) FROM pragma_table_info('messages') WHERE name = 'embed_gen'" } // IsDuplicateColumnError returns true if the error is "duplicate column name" from ALTER TABLE. diff --git a/internal/store/embed_gen.go b/internal/store/embed_gen.go new file mode 100644 index 000000000..e7517283a --- /dev/null +++ b/internal/store/embed_gen.go @@ -0,0 +1,287 @@ +package store + +import ( + "context" + "fmt" + "strings" +) + +// embedGenStampChunkRows caps how many message ids go into a single +// SetEmbedGen UPDATE. 
Each statement binds one placeholder per id plus +// one for the target generation, so 500 ids = 501 bound parameters — +// comfortably under SQLite's historical 999 (and the store's 900-param +// convention; see insertInChunks) and PostgreSQL's 65,535. Mirrors the +// store's existing chunking discipline so an oversized embed batch never +// blows the driver bind ceiling. A var (not const) only so tests can +// lower it to exercise the chunk boundary; production never reassigns it. +var embedGenStampChunkRows = 500 + +// ScanForEmbedding returns up to limit live message ids that still need +// embedding for the target generation — i.e. rows whose embed_gen does +// not already equal target — scanning forward from afterID in id order. +// +// The portable predicate (embed_gen IS NULL OR embed_gen <> ?) covers +// both never-embedded rows (NULL) and rows stamped for a different +// generation, and avoids any IS DISTINCT FROM driver-version doubt. The +// forward bound (id > afterID) lets the caller resume from a per-gen +// watermark; pass 0 for a full scan (the backstop). Results are ordered +// by id so the caller can advance the watermark to the batch's max id. +// +// This runs against the MAIN db (messages + embed_gen live there on both +// backends). On SQLite the embeddings themselves live in vectors.db, so +// this find-work query and the SetEmbedGen stamp cannot share a tx with +// the embeddings upsert — the worker orders the steps (upsert, then +// stamp) and relies on idempotency, see internal/vector/embed/worker.go. +func (s *Store) ScanForEmbedding(ctx context.Context, target int64, afterID int64, limit int) ([]int64, error) { + if limit <= 0 { + return nil, nil + } + q := `SELECT id FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND ` + LiveMessagesWhere("", true) + ` + AND id > ? 
+ ORDER BY id + LIMIT ?` + rows, err := s.db.QueryContext(ctx, q, target, afterID, limit) + if err != nil { + return nil, fmt.Errorf("scan for embedding: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, fmt.Errorf("scan message id: %w", err) + } + out = append(out, id) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("iterate message ids: %w", err) + } + return out, nil +} + +// SetEmbedGen stamps embed_gen = target on the given message ids, +// marking them covered for that generation. Used by the embed worker +// after a successful upsert (the rows now have embeddings for target) or +// to skip-mark rows that are missing/empty and will never produce an +// embedding. Idempotent: re-stamping an already-stamped row is a no-op. +// +// The ids are processed in chunks (see embedGenStampChunkRows) to stay +// under the driver's bind limit; chunks are not wrapped in a single +// transaction because each chunk's UPDATE is independently idempotent and +// the cross-DB worker contract already tolerates a partial stamp (the +// next scan re-finds any unstamped rows and re-runs an idempotent batch). +func (s *Store) SetEmbedGen(ctx context.Context, ids []int64, target int64) error { + if len(ids) == 0 { + return nil + } + for start := 0; start < len(ids); start += embedGenStampChunkRows { + end := min(start+embedGenStampChunkRows, len(ids)) + chunk := ids[start:end] + + placeholders := make([]string, len(chunk)) + args := make([]any, 0, 1+len(chunk)) + args = append(args, target) + for i, id := range chunk { + placeholders[i] = "?" + args = append(args, id) + } + q := `UPDATE messages SET embed_gen = ? 
WHERE id IN (` + + strings.Join(placeholders, ",") + `)` + if _, err := s.db.ExecContext(ctx, q, args...); err != nil { + return fmt.Errorf("set embed_gen: %w", err) + } + } + return nil +} + +// EmbedGenStamp pairs a message id with the last_modified token captured +// when the worker read that message's content. SetEmbedGenIfUnchanged +// stamps embed_gen only while last_modified still equals this value. +// +// LastModified is carried as an opaque `any` so the worker can round-trip +// whatever the driver scanned without the store needing a backend-specific +// type: on SQLite the worker scans CAST(last_modified AS TEXT) into a string +// (defeating go-sqlite3's DATETIME→time.Time coercion, which would otherwise +// reformat the value and break equality on the round-trip) and binds the same +// string back; on PostgreSQL it scans a time.Time and binds the same +// time.Time back. The WHERE comparison runs entirely server-side against the +// stored value. +type EmbedGenStamp struct { + ID int64 + LastModified any +} + +// SetEmbedGenIfUnchanged stamps embed_gen = target on each message, but +// ONLY if its last_modified still equals the value captured at content-read +// time (optimistic CAS). A message whose last_modified changed between read +// and stamp — e.g. repair-encoding (or any concurrent content edit) rewrote +// its text, which the DB triggers reflected by bumping last_modified — is +// NOT stamped (its UPDATE matches 0 rows); it stays "needs embedding" and is +// re-found and re-embedded with the corrected content on the next scan. This +// closes the read→stamp race that an unconditional stamp would lose by +// marking the row embedded-with-stale-content. +// +// The worker's own stamp UPDATE bumps last_modified on BOTH backends via +// their triggers: this UPDATE sets only embed_gen (not last_modified), so the +// SQLite AFTER-UPDATE trigger fires (its WHEN OLD.last_modified = NEW... 
holds) +// and re-stamps last_modified, and the PG BEFORE-UPDATE trigger fires too (its +// WHEN OLD.last_modified IS NOT DISTINCT FROM NEW... holds) and sets +// last_modified = CURRENT_TIMESTAMP. The WHERE comparison matches against the +// PRE-trigger value, so a legitimate stamp still affects exactly 1 row (it is +// NOT a CAS miss); only a value that changed BEFORE this UPDATE ran blocks it. +// The post-stamp bump is correctness-neutral: once embed_gen = target the row +// is terminal/covered and excluded by the scan predicate, so no later scan +// re-finds it on account of the bumped last_modified. +// +// Each row is a separate UPDATE because every message carries a distinct +// last_modified token. Statements are not wrapped in one transaction: each is +// independently correct, and the cross-DB worker contract already tolerates a +// partial stamp (the next scan re-finds any unstamped row and re-runs an +// idempotent batch). Used by the embed worker's content read→stamp path; the +// backfill path keeps the plain SetEmbedGen (it has no read→stamp window). +// +// Returns the ids whose per-row UPDATE matched 0 rows — the CAS MISSES. A miss +// means last_modified moved between the worker's content read and this stamp +// (a concurrent repair/edit bumped it via the DB triggers), so the row was NOT +// stamped and stays "needs embedding". The worker surfaces these (logs them and +// excludes them from its success accounting) but does NOT hold the watermark +// back: a missed row's last_modified moved (and its embed_gen may be NULL), so +// the auto-backstop's watermark-ignoring full scan re-finds and re-embeds it +// with the corrected content. A real driver error still aborts (returns err). +// +// ACCEPTED RESIDUAL — 1-second CAS resolution (single-user). The CAS token is +// last_modified, defaulted/bumped by CURRENT_TIMESTAMP (schema.sql:310 and the +// AFTER/BEFORE-UPDATE triggers), which has 1-SECOND resolution on both backends. 
+// So a content edit that lands in the SAME WHOLE SECOND as the worker's content +// read leaves last_modified textually UNCHANGED — this CAS then matches and +// stamps embed_gen=target on an embedding built from the now-stale text, a +// missed staleness the sub-second window cannot detect. This is an accepted +// residual for the single-user tool (an edit and an embed of the same message in +// the same second is rare) and is NOT closed by schema/behavior change. It +// self-recovers: the next edit to that message (repair-encoding or any sync +// update) bumps last_modified and clears embed_gen (repair) / re-finds it, and a +// full rebuild or the auto-backstop re-embeds it regardless. See +// docs/usage/vector-search.md ("CAS resolution"). +func (s *Store) SetEmbedGenIfUnchanged(ctx context.Context, items []EmbedGenStamp, target int64) (missed []int64, err error) { + for _, it := range items { + q := `UPDATE messages SET embed_gen = ? WHERE id = ? AND last_modified = ?` + res, err := s.db.ExecContext(ctx, q, target, it.ID, it.LastModified) + if err != nil { + return missed, fmt.Errorf("set embed_gen if unchanged (id=%d): %w", it.ID, err) + } + n, err := res.RowsAffected() + if err != nil { + return missed, fmt.Errorf("rows affected (id=%d): %w", it.ID, err) + } + if n == 0 { + missed = append(missed, it.ID) + } + } + return missed, nil +} + +// ResetEmbedGen clears embed_gen (sets it back to NULL) on the given +// message ids, marking them as needing embedding again. Used by +// repair-encoding after rewriting a message's text so the scan-and-fill +// worker re-embeds it with the corrected content on its next run. Chunked +// to stay under the driver's bind limit; idempotent. 
+func (s *Store) ResetEmbedGen(ctx context.Context, ids []int64) error { + if len(ids) == 0 { + return nil + } + for start := 0; start < len(ids); start += embedGenStampChunkRows { + end := min(start+embedGenStampChunkRows, len(ids)) + chunk := ids[start:end] + + placeholders := make([]string, len(chunk)) + args := make([]any, 0, len(chunk)) + for i, id := range chunk { + placeholders[i] = "?" + args = append(args, id) + } + q := `UPDATE messages SET embed_gen = NULL WHERE id IN (` + + strings.Join(placeholders, ",") + `)` + if _, err := s.db.ExecContext(ctx, q, args...); err != nil { + return fmt.Errorf("reset embed_gen: %w", err) + } + } + return nil +} + +// CoverageCounts reports embedding coverage for activeGen, computed from +// the MAIN db (messages + embed_gen) so it is a single-DB query on both +// backends and needs no access to the embeddings store. +// +// - live: total live messages (the embedding universe). +// - stamped: live messages stamped embed_gen = activeGen. This is the +// 2nd return value (historically named "embedded"). It counts every +// row the worker has marked DONE for the generation, INCLUDING blanks — +// messages with no extractable body that were stamped terminal but +// never produced a vector. It is therefore an UPPER bound on the true +// embedded count; the embedded/blank split is resolved at the display +// layer via the backend's EmbeddedMessageCount (the embeddings table +// lives in a separate DB on SQLite, so this single-DB query cannot do +// it). blank = stamped - embedded. +// - blank: the 3rd return value is always 0 here — it cannot be +// computed without the embeddings table. The real blank count is +// derived by the caller as stamped - backend.EmbeddedMessageCount(gen) +// (see cmd/msgvault/cmd/embeddings_manage.go). Kept in the signature +// so callers that only need missing (the scheduler/CLI activation gate) +// do not have to change. 
+// - missing: live messages still needing work for activeGen +// (embed_gen IS NULL OR embed_gen <> activeGen). live = stamped + +// missing exactly. With the display-layer split: live = embedded + +// blank + missing. +// +// activeGen == 0 means "no active/target generation"; then everything +// live is missing and stamped is 0. +func (s *Store) CoverageCounts(ctx context.Context, activeGen int64) (live, stamped, blank, missing int64, err error) { + live, err = s.countLiveMessages(ctx) + if err != nil { + return 0, 0, 0, 0, err + } + if activeGen != 0 { + q := `SELECT COUNT(*) FROM messages + WHERE embed_gen = ? AND ` + LiveMessagesWhere("", true) + if err := s.db.QueryRowContext(ctx, q, activeGen).Scan(&stamped); err != nil { + return 0, 0, 0, 0, fmt.Errorf("count stamped: %w", err) + } + } + missing = max(live-stamped, 0) + return live, stamped, 0, missing, nil +} + +// MissingCount returns just the "missing" coverage figure for activeGen +// (live messages still needing work: embed_gen IS NULL OR embed_gen <> +// activeGen). It is a thin accessor for the scheduler/CLI activation +// gates, which only consult the missing count; missing = live - stamped. +func (s *Store) MissingCount(ctx context.Context, activeGen int64) (int64, error) { + live, err := s.countLiveMessages(ctx) + if err != nil { + return 0, err + } + if activeGen == 0 { + return live, nil + } + var stamped int64 + q := `SELECT COUNT(*) FROM messages + WHERE embed_gen = ? AND ` + LiveMessagesWhere("", true) + if err := s.db.QueryRowContext(ctx, q, activeGen).Scan(&stamped); err != nil { + return 0, fmt.Errorf("count stamped: %w", err) + } + return max(live-stamped, 0), nil +} + +// countLiveMessages returns the total live-message count. Shared by +// CoverageCounts and MissingCount; kept separate so the live-predicate stays in one place.
+func (s *Store) countLiveMessages(ctx context.Context) (int64, error) { + var n int64 + q := `SELECT COUNT(*) FROM messages WHERE ` + LiveMessagesWhere("", true) + if err := s.db.QueryRowContext(ctx, q).Scan(&n); err != nil { + return 0, fmt.Errorf("count live messages: %w", err) + } + return n, nil +} diff --git a/internal/store/last_modified_test.go b/internal/store/last_modified_test.go new file mode 100644 index 000000000..96b85e170 --- /dev/null +++ b/internal/store/last_modified_test.go @@ -0,0 +1,255 @@ +package store_test + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" +) + +// seedMessageForLM creates a source + conversation + one message (with a body +// row) and returns the message id. Shared by the last_modified trigger tests. +func seedMessageForLM(t *testing.T, st *store.Store) int64 { + t.Helper() + src, err := st.GetOrCreateSource("gmail", "alice@example.com") + require.NoError(t, err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(src.ID, "conv-lm", "email_thread", "Subject") + require.NoError(t, err, "EnsureConversationWithType") + id, err := st.UpsertMessage(&store.Message{ + SourceID: src.ID, + SourceMessageID: "msg-lm", + ConversationID: convID, + MessageType: "email", + Subject: sql.NullString{String: "original subject", Valid: true}, + }) + require.NoError(t, err, "UpsertMessage") + require.NoError(t, st.UpsertMessageBody(id, + sql.NullString{String: "original body", Valid: true}, + sql.NullString{}), "UpsertMessageBody") + return id +} + +// baselineLM stamps a fixed, far-past last_modified on the message so a +// subsequent trigger-driven bump produces a different, easily-asserted value +// without needing the test to sleep for the timestamp resolution to tick. 
The +// explicit write is itself preserved (not re-bumped) because the trigger's +// WHEN guard only fires when OLD.last_modified == NEW.last_modified, and here +// they differ. +func baselineLM(t *testing.T, st *store.Store, id int64) string { + t.Helper() + const past = "2000-01-01 00:00:00+00" + _, err := st.DB().Exec( + st.Rebind(`UPDATE messages SET last_modified = ? WHERE id = ?`), past, id) + require.NoError(t, err, "set baseline last_modified") + return readLM(t, st, id) +} + +// readLM reads last_modified as a comparable string on both backends. On +// SQLite it CASTs to TEXT to defeat go-sqlite3's DATETIME→time.Time coercion +// (the same trick the embed worker uses); on PostgreSQL it casts to text in +// SQL so the comparison is a plain string on either backend. +func readLM(t *testing.T, st *store.Store, id int64) string { + t.Helper() + expr := "CAST(last_modified AS TEXT)" + var s string + require.NoError(t, st.DB().QueryRow( + st.Rebind(`SELECT `+expr+` FROM messages WHERE id = ?`), id).Scan(&s), + "read last_modified") + return s +} + +// TestLastModified_MessageUpdateBumps verifies any UPDATE to a message row +// bumps last_modified via the trigger. +func TestLastModified_MessageUpdateBumps(t *testing.T) { + st := testutil.NewTestStore(t) + id := seedMessageForLM(t, st) + base := baselineLM(t, st, id) + + // A content UPDATE that does NOT touch last_modified must trigger a bump. + _, err := st.DB().Exec( + st.Rebind(`UPDATE messages SET subject = ? WHERE id = ?`), "changed subject", id) + require.NoError(t, err, "update subject") + + got := readLM(t, st, id) + assert.NotEqual(t, base, got, "message UPDATE must bump last_modified") +} + +// TestLastModified_EmbedGenUpdateBumps verifies even an embed_gen-only UPDATE +// bumps last_modified — expected/harmless (the worker's CAS WHERE matches the +// PRE-trigger value, so its own stamp still succeeds; see +// SetEmbedGenIfUnchanged). 
+func TestLastModified_EmbedGenUpdateBumps(t *testing.T) { + st := testutil.NewTestStore(t) + id := seedMessageForLM(t, st) + base := baselineLM(t, st, id) + + _, err := st.DB().Exec( + st.Rebind(`UPDATE messages SET embed_gen = ? WHERE id = ?`), int64(7), id) + require.NoError(t, err, "update embed_gen") + + got := readLM(t, st, id) + assert.NotEqual(t, base, got, "embed_gen-only UPDATE bumps last_modified (expected)") +} + +// TestLastModified_BodyUpdateBumpsParent verifies an UPDATE to message_bodies +// bumps the PARENT message's last_modified (the repair-encoding rewrite path). +func TestLastModified_BodyUpdateBumpsParent(t *testing.T) { + st := testutil.NewTestStore(t) + id := seedMessageForLM(t, st) + base := baselineLM(t, st, id) + + _, err := st.DB().Exec( + st.Rebind(`UPDATE message_bodies SET body_text = ? WHERE message_id = ?`), + "corrected body", id) + require.NoError(t, err, "update body") + + got := readLM(t, st, id) + assert.NotEqual(t, base, got, "message_bodies UPDATE must bump parent last_modified") +} + +// TestLastModified_BodyInsertBumpsParent verifies an INSERT into +// message_bodies bumps the parent message's last_modified. 
+func TestLastModified_BodyInsertBumpsParent(t *testing.T) { + require := require.New(t) + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("gmail", "bob@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(src.ID, "conv-lm2", "email_thread", "Subject") + require.NoError(err, "EnsureConversationWithType") + id, err := st.UpsertMessage(&store.Message{ + SourceID: src.ID, + SourceMessageID: "msg-lm2", + ConversationID: convID, + MessageType: "email", + Subject: sql.NullString{String: "subject", Valid: true}, + }) + require.NoError(err, "UpsertMessage") + base := baselineLM(t, st, id) + + require.NoError(st.UpsertMessageBody(id, + sql.NullString{String: "first body", Valid: true}, + sql.NullString{}), "insert body") + + got := readLM(t, st, id) + assert.NotEqual(t, base, got, "message_bodies INSERT must bump parent last_modified") +} + +// TestLastModified_UpgradePathMissingColumn covers the universal SQLite +// upgrade path for the last_modified watermark: a pre-existing archive whose +// messages table predates the column. On such a DB, InitSchema runs schema.sql +// FIRST — which executes `CREATE TRIGGER IF NOT EXISTS trg_messages_last_modified`, +// a trigger that REFERENCES last_modified — BEFORE LegacyColumnMigrations adds +// the column. This only works because SQLite resolves a trigger body's column +// references lazily (at fire time, not create time). After the column is added, +// InitSchema's one-shot backfill stamps the pre-existing NULL rows. +// +// Every existing SQLite user hits this exact path on upgrade, yet the other +// last_modified trigger tests all use a fresh DB where the column already +// exists when the trigger is created — so none of them exercise the +// trigger-before-column ordering. 
This test reconstructs the precondition by +// dropping the column (and the triggers that reference it, which SQLite would +// otherwise refuse to leave dangling) from a real schema, then re-runs the +// production InitSchema and asserts (a) it succeeds, (b) the column is added +// and backfilled to a non-NULL value for the pre-existing rows, and (c) the +// re-created trigger then functions as the CAS watermark. +// +// SQLite-only: it relies on ALTER TABLE DROP COLUMN and SQLite's deferred +// trigger column resolution. PostgreSQL's ADD COLUMN ... DEFAULT +// CURRENT_TIMESTAMP backfills automatically and its triggers are created +// after the column, so the upgrade ordering risk does not apply there. +func TestLastModified_UpgradePathMissingColumn(t *testing.T) { + testutil.SkipIfPostgres(t, "SQLite ALTER TABLE DROP COLUMN + deferred trigger column resolution") + require := require.New(t) + assert := assert.New(t) + + dbPath := filepath.Join(t.TempDir(), "upgrade.db") + + // 1. Build a real schema, seed two messages (with bodies), then strip the + // last_modified column to reproduce a pre-last_modified archive. 
+ seed, err := store.OpenForTest(dbPath) + require.NoError(err, "open seed store") + require.NoError(seed.InitSchema(), "seed InitSchema") + _, err = seed.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'alice@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type, subject) +VALUES (1, 1, 1, 'm1', 'email', 'original one'), + (2, 1, 1, 'm2', 'email', 'original two'); +INSERT INTO message_bodies (message_id, body_text) VALUES (1, 'body one'), (2, 'body two'); +`) + require.NoError(err, "seed rows") + + // SQLite refuses to DROP a column while a trigger references it, so drop the + // three last_modified triggers first; the resulting shape (messages without + // last_modified, no last_modified triggers) is exactly what an archive built + // before the column looks like. + for _, trg := range []string{ + "trg_messages_last_modified", + "trg_message_bodies_last_modified_upd", + "trg_message_bodies_last_modified_ins", + } { + _, err = seed.DB().Exec(`DROP TRIGGER IF EXISTS ` + trg) + require.NoErrorf(err, "drop trigger %s", trg) + } + _, err = seed.DB().Exec(`ALTER TABLE messages DROP COLUMN last_modified`) + require.NoError(err, "drop last_modified to simulate pre-upgrade schema") + + var preCols int + require.NoError(seed.DB().QueryRow( + `SELECT COUNT(*) FROM pragma_table_info('messages') WHERE name = 'last_modified'`).Scan(&preCols), + "check column dropped") + require.Equal(0, preCols, "precondition: messages must lack last_modified before upgrade") + require.NoError(seed.Close(), "close seed store") + + // 2. Reopen and run the PRODUCTION upgrade entry point. (a) It must succeed: + // schema.sql creates trg_messages_last_modified (referencing last_modified) + // before LegacyColumnMigrations adds the column. 
+ st, err := store.OpenForTest(dbPath) + require.NoError(err, "reopen upgraded store") + t.Cleanup(func() { _ = st.Close() }) + require.NoError(st.InitSchema(), + "InitSchema must succeed on a messages table lacking last_modified") + + // (b) The column now exists and the pre-existing rows were backfilled to a + // non-NULL value (a NULL CAS token would loop "needs embedding" forever). + var postCols int + require.NoError(st.DB().QueryRow( + `SELECT COUNT(*) FROM pragma_table_info('messages') WHERE name = 'last_modified'`).Scan(&postCols), + "check column added") + assert.Equal(1, postCols, "InitSchema must add last_modified") + + var nullCount int + require.NoError(st.DB().QueryRow( + `SELECT COUNT(*) FROM messages WHERE last_modified IS NULL`).Scan(&nullCount), + "count NULL last_modified") + assert.Equal(0, nullCount, "backfill must populate last_modified for pre-existing rows") + + // (c) The re-created trigger functions: a content UPDATE bumps last_modified. + // Baseline to a fixed far-past value so the bump is an unambiguous change. + base := baselineLM(t, st, 1) + _, err = st.DB().Exec( + st.Rebind(`UPDATE messages SET subject = ? WHERE id = ?`), "changed one", int64(1)) + require.NoError(err, "update subject after upgrade") + got := readLM(t, st, 1) + assert.NotEqual(base, got, + "re-created trigger must bump last_modified on UPDATE after upgrade") +} + +// TestLastModified_NoInfiniteRecursion is a liveness check: a message UPDATE +// completes (the trigger's own UPDATE does not re-fire forever). If recursion +// were unbounded the Exec would error or hang; we simply require it returns. +func TestLastModified_NoInfiniteRecursion(t *testing.T) { + st := testutil.NewTestStore(t) + id := seedMessageForLM(t, st) + ctx := context.Background() + for range 5 { + _, err := st.DB().ExecContext(ctx, + st.Rebind(`UPDATE messages SET snippet = ? 
WHERE id = ?`), "s", id) + require.NoError(t, err, "repeated update must not recurse/hang") + } +} diff --git a/internal/store/messages.go b/internal/store/messages.go index c0f070c16..60e6aba4f 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -188,6 +188,10 @@ func upsertMessageSQL(now string) string { has_attachments, attachment_count, archived_at ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, %s) ON CONFLICT(source_id, source_message_id) DO UPDATE SET + embed_gen = CASE + WHEN COALESCE(messages.subject, '') <> COALESCE(excluded.subject, '') THEN NULL + ELSE messages.embed_gen + END, conversation_id = excluded.conversation_id, rfc822_message_id = excluded.rfc822_message_id, sent_at = excluded.sent_at, @@ -247,16 +251,55 @@ func (s *Store) UpsertMessageBody(messageID int64, bodyText, bodyHTML sql.NullSt } func upsertMessageBody(q querier, messageID int64, bodyText, bodyHTML sql.NullString) error { - _, err := q.Exec(` + bodyChanged, err := messageBodyChanged(q, messageID, bodyText, bodyHTML) + if err != nil { + return err + } + _, err = q.Exec(` INSERT INTO message_bodies (message_id, body_text, body_html) VALUES (?, ?, ?) ON CONFLICT(message_id) DO UPDATE SET body_text = excluded.body_text, body_html = excluded.body_html `, messageID, bodyText, bodyHTML) + if err != nil { + return err + } + if !bodyChanged { + return nil + } + _, err = q.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = ? AND embed_gen IS NOT NULL`, messageID) return err } +func messageBodyChanged(q querier, messageID int64, bodyText, bodyHTML sql.NullString) (bool, error) { + var oldText, oldHTML sql.NullString + err := q.QueryRow(` + SELECT body_text, body_html FROM message_bodies WHERE message_id = ? 
+ `, messageID).Scan(&oldText, &oldHTML) + if errors.Is(err, sql.ErrNoRows) { + return embeddingBodyValue(bodyText, bodyHTML) != "", nil + } + if err != nil { + return false, err + } + return embeddingBodyValue(oldText, oldHTML) != embeddingBodyValue(bodyText, bodyHTML), nil +} + +func embeddingBodyValue(bodyText, bodyHTML sql.NullString) string { + if v := nullStringValue(bodyText); v != "" { + return v + } + return mime.StripHTML(nullStringValue(bodyHTML)) +} + +func nullStringValue(ns sql.NullString) string { + if !ns.Valid { + return "" + } + return ns.String +} + // UpsertMessageRaw stores the compressed raw MIME data for a message. func (s *Store) UpsertMessageRaw(messageID int64, rawData []byte) error { return upsertMessageRaw(s.db, messageID, rawData) diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go index b386ddd70..7aa3bf985 100644 --- a/internal/store/messages_test.go +++ b/internal/store/messages_test.go @@ -100,6 +100,69 @@ func TestRecomputeConversationStats(t *testing.T) { assert.Equal(3, count, "idempotency message_count") } +// TestEmbedGen_OrphanImpossibleAndCoverage pins the scan-and-fill +// embed_gen contract: +// - a freshly-upserted message has embed_gen NULL (column default), so +// CoverageCounts reports it as missing for any generation — the +// scan-and-fill worker picks it up with no enqueue step (orphan rows +// are impossible). +// - SetEmbedGen stamps it covered; CoverageCounts then reports it +// embedded. +// - a subsequent UpsertMessage (ON CONFLICT DO UPDATE) clears embed_gen +// when the embeddable subject text changes. 
+func TestEmbedGen_OrphanImpossibleAndCoverage(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + + source, err := st.GetOrCreateSource("gmail", "me@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "conv-1", "email_thread", "Subject") + require.NoError(err, "EnsureConversationWithType") + + msg := &store.Message{ + SourceID: source.ID, + SourceMessageID: "m1", + ConversationID: convID, + MessageType: "email", + Subject: sql.NullString{String: "hello", Valid: true}, + } + id, err := st.UpsertMessage(msg) + require.NoError(err, "UpsertMessage") + + const gen = int64(7) + ctx := t.Context() + + // New row: embed_gen NULL by default -> reported missing for any gen. + var embedGen sql.NullInt64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT embed_gen FROM messages WHERE id = ?`), id).Scan(&embedGen)) + assert.False(embedGen.Valid, "new message must have NULL embed_gen (no enqueue, no orphan)") + + live, embedded, _, missing, err := st.CoverageCounts(ctx, gen) + require.NoError(err, "CoverageCounts (before stamp)") + assert.Equal(int64(1), live, "one live message") + assert.Equal(int64(0), embedded, "none embedded yet") + assert.Equal(int64(1), missing, "the new message is missing") + + // Stamp it covered. + require.NoError(st.SetEmbedGen(ctx, []int64{id}, gen), "SetEmbedGen") + live, embedded, _, missing, err = st.CoverageCounts(ctx, gen) + require.NoError(err, "CoverageCounts (after stamp)") + assert.Equal(int64(1), live, "still one live message") + assert.Equal(int64(1), embedded, "now embedded") + assert.Equal(int64(0), missing, "nothing missing") + + // Re-upsert the same message with changed embedding input: embed_gen must + // be cleared so the scan-and-fill worker re-embeds it. 
+ msg.Subject = sql.NullString{String: "hello (edited)", Valid: true} + _, err = st.UpsertMessage(msg) + require.NoError(err, "re-UpsertMessage") + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT embed_gen FROM messages WHERE id = ?`), id).Scan(&embedGen)) + assert.False(embedGen.Valid, "subject change must clear embed_gen") +} + func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) diff --git a/internal/store/schema.sql b/internal/store/schema.sql index 68921674c..60969efea 100644 --- a/internal/store/schema.sql +++ b/internal/store/schema.sql @@ -155,9 +155,26 @@ CREATE TABLE IF NOT EXISTS messages ( archived_at DATETIME DEFAULT CURRENT_TIMESTAMP, indexing_version INTEGER DEFAULT 1, + -- Row-level last-modified watermark, maintained ENTIRELY by the + -- database (triggers below), never by application write paths. Used by + -- the embed worker as an optimistic-CAS token: it captures this value + -- when it reads a message's content and stamps embed_gen only if the + -- value is unchanged at stamp time, so a concurrent content edit + -- (e.g. repair-encoding) that lands between read and stamp leaves the + -- row unstamped and it is re-embedded with the corrected content. + last_modified DATETIME DEFAULT CURRENT_TIMESTAMP, + -- Platform-specific metadata metadata JSON, + -- Vector-embedding watermark: the index generation this message's + -- embeddings were last written for. NULL means "needs embedding" + -- (new rows default to NULL); a value equal to the active/building + -- generation id means "covered". The scan-and-fill embed worker + -- finds work via (embed_gen IS NULL OR embed_gen <> ) and + -- stamps this column after a successful upsert (or skip). 
+ embed_gen INTEGER, + UNIQUE(source_id, source_message_id) ); @@ -269,6 +286,44 @@ CREATE TABLE IF NOT EXISTS message_bodies ( body_html TEXT ); +-- ============================================================================ +-- LAST-MODIFIED TRIGGERS +-- ============================================================================ +-- messages.last_modified is bumped to CURRENT_TIMESTAMP on ANY change to a +-- message row OR any insert/update of its body row. This is a TRUE row-level +-- last-modified (blanket, not column-specific): the embed worker uses it as an +-- optimistic-CAS token, so it must move whenever any embeddable content could +-- have changed. No application write path bumps it manually — the database +-- owns it via these triggers. InitSchema re-execs schema.sql idempotently, so +-- `IF NOT EXISTS` makes these safe on both fresh and existing databases. + +-- On messages: re-stamp last_modified after any UPDATE. The WHEN guard +-- (OLD.last_modified = NEW.last_modified) prevents infinite recursion: the +-- trigger's own UPDATE changes last_modified, so on the re-fire +-- OLD.last_modified <> NEW.last_modified and WHEN evaluates false, regardless +-- of the recursive_triggers pragma. It also yields to an explicit +-- last_modified write in the original UPDATE rather than clobbering it. +CREATE TRIGGER IF NOT EXISTS trg_messages_last_modified +AFTER UPDATE ON messages FOR EACH ROW +WHEN OLD.last_modified = NEW.last_modified +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.id; +END; + +-- On message_bodies: a body change must bump the parent message's +-- last_modified so the worker's CAS token covers body edits too. 
+CREATE TRIGGER IF NOT EXISTS trg_message_bodies_last_modified_upd +AFTER UPDATE ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END; + +CREATE TRIGGER IF NOT EXISTS trg_message_bodies_last_modified_ins +AFTER INSERT ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END; + -- Original message data (for re-parsing/export) CREATE TABLE IF NOT EXISTS message_raw ( message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE, diff --git a/internal/store/schema_pg.sql b/internal/store/schema_pg.sql index 3c64927fd..0391d7810 100644 --- a/internal/store/schema_pg.sql +++ b/internal/store/schema_pg.sql @@ -132,9 +132,20 @@ CREATE TABLE IF NOT EXISTS messages ( metadata JSONB, + -- Row-level last-modified watermark, maintained ENTIRELY by the + -- database (triggers, created by EnsureTriggers), never by application + -- write paths. Used by the embed worker as an optimistic-CAS token. + -- See schema.sql for the full contract. + last_modified TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP, + -- Full-text search column search_fts TSVECTOR, + -- Vector-embedding watermark: the index generation this message's + -- embeddings were last written for. NULL means "needs embedding" + -- (new rows default to NULL). See schema.sql for the full contract. 
+ embed_gen BIGINT, + UNIQUE(source_id, source_message_id) ); diff --git a/internal/store/store.go b/internal/store/store.go index 0f92a4553..96756f681 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -652,7 +652,7 @@ func (s *Store) SchemaStale() (bool, string, error) { return false, "", fmt.Errorf("check schema version: %w", err) } if count == 0 { - return true, "conversations.conversation_type", nil + return true, "messages.embed_gen", nil } return false, "", nil } @@ -723,6 +723,25 @@ func (s *Store) InitSchema() error { } } + // Backfill last_modified for rows that predate the column. SQLite cannot + // ADD COLUMN with a non-constant default, so the legacy ADD COLUMN above + // leaves existing rows NULL; this one-shot UPDATE sets them to + // CURRENT_TIMESTAMP so the embed worker's CAS token is a comparable value + // (a NULL token would never satisfy `last_modified = ?` and the row would + // loop "needs embedding" forever). Idempotent and portable: on a fresh + // DB (or PostgreSQL, whose ADD COLUMN ... DEFAULT CURRENT_TIMESTAMP + // backfills automatically) no rows are NULL, so this is a no-op. Run + // under runMaintenance so the full-table UPDATE on a large archive is not + // cut off by the pool-wide statement_timeout (no-op reset on SQLite). + if err := s.runMaintenance(context.Background(), func(ctx context.Context, tx *loggedTx) error { + _, err := tx.ExecContext(ctx, + `UPDATE messages SET last_modified = `+s.dialect.Now()+ + ` WHERE last_modified IS NULL`) + return err + }); err != nil { + return fmt.Errorf("backfill last_modified: %w", err) + } + // Create FTS indexes that depend on columns just added by the legacy // migrations (PostgreSQL's GIN index on messages.search_fts). No-op on // SQLite. Must run after the migration loop above. [cr2-10] @@ -737,6 +756,34 @@ func (s *Store) InitSchema() error { return fmt.Errorf("ensure FTS index: %w", err) } + // Create the last_modified maintenance triggers. 
Must run after the + // migration loop above adds the last_modified column on legacy DBs. + // SQLite is a no-op here (its triggers ride schema.sql); PostgreSQL + // creates them idempotently. Run under runMaintenance for consistency + // with EnsureFTSIndex (no statement_timeout cap on the DDL). + if err := s.runMaintenance(context.Background(), func(ctx context.Context, tx *loggedTx) error { + return s.dialect.EnsureTriggers(tx) + }); err != nil { + return fmt.Errorf("ensure last_modified triggers: %w", err) + } + + // Drop the obsolete partial index over messages needing embedding. It was + // redundant with the per-generation embed watermark (the work-finder scan + // rides the messages PRIMARY KEY B-tree via `id > :watermark ORDER BY id`) + // and useless during a rebuild (old-gen leftovers carry a non-NULL embed_gen + // that an `embed_gen IS NULL` index never covers), while costing index + // maintenance on the two hottest write paths (message insert + embed_gen + // stamp). DROP IF EXISTS is idempotent and portable across SQLite/PG; it + // cleans up any dev DB that already created the index. Run under + // runMaintenance to match the original CREATE's transaction context. + if err := s.runMaintenance(context.Background(), func(ctx context.Context, tx *loggedTx) error { + _, err := tx.ExecContext(ctx, + `DROP INDEX IF EXISTS idx_messages_embed_gen`) + return err + }); err != nil { + return fmt.Errorf("drop idx_messages_embed_gen: %w", err) + } + // Load the optional FTS schema, if the dialect keeps one separate. // PostgreSQL returns "" here because its tsvector lives in the main schema. 
if ftsFile := s.dialect.SchemaFTS(); ftsFile != "" { diff --git a/internal/store/store_test.go b/internal/store/store_test.go index e2ee1327c..f82be377c 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -1524,6 +1524,93 @@ func TestStore_PersistMessage_Upsert(t *testing.T) { assert.Equal("updated body", bodyText.String, "body_text") } +func TestStore_PersistMessageClearsEmbedGenWhenEmbeddingInputsChange(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + f := storetest.New(t) + ctx := t.Context() + + msg := storetest.NewMessage(f.Source.ID, f.ConvID). + WithSourceMessageID("persist-embed-gen"). + WithSubject("Original subject"). + WithSnippet("preview"). + Build() + data := &store.MessagePersistData{ + Message: msg, + BodyText: sql.NullString{String: "original body", Valid: true}, + RawMIME: sampleRawMessage, + } + + msgID, err := f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage first call") + + const gen = int64(7) + require.NoError(f.Store.SetEmbedGen(ctx, []int64{msgID}, gen), "SetEmbedGen") + assert.Equal(sql.NullInt64{Int64: gen, Valid: true}, readEmbedGen(t, f.Store, msgID), + "precondition: message is stamped") + + msg.Snippet = sql.NullString{String: "preview changed", Valid: true} + _, err = f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage unchanged embedding inputs") + assert.Equal(sql.NullInt64{Int64: gen, Valid: true}, readEmbedGen(t, f.Store, msgID), + "non-embedding metadata must not clear embed_gen") + + data.BodyHTML = sql.NullString{String: "

rendering changed

", Valid: true} + _, err = f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage changed HTML with unchanged plaintext") + assert.Equal(sql.NullInt64{Int64: gen, Valid: true}, readEmbedGen(t, f.Store, msgID), + "HTML-only changes must not clear embed_gen while plaintext is the embedded body") + + data.BodyText = sql.NullString{String: "updated body", Valid: true} + _, err = f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage changed body") + assert.False(readEmbedGen(t, f.Store, msgID).Valid, "body change must clear embed_gen") + + require.NoError(f.Store.SetEmbedGen(ctx, []int64{msgID}, gen), "SetEmbedGen after body change") + msg.Subject = sql.NullString{String: "Updated subject", Valid: true} + _, err = f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage changed subject") + assert.False(readEmbedGen(t, f.Store, msgID).Valid, "subject change must clear embed_gen") +} + +func TestStore_PersistMessagePreservesEmbedGenForEquivalentHTMLFallback(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + f := storetest.New(t) + ctx := t.Context() + + msg := storetest.NewMessage(f.Source.ID, f.ConvID). + WithSourceMessageID("persist-html-only-embed-gen"). + WithSubject("HTML only"). + Build() + data := &store.MessagePersistData{ + Message: msg, + BodyHTML: sql.NullString{String: "

Rendered body

", Valid: true}, + RawMIME: sampleRawMessage, + } + + msgID, err := f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage first call") + + const gen = int64(7) + require.NoError(f.Store.SetEmbedGen(ctx, []int64{msgID}, gen), "SetEmbedGen") + + data.BodyHTML = sql.NullString{String: "
Rendered body
", Valid: true} + _, err = f.Store.PersistMessage(data) + require.NoError(err, "PersistMessage equivalent HTML fallback") + assert.Equal(sql.NullInt64{Int64: gen, Valid: true}, readEmbedGen(t, f.Store, msgID), + "markup-only HTML fallback changes must not clear embed_gen") +} + +func readEmbedGen(t *testing.T, st *store.Store, msgID int64) sql.NullInt64 { + t.Helper() + var got sql.NullInt64 + err := st.DB().QueryRowContext(t.Context(), + st.Rebind(`SELECT embed_gen FROM messages WHERE id = ?`), msgID).Scan(&got) + requirepkg.NoError(t, err, "read embed_gen") + return got +} + // --- GetStatsForScope tests --- // makeSecondSource creates a second source and conversation in the same store as f. diff --git a/internal/sync/embed_enqueue_test.go b/internal/sync/embed_enqueue_test.go deleted file mode 100644 index 2c2f7af80..000000000 --- a/internal/sync/embed_enqueue_test.go +++ /dev/null @@ -1,135 +0,0 @@ -package sync - -import ( - "context" - "errors" - "testing" - - assertpkg "github.com/stretchr/testify/assert" - requirepkg "github.com/stretchr/testify/require" - "go.kenn.io/msgvault/internal/gmail" - "go.kenn.io/msgvault/internal/testutil" -) - -// failingEnqueuer always fails EnqueueMessages, recording how many times -// it was invoked. It simulates a broken vector-search queue (e.g. a -// pending_embeddings INSERT failure on PostgreSQL, or a missing -// vectors.db on SQLite). -type failingEnqueuer struct { - calls int -} - -func (f *failingEnqueuer) EnqueueMessages(_ context.Context, _ []int64) error { - f.calls++ - return errors.New("simulated enqueue failure") -} - -// newEnqueueTestEnv builds a sync test environment backed by the store -// selected via MSGVAULT_TEST_DB (SQLite by default, PostgreSQL under -// `make test-pg`). Unlike newTestEnv it does NOT hard-code SQLite, so the -// same test exercises the enqueue paths on both backends. 
-func newEnqueueTestEnv(t *testing.T, enq EmbedEnqueuer) *TestEnv { - t.Helper() - - st := testutil.NewTestStore(t) - - mock := gmail.NewMockAPI() - mock.Profile = &gmail.Profile{ - EmailAddress: testEmail, - MessagesTotal: 0, - HistoryID: 1000, - } - - syncer := New(mock, st, nil) - syncer.SetEmbedEnqueuer(enq) - - return &TestEnv{ - Store: st, - Mock: mock, - Syncer: syncer, - TmpDir: t.TempDir(), - Context: context.Background(), - } -} - -// TestFullSync_EnqueueFailureIsNonFatal verifies that when the vector -// enqueue fails during a full sync, the sync still succeeds and the -// message rows stay persisted — on EVERY backend. This is the SQLite -// parity behavior: enqueue failures are warn-and-continue, never a hard -// error (missed IDs are recovered by `msgvault embed --full-rebuild`). -// -// Has teeth: with the previous PostgreSQL hard-bail -// (`if s.store.IsPostgreSQL() { return ..., fmt.Errorf("vector enqueue -// failed (PG)") }`), this test fails on the PostgreSQL backend -// (`make test-pg`) because Full() would return an error instead of -// succeeding. -func TestFullSync_EnqueueFailureIsNonFatal(t *testing.T) { - enq := &failingEnqueuer{} - env := newEnqueueTestEnv(t, enq) - - env.Mock.Profile.MessagesTotal = 2 - env.Mock.Profile.HistoryID = 12345 - env.Mock.AddMessage("msg1", testMIME(), []string{"INBOX"}) - env.Mock.AddMessage("msg2", testMIME(), []string{"INBOX"}) - - summary, err := env.Syncer.Full(env.Context, testEmail) - requirepkg.NoError(t, err, "full sync must succeed despite enqueue failure") - assertSummary(t, summary, WantSummary{Added: new(int64(2)), Errors: new(int64(0))}) - - // The enqueuer was actually exercised (the failure path was hit). - assertpkg.Positive(t, enq.calls, "enqueuer should have been invoked") - - // Messages are persisted even though the enqueue failed. 
- assertMessageCount(t, env.Store, 2) -} - -// TestIncrementalSync_EnqueueFailureIsNonFatal verifies the same parity -// behavior for the incremental-sync batch enqueue site. -func TestIncrementalSync_EnqueueFailureIsNonFatal(t *testing.T) { - enq := &failingEnqueuer{} - env := newEnqueueTestEnv(t, enq) - source := env.CreateSourceWithHistory(t, "12340") - - env.Mock.Profile.MessagesTotal = 2 - env.Mock.AddMessage("new-msg-1", testMIME(), []string{"INBOX"}) - env.Mock.AddMessage("new-msg-2", testMIME(), []string{"INBOX"}) - env.SetHistory(12350, - historyAdded("new-msg-1"), - historyAdded("new-msg-2"), - ) - - summary, err := env.Syncer.Incremental(env.Context, source) - requirepkg.NoError(t, err, "incremental sync must succeed despite enqueue failure") - assertSummary(t, summary, WantSummary{Added: new(int64(2))}) - - assertpkg.Positive(t, enq.calls, "enqueuer should have been invoked") - assertMessageCount(t, env.Store, 2) -} - -// TestIncrementalSync_PerMessageEnqueueFailureIsNonFatal verifies the -// parity behavior for the per-message enqueue site in handleLabelChange -// (a label added to a message that does not yet exist locally, so it is -// fetched and ingested inline). 
-func TestIncrementalSync_PerMessageEnqueueFailureIsNonFatal(t *testing.T) { - require := requirepkg.New(t) - enq := &failingEnqueuer{} - env := newEnqueueTestEnv(t, enq) - source := env.CreateSourceWithHistory(t, "12340") - _, err := env.Store.EnsureLabel(source.ID, "INBOX", "Inbox", "system") - require.NoError(err, "EnsureLabel INBOX") - _, err = env.Store.EnsureLabel(source.ID, "STARRED", "Starred", "system") - require.NoError(err, "EnsureLabel STARRED") - - env.Mock.Profile.MessagesTotal = 1 - env.Mock.AddMessage("new-msg", testMIME(), []string{"INBOX", "STARRED"}) - env.SetHistory(12350, historyLabelAdded("new-msg", "STARRED")) - - _, err = env.Syncer.Incremental(env.Context, source) - require.NoError(err, "incremental sync must succeed despite per-message enqueue failure") - - assertpkg.Positive(t, enq.calls, "enqueuer should have been invoked") - assertMessageCount(t, env.Store, 1) -} - -// compile-time check that failingEnqueuer satisfies the interface. -var _ EmbedEnqueuer = (*failingEnqueuer)(nil) diff --git a/internal/sync/incremental.go b/internal/sync/incremental.go index 3485c3f5a..18d2699a5 100644 --- a/internal/sync/incremental.go +++ b/internal/sync/incremental.go @@ -162,7 +162,6 @@ func (s *Syncer) Incremental(ctx context.Context, source *store.Source) (summary } checkpoint.ErrorsCount += int64(len(newMsgIDs)) } else { - var insertedIDs []int64 for i, fetch := range rawMessages { raw := fetch.Message if raw == nil { @@ -178,31 +177,19 @@ func (s *Syncer) Incremental(ctx context.Context, source *store.Source) (summary continue } threadID := newMsgThreads[newMsgIDs[i]] - insertedID, err := s.ingestMessage(source.ID, raw, threadID, labelMap) - if err != nil { + if err := s.ingestMessage(source.ID, raw, threadID, labelMap); err != nil { s.logger.Warn("failed to ingest added message", "id", newMsgIDs[i], "error", err) s.recordSyncItem(syncID, newMsgIDs[i], syncItemPhaseIngest, store.SyncRunItemStatusError, syncItemKindIngestError, err) 
checkpoint.ErrorsCount++ continue } - if insertedID > 0 { - insertedIDs = append(insertedIDs, insertedID) - } checkpoint.MessagesAdded++ summary.BytesDownloaded += int64(len(raw.Raw)) } - // Hook vector-search enqueue. A failed enqueue is - // non-fatal on both backends: the message rows are - // already persisted, and any missed IDs are recovered by - // a full vector rebuild (`msgvault embed --full-rebuild`), - // which re-seeds every live message (pgvector and - // sqlitevec both provide this path). - if s.embedEnqueuer != nil && len(insertedIDs) > 0 { - if err := s.embedEnqueuer.EnqueueMessages(ctx, insertedIDs); err != nil { - s.logger.Warn("vector enqueue failed", "ids", len(insertedIDs), "error", err) - } - } + // Newly-persisted messages get embed_gen = NULL by column + // default, so the scan-and-fill embed worker picks them up + // automatically — no sync-time enqueue step is needed. } } @@ -314,23 +301,14 @@ func (s *Syncer) handleLabelChange(ctx context.Context, syncID, sourceID int64, checkpoint.ErrorsCount++ return false, err } - insertedID, err := s.ingestMessage(sourceID, raw, threadID, labelMap) - if err != nil { + if err := s.ingestMessage(sourceID, raw, threadID, labelMap); err != nil { s.recordSyncItem(syncID, messageID, syncItemPhaseIngest, store.SyncRunItemStatusError, syncItemKindIngestError, err) checkpoint.ErrorsCount++ return false, err } - // Hook vector-search enqueue for the new message. A failed - // enqueue is non-fatal on both backends: the message row is - // already persisted, and any missed ID is recovered by a - // full vector rebuild (`msgvault embed --full-rebuild`), - // which re-seeds every live message (pgvector and sqlitevec - // both provide this path). 
- if s.embedEnqueuer != nil && insertedID > 0 { - if err := s.embedEnqueuer.EnqueueMessages(ctx, []int64{insertedID}); err != nil { - s.logger.Warn("vector enqueue failed", "ids", 1, "error", err) - } - } + // The new message gets embed_gen = NULL by column default, so + // the scan-and-fill embed worker picks it up automatically — no + // sync-time enqueue step is needed. checkpoint.MessagesAdded++ if raw != nil { summary.BytesDownloaded += int64(len(raw.Raw)) diff --git a/internal/sync/sync.go b/internal/sync/sync.go index b7b7d6823..c670c9fa7 100644 --- a/internal/sync/sync.go +++ b/internal/sync/sync.go @@ -22,12 +22,6 @@ import ( // ErrHistoryExpired indicates that the Gmail history ID is too old and a full sync is required. var ErrHistoryExpired = errors.New("history expired - run full sync") -// EmbedEnqueuer is optionally supplied to a Syncer; nil means vector -// search is disabled. Set via SetEmbedEnqueuer. -type EmbedEnqueuer interface { - EnqueueMessages(ctx context.Context, messageIDs []int64) error -} - // Options configures sync behavior. type Options struct { // SourceType is the type of source being synced ("gmail" or "imap"). @@ -63,12 +57,11 @@ func DefaultOptions() *Options { // Syncer performs Gmail synchronization. type Syncer struct { - client gmail.API - store *store.Store - logger *slog.Logger - progress gmail.SyncProgress - opts *Options - embedEnqueuer EmbedEnqueuer + client gmail.API + store *store.Store + logger *slog.Logger + progress gmail.SyncProgress + opts *Options } // New creates a new Syncer. @@ -98,12 +91,6 @@ func (s *Syncer) WithProgress(p gmail.SyncProgress) *Syncer { return s } -// SetEmbedEnqueuer wires up the optional vector-search enqueuer. Safe -// to call with nil to disable. -func (s *Syncer) SetEmbedEnqueuer(e EmbedEnqueuer) { - s.embedEnqueuer = e -} - // syncState holds the state for a sync operation. 
type syncState struct { syncID int64 @@ -202,7 +189,6 @@ func (s *Syncer) processBatch(ctx context.Context, syncID, sourceID int64, listR return nil, fmt.Errorf("fetch messages: %w", err) } - var insertedIDs []int64 for i, fetch := range rawMessages { raw := fetch.Message if raw == nil { @@ -240,7 +226,7 @@ func (s *Syncer) processBatch(ctx context.Context, syncID, sourceID int64, listR } threadID := threadIDs[newIDs[i]] - insertedID, err := s.ingestMessage(sourceID, raw, threadID, labelMap) + err := s.ingestMessage(sourceID, raw, threadID, labelMap) if err != nil { if errors.Is(err, errDuplicateRFC822) { result.skipped++ @@ -252,25 +238,13 @@ func (s *Syncer) processBatch(ctx context.Context, syncID, sourceID int64, listR continue } - if insertedID > 0 { - insertedIDs = append(insertedIDs, insertedID) - } result.added++ summary.BytesDownloaded += int64(len(raw.Raw)) } - // Hook vector-search enqueue after the batch-insert point. - // A failed enqueue is non-fatal on both backends: the message - // rows are already persisted, and any IDs missed by a failed - // enqueue are recovered by a full vector rebuild - // (`msgvault embed --full-rebuild`), which re-seeds every live - // message (both pgvector and sqlitevec provide this path). So we - // warn and continue rather than abort the sync. - if s.embedEnqueuer != nil && len(insertedIDs) > 0 { - if err := s.embedEnqueuer.EnqueueMessages(ctx, insertedIDs); err != nil { - s.logger.Warn("vector enqueue failed", "ids", len(insertedIDs), "error", err) - } - } + // Newly-persisted messages get embed_gen = NULL by column default, + // so the scan-and-fill embed worker picks them up automatically on + // its next run — no sync-time enqueue step is needed. } return result, nil @@ -622,8 +596,10 @@ func (s *Syncer) parseToModel(sourceID int64, raw *gmail.RawMessage, threadID st }, nil } -// persistMessage stores a parsed message and all related data. Returns -// the internal message ID for hooks (e.g. vector-search enqueue). 
+// persistMessage stores a parsed message and all related data, returning +// the internal message ID to callers. No vector-search enqueue happens +// here: persisted rows leave embed_gen NULL (column default) and the +// scan-and-fill worker discovers them later. func (s *Syncer) persistMessage(data *messageData, labelMap map[string]int64) (int64, error) { // Map Gmail label IDs to internal IDs var labelIDs []int64 @@ -712,13 +688,12 @@ func (s *Syncer) persistMessage(data *messageData, labelMap map[string]int64) (i // composite IDs change when messages move between mailboxes. var errDuplicateRFC822 = errors.New("duplicate RFC822 Message-ID") -// ingestMessage parses and stores a single message, returning the -// internal message ID on success. Returns (0, errDuplicateRFC822) for -// IMAP deduplication skips. -func (s *Syncer) ingestMessage(sourceID int64, raw *gmail.RawMessage, threadID string, labelMap map[string]int64) (int64, error) { +// ingestMessage parses and stores a single message. Returns +// errDuplicateRFC822 for IMAP deduplication skips. 
+func (s *Syncer) ingestMessage(sourceID int64, raw *gmail.RawMessage, threadID string, labelMap map[string]int64) error { data, err := s.parseToModel(sourceID, raw, threadID) if err != nil { - return 0, err + return err } // For IMAP sources, check if a message with the same RFC822 @@ -733,7 +708,7 @@ func (s *Syncer) ingestMessage(sourceID int64, raw *gmail.RawMessage, threadID s existingID, err := s.store.GetMessageIDByRFC822ID( sourceID, data.message.RFC822MessageID.String) if err != nil { - return 0, fmt.Errorf("check rfc822 dedup: %w", err) + return fmt.Errorf("check rfc822 dedup: %w", err) } if existingID > 0 { var labelIDs []int64 @@ -747,13 +722,14 @@ func (s *Syncer) ingestMessage(sourceID int64, raw *gmail.RawMessage, threadID s data.message.SourceMessageID, labelIDs, ); err != nil { - return 0, fmt.Errorf("update dedup message: %w", err) + return fmt.Errorf("update dedup message: %w", err) } - return 0, errDuplicateRFC822 + return errDuplicateRFC822 } } - return s.persistMessage(data, labelMap) + _, err = s.persistMessage(data, labelMap) + return err } // ensureAddressUTF8 validates and converts address names to valid UTF-8 in place. diff --git a/internal/testutil/store_helpers.go b/internal/testutil/store_helpers.go index f8a173b32..e5ebbf8ec 100644 --- a/internal/testutil/store_helpers.go +++ b/internal/testutil/store_helpers.go @@ -31,6 +31,21 @@ func NewTestStore(t *testing.T) *store.Store { return newPostgresTestStore(t, testDB) } + return NewSQLiteTestStore(t) +} + +// NewSQLiteTestStore creates a temporary SQLite store, ALWAYS, ignoring +// MSGVAULT_TEST_DB. Use it for tests that are intrinsically tied to a SQLite +// main DB regardless of the configured backend — e.g. the sqlitevec vectors +// backend, whose Open-time probes (mainTableExists' sqlite_master lookup, +// resetOrphanedEmbedGen/BackfillEmbedGenForUpgrade) run SQLite-dialect SQL +// against the main handle. 
In production sqlitevec is only ever paired with a +// SQLite main store (the backend factory picks pgvector when the store is +// PostgreSQL), so such a test must not adopt a PostgreSQL main store just +// because MSGVAULT_TEST_DB is set. +func NewSQLiteTestStore(t *testing.T) *store.Store { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") st, err := store.OpenForTest(dbPath) require.NoError(t, err, "open store") diff --git a/internal/vector/backend.go b/internal/vector/backend.go index 637fcd108..3fe14affb 100644 --- a/internal/vector/backend.go +++ b/internal/vector/backend.go @@ -123,8 +123,13 @@ type Hit struct { // Stats reports the size of one generation (or 0 for totals). type Stats struct { EmbeddingCount int64 - PendingCount int64 - StorageBytes int64 + // PendingCount, under the scan-and-fill design, is the number of live + // messages still needing embedding for this generation (embed_gen <> + // gen), computed from the main DB rather than a queue table. It is 0 + // for the aggregate (gen == 0) path. The name is retained for API + // stability; semantically it is now a "missing" count. + PendingCount int64 + StorageBytes int64 } // Backend is the minimum contract a vector store must implement. @@ -139,19 +144,22 @@ type Backend interface { // ActivateGeneration atomically retires the current active generation // (if any, deleting its embeddings on backends that share an index - // graph) and promotes gen to active. The promotion enforces, inside the - // same transaction as the state flip, that gen is in state='building' - // and — unless force is true — that gen has finished seeding - // (seeded_at IS NOT NULL) and has zero pending embedding rows. force - // bypasses the seeded/pending gate (operator `--force`); the gate stays - // atomic so a concurrent enqueue cannot slip a pending row in between a - // caller's pre-check and the flip. 
On a gate failure the backend returns - // a precise error distinguishing pending vs unseeded vs not-building. + // graph) and promotes gen to active. The promotion enforces that gen is + // in state='building' and — unless force is true — that gen has full + // coverage (no live message still needs embedding for it, i.e. + // missing==0). On PG the coverage gate is folded into the same + // transaction as the state flip; on SQLite (cross-DB) it is a Go + // pre-check before the flip, with the full-scan backstop covering the + // TOCTOU window. force bypasses the coverage gate (operator `--force`). + // On a gate failure the backend returns a precise error distinguishing + // missing-coverage vs not-building. ActivateGeneration(ctx context.Context, gen GenerationID, force bool) error - // RetireGeneration marks gen as retired, deleting its embeddings on - // backends that share an index graph (pgvector) and reaping its pending - // queue rows. Unless force is true, the state-flip UPDATE refuses to + // RetireGeneration marks gen as retired (a state flip on its + // index_generations row), and on backends that share an index graph + // (pgvector) also deletes the generation's embeddings so the shared HNSW + // graph stays generation-clean. (There is no pending queue to reap under + // scan-and-fill.) Unless force is true, the state-flip UPDATE refuses to // retire a generation in state='active', returning ErrRefuseRetireActive // WITHOUT deleting anything; the guard is enforced atomically inside the // retire transaction so a concurrent activation between a caller's @@ -170,17 +178,19 @@ type Backend interface { Delete(ctx context.Context, gen GenerationID, messageIDs []int64) error Stats(ctx context.Context, gen GenerationID) (Stats, error) - // EnsureSeeded guarantees that the building generation gen has had - // its initial pending_embeddings seed pass committed. 
If a prior - // CreateGeneration crashed between inserting the building row and - // committing the seed, the queue would be empty and a naive resume - // could "drain" zero rows and activate an unseeded index. - // EnsureSeeded re-runs the seed (idempotent — INSERT OR IGNORE) and - // stamps seeded_at when it commits. Call this on the resume path - // before draining the queue. Returns ErrUnknownGeneration if gen no - // longer exists in the index, and an error if gen is not in the - // `building` state. - EnsureSeeded(ctx context.Context, gen GenerationID) error + // EmbeddedMessageCount reports how many distinct LIVE, stamped + // (embed_gen == gen) messages actually have at least one embedding row + // for gen. This is the "embedded" leg of the coverage readout (live / + // embedded / blank / missing). It lives on the backend because the + // embeddings table is in vectors.db on SQLite (and the main DB on PG); + // only the backend holds that handle. The live+stamped intersection is + // REQUIRED for the coverage invariant to hold: SQLite intersects the + // vectors.db embedding ids against a live+stamped query on the main DB + // (cross-DB json_each, mirroring dropDeletedFromSource), while PostgreSQL + // uses a single JOIN to messages. Distinct from Stats.EmbeddingCount only + // in intent: this is the dedicated coverage helper and never folds the + // aggregate (gen == 0) path. + EmbeddedMessageCount(ctx context.Context, gen GenerationID) (int64, error) // LoadVector returns the embedding for a specific message in the // active generation. Returns ErrNoActiveGeneration if none exists, or @@ -188,6 +198,20 @@ type Backend interface { // generation. 
LoadVector(ctx context.Context, messageID int64) ([]float32, error) + // ResetWatermarkBelow lowers the scan-and-fill forward-scan watermark + // for EVERY generation to at most minID-1 (clamped at 0), so the next + // incremental RunOnce re-scans from below minID and re-finds rows whose + // embed_gen was just reset to NULL (repair-encoding). Without it, a + // repaired message whose id sits BELOW the current watermark would never + // be re-found by an incremental scan (the scan applies `id > watermark`) + // and would only be recovered by a full-scan backstop. Lowering is a MIN + // against the stored watermark, so it never pushes the cursor FORWARD past + // unswept work; a row already at/below the watermark is left untouched. + // minID < 1 is a no-op. The watermark lives in vectors.db on SQLite and + // the main DB on PostgreSQL, so each backend implements it against its + // own handle/dialect. Idempotent. + ResetWatermarkBelow(ctx context.Context, minID int64) error + Close() error } diff --git a/internal/vector/config.go b/internal/vector/config.go index e8f8cfcde..5dd3ebd85 100644 --- a/internal/vector/config.go +++ b/internal/vector/config.go @@ -208,6 +208,12 @@ func (s SearchConfig) MaxPageSizeHybridClamp() int { // embedding endpoint itself (e.g. scheduling). type EmbedConfig struct { Schedule EmbedScheduleConfig `toml:"schedule"` + // BackstopInterval is how often the daemon embed job also runs a full + // watermark-ignoring backstop pass (recovering below-watermark + // stragglers from repair-encoding resets, transient errors, or crashes) + // in addition to the per-tick incremental scan. Zero uses the EmbedJob + // default (24h); a negative value disables the auto-backstop. 
+ BackstopInterval time.Duration `toml:"backstop_interval"` } // EmbedScheduleConfig controls when the embed worker runs on its own diff --git a/internal/vector/embed/backfill_test.go b/internal/vector/embed/backfill_test.go new file mode 100644 index 000000000..09e9539cc --- /dev/null +++ b/internal/vector/embed/backfill_test.go @@ -0,0 +1,196 @@ +//go:build sqlite_vec + +package embed + +import ( + "context" + "database/sql" + "fmt" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// backfillFixture stands up a real sqlitevec backend over a main DB that +// includes an applied_migrations ledger (which newWorkerFixture omits), so +// the one-time embed_gen upgrade backfill can be exercised +// end-to-end against the worker. +type backfillFixture struct { + MainDB *sql.DB + VectorsDB *sql.DB + Backend *sqlitevec.Backend + Store WorkStore + Client *fakeEmbeddingClient +} + +// newBackfillFixture creates n messages (id 1..n) with NULL embed_gen plus +// the applied_migrations ledger and message_bodies, and opens a backend. 
+func newBackfillFixture(t *testing.T, n int) *backfillFixture { + t.Helper() + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "main.db") + require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") + mainDB, err := sql.Open(sqlitevec.DriverName(), mainPath) + require.NoError(t, err, "open main") + t.Cleanup(func() { _ = mainDB.Close() }) + + schema := testMainSchema + ` +CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at DATETIME DEFAULT CURRENT_TIMESTAMP +);` + _, err = mainDB.Exec(schema) + require.NoError(t, err, "schema") + for i := 1; i <= n; i++ { + _, err := mainDB.Exec( + `INSERT INTO messages (id, subject) VALUES (?, ?)`, i, fmt.Sprintf("msg %d", i)) + require.NoError(t, err, "insert message") + _, err = mainDB.Exec( + `INSERT INTO message_bodies (message_id, body_text) VALUES (?, ?)`, i, fmt.Sprintf("body %d", i)) + require.NoError(t, err, "insert body") + } + + vecPath := filepath.Join(dir, "vectors.db") + b, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: vecPath, + MainPath: mainPath, + Dimension: 4, + MainDB: mainDB, + }) + require.NoError(t, err, "sqlitevec.Open") + t.Cleanup(func() { _ = b.Close() }) + + vecDB, err := sql.Open(sqlitevec.DriverName(), vecPath) + require.NoError(t, err, "open vectors.db handle") + t.Cleanup(func() { _ = vecDB.Close() }) + + return &backfillFixture{ + MainDB: mainDB, + VectorsDB: vecDB, + Backend: b, + Store: &testWorkStore{db: mainDB}, + Client: &fakeEmbeddingClient{dim: 4}, + } +} + +func embedGenOf(t *testing.T, db *sql.DB, id int64) (val int64, isNull bool) { + t.Helper() + var v sql.NullInt64 + require.NoError(t, db.QueryRow(`SELECT embed_gen FROM messages WHERE id = ?`, id).Scan(&v)) + return v.Int64, !v.Valid +} + +// TestBackfillEmbedGen_UpgradeStampsEmbeddedOnly simulates an upgrade from +// a pre-embed_gen build: an active generation already has embeddings for +// some messages, but embed_gen is NULL everywhere (the ADD COLUMN did no 
+// backfill). The one-time backfill must stamp embed_gen=active for the +// already-embedded messages and leave the un-embedded one NULL; coverage +// then becomes honest; re-running the backfill is a ledger-guarded no-op; +// and a worker RunOnce re-embeds ONLY the un-embedded straggler. +func TestBackfillEmbedGen_UpgradeStampsEmbeddedOnly(t *testing.T) { + ctx := context.Background() + // 3 messages: 1 and 2 will be embedded under the active gen; 3 will not. + f := newBackfillFixture(t, 3) + + gen, err := f.Backend.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + // Embed messages 1 and 2 under the generation (upsert vectors). This is + // the "already embedded before upgrade" state. + chunks := []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + } + require.NoError(t, f.Backend.Upsert(ctx, gen, chunks), "Upsert") + + // Stamp + activate so there is an ACTIVE generation, then simulate the + // upgrade by resetting embed_gen to NULL on every message (as if the + // embed_gen column had just been added with no backfill). + require.NoError(t, f.Store.SetEmbedGen(ctx, []int64{1, 2, 3}, int64(gen)), "stamp") + require.NoError(t, f.Backend.ActivateGeneration(ctx, gen, true), "activate (force)") + _, err = f.MainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen to NULL (simulate upgrade)") + + // Sanity: coverage now (wrongly) reports all 3 as missing. + require.Equal(t, 3, countMissing(t, f.MainDB, int64(gen)), "pre-backfill: all missing") + + // newBackfillFixture's Open already ran (and marked) the backfill when + // no generation existed. Clear the ledger row so the manual call below + // reproduces the real upgrade timing: the first Open where an active + // generation + pre-existing embeddings are present. 
+ _, err = f.MainDB.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = ?`, "embed_gen_backfill_active_v1") + require.NoError(t, err, "reset ledger") + + // Run the one-time backfill. + require.NoError(t, f.Backend.BackfillEmbedGenForUpgrade(ctx), "backfill") + + // Messages 1 and 2 (already embedded) are stamped; 3 stays NULL. + for _, id := range []int64{1, 2} { + v, isNull := embedGenOf(t, f.MainDB, id) + assert.Falsef(t, isNull, "msg %d should be stamped", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen", id) + } + v3, isNull3 := embedGenOf(t, f.MainDB, 3) + assert.True(t, isNull3, "msg 3 (un-embedded) stays NULL") + _ = v3 + + // Coverage is now honest: only message 3 is missing. + assert.Equal(t, 1, countMissing(t, f.MainDB, int64(gen)), "post-backfill: only msg 3 missing") + + // Re-running the backfill is a ledger-guarded no-op: it must NOT re-stamp + // message 3 (which is legitimately unembedded). + require.NoError(t, f.Backend.BackfillEmbedGenForUpgrade(ctx), "backfill again (no-op)") + _, isNull3Again := embedGenOf(t, f.MainDB, 3) + assert.True(t, isNull3Again, "msg 3 still NULL after second backfill (ledger no-op)") + + // A worker RunOnce against the active generation must re-embed ONLY the + // straggler (message 3), not the already-stamped 1 and 2. + w := NewWorker(WorkerDeps{ + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.Client, + BatchSize: 8, + }) + res, err := w.RunOnce(ctx, gen) + require.NoError(t, err, "RunOnce") + assert.Equal(t, 1, res.Succeeded, "worker re-embeds only the un-stamped straggler") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(gen)), "coverage complete after straggler embedded") +} + +// TestBackfillEmbedGen_NoActiveGenerationMarksLedger verifies the backfill +// no-ops cleanly (and marks the ledger) when there is no active generation: +// nothing to stamp, but the migration is recorded so it never re-runs. 
+func TestBackfillEmbedGen_NoActiveGenerationMarksLedger(t *testing.T) { + ctx := context.Background() + f := newBackfillFixture(t, 2) + + // newBackfillFixture's Open already marked the ledger (no gen at open + // time); clear it so this call is the one that marks it. + _, err := f.MainDB.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = ?`, "embed_gen_backfill_active_v1") + require.NoError(t, err, "reset ledger") + + require.NoError(t, f.Backend.BackfillEmbedGenForUpgrade(ctx), "backfill (no active gen)") + + var n int + require.NoError(t, f.MainDB.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = ?`, + "embed_gen_backfill_active_v1").Scan(&n)) + assert.Equal(t, 1, n, "ledger marked even with no active generation") + + // Both messages remain NULL (no embeddings to stamp from). + for _, id := range []int64{1, 2} { + _, isNull := embedGenOf(t, f.MainDB, id) + assert.Truef(t, isNull, "msg %d stays NULL", id) + } +} diff --git a/internal/vector/embed/enqueue.go b/internal/vector/embed/enqueue.go deleted file mode 100644 index e5e76cd04..000000000 --- a/internal/vector/embed/enqueue.go +++ /dev/null @@ -1,232 +0,0 @@ -package embed - -import ( - "context" - "database/sql" - "errors" - "fmt" - "strings" - "time" - - "go.kenn.io/msgvault/internal/sync" - "go.kenn.io/msgvault/internal/vector" -) - -// afterGenSnapshotHook is a test-only synchronization seam. When non-nil -// it is invoked once inside EnqueueMessages' transaction AFTER the -// non-retired generation snapshot is read but BEFORE any per-generation -// re-validation or pending insert runs. It lets the concurrency -// regression test commit a RetireGeneration at exactly the window the -// orphan-pending race opens (snapshot read → retire commits → enqueue -// inserts), proving the locked re-validation excludes the now-retired -// generation. It is always nil in production. Mirrors the afterChunkHook -// seam in queue.go. 
-var afterGenSnapshotHook func() - -// enqueueChunkRows caps how many (gen, message) tuples go into a single -// INSERT statement. Each row binds 3 placeholders (generation_id, -// message_id, enqueued_at), so 500 rows = 1,500 bound parameters. The -// compiled SQLite driver (mattn/go-sqlite3) allows up to 32,766 bound -// variables per statement, so 1,500 is comfortably within budget; the -// value is also small enough to avoid an oversized prepared statement on -// PostgreSQL. (For reference, the store package caps multi-row inserts at -// 900 params to stay under SQLite's historical 999 limit — see -// insertInChunks.) -// -// The Enqueuer can be handed up to ~5,000 IDs by sync, fanned out across -// up to two non-retired generations; without chunking that would be -// 3×5,000 = 15,000 placeholders per statement, which bloats the prepared -// statement. 500 keeps every statement comfortably small on both SQLite -// and PostgreSQL while still amortizing the per-statement overhead (a -// 5,000-ID batch becomes 10 statements, not 5,000 single-row inserts). -const enqueueChunkRows = 500 - -// Compile-time assertion that *Enqueuer satisfies the sync.EmbedEnqueuer -// interface expected by internal/sync.Syncer. -var _ sync.EmbedEnqueuer = (*Enqueuer)(nil) - -// Enqueuer inserts message IDs into pending_embeddings for every -// non-retired generation. Implements the EmbedEnqueuer interface -// expected by internal/sync. -// -// Dual-enqueue is intentional: when a rebuild is in progress there are -// two non-retired generations (active + building); every newly-synced -// message gets queued into both so the building index stays current. -type Enqueuer struct { - db *sql.DB - // rebind translates ?-placeholders to the driver's native form; nil - // is normalized to identity (SQLite). insertOrIgnore rewrites a - // complete "INSERT OR IGNORE INTO ..." 
statement into the dialect's - // conflict-ignoring form (SQLite: identity; PostgreSQL: strips - // "OR IGNORE" and appends "ON CONFLICT DO NOTHING"); nil is - // normalized to identity. Both are applied in the same order the - // store package uses: insertOrIgnore first (it operates on the - // ?-placeholder SQLite form), then rebind. - rebind func(string) string - insertOrIgnore func(string) string - // isPG is true when the underlying driver is PostgreSQL. When set, - // EnqueueMessages re-validates each generation under a row lock - // (SELECT ... FOR NO KEY UPDATE) that conflicts with the implicit - // no-key tuple lock RetireGeneration/ActivateGeneration's - // state-flip UPDATE takes, so a generation retired concurrently with - // an enqueue cannot end up with an orphan pending row. SQLite does not - // support the FOR NO KEY UPDATE syntax (and does not need it — its - // file-level write serialization plus busy_timeout already serialize - // the enqueue against the retire), so the clause is omitted there. - isPG bool -} - -// NewEnqueuer returns an Enqueuer backed by the embeddings database -// (vectors.db on SQLite, the shared main DB on PostgreSQL). rebind and -// insertOrIgnore make the Enqueuer dialect-portable without importing -// internal/store, mirroring NewQueue's decoupled func style: pass nil -// for both on SQLite (identity), or the dialect's Rebind and -// InsertOrIgnore for pgx. -// -// Like NewQueue, the Enqueuer detects PostgreSQL by probing rebind: if -// rebind("?") == "$1" the driver is pgx and the per-generation -// re-validation SELECT acquires a FOR NO KEY UPDATE row lock. 
-func NewEnqueuer(db *sql.DB, rebind, insertOrIgnore func(string) string) *Enqueuer { - if rebind == nil { - rebind = func(q string) string { return q } - } - if insertOrIgnore == nil { - insertOrIgnore = func(q string) string { return q } - } - return &Enqueuer{ - db: db, - rebind: rebind, - insertOrIgnore: insertOrIgnore, - isPG: rebind("?") == "$1", - } -} - -// EnqueueMessages adds the given IDs to pending_embeddings for every -// generation not in state 'retired'. Duplicate IDs are silently ignored -// via INSERT OR IGNORE. Caller must only pass non-deleted message IDs — -// the deletion predicate is not checked here. -func (e *Enqueuer) EnqueueMessages(ctx context.Context, messageIDs []int64) error { - if len(messageIDs) == 0 { - return nil - } - tx, err := e.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin enqueue tx: %w", err) - } - defer func() { _ = tx.Rollback() }() - - gens, err := func() ([]int64, error) { - rows, err := tx.QueryContext(ctx, - e.rebind(`SELECT id FROM index_generations WHERE state != ?`), - string(vector.GenerationRetired)) - if err != nil { - return nil, fmt.Errorf("select non-retired generations: %w", err) - } - defer func() { _ = rows.Close() }() - var out []int64 - for rows.Next() { - var id int64 - if err := rows.Scan(&id); err != nil { - return nil, fmt.Errorf("scan generation id: %w", err) - } - out = append(out, id) - } - if err := rows.Err(); err != nil { - return nil, fmt.Errorf("iterate generations: %w", err) - } - return out, nil - }() - if err != nil { - return err - } - if len(gens) == 0 { - return tx.Commit() - } - - // Test-only synchronization seam (nil in production): fires after the - // non-retired snapshot is read but before the locked re-validation + - // inserts below, so the concurrency regression test can commit a - // RetireGeneration inside the exact window the orphan-pending race opens. 
- if afterGenSnapshotHook != nil { - afterGenSnapshotHook() - } - - // revalidate re-reads a generation's state under a row lock that - // conflicts with the no-key tuple lock RetireGeneration / - // ActivateGeneration's state-flip UPDATE takes, then reports whether - // the locked re-read still sees the generation as non-retired. The - // initial non-retired snapshot above is read at the start of this tx - // under READ COMMITTED, so without this guard a concurrent retire could - // commit between that snapshot and the INSERT below — the FK insert - // takes only FOR KEY SHARE on index_generations, which does NOT conflict - // with retire's FOR NO KEY UPDATE, so the orphan pending row would commit - // and never be reaped (pickTarget skips retired gens; the retired - // index_generations row is preserved so its ON DELETE CASCADE never - // fires). Re-validating under FOR NO KEY UPDATE serializes the two - // interleavings: - // - enqueue-first: retire's state-flip UPDATE blocks on this lock, - // then its DELETE removes the rows we just inserted -> no orphan. - // - retire-first: this locking SELECT blocks until retire commits, then - // re-reads state='retired' and returns false -> we insert nothing. - // On SQLite the FOR NO KEY UPDATE clause is omitted (unsupported syntax); - // its file-level write serialization + busy_timeout force a retry on the - // losing writer, so the same invariant holds without an explicit lock. - revalidate := `SELECT id FROM index_generations WHERE id = ? AND state != ?` - if e.isPG { - revalidate += ` FOR NO KEY UPDATE` - } - revalidate = e.rebind(revalidate) - - now := time.Now().Unix() - for _, g := range gens { - // Re-read the generation's state under a row lock; skip it if it has - // been retired since the non-retired snapshot above. Reuses the same - // tx so the lock is held through the INSERTs below. 
- var lockedID int64 - err := tx.QueryRowContext(ctx, revalidate, g, string(vector.GenerationRetired)).Scan(&lockedID) - if errors.Is(err, sql.ErrNoRows) { - // Retired concurrently (PG: by a now-committed retire we just - // blocked on; SQLite: by a retire that won the write race) — do - // not enqueue, leaving no orphan pending row for this gen. - continue - } - if err != nil { - return fmt.Errorf("re-validate generation %d: %w", g, err) - } - // Bulk-insert one row per (gen, message) pair via chunked multi-row - // VALUES statements. Each (gen, message) tuple binds 3 parameters, so - // we cap each statement at enqueueChunkRows rows to stay under - // SQLite's parameter limit and avoid an oversized prepared statement - // on either backend. For a 5,000-message batch with two non-retired - // generations this is ~20 writes against the embeddings DB lock - // instead of 10,000 single-row inserts — keeps the embed worker's - // Claim from starving while sync flushes. The previous json_each - // path issued one statement per generation but is SQLite-only; - // chunked VALUES is portable to pgx. - for start := 0; start < len(messageIDs); start += enqueueChunkRows { - end := min(start+enqueueChunkRows, len(messageIDs)) - chunk := messageIDs[start:end] - - placeholders := make([]string, len(chunk)) - args := make([]any, 0, len(chunk)*3) - for i, id := range chunk { - placeholders[i] = "(?, ?, ?)" - args = append(args, g, id, now) - } - // Build the SQLite-form statement, then apply the dialect's - // insert-or-ignore rewrite (operates on ? placeholders), - // then rebind ? → $N. Same ordering as the store package's - // InsertOrIgnore-then-loggedDB-Rebind pipeline. 
- stmt := `INSERT OR IGNORE INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES ` + - strings.Join(placeholders, ",") - stmt = e.rebind(e.insertOrIgnore(stmt)) - if _, err := tx.ExecContext(ctx, stmt, args...); err != nil { - return fmt.Errorf("insert pending (gen=%d): %w", g, err) - } - } - } - if err := tx.Commit(); err != nil { - return fmt.Errorf("commit enqueue: %w", err) - } - return nil -} diff --git a/internal/vector/embed/enqueue_pg_test.go b/internal/vector/embed/enqueue_pg_test.go deleted file mode 100644 index cdfc2356f..000000000 --- a/internal/vector/embed/enqueue_pg_test.go +++ /dev/null @@ -1,230 +0,0 @@ -//go:build pgvector - -package embed - -import ( - "context" - "crypto/rand" - "database/sql" - "encoding/hex" - "os" - "strings" - "testing" - - _ "github.com/jackc/pgx/v5/stdlib" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.kenn.io/msgvault/internal/store" - "go.kenn.io/msgvault/internal/vector" - "go.kenn.io/msgvault/internal/vector/pgvector" -) - -// openPGEnqueueDB stands up a per-test schema on MSGVAULT_TEST_DB with the -// pgvector schema (index_generations + pending_embeddings). The Enqueuer -// only reads index_generations and writes pending_embeddings, so the main -// messages table is not needed here. Returns the *sql.DB; cleanup drops -// the schema via t.Cleanup. 
-func openPGEnqueueDB(t *testing.T) *sql.DB { - t.Helper() - url := os.Getenv("MSGVAULT_TEST_DB") - if !strings.HasPrefix(url, "postgres://") && !strings.HasPrefix(url, "postgresql://") { - t.Skip("pgvector enqueue tests require MSGVAULT_TEST_DB to point at a PostgreSQL DSN") - } - - buf := make([]byte, 8) - _, err := rand.Read(buf) - require.NoError(t, err, "random schema name") - schemaName := "embed_e_test_" + hex.EncodeToString(buf) - - setup, err := sql.Open("pgx", url) - require.NoError(t, err, "open setup") - defer func() { _ = setup.Close() }() - _, err = setup.Exec("CREATE SCHEMA " + schemaName) - require.NoError(t, err, "create schema") - - testURL := url - sep := "?" - if strings.Contains(url, "?") { - sep = "&" - } - testURL += sep + "search_path=" + schemaName + ",public" - - db, err := sql.Open("pgx", testURL) - require.NoError(t, err, "open") - t.Cleanup(func() { - _ = db.Close() - cleanup, err := sql.Open("pgx", url) - if err != nil { - return - } - defer func() { _ = cleanup.Close() }() - _, _ = cleanup.Exec("DROP SCHEMA " + schemaName + " CASCADE") - }) - - require.NoError(t, pgvector.Migrate(context.Background(), db, 0, false), "pgvector.Migrate") - return db -} - -// insertPGGeneration inserts an index_generations row with an explicit id -// and state so the test can control which generations are non-retired. 
-func insertPGGeneration(t *testing.T, db *sql.DB, id int64, state string) { - t.Helper() - _, err := db.ExecContext(context.Background(), ` - INSERT INTO index_generations (id, model, dimension, fingerprint, started_at, state) - OVERRIDING SYSTEM VALUE - VALUES ($1, 'm', 768, 'm:768', 0, $2)`, id, state) - require.NoError(t, err, "insert generation") -} - -func pgPendingCount(t *testing.T, db *sql.DB, gen int64) int { - t.Helper() - var n int - require.NoError(t, db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, gen).Scan(&n), - "pending count") - return n -} - -func pgEnqueuer(db *sql.DB) *Enqueuer { - d := &store.PostgreSQLDialect{} - return NewEnqueuer(db, d.Rebind, d.InsertOrIgnore) -} - -// TestEnqueuerPG_DualEnqueueAndRetiredExclusion asserts that on pgx the -// Enqueuer inserts one pending row per (non-retired generation, message) -// and skips retired generations. Before the json_each → chunked-VALUES -// port this failed against pgx: json_each is SQLite-only and the bare `?` -// placeholders are rejected by the pgx driver. -func TestEnqueuerPG_DualEnqueueAndRetiredExclusion(t *testing.T) { - ctx := context.Background() - db := openPGEnqueueDB(t) - insertPGGeneration(t, db, 1, "active") - insertPGGeneration(t, db, 2, "building") - insertPGGeneration(t, db, 3, "retired") // must NOT receive rows. - - e := pgEnqueuer(db) - require.NoError(t, e.EnqueueMessages(ctx, []int64{10, 11, 12}), "EnqueueMessages") - - assert.Equal(t, 3, pgPendingCount(t, db, 1), "active generation pending count") - assert.Equal(t, 3, pgPendingCount(t, db, 2), "building generation pending count") - assert.Equal(t, 0, pgPendingCount(t, db, 3), "retired generation must be excluded") -} - -// TestEnqueuerPG_RetireDuringEnqueue_NoOrphan drives the concurrent -// retire-during-enqueue interleaving that the locked per-generation -// re-validation closes. 
It forces the exact window the orphan-pending race -// opens — the enqueue tx reads the non-retired snapshot, THEN a concurrent -// RetireGeneration commits (UPDATE state='retired' + DELETE pending), THEN -// the enqueue attempts its inserts — and asserts no pending row is left -// behind for the now-retired generation. -// -// Without the fix the enqueue inserts pending rows for the snapshotted -// (now-retired) generation after retire's DELETE has run, so an orphan row -// commits and is never reaped. With the fix the locked re-read sees -// state='retired' and skips the generation, so the post-state has zero -// pending rows for it. -// -// The interleave is made deterministic via afterGenSnapshotHook: the hook -// fires inside the enqueue tx after the snapshot read and runs the retire -// to completion before returning, so the enqueue's re-validation always -// observes the committed retire. -func TestEnqueuerPG_RetireDuringEnqueue_NoOrphan(t *testing.T) { - ctx := context.Background() - db := openPGEnqueueDB(t) - - // Gen 1 stays active (so a non-force retire of gen 2 is permitted) and - // is the control that must keep its rows. Gen 2 is the building gen that - // gets retired mid-enqueue and must end with zero pending rows. - insertPGGeneration(t, db, 1, "active") - insertPGGeneration(t, db, 2, "building") - - backend, err := pgvector.Open(ctx, pgvector.Options{DB: db, SkipMigrate: true}) - require.NoError(t, err, "open pgvector backend") - - // The hook fires once, inside the enqueue tx, after the non-retired - // snapshot (which still includes gen 2) is read. We retire gen 2 to - // completion here so the enqueue's subsequent locked re-validation - // observes the committed state='retired'. Reset the seam so it cannot - // leak into sibling tests sharing this package's globals. 
- var retireErr error - afterGenSnapshotHook = func() { - retireErr = backend.RetireGeneration(ctx, 2, false) - } - t.Cleanup(func() { afterGenSnapshotHook = nil }) - - e := pgEnqueuer(db) - require.NoError(t, e.EnqueueMessages(ctx, []int64{10, 11, 12}), "EnqueueMessages") - require.NoError(t, retireErr, "RetireGeneration during enqueue") - - // Gen 2 was retired before the enqueue inserted its rows: the locked - // re-validation must have excluded it, leaving zero orphan pending rows. - assert.Equal(t, 0, pgPendingCount(t, db, 2), - "retired-mid-enqueue generation must have no orphan pending rows") - // Gen 1 stayed active throughout, so it still receives every id. - assert.Equal(t, 3, pgPendingCount(t, db, 1), - "active generation still enqueued despite concurrent retire") - - // Sanity: gen 2 really is retired. - var state string - require.NoError(t, db.QueryRow( - `SELECT state FROM index_generations WHERE id = $1`, int64(2)).Scan(&state)) - assert.Equal(t, string(vector.GenerationRetired), state, "gen 2 retired") -} - -// TestEnqueuerPG_Idempotent asserts re-enqueueing the same IDs is a no-op -// via ON CONFLICT (generation_id, message_id) DO NOTHING — exercised both -// across calls and within a single call carrying duplicate IDs. -func TestEnqueuerPG_Idempotent(t *testing.T) { - ctx := context.Background() - db := openPGEnqueueDB(t) - insertPGGeneration(t, db, 1, "active") - - e := pgEnqueuer(db) - require.NoError(t, e.EnqueueMessages(ctx, []int64{42}), "first enqueue") - // Re-enqueue across calls and with an intra-call duplicate. - require.NoError(t, e.EnqueueMessages(ctx, []int64{42, 42}), "re-enqueue with duplicates") - assert.Equal(t, 1, pgPendingCount(t, db, 1), "duplicate (gen, message) must collapse to one row") -} - -// TestEnqueuerPG_MultiChunk enqueues more IDs than enqueueChunkRows so the -// chunked-VALUES insert spans more than one statement. 
This exercises the -// chunk loop's boundary handling and confirms the total parameter count -// per statement stays bounded on pgx. The count uses -// 2*enqueueChunkRows + 1 so the final chunk is a small remainder. -func TestEnqueuerPG_MultiChunk(t *testing.T) { - ctx := context.Background() - db := openPGEnqueueDB(t) - insertPGGeneration(t, db, 1, "active") - insertPGGeneration(t, db, 2, "building") - - const total = 2*enqueueChunkRows + 1 - ids := make([]int64, total) - for i := range ids { - ids[i] = int64(i + 1) - } - - e := pgEnqueuer(db) - require.NoError(t, e.EnqueueMessages(ctx, ids), "EnqueueMessages spanning multiple chunks") - - assert.Equal(t, total, pgPendingCount(t, db, 1), "active generation got every id across chunks") - assert.Equal(t, total, pgPendingCount(t, db, 2), "building generation got every id across chunks") - - // Re-enqueue the full batch: still idempotent across multiple chunks. - require.NoError(t, e.EnqueueMessages(ctx, ids), "re-enqueue multi-chunk batch") - assert.Equal(t, total, pgPendingCount(t, db, 1), "active count unchanged after idempotent re-enqueue") -} - -// TestEnqueuerPG_NoGenerations_Noop asserts EnqueueMessages is a clean -// no-op when there are no non-retired generations. 
-func TestEnqueuerPG_NoGenerations_Noop(t *testing.T) { - ctx := context.Background() - db := openPGEnqueueDB(t) - insertPGGeneration(t, db, 1, "retired") - - e := pgEnqueuer(db) - require.NoError(t, e.EnqueueMessages(ctx, []int64{1, 2, 3}), "EnqueueMessages with only retired gen") - - var n int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&n)) - assert.Equal(t, 0, n, "no pending rows when only retired generations exist") -} diff --git a/internal/vector/embed/enqueue_test.go b/internal/vector/embed/enqueue_test.go deleted file mode 100644 index 23c920457..000000000 --- a/internal/vector/embed/enqueue_test.go +++ /dev/null @@ -1,65 +0,0 @@ -//go:build sqlite_vec - -package embed - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestEnqueuer_NoGenerations_Noop(t *testing.T) { - ctx := context.Background() - db := openVectorsDBForEnqueue(t) - e := NewEnqueuer(db, nil, nil) - require.NoError(t, e.EnqueueMessages(ctx, []int64{1, 2, 3}), "EnqueueMessages with no generations") - // Should be no pending rows. - var n int - require.NoError(t, db.QueryRowContext(ctx, `SELECT COUNT(*) FROM pending_embeddings`).Scan(&n)) - assert.Equal(t, 0, n, "pending count") -} - -func TestEnqueuer_ActiveGenerationOnly(t *testing.T) { - ctx := context.Background() - db := openVectorsDBForEnqueue(t) - insertGenerationStatic(t, db, 1, "active") - e := NewEnqueuer(db, nil, nil) - require.NoError(t, e.EnqueueMessages(ctx, []int64{10, 11})) - assertPending(t, db, 1, 2) -} - -func TestEnqueuer_ActiveAndBuilding_DualEnqueue(t *testing.T) { - ctx := context.Background() - db := openVectorsDBForEnqueue(t) - insertGenerationStatic(t, db, 1, "active") - insertGenerationStatic(t, db, 2, "building") - insertGenerationStatic(t, db, 3, "retired") // should NOT receive. 
- e := NewEnqueuer(db, nil, nil) - require.NoError(t, e.EnqueueMessages(ctx, []int64{100})) - assertPending(t, db, 1, 1) - assertPending(t, db, 2, 1) - assertPending(t, db, 3, 0) -} - -func TestEnqueuer_DuplicateIDs_Ignored(t *testing.T) { - ctx := context.Background() - db := openVectorsDBForEnqueue(t) - insertGenerationStatic(t, db, 1, "active") - e := NewEnqueuer(db, nil, nil) - require.NoError(t, e.EnqueueMessages(ctx, []int64{42})) - // Second call with same ID should not error; count still 1. - require.NoError(t, e.EnqueueMessages(ctx, []int64{42, 42})) - assertPending(t, db, 1, 1) -} - -func TestEnqueuer_EmptyIDs_Noop(t *testing.T) { - ctx := context.Background() - db := openVectorsDBForEnqueue(t) - insertGenerationStatic(t, db, 1, "active") - e := NewEnqueuer(db, nil, nil) - assert.NoError(t, e.EnqueueMessages(ctx, nil), "EnqueueMessages(nil)") - assert.NoError(t, e.EnqueueMessages(ctx, []int64{}), "EnqueueMessages([])") - assertPending(t, db, 1, 0) -} diff --git a/internal/vector/embed/orphan_reset_test.go b/internal/vector/embed/orphan_reset_test.go new file mode 100644 index 000000000..169156234 --- /dev/null +++ b/internal/vector/embed/orphan_reset_test.go @@ -0,0 +1,177 @@ +//go:build sqlite_vec + +package embed + +import ( + "context" + "database/sql" + "fmt" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/vector" + "go.kenn.io/msgvault/internal/vector/sqlitevec" +) + +// orphanFixture stands up a main DB (messages + bodies + applied_migrations) +// and a vectors.db at known paths, so a test can stamp orphaned embed_gen +// values and then RE-OPEN the backend to exercise resetOrphanedEmbedGen +// (which only runs from sqlitevec.Open). 
+type orphanFixture struct { + MainDB *sql.DB + MainPath string + VecPath string + Store WorkStore + Client *fakeEmbeddingClient +} + +// newOrphanFixture creates n live messages (id 1..n) with NULL embed_gen, +// plus the message_bodies and applied_migrations tables, at known file paths. +// It does NOT open a backend; the caller opens (and re-opens) as needed. +func newOrphanFixture(t *testing.T, n int) *orphanFixture { + t.Helper() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "main.db") + require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") + mainDB, err := sql.Open(sqlitevec.DriverName(), mainPath) + require.NoError(t, err, "open main") + t.Cleanup(func() { _ = mainDB.Close() }) + + schema := testMainSchema + ` +CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at DATETIME DEFAULT CURRENT_TIMESTAMP +);` + _, err = mainDB.Exec(schema) + require.NoError(t, err, "schema") + for i := 1; i <= n; i++ { + _, err := mainDB.Exec( + `INSERT INTO messages (id, subject) VALUES (?, ?)`, i, fmt.Sprintf("msg %d", i)) + require.NoError(t, err, "insert message") + _, err = mainDB.Exec( + `INSERT INTO message_bodies (message_id, body_text) VALUES (?, ?)`, i, fmt.Sprintf("body %d", i)) + require.NoError(t, err, "insert body") + } + + return &orphanFixture{ + MainDB: mainDB, + MainPath: mainPath, + VecPath: filepath.Join(dir, "vectors.db"), + Store: &testWorkStore{db: mainDB}, + Client: &fakeEmbeddingClient{dim: 4}, + } +} + +// openBackend opens (or re-opens) the sqlitevec backend over the fixture's +// vectors.db + main DB, registering cleanup. 
+func (f *orphanFixture) openBackend(ctx context.Context, t *testing.T) *sqlitevec.Backend { + t.Helper() + b, err := sqlitevec.Open(ctx, sqlitevec.Options{ + Path: f.VecPath, + MainPath: f.MainPath, + Dimension: 4, + MainDB: f.MainDB, + }) + require.NoError(t, err, "sqlitevec.Open") + t.Cleanup(func() { _ = b.Close() }) + return b +} + +// TestResetOrphanedEmbedGen_RecreateScenario reproduces the vectors.db- +// recreate bug (Codex 129c #1): main.db carries embed_gen=1 stamps but the +// (fresh / empty) index_generations does NOT contain id 1. Opening the backend +// writable must reset those orphaned stamps to NULL BEFORE any rebuild can +// reuse id 1, so coverage reports them missing and a subsequent build re-embeds +// them — no false "done"/empty-index activation. +func TestResetOrphanedEmbedGen_RecreateScenario(t *testing.T) { + ctx := context.Background() + f := newOrphanFixture(t, 2) + + // Simulate a recreated vectors.db: empty index_generations, but main.db + // already stamps both messages embed_gen=1 (the old, now-gone gen id). + _, err := f.MainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = 1`) + require.NoError(t, err, "stamp orphaned embed_gen=1") + + // Sanity: WITHOUT the reset, coverage for a freshly-created gen id 1 would + // (wrongly) read these as covered. Confirm both are currently stamped. + require.Equal(t, 0, countMissing(t, f.MainDB, 1), "pre-open: stamps mask coverage") + + // Open writable: index_generations is empty, so the valid-id set is empty + // and ALL non-NULL stamps are orphaned -> cleared. + b := f.openBackend(ctx, t) + + for _, id := range []int64{1, 2} { + _, isNull := embedGenOf(t, f.MainDB, id) + assert.Truef(t, isNull, "msg %d embed_gen reset to NULL after recreate open", id) + } + + // Now a rebuild creates a fresh gen (id 1, reusing the AUTOINCREMENT seed) + // and coverage correctly reports both messages missing. 
+ gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.Equal(t, int64(1), int64(gen), "fresh vectors.db restarts gen ids at 1") + assert.Equal(t, 2, countMissing(t, f.MainDB, int64(gen)), + "both messages missing for the reused gen id (no false coverage)") + + // A worker RunOnce re-embeds both, so the index is NOT empty. + w := NewWorker(WorkerDeps{ + Backend: b, + VectorsDB: mustOpenVecDB(t, f.VecPath), + MainDB: f.MainDB, + Store: f.Store, + Client: f.Client, + BatchSize: 8, + }) + res, err := w.RunOnce(ctx, gen) + require.NoError(t, err, "RunOnce") + assert.Equal(t, 2, res.Succeeded, "both messages re-embedded after reset") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(gen)), "coverage complete after re-embed") +} + +// TestResetOrphanedEmbedGen_NoFalsePositive verifies the reset PRESERVES +// stamps that reference a still-existing generation row. A message stamped for +// a real, retained gen (active or retired — retire only flips state) must NOT +// be reset, so the normal activate/retire flow never re-embeds good data. +func TestResetOrphanedEmbedGen_NoFalsePositive(t *testing.T) { + ctx := context.Background() + f := newOrphanFixture(t, 2) + + // First Open: empty vectors.db. Create + activate a real generation with + // both messages embedded and stamped — the normal "fully covered" state. 
+ b := f.openBackend(ctx, t) + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + }), "Upsert") + require.NoError(t, f.Store.SetEmbedGen(ctx, []int64{1, 2}, int64(gen)), "stamp") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate") + require.NoError(t, b.Close(), "close first backend") + + // Re-open writable: gen still exists in index_generations, so its stamps + // must be PRESERVED (not reset). + f.openBackend(ctx, t) + + for _, id := range []int64{1, 2} { + v, isNull := embedGenOf(t, f.MainDB, id) + assert.Falsef(t, isNull, "msg %d stamp preserved (gen still exists)", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen preserved", id) + } + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(gen)), + "coverage stays complete; no spurious re-embed") +} + +// mustOpenVecDB opens a second handle to vectors.db for the worker's +// embed_runs / watermark writes, mirroring newBackfillFixture. +func mustOpenVecDB(t *testing.T, path string) *sql.DB { + t.Helper() + db, err := sql.Open(sqlitevec.DriverName(), path) + require.NoError(t, err, "open vectors.db handle") + t.Cleanup(func() { _ = db.Close() }) + return db +} diff --git a/internal/vector/embed/queue.go b/internal/vector/embed/queue.go deleted file mode 100644 index 48a147d51..000000000 --- a/internal/vector/embed/queue.go +++ /dev/null @@ -1,298 +0,0 @@ -package embed - -import ( - "context" - "crypto/rand" - "database/sql" - "encoding/hex" - "fmt" - "slices" - "strings" - "time" - - "go.kenn.io/msgvault/internal/vector" -) - -// Queue wraps pending_embeddings with a crash-safe claim-mark-complete -// pattern. A claim atomically marks up to N available rows with a token -// and the current timestamp; Complete deletes the rows (on success) and -// Release clears the claim (on failure). 
Rows whose claims are older -// than a configurable cutoff can be reclaimed via ReclaimStale, so a -// crashed worker does not strand pending work. -type Queue struct { - db *sql.DB - rebind func(string) string - // isPG is true when the underlying driver is PostgreSQL. When set, - // Claim uses FOR UPDATE SKIP LOCKED in the inner SELECT to prevent - // concurrent workers from claiming the same pending rows. - isPG bool -} - -// NewQueue returns a Queue bound to db. The caller retains ownership of -// db; Queue does not close it. rebind translates ?-placeholders to the -// driver's native form; pass an identity function (or nil) for SQLite -// and the PostgreSQL dialect's Rebind for pgx. -// -// The Queue detects whether the backend is PostgreSQL by probing rebind: -// if rebind("?") == "$1" the driver is pgx and Claim will use -// FOR UPDATE SKIP LOCKED to prevent concurrent workers from double-claiming. -func NewQueue(db *sql.DB, rebind func(string) string) *Queue { - if rebind == nil { - rebind = func(q string) string { return q } - } - return &Queue{db: db, rebind: rebind, isPG: rebind("?") == "$1"} -} - -// Claim marks up to batch pending rows for gen as claimed by a fresh -// token, returning the message IDs in ascending order alongside the -// token to present to Complete or Release. -// -// If batch <= 0, or no rows are available, Claim returns (nil, "", nil). -// Returning an empty token for "no work" avoids asking callers to hold a -// dead token. -func (q *Queue) Claim(ctx context.Context, gen vector.GenerationID, batch int) ([]int64, string, error) { - if batch <= 0 { - return nil, "", nil - } - token, err := newToken() - if err != nil { - return nil, "", fmt.Errorf("new token: %w", err) - } - now := time.Now().Unix() - - tx, err := q.db.BeginTx(ctx, nil) - if err != nil { - return nil, "", fmt.Errorf("begin claim tx: %w", err) - } - defer func() { _ = tx.Rollback() }() - - // claimSQL selects the candidate rows for the UPDATE. 
PostgreSQL uses - // FOR UPDATE SKIP LOCKED so that concurrent workers each see a disjoint - // slice of available rows; without it two workers can select the same - // rows in their subquery snapshots and the later UPDATE will simply - // overwrite the earlier claim token, causing duplicate work. - // SQLite serializes writers at the file level so no advisory locking is - // needed there (and it does not support the FOR UPDATE syntax). - claimSubquery := ` - SELECT generation_id, message_id - FROM pending_embeddings - WHERE generation_id = ? - AND claimed_at IS NULL - ORDER BY message_id - LIMIT ?` - if q.isPG { - claimSubquery += ` - FOR UPDATE SKIP LOCKED` - } - claimSQL := ` - UPDATE pending_embeddings - SET claimed_at = ?, claim_token = ? - WHERE (generation_id, message_id) IN (` + claimSubquery + `) - RETURNING message_id` - - ids, err := func() ([]int64, error) { - rows, err := tx.QueryContext(ctx, q.rebind(claimSQL), - now, token, int64(gen), batch) - if err != nil { - return nil, fmt.Errorf("claim query: %w", err) - } - defer func() { _ = rows.Close() }() - var out []int64 - for rows.Next() { - var id int64 - if err := rows.Scan(&id); err != nil { - return nil, fmt.Errorf("scan claimed id: %w", err) - } - out = append(out, id) - } - if err := rows.Err(); err != nil { - return nil, fmt.Errorf("claim rows: %w", err) - } - return out, nil - }() - if err != nil { - return nil, "", err - } - if err := tx.Commit(); err != nil { - return nil, "", fmt.Errorf("commit claim: %w", err) - } - if len(ids) == 0 { - return nil, "", nil - } - // The subquery's ORDER BY decides WHICH rows get claimed, but - // RETURNING does not guarantee order. Sort explicitly so callers - // can rely on ascending ids (matters for deterministic test - // assertions and for pairing ids with fetched message bodies by - // position). 
- slices.Sort(ids) - return ids, token, nil -} - -// completeReleaseChunkRows caps how many message ids go into a single -// Complete/Release statement's IN clause. Each statement binds one -// placeholder per id plus two (generation_id, claim_token), so a chunk -// of 500 ids = 502 bound parameters — comfortably under SQLite's -// 32,766-variable ceiling and PostgreSQL's 65,535 limit even if a -// misconfigured Embeddings.BatchSize claims far more rows than the -// default 32. Mirrors enqueue.go's enqueueChunkRows discipline so a -// single oversized batch never blows the driver bind ceiling. It is a -// var (not const) only so tests can lower it to exercise the chunk -// boundary without driving a multi-thousand-row batch through the DB; -// production never reassigns it. -var completeReleaseChunkRows = 500 - -// afterChunkHook is a test-only fault-injection seam. When non-nil it is -// invoked after each chunk's statement executes inside execTokenScoped, -// receiving the number of ids processed so far. Returning a non-nil error -// aborts the loop and triggers the transaction rollback, letting tests -// prove cross-chunk atomicity (a failure after an earlier chunk must leave -// zero rows changed). It is always nil in production. -var afterChunkHook func(processed int) error - -// Complete deletes the claimed rows from the queue. Only rows whose -// claim_token matches token are removed; any row that was reclaimed or -// re-claimed under a different token is left in place. A nil or empty -// ids slice is a no-op. The ids are processed in chunks (see -// completeReleaseChunkRows); all chunks run inside a single transaction -// (see execTokenScoped), so the delete is atomic across chunks — either -// every matching row is removed or, on any error, none are. -func (q *Queue) Complete(ctx context.Context, gen vector.GenerationID, token string, ids []int64) error { - const stmt = ` - DELETE FROM pending_embeddings - WHERE generation_id = ? - AND claim_token = ? 
- AND message_id IN ` - if err := q.execTokenScoped(ctx, stmt, gen, token, ids); err != nil { - return fmt.Errorf("delete pending: %w", err) - } - return nil -} - -// Release returns claimed rows to the pool so another worker can pick -// them up (for embedding failures). Only rows whose claim_token matches -// token are released. A nil or empty ids slice is a no-op. Like -// Complete, the ids are processed in token-scoped chunks (see -// completeReleaseChunkRows) inside a single transaction, so the release -// is atomic across chunks — all matching rows are cleared or, on error, -// none are. -func (q *Queue) Release(ctx context.Context, gen vector.GenerationID, token string, ids []int64) error { - const stmt = ` - UPDATE pending_embeddings - SET claimed_at = NULL, claim_token = NULL - WHERE generation_id = ? - AND claim_token = ? - AND message_id IN ` - if err := q.execTokenScoped(ctx, stmt, gen, token, ids); err != nil { - return fmt.Errorf("release: %w", err) - } - return nil -} - -// execTokenScoped runs stmtPrefix (a DELETE or UPDATE ending in -// "... message_id IN ") once per chunk of ids, appending an -// inPlaceholders IN clause and binding (gen, token, ids...) for each -// chunk. Chunking keeps the per-statement bind count under the driver's -// limit regardless of how many ids a single claim produced. Every chunk -// is filtered on generation_id = gen AND claim_token = token, so the -// token-scoped semantics are identical to a single statement; because the -// chunks operate over disjoint id subsets the additive deletes/updates -// compose correctly. A nil or empty ids slice is a no-op. -// -// All chunks run inside a single transaction so the operation is -// all-or-nothing: before chunking, Complete/Release was one atomic -// statement, and wrapping the chunks in a tx restores that guarantee. 
If -// any chunk fails (DB error or context cancellation) the whole batch is -// rolled back and no rows are left partially deleted/updated while the -// caller still sees an error. Works on both SQLite (mattn supports a tx -// spanning multiple statements) and PostgreSQL (pgx). -func (q *Queue) execTokenScoped(ctx context.Context, stmtPrefix string, gen vector.GenerationID, token string, ids []int64) error { - if len(ids) == 0 { - return nil - } - - tx, err := q.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin token-scoped tx: %w", err) - } - // Roll back unless Commit below succeeds. After a successful Commit - // this Rollback is a no-op (sql.ErrTxDone), so it cannot mask success. - committed := false - defer func() { - if !committed { - _ = tx.Rollback() - } - }() - - for start := 0; start < len(ids); start += completeReleaseChunkRows { - end := min(start+completeReleaseChunkRows, len(ids)) - chunk := ids[start:end] - - args := make([]any, 0, 2+len(chunk)) - args = append(args, int64(gen), token) - for _, id := range chunk { - args = append(args, id) - } - query := q.rebind(stmtPrefix + inPlaceholders(len(chunk))) - if _, err := tx.ExecContext(ctx, query, args...); err != nil { - // The deferred Rollback discards every earlier chunk in this - // tx, so the failure leaves the queue untouched. Surface the - // original error %w-wrapped; the rollback error (if any) is - // intentionally not propagated so it cannot mask this one. - return fmt.Errorf("exec chunk: %w", err) - } - // Test-only fault-injection seam (nil in production): lets the - // atomicity tests force a failure AFTER an earlier chunk has - // already executed inside this tx, exercising the cross-chunk - // rollback path deterministically. Mirrors the preReturn/OnEmbed - // seams used elsewhere in this package. 
- if afterChunkHook != nil { - if err := afterChunkHook(end); err != nil { - return fmt.Errorf("exec chunk: %w", err) - } - } - } - - if err := tx.Commit(); err != nil { - return fmt.Errorf("commit token-scoped tx: %w", err) - } - committed = true - return nil -} - -// ReclaimStale clears the claim on any pending row whose claimed_at is -// older than olderThan. Returns the number of rows reclaimed. -func (q *Queue) ReclaimStale(ctx context.Context, olderThan time.Duration) (int, error) { - cutoff := time.Now().Add(-olderThan).Unix() - res, err := q.db.ExecContext(ctx, q.rebind(` - UPDATE pending_embeddings - SET claimed_at = NULL, claim_token = NULL - WHERE claimed_at IS NOT NULL AND claimed_at < ?`), cutoff) - if err != nil { - return 0, fmt.Errorf("reclaim stale: %w", err) - } - n, err := res.RowsAffected() - if err != nil { - return 0, fmt.Errorf("rows affected: %w", err) - } - return int(n), nil -} - -// inPlaceholders returns "(?,?,...)" with n placeholders, for building -// IN clauses dynamically. The output uses ? regardless of dialect; the -// caller is expected to run the surrounding query through rebind. -func inPlaceholders(n int) string { - ph := make([]string, n) - for i := range ph { - ph[i] = "?" - } - return "(" + strings.Join(ph, ",") + ")" -} - -// newToken returns 16 hex characters backed by 8 bytes of crypto/rand. 
-func newToken() (string, error) { - b := make([]byte, 8) - if _, err := rand.Read(b); err != nil { - return "", fmt.Errorf("read random: %w", err) - } - return hex.EncodeToString(b), nil -} diff --git a/internal/vector/embed/queue_pg_test.go b/internal/vector/embed/queue_pg_test.go deleted file mode 100644 index b9b93e366..000000000 --- a/internal/vector/embed/queue_pg_test.go +++ /dev/null @@ -1,435 +0,0 @@ -//go:build pgvector - -package embed - -import ( - "context" - "crypto/rand" - "database/sql" - "encoding/hex" - "errors" - "os" - "slices" - "sort" - "strings" - "sync" - "testing" - "time" - - _ "github.com/jackc/pgx/v5/stdlib" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.kenn.io/msgvault/internal/store" - "go.kenn.io/msgvault/internal/vector/pgvector" -) - -// newPGQueueSchema stands up a per-test schema on MSGVAULT_TEST_DB, applies -// the pgvector schema, and seeds one building generation (id=1) with n -// pending rows. It returns BOTH the *sql.DB and the schema-scoped DSN so -// callers that need a SECOND independent handle on the SAME schema (e.g. the -// SKIP-LOCKED concurrency test) can sql.Open the returned dsn. The schema is -// dropped on cleanup. Skips when MSGVAULT_TEST_DB is not a PostgreSQL DSN. -func newPGQueueSchema(t *testing.T, n int) (db *sql.DB, dsn string) { - t.Helper() - url := os.Getenv("MSGVAULT_TEST_DB") - if !strings.HasPrefix(url, "postgres://") && !strings.HasPrefix(url, "postgresql://") { - t.Skip("pgvector queue tests require MSGVAULT_TEST_DB to point at a PostgreSQL DSN") - } - - buf := make([]byte, 8) - _, err := rand.Read(buf) - require.NoError(t, err, "random schema name") - schemaName := "embed_q_test_" + hex.EncodeToString(buf) - - setup, err := sql.Open("pgx", url) - require.NoError(t, err, "open setup") - defer func() { _ = setup.Close() }() - _, err = setup.Exec("CREATE SCHEMA " + schemaName) - require.NoError(t, err, "create schema") - - sep := "?" 
- if strings.Contains(url, "?") { - sep = "&" - } - dsn = url + sep + "search_path=" + schemaName + ",public" - - db, err = sql.Open("pgx", dsn) - require.NoError(t, err, "open") - t.Cleanup(func() { - _ = db.Close() - cleanup, err := sql.Open("pgx", url) - if err != nil { - return - } - defer func() { _ = cleanup.Close() }() - _, _ = cleanup.Exec("DROP SCHEMA " + schemaName + " CASCADE") - }) - - ctx := context.Background() - require.NoError(t, pgvector.Migrate(ctx, db, 0, false), "pgvector.Migrate") - - _, err = db.ExecContext(ctx, ` - INSERT INTO index_generations (id, model, dimension, fingerprint, started_at, state) - OVERRIDING SYSTEM VALUE - VALUES (1, 'm', 768, 'm:768', 0, 'building')`) - require.NoError(t, err, "insert generation") - for i := 1; i <= n; i++ { - _, err := db.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES (1, $1, 0)`, - i) - require.NoError(t, err, "insert pending") - } - return db, dsn -} - -// openPGQueueDB stands up a per-test schema seeded with n pending rows and -// returns the *sql.DB. Thin wrapper over newPGQueueSchema for the common -// single-handle case; cleanup drops the schema via t.Cleanup. -func openPGQueueDB(t *testing.T, n int) *sql.DB { - t.Helper() - db, _ := newPGQueueSchema(t, n) - return db -} - -// pgCountAvailable returns the number of available (unclaimed) rows for -// the single building generation (id=1) that newPGQueueSchema seeds — -// the only generation these queue tests create. 
-func pgCountAvailable(t *testing.T, db *sql.DB) int { - t.Helper() - var n int - err := db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = 1 AND claimed_at IS NULL`).Scan(&n) - require.NoError(t, err, "countAvailable") - return n -} - -func pgRebind() func(string) string { - return (&store.PostgreSQLDialect{}).Rebind -} - -func TestQueuePG_ClaimReleaseComplete(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 5) - q := NewQueue(db, pgRebind()) - - ids, token, err := q.Claim(ctx, 1, 3) - require.NoError(t, err, "Claim") - require.Len(t, ids, 3) - require.NotEmpty(t, token) - - more, token2, err := q.Claim(ctx, 1, 10) - require.NoError(t, err) - assert.Len(t, more, 2) - assert.NotEqual(t, token, token2, "second claim must use a fresh token") - - require.NoError(t, q.Release(ctx, 1, token, ids), "Release") - assert.Equal(t, 3, pgCountAvailable(t, db), "available after release") - - require.NoError(t, q.Complete(ctx, 1, token2, more), "Complete") - var total int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(t, 3, total, "pending total after complete = 5 - 2") -} - -func TestQueuePG_Claim_EmptyBatchIsNoop(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 1) - q := NewQueue(db, pgRebind()) - ids, token, err := q.Claim(ctx, 1, 0) - require.NoError(t, err, "Claim(0)") - assert.Empty(t, ids) - assert.Empty(t, token) -} - -func TestQueuePG_Claim_NoAvailableReturnsEmpty(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 0) - q := NewQueue(db, pgRebind()) - ids, token, err := q.Claim(ctx, 1, 10) - require.NoError(t, err, "Claim") - assert.Empty(t, ids) - assert.Empty(t, token) -} - -func TestQueuePG_Complete_WrongTokenNoop(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 2) - q := NewQueue(db, pgRebind()) - ids, _, err := q.Claim(ctx, 1, 2) - require.NoError(t, err) - require.NoError(t, q.Complete(ctx, 1, 
"deadbeef", ids), "Complete with wrong token") - var n int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&n)) - assert.Equal(t, 2, n, "Complete should not delete on token mismatch") -} - -func TestQueuePG_Release_WrongTokenNoop(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 2) - q := NewQueue(db, pgRebind()) - ids, _, err := q.Claim(ctx, 1, 2) - require.NoError(t, err) - require.NoError(t, q.Release(ctx, 1, "deadbeef", ids), "Release with wrong token") - assert.Equal(t, 0, pgCountAvailable(t, db), "available after wrong-token release (still claimed)") -} - -func TestQueuePG_ReclaimStale(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 2) - q := NewQueue(db, pgRebind()) - _, _, err := q.Claim(ctx, 1, 2) - require.NoError(t, err) - _, err = db.ExecContext(ctx, - `UPDATE pending_embeddings SET claimed_at = $1 WHERE generation_id = 1`, - time.Now().Add(-20*time.Minute).Unix()) - require.NoError(t, err) - n, err := q.ReclaimStale(ctx, 10*time.Minute) - require.NoError(t, err) - assert.Equal(t, 2, n, "reclaimed") - assert.Equal(t, 2, pgCountAvailable(t, db), "available after reclaim") -} - -func TestQueuePG_Complete_EmptyIDsIsNoop(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 1) - q := NewQueue(db, pgRebind()) - assert.NoError(t, q.Complete(ctx, 1, "token", nil), "Complete(nil)") -} - -func TestQueuePG_Claim_ReturnsIDsAscending(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 10) - q := NewQueue(db, pgRebind()) - - ids, _, err := q.Claim(ctx, 1, 10) - require.NoError(t, err, "Claim") - require.Len(t, ids, 10) - assert.True(t, sort.SliceIsSorted(ids, func(i, j int) bool { return ids[i] < ids[j] }), - "ids not ascending: %v", ids) -} - -// TestQueuePG_CompleteRelease_ChunksLargeIDSets is the PG counterpart of -// TestQueue_CompleteRelease_ChunksLargeIDSets: it lowers the chunk size -// so Complete/Release span multiple token-scoped statements on 
pgx, then -// asserts every intended row is released/deleted. -func TestQueuePG_CompleteRelease_ChunksLargeIDSets(t *testing.T) { - ctx := context.Background() - - orig := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = orig }) - - const n = 5 - db := openPGQueueDB(t, n) - q := NewQueue(db, pgRebind()) - - ids, token, err := q.Claim(ctx, 1, n) - require.NoError(t, err, "Claim") - require.Len(t, ids, n) - - require.NoError(t, q.Release(ctx, 1, token, ids), "Release (chunked)") - assert.Equal(t, n, pgCountAvailable(t, db), "all rows available after chunked Release") - - ids2, token2, err := q.Claim(ctx, 1, n) - require.NoError(t, err, "re-Claim") - require.Len(t, ids2, n) - require.NoError(t, q.Complete(ctx, 1, token2, ids2), "Complete (chunked)") - var total int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(t, 0, total, "all rows deleted after chunked Complete") -} - -// TestQueuePG_CompleteRelease_ChunkedTokenScoped verifies the chunked -// pgx path preserves the token filter across chunk boundaries. 
-func TestQueuePG_CompleteRelease_ChunkedTokenScoped(t *testing.T) { - ctx := context.Background() - - orig := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = orig }) - - const n = 5 - db := openPGQueueDB(t, n) - q := NewQueue(db, pgRebind()) - - ids, _, err := q.Claim(ctx, 1, n) - require.NoError(t, err, "Claim") - require.Len(t, ids, n) - - require.NoError(t, q.Complete(ctx, 1, "deadbeef", ids), "Complete wrong token (chunked)") - var total int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(t, n, total, "wrong-token chunked Complete must not delete") - - require.NoError(t, q.Release(ctx, 1, "deadbeef", ids), "Release wrong token (chunked)") - assert.Equal(t, 0, pgCountAvailable(t, db), "wrong-token chunked Release must leave rows claimed") -} - -// TestQueuePG_CompleteRelease_AtomicAcrossChunks is the pgx counterpart of -// TestQueue_CompleteRelease_AtomicAcrossChunks: it proves the chunked -// Complete/Release rolls back entirely when a chunk after the first fails, -// so the operation is all-or-nothing on PostgreSQL too. 
-func TestQueuePG_CompleteRelease_AtomicAcrossChunks(t *testing.T) { - ctx := context.Background() - - origChunk := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = origChunk }) - - injected := errors.New("injected mid-batch failure") - t.Cleanup(func() { afterChunkHook = nil }) - - // --- Complete (DELETE) atomicity --- - const n = 5 // 5 ids over chunk size 2 → 3 chunks - db := openPGQueueDB(t, n) - q := NewQueue(db, pgRebind()) - - ids, token, err := q.Claim(ctx, 1, n) - require.NoError(t, err, "Claim") - require.Len(t, ids, n) - - afterChunkHook = func(int) error { return injected } - err = q.Complete(ctx, 1, token, ids) - require.Error(t, err, "Complete must surface the injected failure") - require.ErrorIs(t, err, injected, "error must wrap the injected cause") - afterChunkHook = nil - - var total int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(t, n, total, "failed chunked Complete must delete zero rows (all-or-nothing)") - - require.NoError(t, q.Complete(ctx, 1, token, ids), "retry Complete") - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(t, 0, total, "clean retry deletes all rows") - - // --- Release (UPDATE) atomicity --- - db2 := openPGQueueDB(t, n) - q2 := NewQueue(db2, pgRebind()) - - ids2, token2, err := q2.Claim(ctx, 1, n) - require.NoError(t, err, "Claim (release case)") - require.Len(t, ids2, n) - require.Equal(t, 0, pgCountAvailable(t, db2), "all claimed before release") - - afterChunkHook = func(int) error { return injected } - err = q2.Release(ctx, 1, token2, ids2) - require.Error(t, err, "Release must surface the injected failure") - require.ErrorIs(t, err, injected, "error must wrap the injected cause") - afterChunkHook = nil - - assert.Equal(t, 0, pgCountAvailable(t, db2), - "failed chunked Release must clear zero claims (all-or-nothing)") - - require.NoError(t, 
q2.Release(ctx, 1, token2, ids2), "retry Release") - assert.Equal(t, n, pgCountAvailable(t, db2), "clean retry releases all rows") -} - -func TestQueuePG_Complete_AfterReclaim_PreservesNewClaim(t *testing.T) { - ctx := context.Background() - db := openPGQueueDB(t, 2) - q := NewQueue(db, pgRebind()) - - idsA, tokenA, err := q.Claim(ctx, 1, 2) - require.NoError(t, err, "Claim A") - require.Len(t, idsA, 2) - - _, err = db.ExecContext(ctx, - `UPDATE pending_embeddings SET claimed_at = $1 WHERE generation_id = 1`, - time.Now().Add(-20*time.Minute).Unix()) - require.NoError(t, err) - n, err := q.ReclaimStale(ctx, 10*time.Minute) - require.NoError(t, err, "ReclaimStale") - require.Equal(t, 2, n, "ReclaimStale count") - - idsB, tokenB, err := q.Claim(ctx, 1, 2) - require.NoError(t, err, "Claim B") - require.Len(t, idsB, 2) - require.NotEqual(t, tokenA, tokenB) - - require.NoError(t, q.Complete(ctx, 1, tokenA, idsA), "Complete(stale tokenA)") - var remaining int - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&remaining)) - require.Equal(t, 2, remaining, "stale token must not delete") - - var claimed int - require.NoError(t, db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE claim_token = $1`, tokenB).Scan(&claimed)) - assert.Equal(t, 2, claimed, "rows still holding B's token") - - require.NoError(t, q.Complete(ctx, 1, tokenB, idsB), "Complete(tokenB)") - require.NoError(t, db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&remaining)) - assert.Equal(t, 0, remaining, "pending rows after B's Complete") -} - -// TestQueuePG_ConcurrentClaim_SkipLocked verifies that FOR UPDATE SKIP LOCKED -// prevents two concurrent claimers from double-claiming the same rows. Each -// claimer runs on a separate *sql.DB (independent connection pool) so the -// claims are genuinely concurrent at the database level. -// -// The test inserts N pending rows, then fires two goroutines each calling -// Claim(N) concurrently. 
Because SKIP LOCKED makes the two transactions see -// disjoint available sets, the union of their claimed IDs must equal exactly -// the N inserted rows with no overlaps. -func TestQueuePG_ConcurrentClaim_SkipLocked(t *testing.T) { - const n = 20 - ctx := context.Background() - - // One isolated schema seeded with n pending rows. newPGQueueSchema returns - // the schema-scoped DSN so the second handle below targets the SAME schema - // — essential for this test, since two different schemas would make the - // SKIP-LOCKED assertion pass vacuously (two queues over disjoint tables). - db1, dsn := newPGQueueSchema(t, n) - - // Open a second independent connection on the SAME schema so the two Queue - // instances use separate connection pools and their transactions do not - // share state. - db2, err := sql.Open("pgx", dsn) - require.NoError(t, err, "open db2") - t.Cleanup(func() { _ = db2.Close() }) - - // Guard against an accidental two-schema refactor: a row visible via db1 - // must also be visible via db2 (they resolve the same pending table). - var visible int - require.NoError(t, - db2.QueryRowContext(ctx, `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = 1`).Scan(&visible), - "db2 must see db1's seeded rows (same schema)") - require.Equal(t, n, visible, "db1 and db2 must target the same schema's pending table") - - q1 := NewQueue(db1, pgRebind()) - q2 := NewQueue(db2, pgRebind()) - - type result struct { - ids []int64 - token string - err error - } - ch := make(chan result, 2) - - var wg sync.WaitGroup - wg.Add(2) - go func() { - defer wg.Done() - ids, tok, err := q1.Claim(ctx, 1, n) - ch <- result{ids, tok, err} - }() - go func() { - defer wg.Done() - ids, tok, err := q2.Claim(ctx, 1, n) - ch <- result{ids, tok, err} - }() - wg.Wait() - close(ch) - - var allIDs []int64 - for res := range ch { - require.NoError(t, res.err, "Claim must not error") - allIDs = append(allIDs, res.ids...) 
- } - - // The union of claimed IDs must equal exactly {1..n} with no duplicates. - slices.Sort(allIDs) - require.Len(t, allIDs, n, "total claimed rows must equal n (no rows unclaimed and no duplicates)") - for i, id := range allIDs { - assert.Equal(t, int64(i+1), id, "claimed ID at position %d", i) - } -} diff --git a/internal/vector/embed/queue_test.go b/internal/vector/embed/queue_test.go deleted file mode 100644 index ef87aa02c..000000000 --- a/internal/vector/embed/queue_test.go +++ /dev/null @@ -1,322 +0,0 @@ -//go:build sqlite_vec - -package embed - -import ( - "context" - "errors" - "sort" - "testing" - "time" - - assertpkg "github.com/stretchr/testify/assert" - requirepkg "github.com/stretchr/testify/require" -) - -func TestQueue_ClaimReleaseComplete(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - db := openVectorsDBWithPending(t, 5) - q := NewQueue(db, nil) - - ids, token, err := q.Claim(ctx, 1, 3) - require.NoError(err, "Claim") - require.Len(ids, 3) - require.NotEmpty(token, "claim token") - - // Second claim sees only 2 available. - more, token2, err := q.Claim(ctx, 1, 10) - require.NoError(err) - assert.Len(more, 2) - assert.NotEqual(token, token2, "token collision") - - require.NoError(q.Release(ctx, 1, token, ids), "Release") - assert.Equal(3, countAvailable(t, db, 1), "available after release") - - // Now complete the second batch; pending count should drop by 2. 
- require.NoError(q.Complete(ctx, 1, token2, more), "Complete") - var total int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total), "total") - assert.Equal(3, total, "pending total after complete (5 - 2)") -} - -func TestQueue_Claim_EmptyBatchIsNoop(t *testing.T) { - ctx := context.Background() - db := openVectorsDBWithPending(t, 1) - q := NewQueue(db, nil) - ids, token, err := q.Claim(ctx, 1, 0) - requirepkg.NoError(t, err, "Claim(0)") - assertpkg.Empty(t, ids) - assertpkg.Empty(t, token) -} - -func TestQueue_Claim_NoAvailableReturnsEmpty(t *testing.T) { - ctx := context.Background() - db := openVectorsDBWithPending(t, 0) - q := NewQueue(db, nil) - ids, token, err := q.Claim(ctx, 1, 10) - requirepkg.NoError(t, err, "Claim") - assertpkg.Empty(t, ids) - assertpkg.Empty(t, token) -} - -func TestQueue_Complete_WrongTokenNoop(t *testing.T) { - require := requirepkg.New(t) - ctx := context.Background() - db := openVectorsDBWithPending(t, 2) - q := NewQueue(db, nil) - ids, _, err := q.Claim(ctx, 1, 2) - require.NoError(err) - // Wrong token — rows should remain. 
- require.NoError(q.Complete(ctx, 1, "deadbeef", ids), "Complete with wrong token") - var n int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&n)) - assertpkg.Equal(t, 2, n, "Complete should not delete on token mismatch") -} - -func TestQueue_Release_WrongTokenNoop(t *testing.T) { - ctx := context.Background() - db := openVectorsDBWithPending(t, 2) - q := NewQueue(db, nil) - ids, _, err := q.Claim(ctx, 1, 2) - requirepkg.NoError(t, err) - requirepkg.NoError(t, q.Release(ctx, 1, "deadbeef", ids), "Release with wrong token") - assertpkg.Equal(t, 0, countAvailable(t, db, 1), "still claimed") -} - -func TestQueue_ReclaimStale(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - db := openVectorsDBWithPending(t, 2) - q := NewQueue(db, nil) - _, _, err := q.Claim(ctx, 1, 2) - require.NoError(err) - // Back-date the claim past the threshold. - _, err = db.ExecContext(ctx, - `UPDATE pending_embeddings SET claimed_at = ? WHERE generation_id = 1`, - time.Now().Add(-20*time.Minute).Unix()) - require.NoError(err) - n, err := q.ReclaimStale(ctx, 10*time.Minute) - require.NoError(err) - assert.Equal(2, n, "reclaimed") - assert.Equal(2, countAvailable(t, db, 1), "available after reclaim") -} - -func TestQueue_Complete_EmptyIDsIsNoop(t *testing.T) { - ctx := context.Background() - db := openVectorsDBWithPending(t, 1) - q := NewQueue(db, nil) - assertpkg.NoError(t, q.Complete(ctx, 1, "token", nil), "Complete(nil)") -} - -// TestQueue_Claim_ReturnsIDsAscending verifies that Claim's returned -// slice is sorted ascending regardless of the order SQLite's -// UPDATE...RETURNING clause produces rows. Callers (the Worker) pair -// ids with fetched message rows by position, so a non-deterministic -// order would cause silent vector↔message mixups. 
-func TestQueue_Claim_ReturnsIDsAscending(t *testing.T) { - ctx := context.Background() - db := openVectorsDBWithPending(t, 10) - q := NewQueue(db, nil) - - ids, _, err := q.Claim(ctx, 1, 10) - requirepkg.NoError(t, err, "Claim") - requirepkg.Len(t, ids, 10) - assertpkg.True(t, sort.SliceIsSorted(ids, func(i, j int) bool { return ids[i] < ids[j] }), - "ids not ascending: %v", ids) -} - -// TestQueue_CompleteRelease_ChunksLargeIDSets verifies that Complete -// and Release split an id set larger than completeReleaseChunkRows into -// multiple token-scoped statements and still affect exactly the intended -// rows. The chunk size is temporarily lowered so the test exercises the -// chunk boundary (3 chunks) with a modest row count rather than driving -// thousands of rows through the driver. -func TestQueue_CompleteRelease_ChunksLargeIDSets(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - - orig := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = orig }) - - // 5 ids over a chunk size of 2 → 3 chunks (2, 2, 1). - const n = 5 - db := openVectorsDBWithPending(t, n) - q := NewQueue(db, nil) - - ids, token, err := q.Claim(ctx, 1, n) - require.NoError(err, "Claim") - require.Len(ids, n) - - // Release across chunks: every row returns to the pool. - require.NoError(q.Release(ctx, 1, token, ids), "Release (chunked)") - assert.Equal(n, countAvailable(t, db, 1), "all rows available after chunked Release") - - // Re-claim and Complete across chunks: every row is deleted. 
- ids2, token2, err := q.Claim(ctx, 1, n) - require.NoError(err, "re-Claim") - require.Len(ids2, n) - require.NoError(q.Complete(ctx, 1, token2, ids2), "Complete (chunked)") - var total int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(0, total, "all rows deleted after chunked Complete") -} - -// TestQueue_CompleteRelease_ChunkedTokenScoped verifies that the chunked -// path keeps its token filter: a Complete/Release spanning multiple -// chunks with a wrong token must not touch any row, even the rows in -// chunks that would otherwise match by id+generation. -func TestQueue_CompleteRelease_ChunkedTokenScoped(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - - orig := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = orig }) - - const n = 5 - db := openVectorsDBWithPending(t, n) - q := NewQueue(db, nil) - - ids, _, err := q.Claim(ctx, 1, n) - require.NoError(err, "Claim") - require.Len(ids, n) - - // Wrong token across all chunks: nothing deleted, nothing released. - require.NoError(q.Complete(ctx, 1, "deadbeef", ids), "Complete wrong token (chunked)") - var total int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(n, total, "wrong-token chunked Complete must not delete") - - require.NoError(q.Release(ctx, 1, "deadbeef", ids), "Release wrong token (chunked)") - assert.Equal(0, countAvailable(t, db, 1), "wrong-token chunked Release must leave rows claimed") -} - -// TestQueue_CompleteRelease_AtomicAcrossChunks proves the chunked -// Complete/Release is all-or-nothing: if a chunk after the first fails, -// the rows the earlier chunk(s) already touched inside the transaction -// must be rolled back, leaving the queue exactly as it was. 
Before the -// chunked path was wrapped in a transaction this regressed — earlier -// chunks committed independently and a mid-batch failure left the -// DELETE/UPDATE partially applied while still returning an error. -// -// afterChunkHook (a test-only seam) forces a failure right after the -// first chunk executes, so the second chunk never runs and the whole -// tx rolls back. -func TestQueue_CompleteRelease_AtomicAcrossChunks(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - - origChunk := completeReleaseChunkRows - completeReleaseChunkRows = 2 - t.Cleanup(func() { completeReleaseChunkRows = origChunk }) - - // Fail as soon as the first chunk (2 ids) has executed; the second - // chunk must never apply, and the first must be rolled back. - injected := errors.New("injected mid-batch failure") - t.Cleanup(func() { afterChunkHook = nil }) - - // --- Complete (DELETE) atomicity --- - const n = 5 // 5 ids over chunk size 2 → 3 chunks - db := openVectorsDBWithPending(t, n) - q := NewQueue(db, nil) - - ids, token, err := q.Claim(ctx, 1, n) - require.NoError(err, "Claim") - require.Len(ids, n) - - afterChunkHook = func(int) error { return injected } - err = q.Complete(ctx, 1, token, ids) - require.Error(err, "Complete must surface the injected failure") - require.ErrorIs(err, injected, "error must wrap the injected cause") - afterChunkHook = nil - - // Atomicity: NOT ONE row was deleted (the first chunk rolled back). - var total int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(n, total, "failed chunked Complete must delete zero rows (all-or-nothing)") - - // And a clean retry (no fault) deletes everything. 
- require.NoError(q.Complete(ctx, 1, token, ids), "retry Complete") - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&total)) - assert.Equal(0, total, "clean retry deletes all rows") - - // --- Release (UPDATE) atomicity --- - db2 := openVectorsDBWithPending(t, n) - q2 := NewQueue(db2, nil) - - ids2, token2, err := q2.Claim(ctx, 1, n) - require.NoError(err, "Claim (release case)") - require.Len(ids2, n) - require.Equal(0, countAvailable(t, db2, 1), "all claimed before release") - - afterChunkHook = func(int) error { return injected } - err = q2.Release(ctx, 1, token2, ids2) - require.Error(err, "Release must surface the injected failure") - require.ErrorIs(err, injected, "error must wrap the injected cause") - afterChunkHook = nil - - // Atomicity: NOT ONE row was released (still claimed under token2). - assert.Equal(0, countAvailable(t, db2, 1), - "failed chunked Release must clear zero claims (all-or-nothing)") - - // Clean retry releases everything. - require.NoError(q2.Release(ctx, 1, token2, ids2), "retry Release") - assert.Equal(n, countAvailable(t, db2, 1), "clean retry releases all rows") -} - -// TestQueue_Complete_AfterReclaim_PreservesNewClaim simulates the -// stale-worker-completing-late race: worker A claims rows, stalls -// long enough for ReclaimStale to clear the claim, worker B -// re-claims the same rows, then worker A finally finishes and calls -// Complete with its old token. The token check must prevent A from -// deleting B's row. -func TestQueue_Complete_AfterReclaim_PreservesNewClaim(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - db := openVectorsDBWithPending(t, 2) - q := NewQueue(db, nil) - - idsA, tokenA, err := q.Claim(ctx, 1, 2) - require.NoError(err, "Claim A") - require.Len(idsA, 2, "Claim A ids") - - // Back-date A's claim past the threshold, then reclaim. - _, err = db.ExecContext(ctx, - `UPDATE pending_embeddings SET claimed_at = ? 
WHERE generation_id = 1`, - time.Now().Add(-20*time.Minute).Unix()) - require.NoError(err) - n, err := q.ReclaimStale(ctx, 10*time.Minute) - require.NoError(err) - require.Equal(2, n, "ReclaimStale n") - - idsB, tokenB, err := q.Claim(ctx, 1, 2) - require.NoError(err, "Claim B") - require.Len(idsB, 2) - require.NotEqual(tokenA, tokenB) - - // Stale worker A finishes and calls Complete with its dead token. - // The token check must keep B's rows intact. - require.NoError(q.Complete(ctx, 1, tokenA, idsA), "Complete(stale tokenA)") - var remaining int - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&remaining)) - require.Equal(2, remaining, "pending rows after stale Complete; stale token must not delete") - - // B's claim should still be intact (claim_token matches tokenB). - var claimed int - require.NoError(db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE claim_token = ?`, tokenB).Scan(&claimed)) - assert.Equal(2, claimed, "rows still holding B's token") - - // B can now legitimately Complete. - require.NoError(q.Complete(ctx, 1, tokenB, idsB), "Complete(tokenB)") - require.NoError(db.QueryRow(`SELECT COUNT(*) FROM pending_embeddings`).Scan(&remaining)) - assert.Equal(0, remaining, "pending rows after B's Complete") -} diff --git a/internal/vector/embed/testsupport_test.go b/internal/vector/embed/testsupport_test.go index a0748da82..d271347e1 100644 --- a/internal/vector/embed/testsupport_test.go +++ b/internal/vector/embed/testsupport_test.go @@ -7,66 +7,159 @@ import ( "database/sql" "fmt" "path/filepath" + "strings" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/store" "go.kenn.io/msgvault/internal/vector" "go.kenn.io/msgvault/internal/vector/sqlitevec" ) -// openVectorsDBWithPending opens a fresh vectors.db with one generation -// (id=1) and n pending rows for that generation. The database is closed -// automatically on test cleanup. 
-func openVectorsDBWithPending(t *testing.T, n int) *sql.DB { - t.Helper() - ctx := context.Background() - require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") - path := filepath.Join(t.TempDir(), "vectors.db") - db, err := sql.Open(sqlitevec.DriverName(), path) - require.NoError(t, err, "open vectors.db") - t.Cleanup(func() { _ = db.Close() }) - require.NoError(t, sqlitevec.Migrate(ctx, db, 768), "Migrate") - - _, err = db.ExecContext(ctx, ` - INSERT INTO index_generations (id, model, dimension, fingerprint, started_at, state) - VALUES (1, 'm', 768, 'm:768', 0, 'building')`) - require.NoError(t, err, "insert generation") - for i := 1; i <= n; i++ { - _, err := db.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES (1, ?, 0)`, - i) - require.NoError(t, err, "insert pending") +// testMainSchema is the minimal main-DB schema the worker reads, including +// the last_modified column + the database-maintained triggers that bump it on +// any message change or body insert/update — mirroring production schema.sql +// so the CAS round-trip and trigger behavior are exercised in tests. 
+const testMainSchema = ` +CREATE TABLE messages ( + id INTEGER PRIMARY KEY, + subject TEXT, + deleted_at DATETIME, + deleted_from_source_at DATETIME, + embed_gen INTEGER, + last_modified DATETIME DEFAULT CURRENT_TIMESTAMP +); +CREATE TABLE message_bodies ( + message_id INTEGER PRIMARY KEY, + body_text TEXT, + body_html TEXT +); +CREATE TRIGGER trg_messages_last_modified +AFTER UPDATE ON messages FOR EACH ROW +WHEN OLD.last_modified = NEW.last_modified +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.id; +END; +CREATE TRIGGER trg_message_bodies_last_modified_upd +AFTER UPDATE ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END; +CREATE TRIGGER trg_message_bodies_last_modified_ins +AFTER INSERT ON message_bodies FOR EACH ROW +BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; +END;` + +// testWorkStore is a minimal WorkStore backed by the test main DB. It +// mirrors store.ScanForEmbedding / store.SetEmbedGen against the test's +// `messages` table (which carries id, subject, deleted_at, +// deleted_from_source_at, embed_gen, last_modified). +type testWorkStore struct { + db *sql.DB +} + +func (s *testWorkStore) ScanForEmbedding(ctx context.Context, target int64, afterID int64, limit int) ([]int64, error) { + rows, err := s.db.QueryContext(ctx, + `SELECT id FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL + AND id > ? 
+ ORDER BY id LIMIT ?`, target, afterID, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + var out []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, err + } + out = append(out, id) + } + return out, rows.Err() +} + +func (s *testWorkStore) SetEmbedGen(ctx context.Context, ids []int64, target int64) error { + if len(ids) == 0 { + return nil + } + ph := make([]string, len(ids)) + args := make([]any, 0, 1+len(ids)) + args = append(args, target) + for i, id := range ids { + ph[i] = "?" + args = append(args, id) } - return db + _, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = ? WHERE id IN (`+strings.Join(ph, ",")+`)`, args...) + return err } -// countAvailable returns the number of rows for gen whose claimed_at -// IS NULL (i.e. available to be claimed). -func countAvailable(t *testing.T, db *sql.DB, gen int64) int { +// SetEmbedGenIfUnchanged mirrors store.Store.SetEmbedGenIfUnchanged: a +// per-row optimistic-CAS stamp gated on last_modified, used by the worker's +// content read→stamp path. Returns the ids whose UPDATE matched 0 rows (CAS +// misses) so the worker can log them and exclude them from success accounting. +func (s *testWorkStore) SetEmbedGenIfUnchanged(ctx context.Context, items []store.EmbedGenStamp, target int64) (missed []int64, err error) { + for _, it := range items { + res, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = ? WHERE id = ? AND last_modified = ?`, + target, it.ID, it.LastModified) + if err != nil { + return missed, err + } + n, err := res.RowsAffected() + if err != nil { + return missed, err + } + if n == 0 { + missed = append(missed, it.ID) + } + } + return missed, nil +} + +// countMissing returns how many live messages still need embedding for +// gen (embed_gen IS NULL OR embed_gen <> gen) in the test main DB. 
+func countMissing(t *testing.T, db *sql.DB, gen int64) int { t.Helper() var n int err := db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND claimed_at IS NULL`, - gen).Scan(&n) - require.NoError(t, err, "countAvailable") + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL`, gen).Scan(&n) + require.NoError(t, err, "countMissing") return n } +// readWatermark returns the persisted watermark for gen (0 if absent). +func readWatermark(t *testing.T, db *sql.DB, gen int64) int64 { + t.Helper() + var id int64 + err := db.QueryRow(`SELECT watermark_id FROM embed_watermark WHERE generation_id = ?`, gen).Scan(&id) + if err == sql.ErrNoRows { + return 0 + } + require.NoError(t, err, "readWatermark") + return id +} + // workerFixture bundles everything needed for an end-to-end worker test. type workerFixture struct { MainDB *sql.DB VectorsDB *sql.DB + Store WorkStore Backend vector.Backend BuildingGen vector.GenerationID FakeClient *fakeEmbeddingClient } // newWorkerFixture creates a main DB with n messages (subject="msg N", -// body="body N"), opens a real sqlitevec backend, creates a building -// generation (seeds pending_embeddings from the main DB), and installs a -// fakeEmbeddingClient that returns a deterministic vector per input. +// body="body N", embed_gen NULL), opens a real sqlitevec backend, creates +// a building generation, and installs a fakeEmbeddingClient that returns a +// deterministic vector per input. 
func newWorkerFixture(t *testing.T, n int) *workerFixture { t.Helper() ctx := context.Background() @@ -78,19 +171,7 @@ func newWorkerFixture(t *testing.T, n int) *workerFixture { require.NoError(t, err, "open main") t.Cleanup(func() { _ = mainDB.Close() }) - schema := ` -CREATE TABLE messages ( - id INTEGER PRIMARY KEY, - subject TEXT, - deleted_at DATETIME, - deleted_from_source_at DATETIME -); -CREATE TABLE message_bodies ( - message_id INTEGER PRIMARY KEY, - body_text TEXT, - body_html TEXT -);` - _, err = mainDB.Exec(schema) + _, err = mainDB.Exec(testMainSchema) require.NoError(t, err, "schema") for i := 1; i <= n; i++ { _, err := mainDB.Exec( @@ -124,50 +205,17 @@ CREATE TABLE message_bodies ( return &workerFixture{ MainDB: mainDB, VectorsDB: vecDB, + Store: &testWorkStore{db: mainDB}, Backend: b, BuildingGen: gid, FakeClient: fc, } } -// openVectorsDBForEnqueue opens a vectors.db with the schema applied but -// NO generations. Useful for Enqueuer tests that insert their own generations. -func openVectorsDBForEnqueue(t *testing.T) *sql.DB { - t.Helper() - ctx := context.Background() - require.NoError(t, sqlitevec.RegisterExtension(), "RegisterExtension") - path := filepath.Join(t.TempDir(), "vectors.db") - db, err := sql.Open(sqlitevec.DriverName(), path) - require.NoError(t, err, "open") - t.Cleanup(func() { _ = db.Close() }) - require.NoError(t, sqlitevec.Migrate(ctx, db, 768), "Migrate") - return db -} - -// insertGenerationStatic inserts an index_generations row with the given -// state. id is used verbatim (not auto-increment). -func insertGenerationStatic(t *testing.T, db *sql.DB, id int64, state string) { - t.Helper() - _, err := db.Exec( - `INSERT INTO index_generations (id, model, dimension, fingerprint, started_at, state) - VALUES (?, 'm', 768, 'm:768', 0, ?)`, id, state) - require.NoError(t, err, "insert generation %d", id) -} - -// assertPending asserts the number of pending rows for gen. 
-func assertPending(t *testing.T, db *sql.DB, gen int64, want int) { - t.Helper() - var n int - err := db.QueryRow( - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, gen).Scan(&n) - require.NoError(t, err, "count pending (gen=%d)", gen) - assert.Equal(t, want, n, "pending for gen %d", gen) -} - // fakeEmbeddingClient returns a deterministic vector per input; tests // may force failures with FailNext(n) or run a callback inside Embed -// (after the queue claim, before Upsert/Complete) to perturb DB state -// for race or failure testing. +// (after the scan, before Upsert/stamp) to perturb DB state for race or +// failure testing. type fakeEmbeddingClient struct { dim int failN int diff --git a/internal/vector/embed/watermark.go b/internal/vector/embed/watermark.go new file mode 100644 index 000000000..46a6baf27 --- /dev/null +++ b/internal/vector/embed/watermark.go @@ -0,0 +1,77 @@ +package embed + +import ( + "context" + "database/sql" + "errors" + "fmt" + + "go.kenn.io/msgvault/internal/vector" +) + +// Watermark reads and writes per-generation forward-scan resume points in +// the embed_watermark table. It lives WITH the generations it watermarks: +// vectors.db on SQLite, the main PostgreSQL database on PG. The worker +// seeds its scan from GetWatermark at run start and advances it after each +// successful batch via SetWatermark. +// +// The watermark is a pure optimization. Losing it (or never seeding it) +// only restarts the next scan from id 0, which is harmless: the scan +// predicate (embed_gen IS NULL OR embed_gen <> gen) plus the idempotent +// embeddings upsert make re-sweeping already-covered rows a no-op. The +// full-scan backstop ignores the watermark entirely. +// +// The upsert SQL (INSERT ... ON CONFLICT ... DO UPDATE SET ... = +// excluded....) is portable across SQLite (3.24+) and PostgreSQL, so +// Watermark needs no dialect probe beyond rebind. 
+type Watermark struct { + db *sql.DB + rebind func(string) string +} + +// NewWatermark returns a Watermark bound to db (the generation-side DB). +// The caller retains ownership of db. rebind translates ?-placeholders to +// the driver's native form; pass nil (or an identity func) for SQLite and +// the PostgreSQL dialect's Rebind for pgx. +func NewWatermark(db *sql.DB, rebind func(string) string) *Watermark { + if rebind == nil { + rebind = func(q string) string { return q } + } + return &Watermark{db: db, rebind: rebind} +} + +// GetWatermark returns the stored watermark for gen, or 0 when no row +// exists yet (which makes the next scan start from the beginning — safe by +// design). A nil db (watermark disabled) returns 0 without error. +func (w *Watermark) GetWatermark(ctx context.Context, gen vector.GenerationID) (int64, error) { + if w == nil || w.db == nil { + return 0, nil + } + var id int64 + err := w.db.QueryRowContext(ctx, + w.rebind(`SELECT watermark_id FROM embed_watermark WHERE generation_id = ?`), + int64(gen)).Scan(&id) + if errors.Is(err, sql.ErrNoRows) { + return 0, nil + } + if err != nil { + return 0, fmt.Errorf("get watermark: %w", err) + } + return id, nil +} + +// SetWatermark upserts the watermark for gen to id. A nil db (watermark +// disabled) is a no-op. Advancing the watermark is non-critical — a +// failure here is logged by the worker, not fatal — so callers may treat +// the error as best-effort. +func (w *Watermark) SetWatermark(ctx context.Context, gen vector.GenerationID, id int64) error { + if w == nil || w.db == nil { + return nil + } + stmt := `INSERT INTO embed_watermark (generation_id, watermark_id) VALUES (?, ?) 
+ ON CONFLICT (generation_id) DO UPDATE SET watermark_id = excluded.watermark_id` + if _, err := w.db.ExecContext(ctx, w.rebind(stmt), int64(gen), id); err != nil { + return fmt.Errorf("set watermark: %w", err) + } + return nil +} diff --git a/internal/vector/embed/worker.go b/internal/vector/embed/worker.go index 144edc813..7bb0943e3 100644 --- a/internal/vector/embed/worker.go +++ b/internal/vector/embed/worker.go @@ -6,11 +6,13 @@ import ( "errors" "fmt" "log/slog" + "maps" "strings" "time" "unicode/utf8" "go.kenn.io/msgvault/internal/mime" + "go.kenn.io/msgvault/internal/store" "go.kenn.io/msgvault/internal/vector" ) @@ -20,31 +22,52 @@ type EmbeddingClient interface { Embed(ctx context.Context, inputs []string) ([][]float32, error) } +// WorkStore is the subset of *store.Store the worker uses to find work +// and stamp coverage against the MAIN db. It is a narrow interface — only +// the few methods the worker actually calls — so the embed package depends +// on just that surface and the worker is easy to fake in tests, mirroring +// the func-injection style the queue/enqueuer used. *store.Store satisfies +// it implicitly. (The package still imports internal/store for the shared +// EmbedGenStamp type used by SetEmbedGenIfUnchanged.) +type WorkStore interface { + // ScanForEmbedding returns up to limit live message ids needing work + // for target (embed_gen IS NULL OR embed_gen <> target), scanning + // forward from afterID in id order. + ScanForEmbedding(ctx context.Context, target int64, afterID int64, limit int) ([]int64, error) + // SetEmbedGen stamps embed_gen=target on ids (idempotent). Used by the + // BACKFILL path, which has no content read→stamp window to guard. + SetEmbedGen(ctx context.Context, ids []int64, target int64) error + // SetEmbedGenIfUnchanged stamps embed_gen=target on each item ONLY if + // its last_modified still equals the value captured at content-read time + // (optimistic CAS). 
A row whose last_modified changed (a concurrent + // content edit bumped it via the DB triggers) is not stamped and is + // re-found by the next scan. Used by the scan-and-fill read→stamp path. + // Returns the ids whose UPDATE matched 0 rows (the CAS misses) so the + // worker can log them and exclude them from its success accounting; the + // watermark still advances and the backstop recovers them. + SetEmbedGenIfUnchanged(ctx context.Context, items []store.EmbedGenStamp, target int64) (missed []int64, err error) +} + // WorkerDeps bundles the collaborators a Worker needs. Backend, VectorsDB, -// MainDB, and Client are required; the remaining fields have sensible -// defaults when zero: BatchSize defaults to 32, StaleThreshold is -// auto-derived from EmbedTimeout × EmbedMaxRetries with a 10-minute -// floor (see NewWorker), MaxConsecutiveFailures defaults to 5, Log -// defaults to slog.Default(). +// MainDB, Store, and Client are required; the remaining fields have +// sensible defaults when zero: BatchSize defaults to 32, +// MaxConsecutiveFailures defaults to 5, Log defaults to slog.Default(). type WorkerDeps struct { - Backend vector.Backend - VectorsDB *sql.DB - MainDB *sql.DB - Client EmbeddingClient - Preprocess PreprocessConfig - MaxInputChars int - BatchSize int - StaleThreshold time.Duration - // EmbedTimeout and EmbedMaxRetries inform the StaleThreshold - // auto-derivation: a single batch can legitimately stay claimed for - // up to Timeout × MaxRetries (MaxRetries is the embed.Client's - // total-attempts count, not retries-after-the-first) before the - // worker gives up, so reclaim must wait longer than that to avoid - // reclaiming live work. Both are read only when StaleThreshold is - // zero; EmbedMaxRetries=0 is normalized to 3 to match - // embed.NewClient's default — see derivedStaleThreshold. 
- EmbedTimeout time.Duration - EmbedMaxRetries int + Backend vector.Backend + // VectorsDB is the generation-side DB handle (vectors.db on SQLite, + // the shared main DB on PG). Used for embed_runs and the watermark. + VectorsDB *sql.DB + // MainDB is the main msgvault.db handle (messages + bodies). Used by + // embedBatch's body-fetch query. + MainDB *sql.DB + // Store finds work and stamps coverage against MainDB. Required. + Store WorkStore + Client EmbeddingClient + Preprocess PreprocessConfig + MaxInputChars int + BatchSize int + // beforeSkipStamp is a test hook for read-to-stamp race coverage. + beforeSkipStamp func(ctx context.Context, ids []int64) // MaxConsecutiveFailures caps the number of consecutive batch // failures (embed error or upsert error) before RunOnce gives up // and returns an error. A successful batch resets the counter. @@ -52,27 +75,40 @@ type WorkerDeps struct { MaxConsecutiveFailures int // Rebind translates ?-placeholders to the driver's native form. // nil is treated as the identity (used by SQLite); pgvector callers - // must wire in (&store.PostgreSQLDialect{}).Rebind so the queue's - // IN-clause and UPDATE statements run on pgx. + // must wire in (&store.PostgreSQLDialect{}).Rebind so the embed_runs, + // watermark, and body-fetch statements run on pgx. Rebind func(string) string - Log *slog.Logger - // TotalPending is the queue depth at run start, used by a Progress + // LastModifiedExpr is the SELECT expression embedBatch uses to read each + // message's last_modified CAS token. It MUST scan into a value that + // round-trips by exact equality when bound back into the CAS UPDATE's + // `WHERE last_modified = ?`: + // - SQLite: "CAST(m.last_modified AS TEXT)" — the CAST defeats + // go-sqlite3's DATETIME→time.Time auto-coercion (which reformats the + // value and breaks equality); the worker scans a string and binds the + // same string back. + // - PostgreSQL: "m.last_modified" — pgx scans/binds time.Time, equality + // holds. 
+ // Zero value defaults to the SQLite CAST form (the default backend); the + // pgvector caller sets "m.last_modified". + LastModifiedExpr string + Log *slog.Logger + // TotalPending is the work depth at run start, used by a Progress // callback (if any) to report percent done and ETA. Zero disables // the denominator — Progress still fires but leaves ETA empty. TotalPending int - // Progress, if non-nil, is called after queue rows are durably - // completed, whether they produced embeddings or were intentionally - // dropped as missing/empty/unembeddable. Done and BatchMsgs count - // completed queue rows so they can be compared to TotalPending. - // Callbacks run on the worker goroutine; rate-limit inside the - // callback if output is expensive. + // Progress, if non-nil, is called after a batch is durably handled, + // whether it produced embeddings or was intentionally skip-marked as + // missing/empty/unembeddable. Done and BatchMsgs count handled + // messages so they can be compared to TotalPending. Callbacks run on + // the worker goroutine; rate-limit inside the callback if output is + // expensive. Progress func(ProgressReport) } -// ProgressReport captures RunOnce progress after a set of queue rows -// has been completed. Done and BatchMsgs count completed pending rows; -// BatchChars counts source chars for rows that actually embedded. -// BatchElapsed is end-to-end for that progress unit. +// ProgressReport captures RunOnce progress after a set of messages has +// been handled. Done and BatchMsgs count handled messages; BatchChars +// counts source chars for messages that actually embedded. BatchElapsed +// is end-to-end for that progress unit. type ProgressReport struct { Done int TotalPending int @@ -82,23 +118,26 @@ type ProgressReport struct { RunElapsed time.Duration } -// Worker drives one building generation from claimed pending rows to -// persisted embeddings. 
A single Worker is safe for sequential use; to -// parallelize, construct multiple workers that share the same Backend -// and DB handles. +// Worker drives one generation from needs-work messages to persisted +// embeddings via a scan-and-fill loop: it scans the main DB for messages +// whose embed_gen does not match the target generation, embeds them, +// upserts the vectors, then stamps embed_gen so they drop out of the next +// scan. A single Worker is safe for sequential use. type Worker struct { deps WorkerDeps - q *Queue + wm *Watermark // rebind translates ?-placeholders to the driver's native form for // queries the worker issues directly against MainDB (embedBatch's - // IN-clause). Resolved in NewWorker from WorkerDeps.Rebind; nil is - // normalized to the identity so the SQLite path is unchanged. - rebind func(string) string - runStart time.Time // valid only during a RunOnce call + // IN-clause). nil is normalized to the identity. + rebind func(string) string + // lastModifiedExpr is the SELECT expression for the last_modified CAS + // token (see WorkerDeps.LastModifiedExpr). Normalized to the SQLite CAST + // form when the dep is empty. + lastModifiedExpr string + runStart time.Time // valid only during a RunOnce call } // NewWorker constructs a Worker, applying defaults for BatchSize (32), -// StaleThreshold (auto-derived; see derivedStaleThreshold), // MaxConsecutiveFailures (5), and Log (slog.Default()). 
func NewWorker(d WorkerDeps) *Worker { if d.Log == nil { @@ -107,9 +146,6 @@ func NewWorker(d WorkerDeps) *Worker { if d.BatchSize == 0 { d.BatchSize = 32 } - if d.StaleThreshold == 0 { - d.StaleThreshold = derivedStaleThreshold(d.EmbedTimeout, d.EmbedMaxRetries) - } if d.MaxConsecutiveFailures == 0 { d.MaxConsecutiveFailures = 5 } @@ -117,45 +153,13 @@ func NewWorker(d WorkerDeps) *Worker { if rebind == nil { rebind = func(q string) string { return q } } - return &Worker{deps: d, q: NewQueue(d.VectorsDB, rebind), rebind: rebind} -} - -// derivedStaleThreshold picks a default StaleThreshold from the -// embedder's per-request timeout and retry count, with a 10-minute -// floor. A claim must outlive at least one full retry budget -// (timeout × attempts) — anything less risks ReclaimStale pulling -// rows out from under a still-running embed call, which would then -// race a concurrent worker on the same batch and leave stale -// Complete tokens. The 2× safety factor absorbs scheduler jitter -// and pre/post-call overhead. The floor preserves the historical -// default for the common case (Timeout=30s × 3 attempts = 3 minutes -// derived; floor wins). -// -// maxRetries here matches embed.Client's MaxRetries semantics: it is -// the TOTAL number of HTTP attempts (not retries-after-the-first). -// A zero value is normalized to 3 to mirror embed.NewClient's -// default. Without this, callers that set EmbedTimeout but leave -// EmbedMaxRetries at its zero value would derive a budget for a -// single attempt, while the client would actually try up to three -// times — and ReclaimStale could pull live claims out from under a -// retrying embed call. 
-func derivedStaleThreshold(timeout time.Duration, maxRetries int) time.Duration { - const floor = 10 * time.Minute - if timeout <= 0 { - return floor - } - attempts := maxRetries - if attempts == 0 { - attempts = 3 - } - if attempts < 1 { - attempts = 1 + lmExpr := d.LastModifiedExpr + if lmExpr == "" { + // Default to the SQLite CAST form (the default backend); pgvector + // callers set "m.last_modified". + lmExpr = "CAST(m.last_modified AS TEXT)" } - derived := 2 * timeout * time.Duration(attempts) - if derived < floor { - return floor - } - return derived + return &Worker{deps: d, wm: NewWatermark(d.VectorsDB, rebind), rebind: rebind, lastModifiedExpr: lmExpr} } // RunResult summarizes the outcome of RunOnce. @@ -191,16 +195,11 @@ type inputChunk struct { Trunc bool } -// ReclaimStale releases claims older than StaleThreshold so crashed -// workers don't leave rows stuck. Call at startup before RunOnce. -// Returns the number of rows reclaimed. -func (w *Worker) ReclaimStale(ctx context.Context) (int, error) { - n, err := w.q.ReclaimStale(ctx, w.deps.StaleThreshold) - if err != nil { - return 0, fmt.Errorf("reclaim stale: %w", err) - } - return n, nil -} +// ReclaimStale is a no-op retained to satisfy the scheduler's EmbedRunner +// interface. The scan-and-fill design has no claim leases to reclaim: a +// crashed worker leaves messages simply unstamped (embed_gen unchanged), +// and the next scan re-finds them. Always returns (0, nil). +func (w *Worker) ReclaimStale(ctx context.Context) (int, error) { return 0, nil } // startEmbedRun inserts an embed_runs row and returns the new row's id. // A failure is non-fatal — run tracking is observability, not correctness. @@ -240,15 +239,39 @@ func (w *Worker) finalizeEmbedRun(ctx context.Context, runID int64, res RunResul } } -// RunOnce drains the queue for the given generation until empty, -// releasing claimed rows on embed or upsert error so another worker can -// retry them. 
Returns when pending is empty or ctx is cancelled. +// RunOnce scans the given generation for messages needing embedding and +// fills them in, resuming the forward scan from the persisted per-gen +// watermark. It returns when no needs-work messages remain (the scan +// returns empty) or ctx is cancelled. +// +// Cross-DB ordering (SQLite): the find-work scan and the embed_gen stamp +// run against MainDB while the embeddings upsert runs against VectorsDB, +// so they cannot be one transaction. The worker orders the steps — +// embeddings upsert FIRST, then stamp embed_gen — and relies on +// idempotency: the upsert is keyed by (gen, msg, chunk), so a crash +// between the two steps just re-does an idempotent batch on the next scan. // // Returns an error when consecutive batch failures reach -// MaxConsecutiveFailures, so a persistently misconfigured embedder -// (bad credentials, unreachable endpoint) surfaces quickly instead of -// looping forever. A successful batch resets the failure counter. +// MaxConsecutiveFailures, so a persistently misconfigured embedder (bad +// credentials, unreachable endpoint) surfaces quickly instead of looping +// forever. A successful batch resets the failure counter. func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunResult, retErr error) { + return w.run(ctx, gen, false) +} + +// RunBackstop performs a full-scan pass that ignores the per-gen +// watermark, driving coverage to zero even for sub-watermark stragglers +// (a message that was unstamped but already swept past by the optimistic +// watermark — e.g. dropped during a transient failure, or a legacy row +// whose id sits below where a prior run advanced). It reuses the same +// scan/embed/stamp path with the scan cursor pinned at 0. Idempotent: +// already-covered rows are skipped by the scan predicate, so re-running it +// is cheap once the corpus is embedded. 
+func (w *Worker) RunBackstop(ctx context.Context, gen vector.GenerationID) (res RunResult, retErr error) { + return w.run(ctx, gen, true) +} + +func (w *Worker) run(ctx context.Context, gen vector.GenerationID, backstop bool) (res RunResult, retErr error) { consecutiveFailures := 0 var lastErr error completedRows := 0 @@ -256,44 +279,48 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR runID := w.startEmbedRun(ctx, gen, w.runStart.Unix()) defer func() { // Finalize on a context detached from the caller's cancellation so - // the embed_runs row is stamped (ended_at/counters/error) even when - // RunOnce exits because ctx was cancelled (Ctrl-C / SIGTERM / - // daemon shutdown). Running the close-out UPDATE on the cancelled - // ctx would short-circuit in database/sql and leave the row open - // forever, corrupting the "find in-flight/crashed runs" signal. - // A short timeout keeps shutdown from hanging on a wedged DB. - // Mirrors the query/duckdb.go cleanup convention. + // the embed_runs row is stamped even when RunOnce exits because ctx + // was cancelled. Running the close-out UPDATE on the cancelled ctx + // would short-circuit in database/sql and leave the row open + // forever. A short timeout keeps shutdown from hanging on a wedged DB. fctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second) defer cancel() w.finalizeEmbedRun(fctx, runID, res, retErr, time.Now().Unix()) }() - // orphanDrainErr/orphanDrainCount preserve the latest orphan-drain - // failure across iterations so we can surface it on the empty-claim - // exit. Without this, a Complete() failure on orphan rows would be - // logged but invisible to the caller — and if those orphans were - // the last queue rows, the next Claim returns empty and RunOnce - // would falsely report a clean drain even though stuck claimed - // rows persist until ReclaimStale (~10 min later). 
- var orphanDrainErr error - var orphanDrainCount int + + // Seed the forward-scan cursor. The backstop ignores the watermark and + // scans from the beginning so it catches sub-watermark stragglers. + var afterID int64 + if !backstop { + wm, err := w.wm.GetWatermark(ctx, gen) + if err != nil { + // Non-fatal: a missing/unreadable watermark just restarts the + // scan from 0 (the scan predicate + idempotent upsert make this + // harmless). Log and continue. + w.deps.Log.Warn("embed: read watermark failed; scanning from start", + "gen", gen, "error", err) + } else { + afterID = wm + } + } + for { if err := ctx.Err(); err != nil { return res, fmt.Errorf("RunOnce: %w", err) } batchStart := time.Now() - ids, token, err := w.q.Claim(ctx, gen, w.deps.BatchSize) + ids, err := w.deps.Store.ScanForEmbedding(ctx, int64(gen), afterID, w.deps.BatchSize) if err != nil { - return res, fmt.Errorf("claim: %w", err) + return res, fmt.Errorf("scan for embedding: %w", err) } if len(ids) == 0 { - if orphanDrainErr != nil { - return res, fmt.Errorf( - "orphan-drain failed for %d row(s); they remain claimed and will be recovered by ReclaimStale on the next run: %w", - orphanDrainCount, orphanDrainErr) - } return res, nil } res.Claimed += len(ids) + // batchMax is the highest id in this scan slice; once the batch is + // stamped these rows drop out of the predicate, but advancing the + // cursor past them avoids re-scanning the covered prefix. + batchMax := ids[len(ids)-1] eb, err := w.embedBatch(ctx, ids) if err != nil { @@ -302,47 +329,57 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR w.deps.Log.Warn("embed batch failed", "gen", gen, "ids", len(ids), "error", err) if errors.Is(err, ErrPermanent4xx) { - // Walk the claimed IDs one at a time. Drain decides - // per-ID whether to drop (if some embed, the 4xxs are - // message-specific) or release (if none embed, - // endpoint-wide failure can't be ruled out). + // Walk the scanned ids one at a time. 
Drain decides per-ID + // whether to stamp (it embedded, or is a confirmed + // message-specific 4xx given some sibling embedded) or leave + // unstamped (endpoint-wide failure can't be ruled out). w.deps.Log.Info("embed: downshifting to BatchSize=1 to drain failing batch", "gen", gen, "batch_size", len(ids)) - embedded, dropped, drainErr := w.downshiftDrain(ctx, gen, token, ids, &res, &completedRows) + embedded, embeddedOK, stamped, safeAdvanceID, drainErr := w.downshiftDrain(ctx, gen, ids, &res, &completedRows) res.Succeeded += embedded if drainErr != nil { w.deps.Log.Info("embed: downshift drain returned error", "gen", gen, "batch_size", len(ids), - "embedded", embedded, "dropped", dropped, - "error", drainErr) + "embedded", embedded, "embedded_ok", embeddedOK, "stamped", stamped, "error", drainErr) } else { w.deps.Log.Info("embed: downshift drain complete; resuming configured batch size", "gen", gen, "batch_size", len(ids), - "embedded", embedded, "dropped", dropped) + "embedded", embedded, "embedded_ok", embeddedOK, "stamped", stamped) } - // Forward progress resets the cap. Same rule as the - // all-clean main-loop success path. - if embedded > 0 { + // Forward progress resets the cap and advances the cursor — + // but ONLY past rows that are actually stamped. On a clean + // drain every id is resolved (embedded, skip-marked, or a + // message-specific 4xx drop) so safeAdvanceID == batchMax and + // behavior is unchanged. On a NON-4xx (transient) drain error + // an EARLIER singleton may have stamped while a LATER one was + // left unstamped; safeAdvanceID is the highest CONTIGUOUSLY + // stamped id before the failure, so the watermark never jumps + // past the unstamped straggler and the next RunOnce re-finds + // and retries it (idempotent). Without this, the watermark + // would advance to batchMax and the straggler would be + // stranded — recovered by the next backstop pass (manual + // `embeddings build --backstop` or the serve auto-backstop). 
+ // + // Reset the failure cap on embeddedOK (the endpoint embedded + + // upserted something this drain), NOT on `embedded` (which only + // counts CAS-stamped singletons). A singleton that embedded but + // CAS-missed its stamp proves the endpoint is healthy, so a + // recurring CAS miss must not be able to trip the + // misconfig/abort cap. + if embeddedOK > 0 { consecutiveFailures = 0 } + if safeAdvanceID > afterID { + afterID = safeAdvanceID + w.advanceWatermark(ctx, gen, safeAdvanceID, backstop) + } if drainErr != nil { - // Distinguish "drain confirms upstream 4xx" - // (every singleton returned the same - // ErrPermanent4xx — same failure as the upstream - // batch, already counted) from "drain hit an - // independent error" (transient-after-retries, - // upsert/complete failure, ctx cancel — a fresh - // failure that should fail this run immediately. lastErr = drainErr - // A retired generation is a benign drop, never a hard - // abort. downshiftDrain handles ErrGenerationRetired - // inline today (so it does not surface here), but guard - // defensively so a future drain path that propagates the - // sentinel cannot trip the generic non-4xx abort below. + // A retired generation is a benign drop, never a hard abort. if errors.Is(drainErr, vector.ErrGenerationRetired) { - continue + return res, nil } if !errors.Is(drainErr, ErrPermanent4xx) { return res, fmt.Errorf("downshift drain: %w", drainErr) @@ -351,15 +388,21 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", consecutiveFailures, lastErr) } + // Every singleton 4xx'd and nothing embedded: the rows are + // left unstamped (so a misconfigured endpoint does not + // silently lose work) and the cursor does NOT advance, so + // the next scan re-finds them and the failure cap trips. + // Avoid busy-spinning: continue lets the loop re-scan; + // consecutiveFailures will reach the cap. 
+ continue } continue } - // Non-4xx error: original release-and-fail path. + // Non-4xx error: leave the batch unstamped (next scan re-finds + // it) and do not advance the cursor, so the failure cap can + // short-circuit the loop on a persistent fault. res.Failed += len(ids) - if rerr := w.q.Release(ctx, gen, token, ids); rerr != nil { - w.deps.Log.Error("release after embed failure", "error", rerr) - } if consecutiveFailures >= w.deps.MaxConsecutiveFailures { return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", consecutiveFailures, lastErr) @@ -368,80 +411,57 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR } res.Truncated += eb.truncated + // Skip-mark messages that produced no embeddable content (missing + // from the main DB, or empty after preprocess). Stamping embed_gen + // IS the skip-marker — it drops them out of the next scan, the + // scan-and-fill replacement for deleting a queue row. + skipIDs := append(append([]int64(nil), eb.missing...), eb.empty...) + if len(eb.chunks) == 0 { - // Nothing to embed (every claimed id was missing from - // main DB or preprocessed to empty). Drop the orphans - // and move on; failure here counts toward - // MaxConsecutiveFailures because the loop would - // otherwise busy-spin on a stuck claim until - // ReclaimStale runs (10 min default). - dropIDs := append(append([]int64(nil), eb.missing...), eb.empty...) - if len(dropIDs) > 0 { + // Nothing to embed. Stamp the skip set so the scan advances. 
+ if len(skipIDs) > 0 { if len(eb.missing) > 0 { - w.deps.Log.Warn("pending messages missing from main DB", - "gen", gen, "ids", eb.missing) + w.deps.Log.Warn("messages missing from main DB", "gen", gen, "ids", eb.missing) } if len(eb.empty) > 0 { - w.deps.Log.Warn("pending messages empty after preprocess", - "gen", gen, "ids", eb.empty) + w.deps.Log.Warn("messages empty after preprocess", "gen", gen, "ids", eb.empty) } - if cerr := w.q.Complete(ctx, gen, token, dropIDs); cerr != nil { - res.Failed += len(dropIDs) - w.deps.Log.Error("complete drop failed", "error", cerr, - "gen", gen, "ids", len(dropIDs)) + missed, serr := w.stampSkipped(ctx, gen, skipIDs, eb.lastModified) + if serr != nil { + res.Failed += len(skipIDs) + w.deps.Log.Error("stamp skip set failed", "error", serr, "gen", gen, "ids", len(skipIDs)) consecutiveFailures++ - lastErr = cerr - orphanDrainErr = cerr - orphanDrainCount += len(dropIDs) + lastErr = serr if consecutiveFailures >= w.deps.MaxConsecutiveFailures { return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", consecutiveFailures, lastErr) } continue } - completedRows += len(dropIDs) - w.reportProgress(completedRows, len(dropIDs), 0, time.Since(batchStart)) + w.logCASMisses(gen, missed) + // Count only rows actually stamped (a CAS miss was not stamped). + stampedRows := len(skipIDs) - len(missed) + completedRows += stampedRows + w.reportProgress(completedRows, stampedRows, 0, time.Since(batchStart)) } + consecutiveFailures = 0 + afterID = batchMax + w.advanceWatermark(ctx, gen, batchMax, backstop) continue } + // Step 1: upsert embeddings (VectorsDB side). if err := w.deps.Backend.Upsert(ctx, gen, eb.chunks); err != nil { if errors.Is(err, vector.ErrGenerationRetired) { - // The generation was retired out from under this worker - // (its claims were reclaimed and a newer generation took - // over, or an operator retired it). 
Per the documented - // contract on vector.ErrGenerationRetired this is a benign - // "drop the batch" signal, NOT a hard failure: re-embedding - // would just re-fail identically and burn embedding-API cost - // up to MaxConsecutiveFailures. Token-aware DROP the claimed - // rows (Complete is a token-scoped DELETE, safe against a - // concurrent newer claim), do not count this as a failure, - // and continue draining so the run finishes cleanly. - // Drop the FULL claimed batch, not just the embedded subset: - // missing/empty rows were claimed under this token too, and - // leaving them claimed would strand them until ReclaimStale - // (cr2-5). `ids` is exactly the set of message IDs claimed for - // this batch (every one is embedded, missing, or empty). - w.deps.Log.Info("embed: generation retired mid-run; dropping batch", - "gen", gen, "ids", len(ids)) - if cerr := w.q.Complete(ctx, gen, token, ids); cerr != nil { - // A Complete failure during a retired-gen drop leaves these - // rows claimed; route it through the orphan-drain surfacing - // channel so RunOnce cannot report a false-clean drain while - // rows remain stuck (cr2-6). Re-embedding a retired - // generation is pointless, so surface-and-continue (no - // consecutiveFailures escalation), matching the orphan path. - w.deps.Log.Error("complete drop after retired generation", "error", cerr, - "gen", gen, "ids", len(ids)) - orphanDrainErr = cerr - orphanDrainCount += len(ids) - } - continue + // The generation was retired out from under this worker. Per + // the ErrGenerationRetired contract this is a benign "stop" + // signal, not a hard failure: re-embedding would re-fail + // identically. Do NOT stamp embed_gen (the retired gen is + // going away) and end the run cleanly. 
+ w.deps.Log.Info("embed: generation retired mid-run; stopping", "gen", gen) + return res, nil } res.Failed += len(eb.embeddedIDs) - if rerr := w.q.Release(ctx, gen, token, eb.embeddedIDs); rerr != nil { - w.deps.Log.Error("release after upsert failure", "error", rerr) - } w.deps.Log.Error("upsert failed", "gen", gen, "ids", len(eb.embeddedIDs), "error", err) consecutiveFailures++ lastErr = err @@ -451,86 +471,78 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR } continue } - // Complete acknowledges work via (gen, msg, claim_token) so a - // stale worker whose claim was already reclaimed cannot wipe - // the queue row belonging to the newer worker. Failure here - // means the embedded rows stay claimed; ReclaimStale will - // rescue them eventually but the next RunOnce would falsely - // report a clean drain in the meantime — count the batch as - // failed so the failure cap can short-circuit the loop. - if cerr := w.q.Complete(ctx, gen, token, eb.embeddedIDs); cerr != nil { + + // Step 2: atomically skip-mark empty/missing rows and remove their stale + // embeddings while holding the main DB stamp transaction open. The stamp + // must be CAS-proven before deletion, and the delete must happen before + // commit: once embed_gen is committed, coverage treats the row as + // complete and the worker will not revisit it to clean up old vectors. 
+ var skipMissed []int64 + if len(skipIDs) > 0 { + var err error + skipMissed, err = w.stampSkipped(ctx, gen, skipIDs, eb.lastModified) + if err != nil { + res.Failed += len(skipIDs) + w.deps.Log.Error("stamp skip set failed", "gen", gen, "ids", len(skipIDs), "error", err) + consecutiveFailures++ + lastErr = err + if consecutiveFailures >= w.deps.MaxConsecutiveFailures { + return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", + consecutiveFailures, lastErr) + } + continue + } + } + + // Safe to stamp after the upsert: the upsert is idempotent, so a crash + // before this stamp just re-does the embedded rows next scan. + missed, serr := w.stampCovered(ctx, gen, eb.embeddedIDs, eb.lastModified) + if serr != nil { res.Failed += len(eb.embeddedIDs) - w.deps.Log.Error("complete failed", "error", cerr, - "gen", gen, "ids", len(eb.embeddedIDs)) + w.deps.Log.Error("stamp embed_gen failed", "gen", gen, "ids", len(eb.embeddedIDs), "error", serr) consecutiveFailures++ - lastErr = cerr + lastErr = serr if consecutiveFailures >= w.deps.MaxConsecutiveFailures { return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", consecutiveFailures, lastErr) } + // Do not advance the cursor: next scan re-finds the unstamped rows + // (the upsert already ran, so re-embedding is idempotent). continue } + missed = append(missed, skipMissed...) + w.logCASMisses(gen, missed) - // Drop queue rows for messages that disappeared between - // enqueue and claim. We do this AFTER embedded rows are - // safely upserted and acknowledged so a Complete failure on - // the orphans does not strand the valid embedded rows in a - // claimed-but-unembedded state. Using Complete with our claim - // token makes this a token-aware delete: we only remove rows - // we still own. Failure here still counts as a batch failure - // because the orphan rows would stay claimed until - // ReclaimStale runs and falsely block the queue. 
- dropIDs := append(append([]int64(nil), eb.missing...), eb.empty...) - if len(dropIDs) > 0 { - if len(eb.missing) > 0 { - w.deps.Log.Warn("pending messages missing from main DB", - "gen", gen, "ids", eb.missing) - } - if len(eb.empty) > 0 { - w.deps.Log.Warn("pending messages empty after preprocess", - "gen", gen, "ids", eb.empty) - } - if cerr := w.q.Complete(ctx, gen, token, dropIDs); cerr != nil { - res.Failed += len(dropIDs) - w.deps.Log.Error("complete drop failed", "error", cerr, - "gen", gen, "ids", len(dropIDs)) - consecutiveFailures++ - lastErr = cerr - orphanDrainErr = cerr - orphanDrainCount += len(dropIDs) - batchChars := 0 - for _, c := range eb.chunks { - batchChars += c.SourceCharLen - } - completedRows += len(eb.embeddedIDs) - w.reportProgress(completedRows, len(eb.embeddedIDs), batchChars, time.Since(batchStart)) - if consecutiveFailures >= w.deps.MaxConsecutiveFailures { - // Embedded rows were already counted into - // res.Succeeded above; record the orphan-drain - // failure and abort. - res.Succeeded += len(eb.embeddedIDs) - return res, fmt.Errorf("embed worker aborting after %d consecutive failures: %w", - consecutiveFailures, lastErr) - } - // Even though the orphan drain failed, the embedded - // rows ARE done — count them and reset the cap. - // Forward progress on real messages should reset - // consecutiveFailures the same way it does in the - // downshift drain path and the all-clean success - // path. The orphan-drop Complete failure has its - // own surfacing channel via orphanDrainErr (the - // empty-claim exit returns it instead of nil), so - // we don't need consecutiveFailures to escalate - // orphan failures into an abort. The orphan rows - // stay claimed and ReclaimStale recovers them. 
- res.Succeeded += len(eb.embeddedIDs) - consecutiveFailures = 0 - continue - } + if len(eb.missing) > 0 { + w.deps.Log.Warn("messages missing from main DB", "gen", gen, "ids", eb.missing) + } + if len(eb.empty) > 0 { + w.deps.Log.Warn("messages empty after preprocess", "gen", gen, "ids", eb.empty) } - res.Succeeded += len(eb.embeddedIDs) + + // Only rows ACTUALLY stamped count as succeeded. A CAS miss (its + // last_modified moved between read and stamp) was not stamped, so it is + // excluded from Succeeded/progress and the backstop will recover it. + // Misses can come from embedded ids or from CAS-protected skips, while + // missing ids use the unconditional skip stamp and never miss. + missedEmbedded := countMembers(eb.embeddedIDs, missed) + succeeded := len(eb.embeddedIDs) - missedEmbedded + res.Succeeded += succeeded consecutiveFailures = 0 - batchProcessed := len(eb.embeddedIDs) + len(dropIDs) + // Advance the watermark exactly as before — to batchMax — even on a + // whole-batch CAS miss. Holding it back would head-of-line-block the + // drain; the backstop is the recovery path for missed rows. Because the + // watermark always advances, the drain still terminates (no no-progress + // loop) even when a batch makes no forward success. + afterID = batchMax + w.advanceWatermark(ctx, gen, batchMax, backstop) + + // Count only rows actually stamped toward progress (CAS misses were not + // stamped). When EVERY embedded id missed CAS and there are no skips, + // this batch made no forward progress for result/progress accounting — + // but we keep scanning (the watermark advanced), so the drain drains. 
+ stampedSkips := len(skipIDs) - countMembers(skipIDs, missed) + batchProcessed := succeeded + stampedSkips completedRows += batchProcessed batchChars := 0 for _, c := range eb.chunks { @@ -540,6 +552,21 @@ func (w *Worker) RunOnce(ctx context.Context, gen vector.GenerationID) (res RunR } } +// advanceWatermark persists the per-gen forward-scan cursor to id after a +// batch made forward progress. The backstop never persists (it scans from +// 0 by design and must not push the optimistic watermark backward or +// forward). Failure is non-critical — the watermark is a pure +// optimization — so it is logged, not returned. +func (w *Worker) advanceWatermark(ctx context.Context, gen vector.GenerationID, id int64, backstop bool) { + if backstop { + return + } + if err := w.wm.SetWatermark(ctx, gen, id); err != nil { + w.deps.Log.Warn("embed: advance watermark failed (non-critical)", + "gen", gen, "id", id, "error", err) + } +} + // embedBatchResult carries the output of embedBatch. chunks and // embeddedIDs are aligned by position and correspond to messages that // were actually fetched and embedded. missing lists ids from the @@ -552,13 +579,17 @@ type embedBatchResult struct { missing []int64 empty []int64 truncated int + // lastModified maps each FETCHED id (embedded or empty) to the CAS token + // captured at read time. Missing ids are absent (they have no row, so + // there is nothing to CAS-stamp — they are skip-marked unconditionally). + lastModified map[int64]any } // embedBatch fetches subject/body for ids, preprocesses each, calls the // embedding client, and assembles the resulting chunks. Messages that -// vanished between enqueue and claim (e.g. the sync deleted them) are +// vanished between scan and fetch (e.g. the sync deleted them) are // reported in the returned result's missing slice rather than causing -// a failure — the caller decides how to drain them from the queue. +// a failure — the caller skip-marks them. 
func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, error) { placeholders := make([]string, len(ids)) args := make([]any, len(ids)) @@ -567,10 +598,10 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, args[i] = id } query := w.rebind(fmt.Sprintf(` - SELECT m.id, COALESCE(m.subject, ''), COALESCE(mb.body_text, ''), COALESCE(mb.body_html, '') + SELECT m.id, COALESCE(m.subject, ''), COALESCE(mb.body_text, ''), COALESCE(mb.body_html, ''), %s FROM messages m LEFT JOIN message_bodies mb ON mb.message_id = m.id - WHERE m.id IN (%s)`, strings.Join(placeholders, ","))) + WHERE m.id IN (%s)`, w.lastModifiedExpr, strings.Join(placeholders, ","))) rows, err := w.deps.MainDB.QueryContext(ctx, query, args...) if err != nil { @@ -581,12 +612,20 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, var msgs []msgText var empty []int64 fetched := make(map[int64]struct{}, len(ids)) + // lastModified holds the per-message CAS token captured at read time, + // keyed by id. The worker binds the EXACT value scanned here back into + // the CAS stamp's WHERE last_modified = ?, so a concurrent content edit + // that bumped last_modified between this read and the stamp blocks the + // stamp (0 rows) and the row is re-found next scan. + lastModified := make(map[int64]any, len(ids)) for rows.Next() { var id int64 var subject, bodyText, bodyHTML string - if err := rows.Scan(&id, &subject, &bodyText, &bodyHTML); err != nil { + var lm any + if err := rows.Scan(&id, &subject, &bodyText, &bodyHTML, &lm); err != nil { return embedBatchResult{}, fmt.Errorf("scan message row: %w", err) } + lastModified[id] = lm // Fall back to HTML-to-text when the plaintext body is empty — // HTML-only messages would otherwise get subject-only embeddings // and have materially worse semantic recall. 
@@ -625,8 +664,8 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, return embedBatchResult{}, fmt.Errorf("iterate message rows: %w", err) } - // Identify claimed ids that had no row in messages; we'll report - // them back so the caller can drop them from the queue. + // Identify scanned ids that had no row in messages; we'll report + // them back so the caller can skip-mark them. var missing []int64 for _, id := range ids { if _, ok := fetched[id]; !ok { @@ -635,9 +674,9 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, } if len(msgs) == 0 { - // All claimed ids are missing — return an empty result (no - // chunks, no error). Caller handles the drop. - return embedBatchResult{missing: missing, empty: empty}, nil + // All scanned ids are missing/empty — return an empty result (no + // chunks, no error). Caller skip-marks them. + return embedBatchResult{missing: missing, empty: empty, lastModified: lastModified}, nil } // Chunk every message into windows of at most MaxInputChars runes. @@ -673,9 +712,6 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, // sentence may have been split across the boundary // (overlap exists to recover from this), or any // chunk of a message that was truncated upstream. - // Both feed embeddings.truncated and the per-message - // counter so users see a faithful picture of which - // embeddings cover their full source content. Trunc: msgTrunc || (chunkWindow > 0 && (sp.CharEnd-sp.CharStart) == chunkWindow && j < len(spans)-1), } @@ -689,9 +725,9 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, // single embed call past the provider's per-request limit (Ollama // stops responding around 250 inputs; OpenAI caps at 2048; either // way, payload size + request-timeout grow with the input count). 
- // The pending queue stays per-message — a message completes only - // after every one of its chunks has been embedded and upserted in - // this same call, so partial-failure semantics are unchanged. + // A message completes only after every one of its chunks has been + // embedded and upserted in this same call, so partial-failure + // semantics are unchanged. embedSubBatchSize := w.deps.BatchSize if embedSubBatchSize <= 0 { embedSubBatchSize = len(inputs) @@ -702,7 +738,11 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, end := min(i+embedSubBatchSize, len(inputs)) got, err := w.deps.Client.Embed(ctx, inputs[i:end]) if err != nil { - return embedBatchResult{}, fmt.Errorf("embed: %w", err) + return embedBatchResult{ + missing: missing, + empty: empty, + lastModified: lastModified, + }, fmt.Errorf("embed: %w", err) } if len(got) != end-i { return embedBatchResult{}, fmt.Errorf( @@ -753,77 +793,80 @@ func (w *Worker) embedBatch(ctx context.Context, ids []int64) (embedBatchResult, } } return embedBatchResult{ - chunks: chunks, - embeddedIDs: embeddedIDs, - missing: missing, - empty: empty, - truncated: truncated, + chunks: chunks, + embeddedIDs: embeddedIDs, + missing: missing, + empty: empty, + truncated: truncated, + lastModified: lastModified, }, nil } -// downshiftDrain handles a non-retryable 4xx on a claimed batch by -// walking the same already-claimed IDs one at a time. The IDs remain -// owned under the caller's claim_token throughout the drain, so we -// never re-Claim them — that would race other workers. -// -// Singleton 4xxs are NOT eagerly Completed. ErrPermanent4xx covers -// both message-specific failures (413 payload-too-large, 422 -// Unprocessable, 400 invalid-input) and endpoint/config-wide -// failures (401 bad-key, 403 forbidden, 404 invalid-model, 400 -// malformed-shared-config) — the two are indistinguishable at the -// call site. 
If we Complete-deleted on every singleton 4xx, a -// misconfigured endpoint would silently destroy work. Instead we -// defer the drop decision: collect the 4xxing IDs, and at end of -// drain decide based on whether anything embedded. +// downshiftDrain handles a non-retryable 4xx on a scanned batch by walking +// the same ids one at a time. Singletons that embed are upserted and +// stamped immediately; singletons that 4xx are deferred and the drop +// decision is made at end-of-drain based on whether anything embedded +// (message-specific 4xx → stamp-drop; endpoint-wide failure → leave +// unstamped so a misconfigured endpoint does not silently lose work). // -// Returned `embedded` is the count of singletons that successfully -// embedded. -// -// Returned `dropped` is the count of singletons whose drop was -// confirmed (Complete succeeded). A drain that releases its deferred -// IDs back to the queue (because no singleton embedded) returns -// `dropped == 0`. -// -// Returned `err`: -// - non-nil all-drop: every singleton 4xxd, no embeds. Deferred -// IDs were Released back to the queue (so a misconfigured -// endpoint does not lose work) and the wrapped 4xx is returned. -// The caller increments consecutiveFailures and the cap will -// eventually trip, surfacing the original 4xx body. -// - non-nil non-4xx interruption: transient errors that exhausted -// retries inside embedBatch, upsert/complete failures, or a -// cancellation seen after embedBatch starts. Deferred and -// unprocessed IDs are released before returning so a later run -// can retry them promptly. A cancellation observed before the -// next singleton starts returns ctx.Err without releasing; those -// rows remain claimed for ReclaimStale to recover. -// - nil: drain completed cleanly. If `embedded > 0` and there were -// deferred IDs, they were Completed as message-specific drops. 
+// Returns: +// - embedded: count of singletons that successfully embedded AND CAS-stamped +// their embed_gen (a singleton that embedded+upserted but whose CAS stamp +// missed is excluded — it is recovered by the backstop). This feeds +// res.Succeeded / progress. +// - embeddedOK: count of singletons that successfully embedded + upserted, +// REGARDLESS of whether the subsequent CAS stamp landed. This is the +// endpoint-health signal: if the endpoint demonstrably embedded something +// this drain (embeddedOK > 0), any sibling 4xxs are message-specific drops, +// not an endpoint-wide outage — even when those embeds CAS-missed and so +// contributed nothing to `embedded`. The caller resets the +// consecutive-failure counter on embeddedOK > 0. +// - stamped: count of ids whose embed_gen was stamped (embedded, +// skip-marked, and confirmed message-specific drops). When this is > 0 the caller may +// advance the scan cursor; when it is 0 the deferred ids are left +// unstamped and the cursor must not advance. +// - safeAdvanceID: the highest scanned id the caller may advance the +// watermark past WITHOUT stranding an unstamped row. On a clean drain +// every id is resolved (stamped or message-specific drop) so this is +// the batch's max id. On a NON-4xx error return it is the highest +// CONTIGUOUSLY-stamped id reached before the failure — so the watermark +// does not jump past a later unstamped straggler that a transient fault +// left behind. The next RunOnce re-finds the straggler (id > +// safeAdvanceID) and retries it idempotently; the failure cap still +// bounds repeated transient failures. (On the all-drop 4xx return it +// covers only leading skip-marked ids, and is 0 when there are none.) +// - err: nil on a clean drain; ErrPermanent4xx (wrapped) when every +// singleton 4xx'd with no embeds (deferred ids left unstamped); +// ErrGenerationRetired (wrapped) when the generation was retired +// mid-drain (benign); or any other error (transient-after-retries, +// upsert, stamp) that should fail the run.
func (w *Worker) downshiftDrain( ctx context.Context, gen vector.GenerationID, - token string, ids []int64, res *RunResult, completedRows *int, -) (embedded int, dropped int, err error) { +) (embedded int, embeddedOK int, stamped int, safeAdvanceID int64, err error) { var deferredDrops []int64 var lastDeferredErr error - // retiredObserved records that at least one singleton's Upsert reported - // the generation as retired. It is load-bearing for the end-of-drain - // decision (cr2-7): once a generation is retired, no future run will ever - // re-claim these rows (pickTarget never targets retired gens), so the - // endpoint-misconfig protection that Releases on embedded==0 is moot and - // would only orphan the deferred rows and trigger the re-embed/hard-abort - // loop. retiredDrainErr captures a Complete failure during a retired drop - // so RunOnce can surface it rather than report a false-clean run (cr2-6). - var retiredObserved bool - var retiredDrainErr error + // lm accumulates last_modified CAS tokens across the singleton fetches so + // the end-of-drain deferred-drop stamp can CAS rows whose content is + // unchanged. A deferred-drop id keeps the token captured before its + // embedder call failed (copied from the errored result below), so the + // end-of-drain stampSkipped call CAS-protects it like any other skip. + lm := make(map[int64]any, len(ids)) + // contiguousStampedID tracks the highest id with an unbroken + // stamped-from-the-start prefix. The first time an id is left unresolved + // (a deferred 4xx, or a CAS-missed stamp) brokeContiguity latches + // and we stop advancing it — everything from that id on is unsafe to + // skip past.
+ var contiguousStampedID int64 + brokeContiguity := false - for i, id := range ids { + for _, id := range ids { select { case <-ctx.Done(): - return embedded, dropped, ctx.Err() + return embedded, embeddedOK, stamped, contiguousStampedID, ctx.Err() default: } @@ -831,149 +874,275 @@ func (w *Worker) downshiftDrain( eb, e := w.embedBatch(ctx, []int64{id}) if e != nil { if errors.Is(e, ErrPermanent4xx) { - // Defer the drop decision. See function-level - // comment for the endpoint-vs-message distinction. + maps.Copy(lm, eb.lastModified) + // Defer the drop decision. See function-level comment. A + // deferred id breaks the contiguous-stamped prefix: even if it + // is stamped at end-of-drain, the watermark must not skip past + // it on an error return. deferredDrops = append(deferredDrops, id) lastDeferredErr = e + brokeContiguity = true continue } - w.releaseDownshiftRemainder(ctx, gen, token, append(append([]int64(nil), deferredDrops...), ids[i:]...)) - return embedded, dropped, e + // Non-4xx (transient) error: this id is left UNSTAMPED. Return the + // contiguous-stamped id so the caller does not advance the + // watermark past it; the next RunOnce re-finds it. + return embedded, embeddedOK, stamped, contiguousStampedID, e } + // Carry forward the CAS token for this fetched id. + maps.Copy(lm, eb.lastModified) if len(eb.chunks) == 0 { - drop := append(append([]int64(nil), eb.missing...), eb.empty...) - if len(drop) > 0 { - if cerr := w.q.Complete(ctx, gen, token, drop); cerr != nil { - res.Failed += len(drop) - w.releaseDownshiftRemainder(ctx, gen, token, append(append([]int64(nil), deferredDrops...), ids[i:]...)) - return embedded, dropped, fmt.Errorf("complete drop: %w", cerr) + // Missing/empty singleton — skip-mark it. + skip := append(append([]int64(nil), eb.missing...), eb.empty...) + // stampedThisID reports whether this singleton's skip-mark actually + // landed. 
Default true so the len(skip)==0 sub-case (nothing to + // skip-mark — should not normally happen here, but guard it) does + // NOT break contiguity: there is no unstamped row to strand. + stampedThisID := true + if len(skip) > 0 { + missed, serr := w.stampSkipped(ctx, gen, skip, eb.lastModified) + if serr != nil { + res.Failed += len(skip) + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("stamp skip: %w", serr) } - dropped += len(drop) - *completedRows += len(drop) - w.reportProgress(*completedRows, len(drop), 0, time.Since(batchStart)) + w.logCASMisses(gen, missed) + stampedSkip := len(skip) - len(missed) + stamped += stampedSkip + *completedRows += stampedSkip + w.reportProgress(*completedRows, stampedSkip, 0, time.Since(batchStart)) + // An empty singleton's skip-mark goes through the optimistic CAS + // (its last_modified token is captured at read time) and CAN miss + // when a concurrent edit moved last_modified — e.g. an empty + // message that just got real content via repair. A CAS miss leaves + // the row UNSTAMPED, so it must not be skipped past. + stampedThisID = stampedSkip > 0 + } + // Advance the contiguous-stamped prefix only when this singleton was + // ACTUALLY stamped. A CAS-missed skip-mark (stampedThisID == false) is + // left unstamped and recovered by the backstop, so the watermark must + // not skip past it. Once a CAS miss breaks the prefix, latch + // brokeContiguity so a later stamped id cannot re-extend it over the + // unstamped gap. + if stampedThisID { + if !brokeContiguity { + contiguousStampedID = id + } + } else { + brokeContiguity = true } continue } if uerr := w.deps.Backend.Upsert(ctx, gen, eb.chunks); uerr != nil { if errors.Is(uerr, vector.ErrGenerationRetired) { - // Benign per the ErrGenerationRetired contract: the - // generation was retired mid-drain. 
Token-aware DROP this - // singleton's row and continue the drain rather than - // wrapping into a non-4xx error (which RunOnce would treat - // as a hard abort). The remaining claimed rows will also - // observe the retired state and drop the same way, so the - // drain finishes cleanly and RunOnce returns nil. Record the - // retirement so the end-of-drain decision drops any deferred - // 4xx rows instead of Releasing them (cr2-7). - retiredObserved = true - w.deps.Log.Info("embed: generation retired mid-drain; dropping singleton", - "gen", gen, "id", id) - if cerr := w.q.Complete(ctx, gen, token, []int64{id}); cerr != nil { - // Surface the Complete failure rather than swallowing it - // (cr2-6): the row stays claimed and RunOnce must not - // report a clean run. - w.deps.Log.Error("complete drop after retired generation (drain)", "error", cerr, - "gen", gen, "id", id) - retiredDrainErr = cerr - } - continue + // Generation retired mid-drain. Stop draining and surface the + // benign sentinel; remaining singletons would observe the same + // state. Do not stamp (the gen is going away). + w.deps.Log.Info("embed: generation retired mid-drain; stopping", "gen", gen, "id", id) + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("upsert: %w", uerr) } - w.releaseDownshiftRemainder(ctx, gen, token, append(append([]int64(nil), deferredDrops...), ids[i:]...)) - return embedded, dropped, fmt.Errorf("upsert: %w", uerr) + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("upsert: %w", uerr) } - if cerr := w.q.Complete(ctx, gen, token, eb.embeddedIDs); cerr != nil { - w.releaseDownshiftRemainder(ctx, gen, token, append(append([]int64(nil), deferredDrops...), ids[i:]...)) - return embedded, dropped, fmt.Errorf("complete: %w", cerr) + // The endpoint demonstrably embedded + upserted this singleton. 
Count it + // toward endpoint health NOW, before the CAS stamp result — a stamp that + // later misses (a concurrent edit moved last_modified) does not mean the + // endpoint failed, so it must not be able to misclassify this drain as an + // endpoint-wide all-drop. + embeddedOK++ + missed, serr := w.stampCovered(ctx, gen, eb.embeddedIDs, eb.lastModified) + if serr != nil { + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("stamp embed_gen: %w", serr) } + w.logCASMisses(gen, missed) + // A CAS miss on this singleton means its content changed since the + // read: it was not stamped and the backstop will recover it, so it does + // not count as embedded/stamped here. + stampedHere := len(eb.embeddedIDs) - len(missed) res.Truncated += eb.truncated - embedded += len(eb.embeddedIDs) - *completedRows += len(eb.embeddedIDs) + embedded += stampedHere + stamped += stampedHere + *completedRows += stampedHere + // Advance the contiguous-stamped prefix only when this singleton was + // ACTUALLY stamped. A CAS-missed singleton (stampedHere == 0) is left + // unstamped and recovered by the backstop, so the watermark must not skip + // past it. Once a CAS miss breaks the prefix, latch brokeContiguity so a + // later stamped id cannot re-extend it over the unstamped gap. + if stampedHere > 0 { + if !brokeContiguity { + contiguousStampedID = id + } + } else { + brokeContiguity = true + } batchChars := 0 for _, c := range eb.chunks { batchChars += c.SourceCharLen } - w.reportProgress(*completedRows, len(eb.embeddedIDs), batchChars, time.Since(batchStart)) + w.reportProgress(*completedRows, stampedHere, batchChars, time.Since(batchStart)) } - // Drain finished. Decide deferred-drop fate. + // Drain finished cleanly: every id is resolved (stamped, skip-marked, or + // a deferred 4xx about to be stamped below), so the whole scanned batch + // is safe to advance past. + safeAdvanceID = ids[len(ids)-1] + + // Decide deferred-drop fate. 
if len(deferredDrops) == 0 { - // No deferred 4xx rows. Surface a retired-drop Complete failure if one - // occurred (cr2-6); otherwise the drain is clean. - if retiredDrainErr != nil { - return embedded, dropped, fmt.Errorf("complete drop after retired generation: %w", retiredDrainErr) - } - return embedded, dropped, nil + return embedded, embeddedOK, stamped, safeAdvanceID, nil } - // A retirement was observed: the generation is gone, so re-claiming is - // impossible (pickTarget never targets retired gens). Releasing the - // deferred 4xx rows would orphan them forever and trigger the wasteful - // re-embed/hard-abort loop. Token-DROP them instead and return nil - // (benign) — UNLESS a retired-drop Complete already failed, in which case - // we surface that so RunOnce reports the stuck rows (cr2-6/cr2-7). This - // check MUST precede the embedded==0 all-drop Release path below; the - // retiredObserved flag (not embedded==0 alone) is what distinguishes a - // retired generation from a misconfigured endpoint, preserving the - // silent-delete-on-misconfig guard. - if retiredObserved { + if embeddedOK > 0 { + // The endpoint demonstrably embedded something this drain, so the 4xxs + // are message-specific (oversize input, malformed input, etc.) — NOT an + // endpoint-wide outage. Key on embeddedOK (successful embed+upsert) and + // not `embedded` (successful CAS stamp): a singleton that embedded but + // CAS-missed its stamp still proves the endpoint is healthy, and must + // not let a genuine 4xx sibling be misclassified as an all-drop. + // Stamp the deferred 4xxs so they drop out of future scans. 
for _, id := range deferredDrops { - w.deps.Log.Warn("dropping deferred 4xx pending message; generation retired", + w.deps.Log.Warn("stamping (dropping) message after singleton 4xx", "gen", gen, "id", id, "error", lastDeferredErr) } dropStart := time.Now() - if cerr := w.q.Complete(ctx, gen, token, deferredDrops); cerr != nil { + // Deferred drops carry the last_modified token captured before the + // singleton embedder call failed, so stampSkipped CAS-protects them and + // deletes stale vectors only for rows whose drop stamp actually landed. + missed, serr := w.stampSkipped(ctx, gen, deferredDrops, lm) + if serr != nil { res.Failed += len(deferredDrops) - return embedded, dropped, fmt.Errorf("complete drop after retired generation: %w", cerr) + // Some deferred drops are now unstamped; do not advance past the + // contiguous-stamped prefix. + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("stamp drop: %w", serr) } - dropped += len(deferredDrops) - *completedRows += len(deferredDrops) - w.reportProgress(*completedRows, len(deferredDrops), 0, time.Since(dropStart)) - if retiredDrainErr != nil { - return embedded, dropped, fmt.Errorf("complete drop after retired generation: %w", retiredDrainErr) - } - return embedded, dropped, nil + w.logCASMisses(gen, missed) + stampedDrops := len(deferredDrops) - len(missed) + stamped += stampedDrops + *completedRows += stampedDrops + w.reportProgress(*completedRows, stampedDrops, 0, time.Since(dropStart)) + // All deferred drops are now stamped; the entire batch is resolved. + return embedded, embeddedOK, stamped, safeAdvanceID, nil } - if embedded > 0 { - // Endpoint works for some messages, so the 4xxs are - // message-specific (oversize input, malformed input, etc.). - // Drop them. - for _, id := range deferredDrops { - w.deps.Log.Warn("dropping pending message after singleton 4xx", - "gen", gen, "id", id, "error", lastDeferredErr) + // embeddedOK == 0. 
The endpoint embedded nothing this drain, so we can't + // distinguish an endpoint-wide failure from a batch where every message + // just happened to be unembeddable. Leave the deferred ids UNSTAMPED so a + // misconfigured endpoint does not silently drop work, and return the + // wrapped 4xx so the caller surfaces it. The unstamped ids are re-found on + // the next scan; if the underlying problem persists, the + // consecutive-failure cap eventually trips with the same 4xx body. Advance + // only past the contiguous stamped prefix (the leading missing/empty skips, + // if any) so the unstamped deferred ids are not skipped. + return embedded, embeddedOK, stamped, contiguousStampedID, fmt.Errorf("downshift all-drop: every singleton returned non-retryable 4xx (left %d row(s) unstamped): %w", + len(deferredDrops), lastDeferredErr) +} + +// stampCovered stamps embed_gen=gen for ids, choosing per id between an +// optimistic-CAS stamp and an unconditional one based on whether a +// last_modified token was captured for that id at content-read time: +// +// - ids present in lm (fetched: embedded or empty-after-preprocess) are +// CAS-stamped via SetEmbedGenIfUnchanged — if a concurrent content edit +// bumped last_modified between read and now, the stamp matches 0 rows and +// the row stays "needs embedding" for the next scan (the repair-race fix). +// - ids absent from lm (missing: no row in messages) are stamped +// unconditionally via SetEmbedGen — there is no content and no token to +// guard, and they must drop out of the scan so it can advance. +// +// Returns the CAS-MISS ids: rows whose optimistic-CAS UPDATE matched 0 rows +// because last_modified moved between read and stamp (a concurrent repair/edit). +// They were NOT stamped; the caller logs them and excludes them from success +// accounting. The watermark still advances (the caller's job) and the backstop +// recovers them. 
A missed CAS is NOT an error — only a real driver failure on +// either path returns err, for the caller's consecutive-failure accounting. +func (w *Worker) stampCovered(ctx context.Context, gen vector.GenerationID, ids []int64, lm map[int64]any) (missed []int64, err error) { + var cas []store.EmbedGenStamp + var plain []int64 + for _, id := range ids { + if tok, ok := lm[id]; ok { + cas = append(cas, store.EmbedGenStamp{ID: id, LastModified: tok}) + } else { + plain = append(plain, id) } - dropStart := time.Now() - if cerr := w.q.Complete(ctx, gen, token, deferredDrops); cerr != nil { - res.Failed += len(deferredDrops) - return embedded, dropped, fmt.Errorf("complete drop: %w", cerr) + } + if len(cas) > 0 { + m, err := w.deps.Store.SetEmbedGenIfUnchanged(ctx, cas, int64(gen)) + if err != nil { + return missed, err } - dropped += len(deferredDrops) - *completedRows += len(deferredDrops) - w.reportProgress(*completedRows, len(deferredDrops), 0, time.Since(dropStart)) - return embedded, dropped, nil + missed = append(missed, m...) } - // embedded == 0. We can't distinguish endpoint-wide failure from a - // batch where every message just happened to be unembeddable. - // Release the deferred IDs (rather than Completing them) so a - // misconfigured endpoint does not silently destroy work, and - // return the wrapped 4xx so the caller surfaces it. The released - // IDs go back to the pending queue and will be re-claimed; if the - // underlying problem persists, the consecutive-failure cap will - // eventually trip with the same 4xx body in lastErr. 
- if rerr := w.q.Release(ctx, gen, token, deferredDrops); rerr != nil { - w.deps.Log.Error("release after all-drop drain", "error", rerr, - "gen", gen, "ids", len(deferredDrops)) + if len(plain) > 0 { + if err := w.deps.Store.SetEmbedGen(ctx, plain, int64(gen)); err != nil { + return missed, err + } } - return embedded, dropped, fmt.Errorf("downshift all-drop: every singleton returned non-retryable 4xx (released %d row(s) back to queue): %w", - len(deferredDrops), lastDeferredErr) + return missed, nil } -func (w *Worker) releaseDownshiftRemainder(ctx context.Context, gen vector.GenerationID, token string, ids []int64) { +func (w *Worker) stampSkipped(ctx context.Context, gen vector.GenerationID, ids []int64, lm map[int64]any) (missed []int64, err error) { if len(ids) == 0 { - return + return nil, nil + } + if w.deps.beforeSkipStamp != nil { + w.deps.beforeSkipStamp(ctx, ids) } - if rerr := w.q.Release(ctx, gen, token, ids); rerr != nil { - w.deps.Log.Error("release after downshift interruption", "error", rerr, - "gen", gen, "ids", len(ids)) + + tx, err := w.deps.MainDB.BeginTx(ctx, nil) + if err != nil { + return nil, fmt.Errorf("begin skip stamp tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + deleteIDs := make([]int64, 0, len(ids)) + for _, id := range ids { + if tok, ok := lm[id]; ok { + res, err := tx.ExecContext(ctx, + w.rebind(`UPDATE messages SET embed_gen = ? WHERE id = ? AND last_modified = ?`), + int64(gen), id, tok) + if err != nil { + return nil, fmt.Errorf("set skipped embed_gen if unchanged (id=%d): %w", id, err) + } + n, err := res.RowsAffected() + if err != nil { + return nil, fmt.Errorf("rows affected for skipped stamp (id=%d): %w", id, err) + } + if n == 0 { + missed = append(missed, id) + continue + } + deleteIDs = append(deleteIDs, id) + continue + } + + if _, err := tx.ExecContext(ctx, + w.rebind(`UPDATE messages SET embed_gen = ? 
WHERE id = ?`), + int64(gen), id); err != nil { + return nil, fmt.Errorf("set skipped embed_gen (id=%d): %w", id, err) + } + // Missing rows have no message row to stamp, but stale vector rows for + // that id/generation should still be removed. + deleteIDs = append(deleteIDs, id) + } + + if err := w.deps.Backend.Delete(ctx, gen, deleteIDs); err != nil { + return nil, fmt.Errorf("delete stale skipped embeddings: %w", err) } + if err := tx.Commit(); err != nil { + return nil, fmt.Errorf("commit skip stamp tx: %w", err) + } + return missed, nil +} + +// logCASMisses records the CAS-missed ids returned by stampCovered or stampSkipped. A miss +// means last_modified moved between the worker's content read and the stamp (a +// concurrent repair/edit), so the row was not stamped. These rows are NOT lost: +// their last_modified moved (embed_gen may be NULL), so the auto-backstop's +// watermark-ignoring scan re-finds and re-embeds them with the corrected +// content. The watermark is deliberately NOT held back (that would +// head-of-line-block the drain); the backstop is the recovery mechanism. +func (w *Worker) logCASMisses(gen vector.GenerationID, missed []int64) { + if len(missed) == 0 { + return + } + w.deps.Log.Info("embed: embed_gen CAS misses (concurrent edit); will be recovered by backstop", + "gen", gen, "count", len(missed), "ids", missed) } func (w *Worker) reportProgress(done, batchMsgs, batchChars int, batchElapsed time.Duration) { @@ -990,6 +1159,27 @@ func (w *Worker) reportProgress(done, batchMsgs, batchChars int, batchElapsed ti }) } +// countMembers returns how many ids in `set` also appear in `subset`. Used to +// count how many of a batch's stamped ids were actually CAS misses (the missed +// slice is a subset of the ids passed to stampCovered) so the worker can net +// them out of its success/progress accounting.
+func countMembers(set, subset []int64) int { + if len(set) == 0 || len(subset) == 0 { + return 0 + } + want := make(map[int64]struct{}, len(subset)) + for _, id := range subset { + want[id] = struct{}{} + } + n := 0 + for _, id := range set { + if _, ok := want[id]; ok { + n++ + } + } + return n +} + // totalPieceChars sums the rune counts of every chunk in the batch, for // debug logging — distinct from totalChars because a long message // contributes one msgText row but several inputChunk rows. diff --git a/internal/vector/embed/worker_cas_test.go b/internal/vector/embed/worker_cas_test.go new file mode 100644 index 000000000..2ad363f1c --- /dev/null +++ b/internal/vector/embed/worker_cas_test.go @@ -0,0 +1,459 @@ +//go:build sqlite_vec + +package embed + +import ( + "bytes" + "context" + "database/sql" + "fmt" + "log/slog" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/store" +) + +// stamps builds a single-item EmbedGenStamp slice for a CAS stamp call. +func stamps(id int64, lastModified any) []store.EmbedGenStamp { + return []store.EmbedGenStamp{{ID: id, LastModified: lastModified}} +} + +// lmOf reads a message's last_modified as the literal stored text (CAST AS +// TEXT defeats go-sqlite3's DATETIME coercion, matching the worker). +func lmOf(t *testing.T, db *sql.DB, id int64) string { + t.Helper() + var s string + require.NoError(t, db.QueryRow( + `SELECT CAST(last_modified AS TEXT) FROM messages WHERE id = ?`, id).Scan(&s)) + return s +} + +// setBaselineLM pins last_modified to a fixed far-past value so a subsequent +// trigger-driven bump is guaranteed to differ (sidesteps SQLite's 1-second +// timestamp resolution). The explicit write is preserved by the trigger's +// WHEN guard (OLD != NEW), not re-bumped. 
+func setBaselineLM(t *testing.T, db *sql.DB, id int64) string { + t.Helper() + _, err := db.Exec( + `UPDATE messages SET last_modified = '2000-01-01 00:00:00' WHERE id = ?`, id) + require.NoError(t, err, "baseline last_modified") + return lmOf(t, db, id) +} + +// TestWorker_CASRepairRace is the core regression for Codex 129d #1: a +// concurrent content edit (repair-encoding) that lands BETWEEN the worker +// reading a message's content and stamping embed_gen must NOT leave the row +// marked embedded-with-stale-content. The optimistic CAS on last_modified +// catches the change and leaves the row "needs embedding". +func TestWorker_CASRepairRace(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 1) + + // Baseline last_modified to a fixed past value (= the token T the worker + // will capture at read time). + tokenAtRead := setBaselineLM(t, f.MainDB, 1) + + // Inject the race: when the embedder is called (after the worker scanned + + // fetched content and captured last_modified = T, before it stamps), + // simulate repair-encoding rewriting the body. The body UPDATE fires the + // trigger, bumping last_modified to T2 (!= T); repair-encoding also resets + // embed_gen -> NULL. + f.FakeClient.preReturn = func() { + _, err := f.MainDB.Exec( + `UPDATE message_bodies SET body_text = 'corrected content' WHERE message_id = 1`) + require.NoError(t, err, "race: rewrite body") + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(t, err, "race: reset embed_gen") + } + + w := newTestWorker(f, 1) + res, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(t, err, "RunOnce") + + // The CAS stamp targeted WHERE last_modified = T, but the row is now T2, + // so 0 rows were stamped: embed_gen is still NULL and the row still needs + // embedding. (Without the CAS, the unconditional stamp would have marked + // it covered with the STALE pre-repair content — proven below.) 
+ _, isNull := embedGenOf(t, f.MainDB, 1) + assert.True(t, isNull, "raced row must NOT be stamped (embed_gen still NULL)") + assert.Equal(t, 1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "raced row still needs embedding") + + // A CAS miss is NOT counted as a success. + assert.Equal(t, 0, res.Succeeded, "CAS-missed row not counted in Succeeded") + + // The watermark still advances to batchMax (the single scanned id) — the + // drain does not stick on the missed row; the backstop is the recovery. + assert.Equal(t, int64(1), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), + "watermark advances past the CAS-missed row") + + // Confirm last_modified actually moved (the race really happened). + assert.NotEqual(t, tokenAtRead, lmOf(t, f.MainDB, 1), "last_modified bumped by race") + + // Recovery: clear the preReturn race, then a backstop pass (scans from 0, + // ignoring the watermark) re-embeds the row with the corrected content. + f.FakeClient.preReturn = nil + res, err = w.RunBackstop(ctx, f.BuildingGen) + require.NoError(t, err, "RunBackstop recovery") + assert.Equal(t, 1, res.Succeeded, "raced row re-embedded on recovery") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "coverage complete after recovery") +} + +// TestWorker_CASRepairRace_OldCodeWouldFail proves the OLD behavior was +// buggy: an UNCONDITIONAL stamp (the pre-fix Store.SetEmbedGen) applied after +// the same race marks the row covered-with-stale-content — exactly the defect +// the CAS fix removes. +func TestWorker_CASRepairRace_OldCodeWouldFail(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 1) + setBaselineLM(t, f.MainDB, 1) + + // Simulate the worker having read content (token captured), then the race + // edit landing (body rewrite bumps last_modified; embed_gen reset to NULL). 
+ _, err := f.MainDB.Exec( + `UPDATE message_bodies SET body_text = 'corrected content' WHERE message_id = 1`) + require.NoError(t, err, "race: rewrite body") + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(t, err, "race: reset embed_gen") + + // OLD path: unconditional stamp ignores last_modified and WRONGLY marks + // the row covered despite the post-read content change. + require.NoError(t, f.Store.SetEmbedGen(ctx, []int64{1}, int64(f.BuildingGen)), + "old unconditional stamp") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "OLD code: row wrongly marked covered (the bug)") + + // NEW path: a CAS stamp with the STALE token (captured before the race) + // does NOT mark it covered — the desired behavior. + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(t, err, "reset for CAS check") + staleToken := "2000-01-01 00:00:00" + missed, err := f.Store.SetEmbedGenIfUnchanged(ctx, + stamps(1, staleToken), int64(f.BuildingGen)) + require.NoError(t, err, "CAS with stale token") + assert.Equal(t, []int64{1}, missed, "stale-token CAS returns the missed id") + assert.Equal(t, 1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "NEW code: CAS with stale token leaves row needing embedding") +} + +// TestWorker_CASNormalPath verifies the happy path: when last_modified is +// unchanged between read and stamp, the CAS stamp succeeds and embed_gen is +// set — and the trigger bumping last_modified as a side effect of the stamp's +// own UPDATE does not break it (the WHERE matches the pre-trigger value). 
+func TestWorker_CASNormalPath(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 3) + + w := newTestWorker(f, 3) + res, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(t, err, "RunOnce") + assert.Equal(t, 3, res.Succeeded, "all embedded") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "all stamped via CAS normal path") + + // Every row is stamped to the building gen. + for id := int64(1); id <= 3; id++ { + v, isNull := embedGenOf(t, f.MainDB, id) + assert.False(t, isNull, "msg %d stamped", id) + assert.Equal(t, int64(f.BuildingGen), v, "msg %d embed_gen", id) + } +} + +// TestWorker_CASMissAccounting is the focused accounting regression for the +// "surface CAS misses" change: in a batch where ONE row is raced (its +// last_modified moves between read and stamp) and the others are not, the +// worker must (a) NOT count the missed row in Succeeded, (b) LOG the missed id, +// (c) still ADVANCE the watermark to batchMax (no head-of-line block), and (d) +// recover the missed row on a subsequent RunBackstop. +func TestWorker_CASMissAccounting(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 2) + + // Pin both rows' last_modified to a fixed far-past token (= what the worker + // captures at read time) so the raced bump is guaranteed to differ. + setBaselineLM(t, f.MainDB, 1) + setBaselineLM(t, f.MainDB, 2) + + // Inject the race for ONLY message 1: after the worker scanned + read both + // rows' content (capturing last_modified), rewrite msg 1's body (bumps its + // last_modified via trigger; CAS for id 1 will miss) and reset its + // embed_gen. Message 2 is untouched and stamps normally. 
+ f.FakeClient.preReturn = func() { + _, err := f.MainDB.Exec( + `UPDATE message_bodies SET body_text = 'corrected content' WHERE message_id = 1`) + require.NoError(t, err, "race: rewrite body of msg 1") + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(t, err, "race: reset embed_gen of msg 1") + } + + var logbuf bytes.Buffer + logger := slog.New(slog.NewTextHandler(&logbuf, &slog.HandlerOptions{Level: slog.LevelDebug})) + // Batch size 2 so both ids are read and stamped in one batch (one CAS miss, + // one success). + w := NewWorker(WorkerDeps{ + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + BatchSize: 2, + Log: logger, + }) + res, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(t, err, "RunOnce") + + // (a) Only the non-raced row counts as succeeded; the CAS miss does not. + assert.Equal(t, 1, res.Succeeded, "only the non-raced row counts as Succeeded") + + // (b) The missed id is logged. + logs := logbuf.String() + assert.Contains(t, logs, "embed_gen CAS misses", "CAS miss is logged") + assert.Contains(t, logs, "count=1", "logs the miss count") + + // (c) The watermark advanced to batchMax (id 2) despite the miss — the + // drain does not stick on the missed row. + assert.Equal(t, int64(2), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), + "watermark advances to batchMax despite the CAS miss") + + // The raced row (1) is still missing; the clean row (2) is covered. 
+ _, isNull := embedGenOf(t, f.MainDB, 1) + assert.True(t, isNull, "raced row 1 still needs embedding") + v2, isNull2 := embedGenOf(t, f.MainDB, 2) + assert.False(t, isNull2, "clean row 2 stamped") + assert.Equal(t, int64(f.BuildingGen), v2, "row 2 embed_gen") + assert.Equal(t, 1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "exactly the raced row remains") + + // (d) A backstop pass (scans from 0, ignoring the watermark) recovers the + // CAS-missed row with its corrected content. + f.FakeClient.preReturn = nil + bres, err := w.RunBackstop(ctx, f.BuildingGen) + require.NoError(t, err, "RunBackstop recovery") + assert.Equal(t, 1, bres.Succeeded, "backstop re-embeds the CAS-missed row") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "coverage complete after backstop") +} + +// TestWorker_CASSelfBumpDoesNotBlockStamp pins the self-bump invariant: the +// stamp UPDATE itself fires the AFTER-UPDATE trigger and bumps last_modified, +// but because the WHERE compares the PRE-trigger value the stamp still +// matches its row. Verified directly against the store CAS method. +func TestWorker_CASSelfBumpDoesNotBlockStamp(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 1) + token := setBaselineLM(t, f.MainDB, 1) + + missed, err := f.Store.SetEmbedGenIfUnchanged(ctx, + stamps(1, token), int64(f.BuildingGen)) + require.NoError(t, err, "CAS stamp") + assert.Empty(t, missed, "self-bump stamp succeeds (no CAS miss)") + + v, isNull := embedGenOf(t, f.MainDB, 1) + require.False(t, isNull, "row stamped despite self-bump") + assert.Equal(t, int64(f.BuildingGen), v, "embed_gen set") + // The stamp's own UPDATE bumped last_modified off the baseline. + assert.NotEqual(t, token, lmOf(t, f.MainDB, 1), "self-bump moved last_modified") +} + +// TestWorker_Downshift_EmptySkipCASMissNotSkippedPastWatermark is the +// fail-on-regression for the empty/skip-mark contiguity bug (Codex 129h +// follow-up). 
Within a singleton drain, an EMPTY singleton (id 1) CAS-MISSES +// its skip-mark (a concurrent edit moved last_modified between the worker's +// content read and the stamp), and a later sibling (id 2) returns a genuine 4xx +// while NOTHING embeds — so the drain takes the all-drop error/return path and +// the caller advances the watermark to the drain's safeAdvanceID +// (contiguousStampedID). +// +// PRE-FIX: the empty/skip branch advanced contiguousStampedID to the empty +// singleton's id gated ONLY on !brokeContiguity — it ignored whether the +// skip-mark actually stamped. A CAS-missed (unstamped) empty singleton therefore +// extended the contiguous-stamped prefix, so the error-path safeAdvanceID +// skipped PAST it: the watermark advanced to id 1, and a subsequent NORMAL +// RunOnce (id > watermark) no longer re-found the unstamped row — only the +// backstop's full scan from 0 could recover it (backstop-only recovery). +// +// POST-FIX: the branch mirrors the embed branch — it advances the prefix only +// when the skip-mark ACTUALLY stamped, else latches brokeContiguity. The +// CAS-missed empty singleton breaks the prefix, so safeAdvanceID stays below it; +// the watermark does not skip past it and a normal RunOnce re-finds it. +func TestWorker_Downshift_EmptySkipCASMissNotSkippedPastWatermark(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 2) + + // Make msg 1 EMPTY (no subject, blank body) so embedBatch reports it in + // `empty` and the singleton drain takes the len(eb.chunks)==0 skip branch. 
+ _, err := f.MainDB.Exec(`UPDATE messages SET subject = NULL WHERE id = 1`) + require.NoError(t, err, "null subject of msg 1") + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = '' WHERE message_id = 1`) + require.NoError(t, err, "blank body of msg 1") + + // Force the downshift, then a genuine 4xx for msg 2 with NOTHING embedded: + // - the whole-batch embedBatch call (msg 2's chunk; msg 1 is empty) 4xxs; + // - singleton msg 1 is empty → no Embed call → skip branch (CAS misses); + // - singleton msg 2 4xxs → deferred; embeddedOK stays 0 → all-drop return. + f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { + return nil, fmt.Errorf("embed: HTTP 400: blocked content: %w", ErrPermanent4xx) + } + missOnce := true + + w := NewWorker(WorkerDeps{ + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + BatchSize: 2, + MaxConsecutiveFailures: 1, // abort after the single all-drop failure (no busy re-scan) + beforeSkipStamp: func(ctx context.Context, ids []int64) { + if !missOnce { + return + } + for _, id := range ids { + if id != 1 { + continue + } + missOnce = false + _, err := f.MainDB.ExecContext(ctx, + `UPDATE messages SET last_modified = '2099-01-01 00:00:00' WHERE id = ?`, id) + require.NoError(t, err, "force skip CAS miss") + return + } + }, + }) + + // The drain is an all-drop (embeddedOK==0): RunOnce returns the wrapped + // ErrPermanent4xx without advancing past the unstamped rows. + _, err = w.RunOnce(ctx, f.BuildingGen) + require.Error(t, err, "all-drop drain surfaces an error") + + // THE INVARIANT: the watermark must NOT skip past the CAS-missed empty + // singleton (id 1). Pre-fix it advanced to 1 (stranding the row); post-fix + // it stays below 1 so a normal scan (id > watermark) re-finds it. 
+ assert.Less(t, readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), int64(1), + "watermark must not skip past the CAS-missed empty singleton") + + // The empty singleton was NOT stamped (its skip-mark CAS-missed) and so + // still needs embedding — recoverable. + _, isNull1 := embedGenOf(t, f.MainDB, 1) + assert.True(t, isNull1, "CAS-missed empty singleton (msg 1) left unstamped") + + // Concrete proof of re-discovery by a NORMAL (non-backstop) scan: with the + // 4xx cleared and the race no longer firing, a plain RunOnce re-finds msg 1 + // (id > watermark) and skip-marks it. This is the behavior the bug broke — + // pre-fix the watermark sat at 1 and a normal RunOnce scanned id > 1 only, + // so msg 1 was reachable solely via the backstop. + f.FakeClient.OnEmbed = nil + _, err = w.RunOnce(ctx, f.BuildingGen) + require.NoError(t, err, "follow-up normal RunOnce") + _, isNull1After := embedGenOf(t, f.MainDB, 1) + assert.False(t, isNull1After, "normal RunOnce re-found and skip-marked msg 1 (not backstop-only)") +} + +// TestWorker_Downshift_CASMissNotAllDrop is the fail-on-regression for the +// downshift all-drop misclassification (Codex 129h). Within a singleton drain, +// one message genuinely returns a permanent 4xx while ANOTHER embeds + upserts +// successfully but CAS-MISSES its stamp (a concurrent content edit bumped +// last_modified between the worker's read and its stamp). +// +// PRE-FIX: downshiftDrain counted only CAS-STAMPED singletons toward +// `embedded`, and classified endpoint health on `embedded > 0`. The +// embedded-but-CAS-missed singleton contributed 0 to `embedded`, so with a +// genuine-4xx sibling the drain saw embedded==0 and misclassified a HEALTHY +// endpoint as an endpoint-wide all-drop: it left the genuine 4xx UNSTAMPED and +// returned the wrapped ErrPermanent4xx. 
RunOnce then did NOT reset +// consecutiveFailures (it keyed on embedded>0) nor advance the cursor, so the +// next scan re-found the same batch, re-downshifted, and tripped the +// consecutive-failure cap — a SPURIOUS abort of an otherwise-fine endpoint. +// +// POST-FIX: the drain tracks embeddedOK (successful embed+upsert regardless of +// the CAS outcome) and classifies endpoint health on it, so the genuine 4xx is +// treated as a message-specific drop (stamped), the failure cap is reset, and +// RunOnce completes without aborting. The CAS-missed row stays recoverable +// (embed_gen still NULL, picked up by the backstop). +func TestWorker_Downshift_CASMissNotAllDrop(t *testing.T) { + ctx := context.Background() + f := newWorkerFixture(t, 2) + + // Pin msg 1's last_modified to a fixed far-past token so the mid-embed + // body rewrite is guaranteed to bump it to a different value (the CAS + // miss). msg 2 is the genuine 4xx; its last_modified is left untouched, + // so the CAS-protected drop stamp on the 4xx path still matches its row. + setBaselineLM(t, f.MainDB, 1) + + // Downshift orchestration: + // - the whole-batch call (len(inputs) > 1) 4xxs, forcing the downshift; + // - singleton msg 1 (text contains "body 1") embeds OK, but inside the + // embed call we rewrite its body — bumping last_modified via the + // trigger so the worker's subsequent CAS stamp MISSES; + // - singleton msg 2 (text contains "body 2") returns a genuine 4xx. + f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { + if len(inputs) > 1 { + return nil, fmt.Errorf("embed: HTTP 400: batch too long: %w", ErrPermanent4xx) + } + if strings.Contains(inputs[0], "body 2") { + return nil, fmt.Errorf("embed: HTTP 400: blocked content: %w", ErrPermanent4xx) + } + // msg 1: race a content edit in BETWEEN the worker's read (which + // captured last_modified) and its stamp, so the CAS stamp misses.
+ _, err := f.MainDB.Exec( + `UPDATE message_bodies SET body_text = 'corrected content' WHERE message_id = 1`) + require.NoError(t, err, "race: rewrite body of msg 1") + v := make([]float32, f.FakeClient.dim) + v[0] = 1 + return [][]float32{v}, nil + } + + var logbuf bytes.Buffer + logger := slog.New(slog.NewTextHandler(&logbuf, &slog.HandlerOptions{Level: slog.LevelDebug})) + // MaxConsecutiveFailures=2 so the spurious abort would trip quickly under + // the pre-fix logic (the all-drop misclassification re-occurs every scan). + w := NewWorker(WorkerDeps{ + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + BatchSize: 2, + MaxConsecutiveFailures: 2, + Log: logger, + }) + + // (a)+(b): RunOnce must NOT abort — the genuine 4xx is a message-specific + // drop, not an endpoint-wide all-drop, and the failure cap is reset because + // the endpoint embedded something (embeddedOK > 0). + _, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(t, err, "RunOnce must not abort (healthy endpoint, not an all-drop)") + + // The genuine-4xx row (msg 2) was stamp-dropped, NOT left unstamped. + v2, isNull2 := embedGenOf(t, f.MainDB, 2) + assert.False(t, isNull2, "genuine 4xx row (msg 2) stamp-dropped (message-specific)") + assert.Equal(t, int64(f.BuildingGen), v2, "msg 2 embed_gen = target") + + // (c): the CAS-missed row (msg 1) is NOT stamped — it remains recoverable + // (embed_gen still NULL), to be picked up by the backstop. The drain must + // not have stranded it as "covered". + _, isNull1 := embedGenOf(t, f.MainDB, 1) + assert.True(t, isNull1, "CAS-missed row (msg 1) left unstamped (recoverable)") + + // Exactly the CAS-missed row remains needing embedding. + assert.Equal(t, 1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "only the CAS-missed row remains") + + // The CAS miss was logged (proves the race really happened and was handled + // as a miss, not silently stamped). 
+ assert.Contains(t, logbuf.String(), "embed_gen CAS misses", "CAS miss logged") + + // Recovery: the backstop (full scan from 0, ignoring the watermark) + // re-embeds the CAS-missed row with its corrected content. + f.FakeClient.OnEmbed = nil + bres, err := w.RunBackstop(ctx, f.BuildingGen) + require.NoError(t, err, "RunBackstop recovery") + assert.Equal(t, 1, bres.Succeeded, "backstop re-embeds the CAS-missed row") + assert.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "coverage complete after backstop") +} diff --git a/internal/vector/embed/worker_pg_test.go b/internal/vector/embed/worker_pg_test.go index 2e9bfc84e..4cf5a01b0 100644 --- a/internal/vector/embed/worker_pg_test.go +++ b/internal/vector/embed/worker_pg_test.go @@ -21,8 +21,13 @@ import ( // pgFakeEmbeddingClient returns one deterministic, non-zero vector per // input. Defined locally because the sqlite_vec testsupport's -// fakeEmbeddingClient is behind a different build tag. -type pgFakeEmbeddingClient struct{ dim int } +// fakeEmbeddingClient is behind a different build tag. preReturn, if set, +// fires after inputs are received but before the vectors are returned — +// letting a test perturb DB state to simulate a read→stamp race. +type pgFakeEmbeddingClient struct { + dim int + preReturn func() +} func (c *pgFakeEmbeddingClient) Embed(_ context.Context, inputs []string) ([][]float32, error) { out := make([][]float32, len(inputs)) @@ -31,13 +36,97 @@ func (c *pgFakeEmbeddingClient) Embed(_ context.Context, inputs []string) ([][]f v[0] = float32(len(inputs[i])%c.dim + 1) out[i] = v } + if c.preReturn != nil { + c.preReturn() + } return out, nil } +// pgWorkStore is a minimal WorkStore over the PG test schema, mirroring +// store.ScanForEmbedding / store.SetEmbedGen with $N placeholders. 
+type pgWorkStore struct{ db *sql.DB } + +func (s *pgWorkStore) ScanForEmbedding(ctx context.Context, target int64, afterID int64, limit int) ([]int64, error) { + rows, err := s.db.QueryContext(ctx, + `SELECT id FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> $1) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL + AND id > $2 + ORDER BY id LIMIT $3`, target, afterID, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + var out []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, err + } + out = append(out, id) + } + return out, rows.Err() +} + +func (s *pgWorkStore) SetEmbedGen(ctx context.Context, ids []int64, target int64) error { + if len(ids) == 0 { + return nil + } + _, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = $1 WHERE id = ANY($2::bigint[])`, target, int64ArrayLiteral(ids)) + return err +} + +// SetEmbedGenIfUnchanged mirrors store.Store.SetEmbedGenIfUnchanged on the +// PG test schema: a per-row optimistic-CAS stamp gated on last_modified. +// Returns the ids whose UPDATE matched 0 rows (CAS misses). 
+func (s *pgWorkStore) SetEmbedGenIfUnchanged(ctx context.Context, items []store.EmbedGenStamp, target int64) (missed []int64, err error) { + for _, it := range items { + res, err := s.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = $1 WHERE id = $2 AND last_modified = $3`, + target, it.ID, it.LastModified) + if err != nil { + return missed, err + } + n, err := res.RowsAffected() + if err != nil { + return missed, err + } + if n == 0 { + missed = append(missed, it.ID) + } + } + return missed, nil +} + +func int64ArrayLiteral(ids []int64) string { + var sb strings.Builder + sb.WriteByte('{') + for i, id := range ids { + if i > 0 { + sb.WriteByte(',') + } + fmt.Fprintf(&sb, "%d", id) + } + sb.WriteByte('}') + return sb.String() +} + +func pgCountMissing(t *testing.T, db *sql.DB, gen int64) int { + t.Helper() + var n int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> $1) + AND deleted_at IS NULL AND deleted_from_source_at IS NULL`, gen).Scan(&n)) + return n +} + // openPGWorkerDB stands up a per-test schema on MSGVAULT_TEST_DB with the // minimal main-schema tables embedBatch reads (messages + message_bodies, -// including the deleted_* columns LiveMessagesWhere references) and seeds -// n live messages. It returns the *sql.DB; cleanup drops the schema. +// including embed_gen and the deleted_* columns LiveMessagesWhere +// references) and seeds n live messages. Returns the *sql.DB; cleanup +// drops the schema. 
func openPGWorkerDB(t *testing.T, n int) *sql.DB { t.Helper() url := os.Getenv("MSGVAULT_TEST_DB") @@ -80,13 +169,34 @@ func openPGWorkerDB(t *testing.T, n int) *sql.DB { id BIGINT PRIMARY KEY, subject TEXT, deleted_at TIMESTAMPTZ, - deleted_from_source_at TIMESTAMPTZ + deleted_from_source_at TIMESTAMPTZ, + embed_gen BIGINT, + last_modified TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE message_bodies ( message_id BIGINT PRIMARY KEY, body_text TEXT, body_html TEXT - );`) + ); + CREATE OR REPLACE FUNCTION set_messages_last_modified() RETURNS trigger AS $f$ + BEGIN + NEW.last_modified := CURRENT_TIMESTAMP; + RETURN NEW; + END; + $f$ LANGUAGE plpgsql; + CREATE TRIGGER trg_messages_last_modified + BEFORE UPDATE ON messages FOR EACH ROW + WHEN (OLD.last_modified IS NOT DISTINCT FROM NEW.last_modified) + EXECUTE FUNCTION set_messages_last_modified(); + CREATE OR REPLACE FUNCTION bump_message_last_modified() RETURNS trigger AS $f$ + BEGIN + UPDATE messages SET last_modified = CURRENT_TIMESTAMP WHERE id = NEW.message_id; + RETURN NEW; + END; + $f$ LANGUAGE plpgsql; + CREATE TRIGGER trg_message_bodies_last_modified + AFTER INSERT OR UPDATE ON message_bodies FOR EACH ROW + EXECUTE FUNCTION bump_message_last_modified();`) require.NoError(t, err, "create main schema") ctx := context.Background() @@ -101,12 +211,10 @@ func openPGWorkerDB(t *testing.T, n int) *sql.DB { return db } -// TestWorkerPG_RunOnce_EndToEnd drives the full embed BUILD pipeline -// against pgx: CreateGeneration seeds pending_embeddings from messages, -// then RunOnce claims, fetches bodies via embedBatch's IN(...) query, -// embeds, upserts, and completes. This exercises the $N-placeholder path -// in embedBatch — before the rebind fix it failed with pgx error 42601 -// ("syntax error at or near ','") because embedBatch emitted literal `?`. 
+// TestWorkerPG_RunOnce_EndToEnd drives the full scan-and-fill pipeline +// against pgx: the worker scans messages.embed_gen, fetches bodies via +// embedBatch's IN(...) query (rebound to $N), embeds, upserts, and stamps +// embed_gen. Coverage must reach zero. func TestWorkerPG_RunOnce_EndToEnd(t *testing.T) { ctx := context.Background() const n = 5 @@ -119,19 +227,18 @@ func TestWorkerPG_RunOnce_EndToEnd(t *testing.T) { gen, err := backend.CreateGeneration(ctx, "fake", 4, "") require.NoError(t, err, "CreateGeneration") - // Sanity: seeding put one pending row per live message. - var pending int - require.NoError(t, db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gen)).Scan(&pending)) - require.Equal(t, n, pending, "pending seeded from messages") + // Everything reads as missing before the run. + require.Equal(t, n, pgCountMissing(t, db, int64(gen)), "missing before run") worker := NewWorker(WorkerDeps{ - Backend: backend, - VectorsDB: db, - MainDB: db, - Client: &pgFakeEmbeddingClient{dim: 4}, - Rebind: (&store.PostgreSQLDialect{}).Rebind, - BatchSize: 2, // force multiple claim/embedBatch rounds + Backend: backend, + VectorsDB: db, + MainDB: db, + Store: &pgWorkStore{db: db}, + Client: &pgFakeEmbeddingClient{dim: 4}, + Rebind: (&store.PostgreSQLDialect{}).Rebind, + LastModifiedExpr: "m.last_modified", + BatchSize: 2, // force multiple scan/embedBatch rounds }) res, err := worker.RunOnce(ctx, gen) @@ -139,10 +246,8 @@ func TestWorkerPG_RunOnce_EndToEnd(t *testing.T) { assert.Equal(t, n, res.Succeeded, "all messages embedded") assert.Equal(t, 0, res.Failed, "no failures") - // Queue fully drained. - require.NoError(t, db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gen)).Scan(&pending)) - assert.Equal(t, 0, pending, "pending drained after RunOnce") + // Coverage complete after the run. 
+ assert.Equal(t, 0, pgCountMissing(t, db, int64(gen)), "missing after run") // Embeddings landed, one row per message. var embedded int @@ -151,11 +256,9 @@ func TestWorkerPG_RunOnce_EndToEnd(t *testing.T) { assert.Equal(t, n, embedded, "one embedding row per message") } -// TestWorkerPG_EmbedBatch_RebindsINClause targets embedBatch directly: -// it must rebind the WHERE id IN (...) placeholders to $N so the pgx -// driver accepts the query. A non-rebinding embedBatch returns a 42601 -// syntax error here; the assertion is simply that the fetch succeeds and -// returns the seeded messages. +// TestWorkerPG_EmbedBatch_RebindsINClause targets embedBatch directly: it +// must rebind the WHERE id IN (...) placeholders to $N so the pgx driver +// accepts the query. func TestWorkerPG_EmbedBatch_RebindsINClause(t *testing.T) { ctx := context.Background() db := openPGWorkerDB(t, 3) @@ -165,11 +268,13 @@ func TestWorkerPG_EmbedBatch_RebindsINClause(t *testing.T) { t.Cleanup(func() { _ = backend.Close() }) w := NewWorker(WorkerDeps{ - Backend: backend, - VectorsDB: db, - MainDB: db, - Client: &pgFakeEmbeddingClient{dim: 4}, - Rebind: (&store.PostgreSQLDialect{}).Rebind, + Backend: backend, + VectorsDB: db, + MainDB: db, + Store: &pgWorkStore{db: db}, + Client: &pgFakeEmbeddingClient{dim: 4}, + Rebind: (&store.PostgreSQLDialect{}).Rebind, + LastModifiedExpr: "m.last_modified", }) eb, err := w.embedBatch(ctx, []int64{1, 2, 3}) @@ -178,8 +283,169 @@ func TestWorkerPG_EmbedBatch_RebindsINClause(t *testing.T) { assert.Len(t, eb.chunks, 3, "one chunk per short message") assert.Empty(t, eb.missing, "no missing messages") assert.Empty(t, eb.empty, "no empty messages") - // Every chunk carries a non-zero vector of the generation's dim. for _, c := range eb.chunks { assert.Len(t, c.Vector, 4) } } + +// pgLMOf reads message id 1's last_modified as text on PG (the single +// seeded message these last_modified-trigger tests operate on). 
+func pgLMOf(t *testing.T, db *sql.DB) string { + t.Helper() + var s string + require.NoError(t, db.QueryRow( + `SELECT CAST(last_modified AS TEXT) FROM messages WHERE id = 1`).Scan(&s)) + return s +} + +// pgReadWatermark returns the persisted forward-scan watermark for gen on PG +// (0 if absent). +func pgReadWatermark(t *testing.T, db *sql.DB, gen int64) int64 { + t.Helper() + var id int64 + err := db.QueryRow( + `SELECT watermark_id FROM embed_watermark WHERE generation_id = $1`, gen).Scan(&id) + if err == sql.ErrNoRows { + return 0 + } + require.NoError(t, err, "pgReadWatermark") + return id +} + +// TestWorkerPG_TriggersBumpLastModified verifies the PG trigger pair: a +// message UPDATE and a message_bodies INSERT/UPDATE both move +// messages.last_modified. +func TestWorkerPG_TriggersBumpLastModified(t *testing.T) { + ctx := context.Background() + db := openPGWorkerDB(t, 0) + + _, err := db.ExecContext(ctx, + `INSERT INTO messages (id, subject) VALUES (1, 'subject')`) + require.NoError(t, err, "insert message") + // Pin a far-past baseline so a bump is detectable regardless of clock + // resolution. (The BEFORE trigger preserves an explicit set via its + // WHEN guard, so this value sticks.) + _, err = db.ExecContext(ctx, + `UPDATE messages SET last_modified = '2000-01-01 00:00:00+00' WHERE id = 1`) + require.NoError(t, err, "baseline") + base := pgLMOf(t, db) + + // Message UPDATE bumps. + _, err = db.ExecContext(ctx, `UPDATE messages SET subject = 'changed' WHERE id = 1`) + require.NoError(t, err, "update message") + afterMsg := pgLMOf(t, db) + assert.NotEqual(t, base, afterMsg, "message UPDATE bumps last_modified") + + // Re-baseline, then body INSERT bumps the parent. 
+ _, err = db.ExecContext(ctx, + `UPDATE messages SET last_modified = '2000-01-01 00:00:00+00' WHERE id = 1`) + require.NoError(t, err, "re-baseline") + _, err = db.ExecContext(ctx, + `INSERT INTO message_bodies (message_id, body_text) VALUES (1, 'body')`) + require.NoError(t, err, "insert body") + assert.NotEqual(t, "2000-01-01 00:00:00+00", pgLMOf(t, db), + "body INSERT bumps parent last_modified") + + // Re-baseline, then body UPDATE bumps the parent. + _, err = db.ExecContext(ctx, + `UPDATE messages SET last_modified = '2000-01-01 00:00:00+00' WHERE id = 1`) + require.NoError(t, err, "re-baseline 2") + base2 := pgLMOf(t, db) + _, err = db.ExecContext(ctx, + `UPDATE message_bodies SET body_text = 'corrected' WHERE message_id = 1`) + require.NoError(t, err, "update body") + assert.NotEqual(t, base2, pgLMOf(t, db), "body UPDATE bumps parent last_modified") +} + +// TestWorkerPG_CASRepairRace mirrors the SQLite CAS regression on PG: a +// content edit landing between read and stamp leaves the row unstamped. +func TestWorkerPG_CASRepairRace(t *testing.T) { + ctx := context.Background() + db := openPGWorkerDB(t, 1) + + backend, err := pgvector.Open(ctx, pgvector.Options{DB: db, Dimension: 4}) + require.NoError(t, err, "pgvector.Open") + t.Cleanup(func() { _ = backend.Close() }) + gen, err := backend.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + // Baseline the token to a fixed past value. + _, err = db.ExecContext(ctx, + `UPDATE messages SET last_modified = '2000-01-01 00:00:00+00' WHERE id = 1`) + require.NoError(t, err, "baseline") + token := pgLMOf(t, db) + + client := &pgFakeEmbeddingClient{dim: 4} + client.preReturn = func() { + // Repair-encoding race: rewrite body (bumps last_modified via trigger) + // and reset embed_gen. 
+ _, e := db.ExecContext(ctx, + `UPDATE message_bodies SET body_text = 'corrected' WHERE message_id = 1`) + require.NoError(t, e, "race body rewrite") + _, e = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(t, e, "race embed_gen reset") + } + + w := NewWorker(WorkerDeps{ + Backend: backend, + VectorsDB: db, + MainDB: db, + Store: &pgWorkStore{db: db}, + Client: client, + Rebind: (&store.PostgreSQLDialect{}).Rebind, + LastModifiedExpr: "m.last_modified", + BatchSize: 1, + }) + res, err := w.RunOnce(ctx, gen) + require.NoError(t, err, "RunOnce") + + // CAS targeted the stale token; the row moved, so it is NOT stamped. + var embedGen sql.NullInt64 + require.NoError(t, db.QueryRowContext(ctx, + `SELECT embed_gen FROM messages WHERE id = 1`).Scan(&embedGen)) + assert.False(t, embedGen.Valid, "raced row must NOT be stamped") + assert.Equal(t, 1, pgCountMissing(t, db, int64(gen)), "raced row still needs embedding") + assert.NotEqual(t, token, pgLMOf(t, db), "last_modified bumped by race") + + // A CAS miss is not counted as a success, and the watermark still advances + // past the missed row (id 1) — the drain does not stick. + assert.Equal(t, 0, res.Succeeded, "CAS-missed row not counted in Succeeded") + assert.Equal(t, int64(1), pgReadWatermark(t, db, int64(gen)), + "watermark advances past the CAS-missed row") + + // Recovery: backstop re-embeds with the corrected content. + client.preReturn = nil + res, err = w.RunBackstop(ctx, gen) + require.NoError(t, err, "RunBackstop recovery") + assert.Equal(t, 1, res.Succeeded, "raced row re-embedded on recovery") + assert.Equal(t, 0, pgCountMissing(t, db, int64(gen)), "coverage complete after recovery") +} + +// TestWorkerPG_CASNormalPath verifies the happy path on PG: unchanged +// last_modified → CAS stamp succeeds for every message. 
+func TestWorkerPG_CASNormalPath(t *testing.T) { + ctx := context.Background() + const n = 3 + db := openPGWorkerDB(t, n) + + backend, err := pgvector.Open(ctx, pgvector.Options{DB: db, Dimension: 4}) + require.NoError(t, err, "pgvector.Open") + t.Cleanup(func() { _ = backend.Close() }) + gen, err := backend.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + w := NewWorker(WorkerDeps{ + Backend: backend, + VectorsDB: db, + MainDB: db, + Store: &pgWorkStore{db: db}, + Client: &pgFakeEmbeddingClient{dim: 4}, + Rebind: (&store.PostgreSQLDialect{}).Rebind, + LastModifiedExpr: "m.last_modified", + BatchSize: 2, + }) + res, err := w.RunOnce(ctx, gen) + require.NoError(t, err, "RunOnce") + assert.Equal(t, n, res.Succeeded, "all embedded via CAS") + assert.Equal(t, 0, pgCountMissing(t, db, int64(gen)), "all stamped") +} diff --git a/internal/vector/embed/worker_repair_watermark_test.go b/internal/vector/embed/worker_repair_watermark_test.go new file mode 100644 index 000000000..93b278586 --- /dev/null +++ b/internal/vector/embed/worker_repair_watermark_test.go @@ -0,0 +1,89 @@ +//go:build sqlite_vec + +package embed + +import ( + "context" + "testing" + + assertpkg "github.com/stretchr/testify/assert" + requirepkg "github.com/stretchr/testify/require" +) + +// TestWorker_RepairBelowWatermark_ReembedsAfterWatermarkReset is the +// regression guard for the below-watermark repair gap: repair-encoding clears +// embed_gen=NULL on a repaired message, but an INCREMENTAL embed run resumes +// from the per-gen watermark and only scans ids ABOVE it (ScanForEmbedding +// applies `id > watermark`). A repaired message whose id sits BELOW the current +// watermark is therefore never re-found by an incremental run — it would wait +// for a full-scan backstop (which the CLI defaults off and serve can have +// disabled). 
+// +// The fix lowers the watermark below the repaired id (Backend.ResetWatermarkBelow) +// so the next incremental RunOnce re-finds and re-embeds it. This test pins both +// halves of the gap: +// +// 1. WITHOUT the watermark reset, an incremental RunOnce after repair finds +// NOTHING for the below-watermark repaired message (it stays missing) — +// proving the gap the fix targets. +// 2. WITH the watermark reset (the new path), the next incremental RunOnce +// re-embeds the repaired message (Succeeded>=1, missing==0). +func TestWorker_RepairBelowWatermark_ReembedsAfterWatermarkReset(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + + // Seed 5 messages and embed all of them. With BatchSize 5 the worker scans + // 1..5, embeds them, and advances the watermark to 5 (batchMax). + f := newWorkerFixture(t, 5) + w := newTestWorker(f, 5) + res, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(err, "initial RunOnce") + require.Equal(5, res.Succeeded, "all 5 embedded") + require.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "no missing after initial drain") + require.Equal(int64(5), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), + "watermark advanced to the max embedded id") + + // Simulate repair-encoding on message 2 (BELOW the watermark of 5): + // - rewrite its body (the corrected text), AND + // - reset embed_gen to NULL (what store.Store.ResetEmbedGen does). + // The body rewrite fires the trigger that bumps last_modified, mirroring a + // real repair-encoding pass. + const repairedID = 2 + _, err = f.MainDB.ExecContext(ctx, + `UPDATE message_bodies SET body_text = ? 
WHERE message_id = ?`, + "repaired body 2 with corrected text", repairedID) + require.NoError(err, "rewrite repaired body") + _, err = f.MainDB.ExecContext(ctx, + `UPDATE messages SET embed_gen = NULL WHERE id = ?`, repairedID) + require.NoError(err, "reset embed_gen (ResetEmbedGen equivalent)") + + // The repaired message now reads as missing for the generation. + require.Equal(1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "repaired message reads as missing after embed_gen reset") + + // (1) WITHOUT the watermark reset: an incremental RunOnce resumes from the + // watermark (5) and scans only id > 5, so it never re-finds message 2. The + // gap the fix targets. + gapWorker := newTestWorker(f, 5) + gapRes, err := gapWorker.RunOnce(ctx, f.BuildingGen) + require.NoError(err, "incremental RunOnce before watermark reset") + assert.Equal(0, gapRes.Succeeded, "without the fix, the below-watermark repaired message is NOT re-found") + assert.Equal(1, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "without the fix, the repaired message stays missing (waits for backstop)") + + // (2) WITH the fix: lower the watermark below the repaired id via the new + // backend path, then run an incremental RunOnce. It now re-finds and + // re-embeds message 2. 
+ require.NoError(f.Backend.ResetWatermarkBelow(ctx, repairedID), + "ResetWatermarkBelow (the new repair path)") + assert.Equal(int64(repairedID-1), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), + "watermark lowered to just below the repaired id") + + fixWorker := newTestWorker(f, 5) + fixRes, err := fixWorker.RunOnce(ctx, f.BuildingGen) + require.NoError(err, "incremental RunOnce after watermark reset") + assert.GreaterOrEqual(fixRes.Succeeded, 1, "the repaired message is re-embedded after the watermark reset") + assert.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "no missing after the fix re-embeds the repaired message") +} diff --git a/internal/vector/embed/worker_test.go b/internal/vector/embed/worker_test.go index 4645e1ad1..35f7fd43f 100644 --- a/internal/vector/embed/worker_test.go +++ b/internal/vector/embed/worker_test.go @@ -4,12 +4,10 @@ package embed import ( "context" - "database/sql" "errors" "fmt" "strings" "testing" - "time" "unicode/utf8" assertpkg "github.com/stretchr/testify/assert" @@ -18,1544 +16,612 @@ import ( "go.kenn.io/msgvault/internal/vector" ) -func TestDerivedStaleThreshold(t *testing.T) { - cases := []struct { - name string - timeout time.Duration - maxRetries int - want time.Duration - }{ - {"zero timeout returns floor", 0, 3, 10 * time.Minute}, - {"small budget keeps floor", 30 * time.Second, 3, 10 * time.Minute}, // 2*30s*3 = 3m → floor wins - {"large timeout exceeds floor", 5 * time.Minute, 3, 30 * time.Minute}, // 2*5m*3 = 30m - {"high attempts scale", 30 * time.Second, 30, 30 * time.Minute}, // 2*30s*30 = 30m - {"negative attempts treated as 1 attempt", 1 * time.Hour, -5, 2 * time.Hour}, // 2*1h*1 = 2h, exceeds floor - // Regression: callers that set EmbedTimeout but leave - // EmbedMaxRetries at zero used to derive a budget for a single - // attempt (2*10m*1 = 20m). The fix mirrors embed.NewClient's - // default of 3 total attempts → 60m. 
- {"zero attempts mirror client default", 10 * time.Minute, 0, 60 * time.Minute}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - got := derivedStaleThreshold(tc.timeout, tc.maxRetries) - assertpkg.Equalf(t, tc.want, got, - "derivedStaleThreshold(%v, %d)", tc.timeout, tc.maxRetries) - }) - } -} - -// TestWorker_SplitsChunkInputsAcrossSubBatches verifies that a -// message whose chunk fan-out exceeds BatchSize is sent to the -// embedder across multiple sub-batched Embed calls. Without the -// split, a 64-chunk message claimed via BatchSize=8 would flatten -// into a single 64-input request, exceeding provider per-request -// limits and tripping API timeouts (the very failure mode caught by -// roborev #323). -func TestWorker_SplitsChunkInputsAcrossSubBatches(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 1) - - // Build a body large enough to produce ~12 chunks at - // MaxInputChars=80 with the worker's overlap heuristic. Any value - // well above BatchSize would do; 12 is comfortably enough to - // exercise multiple sub-batches. - body := strings.Repeat("lorem ipsum dolor sit amet consectetur adipiscing elit. ", 40) - _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, body) - require.NoError(err, "update body") - - // Capture the size of every Embed call so we can prove the split - // happened and that no sub-batch exceeded BatchSize. 
- const batchSize = 4 - var sizes []int - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - sizes = append(sizes, len(inputs)) - out := make([][]float32, len(inputs)) - for i := range inputs { - v := make([]float32, 4) - v[0] = float32(len(inputs[i])%4 + 1) - out[i] = v - } - return out, nil - } - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 80, - BatchSize: batchSize, - }) - _, err = w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - - require.GreaterOrEqual(len(sizes), 2, "expected >= 2 sub-batches, got sizes=%v", sizes) - for i, n := range sizes { - assert.LessOrEqualf(n, batchSize, "sub-batch %d size", i) - assert.NotZerof(n, "sub-batch %d was empty", i) - } -} - -// TestWorker_CapsRawBodyBeforePreprocess is the roborev #323 (717ac4c) -// regression: a 5 MB body must not be handed to Preprocess in full. -// Preprocess runs O(input) regex passes; without an upstream cap the -// embedding worker pays seconds of CPU and tens of MB of scratch -// allocs on every multi-megabyte body before the chunker drops the -// tail anyway. The cap should be derived from MaxInputChars and -// maxSpansPerMessage so it scales with what the chunker can actually -// emit. -func TestWorker_CapsRawBodyBeforePreprocess(t *testing.T) { - ctx := context.Background() - f := newWorkerFixture(t, 1) - - // 5 million chars of unbroken letters. Far larger than the - // raw-body cap (which at MaxInputChars=100 is 100 * 64 * 16 = - // 102,400 runes), but well-defined under regex passes if those - // run unbounded. - hugeBody := strings.Repeat("a", 5_000_000) - _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, hugeBody) - requirepkg.NoError(t, err, "update body") - - // Capture every input the worker hands to the embedder. 
The - // individual input slices together represent the chunker's - // output; the total must come from at most the cap window. - var observed []string - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - observed = append(observed, inputs...) - out := make([][]float32, len(inputs)) - for i := range inputs { - v := make([]float32, 4) - v[0] = float32(len(inputs[i])%4 + 1) - out[i] = v - } - return out, nil - } - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 100, - BatchSize: 8, - }) - _, err = w.RunOnce(ctx, f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - - // All inputs must fit within the chunker's output window - // (maxSpansPerMessage * MaxInputChars = 6400 runes). The raw - // body cap means Preprocess only ever saw ~102K runes, not 5M. - totalRunes := 0 - for _, s := range observed { - totalRunes += utf8.RuneCountInString(s) - } - assertpkg.LessOrEqualf(t, totalRunes, maxSpansPerMessage*100, - "total embedder input runes = %d, want <= %d (chunker window)", totalRunes, maxSpansPerMessage*100) -} - -// TestWorker_PrefixBase64DoesNotHidePoseTail is the roborev #323 -// (2d8f45d) regression: a body whose first megabyte is an inline -// base64 PNG must still get its prose tail to the embedder. Earlier -// versions capped the raw body before sanitize, so the cap chopped -// the base64 blob before StripBase64 could strip it — the prose -// past the cap never reached Preprocess. The fix runs the cheap -// pollution removal (CRLF + StripBase64) BEFORE the cap and the -// heavy regex passes (StripHTML, URL tracking, whitespace) AFTER, -// so blob-prefixed bodies preserve the prose. 
-func TestWorker_PrefixBase64DoesNotHidePoseTail(t *testing.T) { - ctx := context.Background() - f := newWorkerFixture(t, 1) - - const sentinel = "QUICKFOX-MARKER-IS-THE-PROSE-TAIL" - // 2M chars of base64-shaped padding (no slashes; matches the - // strip regex), then 100 bytes of prose containing the - // sentinel. - hugeBase64 := strings.Repeat("A", 2_000_000) - body := "data:image/png;base64," + hugeBase64 + " " + sentinel + " end." - _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, body) - requirepkg.NoError(t, err, "update body") - - var observed []string - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - observed = append(observed, inputs...) - out := make([][]float32, len(inputs)) - for i := range inputs { - v := make([]float32, 4) - v[0] = float32(len(inputs[i])%4 + 1) - out[i] = v - } - return out, nil - } - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{StripBase64: true}, - MaxInputChars: 100, - BatchSize: 8, - }) - _, err = w.RunOnce(ctx, f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - - // The sentinel — sitting past 2 MB of base64 in the raw body — - // must appear in at least one chunk handed to the embedder. - // Without the cheap-strip-first ordering, the cap would have - // chopped before the prose ever surfaced. - found := false - for _, s := range observed { - if strings.Contains(s, sentinel) { - found = true - break - } - } - assertpkg.Truef(t, found, - "sentinel %q absent from embedder inputs; the prose tail was lost behind the base64 blob", sentinel) -} - -// TestWorker_TruncatedCountedPerMessageNotPerChunk pins the -// roborev #323 (717ac4c) metric-accounting fix: when a single long -// message produces multiple truncated chunks, RunResult.Truncated -// must record one message, not one per chunk. 
Otherwise progress -// metrics inflate (a single oversized message could read as N -// truncations in a Succeeded=1 batch). -func TestWorker_TruncatedCountedPerMessageNotPerChunk(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 1) - - // Long unbroken text: ChunkText's hard-cut path marks all but - // the last span as Trunc=true. With MaxInputChars=100 the body - // produces several truncated chunks before maxSpans caps it. - body := strings.Repeat("a", 600) - _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, body) - require.NoError(err, "update body") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 100, - BatchSize: 8, - }) - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - assert.Equal(1, res.Succeeded) - // Confirm the chunks table actually has multiple truncated - // rows for this message — otherwise the test wouldn't be - // exercising the per-message-vs-per-chunk distinction. 
- var truncChunks int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM embeddings WHERE message_id = 1 AND truncated = 1`).Scan(&truncChunks) - require.NoError(err, "count truncated chunks") - require.GreaterOrEqualf(truncChunks, 2, - "test produced %d truncated chunks, expected >= 2 to exercise the metric", truncChunks) - assert.Equalf(1, res.Truncated, - "Truncated = %d, want 1 (one message, regardless of %d truncated chunks)", res.Truncated, truncChunks) -} - -// TestWorker_FansOutLongMessageIntoMultipleChunks confirms the -// chunking path: a single pending message whose preprocessed body -// exceeds MaxInputChars produces N > 1 embedder inputs, all of which -// land in the embeddings table with distinct chunk_index values, and -// the queue is drained in one shot (not N times) — Complete is -// keyed on message_id, not on (message_id, chunk_index). -func TestWorker_FansOutLongMessageIntoMultipleChunks(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 1) - - // Replace the seeded message's body with one long enough to need - // multiple chunks at MaxInputChars=200. Each "paragraph" is ~150 - // chars; six paragraphs ≈ 900 chars → at least 4 chunks. - body := strings.Repeat("lorem ipsum dolor sit amet consectetur adipiscing elit. "+ - "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "+ - "ut enim ad minim veniam quis nostrud exercitation. "+ - "\n\n", 6) - _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? 
WHERE message_id = 1`, body) - require.NoError(err, "update body") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 200, // forces multi-chunk fan-out - BatchSize: 8, - }) - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - assert.Equal(1, res.Succeeded, "one distinct message embedded") - assert.Equal(0, res.Failed) - - // embeddings should hold N > 1 rows for the message, with - // consecutive chunk_index values starting at 0. - var rowCount int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM embeddings WHERE generation_id = ? AND message_id = 1`, - int64(f.BuildingGen)).Scan(&rowCount) - require.NoError(err, "count chunks") - require.GreaterOrEqual(rowCount, 2) - var distinctCI int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(DISTINCT chunk_index) FROM embeddings WHERE generation_id = ? AND message_id = 1`, - int64(f.BuildingGen)).Scan(&distinctCI) - require.NoError(err, "count distinct chunk_index") - assert.Equal(rowCount, distinctCI, "each chunk should be uniquely indexed") - var minCI, maxCI int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT MIN(chunk_index), MAX(chunk_index) FROM embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&minCI, &maxCI) - require.NoError(err, "min/max chunk_index") - assert.Equal(0, minCI, "chunk_index minimum") - assert.Equal(rowCount-1, maxCI, "chunk_index maximum") - // message_count tracks distinct messages, so it must read as 1 - // despite the multi-chunk fan-out. 
- var msgCount int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT message_count FROM index_generations WHERE id = ?`, - int64(f.BuildingGen)).Scan(&msgCount) - require.NoError(err, "read message_count") - assert.Equal(1, msgCount) - // Queue is fully drained: Complete is keyed on message_id, so all - // chunks of message 1 finish together when its singleton pending - // row is removed. - var pending int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&pending) - require.NoError(err, "count pending") - assert.Equal(0, pending, "pending remaining") -} - -func TestWorker_DrainsPendingEndToEnd(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 3) - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{StripQuotes: true, StripSignatures: true}, - MaxInputChars: 8000, - BatchSize: 2, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - assert.Equal(3, res.Succeeded) - assert.Equal(0, res.Failed) - - var n int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&n) - require.NoError(err, "count pending") - assert.Equal(0, n, "pending remaining") -} - -func TestWorker_ReleasesOnClientError(t *testing.T) { - ctx := context.Background() - f := newWorkerFixture(t, 3) - f.FakeClient.FailNext(1) // first Embed errors; remaining batches succeed - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 8000, - BatchSize: 1, // batch of 1 so the first Embed fails exactly one id - }) - res, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - 
assertpkg.GreaterOrEqual(t, res.Failed, 1, "expected at least 1 failure") - // The worker retries the released row and eventually drains everything. - assertpkg.Equal(t, 3, res.Succeeded, "failed row gets retried after Release") -} - -func TestWorker_ReleasesOnUpsertError(t *testing.T) { - // Driving an Upsert failure requires forcing a dimension mismatch; the - // fake client returns 4-dim vectors matching the generation's - // dimension, so the easy lever isn't available. The Release-on-error - // path is covered by TestWorker_ReleasesOnClientError. - t.Skip("covered by TestWorker_ReleasesOnClientError") -} - -func TestWorker_EmptyPendingReturnsZeroResult(t *testing.T) { - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 0) // 0 messages → no pending rows - - w := NewWorker(WorkerDeps{ +// newTestWorker builds a Worker over the fixture's backend, databases, +// store, and fake client, using the given batch size. +func newTestWorker(f *workerFixture, batchSize int) *Worker { + return NewWorker(WorkerDeps{ Backend: f.Backend, VectorsDB: f.VectorsDB, MainDB: f.MainDB, + Store: f.Store, Client: f.FakeClient, - BatchSize: 10, + BatchSize: batchSize, }) - res, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - assert.Equal(0, res.Claimed) - assert.Equal(0, res.Succeeded) - assert.Equal(0, res.Failed) } -func TestWorker_RespectsContextCancel(t *testing.T) { - f := newWorkerFixture(t, 5) - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, - }) - ctx, cancel := context.WithCancel(context.Background()) - cancel() // pre-cancelled - _, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.Error(t, err, "expected cancellation error") -} - -func TestWorker_ReclaimStale_FromStartup(t *testing.T) { +// TestWorker_DrainsToZeroEndToEnd is the happy-path: a fresh corpus is +// scanned, embedded, and every message ends up
stamped (embed_gen = gen) +// so coverage reaches zero. +func TestWorker_DrainsToZeroEndToEnd(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 2) - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 2, - StaleThreshold: 10 * time.Minute, - }) - - // Simulate a crashed worker: claim 2 rows, then back-date the claim. - q := NewQueue(f.VectorsDB, nil) - ids, _, err := q.Claim(ctx, f.BuildingGen, 2) - require.NoError(err, "Claim setup") - require.Len(ids, 2) - _, err = f.VectorsDB.ExecContext(ctx, - `UPDATE pending_embeddings SET claimed_at = ? WHERE generation_id = ?`, - time.Now().Add(-20*time.Minute).Unix(), int64(f.BuildingGen)) - require.NoError(err, "backdate") - - n, err := w.ReclaimStale(ctx) - require.NoError(err, "ReclaimStale") - assert.Equal(2, n, "reclaimed") - - // Verify the rows are available again. - var available int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND claimed_at IS NULL`, - int64(f.BuildingGen)).Scan(&available) - require.NoError(err, "count available") - assert.Equal(2, available, "available after reclaim") -} - -func TestWorker_StaleThresholdDefault(t *testing.T) { - f := newWorkerFixture(t, 0) - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - }) - assertpkg.Equal(t, 10*time.Minute, w.deps.StaleThreshold, "default StaleThreshold") - assertpkg.Equal(t, 5, w.deps.MaxConsecutiveFailures, "default MaxConsecutiveFailures") -} - -// TestWorker_AbortsAfterConsecutiveFailures verifies that a -// persistently failing embedder causes RunOnce to return an error -// rather than loop forever releasing and re-claiming. 
-func TestWorker_AbortsAfterConsecutiveFailures(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 3) - // Force every Embed call to fail — a huge failure budget ensures - // we hit the MaxConsecutiveFailures limit first. - f.FakeClient.FailNext(1 << 30) - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, - MaxConsecutiveFailures: 3, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.Error(err, "want error after consecutive failures") - assert.GreaterOrEqual(res.Failed, 3, "one per consecutive failure") - // Any leftover claims should have been released; pending is non-empty. - var pending int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&pending) - require.NoError(err, "count pending") - assert.NotZero(pending, "rows should have been released") -} - -// TestWorker_ConsecutiveFailureCounterResetsOnSuccess confirms that -// intermittent failures below the limit do not abort — each success -// resets the counter. -func TestWorker_ConsecutiveFailureCounterResetsOnSuccess(t *testing.T) { - ctx := context.Background() - f := newWorkerFixture(t, 4) - // Fail twice (below the limit of 3), then all subsequent succeed. 
- f.FakeClient.FailNext(2) - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, - MaxConsecutiveFailures: 3, - }) - res, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.NoError(t, err, "2 failures, budget 3 — should not abort") - assertpkg.Equal(t, 4, res.Succeeded, "all 4 messages ultimately drain") -} - -// TestWorker_RuneCountUsedForSourceCharLen regresses the -// byte-vs-rune mismatch: Preprocess truncates by runes, so the -// SourceCharLen field on each Chunk must also be a rune count or -// CJK/emoji inputs get inflated by 2-4x. We embed a short Japanese -// subject (whose UTF-8 byte length is much larger than its rune -// count) and assert the persisted source_char_len matches runes. -func TestWorker_RuneCountUsedForSourceCharLen(t *testing.T) { - require := requirepkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 0) // start empty so we control the message text - - // "こんにちは世界" = 7 runes, 21 UTF-8 bytes. Preprocess prepends - // "Subject: " (9 ASCII bytes/runes) and "\n\n" (2). The full - // preprocessed string has 18 runes and 32 bytes — a 1.78x - // inflation if we record bytes by mistake. 
- const subject = "こんにちは世界" - _, err := f.MainDB.ExecContext(ctx, - `INSERT INTO messages (id, subject) VALUES (1, ?)`, subject) - require.NoError(err, "insert message") - _, err = f.VectorsDB.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES (?, 1, 0)`, - int64(f.BuildingGen)) - require.NoError(err, "seed pending") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, - }) - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - require.Equal(1, res.Succeeded) - - const wantRunes = 18 // len("Subject: \n\n") + 7 runes for the kanji - var got int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT source_char_len FROM embeddings WHERE generation_id = ? AND message_id = 1`, - int64(f.BuildingGen)).Scan(&got) - require.NoError(err, "read source_char_len") - assertpkg.Equal(t, wantRunes, got, "source_char_len (rune count, not byte length)") -} - -// TestWorker_FallsBackToHTMLWhenBodyTextEmpty guards the HTML-only -// recall path: messages whose plaintext body is absent should still -// be embedded using HTML-stripped text rather than silently degrading -// to subject-only embeddings. -func TestWorker_FallsBackToHTMLWhenBodyTextEmpty(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 0) - - const html = `

planning offsite agenda

Thursday afternoon

` - _, err := f.MainDB.ExecContext(ctx, - `INSERT INTO messages (id, subject) VALUES (1, ?)`, "meeting") - require.NoError(err, "insert message") - _, err = f.MainDB.ExecContext(ctx, - `INSERT INTO message_bodies (message_id, body_text, body_html) VALUES (1, '', ?)`, html) - require.NoError(err, "insert body") - _, err = f.VectorsDB.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES (?, 1, 0)`, - int64(f.BuildingGen)) - require.NoError(err, "seed pending") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - MaxInputChars: 8000, - BatchSize: 1, - }) - _, err = w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - - require.Len(f.FakeClient.LastInputs, 1) - got := f.FakeClient.LastInputs[0] - // The preprocessed text should contain the HTML paragraph text, - // not just the subject — that's the whole point of the fallback. - assert.Contains(got, "planning offsite agenda", "embed input missing HTML body text") - assert.Contains(got, "Thursday afternoon", "embed input missing second paragraph") - assert.NotContains(got, "

", "embed input still contains HTML tags") - assert.NotContains(got, "", "embed input still contains HTML tags") -} - -// TestWorker_CompleteFailureCountsAsBatchFailure regresses the bug -// where Queue.Complete failures were log-only: the embedded rows -// stayed claimed, the next Claim returned empty, and RunOnce -// reported a clean drain. After this fix Complete failure must count -// toward MaxConsecutiveFailures so the loop short-circuits instead of -// silently spinning until ReclaimStale rescues the rows minutes later. -// -// The earlier version of this test dropped pending_embeddings to make -// Complete fail, but that also broke the next Claim — the test then -// passed because Claim errored out, not because Complete failure was -// detected. To actually exercise the stuck-claim path we install a -// BEFORE DELETE trigger that fires only on Complete (Claim does an -// UPDATE, not a DELETE, so it still succeeds). After RunOnce errors, -// we assert the pending row is still present AND claimed — proving -// the loop noticed the stuck state instead of silently treating an -// empty Claim as a clean drain. -func TestWorker_CompleteFailureCountsAsBatchFailure(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - // Need ≥ MaxConsecutiveFailures messages so successive claims pull - // fresh rows; otherwise a single failed Complete leaves one stuck- - // claimed row and the next Claim returns empty (which RunOnce - // rightly treats as a clean drain — the bug we're regressing - // against would never trip with a single-message fixture). 
- f := newWorkerFixture(t, 3) - - _, err := f.VectorsDB.ExecContext(ctx, ` - CREATE TRIGGER block_pending_delete - BEFORE DELETE ON pending_embeddings - BEGIN - SELECT RAISE(FAIL, 'simulated complete failure'); - END`) - require.NoError(err, "install trigger") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, - MaxConsecutiveFailures: 2, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.Error(err, "want error after Complete failures (regression: silent success)") - assert.Equal(0, res.Succeeded, "Complete failed, work was not durably finished") - assert.NotZero(res.Failed, "Complete failure should count as a batch failure") - - // Stuck-claim check: pending_embeddings row is still there (the - // trigger blocked Complete's DELETE) and is marked claimed (the - // previous Claim's UPDATE went through). A naive "log-only" - // Complete handler would silently report success; the failure - // counter is what makes RunOnce notice and abort. - var pending, claimed int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*), - COALESCE(SUM(CASE WHEN claimed_at IS NOT NULL THEN 1 ELSE 0 END), 0) - FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&pending, &claimed) - require.NoError(err, "count pending") - assert.NotZero(pending, "Complete should have failed and left the row in place") - assert.NotZero(claimed, "Claim's UPDATE should have left the row marked claimed") -} - -// TestWorker_OrphanCompleteFailureDoesNotStrandValidWork regresses -// two related bugs around orphan-drain failure: -// -// 1. Original (R53a): a failed Complete(missing) call ran BEFORE -// Upsert and used `continue`, leaving the still-valid claimed IDs -// in the same batch claimed but unembedded until ReclaimStale. -// After the fix, orphan-drain runs AFTER the embedded rows are -// upserted and acknowledged. -// -// 2. 
R58: when the orphan was the last queue row, the next Claim -// returned empty and RunOnce exited nil — leaving the orphan -// stranded for ~10 min until ReclaimStale, with no signal to -// the caller. After the fix, the empty-claim exit surfaces the -// orphan-drain failure as a non-nil error so the user knows the -// run was incomplete. -func TestWorker_OrphanCompleteFailureDoesNotStrandValidWork(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - // 2 messages enqueued; we'll delete one from the main DB so it - // reaches embedBatch as "missing". - f := newWorkerFixture(t, 2) - - const orphanID = 2 - _, err := f.MainDB.ExecContext(ctx, - `DELETE FROM messages WHERE id = ?`, orphanID) - require.NoError(err, "delete orphan from main") - _, err = f.MainDB.ExecContext(ctx, - `DELETE FROM message_bodies WHERE message_id = ?`, orphanID) - require.NoError(err, "delete orphan body") - - // Selective trigger: only the orphan's Complete DELETE fails. The - // embedded row's Complete must still succeed so we can prove the - // valid work is durably finished even when the orphan drain fails. 
- _, err = f.VectorsDB.ExecContext(ctx, fmt.Sprintf(` - CREATE TRIGGER block_orphan_drain - BEFORE DELETE ON pending_embeddings - WHEN OLD.message_id = %d - BEGIN - SELECT RAISE(FAIL, 'simulated orphan complete failure'); - END`, orphanID)) - require.NoError(err, "install trigger") - - var reports []ProgressReport - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 2, - MaxConsecutiveFailures: 5, // generous so the orphan drain failure does not abort mid-loop - TotalPending: 2, - Progress: func(p ProgressReport) { - reports = append(reports, p) - }, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.Error(err, "want non-nil error (orphan drain failed and orphan remained stuck)") - require.ErrorContains(err, "orphan-drain") - require.ErrorContains(err, "ReclaimStale", "user knows recovery is automatic") - assert.Equal(1, res.Succeeded, "the valid message must be counted as completed") - assert.NotZero(res.Failed, "orphan drain failure should be reported") - require.NotEmpty(reports, "expected progress for valid embedded row even though orphan drain failed") - final := reports[len(reports)-1] - assert.Equal(1, final.Done, "final progress Done = 1 durable embedded row") - assert.Equal(1, final.BatchMsgs, "final progress BatchMsgs = 1 durable embedded row") - - // The valid message's pending row must be GONE (Complete succeeded). - // The original bug left it claimed-but-not-completed because the - // orphan-drain failure short-circuited before Upsert. - var validPending int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND message_id = 1`, - int64(f.BuildingGen)).Scan(&validPending) - require.NoError(err, "count valid pending") - assert.Equal(0, validPending, "R53a regression: valid row stranded by orphan drain failure") - - // And the embedded row should be in the embeddings table. 
- var embedded int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM embeddings WHERE generation_id = ? AND message_id = 1`, - int64(f.BuildingGen)).Scan(&embedded) - require.NoError(err, "count embedded") - assert.Equal(1, embedded, "Upsert should have run before orphan drain") - - // The orphan row stays claimed (token is non-NULL) — that's the - // state ReclaimStale is built to recover from. The error returned - // above is what tells the caller "this run isn't actually clean". - var orphanClaimed int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings - WHERE generation_id = ? AND message_id = ? AND claim_token IS NOT NULL`, - int64(f.BuildingGen), orphanID).Scan(&orphanClaimed) - require.NoError(err, "count orphan claimed") - assert.Equal(1, orphanClaimed, "the trigger blocks the Complete DELETE") -} - -// TestWorker_MissingMessagesDrainedFromQueue verifies that claimed -// rows whose messages were deleted from the main DB are dropped from -// the queue (via Complete) rather than silently re-looped forever. -func TestWorker_MissingMessagesDrainedFromQueue(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 3) - - // Simulate sync deleting messages 2 and 3 from the main DB - // AFTER CreateGeneration seeded the queue. - _, err := f.MainDB.ExecContext(ctx, - `DELETE FROM messages WHERE id IN (2, 3)`) - require.NoError(err, "delete messages") - _, err = f.MainDB.ExecContext(ctx, - `DELETE FROM message_bodies WHERE message_id IN (2, 3)`) - require.NoError(err, "delete bodies") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - // Message 1 embedded; 2 and 3 dropped as missing. 
- assert.Equal(1, res.Succeeded) - // Queue should be fully drained (no infinite loop on missing rows). - var pending int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&pending) - require.NoError(err, "count pending") - assert.Equal(0, pending, "missing rows should be removed") - // Only one embedding row (for message 1). - var embedded int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&embedded) - require.NoError(err, "count embeddings") - assert.Equal(1, embedded) -} - -// TestWorker_EmptyPreprocessedMessagesDrainedFromQueue verifies that -// messages whose content is stripped to empty are dropped from the -// queue instead of being sent to embedders that reject empty inputs. -func TestWorker_EmptyPreprocessedMessagesDrainedFromQueue(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 0) - - // Message 1 becomes empty after quote stripping; message 2 remains - // embeddable so the batch must still succeed. 
- _, err := f.MainDB.ExecContext(ctx, - `INSERT INTO messages (id, subject) VALUES (1, ''), (2, 'kept')`) - require.NoError(err, "insert messages") - _, err = f.MainDB.ExecContext(ctx, - `INSERT INTO message_bodies (message_id, body_text) VALUES - (1, '> quoted only'), - (2, 'actual body')`) - require.NoError(err, "insert bodies") - _, err = f.VectorsDB.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) VALUES - (?, 1, 0), - (?, 2, 0)`, - int64(f.BuildingGen), int64(f.BuildingGen)) - require.NoError(err, "seed pending") - - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{StripQuotes: true}, - MaxInputChars: 8000, - BatchSize: 2, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce") - assert.Equal(1, res.Succeeded) - require.Len(f.FakeClient.LastInputs, 1) - require.NotEmpty(strings.TrimSpace(f.FakeClient.LastInputs[0]), - "embedder received empty input %q", f.FakeClient.LastInputs[0]) - - var pending int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&pending) - require.NoError(err, "count pending") - assert.Equal(0, pending, "pending after drain") - - var embedded int - err = f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM embeddings WHERE generation_id = ?`, - int64(f.BuildingGen)).Scan(&embedded) - require.NoError(err, "count embeddings") - assert.Equal(1, embedded) -} - -// Progress fires once per fully-successful batch and carries cumulative -// Done, batch size, and char counts — enough for an ETA printer to work -// off of without peeking at worker internals. 
-func TestWorker_ProgressCalledPerSuccessfulBatch(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() f := newWorkerFixture(t, 5) - var reports []ProgressReport - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - Preprocess: PreprocessConfig{}, - MaxInputChars: 8000, - BatchSize: 2, - TotalPending: 5, - Progress: func(p ProgressReport) { - reports = append(reports, p) - }, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) + w := newTestWorker(f, 2) + res, err := w.RunOnce(context.Background(), f.BuildingGen) require.NoError(err, "RunOnce") - require.Equal(5, res.Succeeded) - // 5 messages, batch=2 → batches of 2, 2, 1 → three Progress calls. - require.Len(reports, 3) - wantDone := []int{2, 4, 5} - wantBatchMsgs := []int{2, 2, 1} - for i, p := range reports { - assert.Equalf(wantDone[i], p.Done, "report[%d].Done", i) - assert.Equalf(wantBatchMsgs[i], p.BatchMsgs, "report[%d].BatchMsgs", i) - assert.Equalf(5, p.TotalPending, "report[%d].TotalPending", i) - assert.Positivef(p.BatchChars, "report[%d].BatchChars (non-empty fixture bodies)", i) - assert.GreaterOrEqualf(p.BatchElapsed, time.Duration(0), "report[%d].BatchElapsed", i) - assert.GreaterOrEqualf(p.RunElapsed, p.BatchElapsed, - "report[%d].RunElapsed=%s < BatchElapsed=%s", i, p.RunElapsed, p.BatchElapsed) - } + assert.Equal(5, res.Succeeded, "Succeeded") + assert.Equal(0, res.Failed, "Failed") + assert.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "missing after drain") } -func TestWorker_ProgressCountsDroppedRowsTowardTotal(t *testing.T) { +// TestWorker_StampsAfterUpsert verifies the ordered idempotent steps: +// every embedded message has embed_gen stamped to the target generation. 
+func TestWorker_StampsAfterUpsert(t *testing.T) { require := requirepkg.New(t) - ctx := context.Background() f := newWorkerFixture(t, 3) - const missingID = 2 - _, err := f.MainDB.ExecContext(ctx, - `DELETE FROM messages WHERE id = ?`, missingID) - require.NoError(err, "delete missing message") - _, err = f.MainDB.ExecContext(ctx, - `DELETE FROM message_bodies WHERE message_id = ?`, missingID) - require.NoError(err, "delete missing body") - - var reports []ProgressReport - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - MaxInputChars: 8000, - BatchSize: 3, - TotalPending: 3, - Progress: func(p ProgressReport) { - reports = append(reports, p) - }, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) + w := newTestWorker(f, 3) + _, err := w.RunOnce(context.Background(), f.BuildingGen) require.NoError(err, "RunOnce") - require.Equal(2, res.Succeeded, "embedded rows") - require.NotEmpty(reports, "expected progress report for mixed embed/drop batch") - final := reports[len(reports)-1] - require.Equal(3, final.Done, "final progress Done = pending rows processed") - require.Equal(3, final.BatchMsgs, "final progress BatchMsgs = pending rows processed in batch") -} -// TestWorker_DownshiftDrain_HappyPath_AllSingletonsSucceed verifies -// that when a multi-message batch returns ErrPermanent4xx (e.g. one -// message in the batch is too long for the model), the worker walks -// the same already-claimed IDs one at a time and embeds the rest. 
-func TestWorker_DownshiftDrain_HappyPath_AllSingletonsSucceed(t *testing.T) { - f := newWorkerFixture(t, 3) - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: too long: %w", ErrPermanent4xx) - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) - res, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - requirepkg.Equal(t, 3, res.Succeeded, "Succeeded") - requirepkg.Equal(t, 0, res.Failed, "Failed") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) -} - -// TestWorker_DownshiftDrain_PartialDrop verifies that singleton 4xxs -// inside a drain are dropped (Completed without an embedding) while -// the rest of the drain proceeds normally. -func TestWorker_DownshiftDrain_PartialDrop(t *testing.T) { - f := newWorkerFixture(t, 3) - var singletonSeen int - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: too long: %w", ErrPermanent4xx) - } - singletonSeen++ - if singletonSeen == 2 { - return nil, fmt.Errorf("embed: HTTP 400: blocked: %w", ErrPermanent4xx) - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) - res, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") - assertpkg.Equal(t, 2, res.Succeeded, "Succeeded") - // Singleton 4xx drops are NOT counted as Failed — Complete - // succeeded, so the worker treated the unembeddable message - // the same way the main loop treats missing/empty drops. 
- // res.Failed is reserved for genuine processing failures - // (Complete errors, transient embed failures, etc.). - assertpkg.Equal(t, 0, res.Failed, "no Complete errors expected") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) + var stamped int + require.NoError(f.MainDB.QueryRow( + `SELECT COUNT(*) FROM messages WHERE embed_gen = ?`, int64(f.BuildingGen)).Scan(&stamped)) + requirepkg.Equal(t, 3, stamped, "stamped messages") } -// TestWorker_DownshiftDrain_AllDrop_StillTripsCap verifies that a -// fully misconfigured endpoint (every message rejected as 4xx) still -// trips the consecutive-failure cap so the worker aborts instead of -// silently dropping every message. -func TestWorker_DownshiftDrain_AllDrop_StillTripsCap(t *testing.T) { - f := newWorkerFixture(t, 6) - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - return nil, fmt.Errorf("embed: HTTP 400: misconfigured: %w", ErrPermanent4xx) - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - MaxConsecutiveFailures: 2, - }) - _, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.Error(t, err, "expected abort error") - requirepkg.ErrorContains(t, err, "consecutive failures") - assertpkg.ErrorContains(t, err, "misconfigured", "expected original 4xx body in error") +// TestWorker_EmptyCorpusReturnsZero: scanning an empty corpus returns a +// zero result and no error. 
+func TestWorker_EmptyCorpusReturnsZero(t *testing.T) { + f := newWorkerFixture(t, 0) + w := newTestWorker(f, 8) + res, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + assertpkg.Equal(t, 0, res.Claimed, "Claimed") + assertpkg.Equal(t, 0, res.Succeeded, "Succeeded") } -// TestWorker_DownshiftDrain_AllDropClean_NoSilentDelete covers the -// most dangerous failure mode: a misconfigured endpoint (bad API -// key, wrong model, malformed shared request config) returns 4xx -// for every input. ErrPermanent4xx is indistinguishable from a -// message-specific 4xx at the call site, so the worker MUST NOT -// Complete-delete pending rows when no singleton in the drain -// embedded — it must release them so the cap eventually trips and -// the operator sees the failure with the original 4xx body intact -// AND the rows still in the queue for retry after fixing the -// config. -func TestWorker_DownshiftDrain_AllDropClean_NoSilentDelete(t *testing.T) { - assert := assertpkg.New(t) - f := newWorkerFixture(t, 4) - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - return nil, fmt.Errorf("embed: HTTP 401: bad-api-key: %w", ErrPermanent4xx) - } - // BatchSize=2, default MaxConsecutiveFailures=5. Each iteration: - // upstream 4xx (cf+1), drain walks both singletons, both 4xx, - // drain returns wrapped ErrPermanent4xx (no double-count since - // the drain confirms the upstream failure rather than adding a - // new one), drain releases the 2 deferred IDs back to the queue. - // After 5 iterations the cap trips. Pending count stays at 4 - // throughout because rows are released, not Completed. +// TestWorker_AbortsAfterConsecutiveFailures: a persistently failing +// embedder trips MaxConsecutiveFailures and RunOnce returns an error, +// leaving the messages unstamped (so the next run re-finds them). 
+func TestWorker_AbortsAfterConsecutiveFailures(t *testing.T) { + f := newWorkerFixture(t, 10) + f.FakeClient.FailNext(1000) w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 2, + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + BatchSize: 2, + MaxConsecutiveFailures: 3, }) res, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.Error(t, err, "expected cap-trip error on misconfigured endpoint") - assert.Equal(0, res.Succeeded, "no embeds during all-drop") - requirepkg.ErrorContains(t, err, "consecutive failures", "expected cap-trip error") - requirepkg.ErrorContains(t, err, "bad-api-key", "expected original 4xx body in error") - // Critical: rows must NOT have been silently deleted. They - // should still be in pending_embeddings (released back, not - // Completed) so a corrected config can re-claim them on the - // next run. - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 4) + requirepkg.Error(t, err, "expected abort") + requirepkg.ErrorContains(t, err, "consecutive failures") + assertpkg.Equal(t, 0, res.Succeeded, "nothing should succeed") + // All messages left unstamped (next scan re-finds them). + assertpkg.Equal(t, 10, countMissing(t, f.MainDB, int64(f.BuildingGen)), "still missing") } -// TestWorker_SingletonBatch_4xx_NoSilentDelete verifies that a -// BatchSize=1 claim returning ErrPermanent4xx does NOT silently -// delete the row. The drain walks the single ID, defers the drop, -// finds embedded == 0, releases the row back to the queue, and -// returns the wrapped 4xx. The caller sees errors.Is(err, -// ErrPermanent4xx) so the drain return doesn't double-count, but -// the upstream batch failure still increments consecutiveFailures -// once per iteration. With MaxConsecutiveFailures=3 the cap trips -// after 3 iterations and the row remains in pending_embeddings. 
-func TestWorker_SingletonBatch_4xx_NoSilentDelete(t *testing.T) { - f := newWorkerFixture(t, 1) - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - return nil, fmt.Errorf("embed: HTTP 400: bad: %w", ErrPermanent4xx) - } +// TestWorker_FailureLeavesUnstampedThenRecovers: a transient failure on +// the first attempt leaves rows unstamped; the same RunOnce re-scans them +// (embedder now healthy) and completes them. Idempotent re-do. +func TestWorker_FailureLeavesUnstampedThenRecovers(t *testing.T) { + f := newWorkerFixture(t, 3) + // Fail the first batch, then succeed. + f.FakeClient.FailNext(1) w := NewWorker(WorkerDeps{ Backend: f.Backend, VectorsDB: f.VectorsDB, MainDB: f.MainDB, + Store: f.Store, Client: f.FakeClient, - BatchSize: 1, - MaxConsecutiveFailures: 3, + BatchSize: 3, + MaxConsecutiveFailures: 5, }) - _, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.Error(t, err, "expected abort after cap") - requirepkg.ErrorContains(t, err, "consecutive failures") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 1) + // Single run: the only batch fails once, then the loop re-scans the + // same (unstamped) ids and succeeds. + res, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + assertpkg.Equal(t, 3, res.Succeeded, "Succeeded after recovery") + assertpkg.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "missing after recovery") } -// TestWorker_DownshiftDrain_CtxCancelMidDrain verifies that -// cancellation during the drain returns ctx.Err() and the remaining -// claimed rows are not lost (they remain in pending_embeddings to be -// recovered by ReclaimStale). -func TestWorker_DownshiftDrain_CtxCancelMidDrain(t *testing.T) { +// TestWorker_RespectsContextCancel: a cancelled context aborts RunOnce. 
+func TestWorker_RespectsContextCancel(t *testing.T) { f := newWorkerFixture(t, 3) ctx, cancel := context.WithCancel(context.Background()) - var singletonCalls int - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: %w", ErrPermanent4xx) - } - singletonCalls++ - if singletonCalls == 2 { - cancel() - return nil, context.Canceled - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) + cancel() + w := newTestWorker(f, 2) _, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.ErrorIs(t, err, context.Canceled) - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 2) + requirepkg.Error(t, err, "expected context error") } -// embedRunRow captures the embed_runs lifecycle columns for assertions. -type embedRunRow struct { - startedAt int64 - endedAt sql.NullInt64 - claimed int64 - succeeded int64 - failed int64 - errText sql.NullString +// TestWorker_MissingMessagesSkipMarked: despite the name, this covers the +// present-but-empty case: message 2 is re-inserted blank with no body row +// and must be skip-marked so countMissing reports zero after RunOnce. +func TestWorker_MissingMessagesSkipMarked(t *testing.T) { + f := newWorkerFixture(t, 3) + // Delete message 2's row; it is re-inserted below as a blank row with + // embed_gen NULL. + _, err := f.MainDB.Exec(`DELETE FROM messages WHERE id = 2`) + requirepkg.NoError(t, err, "delete msg 2") + // A truly missing id (returned by the scan but absent from messages) + // cannot be simulated this way: once the row is deleted there is + // nothing left to scan. So re-insert id 2 with an empty subject and + // NULL embed_gen, then drop its body row so it is presumably seen as + // present-but-empty (TODO confirm). This test covers the empty case. 
+ _, err = f.MainDB.Exec( + `INSERT INTO messages (id, subject, embed_gen) VALUES (2, '', NULL)`) + requirepkg.NoError(t, err, "reinsert msg 2 empty") + _, err = f.MainDB.Exec(`DELETE FROM message_bodies WHERE message_id = 2`) + requirepkg.NoError(t, err, "delete body 2") + + w := newTestWorker(f, 8) + _, err = w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + // Empty message 2 must be skip-marked, not re-found. + assertpkg.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "all stamped") } -// readSingleEmbedRun returns the sole embed_runs row for gen, requiring -// exactly one to exist. -func readSingleEmbedRun(t *testing.T, db *sql.DB, gen int64) embedRunRow { - t.Helper() - var n int - requirepkg.NoError(t, - db.QueryRow(`SELECT COUNT(*) FROM embed_runs WHERE generation_id = ?`, gen).Scan(&n), - "count embed_runs") - requirepkg.Equal(t, 1, n, "exactly one embed_runs row must be opened per RunOnce") - var r embedRunRow - requirepkg.NoError(t, - db.QueryRow(`SELECT started_at, ended_at, claimed, succeeded, failed, error - FROM embed_runs WHERE generation_id = ?`, gen). - Scan(&r.startedAt, &r.endedAt, &r.claimed, &r.succeeded, &r.failed, &r.errText), - "read embed_runs row") - return r -} +// TestWorker_EmptyMessageSkipMarkedNotReprocessed: a message that +// preprocesses to empty is stamped (skip-marker) and a second run does +// NOT re-process it (the embedder is not called again for it). +func TestWorker_EmptyMessageSkipMarkedNotReprocessed(t *testing.T) { + require := requirepkg.New(t) + f := newWorkerFixture(t, 1) + // Blank out the only message so it preprocesses to empty. 
+ _, err := f.MainDB.Exec(`UPDATE messages SET subject = '' WHERE id = 1`) + require.NoError(err, "blank subject") + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = '' WHERE message_id = 1`) + require.NoError(err, "blank body") + + w := newTestWorker(f, 8) + _, err = w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 1") + require.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "skip-marked") + + callsBefore := f.FakeClient.calls + _, err = w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 2") + // Second run finds nothing (the empty message is stamped), so the + // embedder is not called again. + assertpkg.Equal(t, callsBefore, f.FakeClient.calls, "no re-processing of skip-marked message") +} + +func TestWorker_EmptyMessageDeletesExistingEmbeddingBeforeSkipMark(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + f := newWorkerFixture(t, 1) + w := newTestWorker(f, 1) -// TestWorker_EmbedRun_LifecycleHappyPath asserts that a successful RunOnce -// opens exactly one embed_runs row and stamps it on exit: started_at set, -// ended_at non-NULL, error NULL, and counters matching the result. 
-func TestWorker_EmbedRun_LifecycleHappyPath(t *testing.T) { - f := newWorkerFixture(t, 3) - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) - res, err := w.RunOnce(context.Background(), f.BuildingGen) - requirepkg.NoError(t, err, "RunOnce") + _, err := w.RunOnce(ctx, f.BuildingGen) + require.NoError(err, "initial RunOnce") + + embedded, err := f.Backend.EmbeddedMessageCount(ctx, f.BuildingGen) + require.NoError(err, "EmbeddedMessageCount before empty") + require.Equal(int64(1), embedded, "precondition: message has an embedding") + + _, err = f.MainDB.Exec(`UPDATE messages SET subject = '', embed_gen = NULL WHERE id = 1`) + require.NoError(err, "blank subject and invalidate") + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = '', body_html = '' WHERE message_id = 1`) + require.NoError(err, "blank body") + + res, err := w.RunBackstop(ctx, f.BuildingGen) + require.NoError(err, "RunBackstop after message became empty") + assert.Equal(0, res.Succeeded, "empty message is skip-marked, not embedded") + assert.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "empty message is covered") - r := readSingleEmbedRun(t, f.VectorsDB, int64(f.BuildingGen)) - assertpkg.Positive(t, r.startedAt, "started_at must be stamped") - assertpkg.True(t, r.endedAt.Valid, "ended_at must be stamped on clean exit") - assertpkg.False(t, r.errText.Valid, "error must be NULL on success") - assertpkg.Equal(t, int64(res.Succeeded), r.succeeded, "succeeded counter") - assertpkg.Equal(t, int64(3), r.succeeded, "all three messages embedded") + embedded, err = f.Backend.EmbeddedMessageCount(ctx, f.BuildingGen) + require.NoError(err, "EmbeddedMessageCount after empty") + assert.Equal(int64(0), embedded, "empty skip must not leave a counted embedding") + + stats, err := f.Backend.Stats(ctx, f.BuildingGen) + require.NoError(err, "Stats after empty") + assert.Equal(int64(0), stats.EmbeddingCount, 
"empty skip must remove stale vector rows") + + hits, err := f.Backend.Search(ctx, f.BuildingGen, []float32{1, 0, 0, 0}, 10, vector.Filter{}) + require.NoError(err, "Search after empty") + assert.Empty(hits, "empty skip must not leave the message searchable") } -// TestWorker_EmbedRun_FinalizedOnCancellation pins embed-queue-concurrency-1: -// even when RunOnce exits because ctx was cancelled mid-drain, the -// embed_runs row must be finalized (ended_at set, error populated) rather -// than left open forever. This FAILS against the pre-fix code that ran the -// finalize UPDATE on the already-cancelled ctx, and PASSES once finalize -// runs on a detached context. -func TestWorker_EmbedRun_FinalizedOnCancellation(t *testing.T) { - f := newWorkerFixture(t, 3) - ctx, cancel := context.WithCancel(context.Background()) - var singletonCalls int - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: %w", ErrPermanent4xx) - } - singletonCalls++ - if singletonCalls == 2 { - cancel() - return nil, context.Canceled - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) +func TestWorker_EmptyMessageCASMissDoesNotDeleteExistingEmbedding(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + f := newWorkerFixture(t, 2) + w := newTestWorker(f, 2) + _, err := w.RunOnce(ctx, f.BuildingGen) - requirepkg.ErrorIs(t, err, context.Canceled) + require.NoError(err, "initial RunOnce") + + _, err = f.MainDB.Exec(`UPDATE messages SET subject = '', embed_gen = NULL WHERE id = 1`) + require.NoError(err, "blank subject and invalidate msg 1") + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = '', body_html = '' WHERE message_id = 1`) + require.NoError(err, "blank body msg 1") + _, err = 
f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 2`) + require.NoError(err, "invalidate msg 2 to force mixed batch embed") + + f.FakeClient.preReturn = func() { + _, err := f.MainDB.Exec(`UPDATE messages SET subject = ?, embed_gen = NULL WHERE id = 1`, "repaired subject") + require.NoError(err, "race update subject") + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, "repaired body") + require.NoError(err, "race update body") + _, err = f.MainDB.Exec(`UPDATE messages SET last_modified = '2099-01-01 00:00:00' WHERE id = 1`) + require.NoError(err, "force CAS token change") + } + res, err := w.RunBackstop(ctx, f.BuildingGen) + f.FakeClient.preReturn = nil + require.NoError(err, "RunBackstop with skip CAS miss") + assert.Equal(1, res.Succeeded, "only msg 2 is embedded and stamped") + assert.Equal(1, countMissing(t, f.MainDB, int64(f.BuildingGen)), "CAS-missed msg 1 remains recoverable") - r := readSingleEmbedRun(t, f.VectorsDB, int64(f.BuildingGen)) - assertpkg.True(t, r.endedAt.Valid, - "ended_at must be stamped even when RunOnce exits via cancellation") - assertpkg.True(t, r.errText.Valid, - "error must record the cancellation cause, not be left NULL") + var vectorRows int64 + err = f.VectorsDB.QueryRowContext(ctx, + `SELECT COUNT(DISTINCT message_id) FROM embeddings WHERE generation_id = ?`, + int64(f.BuildingGen)).Scan(&vectorRows) + require.NoError(err, "raw vector row count after skip CAS miss") + assert.Equal(int64(2), vectorRows, "CAS-missed skip must not delete existing vectors") } -// retiredUpsertBackend wraps a real vector.Backend but forces every -// Upsert to return vector.ErrGenerationRetired, simulating a generation -// that was retired out from under a stale worker mid-run. All other -// methods delegate to the embedded backend. 
-type retiredUpsertBackend struct { - vector.Backend +// TestWorker_FallsBackToHTMLWhenBodyTextEmpty: an HTML-only message is +// embedded via stripped HTML rather than a subject-only embedding. +func TestWorker_FallsBackToHTMLWhenBodyTextEmpty(t *testing.T) { + require := requirepkg.New(t) + f := newWorkerFixture(t, 1) + _, err := f.MainDB.Exec(`UPDATE messages SET subject = 'hi' WHERE id = 1`) + require.NoError(err) + _, err = f.MainDB.Exec( + `UPDATE message_bodies SET body_text = '', body_html = ? WHERE message_id = 1`, + "

distinctive html body content

") + require.NoError(err) + + w := newTestWorker(f, 1) + _, err = w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce") + joined := strings.Join(f.FakeClient.LastInputs, " ") + assertpkg.Contains(t, joined, "distinctive html body content", "HTML fallback text embedded") } -func (b retiredUpsertBackend) Upsert(_ context.Context, gen vector.GenerationID, _ []vector.Chunk) error { - return fmt.Errorf("%w: %d", vector.ErrGenerationRetired, gen) +// TestWorker_RuneCountUsedForSourceCharLen: SourceCharLen reflects rune +// count, not byte count, for multibyte input. +func TestWorker_RuneCountUsedForSourceCharLen(t *testing.T) { + require := requirepkg.New(t) + f := newWorkerFixture(t, 1) + body := strings.Repeat("é", 50) // 50 runes, 100 bytes + _, err := f.MainDB.Exec(`UPDATE messages SET subject = '' WHERE id = 1`) + require.NoError(err) + _, err = f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, body) + require.NoError(err) + + w := newTestWorker(f, 1) + _, err = w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce") + + var srcLen int + require.NoError(f.VectorsDB.QueryRow( + `SELECT source_char_len FROM embeddings WHERE message_id = 1 AND chunk_index = 0`).Scan(&srcLen)) + assertpkg.LessOrEqual(t, srcLen, utf8.RuneCountInString(body), "source_char_len in runes") + assertpkg.Positive(t, srcLen, "non-zero") } -// TestWorker_RetiredGenerationDrainsWithoutHardError pins -// concurrency-locks-1/2: when Backend.Upsert returns -// vector.ErrGenerationRetired, the worker must treat it as a benign -// "drop the batch" signal — NOT a hard failure. RunOnce must return nil, -// the queue must fully drain (the retired rows are token-dropped via -// Complete), and the embedding client must be invoked at most once per -// batch (no re-embed loop burning API cost up to MaxConsecutiveFailures). 
-// -// Revert-proof: without the ErrGenerationRetired guards in RunOnce's -// Upsert path, the worker would Release the rows, re-Claim them, and -// re-embed identically until MaxConsecutiveFailures, then return a -// spurious "embed worker aborting" error — failing both the nil-error -// and the embed-call-count assertions. -func TestWorker_RetiredGenerationDrainsWithoutHardError(t *testing.T) { +// TestWorker_SplitsChunkInputsAcrossSubBatches: a message whose chunk +// fan-out exceeds BatchSize is embedded across multiple sub-batched Embed +// calls (none larger than BatchSize). +func TestWorker_SplitsChunkInputsAcrossSubBatches(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 3) + f := newWorkerFixture(t, 1) + body := strings.Repeat("lorem ipsum dolor sit amet consectetur adipiscing elit. ", 40) + _, err := f.MainDB.Exec(`UPDATE message_bodies SET body_text = ? WHERE message_id = 1`, body) + require.NoError(err, "update body") - var embedCalls int + const batchSize = 4 + var sizes []int f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - embedCalls++ + sizes = append(sizes, len(inputs)) out := make([][]float32, len(inputs)) for i := range inputs { v := make([]float32, 4) - v[0] = 1 + v[0] = float32(len(inputs[i])%4 + 1) out[i] = v } return out, nil } - w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 1, // one message per batch → at most one embed call each - MaxConsecutiveFailures: 5, + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + MaxInputChars: 80, + BatchSize: batchSize, }) + _, err = w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce") + require.GreaterOrEqual(len(sizes), 2, "expected >= 2 sub-batches, got %v", sizes) + for i, n := range sizes { + 
assert.LessOrEqualf(n, batchSize, "sub-batch %d size", i) + assert.NotZerof(n, "sub-batch %d empty", i) + } +} - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce must return nil for a retired generation (benign drop)") - assert.Equal(0, res.Failed, "retired-generation drop must not count as a failure") - assert.Equal(0, res.Succeeded, "nothing was actually embedded (rows dropped)") +// TestWorker_Progress fires the progress callback per handled batch with +// the configured TotalPending denominator. +func TestWorker_Progress(t *testing.T) { + f := newWorkerFixture(t, 5) + var reports []ProgressReport + w := NewWorker(WorkerDeps{ + Backend: f.Backend, + VectorsDB: f.VectorsDB, + MainDB: f.MainDB, + Store: f.Store, + Client: f.FakeClient, + BatchSize: 2, + TotalPending: 5, + Progress: func(p ProgressReport) { reports = append(reports, p) }, + }) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + requirepkg.NotEmpty(t, reports, "progress reports") + for i, p := range reports { + assertpkg.Equalf(t, 5, p.TotalPending, "report[%d].TotalPending", i) + } + assertpkg.Equal(t, 5, reports[len(reports)-1].Done, "final Done") +} - // Queue fully drained: every retired row was token-dropped via Complete. - assert.Equal(0, countAvailable(t, f.VectorsDB, int64(f.BuildingGen)), "available after drain") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) +// --- Watermark behavior --- - // At most one embed call per batch (3 messages, BatchSize=1 → exactly 3). - // Without the guard the worker would re-embed each row up to - // MaxConsecutiveFailures times before aborting. - assert.LessOrEqualf(embedCalls, 3, "embed client invoked %d times; expected <= 1 per batch (no re-embed loop)", embedCalls) +// TestWorker_AdvancesWatermark: after a successful run the per-gen +// watermark is advanced to the highest scanned id. 
+func TestWorker_AdvancesWatermark(t *testing.T) { + f := newWorkerFixture(t, 5) + w := newTestWorker(f, 2) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + assertpkg.Equal(t, int64(5), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), "watermark at max id") } -// TestWorker_RetiredGenerationDrainsViaDownshift covers the downshift -// drain arm of concurrency-locks-2: a multi-message batch trips -// ErrPermanent4xx (forcing the singleton downshift), each singleton then -// embeds fine but Upsert returns ErrGenerationRetired. The drain must -// treat the retired generation as a benign drop (token-drop + continue) -// rather than wrapping it into a non-4xx error that hard-aborts RunOnce. -func TestWorker_RetiredGenerationDrainsViaDownshift(t *testing.T) { +// TestWorker_WatermarkLossHarmless: dropping the watermark and rerunning +// is a no-op (idempotent) — already-stamped rows are skipped by the scan. +func TestWorker_WatermarkLossHarmless(t *testing.T) { require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 3) - - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - // Force the downshift to BatchSize=1. - return nil, fmt.Errorf("embed: HTTP 400: too long: %w", ErrPermanent4xx) - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } + f := newWorkerFixture(t, 4) + w := newTestWorker(f, 4) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 1") + require.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "all stamped") - w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, // multi-message batch → 4xx → downshift - MaxConsecutiveFailures: 5, - }) + // Simulate watermark loss. 
+ _, err = f.VectorsDB.Exec(`DELETE FROM embed_watermark`) + require.NoError(err, "drop watermark") - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce must return nil when the generation is retired mid-drain") - assert.Equal(0, res.Succeeded, "nothing durably embedded (rows dropped)") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) + callsBefore := f.FakeClient.calls + res, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 2 (watermark lost)") + assertpkg.Equal(t, 0, res.Succeeded, "nothing to re-embed") + assertpkg.Equal(t, callsBefore, f.FakeClient.calls, "no re-embed after watermark loss") } -// TestWorker_RetiredGeneration_DrainsFullClaimedBatch pins cr2-5: the -// main-batch ErrGenerationRetired arm must Complete the FULL claimed batch -// (embedded + missing + empty), not just the embedded subset. Here a batch of -// two is claimed; msg 2 was deleted from the main DB so it reaches embedBatch -// as "missing". msg 1 embeds, but Upsert reports the generation retired, so -// the whole batch must be benignly dropped. Before the fix only msg 1 was -// Completed, stranding msg 2 claimed until ReclaimStale and permanently -// inflating PendingCount. -// -// Revert-proof: dropping the full-batch Complete back to eb.embeddedIDs makes -// assertPending(...,0) fail with one stranded row for the missing message. -func TestWorker_RetiredGeneration_DrainsFullClaimedBatch(t *testing.T) { +// TestWorker_BackstopCatchesSubWatermarkStraggler: a message left +// unstamped BELOW the persisted watermark is invisible to RunOnce +// (watermark-bounded) but caught by RunBackstop (full scan from 0). 
+func TestWorker_BackstopCatchesSubWatermarkStraggler(t *testing.T) { require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 2) + f := newWorkerFixture(t, 5) + w := newTestWorker(f, 5) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 1") + require.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "all stamped") + require.Equal(int64(5), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), "watermark") - const missingID = 2 - _, err := f.MainDB.ExecContext(ctx, `DELETE FROM messages WHERE id = ?`, missingID) - require.NoError(err, "delete missing from main") - _, err = f.MainDB.ExecContext(ctx, `DELETE FROM message_bodies WHERE message_id = ?`, missingID) - require.NoError(err, "delete missing body") + // Manually un-stamp message 2 (a sub-watermark straggler) — as if a + // prior run dropped it during a transient fault while the watermark + // advanced past it. + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 2`) + require.NoError(err, "unstamp msg 2") - w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 2, // both rows claimed in one batch - MaxConsecutiveFailures: 5, - }) + // RunOnce resumes from the watermark (id > 5) and does NOT see id 2. + res, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce 2") + assertpkg.Equal(t, 0, res.Succeeded, "RunOnce misses sub-watermark straggler") + assertpkg.Equal(t, 1, countMissing(t, f.MainDB, int64(f.BuildingGen)), "straggler still missing") - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce must return nil for a retired generation (benign drop)") - assert.Equal(0, res.Failed, "retired drop must not count as a failure") - // The full claimed batch (embedded msg 1 + missing msg 2) must be drained. 
- assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) - assert.Equal(0, countAvailable(t, f.VectorsDB, int64(f.BuildingGen)), "no rows left claimed or available") + // Backstop scans from 0 and catches it. + res, err = w.RunBackstop(context.Background(), f.BuildingGen) + require.NoError(err, "RunBackstop") + assertpkg.Equal(t, 1, res.Succeeded, "backstop embeds the straggler") + assertpkg.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "straggler covered") } -// TestWorker_RetiredDownshift_MixedWith4xxDropsCleanly pins cr2-7. A batch of -// three trips ErrPermanent4xx (forcing the singleton downshift). In the -// drain, msg 1 keeps returning 4xx (deferred), while msg 2 and msg 3 embed -// fine but their Upsert reports the generation retired. The drain therefore -// ends with embedded==0 and a non-empty deferredDrops set. The retiredObserved -// flag must make the worker token-DROP the deferred 4xx row and return nil — -// NOT take the embedded==0 all-drop Release path (which would orphan the row -// for a generation no future run re-claims, then re-embed/hard-abort). -// -// Revert-proof: removing the retiredObserved branch makes downshiftDrain take -// the embedded==0 Release+ErrPermanent4xx path; RunOnce then re-claims and -// re-embeds the released row each loop until MaxConsecutiveFailures, returning -// a non-nil "consecutive failures" abort and leaving the row in -// pending_embeddings — failing the nil-error, res.Failed==0, and -// assertPending(...,0) assertions below. -func TestWorker_RetiredDownshift_MixedWith4xxDropsCleanly(t *testing.T) { +// TestWorker_BackstopDoesNotPersistWatermark: the backstop must not +// touch the persisted watermark (it scans from 0 by design). 
+func TestWorker_BackstopDoesNotPersistWatermark(t *testing.T) { require := requirepkg.New(t) - assert := assertpkg.New(t) - ctx := context.Background() f := newWorkerFixture(t, 3) + w := newTestWorker(f, 3) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce") + wmBefore := readWatermark(t, f.VectorsDB, int64(f.BuildingGen)) + + // Un-stamp one and run the backstop; the watermark must be unchanged. + _, err = f.MainDB.Exec(`UPDATE messages SET embed_gen = NULL WHERE id = 1`) + require.NoError(err) + _, err = w.RunBackstop(context.Background(), f.BuildingGen) + require.NoError(err, "RunBackstop") + assertpkg.Equal(t, wmBefore, readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), "watermark unchanged by backstop") +} + +// TestWorker_ReclaimStaleIsNoOp: ReclaimStale always returns (0, nil) +// under the scan-and-fill design (kept for the EmbedRunner interface). +func TestWorker_ReclaimStaleIsNoOp(t *testing.T) { + f := newWorkerFixture(t, 1) + w := newTestWorker(f, 1) + n, err := w.ReclaimStale(context.Background()) + requirepkg.NoError(t, err, "ReclaimStale") + assertpkg.Equal(t, 0, n, "no-op returns 0") +} +// --- Downshift / 4xx behavior --- + +// TestWorker_Downshift_MessageSpecific4xxStampedDropped: when a batch +// 4xxs but singletons embed, the failing message is a message-specific +// 4xx and gets stamped (dropped) so the run completes. +func TestWorker_Downshift_MessageSpecific4xxStampedDropped(t *testing.T) { + f := newWorkerFixture(t, 3) + var singletonSeen int f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { if len(inputs) > 1 { - // Force the downshift to BatchSize=1. - return nil, fmt.Errorf("embed: HTTP 400: batch: %w", ErrPermanent4xx) + return nil, fmt.Errorf("embed: HTTP 400: too long: %w", ErrPermanent4xx) } - // Singleton for msg 1 keeps returning 4xx → deferred drop candidate. 
- if strings.Contains(inputs[0], "msg 1") { - return nil, fmt.Errorf("embed: HTTP 400: msg-specific: %w", ErrPermanent4xx) + singletonSeen++ + if singletonSeen == 2 { + return nil, fmt.Errorf("embed: HTTP 400: blocked: %w", ErrPermanent4xx) } - // msg 2 / msg 3 embed fine; their Upsert will report retired. v := make([]float32, 4) v[0] = 1 return [][]float32{v}, nil } - - w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, // multi-message batch → 4xx → downshift - MaxConsecutiveFailures: 5, - }) - - res, err := w.RunOnce(ctx, f.BuildingGen) - require.NoError(err, "RunOnce must return nil: retirement is benign, deferred 4xx row is dropped not released") - assert.Equal(0, res.Failed, "retired-generation drain must not count failures") - assert.Equal(0, res.Succeeded, "nothing durably embedded (all rows dropped)") - // No orphaned rows: the deferred 4xx singleton was token-DROPPED, not - // released back to the queue for a generation no future run re-claims. - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 0) + w := newTestWorker(f, 3) + res, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.NoError(t, err, "RunOnce") + assertpkg.Equal(t, 2, res.Succeeded, "Succeeded") + // All three stamped (2 embedded + 1 message-specific drop). + assertpkg.Equal(t, 0, countMissing(t, f.MainDB, int64(f.BuildingGen)), "all stamped") } -// TestWorker_RetiredDrainCompleteFailure_Surfaces pins cr2-6 for the main-loop -// retired arm: when the retired-gen drop's Complete DELETE fails at the DB -// level and those are the last queue rows, RunOnce must NOT report a clean -// (nil) run. The failure is routed through the same orphan-drain surfacing -// channel so the empty-claim exit returns a non-nil error referencing -// ReclaimStale, rather than swallowing it with a log line. 
-func TestWorker_RetiredDrainCompleteFailure_Surfaces(t *testing.T) { - require := requirepkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 1) - - _, err := f.VectorsDB.ExecContext(ctx, ` - CREATE TRIGGER block_pending_delete_retired - BEFORE DELETE ON pending_embeddings - BEGIN - SELECT RAISE(FAIL, 'simulated complete failure during retired drop'); - END`) - require.NoError(err, "install trigger") - +// TestWorker_Downshift_AllDropNoSilentDelete: a fully misconfigured +// endpoint (every input 4xx) must NOT stamp/drop any message and must +// trip the failure cap, leaving the rows unstamped for retry. +func TestWorker_Downshift_AllDropNoSilentDelete(t *testing.T) { + f := newWorkerFixture(t, 4) + f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { + return nil, fmt.Errorf("embed: HTTP 401: bad-api-key: %w", ErrPermanent4xx) + } w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, + Backend: f.Backend, VectorsDB: f.VectorsDB, MainDB: f.MainDB, + Store: f.Store, Client: f.FakeClient, - BatchSize: 1, - MaxConsecutiveFailures: 5, + BatchSize: 4, + MaxConsecutiveFailures: 2, }) - - _, err = w.RunOnce(ctx, f.BuildingGen) - require.Error(err, "Complete failure during retired drop must be surfaced, not swallowed") - require.ErrorContains(err, "ReclaimStale", "caller must learn recovery is automatic") - // The row stays claimed (the trigger blocked the DELETE). - var claimed int - require.NoError(f.VectorsDB.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND claim_token IS NOT NULL`, - int64(f.BuildingGen)).Scan(&claimed), "count claimed") - require.Equal(1, claimed, "retired-drop Complete failure leaves the row claimed") + _, err := w.RunOnce(context.Background(), f.BuildingGen) + requirepkg.Error(t, err, "expected abort") + // No message stamped — the misconfigured endpoint did not silently + // drop work. 
+ assertpkg.Equal(t, 4, countMissing(t, f.MainDB, int64(f.BuildingGen)), "nothing stamped") } -// TestWorker_RetiredDownshiftCompleteFailure_Surfaces pins cr2-6 for the -// downshift retired arm: a Complete failure while dropping a retired-gen -// singleton in downshiftDrain must surface from RunOnce (non-nil error) -// rather than being logged and lost. -func TestWorker_RetiredDownshiftCompleteFailure_Surfaces(t *testing.T) { +// TestWorker_Downshift_Non4xxDoesNotStrandStraggler proves the watermark +// is NOT advanced past an unstamped straggler when a downshift hits a +// NON-4xx (transient) error AFTER an earlier singleton already stamped. +// +// Setup: a 3-message batch 4xxs as a whole (triggering the downshift to +// BatchSize=1); then singleton id 1 embeds (and is stamped) while singleton +// id 2 returns a NON-4xx error. The old code advanced the watermark to +// batchMax (3), so subsequent RunOnce scans (id > 3) would skip ids 2 and 3 +// forever — only the MANUAL-only backstop could recover them. The fix +// advances only to the highest contiguously-stamped id (1). +// +// Asserts: (a) the persisted watermark is 1, not 3; (b) ids 2 and 3 are +// still missing; (c) a subsequent RunOnce with a healthy embedder (NO +// backstop) re-finds and embeds them, reaching full coverage. +func TestWorker_Downshift_Non4xxDoesNotStrandStraggler(t *testing.T) { require := requirepkg.New(t) - ctx := context.Background() - f := newWorkerFixture(t, 2) + assert := assertpkg.New(t) + f := newWorkerFixture(t, 3) + // First pass: whole batch 4xxs (forces downshift); singleton id 1 + // embeds; singleton id 2 returns a transient (NON-4xx) error. f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: batch: %w", ErrPermanent4xx) + // Whole-batch call — force the downshift. + return nil, fmt.Errorf("embed: HTTP 400: batch too long: %w", ErrPermanent4xx) } - v := make([]float32, 4) + // Singleton.
Message 2's preprocessed text contains "body 2". + if strings.Contains(inputs[0], "body 2") { + // Transient error — NOT a 4xx. Must leave id 2 unstamped and + // must not let the watermark jump past it. + return nil, errors.New("simulated transient embed failure for msg 2") + } + v := make([]float32, f.FakeClient.dim) v[0] = 1 return [][]float32{v}, nil } - - _, err := f.VectorsDB.ExecContext(ctx, ` - CREATE TRIGGER block_pending_delete_downshift - BEFORE DELETE ON pending_embeddings - BEGIN - SELECT RAISE(FAIL, 'simulated complete failure during downshift retired drop'); - END`) - require.NoError(err, "install trigger") - w := NewWorker(WorkerDeps{ - Backend: retiredUpsertBackend{Backend: f.Backend}, + Backend: f.Backend, VectorsDB: f.VectorsDB, MainDB: f.MainDB, + Store: f.Store, Client: f.FakeClient, - BatchSize: 2, // multi-message → 4xx → downshift, then retired Upsert + BatchSize: 3, MaxConsecutiveFailures: 5, }) + _, err := w.RunOnce(context.Background(), f.BuildingGen) + require.Error(err, "expected a transient drain error") + + // (a) Watermark must stay at the contiguously-stamped id (1), NOT + // batchMax (3). + assert.Equal(int64(1), readWatermark(t, f.VectorsDB, int64(f.BuildingGen)), + "watermark not advanced past the unstamped straggler") + // (b) ids 2 and 3 are still unstamped (1 is stamped). + assert.Equal(2, countMissing(t, f.MainDB, int64(f.BuildingGen)), "stragglers still missing") + + // (c) A subsequent RunOnce (NO backstop) with a healthy embedder + // re-finds the stragglers (scan id > watermark==1) and embeds them. 
+ f.FakeClient.OnEmbed = nil // restore default healthy behavior + res, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "second RunOnce") + assert.Equal(2, res.Succeeded, "stragglers embedded on retry") + assert.Equal(0, countMissing(t, f.MainDB, int64(f.BuildingGen)), + "coverage complete without the manual backstop") +} + +// --- embed_runs lifecycle --- + +// TestWorker_EmbedRunLifecycle: a successful RunOnce opens exactly one +// embed_runs row and stamps ended_at + counters on it. +func TestWorker_EmbedRunLifecycle(t *testing.T) { + require := requirepkg.New(t) + f := newWorkerFixture(t, 2) + w := newTestWorker(f, 2) + res, err := w.RunOnce(context.Background(), f.BuildingGen) + require.NoError(err, "RunOnce") - _, err = w.RunOnce(ctx, f.BuildingGen) - require.Error(err, "downshift retired-drop Complete failure must surface from RunOnce") + var n int + require.NoError(f.VectorsDB.QueryRow(`SELECT COUNT(*) FROM embed_runs`).Scan(&n)) + require.Equal(1, n, "exactly one embed_runs row") + var ended, succeeded int + require.NoError(f.VectorsDB.QueryRow( + `SELECT COALESCE(ended_at, 0), succeeded FROM embed_runs LIMIT 1`).Scan(&ended, &succeeded)) + assertpkg.NotZero(t, ended, "ended_at stamped") + assertpkg.Equal(t, res.Succeeded, succeeded, "succeeded counter") } -func TestWorker_DownshiftDrain_TransientErrorReleasesRemainingAndErrors(t *testing.T) { +// --- retired generation --- + +// TestWorker_RetiredGenerationStopsCleanly: if the generation is retired +// mid-run, Upsert returns ErrGenerationRetired and RunOnce returns nil +// (benign stop), leaving no embed_gen stamps for the retired gen. 
+func TestWorker_RetiredGenerationStopsCleanly(t *testing.T) { require := requirepkg.New(t) f := newWorkerFixture(t, 3) - var singletonCalls int - f.FakeClient.OnEmbed = func(inputs []string) ([][]float32, error) { - if len(inputs) > 1 { - return nil, fmt.Errorf("embed: HTTP 400: %w", ErrPermanent4xx) - } - singletonCalls++ - if singletonCalls == 2 { - return nil, errors.New("temporary network failure") - } - v := make([]float32, 4) - v[0] = 1 - return [][]float32{v}, nil - } - w := NewWorker(WorkerDeps{ - Backend: f.Backend, - VectorsDB: f.VectorsDB, - MainDB: f.MainDB, - Client: f.FakeClient, - BatchSize: 3, - }) + // Retire the building generation directly so the next Upsert observes + // state='retired'. + _, err := f.VectorsDB.Exec( + `UPDATE index_generations SET state = 'retired' WHERE id = ?`, int64(f.BuildingGen)) + require.NoError(err, "retire gen") + w := newTestWorker(f, 3) res, err := w.RunOnce(context.Background(), f.BuildingGen) - require.Error(err, "expected transient mid-drain error") - require.ErrorContains(err, "temporary network failure") - require.Equal(1, res.Succeeded, "completed singleton before transient error") - assertPending(t, f.VectorsDB, int64(f.BuildingGen), 2) - require.Equal(2, countAvailable(t, f.VectorsDB, int64(f.BuildingGen)), "released rows") -} + require.NoError(err, "RunOnce must return nil for a retired generation (benign stop)") + assertpkg.Equal(t, 0, res.Succeeded, "nothing embedded into retired gen") + // No message stamped to the retired generation. + var stamped int + require.NoError(f.MainDB.QueryRow( + `SELECT COUNT(*) FROM messages WHERE embed_gen = ?`, int64(f.BuildingGen)).Scan(&stamped)) + assertpkg.Equal(t, 0, stamped, "no stamps for retired gen") +} + +// compile-time: *Worker satisfies the embed runner shape used elsewhere. 
+var _ interface { + RunOnce(ctx context.Context, gen vector.GenerationID) (RunResult, error) + RunBackstop(ctx context.Context, gen vector.GenerationID) (RunResult, error) + ReclaimStale(ctx context.Context) (int, error) +} = (*Worker)(nil) diff --git a/internal/vector/errors.go b/internal/vector/errors.go index 877536923..b4d5a5ecf 100644 --- a/internal/vector/errors.go +++ b/internal/vector/errors.go @@ -57,15 +57,6 @@ var ( // cannot delete the now-serving generation's embeddings. ErrRefuseRetireActive = errors.New("refusing to retire the active (serving) generation without force") - // ErrGenerationNotBuilding is returned by EnsureSeeded when the - // target generation is no longer in state='building' — e.g. a - // concurrent activation flipped it to active, or a retire call - // moved it to retired, between the caller's BuildingGeneration - // read and EnsureSeeded. Callers performing a resume can treat - // this as a retryable race and re-resolve the active/building - // state instead of aborting. 
- ErrGenerationNotBuilding = errors.New("generation is not in state=building") - // ErrEmbeddingTimeout is returned by the hybrid engine when the // embedding endpoint did not respond before the request context // was cancelled (typically because the HTTP server's per-request diff --git a/internal/vector/generations_test.go b/internal/vector/generations_test.go index 78d379dc9..37b80e7e0 100644 --- a/internal/vector/generations_test.go +++ b/internal/vector/generations_test.go @@ -57,10 +57,13 @@ func (f *fakeBackend) Stats(context.Context, GenerationID) (Stats, error) { func (f *fakeBackend) LoadVector(context.Context, int64) ([]float32, error) { return nil, errors.New("not implemented") } -func (f *fakeBackend) Close() error { return nil } -func (f *fakeBackend) EnsureSeeded(context.Context, GenerationID) error { +func (f *fakeBackend) ResetWatermarkBelow(context.Context, int64) error { return errors.New("not implemented") } +func (f *fakeBackend) EmbeddedMessageCount(context.Context, GenerationID) (int64, error) { + return 0, errors.New("not implemented") +} +func (f *fakeBackend) Close() error { return nil } func TestResolveActiveForFingerprint_Matches(t *testing.T) { b := &fakeBackend{active: &Generation{ID: 1, Fingerprint: "m:768:p1-111111"}} diff --git a/internal/vector/pgvector/backend.go b/internal/vector/pgvector/backend.go index f8df95b73..beec11b1d 100644 --- a/internal/vector/pgvector/backend.go +++ b/internal/vector/pgvector/backend.go @@ -39,11 +39,25 @@ type Options struct { // per-dimension HNSW index on first migration. Optional; if zero // the index is created on first CreateGeneration. Dimension int - // SkipMigrate suppresses the automatic schema migration on Open. - // Set this when the caller holds a read-only connection (e.g. the - // MCP server), where CREATE EXTENSION and DDL statements are - // rejected by PostgreSQL with SQLSTATE 25006. + // SkipMigrate suppresses the privileged CREATE EXTENSION + full + // migrate. 
A WRITABLE open still applies the (extension-less) schema so + // the one-time upgrade lands — read-only-ness is now signalled by + // ReadOnly, not SkipMigrate. Set this when the caller cannot run the + // privileged `CREATE EXTENSION vector` (e.g. a management command on a + // DB whose extension+schema were already installed by serve/build), but + // still wants the writable Open to bring up the extension-less schema + + // run the one-time upgrade backfill. The heavy migrate (CREATE EXTENSION + // + eager index build) is skipped; the extension-less schema apply is + // not (it is gated by ReadOnly instead). SkipMigrate bool + // ReadOnly indicates the connection cannot write — e.g. MCP + // store.OpenReadOnly, where CREATE EXTENSION and DDL statements are + // rejected by PostgreSQL with SQLSTATE 25006. When set, Open performs NO + // writes: no Migrate, no schema apply, no orphan reset, no upgrade + // backfill. Mirrors sqlitevec.Options.ReadOnly. A read-only open must set + // this (typically alongside SkipMigrate); a writable management open + // leaves it false so the upgrade backfill still lands. + ReadOnly bool // SkipExtension suppresses only the `CREATE EXTENSION IF NOT EXISTS // vector` step during migration while still creating the schema // tables and indexes. Set this when the vector extension is @@ -71,12 +85,57 @@ func Open(ctx context.Context, opts Options) (*Backend, error) { if opts.DB == nil { return nil, errors.New("pgvector.Open: Options.DB is required") } + b := &Backend{db: opts.DB} if !opts.SkipMigrate { + // serve / build / search: full migrate incl. CREATE EXTENSION (the + // extension step is gated by SkipExtension for managed PG). The eager + // per-dimension HNSW index is built here too. if err := Migrate(ctx, opts.DB, opts.Dimension, opts.SkipExtension); err != nil { return nil, fmt.Errorf("pgvector migrate: %w", err) } } - return &Backend{db: opts.DB}, nil + if !opts.ReadOnly { + // Writable open. 
When the heavy Migrate above was skipped (management + // sets SkipMigrate=true to avoid the privileged CREATE EXTENSION), still + // apply the schema WITHOUT the extension so embed_watermark etc. exist + // and the one-time upgrade backfill lands — parity with sqlitevec (which + // always Migrates vectors.db + backfills unless ReadOnly). + if opts.SkipMigrate { + if err := Migrate(ctx, opts.DB, opts.Dimension, true /* skipExtension */); err != nil { + return nil, fmt.Errorf("pgvector migrate (schema-only): %w", err) + } + } + // Orphaned-stamp reset (DB-recreate safety): clear embed_gen for any + // message whose stamp points to a generation id absent from + // index_generations. MUST run BEFORE BackfillEmbedGenForUpgrade. On PG + // messages and index_generations share one DB, so a true recreate means + // the whole DB was dropped (stamps and generations vanish together) — + // but the reset is kept for symmetry with sqlitevec and to defend + // against partial restores (e.g. messages restored, embeddings not). + // Not ledger-guarded: re-checks every writable Open; cheap + idempotent. + // Skipped on the ReadOnly path, where writes are rejected anyway. + if err := b.resetOrphanedEmbedGen(ctx); err != nil { + return nil, fmt.Errorf("reset orphaned embed_gen: %w", err) + } + // One-time upgrade backfill (Package A): stamp embed_gen for messages + // already embedded under the active generation so an upgraded archive + // is not reported as entirely missing (which would re-embed it all). + // Ledger-guarded, runs at most once. Skipped on the ReadOnly path, + // where writes are rejected anyway. + if err := b.BackfillEmbedGenForUpgrade(ctx); err != nil { + return nil, fmt.Errorf("embed_gen upgrade backfill: %w", err) + } + // Drop the dead pending_embeddings queue table now that the backfill has + // consulted it: the backfill preserves the table's legacy + // re-embed signal, then we drop it here. 
On the writable path only — + // a read-only Open never reaches here (this whole block is gated on + // !opts.ReadOnly), so the table (and its signal) survives until a + // writable open. Idempotent. + if err := b.dropDeadPendingEmbeddings(ctx); err != nil { + return nil, fmt.Errorf("drop dead pending_embeddings: %w", err) + } + } + return b, nil } // Close is a no-op for the pgvector backend: the *sql.DB handle is @@ -89,10 +148,14 @@ func (b *Backend) Close() error { return nil } // can retrieve it here instead of carrying the main handle separately. func (b *Backend) DB() *sql.DB { return b.db } -// CreateGeneration allocates a new building generation and seeds -// pending_embeddings with every currently-embeddable message in -// messages. Mirrors the sqlitevec semantics (§5.1): if a building row -// with the same fingerprint already exists, returns its id so a crashed +// CreateGeneration allocates a new building generation. Under the +// scan-and-fill design there is no pending_embeddings seed: the embed +// worker populates the generation by scanning messages whose embed_gen +// does not yet match it. seeded_at is stamped at creation as harmless +// vestigial metadata (it no longer gates a seed pass and no longer gates +// activation; coverage is the real gate). Mirrors the sqlitevec semantics +// (§5.1): if a building row with +// the same fingerprint already exists, returns its id so a crashed // rebuild can resume; a mismatched fingerprint surfaces // vector.ErrBuildingInProgress. 
func (b *Backend) CreateGeneration(ctx context.Context, model string, dim int, fingerprint string) (vector.GenerationID, error) { @@ -105,78 +168,13 @@ func (b *Backend) CreateGeneration(ctx context.Context, model string, dim int, f } now := time.Now().Unix() - gen, isNew, err := b.claimOrInsertBuilding(ctx, model, dim, fp, now) + gen, _, err := b.claimOrInsertBuilding(ctx, model, dim, fp, now) if err != nil { return 0, err } - - if !isNew { - seeded, err := b.isGenerationSeeded(ctx, gen) - if err != nil { - return 0, err - } - if seeded { - return gen, nil - } - } - if err := b.seedPending(ctx, gen, now); err != nil { - return 0, err - } - if err := b.markGenerationSeeded(ctx, gen, now); err != nil { - return 0, err - } return gen, nil } -func (b *Backend) isGenerationSeeded(ctx context.Context, gen vector.GenerationID) (bool, error) { - var seededAt sql.NullInt64 - err := b.db.QueryRowContext(ctx, - `SELECT seeded_at FROM index_generations WHERE id = $1`, int64(gen)).Scan(&seededAt) - if err != nil { - return false, fmt.Errorf("read seeded_at: %w", err) - } - return seededAt.Valid, nil -} - -func (b *Backend) markGenerationSeeded(ctx context.Context, gen vector.GenerationID, now int64) error { - if _, err := b.db.ExecContext(ctx, - `UPDATE index_generations SET seeded_at = COALESCE(seeded_at, $1) WHERE id = $2`, - now, int64(gen)); err != nil { - return fmt.Errorf("mark generation seeded: %w", err) - } - return nil -} - -// EnsureSeeded mirrors sqlitevec.EnsureSeeded: re-runs the initial seed -// pass when seeded_at is NULL so an interrupted resume cannot activate -// an empty generation. 
-func (b *Backend) EnsureSeeded(ctx context.Context, gen vector.GenerationID) error { - var state string - err := b.db.QueryRowContext(ctx, - `SELECT state FROM index_generations WHERE id = $1`, int64(gen)).Scan(&state) - if errors.Is(err, sql.ErrNoRows) { - return fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) - } - if err != nil { - return fmt.Errorf("lookup generation %d: %w", gen, err) - } - if state != string(vector.GenerationBuilding) { - return fmt.Errorf("%w: generation %d state=%q", vector.ErrGenerationNotBuilding, gen, state) - } - seeded, err := b.isGenerationSeeded(ctx, gen) - if err != nil { - return err - } - if seeded { - return nil - } - now := time.Now().Unix() - if err := b.seedPending(ctx, gen, now); err != nil { - return err - } - return b.markGenerationSeeded(ctx, gen, now) -} - // claimOrInsertBuilding returns (id, isNew, err). See sqlitevec for // rationale — same race-recovery shape, translated to pgx error codes. func (b *Backend) claimOrInsertBuilding(ctx context.Context, model string, dim int, fp string, now int64) (vector.GenerationID, bool, error) { @@ -190,11 +188,15 @@ func (b *Backend) claimOrInsertBuilding(ctx context.Context, model string, dim i return id, false, nil } + // seeded_at is stamped at creation as harmless vestigial metadata: + // scan-and-fill has no separate seed pass, and activation no longer + // gates on it (coverage is the real gate). Kept only so the column is + // populated for legacy display. var newID int64 err := b.db.QueryRowContext(ctx, `INSERT INTO index_generations - (model, dimension, fingerprint, started_at, state) - VALUES ($1, $2, $3, $4, 'building') + (model, dimension, fingerprint, started_at, seeded_at, state) + VALUES ($1, $2, $3, $4, $4, 'building') RETURNING id`, model, dim, fp, now).Scan(&newID) if err != nil { @@ -245,101 +247,17 @@ func isUniqueViolation(err error) bool { return pgErr.Code == "23505" } -// afterSeedLockHook is a test-only synchronization seam. 
When non-nil it is -// invoked once inside seedPending's transaction AFTER the SET LOCAL -// statement_timeout reset but BEFORE the generation's state is re-read under -// the FOR NO KEY UPDATE row lock. It lets the concurrency regression test -// commit a RetireGeneration at exactly the window the orphan-pending race -// opens (seed tx begins → retire commits → seed re-reads + inserts), proving -// the locked re-validation refuses to seed a now-retired generation. It is -// always nil in production. Mirrors enqueue.go's afterGenSnapshotHook. -var afterSeedLockHook func() - -// seedPending inserts one pending_embeddings row per live message in -// the main schema. Uses ON CONFLICT DO NOTHING for idempotency on -// retries and to deduplicate against rows already added by the -// concurrent Enqueuer path (parallel to sqlitevec's INSERT OR IGNORE). -// -// Because messages and pending_embeddings live in the same Postgres -// database, this can be done in a single INSERT … SELECT rather than -// streaming rows through Go like the SQLite backend does. -// -// The generation's state is re-read under a FOR NO KEY UPDATE row lock IN the -// same tx before the insert. This mirrors the Enqueuer's locked re-validation -// (enqueue.go): the lock conflicts with the no-key tuple lock that -// RetireGeneration / ActivateGeneration's state-flip UPDATE takes, so a -// concurrent retire that deletes this generation's pending rows and flips it -// to 'retired' cannot interleave with this seed to leave orphan pending rows -// behind. The two interleavings serialize: -// - seed-first: retire's state-flip UPDATE blocks on this lock, then its -// DELETE removes the rows we just inserted -> no orphan. -// - retire-first: this locking SELECT blocks until retire commits, then -// re-reads state='retired' and we skip the insert -> we seed nothing. -// -// Seedable = not yet retired, matching how the Enqueuer decides eligibility -// (WHERE state != 'retired'). 
A generation deleted outright (no row) is also -// skipped. -func (b *Backend) seedPending(ctx context.Context, gen vector.GenerationID, now int64) error { - tx, err := b.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin seed tx: %w", err) - } - defer func() { _ = tx.Rollback() }() - - // Disable the pool-wide 30s statement_timeout for this tx: the shared - // store pool sets statement_timeout=30s via pgx RuntimeParams - // (postgresConnConfig), and this single INSERT ... SELECT over the whole - // messages table can exceed that on a 1M+ message archive (finding S1's - // family, V7). SET LOCAL is tx-scoped and auto-resets on commit/rollback, - // so the timeout cannot leak onto other connections. [V7] - if _, err := tx.ExecContext(ctx, "SET LOCAL statement_timeout = 0"); err != nil { - return fmt.Errorf("disable statement_timeout for seed: %w", err) - } - - // Test-only synchronization seam (nil in production): fires after the tx - // begins but before the locked re-read below, so the concurrency - // regression test can commit a RetireGeneration inside the exact window - // the orphan-pending race opens. - if afterSeedLockHook != nil { - afterSeedLockHook() - } - - // Re-read the generation's state under a FOR NO KEY UPDATE row lock and - // confirm it is still seedable (not retired) before inserting. If it has - // been retired (or deleted) concurrently, skip the insert so we never seed - // a retired generation with orphan pending rows. Held through the INSERT - // below in this same tx. - var state string - err = tx.QueryRowContext(ctx, - `SELECT state FROM index_generations WHERE id = $1 FOR NO KEY UPDATE`, - int64(gen)).Scan(&state) - if errors.Is(err, sql.ErrNoRows) { - // Generation deleted concurrently — nothing to seed. 
- return tx.Commit() - } - if err != nil { - return fmt.Errorf("re-validate generation %d for seed: %w", gen, err) - } - if state == string(vector.GenerationRetired) { - // Retired concurrently (PG: by a now-committed retire we just blocked - // on) — do not seed, leaving no orphan pending rows for this gen. - return tx.Commit() - } - - stmt := fmt.Sprintf(` - INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) - SELECT $1, id, $2 - FROM messages - WHERE %s - ON CONFLICT (generation_id, message_id) DO NOTHING`, - store.LiveMessagesWhere("", true)) - if _, err := tx.ExecContext(ctx, stmt, int64(gen), now); err != nil { - return fmt.Errorf("seed pending: %w", err) - } - if err := tx.Commit(); err != nil { - return fmt.Errorf("commit seed pending: %w", err) - } - return nil +// missingForGenExistsClause is the coverage gate predicate: a generation +// is fully covered when no live message still needs embedding for it +// (embed_gen IS NULL OR embed_gen <> gen). Built once and reused by +// ActivateGeneration (in-tx, single-DB on PG) and Stats. The $N ordinal +// of the generation id is supplied by the caller. 
+func missingForGenExistsClause(genArg string) string { + return fmt.Sprintf(`EXISTS ( + SELECT 1 FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> %s) + AND %s + )`, genArg, store.LiveMessagesWhere("", true)) } // ActivateGeneration atomically retires the current active generation @@ -363,8 +281,8 @@ func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationI defer func() { _ = tx.Rollback() }() // Disable the pool-wide 30s statement_timeout for this tx: the auto-retire - // path below DELETEs the demoted generation's embeddings + pending rows, - // which are corpus-size on a large archive and can exceed the shared store + // path below DELETEs the demoted generation's embeddings, which are + // corpus-size on a large archive and can exceed the shared store // pool's statement_timeout=30s, cancelling the activation at 30s and rolling // it back (finding C1, S1 family). SET LOCAL is tx-scoped and auto-resets on // commit/rollback, so the timeout cannot leak onto other connections. Must be @@ -393,35 +311,22 @@ func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationI `DELETE FROM embeddings WHERE generation_id = $1`, demoted.Int64); err != nil { return fmt.Errorf("delete retired generation %d embeddings: %w", demoted.Int64, err) } - // Reap the demoted generation's queue rows in the same tx. Retired - // generations are never re-targeted by pickTarget, so any leftover - // pending_embeddings rows would be orphaned forever (the - // index_generations row is preserved, so the ON DELETE CASCADE never - // fires). Deleting them keeps the documented stats invariant - // ("retired generations have zero pending items") true. 
[cr2-3, cr2-4] - if _, err := tx.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, demoted.Int64); err != nil { - return fmt.Errorf("delete retired generation %d pending: %w", demoted.Int64, err) - } } - // The promote re-checks the seeded/no-pending gate IN the same tx as the - // flip (unless force). This closes the window between a CALLER's pre-flight - // pending read and this UPDATE: no pending row committed before this - // statement can sneak gen past the gate. It does NOT serialize against a - // concurrent enqueue.go dual-enqueue under READ COMMITTED — the FK key-share - // lock enqueue takes does not conflict with this non-key UPDATE, so an - // enqueue that commits just AFTER this gated UPDATE can still leave one - // pending row on the now-active gen. That post-flip row is acceptable: the - // embed worker's active-generation top-up (see embed_job.go pickTarget / - // enqueue.go) simply processes it on the next run. [cr2-1] + // The promote re-checks the coverage gate IN the same tx as the flip + // (unless force): refuse to activate while any live message still needs + // embedding for gen (embed_gen <> gen). On PG messages and the + // generation lifecycle share one database, so the gate is folded into + // the activation UPDATE — fully atomic with the state flip, no TOCTOU. + // The seeded_at gate was removed: seeding was the old queue-population + // phase, which scan-and-fill no longer has, so a legacy/crashed gen with + // seeded_at=NULL but full coverage must be activatable. Coverage + // (missing==0) is the real gate. 
res, err := tx.ExecContext(ctx, `UPDATE index_generations SET state = 'active', activated_at = $1, completed_at = COALESCE(completed_at, $2) WHERE id = $3 AND state = 'building' - AND ($4 OR seeded_at IS NOT NULL) - AND ($4 OR NOT EXISTS ( - SELECT 1 FROM pending_embeddings WHERE generation_id = $3 - ))`, now, now, int64(gen), force) + AND ($4 OR NOT `+missingForGenExistsClause("$3")+`)`, + now, now, int64(gen), force) if err != nil { return fmt.Errorf("activate: %w", err) } @@ -436,34 +341,38 @@ func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationI } // activateGateError re-reads gen inside the activation tx to return a -// precise reason the gated promote affected zero rows: pending rows present, -// not finished seeding, unknown generation, or not in 'building' state. -// Mirrors the prior CLI raw helper so callers get the same actionable -// messages now that the gate lives in the backend. +// precise reason the gated promote affected zero rows. The existence + +// 'building'-state lifecycle check runs FIRST: an unknown/non-building gen +// also satisfies the coverage predicate (embed_gen <> gen is true for an +// unknown gen id), so checking coverage first would surface the misleading +// "messages needing embedding" error instead of the real lifecycle reason. 
func activateGateError(ctx context.Context, tx *sql.Tx, gen vector.GenerationID, force bool) error { - var pending int64 - if err := tx.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gen)).Scan(&pending); err != nil { - return fmt.Errorf("count pending rows for generation %d: %w", gen, err) - } - if pending > 0 && !force { - return fmt.Errorf("generation %d still has %d pending embedding rows; run `msgvault embeddings resume` or pass --force", - gen, pending) - } var state vector.GenerationState - var seededAt sql.NullInt64 if err := tx.QueryRowContext(ctx, - `SELECT state, seeded_at FROM index_generations WHERE id = $1`, int64(gen)).Scan(&state, &seededAt); err != nil { + `SELECT state FROM index_generations WHERE id = $1`, int64(gen)).Scan(&state); err != nil { if errors.Is(err, sql.ErrNoRows) { return fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) } return fmt.Errorf("lookup generation %d: %w", gen, err) } - if state == vector.GenerationBuilding && !seededAt.Valid && !force { - return fmt.Errorf("generation %d has not finished seeding; run `msgvault embeddings resume` or pass --force", - gen) + if state != vector.GenerationBuilding { + return fmt.Errorf("generation %d not in 'building' state", gen) + } + // Gen exists and is building, so the only remaining reason the gated + // promote affected zero rows is the coverage term. 
+ var missing bool + if err := tx.QueryRowContext(ctx, + `SELECT `+missingForGenExistsClause("$1"), int64(gen)).Scan(&missing); err != nil { + return fmt.Errorf("check coverage for generation %d: %w", gen, err) + } + if missing && !force { + return fmt.Errorf("generation %d still has messages needing embedding; run `msgvault embeddings resume` or pass --force", gen) } - return fmt.Errorf("generation %d not in 'building' state", gen) + // Gen reads as building with full coverage yet the gated UPDATE still + // matched no rows: a concurrent transaction must have flipped its state + // between the promote and this re-read. Surface it rather than reporting a + // phantom gate. + return fmt.Errorf("activate generation %d: gated promote affected no rows (state=%q)", gen, state) } // RetireGeneration marks the given generation as retired and DELETEs its @@ -481,7 +390,7 @@ func activateGateError(ctx context.Context, tx *sql.Tx, gen vector.GenerationID, // Unless force is true, the state-flip UPDATE refuses to retire a generation // in state='active' (WHERE state != 'active'): if it affects zero rows the // active guard tripped, so the tx rolls back returning ErrRefuseRetireActive -// WITHOUT touching embeddings or pending rows. The guard lives in the same tx +// WITHOUT touching embeddings. The guard lives in the same tx // as the flip — closing the CLI's pre-flight TOCTOU so a concurrent // activation cannot delete the now-serving generation's embeddings without // --force-active. force retires unconditionally (operator override). 
@@ -493,7 +402,7 @@ func (b *Backend) RetireGeneration(ctx context.Context, gen vector.GenerationID, defer func() { _ = tx.Rollback() }() // Disable the pool-wide 30s statement_timeout for this tx: the DELETEs below - // remove the retired generation's embeddings + pending rows, which are + // remove the retired generation's embeddings, which are // corpus-size on a large archive and can exceed the shared store pool's // statement_timeout=30s, cancelling the retire at 30s and rolling it back // (finding C1, S1 family). SET LOCAL is tx-scoped and auto-resets on @@ -521,15 +430,7 @@ func (b *Backend) RetireGeneration(ctx context.Context, gen vector.GenerationID, `DELETE FROM embeddings WHERE generation_id = $1`, int64(gen)); err != nil { return fmt.Errorf("delete retired generation %d embeddings: %w", gen, err) } - // Reap the retired generation's queue rows in the same tx so they cannot - // be orphaned (no future run re-targets a retired generation, and the - // preserved index_generations row means the ON DELETE CASCADE never - // fires). Keeps the "retired generations have zero pending items" - // stats invariant true. [cr2-2, cr2-3] - if _, err := tx.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)); err != nil { - return fmt.Errorf("delete retired generation %d pending: %w", gen, err) - } + // Scan-and-fill has no per-generation queue to reap. if err := tx.Commit(); err != nil { return fmt.Errorf("commit retire generation %d: %w", gen, err) } @@ -843,6 +744,45 @@ func (b *Backend) LoadVector(ctx context.Context, messageID int64) ([]float32, e return parseVectorLiteral(lit, active.Dimension) } +// ResetWatermarkBelow lowers the embed_watermark for EVERY generation to at +// most minID-1 (clamped at 0) so a subsequent incremental RunOnce re-scans +// from below minID and re-finds rows whose embed_gen was just reset to NULL +// by repair-encoding. On PostgreSQL the watermark lives in the same database +// as messages (b.db). 
+// +// PostgreSQL's two-argument scalar minimum is LEAST (MIN is aggregate-only), +// so `watermark_id = LEAST(watermark_id, $1)` never raises a generation's +// cursor — it only lowers one that currently sits above the new floor. minID +// < 1 is a no-op. The UPDATE runs inside a tx that lifts the pool-wide +// statement_timeout, matching the sibling write helpers (Migrate, +// resetOrphanedEmbedGen, BackfillEmbedGenForUpgrade); the UPDATE itself is +// tiny (one row per generation) but the tx keeps the convention uniform and +// is robust under a busy pool. SET LOCAL is tx-scoped so the disabled timeout +// cannot leak onto other pooled connections. Idempotent. +func (b *Backend) ResetWatermarkBelow(ctx context.Context, minID int64) error { + if minID < 1 { + return nil + } + floorID := minID - 1 + tx, err := b.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("reset watermark below %d: begin tx: %w", minID, err) + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.ExecContext(ctx, "SET LOCAL statement_timeout = 0"); err != nil { + return fmt.Errorf("reset watermark below %d: disable statement_timeout: %w", minID, err) + } + if _, err := tx.ExecContext(ctx, + `UPDATE embed_watermark SET watermark_id = LEAST(watermark_id, $1)`, floorID); err != nil { + return fmt.Errorf("reset watermark below %d: %w", minID, err) + } + if err := tx.Commit(); err != nil { + return fmt.Errorf("reset watermark below %d: commit tx: %w", minID, err) + } + return nil +} + // Search runs an ANN query against the given generation and returns // the top-k hits (optionally intersected with a structured filter). 
// Uses pgvector's cosine-distance operator (<=>), which returns 0..2; @@ -1217,9 +1157,19 @@ func (b *Backend) Stats(ctx context.Context, gen vector.GenerationID) (vector.St if err := b.db.QueryRowContext(ctx, embeddingCountSQL, args...).Scan(&s.EmbeddingCount); err != nil { return s, fmt.Errorf("count embeddings: %w", err) } - if err := b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings `+where, args...).Scan(&s.PendingCount); err != nil { - return s, fmt.Errorf("count pending: %w", err) + // PendingCount is now "messages still needing embedding for this + // generation" (embed_gen <> gen), read from the messages table in the + // same database. The aggregate path (gen == 0) has no single target + // generation, so it reports 0 — the StatsView consumer sums per-gen + // pending across the active/building generations anyway. + if gen != 0 { + if err := b.db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> $1) + AND `+store.LiveMessagesWhere("", true), + int64(gen)).Scan(&s.PendingCount); err != nil { + return s, fmt.Errorf("count missing: %w", err) + } } // StorageBytes: total on-disk size of the embeddings table (heap + // indexes + TOAST), table-wide rather than per-generation. Unlike @@ -1233,3 +1183,37 @@ func (b *Backend) Stats(ctx context.Context, gen vector.GenerationID) (vector.St } return s, nil } + +// EmbeddedMessageCount returns the number of LIVE messages that are +// stamped for gen (embed_gen = gen) AND actually have at least one vector +// for the generation. Used by the coverage readout to split stamped +// messages into embedded vs blank. Counts distinct messages (not chunk +// rows) so a long, multi-chunk message counts once, matching the +// EmbeddingCount semantic elsewhere. +// +// The liveness + stamped filter is REQUIRED for the coverage invariant +// live == embedded + blank + missing to hold. 
A non-live message +// (soft-deleted via deleted_at / deleted_from_source_at, or a dedup +// loser) keeps its embedding rows — Backend.Delete has no production +// callers — so an unfiltered COUNT(DISTINCT message_id) over the +// embeddings table can exceed stamped (which is live-only), driving +// blank = stamped - embedded negative (clamped to 0) and breaking the +// invariant (EMBEDDED could display larger than LIVE). +// +// On PostgreSQL embeddings and messages share one database (b.db), so the +// live intersection is a single JOIN against messages, mirroring +// store.LiveMessagesWhere's predicate. +func (b *Backend) EmbeddedMessageCount(ctx context.Context, gen vector.GenerationID) (int64, error) { + var n int64 + if err := b.db.QueryRowContext(ctx, + `SELECT COUNT(DISTINCT e.message_id) + FROM embeddings e + JOIN messages m ON m.id = e.message_id + WHERE e.generation_id = $1 + AND m.embed_gen = $1 + AND `+store.LiveMessagesWhere("m", true), + int64(gen)).Scan(&n); err != nil { + return 0, fmt.Errorf("count embedded messages: %w", err) + } + return n, nil +} diff --git a/internal/vector/pgvector/backend_recall_test.go b/internal/vector/pgvector/backend_recall_test.go index cf4713248..df94603f4 100644 --- a/internal/vector/pgvector/backend_recall_test.go +++ b/internal/vector/pgvector/backend_recall_test.go @@ -89,10 +89,6 @@ func seedRecallCorpus(t *testing.T, b *Backend, db *sql.DB, multiChunks, singles } require.NoError(t, b.Upsert(ctx, gen, chunks), "Upsert") - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - require.NoError(t, err, "clear pending") - query := make([]float32, recallDim) query[0] = 1 return gen, query @@ -256,10 +252,6 @@ func seedDistinctNearQueryCorpus(t *testing.T, b *Backend, db *sql.DB, count int } require.NoError(t, b.Upsert(ctx, gen, chunks), "Upsert") - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - 
require.NoError(t, err, "clear pending") - // ANALYZE so the planner has row-count statistics; without them it may // mis-cost the HNSW index against the btree(+sort) alternatives and the // EXPLAIN assertion below becomes flaky. diff --git a/internal/vector/pgvector/backend_retire_test.go b/internal/vector/pgvector/backend_retire_test.go index 1b05610d2..88a24a4fc 100644 --- a/internal/vector/pgvector/backend_retire_test.go +++ b/internal/vector/pgvector/backend_retire_test.go @@ -23,32 +23,6 @@ func countEmbeddingRows(t *testing.T, b *Backend, gen vector.GenerationID) int { return n } -// countPendingRows returns the number of pending_embeddings rows belonging to -// a generation. Used by the retire-cleans-pending tests (cr2-2/cr2-3/cr2-4). -func countPendingRows(t *testing.T, b *Backend, gen vector.GenerationID) int { - t.Helper() - var n int - require.NoError(t, b.db.QueryRowContext(context.Background(), - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gen)).Scan(&n), - "count pending rows for generation %d", gen) - return n -} - -// seedPending inserts a pending_embeddings row for (gen, msgID) directly, -// without going through the worker. The message row is created first to -// satisfy the FK. Used to simulate queue rows left behind on a retired gen. -func seedPending(t *testing.T, b *Backend, gen vector.GenerationID, msgID int64) { - t.Helper() - ctx := context.Background() - _, err := b.db.ExecContext(ctx, - `INSERT INTO messages (id) VALUES ($1) ON CONFLICT DO NOTHING`, msgID) - require.NoErrorf(t, err, "seed message %d", msgID) - _, err = b.db.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) - VALUES ($1, $2, $3) ON CONFLICT DO NOTHING`, int64(gen), msgID, 0) - require.NoErrorf(t, err, "seed pending (%d, %d)", gen, msgID) -} - // genMessageCount returns index_generations.message_count for a generation. 
func genMessageCount(t *testing.T, b *Backend, gen vector.GenerationID) int { t.Helper() @@ -131,9 +105,10 @@ func genState(t *testing.T, b *Backend, gen vector.GenerationID) string { } // buildGenWithVectors creates a fresh building generation, upserts one chunk -// per supplied (message_id -> vector), clears pending, and returns the gen id. -// It does NOT activate. Caller controls activation order so the retire paths -// can be exercised explicitly. +// per supplied (message_id -> vector), stamps embed_gen on those messages +// (fully-embedded end state), and returns the gen id. It does NOT activate. +// Caller controls activation order so the retire paths can be exercised +// explicitly. func buildGenWithVectors(t *testing.T, b *Backend, model string, dim int, vecs map[int64][]float32) vector.GenerationID { t.Helper() ctx := context.Background() @@ -149,9 +124,11 @@ func buildGenWithVectors(t *testing.T, b *Backend, model string, dim int, vecs m chunks = append(chunks, vector.Chunk{MessageID: id, ChunkIndex: 0, Vector: v}) } require.NoError(t, b.Upsert(ctx, gen, chunks), "Upsert") - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - require.NoError(t, err, "clear pending") + for id := range vecs { + _, err = b.db.ExecContext(ctx, + `UPDATE messages SET embed_gen = $1 WHERE id = $2`, int64(gen), id) + require.NoErrorf(t, err, "stamp embed_gen for msg %d", id) + } return gen } @@ -196,11 +173,6 @@ func TestBackend_ActivateGeneration_AutoRetireDeletesPrevious(t *testing.T) { require.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A") require.Equal(t, 2, countEmbeddingRows(t, b, genA), "A populated before re-embed") - // Leave an undrained queue row on A so we can prove the auto-retire reaps - // pending for the SAME id whose state it flipped (RETURNING-id provability). 
- seedPending(t, b, genA, 30) - require.Equal(t, 1, countPendingRows(t, b, genA), "precondition: A has a pending row") - // Generation B: a new building generation at the same dimension (the // normal re-embed flow). Activating it auto-retires A. genB := buildGenWithVectors(t, b, "model-b", 4, map[int64][]float32{ @@ -218,17 +190,14 @@ func TestBackend_ActivateGeneration_AutoRetireDeletesPrevious(t *testing.T) { // RETURNING-id provability: the demote folds into one // `UPDATE ... WHERE state='active' RETURNING id` statement, so the id whose - // embeddings+pending get deleted is exactly the row that flipped to retired. - // Assert the previously-active gen (genA) is the sole retired row AND that - // both its embeddings and its pending were the ones cleaned, while the new - // active gen (genB) keeps its rows. This pins that the RETURNING'd id == - // the deleted id == the previously-active generation. + // embeddings get deleted is exactly the row that flipped to retired. Assert + // the previously-active gen (genA) is the sole retired row AND that its + // embeddings were the ones deleted, while the new active gen (genB) keeps + // its rows. 
retired := singleRetiredGen(t, b) assert.Equal(t, genA, retired, "the previously-active gen must be the sole retired row") assert.Equal(t, 0, countEmbeddingRows(t, b, retired), "embeddings deleted for exactly the RETURNING'd (retired) id") - assert.Equal(t, 0, countPendingRows(t, b, retired), - "pending reaped for exactly the RETURNING'd (retired) id") active, err := b.ActiveGeneration(ctx) require.NoError(t, err, "ActiveGeneration after activate B") @@ -276,42 +245,15 @@ func TestBackend_ActivateGeneration_PreservesBuildingGenerations(t *testing.T) { assert.Equal(t, 1, countEmbeddingRows(t, b, genC), "C's own rows preserved") } -// TestBackend_RetireGeneration_CleansPending pins the cr2-2/cr2-3 fix for the -// explicit-retire path: RetireGeneration must DELETE the generation's -// pending_embeddings rows in the same tx as the state flip. Retired -// generations are never re-targeted by pickTarget, so any leftover queue rows -// would be orphaned forever and would violate the documented "retired -// generations have zero pending items" stats invariant. -func TestBackend_RetireGeneration_CleansPending(t *testing.T) { - b, ctx, _ := newBackendForTest(t) - - gen := buildGenWithVectors(t, b, "model-a", 4, map[int64][]float32{ - 1: unitVec(4, 0), - }) - // Simulate queue rows left behind on the generation (e.g. an incremental - // enqueue that was never drained before retire). 
- seedPending(t, b, gen, 10) - seedPending(t, b, gen, 11) - require.Equal(t, 2, countPendingRows(t, b, gen), "precondition: pending rows present") - - require.NoError(t, b.RetireGeneration(ctx, gen, false), "RetireGeneration") - - assert.Equal(t, 0, countPendingRows(t, b, gen), - "retire must delete the generation's pending_embeddings rows") - assert.Equal(t, string(vector.GenerationRetired), genState(t, b, gen), - "index_generations row must remain, flipped to retired") -} - // TestBackend_RetireGeneration_ActiveGuard pins the retire-TOCTOU // class-closing fix: the active-gen guard lives ATOMICALLY inside // RetireGeneration's tx (mirroring ActivateGeneration's force gate). On -// pgvector, retire DELETES the generation's embeddings (shared HNSW graph) and -// reaps its pending rows — so refusing the active generation without force is -// what prevents a concurrent activation from wiping the now-serving graph. +// pgvector, retire DELETES the generation's embeddings (shared HNSW graph), +// so refusing the active generation without force is what prevents a +// concurrent activation from wiping the now-serving graph. // - force=false against the ACTIVE generation is refused with -// ErrRefuseRetireActive, leaving state='active' and BOTH its embeddings and -// pending rows untouched. -// - force=true retires the active generation, deleting embeddings + pending. +// ErrRefuseRetireActive, leaving state='active' and its embeddings intact. +// - force=true retires the active generation, deleting embeddings. // - force=false against a NON-active (building) generation retires fine. func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { b, ctx, _ := newBackendForTest(t) @@ -321,14 +263,10 @@ func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { 2: unitVec(4, 1), }) require.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A") - // Leave an undrained pending row on the active gen. 
- seedPending(t, b, genA, 30) require.Equal(t, 2, countEmbeddingRows(t, b, genA), "precondition: A has embeddings") - require.Equal(t, 1, countPendingRows(t, b, genA), "precondition: A has a pending row") require.Equal(t, string(vector.GenerationActive), genState(t, b, genA), "precondition: A active") - // (1) Non-forced retire of the ACTIVE gen is refused atomically: sentinel - // error, state unchanged, and NEITHER embeddings NOR pending rows deleted. + // (1) Non-forced retire of the ACTIVE gen is refused atomically. err := b.RetireGeneration(ctx, genA, false) require.ErrorIs(t, err, vector.ErrRefuseRetireActive, "non-forced retire of active gen must return ErrRefuseRetireActive") @@ -336,19 +274,14 @@ func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { "refused retire must leave the active gen's state unchanged") assert.Equal(t, 2, countEmbeddingRows(t, b, genA), "refused retire must NOT delete the active gen's embeddings") - assert.Equal(t, 1, countPendingRows(t, b, genA), - "refused retire must NOT reap the active gen's pending rows") - // (2) Forced retire succeeds: state flips to retired, embeddings deleted, - // pending reaped. + // (2) Forced retire succeeds: state flips to retired, embeddings deleted. require.NoError(t, b.RetireGeneration(ctx, genA, true), "forced retire of active gen must succeed") assert.Equal(t, string(vector.GenerationRetired), genState(t, b, genA), "forced retire flips state to retired") assert.Equal(t, 0, countEmbeddingRows(t, b, genA), "forced retire deletes the gen's embeddings") - assert.Equal(t, 0, countPendingRows(t, b, genA), - "forced retire reaps the gen's pending rows") // (3) A NON-active (building) generation retires fine without force. 
genB := buildGenWithVectors(t, b, "model-b", 4, map[int64][]float32{ @@ -361,33 +294,6 @@ func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { "non-active gen retires to retired without force") } -// TestBackend_ActivateGeneration_AutoRetireCleansPending pins the -// cr2-3/cr2-4 fix for the auto-retire path: activating a new generation must -// reap the demoted (now-retired) generation's pending_embeddings rows in the -// same tx as the state flip. -func TestBackend_ActivateGeneration_AutoRetireCleansPending(t *testing.T) { - b, ctx, _ := newBackendForTest(t) - - genA := buildGenWithVectors(t, b, "model-a", 4, map[int64][]float32{ - 1: unitVec(4, 0), - }) - require.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A") - // Stage incremental queue rows on the active gen that haven't drained yet. - seedPending(t, b, genA, 20) - seedPending(t, b, genA, 21) - require.Equal(t, 2, countPendingRows(t, b, genA), "precondition: pending rows on active gen") - - genB := buildGenWithVectors(t, b, "model-b", 4, map[int64][]float32{ - 1: unitVec(4, 1), - }) - require.NoError(t, b.ActivateGeneration(ctx, genB, true), "activate B (auto-retires A)") - - assert.Equal(t, string(vector.GenerationRetired), genState(t, b, genA), - "A must be retired by B's activation") - assert.Equal(t, 0, countPendingRows(t, b, genA), - "auto-retire must delete the demoted generation's pending_embeddings rows") -} - // TestBackend_DeleteOnRetire_KeepsActiveRecallClean is the recall proof for // Codex MEDIUM #1. 
It constructs the contamination scenario the fix targets: // generation A (retired) and the active generation B share one dimension and diff --git a/internal/vector/pgvector/backend_seed_race_test.go b/internal/vector/pgvector/backend_seed_race_test.go deleted file mode 100644 index 797d3ea0a..000000000 --- a/internal/vector/pgvector/backend_seed_race_test.go +++ /dev/null @@ -1,114 +0,0 @@ -//go:build pgvector - -package pgvector - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.kenn.io/msgvault/internal/vector" -) - -// insertBuildingGeneration inserts a 'building' index_generations row with an -// explicit id (no pending rows yet) so a test can drive seedPending against a -// generation in its pre-seed state. Mirrors the enqueue test's -// insertPGGeneration helper. -func insertBuildingGeneration(t *testing.T, b *Backend, id int64, dim int) vector.GenerationID { - t.Helper() - _, err := b.db.ExecContext(context.Background(), ` - INSERT INTO index_generations (id, model, dimension, fingerprint, started_at, state) - OVERRIDING SYSTEM VALUE - VALUES ($1, 'm', $2, $3, 0, 'building')`, - id, dim, "m:768") - require.NoError(t, err, "insert building generation") - return vector.GenerationID(id) -} - -// TestBackend_SeedPending_RetireDuringSeed_NoOrphan drives the concurrent -// retire-during-seed interleaving that seedPending's locked re-validation -// closes. It forces the exact window the orphan-pending race opens — the seed -// tx begins, THEN a concurrent RetireGeneration commits (UPDATE state='retired' -// + DELETE pending), THEN seedPending re-reads the generation under -// FOR NO KEY UPDATE and runs its INSERT … SELECT — and asserts no pending row -// is left behind for the now-retired generation. 
-// -// Without the lock+recheck (the bug), seedPending's INSERT … SELECT runs after -// retire's DELETE has already cleared the generation's pending rows, so it -// inserts fresh pending rows for a retired generation that no worker will ever -// target — orphan work. With the fix the locked re-read observes -// state='retired' and skips the insert, so the post-state has zero pending rows -// for that generation. -// -// The interleave is made deterministic via afterSeedLockHook: the hook fires -// inside the seed tx after BEGIN but before the locked re-read, and runs the -// retire to completion before returning, so seedPending's re-validation always -// observes the committed retire. Mirrors the Enqueuer's -// TestEnqueuerPG_RetireDuringEnqueue_NoOrphan. -func TestBackend_SeedPending_RetireDuringSeed_NoOrphan(t *testing.T) { - b, ctx, db := newBackendForTest(t) - - const dim = 768 - - // Several live messages so that, absent the fix, the INSERT … SELECT would - // actually insert orphan pending rows (newBackendForTest already created - // message id=1; add more to make the orphan obvious). - for _, id := range []int64{2, 3, 4} { - _, err := db.ExecContext(ctx, - `INSERT INTO messages (id) VALUES ($1) ON CONFLICT DO NOTHING`, id) - require.NoErrorf(t, err, "seed message %d", id) - } - - // A building generation in its pre-seed state (no pending rows yet). - gen := insertBuildingGeneration(t, b, 1, dim) - - // The hook fires once, inside the seed tx, after BEGIN but before the - // locked re-read. We retire the building generation to completion here - // (non-force is permitted — it is not active) so seedPending's subsequent - // FOR NO KEY UPDATE re-read observes the committed state='retired'. Reset - // the seam so it cannot leak into sibling tests sharing this package's - // globals. 
- var retireErr error - afterSeedLockHook = func() { - retireErr = b.RetireGeneration(ctx, gen, false) - } - t.Cleanup(func() { afterSeedLockHook = nil }) - - require.NoError(t, b.seedPending(ctx, gen, 0), "seedPending") - require.NoError(t, retireErr, "RetireGeneration during seed") - - // The generation was retired before seedPending inserted its rows: the - // locked re-validation must have skipped the insert, leaving zero orphan - // pending rows. - assert.Equal(t, 0, countPendingRows(t, b, gen), - "retired-mid-seed generation must have no orphan pending rows") - - // Sanity: the generation really is retired. - var state string - require.NoError(t, b.db.QueryRowContext(ctx, - `SELECT state FROM index_generations WHERE id = $1`, int64(gen)).Scan(&state)) - assert.Equal(t, string(vector.GenerationRetired), state, "generation retired") -} - -// TestBackend_SeedPending_SeedsBuildingGeneration is the control: with no -// concurrent retire, seedPending populates one pending_embeddings row per live -// message for a building generation. Confirms the new lock+recheck does not -// regress the normal seed path. -func TestBackend_SeedPending_SeedsBuildingGeneration(t *testing.T) { - b, ctx, db := newBackendForTest(t) - - const dim = 768 - for _, id := range []int64{2, 3, 4} { - _, err := db.ExecContext(ctx, - `INSERT INTO messages (id) VALUES ($1) ON CONFLICT DO NOTHING`, id) - require.NoErrorf(t, err, "seed message %d", id) - } - - gen := insertBuildingGeneration(t, b, 1, dim) - require.NoError(t, b.seedPending(ctx, gen, 0), "seedPending") - - // One pending row per live message (ids 1..4). 
- assert.Equal(t, 4, countPendingRows(t, b, gen), - "seedPending must enqueue one pending row per live message") -} diff --git a/internal/vector/pgvector/backend_test.go b/internal/vector/pgvector/backend_test.go index bf85b9442..25b999da0 100644 --- a/internal/vector/pgvector/backend_test.go +++ b/internal/vector/pgvector/backend_test.go @@ -41,28 +41,28 @@ func TestBackend_CreateActivateRetire(t *testing.T) { assert.Error(t, err, "ActiveGeneration should error after retire") } -// TestBackend_CreateGeneration_SeedsPending verifies the initial seed -// pass populates pending_embeddings with one row per live message. -func TestBackend_CreateGeneration_SeedsPending(t *testing.T) { +// TestBackend_CreateGeneration_StampsSeededAt verifies CreateGeneration +// stamps seeded_at so the activation gate's lifecycle check passes. +func TestBackend_CreateGeneration_StampsSeededAt(t *testing.T) { b, ctx, _ := newBackendForTest(t) gid, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(t, err, "Create") - var n int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gid), - ).Scan(&n) - require.NoError(t, err, "count pending") - assert.Equal(t, 1, n, "pending count") + var seededAt sql.NullInt64 + require.NoError(t, b.db.QueryRowContext(ctx, + `SELECT seeded_at FROM index_generations WHERE id = $1`, int64(gid)).Scan(&seededAt)) + assert.True(t, seededAt.Valid, "seeded_at stamped at creation") } -// TestBackend_CreateGeneration_SkipsDeleted ensures the seed pass -// honours the live-message predicate, so soft-deleted rows are not -// re-embedded by a future rebuild. -func TestBackend_CreateGeneration_SkipsDeleted(t *testing.T) { +// TestBackend_CoverageGate_SkipsDeleted ensures the coverage gate honours +// the live-message predicate: a soft-deleted message does not count as +// missing, so a building generation can activate without force. 
+func TestBackend_CoverageGate_SkipsDeleted(t *testing.T) { db := openPGTestDB(t) - _, err := db.Exec(`INSERT INTO messages (id, deleted_from_source_at) VALUES (1, NOW())`) - require.NoError(t, err, "seed deleted") + // testSetupPGSchema seeds a live message id=1; mark it deleted so the + // only message is excluded from the coverage universe. + _, err := db.Exec(`UPDATE messages SET deleted_from_source_at = NOW() WHERE id = 1`) + require.NoError(t, err, "soft-delete message") ctx := context.Background() b, err := Open(ctx, Options{DB: db, Dimension: 768}) @@ -72,12 +72,11 @@ func TestBackend_CreateGeneration_SkipsDeleted(t *testing.T) { gid, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(t, err, "Create") - var n int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, int64(gid), - ).Scan(&n) - require.NoError(t, err, "count pending") - assert.Equal(t, 0, n, "pending count (deleted message must be skipped)") + s, err := b.Stats(ctx, gid) + require.NoError(t, err, "Stats") + assert.Equal(t, int64(0), s.PendingCount, "deleted message must not count as missing") + // With no live missing message, activation passes the coverage gate. + require.NoError(t, b.ActivateGeneration(ctx, gid, false), "activate (no missing)") } // TestBackend_CreateGeneration_ResumesBuilding checks the idempotent @@ -109,62 +108,86 @@ func TestBackend_CreateGeneration_MismatchedFingerprint(t *testing.T) { "error = %v, want wrapping ErrBuildingInProgress", err) } -// TestBackend_CreateGeneration_ResumeReseedsUnseededGeneration covers -// the "crash between row insert and seed commit" path: a building row -// exists but seeded_at is NULL. Resume must re-run seedPending. 
-func TestBackend_CreateGeneration_ResumeReseedsUnseededGeneration(t *testing.T) { - b, ctx, _ := newBackendForTest(t) - +// TestBackend_ActivateGeneration_CoverageGate pins the scan-and-fill +// activation gate (in-tx on PG): a generation with a live message still +// needing embedding (embed_gen <> gen) is refused without force, and +// succeeds once the message is stamped covered. +func TestBackend_ActivateGeneration_CoverageGate(t *testing.T) { + b, ctx, db := newBackendForTest(t) gen, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(t, err, "first Create") - - _, err = b.db.ExecContext(ctx, - `UPDATE index_generations SET seeded_at = NULL WHERE id = $1`, int64(gen)) - require.NoError(t, err, "clear seeded_at") - - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - require.NoError(t, err, "clear pending") - - resumed, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(t, err, "resume Create") - assert.Equal(t, gen, resumed, "resumed gen must match original") + require.NoError(t, err, "Create") - var pending int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = $1`, - int64(gen)).Scan(&pending) - require.NoError(t, err, "count pending") - assert.Equal(t, 1, pending, "pending count after resume") + // The seeded live message (id=1) is unembedded -> activation refused. + err = b.ActivateGeneration(ctx, gen, false) + require.Error(t, err, "activate must be refused with missing coverage") + assert.Contains(t, err.Error(), "needing embedding") - var seededAt sql.NullInt64 - err = b.db.QueryRowContext(ctx, - `SELECT seeded_at FROM index_generations WHERE id = $1`, int64(gen)).Scan(&seededAt) - require.NoError(t, err, "read seeded_at") - assert.True(t, seededAt.Valid, "seeded_at still NULL after resume re-seed") + // Stamp it covered, then activation succeeds. 
+ _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1 WHERE id = 1`, int64(gen)) + require.NoError(t, err, "stamp embed_gen") + require.NoError(t, b.ActivateGeneration(ctx, gen, false), "activate after coverage complete") } -// TestBackend_EnsureSeeded_Idempotent calls EnsureSeeded twice and -// asserts the seeded_at stamp persists across calls. -func TestBackend_EnsureSeeded_Idempotent(t *testing.T) { +// TestBackend_ActivateGeneration_LifecycleErrorBeforeCoverage pins that +// activating an unknown or non-building generation WITHOUT --force returns +// the lifecycle error (unknown generation / not in 'building' state), NOT +// the misleading "messages needing embedding" coverage error. The coverage +// predicate (embed_gen <> gen) is true for an unknown gen id, so +// activateGateError must check existence + 'building' state before coverage. +// The seeded test message (id=1) stays unembedded so the coverage gate WOULD +// trip if checked first. +func TestBackend_ActivateGeneration_LifecycleErrorBeforeCoverage(t *testing.T) { b, ctx, _ := newBackendForTest(t) + + // (a) Unknown gen id: lifecycle error (ErrUnknownGeneration), not coverage. + err := b.ActivateGeneration(ctx, vector.GenerationID(999), false) + require.Error(t, err, "activating unknown gen must fail") + require.ErrorIs(t, err, vector.ErrUnknownGeneration, + "unknown gen must return ErrUnknownGeneration, not coverage error") + assert.NotContains(t, err.Error(), "needing embedding", + "unknown gen must NOT surface the coverage error") + + // (b) Non-building (retired) gen id: lifecycle error, not coverage. 
gen, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(t, err, "Create") - require.NoError(t, b.EnsureSeeded(ctx, gen), "EnsureSeeded #1") - require.NoError(t, b.EnsureSeeded(ctx, gen), "EnsureSeeded #2") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "force-activate to bypass coverage") + require.NoError(t, b.RetireGeneration(ctx, gen, true), "force-retire to reach non-building state") + require.Equal(t, string(vector.GenerationRetired), genState(t, b, gen), "precondition: gen retired") + + err = b.ActivateGeneration(ctx, gen, false) + require.Error(t, err, "activating retired gen must fail") + assert.Contains(t, err.Error(), "not in 'building' state", + "retired gen must return the not-building lifecycle error") + assert.NotContains(t, err.Error(), "needing embedding", + "retired gen must NOT surface the coverage error") } -// TestBackend_EnsureSeeded_RejectsActiveGeneration verifies the guard -// that prevents re-seeding a non-building generation. -func TestBackend_EnsureSeeded_RejectsActiveGeneration(t *testing.T) { - b, ctx, _ := newBackendForTest(t) +// TestBackend_ActivateGeneration_NullSeededAtActivatesWithCoverage mirrors +// the sqlitevec lifecycle test: a legacy/crashed generation with seeded_at +// NULL must still activate WITHOUT --force as long as coverage is complete +// (missing==0). The old seeded_at IS NOT NULL gate is gone; coverage is the +// real gate. +func TestBackend_ActivateGeneration_NullSeededAtActivatesWithCoverage(t *testing.T) { + b, ctx, db := newBackendForTest(t) gen, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(t, err, "Create") - require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate") - err = b.EnsureSeeded(ctx, gen) - assert.ErrorIs(t, err, vector.ErrGenerationNotBuilding, - "EnsureSeeded on active gen returned %v, want ErrGenerationNotBuilding", err) + // Simulate a legacy/crashed generation: clear seeded_at. 
+ _, err = db.ExecContext(ctx, + `UPDATE index_generations SET seeded_at = NULL WHERE id = $1`, int64(gen)) + require.NoError(t, err, "clear seeded_at") + var seededAt sql.NullInt64 + require.NoError(t, b.db.QueryRowContext(ctx, + `SELECT seeded_at FROM index_generations WHERE id = $1`, int64(gen)).Scan(&seededAt)) + require.False(t, seededAt.Valid, "precondition: seeded_at is NULL") + + // Make coverage complete (worker would stamp this after upsert). + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1 WHERE id = 1`, int64(gen)) + require.NoError(t, err, "stamp embed_gen") + + // Activation succeeds WITHOUT force despite seeded_at=NULL. + require.NoError(t, b.ActivateGeneration(ctx, gen, false), + "NULL seeded_at + full coverage must activate without --force") } // TestBackend_Upsert_RejectsDimensionMismatch ensures the per-chunk diff --git a/internal/vector/pgvector/backend_testhelpers_test.go b/internal/vector/pgvector/backend_testhelpers_test.go index 35d9c05dd..1875edbd9 100644 --- a/internal/vector/pgvector/backend_testhelpers_test.go +++ b/internal/vector/pgvector/backend_testhelpers_test.go @@ -99,7 +99,8 @@ func testSetupPGSchema(t *testing.T, db *sql.DB) { size_estimate BIGINT, sent_at TIMESTAMPTZ, deleted_at TIMESTAMPTZ, - deleted_from_source_at TIMESTAMPTZ + deleted_from_source_at TIMESTAMPTZ, + embed_gen BIGINT ); CREATE TABLE message_recipients ( id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, @@ -147,9 +148,9 @@ func unitVec(dim, axis int) []float32 { // seedAndEmbed inserts any missing message rows, creates a building // generation sized to the first vector, upserts every supplied vector -// as a chunk, and clears pending_embeddings for those rows so the -// caller sees the "fully embedded" end state. Mirrors the sqlitevec -// helper of the same name. +// as a chunk, and stamps messages.embed_gen for those rows so the caller +// sees the "fully embedded" end state (coverage complete). 
Mirrors the +// sqlitevec helper of the same name. func seedAndEmbed(t *testing.T, b *Backend, db *sql.DB, vecs map[int64][]float32) vector.GenerationID { t.Helper() require.NotEmpty(t, vecs, "seedAndEmbed: no vectors supplied") @@ -182,8 +183,10 @@ func seedAndEmbed(t *testing.T, b *Backend, db *sql.DB, vecs map[int64][]float32 } require.NoError(t, b.Upsert(ctx, gid, chunks), "Upsert") - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gid)) - require.NoError(t, err, "clear pending") + for _, id := range ids { + _, err = db.ExecContext(ctx, + `UPDATE messages SET embed_gen = $1 WHERE id = $2`, int64(gid), id) + require.NoErrorf(t, err, "stamp embed_gen for msg %d", id) + } return gid } diff --git a/internal/vector/pgvector/backfill.go b/internal/vector/pgvector/backfill.go new file mode 100644 index 000000000..c8fbf5b9d --- /dev/null +++ b/internal/vector/pgvector/backfill.go @@ -0,0 +1,370 @@ +//go:build pgvector + +package pgvector + +import ( + "context" + "database/sql" + "errors" + "fmt" + + "go.kenn.io/msgvault/internal/vector" +) + +// embedGenBackfillMigration is the applied_migrations ledger key that +// guards the one-time embed_gen upgrade backfill. Stable string — never +// change it, or the backfill would re-run on every Open. +const embedGenBackfillMigration = "embed_gen_backfill_active_v1" + +// BackfillEmbedGenForUpgrade performs the ONE-TIME upgrade backfill +// (Package A): when an active generation exists, it stamps embed_gen=active +// on every message that already has >=1 embedding row under that generation +// but whose embed_gen is still NULL. +// +// Why: the embed_gen ADD COLUMN migration does no backfill, so a user +// upgrading from v0.14–v0.15 (who already has an active generation + a +// fully-embedded corpus) would have embed_gen=NULL everywhere. Coverage +// would then report the ENTIRE archive as missing and the worker would +// re-embed all of it. 
This stamps the already-embedded rows instead — a +// cheap metadata UPDATE, no re-embed. +// +// Guards: +// - ONE-TIME via the applied_migrations ledger (key +// embedGenBackfillMigration). Check-then-run-then-mark. It must NOT run +// on every Open: re-running would clobber repair-encoding's NULL resets +// before they re-embed, and fight an in-progress rebuild. +// Accepted residual window: the check and the mark are NOT atomic across +// PROCESSES, so two concurrent first-opens of a freshly-upgraded DB +// (before either marks the ledger) can both run the backfill once; the +// second could re-stamp a row repair-encoding just reset to NULL. This +// window is ONE-SHOT (only at the first post-upgrade open, before the +// ledger is marked) and astronomically rare — accepted, not closed +// (operator decision): the mitigation is operational (run only one +// embedding process at a time; see README Vector Search). Within a single +// process the in-tx mark + EmbedJob single-flight lock prevent re-runs. +// - It lives in the VECTOR layer because the embeddings table is only +// reachable here. +// - The stamp UPDATE only touches rows where embed_gen IS NULL, so it +// never overwrites a row already stamped for another generation. +// +// No-ops cleanly when the ledger already records it, there is no active +// generation, the embeddings table is empty, or there are no +// embedded-but-unstamped rows. Idempotent. +// +// Single DB on PostgreSQL: messages, embeddings, and the ledger all share +// b.db, so the backfill is one EXISTS-correlated UPDATE. +func (b *Backend) BackfillEmbedGenForUpgrade(ctx context.Context) error { + // A database without applied_migrations is not a real msgvault store + // (e.g. a minimal test fixture, or a DB opened before the store schema + // ran); skip the backfill entirely rather than fail Open. 
+ var ledger *string + if err := b.db.QueryRowContext(ctx, + `SELECT to_regclass('applied_migrations')::text`).Scan(&ledger); err != nil { + return fmt.Errorf("backfill: probe ledger: %w", err) + } + if ledger == nil { + return nil + } + + // Robustness guard (mirrors resetOrphanedEmbedGen and the SQLite side): + // the stamp UPDATE writes messages.embed_gen, so a DB whose messages table + // predates the embed_gen column (a partial restore, or a writable Open that + // ran before store.InitSchema added the column) must skip rather than fail + // Open. The column is added by PostgreSQLDialect.LegacyColumnMigrations and + // is present on any store created via the full schema. + hasCol, err := messagesHasEmbedGen(ctx, b.db) + if err != nil { + return err + } + if !hasCol { + return nil + } + + applied, err := b.backfillApplied(ctx) + if err != nil { + return err + } + if applied { + return nil + } + + // Resolve the active generation. No active generation means nothing to + // backfill — but we still mark the migration applied so a later + // just-activated generation does not retroactively trigger a backfill + // that re-stamps rows repair-encoding may have reset. A lone ledger + // INSERT is trivially atomic, so it runs directly (no transaction). + // + // Intentional scope limit: only the ACTIVE generation is backfilled. Any + // BUILDING generation that existed pre-upgrade is left unstamped — a + // resumed rebuild idempotently re-embeds that bounded portion (scan-and- + // fill skips already-covered rows), so the cost is small and one-time. + // Per-generation backfill complexity is not worth it for a single-user + // tool. 
+ active, err := b.ActiveGeneration(ctx) + if err != nil { + if errors.Is(err, vector.ErrNoActiveGeneration) { + return b.markBackfillAppliedExec(ctx, b.db) + } + return fmt.Errorf("backfill: resolve active generation: %w", err) + } + + // Atomicity (Codex 129d #2/#3): the embed_gen stamp UPDATE and the + // ledger mark must be all-or-nothing. messages and applied_migrations + // share b.db, so a single transaction covers both. If the process + // crashes (or any error occurs) after the UPDATE but before the mark, an + // autocommit pair would leave the ledger UNMARKED while embed_gen was + // already stamped → the supposedly one-time backfill re-runs on the next + // Open and clobbers any NULL resets repair-encoding made in the interim. + // Wrapping both in one tx makes a crash leave the DB exactly pre-backfill + // (no stamps, no mark), so the next Open re-runs cleanly. + tx, err := b.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("backfill: begin tx: %w", err) + } + committed := false + defer func() { + if !committed { + _ = tx.Rollback() + } + }() + + // Disable the pool-wide 30s statement_timeout for this tx: on a real corpus + // the stamp UPDATE below is a ≈28s nested-loop EXISTS-correlated semi-join + // over the full messages table (≈53k rows), which — combined with the + // preceding resetOrphanedEmbedGen UPDATE on the same Open — exceeds the + // shared store pool's statement_timeout=30s, cancelling the one-time + // upgrade backfill at 30s (SQLSTATE 57014) and rolling it back so the + // upgrade never completes (finding S1 family, mirroring + // Migrate/ActivateGeneration/RetireGeneration/EnsureVectorIndex). SET LOCAL + // is tx-scoped and auto-resets on commit/rollback, so the disabled timeout + // cannot leak onto other pooled connections. Must be the first statement in + // the tx to cover the stamp UPDATE. 
+ if _, err := tx.ExecContext(ctx, "SET LOCAL statement_timeout = 0"); err != nil { + return fmt.Errorf("backfill: disable statement_timeout: %w", err) + } + + // Preserve the legacy pending re-embed signal. Under the + // OLD design, pending_embeddings was a re-embed flag: a message could carry + // BOTH an active-gen embedding AND an active-gen pending row (old + // repair-encoding re-enqueued already-embedded messages; the old worker + // deleted the pending row only on a successful re-embed). Stamping + // embed_gen=active on such a message would read "covered" forever and never + // re-embed it — silent permanent staleness. EXCLUDE active-gen pending ids + // from the stamp so they end embed_gen=NULL and scan-and-fill re-embeds them. + // + // pending_embeddings may already be gone (a DB upgraded before this change + // dropped it in Migrate, or a fresh DB never had it). Probe first and add the + // NOT IN exclusion only when the table is present; otherwise the plain stamp + // (no legacy signal to preserve) runs. + pendingExists, err := pendingEmbeddingsExists(ctx, tx) + if err != nil { + return err + } + stamp := `UPDATE messages SET embed_gen = $1 + WHERE embed_gen IS NULL + AND EXISTS ( + SELECT 1 FROM embeddings e + WHERE e.message_id = messages.id + AND e.generation_id = $1)` + if pendingExists { + stamp += ` + AND messages.id NOT IN ( + SELECT message_id FROM pending_embeddings WHERE generation_id = $1)` + } + // Stamp embed_gen=active for messages with an embedding row under the + // active generation, only where embed_gen is still NULL (never overwrite + // a row stamped for another generation), excluding active-gen pending ids. 
+ if _, err := tx.ExecContext(ctx, stamp, int64(active.ID)); err != nil { + return fmt.Errorf("backfill: stamp embed_gen: %w", err) + } + + if err := b.markBackfillAppliedExec(ctx, tx); err != nil { + return err + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("backfill: commit tx: %w", err) + } + committed = true + return nil +} + +// rowQueryer is the subset of *sql.DB / *sql.Tx that pendingEmbeddingsExists +// needs, so it can probe either on the pool or inside the backfill tx. +type rowQueryer interface { + QueryRowContext(ctx context.Context, query string, args ...any) *sql.Row +} + +// pendingEmbeddingsExists reports whether the legacy pending_embeddings table +// is present. Used by BackfillEmbedGenForUpgrade to decide whether to apply the +// active-gen pending exclusion. to_regclass returns NULL for a +// non-existent relation, so a NULL scan means "absent". +func pendingEmbeddingsExists(ctx context.Context, q rowQueryer) (bool, error) { + var reg *string + if err := q.QueryRowContext(ctx, + `SELECT to_regclass('pending_embeddings')::text`).Scan(&reg); err != nil { + return false, fmt.Errorf("backfill: probe pending_embeddings: %w", err) + } + return reg != nil, nil +} + +// dropDeadPendingEmbeddings drops the legacy pending_embeddings queue table. +// The scan-and-fill design replaced the per-generation seed queue with a live +// messages.embed_gen scan, so the table is otherwise unused; left in place it +// only wastes space and confuses operators. +// +// It runs on every WRITABLE Open, AFTER BackfillEmbedGenForUpgrade has had a +// chance to consult the table and preserve its re-embed signal. +// Doing the drop here rather than in Migrate guarantees the backfill (when it +// runs) sees the table first; gated to the writable Open path so a read-only +// Open leaves the table — and its signal — intact for the next writable open.
+// Runs inside a tx that lifts the pool-wide statement_timeout (DROP takes an +// ACCESS EXCLUSIVE lock; the lock-wait alone can exceed 30s on a busy serve), +// matching the sibling write helpers. Idempotent: DROP TABLE IF EXISTS is a +// no-op on fresh DBs and on a second run. +func (b *Backend) dropDeadPendingEmbeddings(ctx context.Context) error { + tx, err := b.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("drop dead pending_embeddings: begin tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.ExecContext(ctx, "SET LOCAL statement_timeout = 0"); err != nil { + return fmt.Errorf("drop dead pending_embeddings: disable statement_timeout: %w", err) + } + if _, err := tx.ExecContext(ctx, `DROP TABLE IF EXISTS pending_embeddings`); err != nil { + return fmt.Errorf("drop dead pending_embeddings table: %w", err) + } + if err := tx.Commit(); err != nil { + return fmt.Errorf("drop dead pending_embeddings: commit tx: %w", err) + } + return nil +} + +// resetOrphanedEmbedGen clears messages.embed_gen for every message whose +// stamp references a generation id that does NOT exist in index_generations +// (an "orphaned" stamp). It runs on every WRITABLE Open BEFORE +// BackfillEmbedGenForUpgrade. +// +// Why: this mirrors the sqlitevec safety net. On SQLite, index_generations +// lives in a replaceable vectors.db and embed_gen lives in the durable +// main.db, so a vectors.db wipe restarts gen ids at 1 while old stamps linger, +// masking coverage and activating an empty index. On PG everything shares one +// database, so a true recreate drops messages and index_generations together — +// the orphan window only opens under a partial restore (e.g. messages restored +// but embeddings/generations not). The reset is kept for symmetry and to +// defend that partial-restore case. 
+// +// False-positive-proof: a stamp pointing to a still-existing generation row +// (active, building, OR retired — retire only flips state, it does not delete +// the row) is KEPT, so the normal activate/retire flow never trips this reset. +// Only a vanished gen id triggers a clear. +// +// Single DB on PG: one UPDATE with a NOT IN subquery. `NOT IN (subquery)` +// handles the empty case correctly in PostgreSQL (it degrades to "all rows"), +// and index_generations.id is NOT NULL so the NULL-in-subquery pitfall cannot +// arise. +// +// Guards (mirror BackfillEmbedGenForUpgrade / the Open ReadOnly gate): the +// caller (Open) skips this on the ReadOnly path. NOT ledger-guarded: it +// re-checks every writable Open; cheap + idempotent (a second run finds no +// orphans and updates nothing). +func (b *Backend) resetOrphanedEmbedGen(ctx context.Context) error { + // Robustness guard (mirrors the SQLite resetOrphanedEmbedGen, which skips + // when applied_migrations is absent because "such a fixture also lacks the + // embed_gen column"): the reset UPDATE writes messages.embed_gen, so a DB + // whose messages table predates the embed_gen column must skip rather than + // fail Open with `column "embed_gen" does not exist (SQLSTATE 42703)`. This + // happens on a partial restore, or when a writable Open (e.g. `msgvault + // search --vector` via search_vector.go) runs before store.InitSchema has + // added the column. The column is added by + // PostgreSQLDialect.LegacyColumnMigrations and is present on any store + // created via the full schema. + hasCol, err := messagesHasEmbedGen(ctx, b.db) + if err != nil { + return err + } + if !hasCol { + return nil + } + + // Run the UPDATE inside a short tx that disables the pool-wide 30s + // statement_timeout (finding S1 family, mirroring + // Migrate/ActivateGeneration/RetireGeneration/EnsureVectorIndex and the + // upgrade backfill below). 
This reset is normally cheap (0 rows when there + // are no orphans), but on a partial restore it can clear corpus-size stamps + // and — running right before BackfillEmbedGenForUpgrade on the same Open — + // would otherwise risk a 30s cancellation (SQLSTATE 57014). SET LOCAL is + // tx-scoped and auto-resets on commit/rollback, so the disabled timeout + // cannot leak onto other pooled connections. Must be the first statement in + // the tx. Behaviour is otherwise identical: same WHERE clause, idempotent + // (a second run finds no orphans), and gated by the ReadOnly Open path. + tx, err := b.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("reset orphaned embed_gen: begin tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.ExecContext(ctx, "SET LOCAL statement_timeout = 0"); err != nil { + return fmt.Errorf("reset orphaned embed_gen: disable statement_timeout: %w", err) + } + if _, err := tx.ExecContext(ctx, + `UPDATE messages SET embed_gen = NULL + WHERE embed_gen IS NOT NULL + AND embed_gen NOT IN (SELECT id FROM index_generations)`); err != nil { + return fmt.Errorf("reset orphaned embed_gen: clear orphaned stamps: %w", err) + } + if err := tx.Commit(); err != nil { + return fmt.Errorf("reset orphaned embed_gen: commit tx: %w", err) + } + return nil +} + +// messagesHasEmbedGen reports whether the messages table has an embed_gen +// column. Used by resetOrphanedEmbedGen and BackfillEmbedGenForUpgrade to +// no-op rather than fail Open when the column is absent (a DB whose messages +// predates the column, or a writable Open before store.InitSchema ran). A +// missing messages table also reports false. Mirrors the SQLite side's +// table-existence guard. 
+func messagesHasEmbedGen(ctx context.Context, db *sql.DB) (bool, error) { + var n int + if err := db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM information_schema.columns + WHERE table_name = 'messages' AND column_name = 'embed_gen' + AND table_schema = ANY (current_schemas(false))`).Scan(&n); err != nil { + return false, fmt.Errorf("probe messages.embed_gen column: %w", err) + } + return n > 0, nil +} + +// backfillApplied reports whether the one-time backfill ledger row exists. +func (b *Backend) backfillApplied(ctx context.Context) (bool, error) { + var n int + if err := b.db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&n); err != nil { + return false, fmt.Errorf("backfill: check ledger: %w", err) + } + return n > 0, nil +} + +// execer is the subset of *sql.DB / *sql.Tx the ledger mark needs, so +// markBackfillAppliedExec can run either directly (lone INSERT, the +// no-active-gen path) or inside the backfill transaction (alongside the +// embed_gen UPDATE, for atomicity). +type execer interface { + ExecContext(ctx context.Context, query string, args ...any) (sql.Result, error) +} + +// markBackfillAppliedExec records the one-time backfill in the ledger via +// the given execer. ON CONFLICT DO NOTHING keeps it idempotent under a +// concurrent Open. Pass b.db for a standalone mark, or the backfill tx so +// the mark commits atomically with the embed_gen UPDATE. 
+func (b *Backend) markBackfillAppliedExec(ctx context.Context, ex execer) error { + if _, err := ex.ExecContext(ctx, + `INSERT INTO applied_migrations (name) VALUES ($1) ON CONFLICT DO NOTHING`, + embedGenBackfillMigration); err != nil { + return fmt.Errorf("backfill: mark ledger: %w", err) + } + return nil +} diff --git a/internal/vector/pgvector/backfill_test.go b/internal/vector/pgvector/backfill_test.go new file mode 100644 index 000000000..d6d0947f6 --- /dev/null +++ b/internal/vector/pgvector/backfill_test.go @@ -0,0 +1,769 @@ +//go:build pgvector + +package pgvector + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/vector" +) + +// embedGenOf reads embed_gen for a message, reporting whether it is NULL. +func embedGenOf(t *testing.T, db *sql.DB, id int64) (val int64, isNull bool) { + t.Helper() + var v sql.NullInt64 + require.NoError(t, db.QueryRow(`SELECT embed_gen FROM messages WHERE id = $1`, id).Scan(&v)) + return v.Int64, !v.Valid +} + +// TestBackfillEmbedGen_UpgradeStampsEmbeddedOnly mirrors the sqlitevec FIX +// B test on PostgreSQL: an active generation already has embeddings for +// some messages but embed_gen is NULL everywhere (the ADD COLUMN did no +// backfill). The one-time backfill stamps embed_gen=active for the embedded +// messages and leaves the un-embedded one NULL; coverage becomes honest; +// re-running is a ledger-guarded no-op. +func TestBackfillEmbedGen_UpgradeStampsEmbeddedOnly(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + // The minimal PG test schema omits applied_migrations; create it so the + // ledger guard has somewhere to record. 
+ _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + // 3 messages: 1 and 2 embedded under the active gen, 3 not. + for _, id := range []int64{1, 2, 3} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + chunks := []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + } + require.NoError(t, b.Upsert(ctx, gen, chunks), "Upsert") + + // Stamp + activate, then simulate the upgrade by resetting embed_gen. + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1`, int64(gen)) + require.NoError(t, err, "stamp") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "activate (force)") + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen to NULL (simulate upgrade)") + + // Open already ran (and marked) the backfill at open time when no gen + // existed; clear the ledger so this call reproduces the real upgrade + // timing (first Open where an active gen + embeddings are present). 
+ _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "reset ledger") + + require.NoError(t, b.BackfillEmbedGenForUpgrade(ctx), "backfill") + + for _, id := range []int64{1, 2} { + v, isNull := embedGenOf(t, db, id) + assert.Falsef(t, isNull, "msg %d should be stamped", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen", id) + } + _, isNull3 := embedGenOf(t, db, 3) + assert.True(t, isNull3, "msg 3 (un-embedded) stays NULL") + + // Coverage is honest: only message 3 missing. + s, err := b.Stats(ctx, gen) + require.NoError(t, err, "Stats") + assert.Equal(t, int64(1), s.PendingCount, "post-backfill: only msg 3 missing") + + // Re-running is a ledger-guarded no-op: msg 3 stays NULL. + require.NoError(t, b.BackfillEmbedGenForUpgrade(ctx), "backfill again (no-op)") + _, isNull3Again := embedGenOf(t, db, 3) + assert.True(t, isNull3Again, "msg 3 still NULL after second backfill (ledger no-op)") +} + +// TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal is the PG +// regression guard for the pending-signal preservation case: the one-time +// upgrade backfill must NOT stamp embed_gen=active on a message that carried an +// active-gen pending_embeddings row (the OLD re-embed flag), even though that +// message has an active-gen embedding. Such a message had a STALE embedding +// queued for re-embed (old repair-encoding re-enqueued it). If the backfill +// stamps it "covered" it is never re-embedded — silent permanent staleness. It +// must end embed_gen=NULL so scan-and-fill re-embeds it; a normal embedded +// message with no pending row must end embed_gen=active. pending_embeddings is +// dropped after. 
+func TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + // msg 1: embedded AND pending-for-re-embed (stale) -> must stay NULL. + // msg 2: embedded, NO pending row (normal) -> must be stamped. + for _, id := range []int64{1, 2} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + }), "Upsert") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate") + + // Reconstruct the OLD-state precondition: pending_embeddings exists and + // carries an active-gen row for msg 1 only (msg 1 was re-enqueued for + // re-embed while still holding its stale active-gen embedding). + _, err = db.ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id BIGINT NOT NULL, + message_id BIGINT NOT NULL + )`) + require.NoError(t, err, "create legacy pending_embeddings") + _, err = db.ExecContext(ctx, + `INSERT INTO pending_embeddings (generation_id, message_id) VALUES ($1, 1)`, int64(gen)) + require.NoError(t, err, "seed active-gen pending row for msg 1") + + // Simulate the upgrade: embed_gen NULL everywhere, ledger cleared so the + // next backfill runs (Open marked it at open time when no gen existed). 
+ _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + + require.NoError(t, b.BackfillEmbedGenForUpgrade(ctx), "backfill") + + // msg 1 (had an active-gen pending re-embed row) must stay NULL → re-embed. + _, isNull1 := embedGenOf(t, db, 1) + assert.True(t, isNull1, + "msg 1 (active-gen pending re-embed) must stay embed_gen=NULL so it re-embeds") + // msg 2 (normal embedded, no pending) must be stamped → not re-embedded. + v2, isNull2 := embedGenOf(t, db, 2) + assert.False(t, isNull2, "msg 2 (no pending row) must be stamped") + assert.Equal(t, int64(gen), v2, "msg 2 embed_gen = active") +} + +// TestOpen_DropsDeadPendingEmbeddings pins that a normal writable Open drops +// the dead pending_embeddings table AFTER the backfill has had a chance to +// consult it. The drop moved out of Migrate into the Open +// writable path. +func TestOpen_DropsDeadPendingEmbeddings(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + // Stand up a legacy pending_embeddings table, then open writably. 
+ _, err = db.ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id BIGINT NOT NULL, + message_id BIGINT NOT NULL + )`) + require.NoError(t, err, "create legacy pending_embeddings") + + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "writable Open") + t.Cleanup(func() { _ = b.Close() }) + + var reg *string + require.NoError(t, db.QueryRowContext(ctx, + `SELECT to_regclass('pending_embeddings')::text`).Scan(&reg)) + assert.Nil(t, reg, "writable Open must drop pending_embeddings after the backfill consults it") +} + +// TestBackfillEmbedGen_StampAndMarkAtomic_RollbackOnMarkFailure is the +// PostgreSQL companion to the sqlitevec atomicity guard (Codex 129d #2/#3): +// the embed_gen stamp UPDATE and the applied_migrations ledger mark must be +// ONE transaction. messages and applied_migrations share b.db on PG, so a +// single tx covers both. +// +// Fault injection: a BEFORE INSERT trigger on applied_migrations RAISEs an +// exception when the backfill ledger row is inserted, so the mark step fails +// AFTER the embed_gen UPDATE has run inside the same tx. If atomic, the UPDATE +// must be ROLLED BACK (embed_gen stays NULL) and the ledger must stay UNMARKED, +// so a later clean backfill re-runs and completes.
+func TestBackfillEmbedGen_StampAndMarkAtomic_RollbackOnMarkFailure(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + _, err = db.Exec(`INSERT INTO messages (id) VALUES (1)`) + require.NoError(t, err, "insert message") + + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + }), "Upsert") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate") + + // Pre-upgrade state: embed_gen NULL, ledger cleared (Open already marked + // it when no gen existed). + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + + // Install a fault that makes ONLY the ledger mark fail. The embed_gen + // UPDATE on messages still succeeds, so a non-atomic implementation would + // leave embed_gen stamped while the ledger stays unmarked. 
+ _, err = db.Exec(`CREATE FUNCTION zz_fail_backfill_mark() RETURNS trigger AS $fn$ + BEGIN + IF NEW.name = '` + embedGenBackfillMigration + `' THEN + RAISE EXCEPTION 'injected backfill mark failure'; + END IF; + RETURN NEW; + END; + $fn$ LANGUAGE plpgsql`) + require.NoError(t, err, "create fault function") + _, err = db.Exec(`CREATE TRIGGER zz_fail_backfill_mark + BEFORE INSERT ON applied_migrations + FOR EACH ROW EXECUTE FUNCTION zz_fail_backfill_mark()`) + require.NoError(t, err, "install fault trigger") + + err = b.BackfillEmbedGenForUpgrade(ctx) + require.Error(t, err, "backfill must surface the injected ledger-mark failure") + assert.Contains(t, err.Error(), "injected backfill mark failure") + + // Atomicity: the stamp must have been ROLLED BACK with the failed mark. + _, isNull := embedGenOf(t, db, 1) + assert.True(t, isNull, + "embed_gen must be rolled back to NULL when the ledger mark fails (atomic)") + var marked int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&marked)) + assert.Equal(t, 0, marked, "ledger must stay unmarked when the backfill tx rolls back") + + // Recovery: remove the fault and re-run. The migration was never marked, + // so the one-time backfill re-runs cleanly and now completes. 
+ _, err = db.Exec(`DROP TRIGGER zz_fail_backfill_mark ON applied_migrations`) + require.NoError(t, err, "drop fault trigger") + require.NoError(t, b.BackfillEmbedGenForUpgrade(ctx), "clean re-run must succeed") + + v, isNull := embedGenOf(t, db, 1) + assert.False(t, isNull, "embed_gen stamped after clean re-run") + assert.Equal(t, int64(gen), v, "embed_gen references the active generation") + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&marked)) + assert.Equal(t, 1, marked, "ledger marked after clean re-run") +} + +// TestResetOrphanedEmbedGen_RecreateScenario mirrors the sqlitevec recreate +// test on PostgreSQL: messages carry stamps for a generation id that no longer +// exists in index_generations (a partial restore — messages restored, the +// generation row not). A writable Open must reset those orphaned stamps to NULL +// so coverage reports them missing rather than masking an empty index. +func TestResetOrphanedEmbedGen_RecreateScenario(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + + for _, id := range []int64{1, 2} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + // Open (creates the empty index_generations), then stamp both messages + // for a generation id (99) that does not exist — the orphan condition. + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = 99`) + require.NoError(t, err, "stamp orphaned embed_gen=99") + + // Re-open writable: the reset runs and clears the orphaned stamps. 
+ b2, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "re-Open (writable)") + t.Cleanup(func() { _ = b2.Close() }) + + for _, id := range []int64{1, 2} { + _, isNull := embedGenOf(t, db, id) + assert.Truef(t, isNull, "msg %d embed_gen reset to NULL (orphaned)", id) + } +} + +// TestResetOrphanedEmbedGen_NoFalsePositive verifies the PG reset PRESERVES +// stamps that reference a still-existing generation row (active or retired — +// retire only flips state on PG, it does not delete the index_generations row). +func TestResetOrphanedEmbedGen_NoFalsePositive(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + + for _, id := range []int64{1, 2} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + }), "Upsert") + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1`, int64(gen)) + require.NoError(t, err, "stamp") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate") + + // Re-open writable: gen still exists, so its stamps must be preserved. 
+ b2, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "re-Open (writable)") + t.Cleanup(func() { _ = b2.Close() }) + + for _, id := range []int64{1, 2} { + v, isNull := embedGenOf(t, db, id) + assert.Falsef(t, isNull, "msg %d stamp preserved (gen still exists)", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen preserved", id) + } +} + +// TestResetOrphanedEmbedGen_ReadOnly_Skipped verifies the orphaned-stamp +// reset is suppressed on the READ-ONLY Open path (ReadOnly=true), where +// writes are rejected. An orphaned stamp must be left untouched. The +// read-only signal is ReadOnly, not SkipMigrate: an MCP open sets both, but a +// writable management open (SkipMigrate=true, ReadOnly=false) must still run +// the reset — see TestResetOrphanedEmbedGen_SkipMigrate_Management_Resets. +func TestResetOrphanedEmbedGen_ReadOnly_Skipped(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + + _, err := db.Exec(`INSERT INTO messages (id) VALUES (1)`) + require.NoError(t, err, "insert message") + + // Bring up the schema (index_generations etc.) via a writable Open, then + // stamp an orphaned embed_gen. + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open (writable, migrate)") + t.Cleanup(func() { _ = b.Close() }) + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = 99`) + require.NoError(t, err, "stamp orphaned embed_gen=99") + + // Read-only Open (MCP path sets SkipMigrate + ReadOnly): the reset must be + // skipped because no writes are permitted. 
+ b2, err := Open(ctx, Options{DB: db, Dimension: 4, SkipMigrate: true, ReadOnly: true}) + require.NoError(t, err, "Open (ReadOnly) must not error") + t.Cleanup(func() { _ = b2.Close() }) + + v, isNull := embedGenOf(t, db, 1) + assert.False(t, isNull, "ReadOnly Open must NOT reset the orphaned embed_gen") + assert.Equal(t, int64(99), v, "orphaned stamp unchanged under ReadOnly Open") +} + +// TestResetOrphanedEmbedGen_SkipMigrate_Management_Resets is the inverse of +// the ReadOnly test and the heart of the bug fix: a WRITABLE management open +// (SkipMigrate=true to avoid CREATE EXTENSION, ReadOnly=false) MUST still run +// the orphan reset — gating moved from SkipMigrate to ReadOnly. Pre-fix this +// open performed NO writes (reset was gated on !SkipMigrate), leaving the +// orphaned stamp in place. +func TestResetOrphanedEmbedGen_SkipMigrate_Management_Resets(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + + _, err := db.Exec(`INSERT INTO messages (id) VALUES (1)`) + require.NoError(t, err, "insert message") + + // Bring up the schema via a writable full Open, then stamp an orphan. + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open (writable, migrate)") + t.Cleanup(func() { _ = b.Close() }) + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = 99`) + require.NoError(t, err, "stamp orphaned embed_gen=99") + + // Management open: SkipMigrate (no CREATE EXTENSION) but writable. The + // reset must run and clear the orphaned stamp. + b2, err := Open(ctx, Options{DB: db, Dimension: 4, SkipMigrate: true}) + require.NoError(t, err, "Open (SkipMigrate, writable) must not error") + t.Cleanup(func() { _ = b2.Close() }) + + _, isNull := embedGenOf(t, db, 1) + assert.True(t, isNull, "writable management Open MUST reset the orphaned embed_gen") +} + +// pgRegclassExists reports whether a relation resolves in the connection's +// search_path (per-test schema first, then public). 
Used to assert the +// extension-less schema apply created embed_watermark on a management Open. +func pgRegclassExists(t *testing.T, db *sql.DB, rel string) bool { + t.Helper() + var name *string + require.NoError(t, db.QueryRow(`SELECT to_regclass($1)::text`, rel).Scan(&name)) + return name != nil +} + +// TestManagementOpen_FiresUpgradeBackfill is the primary regression test for +// the PG upgrade-backfill bug: on PostgreSQL the one-time embed_gen backfill + +// embed_watermark creation were gated behind SkipMigrate, which the writable +// management/coverage commands set true (to avoid the privileged CREATE +// EXTENSION). A post-upgrade PG archive therefore reported its whole corpus as +// missing (embed_gen NULL everywhere) on the first management command. +// +// The fix gates the one-time upgrade (extension-less schema apply + orphan +// reset + backfill) on !ReadOnly, not !SkipMigrate. A writable management open +// (SkipMigrate=true, ReadOnly=false) now backfills like SQLite. +// +// Setup: an UPGRADED-but-unstamped archive — messages present, embeddings +// present under an active generation, but embed_gen NULL on the embedded rows +// (the ADD COLUMN did no backfill) and the ledger cleared. Then a fresh +// management-style Open must stamp the embedded rows, leave embed_watermark in +// place, and set the ledger key. +// +// Pre-fix failure: with the old gating (the whole upgrade block behind +// !SkipMigrate), a SkipMigrate=true Open performed NO writes — no schema-only +// migrate, no backfill — so embed_gen stayed NULL on rows 1 and 2 and the +// ledger key stayed absent. The post-fix assertions below (stamped == gen, +// ledger present) would fail. Verified by reasoning against the pre-fix Open +// body (a single `if !opts.SkipMigrate { ... }` guarding migrate+reset+ +// backfill); reverting Open to that shape makes this test red. 
+func TestManagementOpen_FiresUpgradeBackfill(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + // 3 messages: 1 and 2 embedded under the active gen, 3 not. + for _, id := range []int64{1, 2, 3} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + // Stand up a serve/build-style archive: full Open migrates the schema and + // builds + activates a generation with embeddings for messages 1 and 2. + setup, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "setup Open (full migrate)") + gen, err := setup.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, setup.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + }), "Upsert") + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1 WHERE id IN (1,2)`, int64(gen)) + require.NoError(t, err, "stamp") + require.NoError(t, setup.ActivateGeneration(ctx, gen, true), "Activate (force)") + require.NoError(t, setup.Close(), "close setup backend") + + // Simulate the upgrade: embeddings + active gen present, but embed_gen + // reset to NULL on the embedded rows, and the backfill ledger cleared (the + // full Open above already marked it when no gen existed — clear it so this + // reproduces the real first-post-upgrade timing). 
+ _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen to NULL (simulate upgrade)") + _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "clear backfill ledger") + // Also drop embed_watermark to prove the management open's schema-only + // migrate re-creates it (a pre-upgrade archive predates the table). + _, err = db.ExecContext(ctx, `DROP TABLE IF EXISTS embed_watermark`) + require.NoError(t, err, "drop embed_watermark to prove schema-only re-apply") + + // THE FIX: a writable management-style Open (SkipMigrate=true to skip the + // privileged CREATE EXTENSION, ReadOnly=false). It must apply the + // extension-less schema (re-creating embed_watermark) and run the one-time + // backfill (stamping embed_gen for the embedded rows). + mgmt, err := Open(ctx, Options{DB: db, Dimension: 4, SkipMigrate: true}) + require.NoError(t, err, "management Open (SkipMigrate, writable)") + t.Cleanup(func() { _ = mgmt.Close() }) + + // Backfill ran: messages 1 and 2 are stamped for the active gen. + for _, id := range []int64{1, 2} { + v, isNull := embedGenOf(t, db, id) + assert.Falsef(t, isNull, "msg %d should be stamped by the management-open backfill", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen", id) + } + // Message 3 (never embedded) stays NULL. + _, isNull3 := embedGenOf(t, db, 3) + assert.True(t, isNull3, "msg 3 (un-embedded) stays NULL") + + // embed_watermark exists again (schema-only migrate re-applied it). + assert.True(t, pgRegclassExists(t, db, "embed_watermark"), + "embed_watermark must exist after the management open's schema-only migrate") + + // The ledger key is set so the one-time backfill never re-runs. 
+ var marked int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&marked)) + assert.Equal(t, 1, marked, "backfill ledger key must be set after management open") + + // Coverage is honest: only message 3 is now missing. + s, err := mgmt.Stats(ctx, gen) + require.NoError(t, err, "Stats") + assert.Equal(t, int64(1), s.PendingCount, "post-backfill: only msg 3 missing") +} + +// TestReadOnlyOpen_PerformsNoWrites asserts the read-only safety guarantee +// from the other direction: a ReadOnly=true Open must perform NO writes — no +// schema apply, no orphan reset, no backfill — even when an upgrade backfill +// would otherwise be due. The schema-only migrate must NOT re-create a dropped +// table, and embed_gen must stay NULL. +// +// This stands in for a truly read-only PG connection (the MCP +// store.OpenReadOnly handle), which the unit harness opens read-write; we assert the +// CODE PATH instead: ReadOnly=true ⇒ neither Migrate nor the backfill is +// attempted, observable as "no side effects on the DB". +func TestReadOnlyOpen_PerformsNoWrites(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + for _, id := range []int64{1, 2} { + _, err := db.Exec(`INSERT INTO messages (id) VALUES ($1)`, id) + require.NoError(t, err, "insert message") + } + + // Build + activate a generation with embeddings, then simulate the + // upgrade (embed_gen NULL, ledger cleared, embed_watermark dropped).
+ setup, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "setup Open") + gen, err := setup.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, setup.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, Vector: []float32{1, 0, 0, 0}}, + {MessageID: 2, Vector: []float32{0, 1, 0, 0}}, + }), "Upsert") + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = $1`, int64(gen)) + require.NoError(t, err, "stamp") + require.NoError(t, setup.ActivateGeneration(ctx, gen, true), "Activate (force)") + require.NoError(t, setup.Close(), "close setup") + + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen (simulate upgrade)") + _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + _, err = db.ExecContext(ctx, `DROP TABLE IF EXISTS embed_watermark`) + require.NoError(t, err, "drop embed_watermark") + + // Read-only Open: MUST NOT write. SkipMigrate suppresses CREATE EXTENSION + + // full migrate; ReadOnly suppresses the schema-only migrate + reset + + // backfill. + ro, err := Open(ctx, Options{DB: db, Dimension: 4, SkipMigrate: true, ReadOnly: true}) + require.NoError(t, err, "read-only Open must not error") + t.Cleanup(func() { _ = ro.Close() }) + + // No backfill: both embedded rows stay NULL. + for _, id := range []int64{1, 2} { + _, isNull := embedGenOf(t, db, id) + assert.Truef(t, isNull, "ReadOnly Open must NOT stamp msg %d", id) + } + // No schema apply: embed_watermark must NOT have been re-created. + assert.False(t, pgRegclassExists(t, db, "embed_watermark"), + "ReadOnly Open must NOT re-create embed_watermark (no schema apply)") + // No ledger mark. 
+ var marked int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&marked)) + assert.Equal(t, 0, marked, "ReadOnly Open must NOT mark the backfill ledger") +} + +// lowTimeoutMS is the SESSION statement_timeout the low-timeout backfill handle +// runs under. It is sized to sit comfortably ABOVE the backfill's cheap +// pre-flight reads (catalog probe, ledger check, active-generation lookup — +// sub-millisecond once the catalog cache is warmed, see openLowTimeoutHandle) +// yet far BELOW the stamp UPDATE over lowTimeoutSeedRows rows (≈190ms, +// measured). The fix's `SET LOCAL statement_timeout = 0` lifts it for the tx, so +// post-fix the UPDATE completes; pre-fix it is cancelled with SQLSTATE 57014. +const lowTimeoutMS = 50 + +// lowTimeoutSeedRows is the number of embedded-but-unstamped messages the +// regression test seeds. Sized so the stamp UPDATE's nested-loop semi-join over +// these rows reliably exceeds lowTimeoutMS (≈190ms at 30k, ≈4x margin) while +// staying cheap to seed (one bulk INSERT … SELECT generate_series). +const lowTimeoutSeedRows = 30000 + +// openLowTimeoutHandle opens a dedicated single-connection *sql.DB on the SAME +// per-test schema as db, with the session statement_timeout pinned to +// lowTimeoutMS. SetMaxOpenConns(1) makes the SET sticky: every query the +// backfill issues on this handle runs on the one connection that carries the low +// session timeout, so the test is deterministic — the timeout provably applies +// to the exact connection the backfill uses (no pool flakiness). The schema is +// read from db's live search_path so both handles see the same tables. 
+// +// The PostgreSQL catalog cache is WARMED (an information_schema probe of the +// same shape the backfill's messagesHasEmbedGen guard issues) BEFORE the timeout +// is lowered: the first such probe on a fresh connection is cold (~15ms), but +// warm reruns are sub-millisecond, so warming guarantees the backfill's +// pre-flight reads clear lowTimeoutMS with a large margin. Only the heavy stamp +// UPDATE is meant to trip the timeout. +func openLowTimeoutHandle(t *testing.T, db *sql.DB) *sql.DB { + t.Helper() + + // The per-test schema is the first entry of db's search_path (set by + // openPGTestDB as the per-test schema name followed by ",public"). Read it back so the low-timeout + // handle targets the SAME tables. + var searchPath string + require.NoError(t, db.QueryRow(`SHOW search_path`).Scan(&searchPath), + "read search_path from test db") + require.NotEmpty(t, searchPath, "search_path must be set on the test db") + + url := os.Getenv("MSGVAULT_TEST_DB") + require.NotEmpty(t, url, "MSGVAULT_TEST_DB must be set") + sep := "?" + if strings.Contains(url, "?") { + sep = "&" + } + lowURL := url + sep + "search_path=" + searchPath + + low, err := sql.Open("pgx", lowURL) + require.NoError(t, err, "open low-timeout handle") + // Single connection: the SET below is then sticky for every subsequent + // query on this handle (no other pooled connection to dodge the timeout). + low.SetMaxOpenConns(1) + t.Cleanup(func() { _ = low.Close() }) + + // Warm the catalog cache on this connection so the backfill's pre-flight + // reads (the same information_schema probe) are sub-millisecond, well under + // lowTimeoutMS. Done BEFORE lowering the timeout so the cold first probe is + // never itself cancelled.
+ var n int + require.NoError(t, low.QueryRow( + `SELECT COUNT(*) FROM information_schema.columns + WHERE table_name = 'messages' AND column_name = 'embed_gen' + AND table_schema = ANY (current_schemas(false))`).Scan(&n), + "warm catalog cache on low handle") + + _, err = low.Exec(fmt.Sprintf(`SET statement_timeout = '%dms'`, lowTimeoutMS)) + require.NoError(t, err, "set low statement_timeout on low handle") + return low +} + +// TestBackfillEmbedGen_CompletesUnderLowStatementTimeout is the regression test +// for the prod-corpus bug: on a real 53k-row corpus the one-time +// upgrade backfill's stamp UPDATE (a ≈28s nested-loop semi-join) was cancelled +// by the pool's 30s statement_timeout (SQLSTATE 57014) and rolled back, so the +// upgrade never completed. The fix adds `SET LOCAL statement_timeout = 0` as the +// first statement inside the backfill tx (and the orphan-reset tx). +// +// Determinism: rather than reproduce 28s of work, we pin the SESSION +// statement_timeout to lowTimeoutMS on a SINGLE-connection handle +// (SetMaxOpenConns(1)) and run the backfill on that handle. With +// lowTimeoutSeedRows embedded rows the stamp UPDATE takes ≈190ms — well over the +// 50ms timeout — so PRE-FIX it is cancelled (57014); POST-FIX the tx's +// `SET LOCAL statement_timeout = 0` lifts the timeout for exactly that tx, so +// the backfill commits and stamps embed_gen. The backfill's cheap pre-flight +// reads stay well under the timeout (catalog cache warmed in +// openLowTimeoutHandle). +// +// Ordering matters: ALL seeding (schema, messages, embeddings, the embed_gen +// reset, the ledger clear) runs on the NORMAL db handle BEFORE the low timeout +// is applied, so seeding is never itself cancelled. Only the backfill runs under +// the low timeout. 
+// +// Pre-fix verification (recorded in the task report): with backfill.go reverted +// to pr5-pre-timeoutfix, this test FAILS — BackfillEmbedGenForUpgrade returns a +// "canceling statement due to statement timeout (SQLSTATE 57014)" error and +// embed_gen stays NULL. +func TestBackfillEmbedGen_CompletesUnderLowStatementTimeout(t *testing.T) { + ctx := context.Background() + db := openPGTestDB(t) + _, err := db.Exec(`CREATE TABLE applied_migrations ( + name TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP + )`) + require.NoError(t, err, "create applied_migrations") + + // Bring up the embeddings schema (index_generations, embeddings, …) via a + // full Open on the NORMAL handle, then bulk-seed lowTimeoutSeedRows messages + // + one embedding each so the stamp UPDATE has real work to do. + b, err := Open(ctx, Options{DB: db, Dimension: 4}) + require.NoError(t, err, "Open") + t.Cleanup(func() { _ = b.Close() }) + + gen, err := b.CreateGeneration(ctx, "fake", 4, "") + require.NoError(t, err, "CreateGeneration") + + _, err = db.ExecContext(ctx, + `INSERT INTO messages (id) SELECT generate_series(1, $1)`, lowTimeoutSeedRows) + require.NoError(t, err, "bulk insert messages") + // One embedding per message under the generation. Columns mirror schema.sql's + // NOT NULL set; the vector value is irrelevant (the backfill only checks + // existence via EXISTS). 
+ _, err = db.ExecContext(ctx, + `INSERT INTO embeddings + (generation_id, message_id, chunk_index, embedded_at, source_char_len, dimension, embedding) + SELECT $1, g, 0, 0, 1, 4, ('[' || (g % 4) || ',0,0,0]')::vector + FROM generate_series(1, $2) g`, int64(gen), lowTimeoutSeedRows) + require.NoError(t, err, "bulk insert embeddings") + require.NoError(t, b.ActivateGeneration(ctx, gen, true), "Activate (force)") + + // Simulate the upgrade: embeddings + active gen present, embed_gen NULL, the + // backfill ledger cleared so the next backfill call reproduces the real + // first-post-upgrade timing. All on the NORMAL handle, before lowering the + // timeout. + _, err = db.ExecContext(ctx, `UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen to NULL (simulate upgrade)") + _, err = db.ExecContext(ctx, + `DELETE FROM applied_migrations WHERE name = $1`, embedGenBackfillMigration) + require.NoError(t, err, "clear backfill ledger") + + // NOW switch to the low-timeout single-connection handle and run the backfill + // on it. Pre-fix the stamp UPDATE is cancelled (57014); post-fix the tx's + // SET LOCAL statement_timeout = 0 lets it complete. + low := openLowTimeoutHandle(t, db) + lowBackend := &Backend{db: low} + + require.NoErrorf(t, lowBackend.BackfillEmbedGenForUpgrade(ctx), + "backfill must COMPLETE under a %dms statement_timeout (SET LOCAL lifts it)", lowTimeoutMS) + + // The stamp landed despite the low session timeout: a sample of embedded rows + // is now stamped for the active generation, and the full count matches. 
+ for _, id := range []int64{1, lowTimeoutSeedRows / 2, lowTimeoutSeedRows} { + v, isNull := embedGenOf(t, db, id) + assert.Falsef(t, isNull, "msg %d should be stamped by the low-timeout backfill", id) + assert.Equalf(t, int64(gen), v, "msg %d embed_gen", id) + } + var stamped int64 + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM messages WHERE embed_gen = $1`, int64(gen)).Scan(&stamped)) + assert.Equal(t, int64(lowTimeoutSeedRows), stamped, + "every embedded row stamped after the low-timeout backfill") + + // And the ledger is marked so the one-time backfill never re-runs. + var marked int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = $1`, + embedGenBackfillMigration).Scan(&marked)) + assert.Equal(t, 1, marked, "backfill ledger marked after completing under low timeout") +} diff --git a/internal/vector/pgvector/coverage_test.go b/internal/vector/pgvector/coverage_test.go new file mode 100644 index 000000000..2293b8d9d --- /dev/null +++ b/internal/vector/pgvector/coverage_test.go @@ -0,0 +1,92 @@ +//go:build pgvector + +package pgvector + +import ( + "context" + "testing" + + assertpkg "github.com/stretchr/testify/assert" + requirepkg "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/vector" +) + +// TestCoverageSplit_EmbeddedBlankMissing mirrors the sqlitevec coverage +// test for the PostgreSQL backend: it proves EmbeddedMessageCount counts +// only messages with a real vector row, so the display-layer blank count +// (stamped - embedded) is a true "stamped but unembeddable" detector. +// +// Skips unless MSGVAULT_TEST_DB points at a live PostgreSQL with pgvector. +// It uses the minimal main schema the pgvector tests stand up, so live / +// stamped / missing are computed here with the same predicate +// store.CoverageCounts uses (the real CoverageCounts is exercised against +// the full schema in the backend-agnostic store coverage test). 
+func TestCoverageSplit_EmbeddedBlankMissing(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + + db := openPGTestDB(t) // skips when MSGVAULT_TEST_DB is unset + b, err := Open(ctx, Options{DB: db, Dimension: 8}) + require.NoError(err, "Open backend") + t.Cleanup(func() { _ = b.Close() }) + + // 5 live messages: 2 embedded, 2 blank, 1 missing. + ids := []int64{1, 2, 3, 4, 5} + for _, id := range ids { + _, err := db.ExecContext(ctx, + `INSERT INTO messages (id) VALUES ($1) ON CONFLICT DO NOTHING`, id) + require.NoErrorf(err, "insert message %d", id) + } + embedded := []int64{1, 2} + blanks := []int64{3, 4} + // id 5 stays missing (embed_gen NULL). + + gen, err := b.CreateGeneration(ctx, "test-model", 8, "fp") + require.NoError(err, "CreateGeneration") + + vec := func(seed float32) []float32 { + v := make([]float32, 8) + v[0] = seed + return v + } + require.NoError(b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: embedded[0], Vector: vec(1)}, + {MessageID: embedded[1], Vector: vec(2)}, + }), "Upsert embedded vectors") + + // Stamp embedded + blank rows DONE; blanks get no vector. + for _, id := range append(append([]int64{}, embedded...), blanks...) { + _, err := db.ExecContext(ctx, + `UPDATE messages SET embed_gen = $1 WHERE id = $2`, int64(gen), id) + require.NoErrorf(err, "stamp embed_gen for msg %d", id) + } + + // live / stamped / missing computed with the same predicate + // store.CoverageCounts uses (no soft-deletes here, so the live filter is + // just deleted_at/deleted_from_source_at IS NULL — all 5 qualify). 
+ var live, stamped int64 + require.NoError(db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE deleted_at IS NULL AND deleted_from_source_at IS NULL`).Scan(&live), + "count live") + require.NoError(db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE embed_gen = $1 AND deleted_at IS NULL AND deleted_from_source_at IS NULL`, + int64(gen)).Scan(&stamped), "count stamped") + missing := live - stamped + + embeddedCount, err := b.EmbeddedMessageCount(ctx, gen) + require.NoError(err, "EmbeddedMessageCount") + blank := max(stamped-embeddedCount, 0) + + assert.Equal(int64(5), live, "live = all 5 messages") + assert.Equal(int64(4), stamped, "stamped = 4 (2 embedded + 2 blank)") + assert.Equal(int64(2), embeddedCount, "embedded = 2 (distinct message_ids with a vector)") + assert.Equal(int64(2), blank, "blank = stamped - embedded = 2") + assert.Equal(int64(1), missing, "missing = 1 (never stamped)") + + assert.Equal(live, embeddedCount+blank+missing, + "invariant: live == embedded + blank + missing") +} diff --git a/internal/vector/pgvector/ext_stub.go b/internal/vector/pgvector/ext_stub.go index bd49ecc7f..f82e31309 100644 --- a/internal/vector/pgvector/ext_stub.go +++ b/internal/vector/pgvector/ext_stub.go @@ -25,6 +25,7 @@ type Options struct { DB *sql.DB Dimension int SkipMigrate bool + ReadOnly bool SkipExtension bool } @@ -90,16 +91,21 @@ func (b *Backend) Stats(_ context.Context, _ vector.GenerationID) (vector.Stats, return vector.Stats{}, ErrNotBuilt } -// EnsureSeeded always returns ErrNotBuilt in non-pgvector builds. -func (b *Backend) EnsureSeeded(_ context.Context, _ vector.GenerationID) error { - return ErrNotBuilt -} - // LoadVector always returns ErrNotBuilt in non-pgvector builds. func (b *Backend) LoadVector(_ context.Context, _ int64) ([]float32, error) { return nil, ErrNotBuilt } +// ResetWatermarkBelow always returns ErrNotBuilt in non-pgvector builds. 
+func (b *Backend) ResetWatermarkBelow(_ context.Context, _ int64) error { + return ErrNotBuilt +} + +// EmbeddedMessageCount always returns ErrNotBuilt in non-pgvector builds. +func (b *Backend) EmbeddedMessageCount(_ context.Context, _ vector.GenerationID) (int64, error) { + return 0, ErrNotBuilt +} + // Compile-time check that the stub matches the vector.Backend // interface. Keeping the assertion here means changes to the interface // break stub and real builds in lockstep. diff --git a/internal/vector/pgvector/migrate.go b/internal/vector/pgvector/migrate.go index ea89248f2..14077e240 100644 --- a/internal/vector/pgvector/migrate.go +++ b/internal/vector/pgvector/migrate.go @@ -50,7 +50,7 @@ func Migrate(ctx context.Context, db migrateExecer, defaultDim int, skipExtensio // Wrap the schema apply AND the redundant-index drop in a single // transaction that disables the pool-wide 30s statement_timeout - // (finding S1, mirroring EnsureVectorIndex/seedPending). Two reasons: + // (finding S1, mirroring EnsureVectorIndex). Two reasons: // - DROP INDEX takes an ACCESS EXCLUSIVE lock; on a busy serve daemon // the lock-wait alone can exceed 30s. // - On a legacy populated DB, schema.sql's `CREATE INDEX IF NOT EXISTS` @@ -82,6 +82,18 @@ func Migrate(ctx context.Context, db migrateExecer, defaultDim int, skipExtensio if _, err := tx.ExecContext(ctx, `DROP INDEX IF EXISTS idx_embeddings_gen_msg`); err != nil { return fmt.Errorf("drop redundant idx_embeddings_gen_msg: %w", err) } + // NOTE: the dead pending_embeddings queue table is NOT dropped here. The + // scan-and-fill design replaced the per-generation seed queue with a live + // messages.embed_gen scan, so the table is otherwise unused — BUT the + // one-time upgrade backfill (BackfillEmbedGenForUpgrade) must first consult + // it to preserve the legacy "pending re-embed" signal: a message could + // carry BOTH a stale active-gen embedding AND an active-gen pending row. 
+ // Dropping it here, before the backfill reads it, would let the backfill + // stamp those messages "covered" and never re-embed them. The drop now + // happens in the writable Open path AFTER the backfill has consulted it + // (see dropDeadPendingEmbeddings, called from Open). Migrate runs on + // read-only opens too, where dropping would be wrong (it must linger until + // a writable open honors the signal). if err := tx.Commit(); err != nil { return fmt.Errorf("commit pgvector migrate tx: %w", err) } diff --git a/internal/vector/pgvector/review_fixes_test.go b/internal/vector/pgvector/review_fixes_test.go index 0e30cd187..e457a038c 100644 --- a/internal/vector/pgvector/review_fixes_test.go +++ b/internal/vector/pgvector/review_fixes_test.go @@ -222,6 +222,36 @@ func TestMigrate_DropsPreExistingGenMsgIndex(t *testing.T) { "re-migrate must drop the legacy idx_embeddings_gen_msg") } +// TestMigrate_KeepsDeadPendingEmbeddings pins that Migrate ALONE no longer +// drops the dead pending_embeddings queue table: the one-time upgrade backfill +// (BackfillEmbedGenForUpgrade) must first consult it to preserve its legacy +// re-embed signal. The drop moved to the writable Open path, +// AFTER the backfill — see TestOpen_DropsDeadPendingEmbeddings and +// TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal. Migrate runs on +// read-only opens too, where dropping (before the signal is honored on a later +// writable open) would be wrong. +func TestMigrate_KeepsDeadPendingEmbeddings(t *testing.T) { + db := openPGTestDB(t) + ctx := context.Background() + require.NoError(t, Migrate(ctx, db, 0, false), "first Migrate") + + // Stand up a legacy pending_embeddings table, then re-migrate. 
+ _, err := db.ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id BIGINT NOT NULL, + message_id BIGINT NOT NULL + )`) + require.NoError(t, err, "create legacy pending_embeddings") + var reg *string + require.NoError(t, db.QueryRowContext(ctx, + `SELECT to_regclass('pending_embeddings')::text`).Scan(&reg)) + require.NotNil(t, reg, "legacy table should exist before re-migrate") + + require.NoError(t, Migrate(ctx, db, 0, false), "second Migrate") + require.NoError(t, db.QueryRowContext(ctx, + `SELECT to_regclass('pending_embeddings')::text`).Scan(&reg)) + assert.NotNil(t, reg, "Migrate alone must NOT drop pending_embeddings (Open does, after the backfill consults it)") +} + // TestMigrate_SkipExtension (V5 / finding B3) asserts that the skipExtension // flag is HONORED — not merely that the schema objects exist (which would also // be true for an impl that ignored the flag and ran the harmless @@ -257,7 +287,7 @@ func TestMigrate_SkipExtension(t *testing.T) { assertHatchedDDL(t, tracer) // Schema tables exist. - for _, table := range []string{"index_generations", "embeddings", "pending_embeddings", "embed_runs"} { + for _, table := range []string{"index_generations", "embeddings", "embed_watermark", "embed_runs"} { var reg sql.NullString require.NoError(t, db.QueryRowContext(ctx, `SELECT to_regclass($1)::text`, table).Scan(&reg), @@ -338,7 +368,8 @@ func assertCreateExtensionOutsideTx(t *testing.T, tracer *sqlTracer) { // TestOpen_SkipExtensionWiring (V5) pins the Options.SkipExtension wiring: // Open with SkipExtension:true must succeed and produce a working backend // (schema created without running CREATE EXTENSION). Distinct from -// SkipMigrate, which suppresses all DDL. +// SkipMigrate (suppresses CREATE EXTENSION + the heavy full migrate) and +// ReadOnly (suppresses ALL writes).
func TestOpen_SkipExtensionWiring(t *testing.T) { db := openPGTestDB(t) ctx := context.Background() @@ -516,8 +547,6 @@ func TestSearch_FilteredInlineExists_MultiChunk(t *testing.T) { {MessageID: 1, ChunkIndex: 1, Vector: unitVec(4, 2)}, {MessageID: 2, ChunkIndex: 0, Vector: unitVec(4, 1)}, }), "Upsert") - _, err = b.db.ExecContext(ctx, `DELETE FROM pending_embeddings WHERE generation_id = $1`, int64(gen)) - require.NoError(t, err, "clear pending") hits, err := b.Search(ctx, gen, unitVec(4, 0), 10, vector.Filter{SourceIDs: []int64{10}}) require.NoError(t, err, "Search") diff --git a/internal/vector/pgvector/schema.sql b/internal/vector/pgvector/schema.sql index c6c888d25..ba4b13471 100644 --- a/internal/vector/pgvector/schema.sql +++ b/internal/vector/pgvector/schema.sql @@ -12,11 +12,11 @@ CREATE TABLE IF NOT EXISTS index_generations ( dimension INTEGER NOT NULL, fingerprint TEXT NOT NULL, started_at BIGINT NOT NULL, - -- seeded_at marks when the initial pending_embeddings seed pass - -- finished. NULL means "row inserted but seed never committed" - -- (e.g. crash between insert and seed) — the resume path re-runs - -- seedPending in that case rather than activating an empty - -- generation. + -- seeded_at is stamped at CreateGeneration as harmless vestigial + -- metadata. Under the scan-and-fill design there is no separate seed + -- pass, and activation no longer gates on it (coverage — missing==0 — + -- is the real gate). Retained only so the column stays populated for + -- legacy display; no destructive migration drops it. 
seeded_at BIGINT, completed_at BIGINT, activated_at BIGINT, @@ -63,19 +63,6 @@ CREATE TABLE IF NOT EXISTS embeddings ( CREATE INDEX IF NOT EXISTS idx_embeddings_msg ON embeddings(message_id); CREATE INDEX IF NOT EXISTS idx_embeddings_dim ON embeddings(dimension); -CREATE TABLE IF NOT EXISTS pending_embeddings ( - generation_id BIGINT NOT NULL REFERENCES index_generations(id) ON DELETE CASCADE, - message_id BIGINT NOT NULL, - enqueued_at BIGINT NOT NULL, - claimed_at BIGINT, - claim_token TEXT, - PRIMARY KEY (generation_id, message_id) -); -CREATE INDEX IF NOT EXISTS idx_pending_available - ON pending_embeddings(generation_id, message_id) WHERE claimed_at IS NULL; -CREATE INDEX IF NOT EXISTS idx_pending_claims - ON pending_embeddings(claimed_at) WHERE claimed_at IS NOT NULL; - CREATE TABLE IF NOT EXISTS embed_runs ( id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, generation_id BIGINT NOT NULL REFERENCES index_generations(id), @@ -87,3 +74,15 @@ CREATE TABLE IF NOT EXISTS embed_runs ( truncated INTEGER NOT NULL DEFAULT 0, error TEXT ); + +-- embed_watermark tracks the highest message id the scan-and-fill embed +-- worker has already swept for a generation, so each RunOnce resumes the +-- forward scan instead of re-scanning the whole messages table. It is a +-- pure optimization: losing it only makes the next scan start at id 0, +-- which is harmless (the scan predicate + idempotent upsert make +-- re-sweeping covered rows a no-op). The full-scan backstop ignores it. +-- See internal/vector/sqlitevec/schema.sql for the full contract. 
+CREATE TABLE IF NOT EXISTS embed_watermark ( + generation_id BIGINT PRIMARY KEY, + watermark_id BIGINT NOT NULL DEFAULT 0 +); diff --git a/internal/vector/sqlitevec/backend.go b/internal/vector/sqlitevec/backend.go index d4846dd5b..fda7a49bb 100644 --- a/internal/vector/sqlitevec/backend.go +++ b/internal/vector/sqlitevec/backend.go @@ -32,6 +32,14 @@ type Options struct { MainPath string // filesystem path to msgvault.db; required for FusedSearch Dimension int // default dimension for EnsureVectorTable at open MainDB *sql.DB // handle to the main msgvault.db + // ReadOnly indicates the main DB handle (MainDB) was opened read-only + // — e.g. the MCP server's store.OpenReadOnly (_query_only=true). When + // set, Open SKIPS BackfillEmbedGenForUpgrade, which would otherwise + // WRITE messages.embed_gen + applied_migrations through the read-only + // main handle and fail. This mirrors pgvector.Options.ReadOnly's + // read-only guard. Migrate still runs because it only writes vectors.db, + // which is opened read-write regardless. + ReadOnly bool } // Backend implements vector.Backend and vector.FusingBackend against a @@ -42,6 +50,10 @@ type Backend struct { path string // filesystem path to vectors.db mainPath string // filesystem path to msgvault.db (for ATTACH) dim int + // readOnly is true when mainDB was opened read-only (MCP). The + // one-time upgrade backfill self-guards on it so it never writes + // through the read-only main handle. See Options.ReadOnly.
+ readOnly bool } // Open opens vectors.db, runs migrations, and retains the main database @@ -58,13 +70,53 @@ func Open(ctx context.Context, opts Options) (*Backend, error) { _ = db.Close() return nil, fmt.Errorf("migrate vectors.db: %w", err) } - return &Backend{ + b := &Backend{ db: db, mainDB: opts.MainDB, path: opts.Path, mainPath: opts.MainPath, dim: opts.Dimension, - }, nil + readOnly: opts.ReadOnly, + } + // Orphaned-stamp reset (vectors.db-recreate safety): clear embed_gen for + // any message whose stamp points to a generation id that no longer exists + // in index_generations. This MUST run BEFORE BackfillEmbedGenForUpgrade so + // a freshly recreated vectors.db (empty index_generations, ids restarting + // at 1) cannot reuse an old gen id whose stale stamps would mask coverage. + // Not ledger-guarded: a recreate can happen between any two process + // starts, so it re-checks on every writable Open (cheap + idempotent). + // Self-guards on b.mainDB == nil / b.readOnly exactly like the backfill. + if err := b.resetOrphanedEmbedGen(ctx); err != nil { + _ = db.Close() + return nil, fmt.Errorf("reset orphaned embed_gen: %w", err) + } + // One-time upgrade backfill (Package A): stamp embed_gen for messages + // already embedded under the active generation, so an upgraded v0.14– + // v0.15 archive does not read as entirely missing and trigger a full + // re-embed. Ledger-guarded, so it runs at most once. No-ops when the + // main DB handle is absent (management commands), already applied, or + // the main handle is read-only (MCP) — the backfill self-guards on + // b.readOnly so it never WRITES through a query-only main handle. Migrate + // above still ran: it only writes vectors.db, which is read-write here. 
+ if err := b.BackfillEmbedGenForUpgrade(ctx); err != nil { + _ = db.Close() + return nil, fmt.Errorf("embed_gen upgrade backfill: %w", err) + } + // Drop the dead pending_embeddings queue table now that the backfill has + // consulted it: the backfill preserves the table's legacy + // re-embed signal, then we drop it here. Gated to the writable path — + // mirrors the backfill's own b.readOnly / b.mainDB guards — so a read-only + // Open leaves the table (and its signal) for a later writable open. Skipped + // when the main handle is absent (management commands) so a backend opened + // without a writable main DB does not mutate vectors.db schema unexpectedly. + // Idempotent. + if b.mainDB != nil && !b.readOnly { + if err := b.dropDeadPendingEmbeddings(ctx); err != nil { + _ = db.Close() + return nil, fmt.Errorf("drop dead pending_embeddings: %w", err) + } + } + return b, nil } // Close releases the vectors.db handle. @@ -78,27 +130,20 @@ func (b *Backend) DB() *sql.DB { return b.db } func (b *Backend) Path() string { return b.path } // CreateGeneration allocates a new building generation (§5.1 of the -// spec) and seeds pending_embeddings with every currently-embeddable -// message. If a building generation already exists with the same -// fingerprint, returns its id so a crashed or interrupted rebuild can -// resume; on resume the seed pass is skipped iff the previous attempt -// recorded `seeded_at` (so messages that the previous attempt already -// embedded — and Queue.Complete therefore already removed from -// pending_embeddings — are NOT re-enqueued). When the previous attempt -// crashed BEFORE the seed transaction committed, seeded_at is NULL -// and we re-run seedPending so we don't activate an empty generation. -// seedPending uses INSERT OR IGNORE, so rerunning it is safe regardless -// of what the Enqueuer has dual-enqueued in the meantime. +// spec). 
Under the scan-and-fill design there is no pending_embeddings +// seed: a building generation is just a row, and the embed worker +// populates it by scanning messages whose embed_gen does not yet match +// the generation. seeded_at is stamped at creation as harmless vestigial +// metadata (it no longer gates a seed pass and no longer gates +// activation; coverage is the real gate). // -// A mismatched fingerprint returns an error wrapping -// vector.ErrBuildingInProgress so the caller can surface an actionable -// message rather than a raw unique-index violation. -// -// Concurrency note: the new building generation is committed *before* -// seeding so that a concurrent Enqueuer (driven by sync) immediately -// sees the new generation and dual-enqueues newly-synced messages. The -// seed loop then uses INSERT OR IGNORE, so any rows the Enqueuer has -// already written are silently de-duplicated and nothing is missed. +// If a building generation already exists with the same fingerprint, +// returns its id so a crashed or interrupted rebuild can resume — +// scan-and-fill simply continues from wherever the previous attempt left +// off (covered rows are skipped by the scan predicate). A mismatched +// fingerprint returns an error wrapping vector.ErrBuildingInProgress so +// the caller can surface an actionable message rather than a raw +// unique-index violation. 
func (b *Backend) CreateGeneration(ctx context.Context, model string, dim int, fingerprint string) (vector.GenerationID, error) { if err := EnsureVectorTable(ctx, b.db, dim); err != nil { return 0, err @@ -115,97 +160,13 @@ func (b *Backend) CreateGeneration(ctx context.Context, model string, dim int, f } now := time.Now().Unix() - gen, isNew, err := b.claimOrInsertBuilding(ctx, model, dim, fp, now) + gen, _, err := b.claimOrInsertBuilding(ctx, model, dim, fp, now) if err != nil { return 0, err } - - if !isNew { - // Resume path: only skip seedPending when the prior attempt's - // seed transaction committed. seeded_at IS NULL means the - // process died between the building-row insert and the seed - // commit; pending_embeddings is empty (or only contains - // dual-enqueued rows from concurrent Enqueuer activity), so - // activating now would silently replace a valid active index - // with an unseeded one. Re-run seedPending; the INSERT OR - // IGNORE statements de-duplicate against any dual-enqueued or - // already-completed rows. - seeded, err := b.isGenerationSeeded(ctx, gen) - if err != nil { - return 0, err - } - if seeded { - return gen, nil - } - // Fall through to seedPending + mark seeded. - } - if err := b.seedPending(ctx, gen, now); err != nil { - return 0, err - } - if err := b.markGenerationSeeded(ctx, gen, now); err != nil { - return 0, err - } return gen, nil } -// isGenerationSeeded reports whether the initial seedPending pass for -// gen committed (seeded_at IS NOT NULL). -func (b *Backend) isGenerationSeeded(ctx context.Context, gen vector.GenerationID) (bool, error) { - var seededAt sql.NullInt64 - err := b.db.QueryRowContext(ctx, - `SELECT seeded_at FROM index_generations WHERE id = ?`, int64(gen)).Scan(&seededAt) - if err != nil { - return false, fmt.Errorf("read seeded_at: %w", err) - } - return seededAt.Valid, nil -} - -// markGenerationSeeded stamps seeded_at on gen so future resume calls -// know the initial seed pass committed. 
Idempotent: COALESCE preserves -// the original timestamp when called more than once. -func (b *Backend) markGenerationSeeded(ctx context.Context, gen vector.GenerationID, now int64) error { - if _, err := b.db.ExecContext(ctx, - `UPDATE index_generations SET seeded_at = COALESCE(seeded_at, ?) WHERE id = ?`, - now, int64(gen)); err != nil { - return fmt.Errorf("mark generation seeded: %w", err) - } - return nil -} - -// EnsureSeeded re-runs the initial seed pass for gen if seeded_at is -// NULL. Used on the resume path so that a crash between -// claimOrInsertBuilding and the original seedPending commit cannot -// cause a later `msgvault embeddings build` to "drain" zero pending rows and -// activate an unseeded generation. Returns an error if gen no longer -// exists or has been activated/retired (state != 'building'); the -// caller should surface --full-rebuild guidance in that case. -func (b *Backend) EnsureSeeded(ctx context.Context, gen vector.GenerationID) error { - var state string - err := b.db.QueryRowContext(ctx, - `SELECT state FROM index_generations WHERE id = ?`, int64(gen)).Scan(&state) - if errors.Is(err, sql.ErrNoRows) { - return fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) - } - if err != nil { - return fmt.Errorf("lookup generation %d: %w", gen, err) - } - if state != string(vector.GenerationBuilding) { - return fmt.Errorf("%w: generation %d state=%q", vector.ErrGenerationNotBuilding, gen, state) - } - seeded, err := b.isGenerationSeeded(ctx, gen) - if err != nil { - return err - } - if seeded { - return nil - } - now := time.Now().Unix() - if err := b.seedPending(ctx, gen, now); err != nil { - return err - } - return b.markGenerationSeeded(ctx, gen, now) -} - // claimOrInsertBuilding returns (id, isNew, err). isNew=true means // this call inserted a fresh building row; isNew=false means we // reused an existing building row whose fingerprint matched. 
Reusing @@ -228,11 +189,15 @@ func (b *Backend) claimOrInsertBuilding(ctx context.Context, model string, dim i return id, false, nil } + // seeded_at is stamped at creation as harmless vestigial metadata: + // scan-and-fill has no separate seed pass, and activation no longer + // gates on it (coverage is the real gate). Kept only so the column is + // populated for legacy display. res, err := b.db.ExecContext(ctx, `INSERT INTO index_generations - (model, dimension, fingerprint, started_at, state) - VALUES (?, ?, ?, ?, 'building')`, - model, dim, fp, now) + (model, dimension, fingerprint, started_at, seeded_at, state) + VALUES (?, ?, ?, ?, ?, 'building')`, + model, dim, fp, now, now) if err != nil { // A concurrent CreateGeneration may have inserted between our // SELECT and INSERT. The unique partial index on (state) where @@ -299,59 +264,67 @@ func isUniqueConstraintErr(err error) bool { sqliteErr.ExtendedCode == sqlite3.ErrConstraintPrimaryKey } -// seedPending inserts one pending_embeddings row per non-deleted -// message in the main DB. Uses INSERT OR IGNORE so rows that the -// Enqueuer already added for this generation (via the dual-enqueue -// path) are silently de-duplicated, and the operation is safe to -// retry if interrupted. Runs under a single vectors.db transaction so -// the seed itself is atomic. -func (b *Backend) seedPending(ctx context.Context, gen vector.GenerationID, now int64) error { - // Embedding-seeding: skip dedup-hidden and remote-deleted rows - // using the canonical live-message predicate - // (store.LiveMessagesWhere). Dedup Execute does not remove - // vector-store rows by design: if a message is embedded then later - // soft-deleted, the embedding stays in the vector store and - // query-time live filtering (dropDeletedFromSource, - // filteredMessageIDs) enforces the live-message contract. 
- rows, err := b.mainDB.QueryContext(ctx, - `SELECT id FROM messages WHERE `+store.LiveMessagesWhere("", true)) +// hasMissingForGen reports whether any live message in the main DB still +// needs embedding for gen (embed_gen IS NULL OR embed_gen <> gen). This is +// the scan-and-fill coverage gate. On SQLite the messages live in the main +// DB while the generation lifecycle lives in vectors.db, so the gate +// cannot be folded into the activation tx — ActivateGeneration runs this +// Go pre-check against b.mainDB before its vectors.db tx (mirroring the +// intentionally-non-atomic scheduler gate). The full-scan backstop covers +// any TOCTOU window between this read and the flip. +func (b *Backend) hasMissingForGen(ctx context.Context, gen vector.GenerationID) (bool, error) { + var exists int + err := b.mainDB.QueryRowContext(ctx, + `SELECT EXISTS ( + SELECT 1 FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND `+store.LiveMessagesWhere("", true)+` + )`, int64(gen)).Scan(&exists) if err != nil { - return fmt.Errorf("select messages: %w", err) + return false, fmt.Errorf("check missing coverage for generation %d: %w", gen, err) } - defer func() { _ = rows.Close() }() + return exists == 1, nil +} - tx, err := b.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin seed tx: %w", err) +// ActivateGeneration atomically retires the current active generation +// (if any) and promotes `gen` to active. +func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationID, force bool) error { + // Lifecycle pre-check: verify gen exists AND is in 'building' state + // BEFORE the coverage pre-check below. The coverage predicate + // (embed_gen IS NULL OR embed_gen <> gen) is true for an unknown gen id, + // so an unknown/non-building gen would otherwise surface the misleading + // "messages needing embedding" coverage error instead of the real + // lifecycle error. 
The vectors.db tx's gated UPDATE re-derives this + // invariant atomically (via activateGateError); this read-only lookup + // just orders the errors correctly. Force does not bypass it — a force + // activation of an unknown/non-building gen is still a lifecycle error, + // matching the tx's WHERE id = ? AND state = 'building' clause. + var state vector.GenerationState + if err := b.db.QueryRowContext(ctx, + `SELECT state FROM index_generations WHERE id = ?`, int64(gen)).Scan(&state); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) + } + return fmt.Errorf("lookup generation %d: %w", gen, err) } - defer func() { _ = tx.Rollback() }() - - stmt, err := tx.PrepareContext(ctx, - `INSERT OR IGNORE INTO pending_embeddings (generation_id, message_id, enqueued_at) - VALUES (?, ?, ?)`) - if err != nil { - return fmt.Errorf("prepare pending insert: %w", err) + if state != vector.GenerationBuilding { + return fmt.Errorf("generation %d not in 'building' state", gen) } - defer func() { _ = stmt.Close() }() - for rows.Next() { - var id int64 - if err := rows.Scan(&id); err != nil { + // Coverage pre-check: refuse to activate a generation that still + // has live messages needing embedding, unless force. Cross-DB on + // SQLite, so it runs here as a Go pre-check before the vectors.db tx; + // the backstop covers the TOCTOU window. + if !force { + missing, err := b.hasMissingForGen(ctx, gen) + if err != nil { return err } - if _, err := stmt.ExecContext(ctx, int64(gen), id, now); err != nil { - return fmt.Errorf("insert pending: %w", err) + if missing { + return fmt.Errorf("generation %d still has messages needing embedding; run `msgvault embeddings resume` or pass --force", gen) } } - if err := rows.Err(); err != nil { - return err - } - return tx.Commit() -} -// ActivateGeneration atomically retires the current active generation -// (if any) and promotes `gen` to active. 
-func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationID, force bool) error { now := time.Now().Unix() tx, err := b.db.BeginTx(ctx, nil) if err != nil { @@ -359,96 +332,62 @@ func (b *Backend) ActivateGeneration(ctx context.Context, gen vector.GenerationI } defer func() { _ = tx.Rollback() }() - // Demote the current active generation and capture its id in a single - // statement via RETURNING (SQLite 3.35+), so the id whose queue rows we reap - // below is provably the row this UPDATE retired (no separate SELECT that - // could diverge). No active row -> no row returned -> demoted invalid -> the - // reap is skipped, exactly as before. Done inside the tx so the demote+reap - // is atomic with the activation below. - var demoted sql.NullInt64 - if err := tx.QueryRowContext(ctx, + // Demote the current active generation (if any). sqlitevec retains + // retired generations' vectors (its vec0 PARTITION KEY isolates them), + // so there is nothing else to reap. Done inside the tx so the demote is + // atomic with the activation below. + if _, err := tx.ExecContext(ctx, `UPDATE index_generations SET state = 'retired', completed_at = COALESCE(completed_at, ?) - WHERE state = 'active' - RETURNING id`, now).Scan(&demoted); err != nil && - !errors.Is(err, sql.ErrNoRows) { + WHERE state = 'active'`, now); err != nil { return fmt.Errorf("retire previous active: %w", err) } - if demoted.Valid { - // Reap the demoted generation's queue rows in the same tx. Retired - // generations are never re-targeted by pickTarget, so leftover - // pending_embeddings rows would be orphaned forever (the - // index_generations row is preserved, so the ON DELETE CASCADE never - // fires). Keeps the "retired generations have zero pending items" - // stats invariant true. 
[cr2-3, cr2-4] - if _, err := tx.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ?`, demoted.Int64); err != nil { - return fmt.Errorf("delete retired generation %d pending: %w", demoted.Int64, err) - } - } - // Re-check the seeded/no-pending gate IN the activation tx (unless force). - // SQLite serializes writers, so the gate and flip are atomic once inside - // the tx: this closes the window between a CALLER's pre-flight pending read - // and this flip — no pending row committed before this statement can sneak - // gen past the gate. It does NOT prevent an enqueue that commits just AFTER - // this flip from leaving one pending row on the now-active gen; that - // post-flip row is acceptable and is drained by the embed worker's - // active-generation top-up on the next run (see embed_job.go pickTarget / - // enqueue.go). [cr2-1] + // Promote gen to active. The coverage gate (no live message still + // needs embedding for gen) was enforced by the Go pre-check above + // against the main DB; here we only enforce the lifecycle invariant + // (gen is in 'building' state). The seeded_at gate was removed: seeding + // was the old queue-population phase, which scan-and-fill no longer has, + // so a legacy/crashed gen with seeded_at=NULL but full coverage must be + // activatable. Coverage (missing==0) is the real gate. res, err := tx.ExecContext(ctx, `UPDATE index_generations SET state = 'active', activated_at = ?, completed_at = COALESCE(completed_at, ?) - WHERE id = ? AND state = 'building' - AND (? OR seeded_at IS NOT NULL) - AND (? OR NOT EXISTS ( - SELECT 1 FROM pending_embeddings WHERE generation_id = ? - ))`, now, now, int64(gen), force, force, int64(gen)) + WHERE id = ? 
AND state = 'building'`, now, now, int64(gen)) if err != nil { return fmt.Errorf("activate: %w", err) } n, _ := res.RowsAffected() if n == 0 { - return activateGateError(ctx, tx, gen, force) + return activateGateError(ctx, tx, gen) } return tx.Commit() } // activateGateError re-reads gen inside the activation tx to return a -// precise reason the gated promote affected zero rows: pending rows present, -// not finished seeding, unknown generation, or not in 'building' state. -func activateGateError(ctx context.Context, tx *sql.Tx, gen vector.GenerationID, force bool) error { - var pending int64 - if err := tx.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, int64(gen)).Scan(&pending); err != nil { - return fmt.Errorf("count pending rows for generation %d: %w", gen, err) - } - if pending > 0 && !force { - return fmt.Errorf("generation %d still has %d pending embedding rows; run `msgvault embeddings resume` or pass --force", - gen, pending) - } +// precise reason the gated promote affected zero rows: unknown +// generation or not in 'building' state. The coverage (missing) gate is +// handled by the Go pre-check in ActivateGeneration, so it is not +// re-derived here. 
+func activateGateError(ctx context.Context, tx *sql.Tx, gen vector.GenerationID) error { var state vector.GenerationState - var seededAt sql.NullInt64 if err := tx.QueryRowContext(ctx, - `SELECT state, seeded_at FROM index_generations WHERE id = ?`, int64(gen)).Scan(&state, &seededAt); err != nil { + `SELECT state FROM index_generations WHERE id = ?`, int64(gen)).Scan(&state); err != nil { if errors.Is(err, sql.ErrNoRows) { return fmt.Errorf("%w: %d", vector.ErrUnknownGeneration, gen) } return fmt.Errorf("lookup generation %d: %w", gen, err) } - if state == vector.GenerationBuilding && !seededAt.Valid && !force { - return fmt.Errorf("generation %d has not finished seeding; run `msgvault embeddings resume` or pass --force", - gen) - } return fmt.Errorf("generation %d not in 'building' state", gen) } -// RetireGeneration marks the given generation as retired and reaps its -// queue rows in one transaction. +// RetireGeneration marks the given generation as retired (a state flip +// only). sqlitevec retains the retired generation's vectors (vec0 PARTITION +// KEY isolation), so there is no queue to reap. // // Unless force is true, the state-flip UPDATE refuses to retire a generation // in state='active' (WHERE state != 'active'): if it affects zero rows the // active guard tripped, so the tx rolls back returning ErrRefuseRetireActive -// WITHOUT reaping pending rows. SQLite serializes writers, so the guard and +// leaving state unchanged. SQLite serializes writers, so the guard and // flip are atomic once inside the tx — closing the CLI's pre-flight TOCTOU so // a concurrent activation cannot retire the now-serving generation without // --force-active. force retires unconditionally (operator override). 
@@ -473,15 +412,8 @@ func (b *Backend) RetireGeneration(ctx context.Context, gen vector.GenerationID, if n, _ := res.RowsAffected(); n == 0 { return retireGateError(ctx, tx, gen, force) } - // Reap the retired generation's queue rows in the same tx so they cannot - // be orphaned (no future run re-targets a retired generation, and the - // preserved index_generations row means the ON DELETE CASCADE never - // fires). Keeps the "retired generations have zero pending items" - // stats invariant true. [cr2-2, cr2-4] - if _, err := tx.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ?`, int64(gen)); err != nil { - return fmt.Errorf("delete retired generation %d pending: %w", gen, err) - } + // Scan-and-fill has no per-generation queue to reap; sqlitevec retains + // the retired generation's vectors (vec0 PARTITION KEY isolation). if err := tx.Commit(); err != nil { return fmt.Errorf("commit retire generation %d: %w", gen, err) } @@ -565,10 +497,9 @@ func (b *Backend) generationByState(ctx context.Context, state vector.Generation // exist, and an error wrapping vector.ErrDimensionMismatch if any chunk's // vector length does not match the generation's recorded dimension. // -// Upsert does NOT touch pending_embeddings — that is the queue's -// responsibility and must go through Queue.Complete, which matches the -// claim_token so a late-finishing stale worker cannot erase the queue -// row belonging to the newer worker that has already reclaimed it. +// Upsert does NOT touch messages.embed_gen — that is the worker's +// responsibility (it stamps embed_gen AFTER a successful upsert, an +// ordered idempotent step since the two live in different DBs on SQLite). 
func (b *Backend) Upsert(ctx context.Context, gen vector.GenerationID, chunks []vector.Chunk) error { if len(chunks) == 0 { return nil @@ -830,6 +761,28 @@ func (b *Backend) LoadVector(ctx context.Context, messageID int64) ([]float32, e return blobToFloat32(blob, active.Dimension) } +// ResetWatermarkBelow lowers the embed_watermark for EVERY generation to at +// most minID-1 (clamped at 0) so a subsequent incremental RunOnce re-scans +// from below minID and re-finds rows whose embed_gen was just reset to NULL +// by repair-encoding. The watermark lives in vectors.db on SQLite (b.db). +// +// SQLite's MIN(a, b) is the scalar two-argument minimum (not the aggregate), +// so `watermark_id = MIN(watermark_id, ?)` never raises a generation's +// cursor — it only lowers one that currently sits above the new floor. minID +// < 1 is a no-op (nothing below id 1). Idempotent: a second call with the +// same or higher minID changes nothing. +func (b *Backend) ResetWatermarkBelow(ctx context.Context, minID int64) error { + if minID < 1 { + return nil + } + floorID := minID - 1 + if _, err := b.db.ExecContext(ctx, + `UPDATE embed_watermark SET watermark_id = MIN(watermark_id, ?)`, floorID); err != nil { + return fmt.Errorf("reset watermark below %d: %w", minID, err) + } + return nil +} + // Search runs an ANN query against the given generation and returns the // top-k hits (optionally intersected with a structured filter). Hits are // ordered by ascending distance and assigned 1-based ranks. 
@@ -1387,9 +1340,97 @@ func (b *Backend) Stats(ctx context.Context, gen vector.GenerationID) (vector.St if err := b.db.QueryRowContext(ctx, embeddingCountSQL, args...).Scan(&s.EmbeddingCount); err != nil { return s, fmt.Errorf("count embeddings: %w", err) } - if err := b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings `+where, args...).Scan(&s.PendingCount); err != nil { - return s, fmt.Errorf("count pending: %w", err) + // PendingCount is now "messages still needing embedding for this + // generation" (embed_gen <> gen), read from the main DB rather than a + // queue table. The aggregate path (gen == 0) has no single target + // generation, so it reports 0 — the StatsView consumer sums per-gen + // pending across the active/building generations anyway. A nil mainDB + // (e.g. management commands that open the backend without the main + // handle) reports 0 rather than failing Stats. + if gen != 0 && b.mainDB != nil { + if err := b.mainDB.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE (embed_gen IS NULL OR embed_gen <> ?) + AND `+store.LiveMessagesWhere("", true), + int64(gen)).Scan(&s.PendingCount); err != nil { + return s, fmt.Errorf("count missing: %w", err) + } } return s, nil } + +// EmbeddedMessageCount returns the number of LIVE messages that are +// stamped for gen (embed_gen = gen) AND actually have at least one vector +// for the generation. Used by the coverage readout to split stamped +// messages into embedded vs blank. Counts distinct messages (not chunk +// rows) so a long, multi-chunk message counts once, matching the +// EmbeddingCount semantic elsewhere. +// +// The liveness + stamped filter is REQUIRED for the coverage invariant +// live == embedded + blank + missing to hold. 
A non-live message +// (soft-deleted via deleted_at / deleted_from_source_at, or a dedup +// loser) keeps its embedding rows — Backend.Delete has no production +// callers — so an unfiltered COUNT(DISTINCT message_id) over the +// embeddings table can exceed stamped (which is live-only), driving +// blank = stamped - embedded negative (clamped to 0) and breaking the +// invariant (EMBEDDED could display larger than LIVE). +// +// Cross-DB on SQLite: embeddings live in vectors.db (b.db) while messages +// + embed_gen live in main.db (b.mainDB), two separate *sql.DB handles, so +// this cannot be a single JOIN. ATTACH is not used because it does not +// persist reliably across database/sql pooled connections. Instead we +// mirror the established cross-DB pattern (see dropDeletedFromSource): +// pull the distinct embedded message ids from vectors.db, then intersect +// them against the live+stamped set in main.db via json_each. A nil +// mainDB (management commands that opened the backend without the main +// handle) falls back to the unfiltered vectors.db count. +func (b *Backend) EmbeddedMessageCount(ctx context.Context, gen vector.GenerationID) (int64, error) { + if b.mainDB == nil { + var n int64 + if err := b.db.QueryRowContext(ctx, + `SELECT COUNT(DISTINCT message_id) FROM embeddings WHERE generation_id = ?`, + int64(gen)).Scan(&n); err != nil { + return 0, fmt.Errorf("count embedded messages: %w", err) + } + return n, nil + } + + // Step 1 (vectors.db): distinct message ids with >=1 vector for gen. 
+ rows, err := b.db.QueryContext(ctx, + `SELECT DISTINCT message_id FROM embeddings WHERE generation_id = ?`, + int64(gen)) + if err != nil { + return 0, fmt.Errorf("list embedded message ids: %w", err) + } + defer func() { _ = rows.Close() }() + var ids []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return 0, fmt.Errorf("scan embedded message id: %w", err) + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + return 0, fmt.Errorf("iterate embedded message ids: %w", err) + } + if len(ids) == 0 { + return 0, nil + } + + // Step 2 (main.db): how many of those are live AND stamped for gen. + blob, err := json.Marshal(ids) + if err != nil { + return 0, fmt.Errorf("encode embedded ids: %w", err) + } + var n int64 + if err := b.mainDB.QueryRowContext(ctx, + `SELECT COUNT(*) FROM messages + WHERE id IN (SELECT value FROM json_each(?)) + AND embed_gen = ? + AND `+store.LiveMessagesWhere("", true), + string(blob), int64(gen)).Scan(&n); err != nil { + return 0, fmt.Errorf("count live embedded messages: %w", err) + } + return n, nil +} diff --git a/internal/vector/sqlitevec/backend_test.go b/internal/vector/sqlitevec/backend_test.go index 5682bb29a..2b3bb55ed 100644 --- a/internal/vector/sqlitevec/backend_test.go +++ b/internal/vector/sqlitevec/backend_test.go @@ -56,34 +56,17 @@ func TestBackend_CreateActivateRetire(t *testing.T) { require.Error(err, "ActiveGeneration should error after retire") } -// pendingCount returns the number of pending_embeddings rows for a generation. -func pendingCount(t *testing.T, b *Backend, gen vector.GenerationID) int { +// missingCountSV returns 1 if any live message in the backend's main DB +// still needs embedding for gen (embed_gen IS NULL OR embed_gen <> gen), +// else 0: a presence flag, not a true count, replacing pendingCount.
+func missingCountSV(t *testing.T, b *Backend, gen vector.GenerationID) int { t.Helper() - var n int - requirepkg.NoError(t, b.db.QueryRowContext(context.Background(), - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, int64(gen)).Scan(&n), - "count pending rows") - return n -} - -// TestBackend_RetireGeneration_CleansPending pins the cr2-2/cr2-4 fix for the -// explicit-retire path: RetireGeneration must DELETE the generation's -// pending_embeddings rows in the same tx as the state flip. A retired -// generation is never re-targeted by pickTarget, so leftover queue rows would -// be orphaned forever and would violate the documented "retired generations -// have zero pending items" stats invariant. -func TestBackend_RetireGeneration_CleansPending(t *testing.T) { - b, ctx := newBackendForTest(t) - - // CreateGeneration seeds one pending row (for msg id=1 in the test main DB). - gen, err := b.CreateGeneration(ctx, "m", 768, "") - requirepkg.NoError(t, err, "CreateGeneration") - requirepkg.Equal(t, 1, pendingCount(t, b, gen), "precondition: pending row present") - - requirepkg.NoError(t, b.RetireGeneration(ctx, gen, false), "RetireGeneration") - - assertpkg.Equal(t, 0, pendingCount(t, b, gen), - "retire must delete the generation's pending_embeddings rows") + missing, err := b.hasMissingForGen(context.Background(), gen) + requirepkg.NoError(t, err, "hasMissingForGen") + if missing { + return 1 + } + return 0 } // genStateSV reads index_generations.state for a generation. @@ -99,37 +82,31 @@ func genStateSV(t *testing.T, b *Backend, gen vector.GenerationID) vector.Genera // TestBackend_RetireGeneration_ActiveGuard pins the retire-TOCTOU class-closing // fix: the active-gen guard lives ATOMICALLY inside RetireGeneration's tx. // - force=false against the ACTIVE generation is refused with -// ErrRefuseRetireActive, leaving state='active' and its pending rows intact -// (no destructive reap of the now-serving generation). 
-// - force=true retires the active generation and reaps its pending rows. +// ErrRefuseRetireActive, leaving state='active'. +// - force=true retires the active generation. // - force=false against a NON-active (building) generation retires fine. func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { b, ctx := newBackendForTest(t) - // Build + force-activate genA, leaving an undrained pending row on it. + // Build + force-activate genA (force bypasses the coverage gate so the + // one unembedded test message does not block activation). genA, err := b.CreateGeneration(ctx, "model-a", 768, "") requirepkg.NoError(t, err, "CreateGeneration A") requirepkg.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A (force)") - requirepkg.Equal(t, 1, pendingCount(t, b, genA), "precondition: pending row on active gen") requirepkg.Equal(t, vector.GenerationActive, genStateSV(t, b, genA), "precondition: A active") - // (1) Non-forced retire of the ACTIVE gen is refused atomically: sentinel - // error, state unchanged, pending rows NOT reaped. + // (1) Non-forced retire of the ACTIVE gen is refused atomically. err = b.RetireGeneration(ctx, genA, false) requirepkg.ErrorIs(t, err, vector.ErrRefuseRetireActive, "non-forced retire of active gen must return ErrRefuseRetireActive") assertpkg.Equal(t, vector.GenerationActive, genStateSV(t, b, genA), "refused retire must leave the active gen's state unchanged") - assertpkg.Equal(t, 1, pendingCount(t, b, genA), - "refused retire must NOT reap the active gen's pending rows") - // (2) Forced retire succeeds: state flips to retired and pending is reaped. + // (2) Forced retire succeeds. 
requirepkg.NoError(t, b.RetireGeneration(ctx, genA, true), "forced retire of active gen must succeed") assertpkg.Equal(t, vector.GenerationRetired, genStateSV(t, b, genA), "forced retire flips state to retired") - assertpkg.Equal(t, 0, pendingCount(t, b, genA), - "forced retire reaps the gen's pending rows") // (3) A NON-active (building) generation retires fine without force. genB, err := b.CreateGeneration(ctx, "model-b", 768, "") @@ -141,39 +118,123 @@ func TestBackend_RetireGeneration_ActiveGuard(t *testing.T) { "non-active gen retires to retired without force") } -// TestBackend_ActivateGeneration_AutoRetireCleansPending pins the -// cr2-3/cr2-4 fix for the auto-retire path: activating a new generation must -// reap the demoted (now-retired) generation's pending_embeddings rows in the -// same tx as the state flip. -func TestBackend_ActivateGeneration_AutoRetireCleansPending(t *testing.T) { +// TestBackend_ActivateGeneration_AutoRetires pins the auto-retire path: +// activating a new generation demotes the previously-active one to +// retired in the same tx as the state flip. +func TestBackend_ActivateGeneration_AutoRetires(t *testing.T) { b, ctx := newBackendForTest(t) genA, err := b.CreateGeneration(ctx, "model-a", 768, "") requirepkg.NoError(t, err, "CreateGeneration A") - // Force-activate A while it still has its seeded pending row (mimicking an - // undrained incremental queue row left on the active gen at re-embed time).
requirepkg.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A (force)") - requirepkg.Equal(t, 1, pendingCount(t, b, genA), "precondition: pending row on active gen") genB, err := b.CreateGeneration(ctx, "model-b", 768, "") requirepkg.NoError(t, err, "CreateGeneration B") requirepkg.NoError(t, b.ActivateGeneration(ctx, genB, true), "activate B (auto-retires A)") - assertpkg.Equal(t, 0, pendingCount(t, b, genA), - "auto-retire must delete the demoted generation's pending_embeddings rows") - - // RETURNING-id provability: the demote folds into one - // `UPDATE ... WHERE state='active' RETURNING id` statement (SQLite 3.35+), - // so the id whose pending rows get reaped is exactly the row that flipped to - // retired. Assert the previously-active gen (genA) is the sole retired row - // AND that it is the one whose pending was reaped, while the new active gen - // (genB) is not retired. This pins that the RETURNING'd id == the reaped id - // == the previously-active generation. retired := singleRetiredGenSV(t, b) assertpkg.Equal(t, genA, retired, "the previously-active gen must be the sole retired row") assertpkg.NotEqual(t, genB, retired, "the newly-activated gen must not be retired") - assertpkg.Equal(t, 0, pendingCount(t, b, retired), - "pending reaped for exactly the RETURNING'd (retired) id") +} + +// TestBackend_ActivateGeneration_CoverageGate pins the scan-and-fill +// activation gate: a generation with a live message still needing +// embedding (embed_gen <> gen) is refused without force, and succeeds once +// coverage is complete (or with force). +func TestBackend_ActivateGeneration_CoverageGate(t *testing.T) { + b, ctx := newBackendForTest(t) + gen, err := b.CreateGeneration(ctx, "m", 768, "") + requirepkg.NoError(t, err, "CreateGeneration") + requirepkg.Equal(t, 1, missingCountSV(t, b, gen), "precondition: one missing message") + + // Non-forced activate is refused while the message is unembedded. 
+ err = b.ActivateGeneration(ctx, gen, false) + requirepkg.Error(t, err, "activate must be refused with missing coverage") + assertpkg.Contains(t, err.Error(), "needing embedding") + + // Stamp the message as covered, then activation succeeds. + _, err = b.mainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = ? WHERE id = 1`, int64(gen)) + requirepkg.NoError(t, err, "stamp embed_gen") + requirepkg.Equal(t, 0, missingCountSV(t, b, gen), "covered now") + requirepkg.NoError(t, b.ActivateGeneration(ctx, gen, false), "activate after coverage complete") + assertpkg.Equal(t, vector.GenerationActive, genStateSV(t, b, gen), "now active") +} + +// TestBackend_ActivateGeneration_LifecycleErrorBeforeCoverage pins that +// activating an unknown or non-building generation WITHOUT --force returns +// the lifecycle error (unknown generation / not in 'building' state), NOT +// the misleading "messages needing embedding" coverage error. The coverage +// predicate (embed_gen IS NULL OR embed_gen <> gen) is true for an unknown +// gen id, so the lifecycle check must run first. The seeded test message +// (id=1) stays unembedded so the coverage gate WOULD trip if checked first. +func TestBackend_ActivateGeneration_LifecycleErrorBeforeCoverage(t *testing.T) { + b, ctx := newBackendForTest(t) + requirepkg.Equal(t, 1, missingCountSV(t, b, vector.GenerationID(999)), + "precondition: coverage gate would trip for any gen (message unembedded)") + + // (a) Unknown gen id: lifecycle error (ErrUnknownGeneration), not coverage. + err := b.ActivateGeneration(ctx, vector.GenerationID(999), false) + requirepkg.Error(t, err, "activating unknown gen must fail") + requirepkg.ErrorIs(t, err, vector.ErrUnknownGeneration, + "unknown gen must return ErrUnknownGeneration, not coverage error") + assertpkg.NotContains(t, err.Error(), "needing embedding", + "unknown gen must NOT surface the coverage error") + + // (b) Non-building (retired) gen id: lifecycle error, not coverage. 
+ gen, err := b.CreateGeneration(ctx, "m", 768, "") + requirepkg.NoError(t, err, "CreateGeneration") + requirepkg.NoError(t, b.ActivateGeneration(ctx, gen, true), "force-activate to bypass coverage") + requirepkg.NoError(t, b.RetireGeneration(ctx, gen, true), "force-retire to reach non-building state") + requirepkg.Equal(t, vector.GenerationRetired, genStateSV(t, b, gen), "precondition: gen retired") + + err = b.ActivateGeneration(ctx, gen, false) + requirepkg.Error(t, err, "activating retired gen must fail") + assertpkg.Contains(t, err.Error(), "not in 'building' state", + "retired gen must return the not-building lifecycle error") + assertpkg.NotContains(t, err.Error(), "needing embedding", + "retired gen must NOT surface the coverage error") +} + +// TestBackend_SingleTargetRebuild pins the single-target invariant: while +// a new generation B builds, the active generation A keeps serving +// (stale-but-correct), and B only becomes active once its coverage is +// complete — at which point A is retired in the same swap. There is no +// dual-write fan-out; the per-message embed_gen names exactly one target. +func TestBackend_SingleTargetRebuild(t *testing.T) { + b, ctx := newBackendForTest(t) + + // A: build, cover (force-activate to skip the gate for the one test + // message), and start serving. + genA, err := b.CreateGeneration(ctx, "model-a", 768, "") + requirepkg.NoError(t, err, "CreateGeneration A") + requirepkg.NoError(t, b.ActivateGeneration(ctx, genA, true), "activate A (force)") + active, err := b.ActiveGeneration(ctx) + requirepkg.NoError(t, err, "ActiveGeneration") + requirepkg.Equal(t, genA, active.ID, "A is serving") + + // B: a new building generation for the same corpus. The message reads + // as missing for B (embed_gen still names A), but A keeps serving + // unchanged — stale-but-correct mid-rebuild. 
+ genB, err := b.CreateGeneration(ctx, "model-b", 768, "") + requirepkg.NoError(t, err, "CreateGeneration B") + requirepkg.Equal(t, 1, missingCountSV(t, b, genB), "message missing for B mid-rebuild") + active, err = b.ActiveGeneration(ctx) + requirepkg.NoError(t, err, "ActiveGeneration mid-rebuild") + assertpkg.Equal(t, genA, active.ID, "A still serving while B builds") + + // B's activation is refused until its coverage is complete. + requirepkg.Error(t, b.ActivateGeneration(ctx, genB, false), "B refused while incomplete") + + // Cover the message for B (worker would do this after upsert), then + // activate B — the swap retires A and makes B the single serving gen. + _, err = b.mainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = ? WHERE id = 1`, int64(genB)) + requirepkg.NoError(t, err, "stamp embed_gen for B") + requirepkg.NoError(t, b.ActivateGeneration(ctx, genB, false), "activate B after coverage complete") + + active, err = b.ActiveGeneration(ctx) + requirepkg.NoError(t, err, "ActiveGeneration after swap") + assertpkg.Equal(t, genB, active.ID, "B is the single serving gen after swap") + assertpkg.Equal(t, vector.GenerationRetired, genStateSV(t, b, genA), "A retired by the swap") } // singleRetiredGenSV returns the id of the one generation in state='retired', @@ -188,16 +249,47 @@ func singleRetiredGenSV(t *testing.T, b *Backend) vector.GenerationID { return vector.GenerationID(id) } -func TestBackend_CreateGeneration_SeedsPending(t *testing.T) { +// TestBackend_CreateGeneration_StampsSeededAt confirms CreateGeneration +// stamps seeded_at so the activation gate's lifecycle check passes. 
+func TestBackend_CreateGeneration_StampsSeededAt(t *testing.T) { b, ctx := newBackendForTest(t) gid, err := b.CreateGeneration(ctx, "m", 768, "") requirepkg.NoError(t, err, "Create") - var n int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, gid, - ).Scan(&n) - requirepkg.NoError(t, err, "count pending") - assertpkg.Equal(t, 1, n, "pending count") + var seededAt sql.NullInt64 + requirepkg.NoError(t, b.db.QueryRowContext(ctx, + `SELECT seeded_at FROM index_generations WHERE id = ?`, int64(gid)).Scan(&seededAt)) + assertpkg.True(t, seededAt.Valid, "seeded_at stamped at creation") +} + +// TestBackend_ActivateGeneration_NullSeededAtActivatesWithCoverage pins +// that a legacy/crashed generation whose seeded_at is NULL must still +// activate WITHOUT --force as long as coverage is complete (missing==0). +// The old seeded_at IS NOT NULL gate would have rejected it and pointed +// users at `embeddings resume`, which cannot stamp seeded_at — making the +// row unactivatable except via --force. Coverage is the real gate now. +func TestBackend_ActivateGeneration_NullSeededAtActivatesWithCoverage(t *testing.T) { + b, ctx := newBackendForTest(t) + gen, err := b.CreateGeneration(ctx, "m", 768, "") + requirepkg.NoError(t, err, "CreateGeneration") + + // Simulate a legacy/crashed generation: clear seeded_at. + _, err = b.db.ExecContext(ctx, + `UPDATE index_generations SET seeded_at = NULL WHERE id = ?`, int64(gen)) + requirepkg.NoError(t, err, "clear seeded_at") + var seededAt sql.NullInt64 + requirepkg.NoError(t, b.db.QueryRowContext(ctx, + `SELECT seeded_at FROM index_generations WHERE id = ?`, int64(gen)).Scan(&seededAt)) + requirepkg.False(t, seededAt.Valid, "precondition: seeded_at is NULL") + + // Make coverage complete (worker would stamp this after upsert). + _, err = b.mainDB.ExecContext(ctx, `UPDATE messages SET embed_gen = ? 
WHERE id = 1`, int64(gen)) + requirepkg.NoError(t, err, "stamp embed_gen") + requirepkg.Equal(t, 0, missingCountSV(t, b, gen), "precondition: coverage complete") + + // Activation succeeds WITHOUT force despite seeded_at=NULL. + requirepkg.NoError(t, b.ActivateGeneration(ctx, gen, false), + "NULL seeded_at + full coverage must activate without --force") + assertpkg.Equal(t, vector.GenerationActive, genStateSV(t, b, gen), "now active") } // TestBackend_CreateGeneration_ResumesBuilding confirms that calling @@ -231,87 +323,6 @@ func TestBackend_CreateGeneration_MismatchedFingerprint(t *testing.T) { assertpkg.ErrorIs(t, err, vector.ErrBuildingInProgress) } -// TestBackend_CreateGeneration_ResumeDoesNotReseedCompleted is the -// regression test for the "interrupted full rebuild re-embeds -// everything" bug: after the worker has already embedded some messages -// (Queue.Complete removed those rows from pending_embeddings), a -// retry'd CreateGeneration must NOT push them back onto the queue. We -// simulate this by manually removing a pending row, then calling -// CreateGeneration again with the same fingerprint and asserting the -// removed row is not re-enqueued. -func TestBackend_CreateGeneration_ResumeDoesNotReseedCompleted(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - b, ctx := newBackendForTest(t) - - gen, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(err, "first Create") - - // Simulate Queue.Complete: remove the pending row for the only - // pre-seeded message (id=1) as if it were already embedded. - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ? AND message_id = ?`, - int64(gen), int64(1)) - require.NoError(err, "delete pending") - - // Resume: CreateGeneration must reuse the existing building gen - // and NOT re-enqueue the completed message. 
- resumed, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(err, "resume Create") - assert.Equal(gen, resumed, "resumed gen should reuse existing") - var pending int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND message_id = 1`, - int64(gen)).Scan(&pending) - require.NoError(err, "count pending") - assert.Equal(0, pending, "resume re-seeded a completed message") -} - -// TestBackend_CreateGeneration_ResumeReseedsUnseededGeneration covers -// the "crash between row insert and seed commit" path: a building row -// exists but seeded_at is NULL because the previous attempt died -// before the seed transaction committed. A naive resume would skip -// seedPending, leave pending_embeddings empty, and let -// `msgvault embeddings build` activate the unseeded generation — silently -// replacing the prior active index with an empty one. The fix is to -// re-run seedPending whenever seeded_at IS NULL on resume. -func TestBackend_CreateGeneration_ResumeReseedsUnseededGeneration(t *testing.T) { - require := requirepkg.New(t) - assert := assertpkg.New(t) - b, ctx := newBackendForTest(t) - - gen, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(err, "first Create") - // Simulate the crash window: clear seeded_at AND wipe the seeded - // rows so the post-resume pending count is exactly what the resume - // re-seed would produce. Without this we couldn't distinguish - // "rows are present because resume re-seeded" from "rows are - // present because the original seed left them there". 
- _, err = b.db.ExecContext(ctx, - `UPDATE index_generations SET seeded_at = NULL WHERE id = ?`, int64(gen)) - require.NoError(err, "clear seeded_at") - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ?`, int64(gen)) - require.NoError(err, "clear pending") - - resumed, err := b.CreateGeneration(ctx, "m", 768, "") - require.NoError(err, "resume Create") - assert.Equal(gen, resumed, "resumed gen should reuse existing") - var pending int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, - int64(gen)).Scan(&pending) - require.NoError(err, "count pending") - assert.Equal(1, pending, "resume must re-seed an unseeded build") - // And seeded_at should now be populated so a second resume - // would correctly skip re-seeding. - var seededAt sql.NullInt64 - err = b.db.QueryRowContext(ctx, - `SELECT seeded_at FROM index_generations WHERE id = ?`, int64(gen)).Scan(&seededAt) - require.NoError(err, "read seeded_at") - assert.True(seededAt.Valid, "seeded_at still NULL after resume re-seed; second resume would re-seed again") -} - // TestBackend_ClaimOrInsertBuilding_RaceRecoversFromUniqueConstraint // exercises the post-INSERT unique-constraint recovery path: when a // concurrent writer slips a building row in between our SELECT and @@ -347,107 +358,28 @@ func TestBackend_ClaimOrInsertBuilding_RecoversFromExistingRow(t *testing.T) { assert.Equal(gen1, gen2, "should reuse gen id") } -// TestBackend_CreateGeneration_SeedCommitsVisibleFirst confirms the -// new building row is committed *before* the seed pass runs, so a -// concurrent Enqueuer can see the generation and dual-enqueue -// newly-synced messages. Without this ordering there is a window -// during which sync-side enqueues would be scoped only to the active -// generation and the new build would be missing messages. 
-// -// The previous version of this test polled on a short loop and -// passed even if visibility happened only AFTER CreateGeneration -// returned, because <-done would block until the goroutine finished -// and the polling loop would then see the committed row regardless. -// We now seed many messages to make seedPending take measurable time -// and require visibility to be observed strictly while the goroutine -// is still in flight (done has not fired yet). -func TestBackend_CreateGeneration_SeedCommitsVisibleFirst(t *testing.T) { - require := requirepkg.New(t) - ctx := context.Background() - - // Build a backend whose main DB has many messages so seedPending - // has enough work that we can race a visibility poll against it. - // 5_000 rows is comfortably more than the one row in the standard - // helper and drives seedPending into the millisecond range even on - // a fast laptop — far longer than the polling interval below. - main := openMainDBWithOneMessage(t) - insert, err := main.PrepareContext(ctx, `INSERT INTO messages (id) VALUES (?)`) - require.NoError(err, "prepare insert") - defer func() { _ = insert.Close() }() - for i := int64(2); i <= 5000; i++ { - _, err := insert.ExecContext(ctx, i) - require.NoErrorf(err, "insert msg %d", i) - } - - b, err := Open(ctx, Options{ - Path: filepath.Join(t.TempDir(), "vectors.db"), - Dimension: 768, - MainDB: main, - }) - require.NoError(err, "Open") - t.Cleanup(func() { _ = b.Close() }) - - done := make(chan error, 1) - go func() { - _, err := b.CreateGeneration(ctx, "m", 768, "") - done <- err - }() - - // Poll for visibility, but strictly while the goroutine is still - // in flight: every iteration first checks `done` via select-default, - // and a poll that fires after `done` is closed counts as a failure - // because we'd then be observing a row that was committed at any - // point — including after return. With 5000 messages to seed, we - // have hundreds of polling windows before CreateGeneration returns. 
- deadline := time.Now().Add(5 * time.Second) - var ( - visibleInFlight bool - doneFiredFirst bool - ) -poll: - for time.Now().Before(deadline) { - select { - case err := <-done: - // Push the result back so the post-loop assertion can - // also read it. If we got here without observing the row - // yet, that is a failure. - done <- err - doneFiredFirst = true - break poll - default: - } - var id int64 - qErr := b.db.QueryRowContext(ctx, - `SELECT id FROM index_generations WHERE state = 'building'`).Scan(&id) - if qErr == nil && id > 0 { - visibleInFlight = true - break poll - } - time.Sleep(1 * time.Millisecond) - } - - require.NoError(<-done, "CreateGeneration") - require.False(doneFiredFirst, "CreateGeneration returned before the building row became visible — commit was deferred to after seed") - require.True(visibleInFlight, "building generation was never visible while CreateGeneration was in flight") -} - -func TestBackend_CreateGeneration_SkipsDeletedMessages(t *testing.T) { +// TestBackend_CoverageGate_SkipsDeletedMessages verifies the coverage +// gate's live-message predicate excludes soft-deleted rows: a backend +// whose only message is deleted-from-source reports zero missing, so a +// building generation can activate without force. +func TestBackend_CoverageGate_SkipsDeletedMessages(t *testing.T) { b := openBackendWithOneDeletedMessage(t) t.Cleanup(func() { _ = b.Close() }) ctx := context.Background() gid, err := b.CreateGeneration(ctx, "m", 768, "") requirepkg.NoError(t, err, "Create") - var n int - _ = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, gid).Scan(&n) - assertpkg.Equal(t, 0, n, "pending count for deleted message") + missing, err := b.hasMissingForGen(ctx, gid) + requirepkg.NoError(t, err, "hasMissingForGen") + assertpkg.False(t, missing, "deleted-from-source message must not count as missing") + // With no live missing message, activation passes the coverage gate. 
+ requirepkg.NoError(t, b.ActivateGeneration(ctx, gid, false), "activate (no missing)") } -// TestBackend_SeedPending_SkipsDedupHidden verifies that seedPending -// omits messages soft-deleted by dedup (deleted_at IS NOT NULL). -func TestBackend_SeedPending_SkipsDedupHidden(t *testing.T) { +// TestBackend_CoverageGate_SkipsDedupHidden verifies the coverage gate +// excludes dedup-hidden messages (deleted_at IS NOT NULL): only the live +// message counts toward "missing". +func TestBackend_CoverageGate_SkipsDedupHidden(t *testing.T) { require := requirepkg.New(t) - t.Helper() ctx := context.Background() db, err := sql.Open("sqlite3", ":memory:") @@ -456,10 +388,10 @@ func TestBackend_SeedPending_SkipsDedupHidden(t *testing.T) { _, err = db.Exec(`CREATE TABLE messages ( id INTEGER PRIMARY KEY, deleted_at DATETIME, - deleted_from_source_at DATETIME + deleted_from_source_at DATETIME, + embed_gen INTEGER )`) require.NoError(err, "create messages") - // Insert one live and one dedup-hidden message. _, err = db.Exec(`INSERT INTO messages (id) VALUES (1)`) require.NoError(err, "insert live") _, err = db.Exec(`INSERT INTO messages (id, deleted_at) VALUES (2, CURRENT_TIMESTAMP)`) @@ -475,19 +407,17 @@ func TestBackend_SeedPending_SkipsDedupHidden(t *testing.T) { gid, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(err, "CreateGeneration") - var n int - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ?`, gid).Scan(&n) - require.NoError(err, "count pending") - assertpkg.Equal(t, 1, n, "dedup-hidden message must be excluded") + // Stamp the dedup-hidden message as if covered for some other gen; it + // must still not count (it is dedup-hidden), and the live one must. 
+ s, err := b.Stats(ctx, gid) + require.NoError(err, "Stats") + assertpkg.Equal(t, int64(1), s.PendingCount, "only the live message counts as missing") } // TestBackend_Upsert_WritesEmbeddingAndVector verifies Upsert's // contract: it writes the embeddings row and the dimension-specific -// vec0 row, and explicitly does NOT touch pending_embeddings. The -// queue is the sole owner of that table so that Queue.Complete's -// token check can prevent a stale worker from wiping a newer worker's -// claim. +// vec0 row, and explicitly does NOT touch messages.embed_gen (the worker +// stamps that after a successful upsert, in the main DB). func TestBackend_Upsert_WritesEmbeddingAndVector(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) @@ -515,12 +445,12 @@ func TestBackend_Upsert_WritesEmbeddingAndVector(t *testing.T) { require.NoError(err, "count vectors_vec_d768") assert.Equal(1, n, "vectors_vec_d768 count") - // Pending row is still present — the queue owns that table and - // only Queue.Complete may remove it. - err = b.db.QueryRowContext(ctx, - `SELECT COUNT(*) FROM pending_embeddings WHERE generation_id = ? AND message_id = 1`, gid).Scan(&n) - require.NoError(err, "count pending") - assert.Equal(1, n, "Upsert must not touch pending_embeddings") + // embed_gen is untouched by Upsert — message 1 still reads as missing. + var embedGen sql.NullInt64 + err = b.mainDB.QueryRowContext(ctx, + `SELECT embed_gen FROM messages WHERE id = 1`).Scan(&embedGen) + require.NoError(err, "read embed_gen") + assert.False(embedGen.Valid, "Upsert must not stamp messages.embed_gen") } func TestBackend_Upsert_DimensionMismatch(t *testing.T) { @@ -1387,7 +1317,8 @@ func TestBackend_Stats_PendingCountAfterCreate(t *testing.T) { b, ctx := newBackendForTest(t) gid, err := b.CreateGeneration(ctx, "m", 768, "") require.NoError(err, "CreateGeneration") - // CreateGeneration seeds 1 pending row for the one pre-seeded message. 
+ // The one live message is unembedded, so PendingCount (= missing for + // this gen, read from the main DB) is 1. s, err := b.Stats(ctx, gid) require.NoError(err, "Stats") assert.Equal(int64(0), s.EmbeddingCount) diff --git a/internal/vector/sqlitevec/backend_testhelpers_test.go b/internal/vector/sqlitevec/backend_testhelpers_test.go index bfde9f79c..c51b0c6db 100644 --- a/internal/vector/sqlitevec/backend_testhelpers_test.go +++ b/internal/vector/sqlitevec/backend_testhelpers_test.go @@ -26,7 +26,8 @@ func openMainDBWithOneMessage(t *testing.T) *sql.DB { _, err = db.Exec(`CREATE TABLE messages ( id INTEGER PRIMARY KEY, deleted_at DATETIME, - deleted_from_source_at DATETIME + deleted_from_source_at DATETIME, + embed_gen INTEGER )`) require.NoError(t, err, "create messages") _, err = db.Exec(`INSERT INTO messages (id) VALUES (1)`) @@ -45,7 +46,8 @@ func openBackendWithOneDeletedMessage(t *testing.T) *Backend { _, err = db.Exec(`CREATE TABLE messages ( id INTEGER PRIMARY KEY, deleted_at DATETIME, - deleted_from_source_at DATETIME + deleted_from_source_at DATETIME, + embed_gen INTEGER )`) require.NoError(t, err, "create messages") _, err = db.Exec(`INSERT INTO messages (id, deleted_from_source_at) VALUES (1, CURRENT_TIMESTAMP)`) @@ -95,7 +97,8 @@ CREATE TABLE messages ( size_estimate INTEGER, sent_at DATETIME, deleted_at DATETIME, - deleted_from_source_at DATETIME + deleted_from_source_at DATETIME, + embed_gen INTEGER ); CREATE VIRTUAL TABLE messages_fts USING fts5(subject, body, content='', contentless_delete=1); CREATE TABLE message_labels ( @@ -183,12 +186,15 @@ func seedAndEmbed(t *testing.T, b *Backend, vecs map[int64][]float32) vector.Gen chunks = append(chunks, vector.Chunk{MessageID: id, Vector: vecs[id]}) } require.NoError(t, b.Upsert(ctx, gid, chunks), "Upsert") - // Upsert intentionally does not clear pending_embeddings — that - // belongs to the queue's token-aware Complete. 
For helper - // scenarios that want the "fully embedded" end state, we clear - // pending here directly. - _, err = b.db.ExecContext(ctx, - `DELETE FROM pending_embeddings WHERE generation_id = ?`, int64(gid)) - require.NoError(t, err, "clear pending") + // Upsert intentionally does not stamp messages.embed_gen — that is the + // worker's job (it stamps after a successful upsert). For helper + // scenarios that want the "fully embedded" end state (coverage + // complete), stamp the seeded messages directly so a later + // ActivateGeneration's coverage gate would pass. + for _, id := range ids { + _, err = b.mainDB.ExecContext(ctx, + `UPDATE messages SET embed_gen = ? WHERE id = ?`, int64(gid), id) + require.NoErrorf(t, err, "stamp embed_gen for msg %d", id) + } return gid } diff --git a/internal/vector/sqlitevec/backfill.go b/internal/vector/sqlitevec/backfill.go new file mode 100644 index 000000000..9532e4095 --- /dev/null +++ b/internal/vector/sqlitevec/backfill.go @@ -0,0 +1,438 @@ +//go:build sqlite_vec + +package sqlitevec + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" + + "go.kenn.io/msgvault/internal/vector" +) + +// embedGenBackfillMigration is the applied_migrations ledger key that +// guards the one-time embed_gen upgrade backfill. Stable string — never +// change it, or the backfill would re-run on every Open. +const embedGenBackfillMigration = "embed_gen_backfill_active_v1" + +// backfillStampChunk caps how many message ids go into one stamping +// UPDATE so the bind-parameter count stays well under SQLite's limit. +const backfillStampChunk = 500 + +// BackfillEmbedGenForUpgrade performs the ONE-TIME upgrade backfill +// (Package A): when an active generation exists, it stamps embed_gen=active +// on every main-DB message that already has >=1 embedding row under that +// generation but whose embed_gen is still NULL. 
+// +// Why: the embed_gen ADD COLUMN migration does no backfill, so a user +// upgrading from v0.14–v0.15 (who already has an active generation + a +// fully-embedded corpus) would have embed_gen=NULL everywhere. Coverage +// would then report the ENTIRE archive as missing and the worker would +// re-embed all of it. This stamps the already-embedded rows instead — a +// cheap metadata UPDATE, no re-embed. +// +// Guards: +// - ONE-TIME via the applied_migrations ledger (key +// embedGenBackfillMigration). Check-then-run-then-mark. It must NOT run +// on every Open: re-running would clobber repair-encoding's NULL resets +// before they re-embed, and fight an in-progress rebuild. +// Accepted residual window: the check and the mark are NOT atomic across +// PROCESSES, so two concurrent first-opens of a freshly-upgraded DB +// (before either marks the ledger) can both run the backfill once; the +// second could re-stamp a row repair-encoding just reset to NULL. This +// window is ONE-SHOT (only at the first post-upgrade open, before the +// ledger is marked) and astronomically rare — accepted, not closed +// (operator decision): the mitigation is operational (run only one +// embedding process at a time; see README Vector Search). Within a single +// process the in-tx mark + EmbedJob single-flight lock prevent re-runs. +// - It lives in the VECTOR layer because the embeddings table is only +// reachable here (it is in vectors.db on SQLite, a separate *sql.DB +// from messages). +// - The stamp UPDATE only touches rows where embed_gen IS NULL, so it +// never overwrites a row already stamped for another generation. +// +// No-ops cleanly when: no main DB handle, the ledger already records it, no +// active generation, no embeddings table, or no embedded-but-unstamped +// rows. Idempotent. +// +// Cross-DB on SQLite: the embeddings ids come from vectors.db (b.db); the +// stamp and the ledger live in main.db (b.mainDB), two separate *sql.DB +// handles. 
This mirrors the established cross-DB pattern (see +// EmbeddedMessageCount / dropDeletedFromSource): read ids from vectors.db, +// stamp on main.db. +func (b *Backend) BackfillEmbedGenForUpgrade(ctx context.Context) error { + if b.mainDB == nil { + // Management commands may open the backend without the main handle; + // they never run the backfill. + return nil + } + if b.readOnly { + // The main handle was opened read-only (MCP: store.OpenReadOnly, + // _query_only=true). The backfill WRITES messages.embed_gen and the + // applied_migrations ledger, which the query-only handle rejects. + // Skip it entirely — mirrors pgvector's SkipMigrate read-only guard. + // A write-path process (serve, embeddings CLI) runs the backfill + // instead. + return nil + } + + // A main DB without applied_migrations is not a real msgvault store + // (e.g. a hand-rolled test fixture or a DB opened before the store + // schema ran); skip the backfill entirely rather than fail Open. + hasLedger, err := mainTableExists(ctx, b.mainDB, "applied_migrations") + if err != nil { + return err + } + if !hasLedger { + return nil + } + + applied, err := b.backfillApplied(ctx) + if err != nil { + return err + } + if applied { + return nil + } + + // Resolve the active generation. No active generation means nothing to + // backfill — but we still mark the migration applied so a later + // just-activated generation does not retroactively trigger a backfill + // that re-stamps rows repair-encoding may have reset. The active gen at + // upgrade time is the only one whose pre-existing embeddings predate the + // embed_gen column; generations created after upgrade are stamped by the + // worker as it embeds. + // + // Intentional scope limit: only the ACTIVE generation is backfilled. 
Any + // BUILDING generation that existed pre-upgrade is left unstamped — a + // resumed rebuild idempotently re-embeds that bounded portion (scan-and- + // fill skips already-covered rows), so the cost is small and one-time. + // Per-generation backfill complexity is not worth it for a single-user + // tool. + active, err := b.ActiveGeneration(ctx) + if err != nil { + if errors.Is(err, vector.ErrNoActiveGeneration) { + // No work to stamp; a lone ledger INSERT is trivially atomic, so + // it runs directly on main.db (no transaction needed). + return b.markBackfillApplied(ctx, b.mainDB) + } + return fmt.Errorf("backfill: resolve active generation: %w", err) + } + + // Distinct message ids that already have an embedding row for the active + // generation, read from vectors.db. This cross-DB read stays OUTSIDE the + // main.db transaction below: it is read-only and targets a different + // *sql.DB handle, so it cannot participate in the main.db tx. + rows, err := b.db.QueryContext(ctx, + `SELECT DISTINCT message_id FROM embeddings WHERE generation_id = ?`, + int64(active.ID)) + if err != nil { + return fmt.Errorf("backfill: list embedded message ids: %w", err) + } + defer func() { _ = rows.Close() }() + var ids []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return fmt.Errorf("backfill: scan embedded message id: %w", err) + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + return fmt.Errorf("backfill: iterate embedded message ids: %w", err) + } + + // Preserve the legacy pending re-embed signal. Under the + // OLD design, pending_embeddings was a re-embed flag: a message could carry + // BOTH an active-gen embedding AND an active-gen pending row (old + // repair-encoding re-enqueued already-embedded messages; the old worker + // deleted the pending row only on a successful re-embed). 
If we stamp + // embed_gen=active on such a message it reads "covered" forever and is never + // re-embedded — silent permanent staleness against the corrected text. + // EXCLUDE every active-gen pending message id from the stamp set so it ends + // embed_gen=NULL and the scan-and-fill worker re-embeds it. pending_embeddings + // lives in vectors.db (b.db) on SQLite, the same handle as embeddings. + pendingIDs, err := b.activeGenPendingIDs(ctx, active.ID) + if err != nil { + return err + } + if len(pendingIDs) > 0 { + pendingSet := make(map[int64]struct{}, len(pendingIDs)) + for _, id := range pendingIDs { + pendingSet[id] = struct{}{} + } + kept := ids[:0] + for _, id := range ids { + if _, isPending := pendingSet[id]; !isPending { + kept = append(kept, id) + } + } + ids = kept + } + + // Atomicity (Codex 129d #2/#3): the embed_gen stamp UPDATE(s) and the + // ledger mark must be all-or-nothing. messages and applied_migrations + // both live in main.db, so a single transaction covers every chunk plus + // the mark. Without it, a crash (or error) after some chunks but before + // the mark would leave the ledger UNMARKED while embed_gen rows were + // already stamped → the one-time backfill re-runs on the next Open and + // clobbers any NULL resets repair-encoding made in the interim. With one + // tx, a partial-chunk failure rolls back EVERY chunk and the mark, so the + // migration stays unmarked and the next Open re-runs cleanly from scratch. + tx, err := b.mainDB.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("backfill: begin tx: %w", err) + } + committed := false + defer func() { + if !committed { + _ = tx.Rollback() + } + }() + + // Stamp embed_gen=active for those ids on main.db, but only where it is + // still NULL — never overwrite a row already stamped for a different + // generation. Chunked to stay under the bind limit; ALL chunks run on the + // same tx. 
+ for start := 0; start < len(ids); start += backfillStampChunk { + end := min(start+backfillStampChunk, len(ids)) + chunk := ids[start:end] + placeholders := make([]string, len(chunk)) + args := make([]any, 0, 1+len(chunk)) + args = append(args, int64(active.ID)) + for i, id := range chunk { + placeholders[i] = "?" + args = append(args, id) + } + q := `UPDATE messages SET embed_gen = ? + WHERE embed_gen IS NULL + AND id IN (` + strings.Join(placeholders, ",") + `)` + if _, err := tx.ExecContext(ctx, q, args...); err != nil { + return fmt.Errorf("backfill: stamp embed_gen: %w", err) + } + } + + if err := b.markBackfillApplied(ctx, tx); err != nil { + return err + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("backfill: commit tx: %w", err) + } + committed = true + return nil +} + +// activeGenPendingIDs returns the message ids that carry an active-generation +// row in the legacy pending_embeddings table — the OLD "needs (re-)embedding" +// signal. Used by BackfillEmbedGenForUpgrade to EXCLUDE these from the +// embed_gen stamp so they re-embed under scan-and-fill. +// +// Returns nil with no error when pending_embeddings does not exist (a fresh +// DB, or one already cleaned up): there is no legacy signal to preserve. +// pending_embeddings lives in vectors.db (b.db). 
+func (b *Backend) activeGenPendingIDs(ctx context.Context, active vector.GenerationID) ([]int64, error) { + exists, err := tableExists(ctx, b.db, "pending_embeddings") + if err != nil { + return nil, fmt.Errorf("backfill: probe pending_embeddings: %w", err) + } + if !exists { + return nil, nil + } + rows, err := b.db.QueryContext(ctx, + `SELECT DISTINCT message_id FROM pending_embeddings WHERE generation_id = ?`, + int64(active)) + if err != nil { + return nil, fmt.Errorf("backfill: list active-gen pending ids: %w", err) + } + defer func() { _ = rows.Close() }() + var ids []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, fmt.Errorf("backfill: scan pending id: %w", err) + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("backfill: iterate pending ids: %w", err) + } + return ids, nil +} + +// dropDeadPendingEmbeddings drops the legacy pending_embeddings queue table +// from vectors.db. The scan-and-fill design replaced the per-generation seed +// queue with a live messages.embed_gen scan, so the table is otherwise unused; +// left in place it only wastes space and confuses operators inspecting +// vectors.db. +// +// It runs on every WRITABLE Open, AFTER BackfillEmbedGenForUpgrade has had a +// chance to consult the table and preserve its re-embed signal. +// Doing the drop here rather than in Migrate guarantees the backfill (when it +// runs) sees the table first. It is gated to the writable path so a read-only +// Open leaves the table — and its signal — intact for the next writable open. +// Idempotent: DROP TABLE IF EXISTS is a no-op on fresh DBs and on a second run. +// vectors.db is read-write regardless of the main-handle readOnly flag, so this +// targets b.db unconditionally; the CALLER gates it on the writable Open path. 
+func (b *Backend) dropDeadPendingEmbeddings(ctx context.Context) error { + if _, err := b.db.ExecContext(ctx, `DROP TABLE IF EXISTS pending_embeddings`); err != nil { + return fmt.Errorf("drop dead pending_embeddings table: %w", err) + } + return nil +} + +// resetOrphanedEmbedGen clears messages.embed_gen for every main-DB message +// whose stamp references a generation id that does NOT exist in +// index_generations (an "orphaned" stamp). It runs on every WRITABLE Open +// BEFORE BackfillEmbedGenForUpgrade. +// +// Why: index_generations.id AUTOINCREMENTs inside the REPLACEABLE vectors.db, +// while embed_gen stamps live in the durable main.db. If a user deletes and +// recreates vectors.db but keeps main.db, the fresh index_generations restarts +// ids at 1 while main.db still carries old stamps (e.g. embed_gen=1). A later +// rebuild then reuses gen id 1, the coverage scan predicate +// (embed_gen IS NULL OR embed_gen <> target) treats those stale stamps as +// already-covered, coverage reaches missing==0, and an EMPTY index is +// activated — search returns nothing while coverage claims done. Clearing +// orphaned stamps before any rebuild can reuse id 1 closes that hole. +// +// This is the precise, false-positive-proof form of the operator's +// "recreate-detection reset": a stamp pointing to a STILL-EXISTING generation +// row (active, building, OR retired — retire only flips state, it does not +// delete the row) is KEPT. So the normal activate/retire flow, where a rebuild +// re-stamps live messages to the new active gen and the old gen's row is merely +// marked retired, never trips this reset. Only a genuinely vanished gen id +// (vectors.db recreated/wiped) triggers a clear. +// +// Cross-DB on SQLite: the valid gen-id set comes from vectors.db (b.db); the +// reset runs against main.db (b.mainDB). 
When the valid set is EMPTY (recreated +// vectors.db), the predicate degrades to "all non-NULL stamps" — handled +// explicitly because `NOT IN ()` is a SQL pitfall. The valid set is tiny (a +// handful of generations), so a literal IN-list is fine. +// +// Guards (mirror BackfillEmbedGenForUpgrade): no-op when the main handle is +// absent (management commands), read-only (MCP), or has no applied_migrations +// table. NOT gated on the backfill ledger ROW: a recreate can happen between +// any two process starts, so this must re-check every writable Open. It is +// cheap and idempotent — a second run finds no orphans and updates nothing. +func (b *Backend) resetOrphanedEmbedGen(ctx context.Context) error { + if b.mainDB == nil { + // Management commands open the backend without the main handle. + return nil + } + if b.readOnly { + // Read-only main handle (MCP): the reset WRITES messages.embed_gen, + // which the query-only handle rejects. Skip — a write-path process + // (serve, embeddings CLI) performs the reset instead. Mirrors the + // backfill's b.readOnly guard. + return nil + } + + // A main DB without applied_migrations is not a real msgvault store (e.g. + // a hand-rolled test fixture, or a DB opened before the store schema ran); + // such a fixture also lacks the embed_gen column. Skip the reset entirely + // rather than fail Open — mirrors BackfillEmbedGenForUpgrade's identical + // guard so the two open-time steps gate the same way. + hasLedger, err := mainTableExists(ctx, b.mainDB, "applied_migrations") + if err != nil { + return err + } + if !hasLedger { + return nil + } + + // Collect the set of valid generation ids from vectors.db. 
+ rows, err := b.db.QueryContext(ctx, `SELECT id FROM index_generations`) + if err != nil { + return fmt.Errorf("reset orphaned embed_gen: list generation ids: %w", err) + } + defer func() { _ = rows.Close() }() + var validIDs []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return fmt.Errorf("reset orphaned embed_gen: scan generation id: %w", err) + } + validIDs = append(validIDs, id) + } + if err := rows.Err(); err != nil { + return fmt.Errorf("reset orphaned embed_gen: iterate generation ids: %w", err) + } + + // Empty valid set (recreated/empty vectors.db): every non-NULL stamp is + // orphaned. `NOT IN ()` is a SQL pitfall, so special-case it to a plain + // "clear all non-NULL stamps" UPDATE. + if len(validIDs) == 0 { + if _, err := b.mainDB.ExecContext(ctx, + `UPDATE messages SET embed_gen = NULL WHERE embed_gen IS NOT NULL`); err != nil { + return fmt.Errorf("reset orphaned embed_gen: clear all stamps: %w", err) + } + return nil + } + + // Non-empty valid set: clear only stamps that fall outside it. The set is + // tiny, so a literal IN-list is well under SQLite's bind limit. + placeholders := make([]string, len(validIDs)) + args := make([]any, len(validIDs)) + for i, id := range validIDs { + placeholders[i] = "?" + args[i] = id + } + q := `UPDATE messages SET embed_gen = NULL + WHERE embed_gen IS NOT NULL + AND embed_gen NOT IN (` + strings.Join(placeholders, ",") + `)` + if _, err := b.mainDB.ExecContext(ctx, q, args...); err != nil { + return fmt.Errorf("reset orphaned embed_gen: clear orphaned stamps: %w", err) + } + return nil +} + +// backfillApplied reports whether the one-time backfill ledger row exists +// in main.db. It does NOT tolerate a missing applied_migrations table: the +// COUNT query then fails and is returned as a "backfill: check ledger" +// error. The table is created by the store schema, so this only matters in +// unusual test setups; gate on mainTableExists first where that can occur. 
+func (b *Backend) backfillApplied(ctx context.Context) (bool, error) { + var n int + if err := b.mainDB.QueryRowContext(ctx, + `SELECT COUNT(*) FROM applied_migrations WHERE name = ?`, + embedGenBackfillMigration).Scan(&n); err != nil { + return false, fmt.Errorf("backfill: check ledger: %w", err) + } + return n > 0, nil +} + +// execer is the subset of *sql.DB / *sql.Tx the ledger mark needs, so +// markBackfillApplied can run either directly on main.db (lone INSERT, the +// no-active-gen path) or inside the backfill transaction (alongside the +// chunked embed_gen UPDATEs, for atomicity). +type execer interface { + ExecContext(ctx context.Context, query string, args ...any) (sql.Result, error) +} + +// markBackfillApplied records the one-time backfill in main.db's ledger via +// the given execer. INSERT OR IGNORE keeps it idempotent under a concurrent +// Open. Pass b.mainDB for a standalone mark, or the backfill tx so the mark +// commits atomically with the embed_gen UPDATEs. +func (b *Backend) markBackfillApplied(ctx context.Context, ex execer) error { + if _, err := ex.ExecContext(ctx, + `INSERT OR IGNORE INTO applied_migrations (name) VALUES (?)`, + embedGenBackfillMigration); err != nil { + return fmt.Errorf("backfill: mark ledger: %w", err) + } + return nil +} + +// mainTableExists asks sqlite_master in db (the MAIN db, distinct from +// vectors.db) whether a regular or virtual table named `name` exists. 
+func mainTableExists(ctx context.Context, db *sql.DB, name string) (bool, error) { + var n int + if err := db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM sqlite_master WHERE type IN ('table','virtual') AND name = ?`, + name).Scan(&n); err != nil { + return false, fmt.Errorf("backfill: probe %s: %w", name, err) + } + return n > 0, nil +} diff --git a/internal/vector/sqlitevec/backfill_readonly_test.go b/internal/vector/sqlitevec/backfill_readonly_test.go new file mode 100644 index 000000000..f09ea7718 --- /dev/null +++ b/internal/vector/sqlitevec/backfill_readonly_test.go @@ -0,0 +1,169 @@ +//go:build sqlite_vec + +package sqlitevec + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" +) + +// backfillLedgerMarked reports whether the one-time upgrade backfill ledger +// row exists in the given main DB handle. +func backfillLedgerMarked(t *testing.T, db *sql.DB) bool { + t.Helper() + var n int + require.NoError(t, db.QueryRow( + `SELECT COUNT(*) FROM applied_migrations WHERE name = ?`, + embedGenBackfillMigration).Scan(&n)) + return n > 0 +} + +// TestBackfillEmbedGen_ReadOnlyMainDB_Skipped is the regression guard for +// Codex #3: the MCP server opens the main DB query-only +// (store.OpenReadOnly, _query_only=true), but setupVectorFeatures -> +// sqlitevec.Open ran BackfillEmbedGenForUpgrade, which WRITES +// messages.embed_gen + applied_migrations through that read-only handle. The +// readOnly flag was honored on PG (SkipMigrate) but ignored on SQLite, so +// MCP startup failed (or wrote through the query-only handle) whenever the +// backfill ledger was not yet marked. 
+// +// With Options.ReadOnly plumbed from the MCP readOnly arg, the backfill +// self-guards: a read-only Open with an UNMARKED ledger and an active +// generation must NOT attempt the write, must NOT error, and must leave the +// ledger unmarked. Migrate still runs (vectors.db is read-write). +func TestBackfillEmbedGen_ReadOnlyMainDB_Skipped(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + + // Build a real main DB with one live message and an active generation + // whose embedding exists, then reset embed_gen + clear the ledger so the + // backfill would have real work (the embed_gen-stamping write) to do. + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open (rw)") + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'); +`) + require.NoError(t, err, "seed message") + + rw, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "rw backend Open") + gen, err := rw.CreateGeneration(ctx, "model", 4, "model:4") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, rw.Upsert(ctx, gen, vectorChunkOne()), "Upsert") + require.NoError(t, rw.ActivateGeneration(ctx, gen, true), "Activate") + require.NoError(t, rw.Close(), "close rw backend") + + // Simulate the pre-upgrade state: embed_gen NULL everywhere, ledger + // cleared. A WRITE-path Open here would stamp embed_gen and mark the + // ledger; a READ-only Open must do neither. 
+ _, err = s.DB().Exec(`UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = s.DB().Exec(`DELETE FROM applied_migrations WHERE name = ?`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + require.NoError(t, s.Close(), "close rw store") + + // Reopen the main DB read-only, exactly as the MCP server does. + ro, err := store.OpenReadOnly(mainPath) + require.NoError(t, err, "store.OpenReadOnly") + defer func() { _ = ro.Close() }() + + // MCP path: sqlitevec.Open with ReadOnly=true. The backfill must be + // skipped — no write through the query-only handle, no error. + b, err := Open(ctx, Options{ + Path: vecPath, + MainPath: mainPath, + Dimension: 4, + MainDB: ro.DB(), + ReadOnly: true, + }) + require.NoError(t, err, "read-only Open must not error (backfill skipped)") + defer func() { _ = b.Close() }() + + // Verify nothing was written: ledger stays unmarked, embed_gen stays NULL. + assert.False(t, backfillLedgerMarked(t, ro.DB()), + "read-only Open must NOT mark the backfill ledger") + var v sql.NullInt64 + require.NoError(t, ro.DB().QueryRow(`SELECT embed_gen FROM messages WHERE id = 1`).Scan(&v)) + assert.False(t, v.Valid, "read-only Open must NOT stamp embed_gen") +} + +// vectorChunkOne returns a single-chunk slice for message 1 with a unit +// 4-dim vector — the embedding the read-only backfill test pre-seeds. +func vectorChunkOne() []vector.Chunk { + return []vector.Chunk{{MessageID: 1, ChunkIndex: 0, Vector: []float32{0, 0, 0, 1}}} +} + +// TestResetOrphanedEmbedGen_ReadOnlyMainDB_Skipped is the read-only guard for +// the orphaned-stamp reset (Codex 129c #1). The reset WRITES +// messages.embed_gen, so a read-only main handle (MCP: store.OpenReadOnly, +// _query_only=true) must SKIP it entirely — no write attempt, no error, stamps +// untouched. Mirrors the backfill's b.readOnly guard. 
+// +// The setup leaves an ORPHANED stamp (embed_gen=99 with an empty +// index_generations) so a writable Open WOULD reset it; the read-only Open +// must not. +func TestResetOrphanedEmbedGen_ReadOnlyMainDB_Skipped(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + + // Build a real main DB with one live message whose embed_gen references a + // generation id that does NOT exist in the (empty) vectors.db + // index_generations — i.e. an orphaned stamp. + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open (rw)") + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type, embed_gen) +VALUES (1, 1, 1, 'm1', 'email', 99); +`) + require.NoError(t, err, "seed message with orphaned embed_gen") + require.NoError(t, s.Close(), "close rw store") + + // Reopen the main DB read-only, exactly as the MCP server does. Migrate + // will create an empty index_generations in vectors.db (read-write), so id + // 99 is orphaned; the reset would clear it on a WRITABLE open. + ro, err := store.OpenReadOnly(mainPath) + require.NoError(t, err, "store.OpenReadOnly") + defer func() { _ = ro.Close() }() + + b, err := Open(ctx, Options{ + Path: vecPath, + MainPath: mainPath, + Dimension: 4, + MainDB: ro.DB(), + ReadOnly: true, + }) + require.NoError(t, err, "read-only Open must not error (reset skipped)") + defer func() { _ = b.Close() }() + + // The orphaned stamp must be PRESERVED: a read-only Open writes nothing. 
+ var v sql.NullInt64 + require.NoError(t, ro.DB().QueryRow(`SELECT embed_gen FROM messages WHERE id = 1`).Scan(&v)) + assert.True(t, v.Valid, "read-only Open must NOT reset the orphaned embed_gen") + assert.Equal(t, int64(99), v.Int64, "orphaned stamp unchanged under read-only Open") +} diff --git a/internal/vector/sqlitevec/backfill_test.go b/internal/vector/sqlitevec/backfill_test.go new file mode 100644 index 000000000..a2f848d45 --- /dev/null +++ b/internal/vector/sqlitevec/backfill_test.go @@ -0,0 +1,291 @@ +//go:build sqlite_vec + +package sqlitevec + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/vector" +) + +// embedGenOf reads embed_gen for a message, reporting the value and whether +// it is NULL. +func embedGenOf(t *testing.T, db *sql.DB, id int64) (val int64, isNull bool) { + t.Helper() + var v sql.NullInt64 + require.NoError(t, db.QueryRow(`SELECT embed_gen FROM messages WHERE id = ?`, id).Scan(&v)) + return v.Int64, !v.Valid +} + +// seedEmbeddedMain builds a real main DB with one live message, opens a +// writable backend, creates+activates a generation with an embedding for +// message 1, then simulates the pre-upgrade state: embed_gen NULL and the +// backfill ledger cleared. It returns the main DB and vectors.db paths so +// the caller can drive a fresh Open. The rw store and backend are both +// closed before returning so a subsequent Open holds the only handles. 
+func seedEmbeddedMain(ctx context.Context, t *testing.T) (mainPath, vecPath string) { + t.Helper() + dir := t.TempDir() + mainPath = filepath.Join(dir, "msgvault.db") + vecPath = filepath.Join(dir, "vectors.db") + + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open (rw)") + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'); +`) + require.NoError(t, err, "seed message") + + rw, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "rw backend Open") + gen, err := rw.CreateGeneration(ctx, "model", 4, "model:4") + require.NoError(t, err, "CreateGeneration") + require.NoError(t, rw.Upsert(ctx, gen, vectorChunkOne()), "Upsert") + require.NoError(t, rw.ActivateGeneration(ctx, gen, true), "Activate") + require.NoError(t, rw.Close(), "close rw backend") + + // Simulate the pre-upgrade state: embed_gen NULL, ledger cleared. A + // write-path Open would now stamp embed_gen=active and mark the ledger. + _, err = s.DB().Exec(`UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = s.DB().Exec(`DELETE FROM applied_migrations WHERE name = ?`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + require.NoError(t, s.Close(), "close rw store") + + return mainPath, vecPath +} + +// TestBackfillEmbedGen_StampAndMarkAtomic_RollbackOnMarkFailure is the +// regression guard for Codex 129d #2/#3: the embed_gen stamp UPDATE and the +// applied_migrations ledger mark must be ONE atomic transaction. 
+// +// Fault injection: a BEFORE INSERT trigger on applied_migrations RAISE(ABORT)s +// when the backfill ledger row is inserted. RAISE(ABORT) errors even under +// INSERT OR IGNORE, so the ledger-mark step fails AFTER the embed_gen UPDATE +// has run inside the same tx. If the two are atomic, the UPDATE must be ROLLED +// BACK (embed_gen stays NULL) and the ledger must stay UNMARKED — leaving the +// DB exactly pre-backfill so a later clean Open re-runs and completes. +func TestBackfillEmbedGen_StampAndMarkAtomic_RollbackOnMarkFailure(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + mainPath, vecPath := seedEmbeddedMain(ctx, t) + + // Reopen the main DB read-write and install a fault that makes ONLY the + // ledger mark fail. The embed_gen UPDATE on messages still succeeds, so a + // non-atomic implementation would leave embed_gen stamped while the ledger + // stays unmarked. + s, err := store.Open(mainPath) + require.NoError(t, err, "reopen main rw") + defer func() { _ = s.Close() }() + _, err = s.DB().Exec(`CREATE TRIGGER zz_fail_backfill_mark + BEFORE INSERT ON applied_migrations + WHEN NEW.name = '` + embedGenBackfillMigration + `' + BEGIN SELECT RAISE(ABORT, 'injected backfill mark failure'); END;`) + require.NoError(t, err, "install fault trigger") + + b, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + // Open runs the backfill; the injected mark failure must surface as an + // error from Open (the tx rolls back). + if b != nil { + t.Cleanup(func() { _ = b.Close() }) + } + require.Error(t, err, "Open must surface the injected ledger-mark failure") + assert.Contains(t, err.Error(), "injected backfill mark failure") + + // Atomicity: the embed_gen stamp must have been ROLLED BACK with the + // failed mark — stamp NOT applied, ledger NOT marked. 
+ _, isNull := embedGenOf(t, s.DB(), 1) + assert.True(t, isNull, + "embed_gen must be rolled back to NULL when the ledger mark fails (atomic)") + assert.False(t, backfillLedgerMarked(t, s.DB()), + "ledger must stay unmarked when the backfill tx rolls back") + + // Recovery: remove the fault and re-Open. The migration was never marked, + // so the one-time backfill re-runs cleanly and now completes — both the + // stamp and the mark land. + _, err = s.DB().Exec(`DROP TRIGGER zz_fail_backfill_mark`) + require.NoError(t, err, "drop fault trigger") + + b2, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "clean re-Open must succeed (backfill re-runs)") + t.Cleanup(func() { _ = b2.Close() }) + + _, isNull = embedGenOf(t, s.DB(), 1) + assert.False(t, isNull, "embed_gen stamped after clean re-Open") + assert.True(t, backfillLedgerMarked(t, s.DB()), + "ledger marked after clean re-Open") +} + +// TestBackfillEmbedGen_StampAndMarkAtomic_BothPresentOnSuccess is the positive +// companion: a successful backfill leaves BOTH the embed_gen stamp and the +// ledger mark present (the all-or-nothing tx committed both). +func TestBackfillEmbedGen_StampAndMarkAtomic_BothPresentOnSuccess(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + mainPath, vecPath := seedEmbeddedMain(ctx, t) + + s, err := store.Open(mainPath) + require.NoError(t, err, "reopen main rw") + defer func() { _ = s.Close() }() + + b, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "Open (runs backfill)") + t.Cleanup(func() { _ = b.Close() }) + + // Both committed together. 
+ v, isNull := embedGenOf(t, s.DB(), 1) + require.False(t, isNull, "embed_gen stamped after successful backfill") + assert.Positive(t, v, "embed_gen references the active generation") + assert.True(t, backfillLedgerMarked(t, s.DB()), + "ledger marked after successful backfill") +} + +// TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal is the SQLite +// regression guard for the pending-signal preservation case: the one-time +// upgrade backfill must NOT stamp embed_gen=active on a message that carried an +// active-gen pending_embeddings row (the OLD re-embed flag), even though it has +// an active-gen embedding. Such a message had a STALE embedding queued for +// re-embed (old repair-encoding re-enqueued it); stamping it "covered" would +// leave it permanently stale. It must end embed_gen=NULL so scan-and-fill +// re-embeds it, while a normal embedded message with no pending row ends +// embed_gen=active. pending_embeddings (in vectors.db) is dropped by Open after. +func TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + + // Real main DB with two live messages. Close the store via t.Cleanup + // (registered before the backend's, so LIFO closes the store LAST, after + // the backend that borrows s.DB()) — otherwise the open msgvault.db handle + // blocks t.TempDir() cleanup on Windows. 
+ s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open (rw)") + t.Cleanup(func() { _ = s.Close() }) + require.NoError(t, s.InitSchema(), "InitSchema") + _, err = s.DB().Exec(` +INSERT INTO sources (id, source_type, identifier) VALUES (1, 'gmail', 'me@example.com'); +INSERT INTO conversations (id, source_id, conversation_type) VALUES (1, 1, 'email_thread'); +INSERT INTO messages (id, conversation_id, source_id, source_message_id, message_type) +VALUES (1, 1, 1, 'm1', 'email'), (2, 1, 1, 'm2', 'email'); +`) + require.NoError(t, err, "seed messages") + + rw, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "rw backend Open") + gen, err := rw.CreateGeneration(ctx, "model", 4, "model:4") + require.NoError(t, err, "CreateGeneration") + // Both messages have an active-gen embedding. + require.NoError(t, rw.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: 1, ChunkIndex: 0, Vector: []float32{0, 0, 0, 1}}, + {MessageID: 2, ChunkIndex: 0, Vector: []float32{0, 0, 1, 0}}, + }), "Upsert") + require.NoError(t, rw.ActivateGeneration(ctx, gen, true), "Activate") + + // Reconstruct the OLD-state precondition inside vectors.db: pending_embeddings + // exists with an active-gen row for msg 1 ONLY (msg 1 was re-enqueued for + // re-embed while still holding its stale active-gen embedding). + _, err = rw.DB().ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id INTEGER NOT NULL, + message_id INTEGER NOT NULL + )`) + require.NoError(t, err, "create legacy pending_embeddings") + _, err = rw.DB().ExecContext(ctx, + `INSERT INTO pending_embeddings (generation_id, message_id) VALUES (?, 1)`, int64(gen)) + require.NoError(t, err, "seed active-gen pending row for msg 1") + require.NoError(t, rw.Close(), "close rw backend") + + // Simulate the upgrade: embed_gen NULL everywhere, ledger cleared. 
+ _, err = s.DB().Exec(`UPDATE messages SET embed_gen = NULL`) + require.NoError(t, err, "reset embed_gen") + _, err = s.DB().Exec(`DELETE FROM applied_migrations WHERE name = ?`, embedGenBackfillMigration) + require.NoError(t, err, "clear ledger") + + // Writable Open runs the backfill (which consults pending) then drops the + // table. + b, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "writable Open (runs backfill)") + t.Cleanup(func() { _ = b.Close() }) + + // msg 1 (had an active-gen pending re-embed row) must stay NULL → re-embed. + _, isNull1 := embedGenOf(t, s.DB(), 1) + assert.True(t, isNull1, + "msg 1 (active-gen pending re-embed) must stay embed_gen=NULL so it re-embeds") + // msg 2 (normal embedded, no pending) must be stamped → not re-embedded. + v2, isNull2 := embedGenOf(t, s.DB(), 2) + assert.False(t, isNull2, "msg 2 (no pending row) must be stamped") + assert.Equal(t, int64(gen), v2, "msg 2 embed_gen = active") + + // The dead pending_embeddings table is dropped after the backfill consumed + // its signal. + exists, err := tableExists(ctx, b.DB(), "pending_embeddings") + require.NoError(t, err, "probe pending_embeddings") + assert.False(t, exists, "writable Open must drop pending_embeddings after the backfill consults it") +} + +// TestOpen_DropsDeadPendingEmbeddings pins that a normal writable Open drops +// the dead pending_embeddings table from vectors.db AFTER the backfill has had +// a chance to consult it. The drop moved out of Migrate into +// the Open writable path. 
+func TestOpen_DropsDeadPendingEmbeddings(t *testing.T) { + require.NoError(t, RegisterExtension(), "RegisterExtension") + ctx := context.Background() + + dir := t.TempDir() + mainPath := filepath.Join(dir, "msgvault.db") + vecPath := filepath.Join(dir, "vectors.db") + + s, err := store.Open(mainPath) + require.NoError(t, err, "store.Open (rw)") + require.NoError(t, s.InitSchema(), "InitSchema") + defer func() { _ = s.Close() }() + + // First Open creates vectors.db; then stand up a legacy pending_embeddings + // table and reopen writably so the Open drop path runs against it. + b0, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "first Open") + _, err = b0.DB().ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id INTEGER NOT NULL, + message_id INTEGER NOT NULL + )`) + require.NoError(t, err, "create legacy pending_embeddings") + require.NoError(t, b0.Close(), "close first Open") + + b, err := Open(ctx, Options{ + Path: vecPath, MainPath: mainPath, Dimension: 4, MainDB: s.DB(), + }) + require.NoError(t, err, "writable reopen") + t.Cleanup(func() { _ = b.Close() }) + + exists, err := tableExists(ctx, b.DB(), "pending_embeddings") + require.NoError(t, err, "probe pending_embeddings") + assert.False(t, exists, "writable Open must drop pending_embeddings") +} diff --git a/internal/vector/sqlitevec/coverage_test.go b/internal/vector/sqlitevec/coverage_test.go new file mode 100644 index 000000000..2ffcd0ac6 --- /dev/null +++ b/internal/vector/sqlitevec/coverage_test.go @@ -0,0 +1,220 @@ +//go:build fts5 && sqlite_vec + +package sqlitevec + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + assertpkg "github.com/stretchr/testify/assert" + requirepkg "github.com/stretchr/testify/require" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" + "go.kenn.io/msgvault/internal/vector" +) + +// TestCoverageSplit_EmbeddedBlankMissing proves 
the full coverage readout — +// live / embedded / blank / missing — is computed from real state, not +// stubbed. It builds a generation where: +// +// - two messages are EMBEDDED: they have an actual vector row (Upsert) and +// are stamped embed_gen=gen (the worker's DONE mark). +// - two messages are BLANK: stamped embed_gen=gen (terminal DONE) but with +// NO vector — the empty/unembeddable case the blank count exists to +// surface (body-extraction-regression detector). +// - one message is MISSING: never stamped (embed_gen NULL). +// +// It then asserts each leg is the exact expected number, computing the split +// exactly as the CLI does: +// +// stamped = CoverageCounts(gen) 2nd value (embed_gen=gen, incl. blanks) +// embedded = backend.EmbeddedMessageCount(gen) (COUNT(DISTINCT message_id)) +// blank = stamped - embedded +// missing = CoverageCounts(gen) 4th value +// +// and verifies the load-bearing invariant live == embedded + blank + missing. +func TestCoverageSplit_EmbeddedBlankMissing(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + + // A sqlitevec test must use a SQLite main store regardless of + // MSGVAULT_TEST_DB: the backend's Open-time probes run SQLite-dialect SQL + // (sqlite_master) against this handle, and in production sqlitevec is only + // ever paired with a SQLite main store. + st := testutil.NewSQLiteTestStore(t) + + // Open a sqlitevec backend over the SAME main DB handle. MainPath is + // only needed for FusedSearch (ATTACH); EmbeddedMessageCount/Upsert work + // off MainDB + vectors.db alone. 
+ b, err := Open(ctx, Options{ + Path: filepath.Join(t.TempDir(), "vectors.db"), + Dimension: 8, + MainDB: st.DB(), + }) + require.NoError(err, "Open backend") + t.Cleanup(func() { _ = b.Close() }) + + source, err := st.GetOrCreateSource("gmail", "me@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "conv-1", "email_thread", "Subject") + require.NoError(err, "EnsureConversationWithType") + + // Create 5 live messages. + makeMsg := func(srcMsgID string) int64 { + m := &store.Message{ + SourceID: source.ID, + SourceMessageID: srcMsgID, + ConversationID: convID, + MessageType: "email", + Subject: sql.NullString{String: "s-" + srcMsgID, Valid: true}, + } + id, err := st.UpsertMessage(m) + require.NoErrorf(err, "UpsertMessage %s", srcMsgID) + return id + } + embeddedA := makeMsg("emb-a") + embeddedB := makeMsg("emb-b") + blankA := makeMsg("blank-a") + blankB := makeMsg("blank-b") + missing := makeMsg("missing") // never stamped + + gen, err := b.CreateGeneration(ctx, "test-model", 8, "fp") + require.NoError(err, "CreateGeneration") + + // Embedded messages: real vector rows + stamp. + vec := func(seed float32) []float32 { + v := make([]float32, 8) + v[0] = seed + return v + } + require.NoError(b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: embeddedA, Vector: vec(1)}, + {MessageID: embeddedB, Vector: vec(2)}, + }), "Upsert embedded vectors") + require.NoError(st.SetEmbedGen(ctx, []int64{embeddedA, embeddedB}, int64(gen)), "stamp embedded") + + // Blank messages: stamped terminal DONE but NO vector row. + require.NoError(st.SetEmbedGen(ctx, []int64{blankA, blankB}, int64(gen)), "stamp blank") + + // missing: left with embed_gen NULL — nothing to do. + _ = missing + + // --- Compute the split exactly as the CLI does. 
--- + live, stamped, blankFromStore, missingCount, err := st.CoverageCounts(ctx, int64(gen)) + require.NoError(err, "CoverageCounts") + embedded, err := b.EmbeddedMessageCount(ctx, gen) + require.NoError(err, "EmbeddedMessageCount") + blank := max(stamped-embedded, 0) + + // CoverageCounts' 3rd return is the legacy always-0 stub; the real blank + // is the display-layer computation. Pin both facts. + assert.Equal(int64(0), blankFromStore, "CoverageCounts blank stays the 0 stub") + + assert.Equal(int64(5), live, "live = all 5 messages") + assert.Equal(int64(4), stamped, "stamped = 4 (2 embedded + 2 blank)") + assert.Equal(int64(2), embedded, "embedded = 2 (distinct message_ids with a vector)") + assert.Equal(int64(2), blank, "blank = stamped - embedded = 2") + assert.Equal(int64(1), missingCount, "missing = 1 (never stamped)") + + // The load-bearing invariant. + assert.Equal(live, embedded+blank+missingCount, + "invariant: live == embedded + blank + missing") +} + +// TestCoverageSplit_NonLiveEmbeddedHoldsInvariant proves the coverage +// invariant survives a message that was EMBEDDED for the generation and +// then went non-live (soft-deleted). Backend.Delete has no production +// callers, so the embedding row survives the soft-delete; an unfiltered +// EmbeddedMessageCount would then count the dead message, making +// embedded > stamped (stamped is live-only), driving blank negative +// (clamped to 0) and breaking live == embedded + blank + missing — with +// EMBEDDED able to exceed LIVE. +// +// With the live-intersected count the dead message drops out of embedded, +// so embedded <= stamped <= live, blank >= 0, and the invariant holds. +func TestCoverageSplit_NonLiveEmbeddedHoldsInvariant(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + + // See TestCoverageSplit_EmbeddedBlankMissing: a sqlitevec test must use a + // SQLite main store regardless of MSGVAULT_TEST_DB. 
+ st := testutil.NewSQLiteTestStore(t) + b, err := Open(ctx, Options{ + Path: filepath.Join(t.TempDir(), "vectors.db"), + Dimension: 8, + MainDB: st.DB(), + }) + require.NoError(err, "Open backend") + t.Cleanup(func() { _ = b.Close() }) + + source, err := st.GetOrCreateSource("gmail", "me@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "conv-1", "email_thread", "Subject") + require.NoError(err, "EnsureConversationWithType") + + makeMsg := func(srcMsgID string) int64 { + m := &store.Message{ + SourceID: source.ID, + SourceMessageID: srcMsgID, + ConversationID: convID, + MessageType: "email", + Subject: sql.NullString{String: "s-" + srcMsgID, Valid: true}, + } + id, err := st.UpsertMessage(m) + require.NoErrorf(err, "UpsertMessage %s", srcMsgID) + return id + } + embeddedA := makeMsg("emb-a") + embeddedB := makeMsg("emb-b") + missing := makeMsg("missing") + + gen, err := b.CreateGeneration(ctx, "test-model", 8, "fp") + require.NoError(err, "CreateGeneration") + + vec := func(seed float32) []float32 { + v := make([]float32, 8) + v[0] = seed + return v + } + require.NoError(b.Upsert(ctx, gen, []vector.Chunk{ + {MessageID: embeddedA, Vector: vec(1)}, + {MessageID: embeddedB, Vector: vec(2)}, + }), "Upsert embedded vectors") + require.NoError(st.SetEmbedGen(ctx, []int64{embeddedA, embeddedB}, int64(gen)), "stamp embedded") + _ = missing + + // Sanity before the soft-delete: both embedded messages are live. + embeddedBefore, err := b.EmbeddedMessageCount(ctx, gen) + require.NoError(err, "EmbeddedMessageCount before") + assert.Equal(int64(2), embeddedBefore, "two live embedded before soft-delete") + + // Soft-delete one EMBEDDED message (deleted_from_source_at) — its + // embedding row stays behind, but it is no longer a live message. 
+ _, err = st.DB().Exec( + st.Rebind("UPDATE messages SET deleted_from_source_at = CURRENT_TIMESTAMP WHERE id = ?"), + embeddedA) + require.NoError(err, "soft-delete embeddedA") + + // Compute the split exactly as the CLI does. + live, stamped, _, missingCount, err := st.CoverageCounts(ctx, int64(gen)) + require.NoError(err, "CoverageCounts") + embedded, err := b.EmbeddedMessageCount(ctx, gen) + require.NoError(err, "EmbeddedMessageCount after") + blank := max(stamped-embedded, 0) + + // The dead message must NOT be counted as embedded. + assert.Equal(int64(1), embedded, "non-live embedded message excluded") + assert.LessOrEqual(embedded, live, "embedded <= live") + assert.LessOrEqual(embedded, stamped, "embedded <= stamped") + assert.GreaterOrEqual(blank, int64(0), "blank >= 0") + // live = 2 (embeddedB live-embedded + missing live-unstamped). + assert.Equal(int64(2), live, "live excludes the soft-deleted message") + // The load-bearing invariant survives the non-live embedded row. + assert.Equal(live, embedded+blank+missingCount, + "invariant: live == embedded + blank + missing") +} diff --git a/internal/vector/sqlitevec/ext_stub.go b/internal/vector/sqlitevec/ext_stub.go index 9b3aafffb..081a8398b 100644 --- a/internal/vector/sqlitevec/ext_stub.go +++ b/internal/vector/sqlitevec/ext_stub.go @@ -36,6 +36,7 @@ type Options struct { MainPath string Dimension int MainDB *sql.DB + ReadOnly bool } // Backend is the stub backend type for builds without sqlite_vec. @@ -106,12 +107,17 @@ func (b *Backend) Stats(_ context.Context, _ vector.GenerationID) (vector.Stats, return vector.Stats{}, ErrNotBuilt } -// EnsureSeeded is a stub that always returns ErrNotBuilt. -func (b *Backend) EnsureSeeded(_ context.Context, _ vector.GenerationID) error { - return ErrNotBuilt -} - // LoadVector is a stub that always returns ErrNotBuilt. 
func (b *Backend) LoadVector(_ context.Context, _ int64) ([]float32, error) { return nil, ErrNotBuilt } + +// ResetWatermarkBelow is a stub that always returns ErrNotBuilt. +func (b *Backend) ResetWatermarkBelow(_ context.Context, _ int64) error { + return ErrNotBuilt +} + +// EmbeddedMessageCount is a stub that always returns ErrNotBuilt. +func (b *Backend) EmbeddedMessageCount(_ context.Context, _ vector.GenerationID) (int64, error) { + return 0, ErrNotBuilt +} diff --git a/internal/vector/sqlitevec/migrate.go b/internal/vector/sqlitevec/migrate.go index f732deeb2..07e2e7506 100644 --- a/internal/vector/sqlitevec/migrate.go +++ b/internal/vector/sqlitevec/migrate.go @@ -44,6 +44,19 @@ func Migrate(ctx context.Context, db *sql.DB, defaultDim int) error { return fmt.Errorf("migrate vectors.db (%s): %w", m.desc, err) } } + // NOTE: the dead pending_embeddings queue table is NOT dropped here. The + // scan-and-fill design replaced the per-generation seed queue with a live + // messages.embed_gen scan, so the table is otherwise unused — BUT the + // one-time upgrade backfill (BackfillEmbedGenForUpgrade) must first consult + // it to preserve the legacy "pending re-embed" signal: a message could + // carry BOTH a stale active-gen embedding AND an active-gen pending row + // (old repair-encoding re-enqueued already-embedded messages). Dropping it + // here, before the backfill reads it, would let the backfill stamp those + // messages "covered" and never re-embed them. The drop now happens in the + // writable Open path AFTER the backfill has consulted it (see + // dropDeadPendingEmbeddings, called from Open). Migrate runs on read-only + // opens too, where dropping would be wrong (it must linger until a writable + // open honors the signal). 
if _, err := db.ExecContext(ctx, `PRAGMA journal_mode = WAL`); err != nil { return fmt.Errorf("enable WAL: %w", err) } diff --git a/internal/vector/sqlitevec/migrate_test.go b/internal/vector/sqlitevec/migrate_test.go index a94cecc4e..7256b74d9 100644 --- a/internal/vector/sqlitevec/migrate_test.go +++ b/internal/vector/sqlitevec/migrate_test.go @@ -25,7 +25,7 @@ func TestMigrate_FreshAndIdempotent(t *testing.T) { for _, tbl := range []string{ "index_generations", "embeddings", "embed_runs", - "pending_embeddings", "vectors_vec_d768", "schema_version", + "embed_watermark", "vectors_vec_d768", "schema_version", } { var name string err := db.QueryRow(`SELECT name FROM sqlite_master WHERE name = ?`, tbl).Scan(&name) @@ -322,6 +322,40 @@ func TestMigrate_CreatesDimensionSpecificVecTable(t *testing.T) { assertpkg.NoError(t, err, "vectors_vec_d1024 not created") } +// TestMigrate_KeepsDeadPendingEmbeddings pins that Migrate ALONE no longer +// drops the dead pending_embeddings queue table: the one-time upgrade backfill +// (BackfillEmbedGenForUpgrade) must first consult the table to preserve its +// legacy re-embed signal. The drop moved to the writable Open +// path, AFTER the backfill — see TestOpen_DropsDeadPendingEmbeddings and +// TestBackfillEmbedGen_PreservesActiveGenPendingReembedSignal. Migrate runs on +// read-only opens too, where dropping (before the signal is honored on a later +// writable open) would be wrong. +func TestMigrate_KeepsDeadPendingEmbeddings(t *testing.T) { + ctx := context.Background() + path := filepath.Join(t.TempDir(), "vectors.db") + db := openTestDB(t, path) + t.Cleanup(func() { _ = db.Close() }) + + // Stand up a legacy pending_embeddings table before migrating. 
+ _, err := db.ExecContext(ctx, `CREATE TABLE pending_embeddings ( + generation_id INTEGER NOT NULL, + message_id INTEGER NOT NULL + )`) + requirepkg.NoError(t, err, "create legacy pending_embeddings") + + requirepkg.NoError(t, Migrate(ctx, db, 768), "migrate") + + exists, err := tableExists(ctx, db, "pending_embeddings") + requirepkg.NoError(t, err, "probe pending_embeddings") + assertpkg.True(t, exists, "Migrate alone must NOT drop pending_embeddings (Open does, after the backfill consults it)") + + // Idempotent: a second Migrate still leaves it (the drop is Open's job). + requirepkg.NoError(t, Migrate(ctx, db, 768), "second migrate (idempotent)") + exists, err = tableExists(ctx, db, "pending_embeddings") + requirepkg.NoError(t, err, "probe pending_embeddings after second migrate") + assertpkg.True(t, exists, "second Migrate still must not drop pending_embeddings") +} + func openTestDB(t *testing.T, path string) *sql.DB { t.Helper() requirepkg.NoError(t, RegisterExtension(), "register") @@ -379,9 +413,13 @@ func TestForeignKeys_PerConnection(t *testing.T) { err := c.QueryRowContext(ctx, `PRAGMA foreign_keys`).Scan(&fk) require.NoErrorf(err, "conn %d pragma read", i) assert.Equalf(1, fk, "conn %d: foreign_keys (ConnectHook missed this conn)", i) + // embeddings.generation_id REFERENCES index_generations(id); insert + // a row pointing at a non-existent generation to trigger the FK + // violation on a properly-configured connection. 
_, err = c.ExecContext(ctx, - `INSERT INTO pending_embeddings (generation_id, message_id, enqueued_at) - VALUES (?, ?, ?)`, 9999999, int64(i), int64(i)) + `INSERT INTO embeddings + (generation_id, message_id, chunk_index, embedded_at, source_char_len) + VALUES (?, ?, 0, 0, 0)`, 9999999, int64(i)) //nolint:testifylint // guarded assert+continue: a require here would abort the per-conn loop instead of skipping to the next connection if !assert.Errorf(err, "conn %d: FK-violating insert should fail", i) { continue diff --git a/internal/vector/sqlitevec/schema.sql b/internal/vector/sqlitevec/schema.sql index 94e740a1e..d8aab02d5 100644 --- a/internal/vector/sqlitevec/schema.sql +++ b/internal/vector/sqlitevec/schema.sql @@ -11,12 +11,11 @@ CREATE TABLE IF NOT EXISTS index_generations ( dimension INTEGER NOT NULL, fingerprint TEXT NOT NULL, started_at INTEGER NOT NULL, - -- seeded_at marks when the initial pending_embeddings seed pass - -- finished. NULL means "row inserted but seed never committed" - -- (e.g. crash between insert and seed) — the resume path re-runs - -- seedPending in that case rather than activating an empty - -- generation. Columns added after release ship via the ALTER - -- TABLE migrations in migrate.go for already-initialized databases. + -- seeded_at is stamped at CreateGeneration as harmless vestigial + -- metadata. Under the scan-and-fill design there is no separate seed + -- pass, and activation no longer gates on it (coverage — missing==0 — + -- is the real gate). Retained only so the column stays populated for + -- legacy display; no destructive migration drops it. 
seeded_at INTEGER, completed_at INTEGER, activated_at INTEGER, @@ -55,19 +54,6 @@ CREATE TABLE IF NOT EXISTS embeddings ( CREATE INDEX IF NOT EXISTS idx_embeddings_msg ON embeddings(message_id); CREATE INDEX IF NOT EXISTS idx_embeddings_gen_msg ON embeddings(generation_id, message_id); -CREATE TABLE IF NOT EXISTS pending_embeddings ( - generation_id INTEGER NOT NULL REFERENCES index_generations(id) ON DELETE CASCADE, - message_id INTEGER NOT NULL, - enqueued_at INTEGER NOT NULL, - claimed_at INTEGER, - claim_token TEXT, - PRIMARY KEY (generation_id, message_id) -); -CREATE INDEX IF NOT EXISTS idx_pending_available - ON pending_embeddings(generation_id, message_id) WHERE claimed_at IS NULL; -CREATE INDEX IF NOT EXISTS idx_pending_claims - ON pending_embeddings(claimed_at) WHERE claimed_at IS NOT NULL; - CREATE TABLE IF NOT EXISTS embed_runs ( id INTEGER PRIMARY KEY AUTOINCREMENT, generation_id INTEGER NOT NULL REFERENCES index_generations(id), @@ -79,3 +65,18 @@ CREATE TABLE IF NOT EXISTS embed_runs ( truncated INTEGER NOT NULL DEFAULT 0, error TEXT ); + +-- embed_watermark tracks the highest message id the scan-and-fill embed +-- worker has already swept for a generation, so each RunOnce resumes the +-- forward scan from where the last one stopped instead of re-scanning the +-- whole messages B-tree. It is a pure optimization: losing it (or never +-- seeding it) only makes the next scan start at id 0, which is harmless +-- because the scan predicate (embed_gen IS NULL OR embed_gen <> gen) and +-- the idempotent upsert make re-sweeping covered rows a no-op. The +-- full-scan backstop ignores this watermark entirely. No FK to messages +-- (those live in the main DB on SQLite); it lives here with the +-- generations it watermarks. 
+CREATE TABLE IF NOT EXISTS embed_watermark ( + generation_id INTEGER PRIMARY KEY, + watermark_id INTEGER NOT NULL DEFAULT 0 +); diff --git a/internal/vector/stats.go b/internal/vector/stats.go index 8cc5cc19f..f83b9ef0b 100644 --- a/internal/vector/stats.go +++ b/internal/vector/stats.go @@ -26,10 +26,14 @@ type StatsView struct { // Omitted entirely when no build is running. BuildingGeneration *BuildingSummary `json:"building_generation,omitempty"` - // PendingEmbeddingsTotal is the sum of pending_embeddings rows - // across the active and building generations. Retired generations - // are assumed to have zero pending items. - PendingEmbeddingsTotal int64 `json:"pending_embeddings_total"` + // MissingEmbeddingsTotal is the live-message count still needing embedding + // for the generation the worker will actually target next. When a rebuild + // is in flight, building-generation coverage is the actionable target and + // active-generation top-ups are intentionally frozen until activation. + // Without a building generation, it reports active-generation drift. Retired + // generations contribute zero. Replaces the former pending-embeddings queue + // total under the scan-and-fill design. + MissingEmbeddingsTotal int64 `json:"missing_embeddings_total"` } // GenerationSummary reports the serving state for the active index @@ -53,9 +57,10 @@ type BuildingSummary struct { Progress Progress `json:"progress"` } -// Progress reports the build-queue position for a generation. Done is -// the count of already-embedded messages; Total is Done plus the -// currently-pending queue depth. +// Progress reports embedding coverage for a generation under scan-and-fill +// (there is no build/pending queue). Done is the count of already-embedded +// messages; Total is Done plus the live messages still missing an embedding +// for the generation (embed_gen IS NULL OR embed_gen <> gen), i.e. the coverage denominator. 
type Progress struct { Done int64 `json:"done"` Total int64 `json:"total"` @@ -81,6 +86,9 @@ func CollectStats(ctx context.Context, b Backend) (*StatsView, error) { } out := &StatsView{Enabled: true} var errs []error + var buildingExists bool + var activePending int64 + var activeStatsOK bool active, err := b.ActiveGeneration(ctx) switch { @@ -98,7 +106,8 @@ func CollectStats(ctx context.Context, b Backend) (*StatsView, error) { MessageCount: s.EmbeddingCount, ActivatedAt: formatTimePtr(active.ActivatedAt), } - out.PendingEmbeddingsTotal += s.PendingCount + activePending = s.PendingCount + activeStatsOK = true } case errors.Is(err, ErrNoActiveGeneration): // Leave ActiveGeneration nil; this is normal during first build. @@ -110,6 +119,7 @@ func CollectStats(ctx context.Context, b Backend) (*StatsView, error) { if err != nil { errs = append(errs, fmt.Errorf("building generation: %w", err)) } else if building != nil { + buildingExists = true s, sErr := b.Stats(ctx, building.ID) if sErr != nil { errs = append(errs, fmt.Errorf("stats for building generation %d: %w", building.ID, sErr)) @@ -124,9 +134,12 @@ func CollectStats(ctx context.Context, b Backend) (*StatsView, error) { Total: s.EmbeddingCount + s.PendingCount, }, } - out.PendingEmbeddingsTotal += s.PendingCount + out.MissingEmbeddingsTotal = s.PendingCount } } + if !buildingExists && activeStatsOK { + out.MissingEmbeddingsTotal = activePending + } return out, errors.Join(errs...) 
} diff --git a/internal/vector/stats_test.go b/internal/vector/stats_test.go index 84837f516..e168374d4 100644 --- a/internal/vector/stats_test.go +++ b/internal/vector/stats_test.go @@ -75,12 +75,16 @@ func (f *statsFakeBackend) LoadVector(context.Context, int64) ([]float32, error) return nil, errors.New("not implemented") } -func (f *statsFakeBackend) Close() error { return nil } - -func (f *statsFakeBackend) EnsureSeeded(context.Context, GenerationID) error { +func (f *statsFakeBackend) ResetWatermarkBelow(context.Context, int64) error { return errors.New("not implemented") } +func (f *statsFakeBackend) EmbeddedMessageCount(context.Context, GenerationID) (int64, error) { + return 0, errors.New("not implemented") +} + +func (f *statsFakeBackend) Close() error { return nil } + var _ Backend = (*statsFakeBackend)(nil) func TestCollectStats_NilBackend(t *testing.T) { @@ -121,7 +125,7 @@ func TestCollectStats_ActiveOnly(t *testing.T) { assert.Equal(int64(100), ag.MessageCount) assert.Equal(activatedAt.Format(time.RFC3339), ag.ActivatedAt) assert.Nil(sv.BuildingGeneration) - assert.Equal(int64(7), sv.PendingEmbeddingsTotal) + assert.Equal(int64(7), sv.MissingEmbeddingsTotal) } func TestCollectStats_BuildingOnly(t *testing.T) { @@ -154,7 +158,7 @@ func TestCollectStats_BuildingOnly(t *testing.T) { assert.Equal(startedAt.Format(time.RFC3339), bg.StartedAt) assert.Equal(int64(40), bg.Progress.Done) assert.Equal(int64(100), bg.Progress.Total) - assert.Equal(int64(60), sv.PendingEmbeddingsTotal) + assert.Equal(int64(60), sv.MissingEmbeddingsTotal) } func TestCollectStats_BothGenerations(t *testing.T) { @@ -192,8 +196,10 @@ func TestCollectStats_BothGenerations(t *testing.T) { if assert.NotNil(sv.BuildingGeneration) { assert.Equal(GenerationID(2), sv.BuildingGeneration.ID) } - // Sum of both pending counts: 3 + 450. 
- assert.Equal(int64(453), sv.PendingEmbeddingsTotal) + // While a rebuild is in flight the worker targets the building generation + // and freezes active-generation top-ups, so the total reports actionable + // building work rather than adding active-generation drift. + assert.Equal(int64(450), sv.MissingEmbeddingsTotal) } func TestCollectStats_ActiveError(t *testing.T) { @@ -252,7 +258,38 @@ func TestCollectStats_BuildingStatsError_Tolerated(t *testing.T) { require.ErrorIs(err, wantErr) require.NotNil(sv, "CollectStats sv = nil, want non-nil envelope") assert.Nil(sv.BuildingGeneration, "Stats failed") - assert.Equal(int64(0), sv.PendingEmbeddingsTotal) + assert.Equal(int64(0), sv.MissingEmbeddingsTotal) +} + +func TestCollectStats_BuildingStatsErrorDoesNotReportFrozenActiveDrift(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + wantErr := errors.New("stats table locked") + b := &statsFakeBackend{ + active: &Generation{ + ID: 1, + Model: "m1", + Dimension: 384, + Fingerprint: "m1:384", + State: GenerationActive, + }, + building: &Generation{ + ID: 2, + Model: "m2", + Dimension: 768, + }, + statsByGen: map[GenerationID]Stats{ + 1: {EmbeddingCount: 500, PendingCount: 12}, + }, + statsErr: map[GenerationID]error{2: wantErr}, + } + + sv, err := CollectStats(context.Background(), b) + require.Error(err, "CollectStats err should wrap want") + require.ErrorIs(err, wantErr) + require.NotNil(sv, "CollectStats sv = nil, want non-nil envelope") + assert.Nil(sv.BuildingGeneration, "Stats failed") + assert.Equal(int64(0), sv.MissingEmbeddingsTotal, "active drift is frozen while building exists") } func TestCollectStats_StatsError_Tolerated(t *testing.T) { @@ -279,5 +316,5 @@ func TestCollectStats_StatsError_Tolerated(t *testing.T) { require.NotNil(sv, "CollectStats sv = nil, want non-nil envelope") assert.True(sv.Enabled, "backend is non-nil") assert.Nil(sv.ActiveGeneration, "Stats failed") - assert.Equal(int64(0), sv.PendingEmbeddingsTotal, "no 
successful stats") + assert.Equal(int64(0), sv.MissingEmbeddingsTotal, "no successful stats") }