diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c index ceeaaef9..1ae75862 100644 --- a/src/pipeline/pipeline_incremental.c +++ b/src/pipeline/pipeline_incremental.c @@ -25,6 +25,7 @@ enum { INCR_RING_BUF = 4, INCR_RING_MASK = 3, INCR_TS_BUF = 24, INCR_WAL_BUF = 1 #include "foundation/compat_fs.h" #include "foundation/platform.h" +#include #include #include #include @@ -120,48 +121,247 @@ static bool *classify_files(cbm_file_info_t *files, int file_count, cbm_file_has return changed; } -/* Find stored files that no longer exist on disk. Returns count. */ -static int find_deleted_files(cbm_file_info_t *files, int file_count, cbm_file_hash_t *stored, - int stored_count, char ***out_deleted) { +/* Classify stored files that are absent from current discovery. Returns the + * count of truly-deleted files (output via out_deleted) and ALSO collects + * mode-skipped files into out_mode_skipped (caller frees both). + * + * A stored file is classified as: + * - "deleted" — `stat()` returns ENOENT or ENOTDIR. Its nodes will + * be purged and its hash row dropped. + * - "mode-skipped" — `stat()` succeeds. The file exists on disk but the + * current discovery pass didn't visit it (e.g. excluded + * by FAST_SKIP_DIRS in fast/moderate mode). Its nodes + * must be preserved AND its hash row must be carried + * forward into the new DB so subsequent reindexes can + * still see it as "known" rather than treating it as + * new-or-deleted. + * + * Without this distinction, a fast-mode reindex after a full-mode index + * would silently purge every file under `tools/`, `scripts/`, `bin/`, + * `build/`, `docs/`, `__tests__/`, etc. — see task + * claude-connectors/codebase-memory-index-repository-is-destructive-... + * and the 2026-04-13 Skyline incident (packages/mcp/src/tools/ vanished + * from a live graph mid-session). + * + * Mode-skipped hash preservation is the second half of the additive-merge + * contract: dump_and_persist re-upserts these hash rows so the next reindex + * can correctly detect a real on-disk deletion of a mode-skipped file (as + * opposed to seeing it as "never existed" → noop → orphaned graph nodes). + * + * Fail-safe rules (preserve nodes on uncertainty): + * - repo_path NULL → log error and preserve everything (return 0 + * deletions, empty mode_skipped). The caller contract is that + * repo_path is required; a NULL means a misconfigured pipeline, + * not a deletion signal. + * - snprintf truncation (combined path ≥ CBM_SZ_4K) → preserve. We can't + * reliably stat a truncated path. Treat as mode-skipped. + * - stat() errno != ENOENT/ENOTDIR (EACCES, EIO, ELOOP, transient NFS, + * etc.) → preserve. The file may exist; we just can't see it right now. + * Treat as mode-skipped. + * + * Note: we use stat() (not lstat()) on purpose. A symlink whose target was + * deleted should be classified as deleted from the indexer's perspective + * because the indexer follows symlinks during discovery — a stale symlink + * has no source to parse. */ +static int find_deleted_files(const char *repo_path, cbm_file_info_t *files, int file_count, + cbm_file_hash_t *stored, int stored_count, char ***out_deleted, + cbm_file_hash_t **out_mode_skipped, int *out_mode_skipped_count) { + *out_deleted = NULL; + *out_mode_skipped = NULL; + *out_mode_skipped_count = 0; + + if (!repo_path) { + /* Misconfigured pipeline. Preserve everything rather than risk + * silently re-introducing the destructive overwrite this function + * was rewritten to prevent. */ + cbm_log_error("incremental.err", "msg", "find_deleted_files_null_repo_path"); + return 0; + } + CBMHashTable *current = cbm_ht_create((size_t)file_count * PAIR_LEN); for (int i = 0; i < file_count; i++) { cbm_ht_set(current, files[i].rel_path, &files[i]); } - int count = 0; - int cap = CBM_SZ_64; - char **deleted = malloc((size_t)cap * sizeof(char *)); + int del_count = 0; + int del_cap = CBM_SZ_64; + char **deleted = malloc((size_t)del_cap * sizeof(char *)); + if (!deleted) { + cbm_log_error("incremental.err", "msg", "find_deleted_files_oom"); + cbm_ht_free(current); + return 0; + } + + int ms_count = 0; + int ms_cap = CBM_SZ_64; + cbm_file_hash_t *mode_skipped = malloc((size_t)ms_cap * sizeof(cbm_file_hash_t)); + if (!mode_skipped) { + cbm_log_error("incremental.err", "msg", "find_deleted_files_oom_ms"); + free(deleted); + cbm_ht_free(current); + return 0; + } for (int i = 0; i < stored_count; i++) { - if (!cbm_ht_get(current, stored[i].rel_path)) { - if (count >= cap) { - cap *= PAIR_LEN; - char **tmp = realloc(deleted, (size_t)cap * sizeof(char *)); + if (cbm_ht_get(current, stored[i].rel_path)) { + continue; /* still visited by current pass */ + } + /* Not in current discovery — check if it's truly deleted or just + * mode-skipped (excluded by FAST_SKIP_DIRS etc.). */ + bool preserve = false; + char abs_path[CBM_SZ_4K]; + int n = snprintf(abs_path, sizeof(abs_path), "%s/%s", repo_path, stored[i].rel_path); + if (n < 0 || n >= (int)sizeof(abs_path)) { + /* Truncation or encoding error — can't reliably stat. Preserve. */ + cbm_log_warn("incremental.path_truncated", "rel_path", stored[i].rel_path); + preserve = true; + } else { + struct stat st; + if (stat(abs_path, &st) == 0) { + /* File exists on disk — mode-skipped, not deleted. */ + preserve = true; + } else if (errno != ENOENT && errno != ENOTDIR) { + /* Transient or permission error — fail safe by preserving. + * EACCES, EIO, ELOOP, ENAMETOOLONG, etc. */ + cbm_log_warn("incremental.stat_uncertain", "rel_path", stored[i].rel_path, "errno", + itoa_buf(errno)); + preserve = true; + } + } + + if (preserve) { + /* Carry forward the existing hash row so subsequent reindexes + * can correctly classify this file. */ + if (ms_count >= ms_cap) { + ms_cap *= PAIR_LEN; + cbm_file_hash_t *tmp = realloc(mode_skipped, (size_t)ms_cap * sizeof(*tmp)); if (!tmp) { + cbm_log_error("incremental.err", "msg", + "find_deleted_files_realloc_oom_ms"); break; } - deleted = tmp; + mode_skipped = tmp; } - deleted[count++] = strdup(stored[i].rel_path); + char *rp = strdup(stored[i].rel_path); + char *sh = stored[i].sha256 ? strdup(stored[i].sha256) : NULL; + if (!rp || (stored[i].sha256 && !sh)) { + /* OOM mid-record. Drop this entry rather than persist a + * row with a NULL rel_path that would silently fail the + * NOT NULL constraint in upsert and reintroduce the + * orphaned-node bug. */ + cbm_log_error("incremental.err", "msg", "find_deleted_files_strdup_oom", + "rel_path", stored[i].rel_path); + free(rp); + free(sh); + break; + } + mode_skipped[ms_count].project = NULL; /* unused by upsert API */ + mode_skipped[ms_count].rel_path = rp; + mode_skipped[ms_count].sha256 = sh; + mode_skipped[ms_count].mtime_ns = stored[i].mtime_ns; + mode_skipped[ms_count].size = stored[i].size; + ms_count++; + continue; + } + + /* File is truly gone — record for purge. */ + if (del_count >= del_cap) { + del_cap *= PAIR_LEN; + char **tmp = realloc(deleted, (size_t)del_cap * sizeof(char *)); + if (!tmp) { + cbm_log_error("incremental.err", "msg", "find_deleted_files_realloc_oom"); + break; + } + deleted = tmp; } + deleted[del_count++] = strdup(stored[i].rel_path); } cbm_ht_free(current); *out_deleted = deleted; - return count; + *out_mode_skipped = mode_skipped; + *out_mode_skipped_count = ms_count; + return del_count; +} + +/* Free a mode_skipped array allocated by find_deleted_files. */ +static void free_mode_skipped(cbm_file_hash_t *ms, int count) { + if (!ms) { + return; + } + for (int i = 0; i < count; i++) { + free((void *)ms[i].rel_path); + free((void *)ms[i].sha256); + } + free(ms); } /* ── Persist file hashes ─────────────────────────────────────────── */ +/* Persist file hash rows for the current discovery and any mode-skipped + * files preserved from the previous DB. + * + * Partial-failure policy: an `upsert` failure on any single row is logged + * as a warning and the loop continues. We deliberately do NOT abort the + * whole reindex on a single bad row — partial preservation is better than + * total loss, and a transient failure on one file should not invalidate + * the entire incremental update. The trade-off is that a silently-failed + * row produces the same downstream effect as if the file were never + * indexed at all (forced re-parse on the next run for current-files, + * potential orphaned-node revival for mode_skipped). The warning surface + * is the only signal that something went wrong. */ static void persist_hashes(cbm_store_t *store, const char *project, cbm_file_info_t *files, - int file_count) { + int file_count, const cbm_file_hash_t *mode_skipped, + int mode_skipped_count) { + int current_failed = 0; + int ms_failed = 0; + + /* Current discovery: re-stat to capture any mtime/size that changed + * during the run, and write fresh hash rows for visited files. */ for (int i = 0; i < file_count; i++) { struct stat st; if (stat(files[i].path, &st) != 0) { continue; } - cbm_store_upsert_file_hash(store, project, files[i].rel_path, "", stat_mtime_ns(&st), - st.st_size); + int rc = cbm_store_upsert_file_hash(store, project, files[i].rel_path, "", + stat_mtime_ns(&st), st.st_size); + if (rc != CBM_STORE_OK) { + cbm_log_warn("incremental.persist_hash_failed", "scope", "current", "rel_path", + files[i].rel_path, "rc", itoa_buf(rc)); + current_failed++; + } + } + + /* Mode-skipped (preserved): re-upsert hash rows from the previous DB + * so the next reindex can still classify these files correctly. Without + * this, an orphaned-node bug emerges where: + * - full mode indexes everything + * - fast mode runs and drops mode-skipped hash rows + * - file is then deleted on disk + * - next reindex's stored hashes don't include the file → noop or + * can't detect the deletion → graph nodes for the deleted file + * remain forever (or until a destructive rebuild). + * + * A failure here is more serious than a current-files failure because + * it can revive the orphaned-node bug for that specific file. Logged + * with scope=mode_skipped so the warning is searchable. */ + if (mode_skipped) { + for (int i = 0; i < mode_skipped_count; i++) { + int rc = cbm_store_upsert_file_hash(store, project, mode_skipped[i].rel_path, + mode_skipped[i].sha256 ? mode_skipped[i].sha256 + : "", + mode_skipped[i].mtime_ns, mode_skipped[i].size); + if (rc != CBM_STORE_OK) { + cbm_log_warn("incremental.persist_hash_failed", "scope", "mode_skipped", + "rel_path", mode_skipped[i].rel_path, "rc", itoa_buf(rc)); + ms_failed++; + } + } + } + + if (current_failed > 0 || ms_failed > 0) { + cbm_log_warn("incremental.persist_summary", "current_failed", itoa_buf(current_failed), + "mode_skipped_failed", itoa_buf(ms_failed)); } } @@ -256,9 +456,13 @@ static void run_postpasses(cbm_pipeline_ctx_t *ctx, cbm_file_info_t *changed_fil itoa_buf((int)elapsed_ms(t))); } } -/* Delete old DB and dump merged graph + hashes to disk. */ +/* Delete old DB and dump merged graph + hashes to disk. + * Mode-skipped hash rows are preserved across the rebuild so subsequent + * reindexes can correctly distinguish "never indexed" from "indexed but + * not visited this pass". */ static void dump_and_persist(cbm_gbuf_t *gbuf, const char *db_path, const char *project, - cbm_file_info_t *files, int file_count) { + cbm_file_info_t *files, int file_count, + const cbm_file_hash_t *mode_skipped, int mode_skipped_count) { struct timespec t; cbm_clock_gettime(CLOCK_MONOTONIC, &t); @@ -276,7 +480,7 @@ static void dump_and_persist(cbm_gbuf_t *gbuf, const char *db_path, const char * cbm_store_t *hash_store = cbm_store_open_path(db_path); if (hash_store) { - persist_hashes(hash_store, project, files, file_count); + persist_hashes(hash_store, project, files, file_count, mode_skipped, mode_skipped_count); /* FTS5 rebuild after incremental dump. The btree dump path bypasses * any triggers that could have kept nodes_fts synchronized, so we @@ -323,18 +527,27 @@ int cbm_pipeline_run_incremental(cbm_pipeline_t *p, const char *db_path, cbm_fil bool *is_changed = classify_files(files, file_count, stored, stored_count, &n_changed, &n_unchanged); - /* Find deleted files */ + /* Classify stored files absent from current discovery: truly-deleted + * (purge) vs mode-skipped (preserve nodes AND hash rows). */ char **deleted = NULL; - int deleted_count = find_deleted_files(files, file_count, stored, stored_count, &deleted); + cbm_file_hash_t *mode_skipped = NULL; + int mode_skipped_count = 0; + int deleted_count = + find_deleted_files(cbm_pipeline_repo_path(p), files, file_count, stored, stored_count, + &deleted, &mode_skipped, &mode_skipped_count); cbm_log_info("incremental.classify", "changed", itoa_buf(n_changed), "unchanged", - itoa_buf(n_unchanged), "deleted", itoa_buf(deleted_count)); + itoa_buf(n_unchanged), "deleted", itoa_buf(deleted_count), "mode_skipped", + itoa_buf(mode_skipped_count)); - /* Fast path: nothing changed → skip */ + /* Fast path: nothing changed → skip. The on-disk DB is left untouched, + * which means existing hash rows (including for any mode-skipped files + * that were already preserved by an earlier run) remain intact. */ if (n_changed == 0 && deleted_count == 0) { cbm_log_info("incremental.noop", "reason", "no_changes"); free(is_changed); free(deleted); + free_mode_skipped(mode_skipped, mode_skipped_count); cbm_store_free_file_hashes(stored, stored_count); cbm_store_close(store); return 0; @@ -374,6 +587,7 @@ int cbm_pipeline_run_incremental(cbm_pipeline_t *p, const char *db_path, cbm_fil free(deleted[i]); } free(deleted); + free_mode_skipped(mode_skipped, mode_skipped_count); cbm_store_close(store); return CBM_NOT_FOUND; } @@ -424,8 +638,12 @@ int cbm_pipeline_run_incremental(cbm_pipeline_t *p, const char *db_path, cbm_fil free(changed_files); cbm_registry_free(registry); - /* Step 7: Dump to disk */ - dump_and_persist(existing, db_path, project, files, file_count); + /* Step 7: Dump to disk (preserves mode-skipped hash rows so the next + * reindex can correctly classify those files instead of seeing them + * as never-existed). */ + dump_and_persist(existing, db_path, project, files, file_count, mode_skipped, + mode_skipped_count); + free_mode_skipped(mode_skipped, mode_skipped_count); cbm_gbuf_free(existing); cbm_log_info("incremental.done", "elapsed_ms", itoa_buf((int)elapsed_ms(t0))); diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index cbe8fc86..3e46b6f7 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -15,6 +15,7 @@ #include #include #include "foundation/compat_thread.h" +#include #include #include #include "graph_buffer/graph_buffer.h" @@ -4525,6 +4526,154 @@ TEST(incremental_new_file_added) { PASS(); } +TEST(incremental_fast_preserves_mode_skipped_tools_dir) { + /* Regression: 2026-04-13. A fast-mode reindex after a full-mode index + * was silently destroying every file under FAST_SKIP_DIRS directories + * (`tools`, `scripts`, `bin`, `build`, `docs`, ...) by classifying them + * as deleted in find_deleted_files even though they still existed on + * disk. The Skyline graph lost packages/mcp/src/tools/ (18 files / ~500 + * nodes) mid-session when a concurrent /develop run obediently called + * mode='fast'. This test pins the additive semantics: lesser-mode + * reindexes must NOT delete files that are merely outside the current + * pass's discovery scope. Fix: find_deleted_files now stat()s each + * stored-but-missing file and only purges it if it is truly absent + * from disk. */ + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "/tmp/cbm_modeskip_XXXXXX"); + if (!cbm_mkdtemp(tmpdir)) { + SKIP("tmpdir"); + } + char dbpath[512]; + snprintf(dbpath, sizeof(dbpath), "%s/test.db", tmpdir); + + char path[512]; + FILE *f; + + /* main.go — root-level production code (visible in fast and full) */ + snprintf(path, sizeof(path), "%s/main.go", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "package main\n\nfunc main() {\n}\n"); + fclose(f); + + /* tools/util.go — production code under a FAST_SKIP_DIRS directory. + * Full mode indexes it; fast mode skips it via the discover.c heuristic. */ + char tools_dir[512]; + snprintf(tools_dir, sizeof(tools_dir), "%s/tools", tmpdir); + cbm_mkdir_p(tools_dir, 0755); + snprintf(path, sizeof(path), "%s/tools/util.go", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "package tools\n\nfunc Util() string {\n\treturn \"u\"\n}\n"); + fclose(f); + + /* Step 1: full-mode index — both files should be present */ + cbm_pipeline_t *p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + char *project = strdup(cbm_pipeline_project_name(p)); + cbm_pipeline_free(p); + + cbm_store_t *s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *tools_nodes_before = NULL; + int tools_count_before = 0; + cbm_store_find_nodes_by_file(s, project, "tools/util.go", &tools_nodes_before, + &tools_count_before); + ASSERT_GT(tools_count_before, 0); /* full mode must see tools/util.go */ + cbm_store_free_nodes(tools_nodes_before, tools_count_before); + int total_before = cbm_store_count_nodes(s, project); + cbm_store_close(s); + + /* Step 2: fast-mode reindex — tools/util.go MUST survive (additive semantics) */ + p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FAST); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + cbm_pipeline_free(p); + + s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *tools_nodes_after = NULL; + int tools_count_after = 0; + cbm_store_find_nodes_by_file(s, project, "tools/util.go", &tools_nodes_after, + &tools_count_after); + /* The critical assertion: tools/util.go nodes must still be present after + * a fast-mode reindex that skipped the tools/ directory. Before the fix, + * this was 0. */ + ASSERT_GT(tools_count_after, 0); + ASSERT_EQ(tools_count_after, tools_count_before); /* same nodes, untouched */ + cbm_store_free_nodes(tools_nodes_after, tools_count_after); + + /* Sanity: total node count should not have collapsed by ~the size of tools/ */ + int total_after = cbm_store_count_nodes(s, project); + ASSERT_GTE(total_after, total_before); /* additive — never less */ + cbm_store_close(s); + + /* Step 3: mutate main.go and fast reindex — forces dump_and_persist to + * run (instead of the noop early-return path that step 2 hit). This is + * the real dangerous path: the gbuf gets loaded, mutated for main.go, + * dumped back to disk. tools/util.go must survive THAT cycle, not just + * the trivial noop path. Audit finding from 2026-04-13. */ + snprintf(path, sizeof(path), "%s/main.go", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "package main\n\nfunc main() {\n\tprintln(\"changed\")\n}\n"); + fclose(f); + /* Bump mtime explicitly — some filesystems have coarse mtime resolution + * and the rewrite could land in the same tick as the original write. */ + struct stat mst; + if (stat(path, &mst) == 0) { + struct timespec times[2]; + times[0].tv_sec = mst.st_atime; + times[0].tv_nsec = 0; + times[1].tv_sec = mst.st_mtime + 5; + times[1].tv_nsec = 0; + utimensat(AT_FDCWD, path, times, 0); + } + + p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FAST); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + cbm_pipeline_free(p); + + s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *tools_nodes_run3 = NULL; + int tools_count_run3 = 0; + cbm_store_find_nodes_by_file(s, project, "tools/util.go", &tools_nodes_run3, &tools_count_run3); + /* tools/util.go nodes must STILL be present after a fast reindex that + * actually ran the full dump_and_persist cycle (not the noop fast-path). */ + ASSERT_EQ(tools_count_run3, tools_count_before); + cbm_store_free_nodes(tools_nodes_run3, tools_count_run3); + cbm_store_close(s); + + /* Step 4: actually delete tools/util.go from disk and full-reindex. + * Now it really is gone, so its nodes should be purged. This pins the + * other half of the contract: the stat-based check correctly identifies + * truly-deleted files as deleted. */ + snprintf(path, sizeof(path), "%s/tools/util.go", tmpdir); + unlink(path); + + p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + cbm_pipeline_free(p); + + s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *tools_nodes_deleted = NULL; + int tools_count_deleted = 0; + cbm_store_find_nodes_by_file(s, project, "tools/util.go", &tools_nodes_deleted, + &tools_count_deleted); + ASSERT_EQ(tools_count_deleted, 0); /* truly deleted → purged */ + cbm_store_free_nodes(tools_nodes_deleted, tools_count_deleted); + cbm_store_close(s); + + free(project); + th_rmtree(tmpdir); + PASS(); +} + TEST(incremental_k8s_manifest_indexed) { /* Full index with a k8s manifest, then add a new manifest via incremental. * Verifies that cbm_pipeline_pass_k8s() runs during incremental re-index. */ @@ -5354,6 +5503,7 @@ SUITE(pipeline) { RUN_TEST(incremental_detects_changed_file); RUN_TEST(incremental_detects_deleted_file); RUN_TEST(incremental_new_file_added); + RUN_TEST(incremental_fast_preserves_mode_skipped_tools_dir); RUN_TEST(incremental_k8s_manifest_indexed); RUN_TEST(incremental_kustomize_module_indexed); /* Resource management & internal helper tests */ diff --git a/tests/test_store_nodes.c b/tests/test_store_nodes.c index b433ff2a..cbcb9e72 100644 --- a/tests/test_store_nodes.c +++ b/tests/test_store_nodes.c @@ -461,6 +461,48 @@ TEST(store_file_hash_crud) { PASS(); } +TEST(store_file_hash_upsert_rejects_null_required_fields) { + /* Pins the API contract that `cbm_store_upsert_file_hash` returns + * CBM_STORE_ERR (not silent OK) when a NOT NULL column would receive + * SQL NULL. This is the failure mode that + * `pipeline_incremental.c:persist_hashes` checks for and logs as + * `incremental.persist_hash_failed`. If this contract ever changes + * (e.g. the schema relaxes NOT NULL on rel_path or sha256), the + * downstream warning becomes silent and the orphaned-node bug class + * can re-emerge. Track that change here, not just in the consumer. */ + cbm_store_t *s = cbm_store_open_memory(); + ASSERT_NOT_NULL(s); + cbm_store_upsert_project(s, "test", "/tmp/test"); + + /* Sanity: a fully-valid upsert returns OK. */ + int rc = cbm_store_upsert_file_hash(s, "test", "main.go", "abc123", 1000000, 512); + ASSERT_EQ(rc, CBM_STORE_OK); + + /* NULL sha256 violates NOT NULL on file_hashes.sha256 → must return ERR. */ + rc = cbm_store_upsert_file_hash(s, "test", "other.go", NULL, 2000000, 1024); + ASSERT_EQ(rc, CBM_STORE_ERR); + + /* NULL rel_path violates NOT NULL on file_hashes.rel_path → must return ERR. */ + rc = cbm_store_upsert_file_hash(s, "test", NULL, "deadbeef", 3000000, 2048); + ASSERT_EQ(rc, CBM_STORE_ERR); + + /* NULL project violates NOT NULL on file_hashes.project → must return ERR. */ + rc = cbm_store_upsert_file_hash(s, NULL, "third.go", "cafebabe", 4000000, 4096); + ASSERT_EQ(rc, CBM_STORE_ERR); + + /* The valid row from earlier must still be present — partial-failure + * policy: a single bad upsert does not corrupt or remove other rows. */ + cbm_file_hash_t *hashes = NULL; + int count = 0; + cbm_store_get_file_hashes(s, "test", &hashes, &count); + ASSERT_EQ(count, 1); + ASSERT_STR_EQ(hashes[0].rel_path, "main.go"); + cbm_store_free_file_hashes(hashes, count); + + cbm_store_close(s); + PASS(); +} + /* ── Properties JSON round-trip ─────────────────────────────────── */ TEST(store_node_properties_json) { @@ -1523,6 +1565,7 @@ SUITE(store_nodes) { RUN_TEST(store_node_batch_empty); RUN_TEST(store_cascade_delete); RUN_TEST(store_file_hash_crud); + RUN_TEST(store_file_hash_upsert_rejects_null_required_fields); RUN_TEST(store_node_properties_json); RUN_TEST(store_node_null_properties); RUN_TEST(store_find_by_file_overlap);