From 685124041e76c85c54a24d55ff887df925a94291 Mon Sep 17 00:00:00 2001 From: Thibault NORMAND Date: Mon, 27 Apr 2026 18:36:47 +0200 Subject: [PATCH] feat(symgraph): improve ingestion pipeline for large repositories. --- Cargo.toml | 1 + src/cli/commands.rs | 10 +- src/cli/db_utils.rs | 74 +++++++- src/db/mod.rs | 365 ++++++++++++++++++++++++++++++++---- src/extraction/mod.rs | 140 ++++++++------ src/lib.rs | 304 +++++++++++++++++++++++------- src/mcp/handlers/reindex.rs | 92 ++++++++- src/mcp/mod.rs | 6 +- src/mcp/types.rs | 2 +- tests/integration_test.rs | 115 ++++++++++++ 10 files changed, 931 insertions(+), 178 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3ade409..a8f4f48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ sha2 = "0.10" hex = "0.4" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } +indicatif = "0.17" [features] default = ["sqlite"] diff --git a/src/cli/commands.rs b/src/cli/commands.rs index 895ca83..10129f4 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -5,9 +5,11 @@ use tracing::info; use crate::context::{format_context_markdown, ContextBuilder, ContextOptions}; use crate::db::Database; -use crate::{index_codebase, IndexConfig}; +use crate::IndexConfig; -use super::db_utils::{canonicalize_path, database_path, open_project_database}; +use super::db_utils::{ + canonicalize_path, database_path, open_project_database, rebuild_project_database, +}; /// Index a codebase at the given path pub fn index_command(path: &str) -> Result<()> { @@ -16,11 +18,11 @@ pub fn index_command(path: &str) -> Result<()> { let config = IndexConfig { root: project_root.clone(), + show_progress: true, ..Default::default() }; - let stats = index_codebase(&mut db, &config)?; - + let stats = rebuild_project_database(&mut db, &config)?; println!("\nIndexing complete!"); println!(" Files indexed: {}", stats.files); println!(" Symbols found: {}", stats.nodes); diff --git a/src/cli/db_utils.rs b/src/cli/db_utils.rs index 89ee9fe..34bc69b 100644 --- a/src/cli/db_utils.rs +++ b/src/cli/db_utils.rs @@ -1,12 +1,18 @@ //! Database path and initialization utilities use anyhow::{Context, Result}; -use std::path::PathBuf; +use std::{ + path::{Path, PathBuf}, + process, + time::{SystemTime, UNIX_EPOCH}, +}; use crate::db::Database; +use crate::{build_full_index, IndexConfig, IndexingStats}; const DB_DIR: &str = ".symgraph"; const DB_FILE: &str = "index.db"; +const SHADOW_DB_PREFIX: &str = "index.shadow"; /// Get the database path for a project root pub fn database_path(project_root: &str) -> PathBuf { @@ -27,6 +33,72 @@ pub fn open_project_database(project_root: &str) -> Result { Database::open(&db_path) } +/// Open a fresh shadow database in the project index directory. +pub fn open_shadow_database(project_root: &str) -> Result { + let shadow_path = shadow_database_path(project_root)?; + Database::cleanup_on_disk_path(&shadow_path)?; + Database::open(&shadow_path) +} + +/// Best-effort cleanup for a shadow database path and its SQLite sidecars. +pub fn cleanup_shadow_database_path(path: &Path) -> Result<()> { + Database::cleanup_on_disk_path(path) +} + +/// Best-effort cleanup for a shadow database handle and its SQLite sidecars. +pub fn cleanup_shadow_database(db: &Database) -> Result<()> { + match db.path() { + Some(path) => cleanup_shadow_database_path(path), + None => Ok(()), + } +} + +/// Flush, close, and atomically swap a prepared shadow database into place. +pub fn swap_shadow_database(live_db: &mut Database, shadow_db: Database) -> Result<()> { + let shadow_path = shadow_db.prepare_for_swap()?; + live_db.replace_with_shadow(&shadow_path) +} + +/// Build a full shadow index and atomically swap it into the live database handle. +pub fn rebuild_project_database( + live_db: &mut Database, + config: &IndexConfig, +) -> Result { + let mut shadow_db = open_shadow_database(&config.root)?; + let shadow_path = shadow_db + .path() + .map(Path::to_path_buf) + .context("shadow database is not file-backed")?; + + match build_full_index(&mut shadow_db, config) { + Ok(stats) => { + if let Err(err) = swap_shadow_database(live_db, shadow_db) { + let _ = cleanup_shadow_database_path(&shadow_path); + Err(err) + } else { + Ok(stats) + } + } + Err(err) => { + let _ = cleanup_shadow_database_path(&shadow_path); + Err(err) + } + } +} + +fn shadow_database_path(project_root: &str) -> Result { + ensure_database_directory(project_root)?; + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("System clock is before Unix epoch")? + .as_nanos(); + Ok(PathBuf::from(project_root).join(DB_DIR).join(format!( + "{SHADOW_DB_PREFIX}.{}.{}.db", + process::id(), + nonce + ))) +} + /// Canonicalize and validate a path pub fn canonicalize_path(path: &str) -> Result { let canonical = std::path::Path::new(path) diff --git a/src/db/mod.rs b/src/db/mod.rs index cb7c8e0..f1cbf21 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -8,46 +8,65 @@ mod schema; -use anyhow::Result; +use anyhow::{Context, Result}; use rusqlite::{params, Connection, OptionalExtension}; -use std::path::Path; +use std::{ + fs, + io::ErrorKind, + path::{Path, PathBuf}, +}; use crate::types::{ Edge, EdgeKind, FileRecord, IndexStats, Language, Node, NodeKind, UnresolvedReference, Visibility, }; +const CONNECTION_PRAGMAS: &str = "PRAGMA foreign_keys = ON; \ + PRAGMA journal_mode = WAL; \ + PRAGMA synchronous = NORMAL; \ + PRAGMA cache_size = -64000;"; + /// Database handle for the code graph pub struct Database { conn: Connection, + path: Option, } impl Database { /// Open or create a database at the given path pub fn open>(path: P) -> Result { - let conn = Connection::open(path)?; - Self::from_connection(conn) + let path = path.as_ref().to_path_buf(); + let conn = Connection::open(&path)?; + Self::from_connection(conn, Some(path)) } /// Create an in-memory database (for testing) pub fn in_memory() -> Result { let conn = Connection::open_in_memory()?; - Self::from_connection(conn) + Self::from_connection(conn, None) } /// Shared initialization: set PRAGMAs and create schema - fn from_connection(conn: Connection) -> Result { - conn.execute_batch( - "PRAGMA foreign_keys = ON; \ - PRAGMA journal_mode = WAL; \ - PRAGMA synchronous = NORMAL; \ - PRAGMA cache_size = -64000;", - )?; - let db = Self { conn }; + fn from_connection(conn: Connection, path: Option) -> Result { + conn.execute_batch(CONNECTION_PRAGMAS)?; + let db = Self { conn, path }; db.initialize()?; Ok(db) } + /// Path to the on-disk database, if this handle is file-backed. + pub fn path(&self) -> Option<&Path> { + self.path.as_deref() + } + + /// Close the underlying SQLite connection. + pub fn close(self) -> Result<()> { + match self.conn.close() { + Ok(()) => Ok(()), + Err((_conn, err)) => Err(err.into()), + } + } + /// Initialize the database schema and run additive migrations. fn initialize(&self) -> Result<()> { self.conn.execute_batch(schema::SCHEMA)?; @@ -665,14 +684,57 @@ impl Database { // ========================================================================= /// Insert multiple nodes using a single prepared statement. - /// Each node's `id` field is set to 0 before insertion (DB assigns the real id). + /// Each node's `id` field is updated to the database-assigned row id. /// Returns a map from old (extraction-time) id to new (database) id. pub fn insert_nodes_batch( &self, nodes: &mut [Node], + ) -> Result> { + let id_map = self.insert_nodes_base_batch(nodes)?; + self.insert_node_fts_rows(nodes)?; + self.insert_semantic_fts_rows(nodes)?; + Ok(id_map) + } + + /// Insert multiple nodes without updating FTS tables. + /// Used by full shadow builds, which rebuild FTS once at the end. + pub fn insert_nodes_batch_without_fts( + &self, + nodes: &mut [Node], + ) -> Result> { + self.insert_nodes_base_batch(nodes) + } + + /// Rebuild both FTS indexes from the canonical `nodes` table. + pub fn rebuild_fts_indexes(&self) -> Result<()> { + self.conn + .execute_batch("INSERT INTO nodes_fts(nodes_fts) VALUES('rebuild');") + .context("rebuild_fts_indexes: nodes_fts")?; + self.conn + .execute("DELETE FROM nodes_semantic_fts", []) + .context("rebuild_fts_indexes: nodes_semantic_fts clear")?; + + let mut select = self + .conn + .prepare_cached("SELECT * FROM nodes ORDER BY id")?; + let rows = select.query_map([], Self::row_to_node)?; + let mut insert = self + .conn + .prepare_cached("INSERT INTO nodes_semantic_fts(rowid, tokens) VALUES (?1, ?2)")?; + for node in rows { + let node = node?; + insert.execute(params![node.id, build_semantic_tokens(&node)])?; + } + + self.optimize_fts() + } + + fn insert_nodes_base_batch( + &self, + nodes: &mut [Node], ) -> Result> { let mut id_map = std::collections::HashMap::with_capacity(nodes.len()); - let mut stmt = self.conn.prepare( + let mut stmt = self.conn.prepare_cached( r#" INSERT INTO nodes ( kind, name, qualified_name, file_path, start_line, end_line, @@ -681,15 +743,8 @@ impl Database { ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17) "#, )?; - let mut fts_stmt = self - .conn - .prepare("INSERT INTO nodes_fts(rowid, name, qualified_name) VALUES (?1, ?2, ?3)")?; - let mut sem_stmt = self - .conn - .prepare("INSERT INTO nodes_semantic_fts(rowid, tokens) VALUES (?1, ?2)")?; for node in nodes.iter_mut() { let old_id = node.id; - node.id = 0; stmt.execute(params![ node.kind.as_str(), node.name, @@ -710,14 +765,32 @@ impl Database { node.language.as_str(), ])?; let new_id = self.conn.last_insert_rowid(); - fts_stmt.execute(params![new_id, node.name, node.qualified_name])?; - let tokens = build_semantic_tokens(node); - sem_stmt.execute(params![new_id, tokens])?; + node.id = new_id; id_map.insert(old_id, new_id); } Ok(id_map) } + fn insert_node_fts_rows(&self, nodes: &[Node]) -> Result<()> { + let mut stmt = self.conn.prepare_cached( + "INSERT INTO nodes_fts(rowid, name, qualified_name) VALUES (?1, ?2, ?3)", + )?; + for node in nodes { + stmt.execute(params![node.id, node.name, node.qualified_name])?; + } + Ok(()) + } + + fn insert_semantic_fts_rows(&self, nodes: &[Node]) -> Result<()> { + let mut stmt = self + .conn + .prepare_cached("INSERT INTO nodes_semantic_fts(rowid, tokens) VALUES (?1, ?2)")?; + for node in nodes { + stmt.execute(params![node.id, build_semantic_tokens(node)])?; + } + Ok(()) + } + /// Insert multiple edges using a single prepared statement. /// Maps source_id and target_id through `id_map`; edges whose ids /// cannot be mapped are silently skipped. @@ -728,7 +801,7 @@ impl Database { id_map: &std::collections::HashMap, ) -> Result { let mut count: u64 = 0; - let mut stmt = self.conn.prepare( + let mut stmt = self.conn.prepare_cached( r#" INSERT INTO edges (source_id, target_id, kind, file_path, line, column) VALUES (?1, ?2, ?3, ?4, ?5, ?6) @@ -762,7 +835,7 @@ impl Database { refs: &mut [UnresolvedReference], id_map: &std::collections::HashMap, ) -> Result<()> { - let mut stmt = self.conn.prepare( + let mut stmt = self.conn.prepare_cached( r#" INSERT INTO unresolved_refs (source_node_id, reference_name, kind, file_path, line, column) VALUES (?1, ?2, ?3, ?4, ?5, ?6) @@ -851,21 +924,130 @@ impl Database { }) } + /// Disable FTS5 automerge on both search tables before a bulk insert. + /// Prevents per-insert segment merges; call `optimize_fts` when done. + /// + /// Note: FTS5 configuration commands like `automerge` require the + /// two-column form `INSERT INTO ft(ft, rank) VALUES('option', value)`. + /// The single-column form `VALUES('automerge=0')` is only for parameterless + /// maintenance commands (`optimize`, `rebuild`, `delete-all`, etc.). + pub fn disable_fts_automerge(&self) -> Result<()> { + use anyhow::Context; + self.conn + .execute( + "INSERT INTO nodes_fts(nodes_fts, rank) VALUES('automerge', 0)", + [], + ) + .context("disable_fts_automerge: nodes_fts")?; + self.conn + .execute( + "INSERT INTO nodes_semantic_fts(nodes_semantic_fts, rank) VALUES('automerge', 0)", + [], + ) + .context("disable_fts_automerge: nodes_semantic_fts")?; + Ok(()) + } + + /// Merge all FTS5 segments into one after a bulk insert. + /// Much faster than the incremental per-insert merges that `automerge=8` + /// (the default) would have done. + pub fn optimize_fts(&self) -> Result<()> { + use anyhow::Context; + self.conn + .execute_batch("INSERT INTO nodes_fts(nodes_fts) VALUES('optimize');") + .context("optimize_fts: nodes_fts")?; + self.conn + .execute_batch("INSERT INTO nodes_semantic_fts(nodes_semantic_fts) VALUES('optimize');") + .context("optimize_fts: nodes_semantic_fts")?; + Ok(()) + } + /// Begin a transaction pub fn begin_transaction(&mut self) -> Result<()> { - self.conn.execute("BEGIN TRANSACTION", [])?; + use anyhow::Context; + self.conn + .execute("BEGIN TRANSACTION", []) + .context("begin_transaction")?; Ok(()) } - /// Commit a transaction - pub fn commit(&mut self) -> Result<()> { - self.conn.execute("COMMIT", [])?; + /// Commit a transaction without checkpointing the WAL. + pub fn commit_transaction(&mut self) -> Result<()> { + self.conn.execute("COMMIT", []).context("commit: COMMIT")?; Ok(()) } + /// Checkpoint the WAL and truncate it so the database file stays compact. + pub fn checkpoint_wal_truncate(&self) -> Result<()> { + self.conn + .execute_batch("PRAGMA wal_checkpoint(TRUNCATE)") + .context("wal_checkpoint")?; + Ok(()) + } + + /// Commit a transaction and checkpoint the WAL so the file stays compact. + pub fn commit(&mut self) -> Result<()> { + self.commit_transaction()?; + self.checkpoint_wal_truncate() + } + /// Rollback a transaction pub fn rollback(&mut self) -> Result<()> { - self.conn.execute("ROLLBACK", [])?; + self.conn.execute("ROLLBACK", []).context("rollback")?; + Ok(()) + } + + /// Flush a shadow database to disk and close it before an atomic swap. + pub fn prepare_for_swap(self) -> Result { + let path = self + .path + .clone() + .context("prepare_for_swap requires an on-disk database")?; + self.checkpoint_wal_truncate()?; + self.close()?; + cleanup_sqlite_sidecars(&path)?; + Ok(path) + } + + /// Atomically replace this on-disk database file with a prepared shadow database. + pub fn replace_with_shadow>(&mut self, shadow_path: P) -> Result<()> { + let live_path = self + .path + .clone() + .context("replace_with_shadow requires an on-disk database")?; + let shadow_path = shadow_path.as_ref().to_path_buf(); + + self.checkpoint_wal_truncate()?; + + let placeholder = Connection::open_in_memory().context("opening placeholder connection")?; + let live_conn = std::mem::replace(&mut self.conn, placeholder); + if let Err((conn, err)) = live_conn.close() { + self.conn = conn; + return Err(err.into()); + } + + cleanup_sqlite_sidecars(&live_path)?; + + if let Err(err) = fs::rename(&shadow_path, &live_path) { + self.reopen_from_path(&live_path)?; + return Err(err.into()); + } + + cleanup_sqlite_sidecars(&shadow_path)?; + self.reopen_from_path(&live_path) + } + + /// Delete a database file and any SQLite sidecars if they exist. + pub fn cleanup_on_disk_path>(path: P) -> Result<()> { + let path = path.as_ref(); + remove_file_if_exists(path)?; + cleanup_sqlite_sidecars(path) + } + + fn reopen_from_path(&mut self, path: &Path) -> Result<()> { + let reopened = Self::open(path)?; + self.conn = reopened.conn; + self.path = reopened.path; Ok(()) } @@ -1133,9 +1315,32 @@ fn normalize_query_for_fts(query: &str) -> String { .join(" ") } +fn sqlite_sidecar_paths(path: &Path) -> [PathBuf; 2] { + [ + PathBuf::from(format!("{}-wal", path.display())), + PathBuf::from(format!("{}-shm", path.display())), + ] +} + +fn cleanup_sqlite_sidecars(path: &Path) -> Result<()> { + for sidecar in sqlite_sidecar_paths(path) { + remove_file_if_exists(&sidecar)?; + } + Ok(()) +} + +fn remove_file_if_exists(path: &Path) -> Result<()> { + match fs::remove_file(path) { + Ok(()) => Ok(()), + Err(err) if err.kind() == ErrorKind::NotFound => Ok(()), + Err(err) => Err(err.into()), + } +} + #[cfg(test)] mod tests { use super::*; + use tempfile::tempdir; fn create_test_node(name: &str, kind: NodeKind, file_path: &str) -> Node { Node { @@ -1179,6 +1384,15 @@ mod tests { assert!(db.is_ok()); } + #[test] + fn test_open_tracks_on_disk_path() { + let dir = tempdir().unwrap(); + let path = dir.path().join("tracked.db"); + + let db = Database::open(&path).unwrap(); + assert_eq!(db.path(), Some(path.as_path())); + } + #[test] fn test_database_stats_empty() { let db = Database::in_memory().unwrap(); @@ -1703,6 +1917,93 @@ mod tests { assert_eq!(stats.total_nodes, 0); } + #[test] + fn test_prepare_for_swap_cleans_sidecars() { + let dir = tempdir().unwrap(); + let path = dir.path().join("shadow.db"); + let db = Database::open(&path).unwrap(); + let file = mk_file("shadow.rs"); + + db.insert_or_update_file(&file).unwrap(); + db.insert_node(&create_test_node( + "shadow_fn", + NodeKind::Function, + "shadow.rs", + )) + .unwrap(); + + let prepared_path = db.prepare_for_swap().unwrap(); + assert_eq!(prepared_path, path); + assert!(prepared_path.exists()); + assert!(!PathBuf::from(format!("{}-wal", prepared_path.display())).exists()); + assert!(!PathBuf::from(format!("{}-shm", prepared_path.display())).exists()); + } + + #[test] + fn test_replace_with_shadow_reopens_new_contents() { + let dir = tempdir().unwrap(); + let live_path = dir.path().join("live.db"); + let shadow_path = dir.path().join("shadow.db"); + + let mut live = Database::open(&live_path).unwrap(); + let old_file = mk_file("old.rs"); + live.insert_or_update_file(&old_file).unwrap(); + live.insert_node(&create_test_node("old_fn", NodeKind::Function, "old.rs")) + .unwrap(); + + let shadow = Database::open(&shadow_path).unwrap(); + let new_file = mk_file("new.rs"); + shadow.insert_or_update_file(&new_file).unwrap(); + shadow + .insert_node(&create_test_node("new_fn", NodeKind::Function, "new.rs")) + .unwrap(); + + let prepared_shadow = shadow.prepare_for_swap().unwrap(); + live.replace_with_shadow(&prepared_shadow).unwrap(); + + assert_eq!(live.path(), Some(live_path.as_path())); + assert!(live.find_node_by_name("old_fn").unwrap().is_none()); + assert!(live.find_node_by_name("new_fn").unwrap().is_some()); + assert!(live.get_file("old.rs").unwrap().is_none()); + assert!(live.get_file("new.rs").unwrap().is_some()); + assert!(!prepared_shadow.exists()); + } + + #[test] + fn test_cleanup_on_disk_path_removes_db_and_sidecars() { + let dir = tempdir().unwrap(); + let path = dir.path().join("cleanup.db"); + + std::fs::write(&path, b"db").unwrap(); + std::fs::write(format!("{}-wal", path.display()), b"wal").unwrap(); + std::fs::write(format!("{}-shm", path.display()), b"shm").unwrap(); + + Database::cleanup_on_disk_path(&path).unwrap(); + + assert!(!path.exists()); + assert!(!PathBuf::from(format!("{}-wal", path.display())).exists()); + assert!(!PathBuf::from(format!("{}-shm", path.display())).exists()); + } + + #[test] + fn test_rebuild_fts_indexes_restores_search() { + let db = Database::in_memory().unwrap(); + let file = mk_file("auth.rs"); + db.insert_or_update_file(&file).unwrap(); + + let mut node = create_test_node("validateToken", NodeKind::Function, "auth.rs"); + node.docstring = Some("Validate JWT bearer token".to_string()); + db.insert_nodes_batch_without_fts(&mut [node]).unwrap(); + + assert!(db.semantic_search("jwt bearer", 10).unwrap().is_empty()); + + db.disable_fts_automerge().unwrap(); + db.rebuild_fts_indexes().unwrap(); + + let hits = db.semantic_search("jwt bearer", 10).unwrap(); + assert!(hits.iter().any(|n| n.name == "validateToken")); + } + #[test] fn test_get_hierarchy() { let db = Database::in_memory().unwrap(); diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 60d63a2..1fc85d3 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -159,32 +159,47 @@ struct ExtractionContext<'a> { } impl<'a> ExtractionContext<'a> { - fn traverse_node(&mut self, node: tree_sitter::Node) { - let node_type = node.kind(); + fn traverse_node<'tree>(&mut self, root: tree_sitter::Node<'tree>) { + enum Work<'t> { + Visit(tree_sitter::Node<'t>), + PopStack, + } - // Check if this is a symbol we care about - if let Some(kind) = self.config.node_type_to_kind(node_type) { - self.extract_symbol(node, kind); - } else { - // Continue traversing children + fn push_children<'t>(node: tree_sitter::Node<'t>, work: &mut Vec>) { let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - self.traverse_node(child); + let children: Vec<_> = node.children(&mut cursor).collect(); + for child in children.into_iter().rev() { + work.push(Work::Visit(child)); } } - } - fn extract_symbol(&mut self, node: tree_sitter::Node, kind: NodeKind) { - let name = self.extract_name(&node, kind); - if name.is_empty() { - // Skip anonymous nodes - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - self.traverse_node(child); + let mut work: Vec> = vec![Work::Visit(root)]; + + while let Some(item) = work.pop() { + match item { + Work::PopStack => { + self.node_stack.pop(); + } + Work::Visit(node) => { + let node_type = node.kind(); + if let Some(kind) = self.config.node_type_to_kind(node_type) { + let name = self.extract_name(&node, kind); + if name.is_empty() { + push_children(node, &mut work); + } else { + self.emit_symbol(node, kind, name); + work.push(Work::PopStack); + push_children(node, &mut work); + } + } else { + push_children(node, &mut work); + } + } } - return; } + } + fn emit_symbol(&mut self, node: tree_sitter::Node, kind: NodeKind, name: String) { let start = node.start_position(); let end = node.end_position(); @@ -214,7 +229,6 @@ impl<'a> ExtractionContext<'a> { self.next_id += 1; self.result.nodes.push(symbol); - // Create contains edge from parent if let Some(&parent_id) = self.node_stack.last() { let edge = Edge { id: 0, @@ -228,19 +242,8 @@ impl<'a> ExtractionContext<'a> { self.result.edges.push(edge); } - // Push this symbol onto the stack and traverse children self.node_stack.push(symbol_id); - - // Extract function calls and other references from body - self.extract_references(&node, symbol_id); - - // Traverse children for nested definitions - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - self.traverse_node(child); - } - - self.node_stack.pop(); + self.find_calls(node, symbol_id); } fn extract_name(&self, node: &tree_sitter::Node, _kind: NodeKind) -> String { @@ -282,7 +285,11 @@ impl<'a> ExtractionContext<'a> { // Truncate at opening brace or newline let sig = sig.split('{').next().unwrap_or(sig).trim(); if sig.len() > 200 { - Some(format!("{}...", &sig[..200])) + let boundary = (0..=200) + .rev() + .find(|&i| sig.is_char_boundary(i)) + .unwrap_or(0); + Some(format!("{}...", &sig[..boundary])) } else { Some(sig.to_string()) } @@ -481,33 +488,27 @@ impl<'a> ExtractionContext<'a> { Some(parts.join("::")) } - fn extract_references(&mut self, node: &tree_sitter::Node, source_id: i64) { - // Find call expressions within this node - self.find_calls(node, source_id); - } - - fn find_calls(&mut self, node: &tree_sitter::Node, source_id: i64) { - let kind = node.kind(); - - if self.config.is_call_node(kind) { - if let Some(func_name) = self.extract_call_name(node) { - let start = node.start_position(); - let uref = UnresolvedReference { - source_node_id: source_id, - reference_name: func_name, - kind: EdgeKind::Calls, - file_path: self.file_path.clone(), - line: start.row as u32 + 1, - column: start.column as u32, - }; - self.result.unresolved_refs.push(uref); + fn find_calls<'tree>(&mut self, root: tree_sitter::Node<'tree>, source_id: i64) { + let mut stack: Vec> = vec![root]; + while let Some(node) = stack.pop() { + if self.config.is_call_node(node.kind()) { + if let Some(func_name) = self.extract_call_name(&node) { + let start = node.start_position(); + self.result.unresolved_refs.push(UnresolvedReference { + source_node_id: source_id, + reference_name: func_name, + kind: EdgeKind::Calls, + file_path: self.file_path.clone(), + line: start.row as u32 + 1, + column: start.column as u32, + }); + } + } + let mut cursor = node.walk(); + let children: Vec<_> = node.children(&mut cursor).collect(); + for child in children.into_iter().rev() { + stack.push(child); } - } - - // Recurse into children - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - self.find_calls(&child, source_id); } } @@ -1258,6 +1259,29 @@ end assert!(!is_generated_content("fn foo() { 1 + 1 }")); } + #[test] + fn test_deeply_nested_does_not_overflow() { + // Regression test: the old recursive traverse_node would stack-overflow + // on ASTs deeper than ~500 levels. The iterative rewrite must handle this. + let mut extractor = Extractor::new(); + let depth = 500usize; + let mut code = String::new(); + for i in 0..depth { + code.push_str(&format!("fn f{}(){{", i)); + } + code.push_str(&"}".repeat(depth)); + let result = extractor.extract_file("deep.rs", &code); + assert!(result.errors.is_empty()); + assert_eq!( + result + .nodes + .iter() + .filter(|n| n.kind == NodeKind::Function) + .count(), + depth + ); + } + #[test] fn test_test_name_heuristic() { assert!(test_name_heuristic("test_user_login")); diff --git a/src/lib.rs b/src/lib.rs index 3c7d28b..a1c400e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,6 +41,11 @@ use std::time::SystemTime; use anyhow::Result; use ignore::WalkBuilder; +use indicatif::{ProgressBar, ProgressStyle}; + +/// Commit and checkpoint the WAL after this many files during bulk indexing. +/// Keeps WAL size bounded without requiring a single massive transaction. +const CHECKPOINT_INTERVAL: usize = 200; use rayon::prelude::*; use sha2::{Digest, Sha256}; use tracing::{debug, info, warn}; @@ -63,6 +68,8 @@ pub struct IndexConfig { pub respect_gitignore: bool, /// Skip the global resolve_references pass (for scoped resolution) pub skip_resolve: bool, + /// Render progress bars to stderr during indexing (disable for library/server use) + pub show_progress: bool, } impl Default for IndexConfig { @@ -103,6 +110,7 @@ impl Default for IndexConfig { ], respect_gitignore: true, skip_resolve: false, + show_progress: false, } } } @@ -124,24 +132,76 @@ struct ExtractedFile { result: ExtractionResult, } -/// Index a codebase into the database +/// Indexing behavior for a storage target. +#[cfg(feature = "sqlite")] +#[derive(Clone, Copy)] +enum IndexMode { + Incremental, + FullBuild, +} + +/// Incrementally index only changed files into an existing database. #[cfg(feature = "sqlite")] pub fn index_codebase(db: &mut Database, config: &IndexConfig) -> Result { + run_index_codebase(db, config, IndexMode::Incremental) +} + +/// Build a complete index into an empty target database. +#[cfg(feature = "sqlite")] +pub fn build_full_index(db: &mut Database, config: &IndexConfig) -> Result { + run_index_codebase(db, config, IndexMode::FullBuild) +} + +#[cfg(feature = "sqlite")] +fn run_index_codebase( + db: &mut Database, + config: &IndexConfig, + mode: IndexMode, +) -> Result { let root = Path::new(&config.root).canonicalize()?; info!("Indexing codebase at {}", root.display()); let mut stats = IndexingStats::default(); + let entries_to_extract = collect_entries(db, config, &root, mode, &mut stats)?; + let extracted = extract_entries(entries_to_extract, config.show_progress); + + match mode { + IndexMode::Incremental => store_incremental_index(db, config, extracted, &mut stats)?, + IndexMode::FullBuild => store_full_index(db, config, extracted, &mut stats)?, + } + + info!( + "Indexed {} files, {} nodes, {} edges ({} refs resolved)", + stats.files, stats.nodes, stats.edges, stats.resolved_refs + ); + + Ok(stats) +} - // Build the walker - let mut walker = WalkBuilder::new(&root); +#[cfg(feature = "sqlite")] +fn collect_entries( + db: &Database, + config: &IndexConfig, + root: &Path, + mode: IndexMode, + stats: &mut IndexingStats, +) -> Result> { + let mut walker = WalkBuilder::new(root); walker .hidden(false) .git_ignore(config.respect_gitignore) .git_global(config.respect_gitignore) .git_exclude(config.respect_gitignore); - // Phase 1a: Sequentially walk directory, read files, compute hashes, check reindex need - let mut entries_to_extract: Vec = Vec::new(); + let mut entries_to_extract = Vec::new(); + let scan_pb = if config.show_progress { + let pb = ProgressBar::new_spinner(); + pb.set_prefix("Scanning"); + pb.enable_steady_tick(std::time::Duration::from_millis(100)); + pb + } else { + ProgressBar::hidden() + }; for entry in walker.build() { let entry = match entry { @@ -153,26 +213,21 @@ pub fn index_codebase(db: &mut Database, config: &IndexConfig) -> Result Result c, Err(err) => { @@ -202,20 +255,17 @@ pub fn index_codebase(db: &mut Database, config: &IndexConfig) -> Result Result Result, show_progress: bool) -> Vec { + let parse_pb = if show_progress { + let pb = ProgressBar::new(entries_to_extract.len() as u64); + pb.set_style( + ProgressStyle::with_template( + " {prefix:<10} [{bar:40.cyan/blue}] {pos:>5}/{len:<5} {msg}", + ) + .unwrap() + .progress_chars("=> "), + ); + pb.set_prefix("Parsing"); + pb + } else { + ProgressBar::hidden() + }; + let extracted: Vec = entries_to_extract .into_par_iter() .map(|entry| { let mut extractor = Extractor::new(); let result = extractor.extract_file(&entry.rel_path, &entry.content); + parse_pb.inc(1); ExtractedFile { entry, result } }) .collect(); + parse_pb.finish_and_clear(); + extracted +} - // Phase 2: Sequential database operations inside a transaction +#[cfg(feature = "sqlite")] +fn store_incremental_index( + db: &mut Database, + config: &IndexConfig, + extracted: Vec, + stats: &mut IndexingStats, +) -> Result<()> { + let total = extracted.len(); db.begin_transaction()?; + db.disable_fts_automerge()?; - for extracted_file in extracted { - let entry = extracted_file.entry; - let result = extracted_file.result; - - debug!("Indexing: {}", entry.rel_path); + let store_pb = make_store_progress_bar(total, config.show_progress); + for (i, extracted_file) in extracted.into_iter().enumerate() { + store_extracted_file(db, extracted_file, stats, true, true)?; + store_pb.inc(1); - // Delete existing data for this file - db.delete_file(&entry.rel_path)?; + let is_last = i + 1 == total; + if (i + 1) % CHECKPOINT_INTERVAL == 0 && !is_last { + db.commit()?; + db.begin_transaction()?; + } + } + store_pb.finish_and_clear(); - // Store file record FIRST (nodes have FK to files) - let file_record = FileRecord { - path: entry.rel_path.clone(), - content_hash: entry.content_hash, - language: entry.language, - size: entry.content.len() as u64, - modified_at: entry.modified_at, - indexed_at: SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .map(|d| d.as_secs() as i64) - .unwrap_or(0), - node_count: result.nodes.len() as u32, - }; - db.insert_or_update_file(&file_record)?; + db.optimize_fts()?; + resolve_references_if_needed(db, config, stats)?; + db.commit()?; + Ok(()) +} - // Store nodes (batch: prepare statement once, reuse for all rows) - let node_count = file_record.node_count; - let mut nodes = result.nodes; - let id_map = db.insert_nodes_batch(&mut nodes)?; +#[cfg(feature = "sqlite")] +fn store_full_index( + db: &mut Database, + config: &IndexConfig, + extracted: Vec, + stats: &mut IndexingStats, +) -> Result<()> { + db.begin_transaction()?; - // Store edges with mapped IDs (batch) - let mut edges = result.edges; - let edge_count = db.insert_edges_batch(&mut edges, &id_map)?; - stats.edges += edge_count; + let store_pb = make_store_progress_bar(extracted.len(), config.show_progress); + for extracted_file in extracted { + store_extracted_file(db, extracted_file, stats, false, false)?; + store_pb.inc(1); + } + store_pb.finish_and_clear(); - // Store unresolved references with mapped IDs (batch) - let mut unresolved_refs = result.unresolved_refs; - db.insert_unresolved_refs_batch(&mut unresolved_refs, &id_map)?; + resolve_references_if_needed(db, config, stats)?; + db.disable_fts_automerge()?; + db.rebuild_fts_indexes()?; + db.commit_transaction()?; + Ok(()) +} - stats.files += 1; - stats.nodes += node_count as u64; - stats.errors += result.errors.len() as u64; +#[cfg(feature = "sqlite")] +fn make_store_progress_bar(total: usize, show_progress: bool) -> ProgressBar { + if show_progress { + let pb = ProgressBar::new(total as u64); + pb.set_style( + ProgressStyle::with_template( + " {prefix:<10} [{bar:40.cyan/blue}] {pos:>5}/{len:<5} {msg}", + ) + .unwrap() + .progress_chars("=> "), + ); + pb.set_prefix("Storing"); + pb + } else { + ProgressBar::hidden() } +} - // Resolve references (unless caller will handle scoped resolution) - if !config.skip_resolve { - info!("Resolving references..."); - let resolved = db.resolve_references()?; - stats.resolved_refs = resolved as u64; +#[cfg(feature = "sqlite")] +fn store_extracted_file( + db: &Database, + extracted_file: ExtractedFile, + stats: &mut IndexingStats, + delete_existing: bool, + maintain_fts: bool, +) -> Result<()> { + let entry = extracted_file.entry; + let result = extracted_file.result; + + debug!("Indexing: {}", entry.rel_path); + if delete_existing { + db.delete_file(&entry.rel_path)?; } - // Commit transaction - db.commit()?; + let file_record = build_file_record(&entry, result.nodes.len()); + let node_count = file_record.node_count; + let error_count = result.errors.len() as u64; + db.insert_or_update_file(&file_record)?; + + let mut nodes = result.nodes; + let id_map = if maintain_fts { + db.insert_nodes_batch(&mut nodes)? + } else { + db.insert_nodes_batch_without_fts(&mut nodes)? + }; + + let mut edges = result.edges; + let edge_count = db.insert_edges_batch(&mut edges, &id_map)?; + stats.edges += edge_count; + + let mut unresolved_refs = result.unresolved_refs; + db.insert_unresolved_refs_batch(&mut unresolved_refs, &id_map)?; + + stats.files += 1; + stats.nodes += node_count as u64; + stats.errors += error_count; + Ok(()) +} - info!( - "Indexed {} files, {} nodes, {} edges ({} refs resolved)", - stats.files, stats.nodes, stats.edges, stats.resolved_refs - ); +#[cfg(feature = "sqlite")] +fn build_file_record(entry: &FileEntry, node_count: usize) -> FileRecord { + FileRecord { + path: entry.rel_path.clone(), + content_hash: entry.content_hash.clone(), + language: entry.language, + size: entry.content.len() as u64, + modified_at: entry.modified_at, + indexed_at: SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map(|d| d.as_secs() as i64) + .unwrap_or(0), + node_count: node_count as u32, + } +} - Ok(stats) +#[cfg(feature = "sqlite")] +fn resolve_references_if_needed( + db: &Database, + config: &IndexConfig, + stats: &mut IndexingStats, +) -> Result<()> { + if config.skip_resolve { + return Ok(()); + } + + info!("Resolving references..."); + let resolve_pb = if config.show_progress { + let pb = ProgressBar::new_spinner(); + pb.set_prefix("Resolving"); + pb.set_message("references..."); + pb.enable_steady_tick(std::time::Duration::from_millis(100)); + pb + } else { + ProgressBar::hidden() + }; + let resolved = db.resolve_references()?; + resolve_pb.finish_and_clear(); + stats.resolved_refs = resolved as u64; + Ok(()) } /// Statistics from an indexing operation diff --git a/src/mcp/handlers/reindex.rs b/src/mcp/handlers/reindex.rs index 2d7cc37..bb1361f 100644 --- a/src/mcp/handlers/reindex.rs +++ b/src/mcp/handlers/reindex.rs @@ -1,5 +1,6 @@ //! Reindexing handler +use crate::cli::rebuild_project_database; use crate::db::Database; use crate::mcp::format::normalize_path; use crate::mcp::types::ReindexRequest; @@ -14,7 +15,10 @@ pub fn handle_reindex( // If specific files requested, delete and reindex just those if let Some(files) = &req.files { if files.is_empty() { - return Ok("No files specified. Provide file paths or omit the parameter to reindex all changed files.".to_string()); + return Ok( + "No files specified. Provide file paths or omit the parameter to rebuild the full index." + .to_string(), + ); } let mut errors = Vec::new(); @@ -68,20 +72,90 @@ pub fn handle_reindex( Err(e) => Err(format!("Reindex failed: {}", e)), } } else { - // Full incremental reindex + // Full shadow rebuild let config = IndexConfig { root: project_root.to_string(), ..Default::default() }; - match index_codebase(db, &config) { - Ok(stats) => { - Ok(format!( - "## Reindex Complete\n\n**Files processed:** {}\n**Files skipped (unchanged):** {}\n**Symbols found:** {}\n**Edges created:** {}\n**References resolved:** {}\n**Errors:** {}\n", - stats.files, stats.skipped, stats.nodes, stats.edges, stats.resolved_refs, stats.errors - )) - } + match rebuild_project_database(db, &config) { + Ok(stats) => Ok(format!( + "## Reindex Complete\n\n**Mode:** full rebuild\n**Files indexed:** {}\n**Symbols found:** {}\n**Edges created:** {}\n**References resolved:** {}\n**Errors:** {}\n", + stats.files, stats.nodes, stats.edges, stats.resolved_refs, stats.errors + )), Err(e) => Err(format!("Reindex failed: {}", e)), } } } + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::tempdir; + + use crate::cli::{open_project_database, rebuild_project_database}; + + fn write_file(path: &std::path::Path, content: &str) { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(path, content).unwrap(); + } + + #[test] + fn test_handle_reindex_full_rebuild_reopens_live_handle() { + let dir = tempdir().unwrap(); + let project_root = dir.path().display().to_string(); + let file_path = dir.path().join("src/lib.rs"); + write_file(&file_path, "pub fn old_symbol() {}\n"); + + let mut db = open_project_database(&project_root).unwrap(); + let config = IndexConfig { + root: project_root.clone(), + ..Default::default() + }; + rebuild_project_database(&mut db, &config).unwrap(); + assert!(db.find_node_by_name("old_symbol").unwrap().is_some()); + + write_file(&file_path, "pub fn new_symbol() {}\n"); + let output = + handle_reindex(&mut db, &project_root, &ReindexRequest { files: None }).unwrap(); + + assert!(output.contains("**Mode:** full rebuild")); + assert!(db.find_node_by_name("old_symbol").unwrap().is_none()); + assert!(db.find_node_by_name("new_symbol").unwrap().is_some()); + } + + #[test] + fn test_handle_reindex_specific_files_stays_in_place() { + let dir = tempdir().unwrap(); + let project_root = dir.path().display().to_string(); + let a_path = dir.path().join("src/a.rs"); + let b_path = dir.path().join("src/b.rs"); + write_file(&a_path, "pub fn old_a() {}\n"); + write_file(&b_path, "pub fn stable_b() {}\n"); + + let mut db = open_project_database(&project_root).unwrap(); + let config = IndexConfig { + root: project_root.clone(), + ..Default::default() + }; + rebuild_project_database(&mut db, &config).unwrap(); + + write_file(&a_path, "pub fn new_a() {}\n"); + let output = handle_reindex( + &mut db, + &project_root, + &ReindexRequest { + files: Some(vec!["src/a.rs".to_string()]), + }, + ) + .unwrap(); + + assert!(output.contains("**Files reindexed:** 1")); + assert!(db.find_node_by_name("old_a").unwrap().is_none()); + assert!(db.find_node_by_name("new_a").unwrap().is_some()); + assert!(db.find_node_by_name("stable_b").unwrap().is_some()); + } +} diff --git a/src/mcp/mod.rs b/src/mcp/mod.rs index 2c87760..a0b4a47 100644 --- a/src/mcp/mod.rs +++ b/src/mcp/mod.rs @@ -200,10 +200,10 @@ impl SymgraphHandler { self.with_db(|db| handlers::symbol::handle_references(db, &req)) } - /// Trigger incremental reindexing (runs in background, returns immediately) + /// Trigger background reindexing (runs in background, returns immediately) #[tool( name = "symgraph-reindex", - description = "Trigger incremental reindexing of the codebase. Only changed files are re-parsed. Runs in background and returns immediately." + description = "Trigger reindexing of the codebase. When files are provided, only those files are updated in place; otherwise a full shadow rebuild runs in the background." )] fn symgraph_reindex(&self, Parameters(req): Parameters) -> String { // If a reindex is already running, refuse to start another one. @@ -241,7 +241,7 @@ impl SymgraphHandler { n ), None => { - "Reindexing all changed files in background. Use symgraph-status to check progress." + "Rebuilding the full index in background. Use symgraph-status to check progress." .to_string() } } diff --git a/src/mcp/types.rs b/src/mcp/types.rs index 33c5a31..e5607f5 100644 --- a/src/mcp/types.rs +++ b/src/mcp/types.rs @@ -48,7 +48,7 @@ pub struct DefinitionRequest { #[derive(Debug, Deserialize, schemars::JsonSchema)] pub struct ReindexRequest { #[schemars( - description = "Optional: specific files to reindex. If empty, reindexes all changed files." + description = "Optional: specific files to reindex. If omitted, rebuilds the full index via a shadow database." )] pub files: Option>, } diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 7edc6ee..1417dfb 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -2,10 +2,14 @@ //! //! These tests verify the end-to-end workflow of indexing and querying code. +use std::fs; + +use symgraph::cli::{open_project_database, rebuild_project_database}; use symgraph::db::Database; use symgraph::extraction::Extractor; use symgraph::graph::Graph; use symgraph::types::{EdgeKind, FileRecord, Language, NodeKind}; +use symgraph::{build_full_index, index_codebase, IndexConfig}; use tempfile::tempdir; /// Helper to set up a test database with indexed code @@ -271,6 +275,85 @@ fn test_database_persistence() { } } +#[test] +fn test_build_full_index_populates_empty_target_db() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + fs::create_dir(&src).unwrap(); + fs::write( + src.join("main.rs"), + "fn main() { helper(); }\nfn helper() {}\n", + ) + .unwrap(); + + let db_path = dir.path().join("full-build.db"); + let mut db = Database::open(&db_path).unwrap(); + let config = IndexConfig { + root: dir.path().display().to_string(), + ..Default::default() + }; + + let stats = build_full_index(&mut db, &config).unwrap(); + + assert_eq!(stats.files, 1); + assert_eq!(stats.skipped, 0); + assert!(stats.nodes >= 2); + assert!(db.find_node_by_name("main").unwrap().is_some()); + assert!(db.find_node_by_name("helper").unwrap().is_some()); + assert!(!db.semantic_search("helper", 10).unwrap().is_empty()); +} + +#[test] +fn test_rebuild_project_database_replaces_stale_rows() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + fs::create_dir(&src).unwrap(); + fs::write(src.join("old.rs"), "fn old_symbol() {}\n").unwrap(); + + let project_root = dir.path().display().to_string(); + let mut db = open_project_database(&project_root).unwrap(); + let config = IndexConfig { + root: project_root.clone(), + ..Default::default() + }; + + rebuild_project_database(&mut db, &config).unwrap(); + assert!(db.find_node_by_name("old_symbol").unwrap().is_some()); + + fs::remove_file(src.join("old.rs")).unwrap(); + fs::write(src.join("new.rs"), "fn new_symbol() {}\n").unwrap(); + + rebuild_project_database(&mut db, &config).unwrap(); + + assert!(db.find_node_by_name("old_symbol").unwrap().is_none()); + assert!(db.find_node_by_name("new_symbol").unwrap().is_some()); + assert_eq!(db.get_stats().unwrap().total_files, 1); +} + +#[test] +fn test_rebuild_project_database_keeps_live_db_on_failure() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + fs::create_dir(&src).unwrap(); + fs::write(src.join("live.rs"), "fn live_symbol() {}\n").unwrap(); + + let project_root = dir.path().display().to_string(); + let mut db = open_project_database(&project_root).unwrap(); + let good_config = IndexConfig { + root: project_root.clone(), + ..Default::default() + }; + rebuild_project_database(&mut db, &good_config).unwrap(); + + let bad_config = IndexConfig { + root: src.join("live.rs").display().to_string(), + ..Default::default() + }; + assert!(rebuild_project_database(&mut db, &bad_config).is_err()); + + assert!(db.find_node_by_name("live_symbol").unwrap().is_some()); +} + #[test] fn test_incremental_indexing() { let db = Database::in_memory().unwrap(); @@ -667,3 +750,35 @@ fn test_search_performance() { duration ); } + +/// Stress test: exercise the periodic-commit / WAL-checkpoint branch for +/// in-place incremental indexing by crossing CHECKPOINT_INTERVAL (200). +/// Uses an on-disk database to mirror the write path that still batches +/// commits mid-loop. +#[test] +fn test_incremental_index_codebase_periodic_checkpoint() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + fs::create_dir(&src).unwrap(); + + // 250 small Rust files — enough to cross the 200-file checkpoint boundary. + for i in 0..250 { + let path = src.join(format!("mod_{:03}.rs", i)); + let body = format!( + "pub fn handler_{i}() {{ helper_{i}(); }}\nfn helper_{i}() {{}}\n", + i = i + ); + fs::write(&path, body).unwrap(); + } + + let db_path = dir.path().join("symgraph.db"); + let mut db = Database::open(&db_path).unwrap(); + let config = IndexConfig { + root: dir.path().display().to_string(), + ..Default::default() + }; + + let stats = index_codebase(&mut db, &config).expect("index_codebase failed"); + assert_eq!(stats.files, 250); + assert!(stats.nodes >= 500); +}