Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions codebase_rag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,11 +848,19 @@ class TreeSitterModule(StrEnum):
# String-valued enum of event kinds; each member's value is the lowercase
# form of its name ("modified", "created", "deleted").
class EventType(StrEnum):
MODIFIED = "modified"
CREATED = "created"
DELETED = "deleted"


# Deletes the Module node whose `path` equals $path together with every node
# reachable from it over outgoing relationships of ANY type and any depth
# (`[*0..]`; the zero-length match is the module itself).
# NOTE(review): because the relationship type is unconstrained, nodes reached
# through non-containment edges would be deleted as well, if the schema has
# such edges leaving a module's subtree — confirm against the graph schema.
CYPHER_DELETE_MODULE = "MATCH (m:Module {path: $path})-[*0..]->(c) DETACH DELETE m, c"
# Deletes the File node whose `path` equals $path, along with its relationships.
CYPHER_DELETE_FILE = "MATCH (f:File {path: $path}) DETACH DELETE f"
# Deletes the Folder node whose `path` equals $path, along with its relationships.
CYPHER_DELETE_FOLDER = "MATCH (f:Folder {path: $path}) DETACH DELETE f"
# Deletes every CALLS relationship in the graph; the endpoint nodes are kept.
CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r"

# (H) Queries for orphan pruning — each returns the `path` property of every
# (H) node carrying the given label, one row per node, under the column `path`.
CYPHER_ALL_FILE_PATHS = "MATCH (f:File) RETURN f.path AS path"
CYPHER_ALL_MODULE_PATHS = "MATCH (m:Module) RETURN m.path AS path"
CYPHER_ALL_FOLDER_PATHS = "MATCH (f:Folder) RETURN f.path AS path"

REALTIME_LOGGER_FORMAT = (
"<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
"<level>{level: <8}</level> | "
Expand Down
46 changes: 46 additions & 0 deletions codebase_rag/graph_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,8 @@ def run(self, force: bool = False) -> None:
logger.info(ls.ANALYSIS_COMPLETE)
self.ingestor.flush_all()

self._prune_orphan_nodes()

self._generate_semantic_embeddings()

def remove_file_from_state(self, file_path: Path) -> None:
Expand Down Expand Up @@ -439,6 +441,13 @@ def _process_files(self, force: bool = False) -> None:
for deleted_key in deleted_keys:
deleted_path = self.repo_path / deleted_key
self.remove_file_from_state(deleted_path)
if isinstance(self.ingestor, QueryProtocol):
self.ingestor.execute_write(
cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: deleted_key}
)
self.ingestor.execute_write(
cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: deleted_key}
)

if skipped_count > 0:
logger.info(ls.INCREMENTAL_SKIPPED, count=skipped_count)
Expand Down Expand Up @@ -475,6 +484,43 @@ def _process_function_calls(self) -> None:
file_path, root_node, language, self.queries
)

def _prune_orphan_nodes(self) -> None:
    """Remove graph nodes whose files/folders no longer exist on disk.

    For each of the File, Module and Folder labels, every stored ``path``
    is fetched from the graph, joined onto ``self.repo_path`` and checked
    on disk. Nodes whose path is confirmed absent are removed with that
    label's delete query. Does nothing when the ingestor does not
    implement ``QueryProtocol``.

    A path counts as orphaned only when the filesystem positively reports
    it missing. If it cannot be inspected for any other reason (for
    example permission denied), the node is kept and a warning is logged:
    pruning is destructive, and an unreadable path is not proof that the
    file was deleted.
    """
    if not isinstance(self.ingestor, QueryProtocol):
        return

    def is_confirmed_missing(stored_path: str) -> bool:
        """Return True only when *stored_path* definitely does not exist."""
        try:
            (self.repo_path / stored_path).stat()
        except (FileNotFoundError, NotADirectoryError, ValueError):
            # ValueError covers paths that can never exist on disk
            # (e.g. an embedded NUL character).
            return True
        except OSError as err:
            # Unknown state: keep the node rather than risk deleting data
            # for a file that is still present but currently unreadable.
            logger.warning(
                "Cannot verify {path} on disk, keeping graph node: {error}",
                path=stored_path,
                error=err,
            )
            return False
        return False

    logger.info(ls.PRUNE_START)
    total_pruned = 0

    # (query listing all paths, query deleting one path, label for logs)
    prune_specs: list[tuple[str, str, str]] = [
        (cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"),
        (cs.CYPHER_ALL_MODULE_PATHS, cs.CYPHER_DELETE_MODULE, "Module"),
        (cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"),
    ]

    for query_all, delete_query, label in prune_specs:
        rows = self.ingestor.fetch_all(query_all)
        # Rows with a missing or empty path are ignored: there is nothing
        # to check on disk and nothing to key the delete on.
        orphans = [
            r["path"]
            for r in rows
            if r.get("path") and is_confirmed_missing(r["path"])
        ]

        if orphans:
            logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label)
            for orphan_path in orphans:
                logger.debug(ls.PRUNE_DELETING, label=label, path=orphan_path)
                self.ingestor.execute_write(
                    delete_query, {cs.KEY_PATH: orphan_path}
                )
            total_pruned += len(orphans)

    if total_pruned:
        logger.info(ls.PRUNE_COMPLETE, count=total_pruned)
    else:
        logger.info(ls.PRUNE_SKIP)

def _generate_semantic_embeddings(self) -> None:
if not has_semantic_dependencies():
logger.info(ls.SEMANTIC_NOT_AVAILABLE)
Expand Down
7 changes: 7 additions & 0 deletions codebase_rag/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,13 @@
# Incremental-update messages; `{count}` is filled in by the logger call.
INCREMENTAL_CHANGED = "Re-indexing {count} changed files"
INCREMENTAL_DELETED = "Removed state for {count} deleted files"
INCREMENTAL_FORCE = "Force mode enabled, bypassing hash cache"

# (H) Orphan pruning logs
# Placeholders: `{count}` = number of orphan nodes, `{label}` = node label,
# `{path}` = the stored path of the node being removed.
PRUNE_START = "--- Pruning orphan nodes from graph ---"
PRUNE_FOUND = "Found {count} orphan {label} nodes to remove"
PRUNE_DELETING = "Pruning orphan {label}: {path}"
PRUNE_COMPLETE = "Pruning complete. Removed {count} orphan nodes."
PRUNE_SKIP = "No orphan nodes found. Graph is clean."

# (H) File hash comparison logs (not part of the pruning group above)
FILE_HASH_UNCHANGED = "File unchanged (hash match): {path}"
FILE_HASH_CHANGED = "File changed (hash mismatch): {path}"
FILE_HASH_NEW = "New file detected: {path}"
Expand Down
Loading
Loading