diff --git a/codebase_rag/constants.py b/codebase_rag/constants.py
index 14ee184c7..0f4d7e1a1 100644
--- a/codebase_rag/constants.py
+++ b/codebase_rag/constants.py
@@ -848,11 +848,19 @@ class TreeSitterModule(StrEnum):
+# (H) Filesystem event kinds the realtime updater reacts to; its dispatch
+# (H) returns early for any event type that is not one of these values.
class EventType(StrEnum):
MODIFIED = "modified"
CREATED = "created"
+ DELETED = "deleted"
+# (H) Deletes the Module matched by $path together with every node reachable
+# (H) from it over outgoing relationships (zero or more hops).
+# (H) NOTE(review): the pattern is not restricted to a relationship type, so
+# (H) presumably nodes reached through cross-module edges are deleted as well;
+# (H) confirm this cascade is intended.
CYPHER_DELETE_MODULE = "MATCH (m:Module {path: $path})-[*0..]->(c) DETACH DELETE m, c"
+# (H) Deletes only the File node(s) matched by $path, plus their relationships
+CYPHER_DELETE_FILE = "MATCH (f:File {path: $path}) DETACH DELETE f"
+# (H) Deletes only the Folder node(s) matched by $path, plus their relationships
+CYPHER_DELETE_FOLDER = "MATCH (f:Folder {path: $path}) DETACH DELETE f"
CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r"
+# (H) Queries for orphan pruning — each returns one row per node of the given
+# (H) label, exposing its stored path under the "path" column
+CYPHER_ALL_FILE_PATHS = "MATCH (f:File) RETURN f.path AS path"
+CYPHER_ALL_MODULE_PATHS = "MATCH (m:Module) RETURN m.path AS path"
+CYPHER_ALL_FOLDER_PATHS = "MATCH (f:Folder) RETURN f.path AS path"
+
REALTIME_LOGGER_FORMAT = (
"{time:YYYY-MM-DD HH:mm:ss.SSS} | "
"{level: <8} | "
diff --git a/codebase_rag/graph_updater.py b/codebase_rag/graph_updater.py
index 6a7eacbaa..47bdf60c5 100644
--- a/codebase_rag/graph_updater.py
+++ b/codebase_rag/graph_updater.py
@@ -327,6 +327,8 @@ def run(self, force: bool = False) -> None:
logger.info(ls.ANALYSIS_COMPLETE)
self.ingestor.flush_all()
+ self._prune_orphan_nodes()
+
self._generate_semantic_embeddings()
def remove_file_from_state(self, file_path: Path) -> None:
@@ -439,6 +441,13 @@ def _process_files(self, force: bool = False) -> None:
for deleted_key in deleted_keys:
deleted_path = self.repo_path / deleted_key
self.remove_file_from_state(deleted_path)
+ if isinstance(self.ingestor, QueryProtocol):
+ self.ingestor.execute_write(
+ cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: deleted_key}
+ )
+ self.ingestor.execute_write(
+ cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: deleted_key}
+ )
if skipped_count > 0:
logger.info(ls.INCREMENTAL_SKIPPED, count=skipped_count)
@@ -475,6 +484,43 @@ def _process_function_calls(self) -> None:
file_path, root_node, language, self.queries
)
+    def _prune_orphan_nodes(self) -> None:
+        """Remove File, Module and Folder nodes whose path no longer exists on disk.
+
+        For each label, every stored ``path`` is fetched from the graph and
+        resolved against ``self.repo_path``; paths that do not exist are
+        removed with that label's delete query. Rows whose ``path`` is
+        missing, empty or not a string are ignored. Nothing happens unless
+        the ingestor implements ``QueryProtocol``.
+
+        NOTE(review): the lookup queries are not scoped to a project, so nodes
+        indexed from a different repository into the same database would
+        presumably be treated as orphans too — confirm the graph only ever
+        holds a single repository.
+        """
+        if not isinstance(self.ingestor, QueryProtocol):
+            return
+
+        logger.info(ls.PRUNE_START)
+        total_pruned = 0
+
+        prune_specs: list[tuple[str, str, str]] = [
+            (cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"),
+            (cs.CYPHER_ALL_MODULE_PATHS, cs.CYPHER_DELETE_MODULE, "Module"),
+            (cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"),
+        ]
+
+        for query_all, delete_query, label in prune_specs:
+            # (H) dict.fromkeys de-duplicates while keeping the graph's order:
+            # (H) each delete query already matches every node sharing a path,
+            # (H) so repeating it for a duplicate row is a wasted round-trip.
+            orphans = list(
+                dict.fromkeys(
+                    path
+                    for row in self.ingestor.fetch_all(query_all)
+                    if isinstance(path := row.get("path"), str)
+                    and path
+                    and not (self.repo_path / path).exists()
+                )
+            )
+            if not orphans:
+                continue
+
+            logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label)
+            for orphan_path in orphans:
+                logger.debug(ls.PRUNE_DELETING, label=label, path=orphan_path)
+                self.ingestor.execute_write(delete_query, {cs.KEY_PATH: orphan_path})
+            total_pruned += len(orphans)
+
+        if total_pruned:
+            logger.info(ls.PRUNE_COMPLETE, count=total_pruned)
+        else:
+            logger.info(ls.PRUNE_SKIP)
+
def _generate_semantic_embeddings(self) -> None:
if not has_semantic_dependencies():
logger.info(ls.SEMANTIC_NOT_AVAILABLE)
diff --git a/codebase_rag/logs.py b/codebase_rag/logs.py
index c997b1100..417f968d9 100644
--- a/codebase_rag/logs.py
+++ b/codebase_rag/logs.py
@@ -653,6 +653,13 @@
INCREMENTAL_CHANGED = "Re-indexing {count} changed files"
INCREMENTAL_DELETED = "Removed state for {count} deleted files"
INCREMENTAL_FORCE = "Force mode enabled, bypassing hash cache"
+
+# (H) Orphan pruning logs
+# (H) The {count}, {label} and {path} placeholders are supplied as keyword
+# (H) arguments on the logger call rather than formatted up front.
+PRUNE_START = "--- Pruning orphan nodes from graph ---"
+PRUNE_FOUND = "Found {count} orphan {label} nodes to remove"
+PRUNE_DELETING = "Pruning orphan {label}: {path}"
+PRUNE_COMPLETE = "Pruning complete. Removed {count} orphan nodes."
+PRUNE_SKIP = "No orphan nodes found. Graph is clean."
FILE_HASH_UNCHANGED = "File unchanged (hash match): {path}"
FILE_HASH_CHANGED = "File changed (hash mismatch): {path}"
FILE_HASH_NEW = "New file detected: {path}"
diff --git a/codebase_rag/tests/test_graph_updater_pruning.py b/codebase_rag/tests/test_graph_updater_pruning.py
new file mode 100644
index 000000000..20f4f1858
--- /dev/null
+++ b/codebase_rag/tests/test_graph_updater_pruning.py
@@ -0,0 +1,311 @@
+# (H) Tests for orphan node pruning in GraphUpdater._prune_orphan_nodes
+# (H) and Cypher deletion in _process_files for hash-cache-detected deletions.
+from pathlib import Path
+from unittest.mock import MagicMock, call, patch
+
+import pytest
+
+from codebase_rag import constants as cs
+from codebase_rag.graph_updater import GraphUpdater
+from codebase_rag.parser_loader import load_parsers
+
+
+@pytest.fixture
+def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater:
+    """Provide a GraphUpdater rooted at ``temp_repo`` that writes to the mocked ingestor."""
+    loaded_parsers, loaded_queries = load_parsers()
+    graph_updater = GraphUpdater(
+        ingestor=mock_ingestor,
+        repo_path=temp_repo,
+        parsers=loaded_parsers,
+        queries=loaded_queries,
+    )
+    return graph_updater
+
+
+@pytest.fixture
+def py_project(temp_repo: Path) -> Path:
+    """Populate ``temp_repo`` with a small package layout and return its root.
+
+    Layout: ``__init__.py``, ``module_a.py``, ``module_b.py`` and a ``subpkg``
+    directory holding ``__init__.py`` and ``inner.py``.
+    """
+    subpkg_dir = temp_repo / "subpkg"
+    subpkg_dir.mkdir()
+
+    # (H) Empty package markers
+    for package_dir in (temp_repo, subpkg_dir):
+        (package_dir / "__init__.py").touch()
+
+    # (H) One trivial function per module
+    module_sources = {
+        temp_repo / "module_a.py": "def func_a():\n pass\n",
+        temp_repo / "module_b.py": "def func_b():\n pass\n",
+        subpkg_dir / "inner.py": "def inner_func():\n pass\n",
+    }
+    for module_path, source_text in module_sources.items():
+        module_path.write_text(source_text)
+
+    return temp_repo
+
+
+class TestPruneOrphanNodes:
+    """Tests for GraphUpdater._prune_orphan_nodes.
+
+    ``fetch_all`` is stubbed with one result list per label, in the order the
+    method queries them: File, Module, Folder. ``py_project`` is requested
+    before ``updater`` so the repository is populated before the updater is
+    built on the same directory.
+    """
+
+    @staticmethod
+    def _delete_payloads(ingestor: MagicMock, query: str) -> list[dict[str, str]]:
+        """Return the parameter dicts of every ``execute_write`` call issued with *query*."""
+        return [
+            recorded.args[1]
+            for recorded in ingestor.execute_write.call_args_list
+            if recorded.args[0] == query
+        ]
+
+    def test_prune_removes_orphan_file_nodes(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Orphan File nodes whose paths don't exist on disk are deleted."""
+        # (H) Simulate graph returning a file path that no longer exists
+        mock_ingestor.fetch_all.side_effect = [
+            [{"path": "deleted_project/server.py"}, {"path": "module_a.py"}],
+            [],
+            [],
+        ]
+        updater._prune_orphan_nodes()
+
+        # (H) Only the orphan path should be deleted
+        assert self._delete_payloads(mock_ingestor, cs.CYPHER_DELETE_FILE) == [
+            {cs.KEY_PATH: "deleted_project/server.py"}
+        ]
+
+    def test_prune_removes_orphan_module_nodes(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Orphan Module nodes are deleted via CYPHER_DELETE_MODULE (cascading)."""
+        mock_ingestor.fetch_all.side_effect = [
+            [],
+            [{"path": "old_project/main.py"}],
+            [],
+        ]
+        updater._prune_orphan_nodes()
+
+        assert self._delete_payloads(mock_ingestor, cs.CYPHER_DELETE_MODULE) == [
+            {cs.KEY_PATH: "old_project/main.py"}
+        ]
+
+    def test_prune_removes_orphan_folder_nodes(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Orphan Folder nodes are deleted via CYPHER_DELETE_FOLDER."""
+        mock_ingestor.fetch_all.side_effect = [
+            [],
+            [],
+            [{"path": "projects/mcp-openclaw-bridge"}, {"path": "subpkg"}],
+        ]
+        updater._prune_orphan_nodes()
+
+        # (H) Only the non-existent path is pruned; "subpkg" still exists on disk
+        assert self._delete_payloads(mock_ingestor, cs.CYPHER_DELETE_FOLDER) == [
+            {cs.KEY_PATH: "projects/mcp-openclaw-bridge"}
+        ]
+
+    def test_prune_no_orphans_skips_deletes(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """When all graph nodes exist on disk, no delete queries are issued."""
+        mock_ingestor.fetch_all.side_effect = [
+            [{"path": "module_a.py"}],
+            [{"path": "module_a.py"}],
+            [{"path": "subpkg"}],
+        ]
+        updater._prune_orphan_nodes()
+
+        mock_ingestor.execute_write.assert_not_called()
+
+    def test_prune_handles_empty_graph(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Pruning on an empty graph does nothing."""
+        mock_ingestor.fetch_all.return_value = []
+        updater._prune_orphan_nodes()
+
+        mock_ingestor.execute_write.assert_not_called()
+
+    def test_prune_handles_none_path_gracefully(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Rows with None path values are skipped without error."""
+        mock_ingestor.fetch_all.side_effect = [
+            [{"path": None}, {"path": "module_a.py"}],
+            [],
+            [],
+        ]
+        updater._prune_orphan_nodes()
+
+        mock_ingestor.execute_write.assert_not_called()
+
+    def test_prune_multiple_orphans_across_types(
+        self, py_project: Path, updater: GraphUpdater, mock_ingestor: MagicMock
+    ) -> None:
+        """Multiple orphan nodes across File, Module, Folder are all pruned."""
+        mock_ingestor.fetch_all.side_effect = [
+            [{"path": "gone/a.py"}, {"path": "gone/b.py"}],
+            [{"path": "gone/a.py"}],
+            [{"path": "gone"}],
+        ]
+        updater._prune_orphan_nodes()
+
+        # (H) 2 File + 1 Module + 1 Folder = 4 deletes, each with the delete
+        # (H) query that belongs to its label
+        assert mock_ingestor.execute_write.call_args_list == [
+            call(cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: "gone/a.py"}),
+            call(cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: "gone/b.py"}),
+            call(cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: "gone/a.py"}),
+            call(cs.CYPHER_DELETE_FOLDER, {cs.KEY_PATH: "gone"}),
+        ]
+
+
+class TestProcessFilesDeletesCypherNodes:
+    """Tests that _process_files issues Cypher deletes for hash-cache-detected deletions."""
+
+    def test_deleted_file_triggers_cypher_delete(
+        self, py_project: Path, mock_ingestor: MagicMock
+    ) -> None:
+        """Removing a file between two runs yields MODULE and FILE deletes for its path."""
+        parsers, queries = load_parsers()
+        updater_kwargs = {
+            "ingestor": mock_ingestor,
+            "repo_path": py_project,
+            "parsers": parsers,
+            "queries": queries,
+        }
+
+        first_pass = GraphUpdater(**updater_kwargs)
+        # (H) Empty fetch_all results keep _prune_orphan_nodes from interfering
+        mock_ingestor.fetch_all.return_value = []
+        first_pass.run()
+
+        (py_project / "module_b.py").unlink()
+        mock_ingestor.reset_mock()
+        mock_ingestor.fetch_all.return_value = []
+
+        second_pass = GraphUpdater(**updater_kwargs)
+        second_pass.run()
+
+        recorded_writes = mock_ingestor.execute_write.call_args_list
+
+        def deletes_of_module_b(query: str) -> list:
+            # (H) execute_write calls using *query* and targeting module_b.py
+            return [
+                write
+                for write in recorded_writes
+                if write.args[0] == query
+                and write.args[1].get(cs.KEY_PATH) == "module_b.py"
+            ]
+
+        # (H) Both the Module and the File delete must have been issued for module_b.py
+        module_deletes = deletes_of_module_b(cs.CYPHER_DELETE_MODULE)
+        file_deletes = deletes_of_module_b(cs.CYPHER_DELETE_FILE)
+        assert len(module_deletes) >= 1
+        assert len(file_deletes) >= 1
+
+    def test_no_deletes_when_no_files_removed(
+        self, py_project: Path, mock_ingestor: MagicMock
+    ) -> None:
+        """Two runs over an unchanged repository issue no path-specific deletes."""
+        parsers, queries = load_parsers()
+        updater_kwargs = {
+            "ingestor": mock_ingestor,
+            "repo_path": py_project,
+            "parsers": parsers,
+            "queries": queries,
+        }
+
+        mock_ingestor.fetch_all.return_value = []
+        GraphUpdater(**updater_kwargs).run()
+
+        mock_ingestor.reset_mock()
+        mock_ingestor.fetch_all.return_value = []
+        GraphUpdater(**updater_kwargs).run()
+
+        # (H) No CYPHER_DELETE_MODULE or CYPHER_DELETE_FILE for specific paths
+        path_delete_queries = (cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FILE)
+        path_deletes = []
+        for write in mock_ingestor.execute_write.call_args_list:
+            if write.args[0] in path_delete_queries and len(write.args) > 1:
+                path_deletes.append(write)
+        assert len(path_deletes) == 0
+
+
+class TestPruneCalledDuringRun:
+    """Tests that _prune_orphan_nodes is called as part of GraphUpdater.run()."""
+
+    def test_run_calls_prune(
+        self, py_project: Path, mock_ingestor: MagicMock
+    ) -> None:
+        """A single GraphUpdater.run() invokes _prune_orphan_nodes exactly once."""
+        loaded_parsers, loaded_queries = load_parsers()
+        mock_ingestor.fetch_all.return_value = []
+
+        graph_updater = GraphUpdater(
+            ingestor=mock_ingestor,
+            repo_path=py_project,
+            parsers=loaded_parsers,
+            queries=loaded_queries,
+        )
+        # (H) wraps= keeps the real pruning behaviour while recording the call
+        prune_spy_patch = patch.object(
+            graph_updater,
+            "_prune_orphan_nodes",
+            wraps=graph_updater._prune_orphan_nodes,
+        )
+        with prune_spy_patch as prune_spy:
+            graph_updater.run()
+            prune_spy.assert_called_once()
diff --git a/codebase_rag/tests/test_realtime_updater.py b/codebase_rag/tests/test_realtime_updater.py
index 2061fac0e..200af6757 100644
--- a/codebase_rag/tests/test_realtime_updater.py
+++ b/codebase_rag/tests/test_realtime_updater.py
@@ -42,7 +42,8 @@ def test_file_creation_flow(
event_handler.dispatch(event)
- assert mock_updater.ingestor.execute_write.call_count == 2
+ # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS
+ assert mock_updater.ingestor.execute_write.call_count == 3
mock_updater.factory.definition_processor.process_file.assert_called_once_with(
test_file,
"python",
@@ -62,7 +63,8 @@ def test_file_modification_flow(
event_handler.dispatch(event)
- assert mock_updater.ingestor.execute_write.call_count == 2
+ # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS
+ assert mock_updater.ingestor.execute_write.call_count == 3
mock_updater.factory.definition_processor.process_file.assert_called_once_with(
test_file,
"python",
@@ -81,7 +83,8 @@ def test_file_deletion_flow(
event_handler.dispatch(event)
- assert mock_updater.ingestor.execute_write.call_count == 2
+ # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS
+ assert mock_updater.ingestor.execute_write.call_count == 3
mock_updater.factory.definition_processor.process_file.assert_not_called()
mock_updater.ingestor.flush_all.assert_called_once()
@@ -117,16 +120,22 @@ def test_directory_creation_is_ignored(
mock_updater.ingestor.flush_all.assert_not_called()
-def test_unsupported_file_types_are_ignored(
+def test_non_code_files_create_file_nodes(
event_handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path
) -> None:
- """Test that changing an unsupported file type is ignored after deletion query."""
- unsupported_file = temp_repo / "document.md"
- unsupported_file.write_text(encoding="utf-8", data="# Markdown file")
- event = FileModifiedEvent(str(unsupported_file))
+ """Test that non-code files (like .md) create File nodes but skip AST parsing."""
+ non_code_file = temp_repo / "document.md"
+ non_code_file.write_text(encoding="utf-8", data="# Markdown file")
+ event = FileModifiedEvent(str(non_code_file))
event_handler.dispatch(event)
- assert mock_updater.ingestor.execute_write.call_count == 2
+ # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS
+ assert mock_updater.ingestor.execute_write.call_count == 3
+ # (H) AST parsing is skipped for non-code files
mock_updater.factory.definition_processor.process_file.assert_not_called()
+ # (H) But File node creation IS called for all file types
+ mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with(
+ non_code_file, "document.md"
+ )
mock_updater.ingestor.flush_all.assert_called_once()
diff --git a/pyproject.toml b/pyproject.toml
index 78ca119e4..5974df144 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -142,6 +142,7 @@ dev = [
"pre-commit>=4.2.0",
"pyinstaller>=6.14.1",
"pylint>=4.0.4",
+ "pytest>=9.0.2",
"radon>=6.0.1",
"ruff>=0.5.5",
"semgrep>=1.79.0",
diff --git a/realtime_updater.py b/realtime_updater.py
index 778674228..e95d9ee78 100644
--- a/realtime_updater.py
+++ b/realtime_updater.py
@@ -14,6 +14,7 @@
from codebase_rag.config import settings
from codebase_rag.constants import (
CYPHER_DELETE_CALLS,
+ CYPHER_DELETE_FILE,
CYPHER_DELETE_MODULE,
IGNORE_PATTERNS,
IGNORE_SUFFIXES,
@@ -73,18 +74,33 @@ def dispatch(self, event: FileSystemEvent) -> None:
path = Path(src_path)
relative_path_str = str(path.relative_to(self.updater.repo_path))
+ # (H) Only process events that actually change file content
+ # (H) Skip read-only events like "opened", "closed_no_write" that don't modify the file
+ relevant_events = {
+ EventType.MODIFIED,
+ EventType.CREATED,
+ EventType.DELETED, # (H) watchdog deletion event
+ }
+ if event.event_type not in relevant_events:
+ return
+
logger.warning(
logs.CHANGE_DETECTED.format(event_type=event.event_type, path=path)
)
- # (H) Step 1
+ # (H) Step 1: Delete existing nodes for this file path
+ # (H) Delete Module node and its children (for code files)
ingestor.execute_write(CYPHER_DELETE_MODULE, {KEY_PATH: relative_path_str})
+ # (H) Delete File node (for all files including non-code like .md, .json)
+ ingestor.execute_write(
+ CYPHER_DELETE_FILE, {KEY_PATH: relative_path_str}
+ )
logger.debug(logs.DELETION_QUERY.format(path=relative_path_str))
# (H) Step 2
self.updater.remove_file_from_state(path)
- # (H) Step 3
+ # (H) Step 3: Re-parse code files and create File nodes for ALL files
if event.event_type in (EventType.MODIFIED, EventType.CREATED):
lang_config = get_language_spec(path.suffix)
if (
@@ -101,6 +117,11 @@ def dispatch(self, event: FileSystemEvent) -> None:
root_node, language = result
self.updater.ast_cache[path] = (root_node, language)
+ # (H) Create File node for ALL files (code and non-code like .md, .json, etc.)
+ self.updater.factory.structure_processor.process_generic_file(
+ path, path.name
+ )
+
# (H) Step 4
logger.info(logs.RECALC_CALLS)
ingestor.execute_write(CYPHER_DELETE_CALLS)
diff --git a/uv.lock b/uv.lock
index 081bc1177..d1b0c09c0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -484,7 +484,7 @@ wheels = [
[[package]]
name = "code-graph-rag"
-version = "0.0.100"
+version = "0.0.101"
source = { editable = "." }
dependencies = [
{ name = "click" },
@@ -539,6 +539,7 @@ dev = [
{ name = "pre-commit" },
{ name = "pyinstaller" },
{ name = "pylint" },
+ { name = "pytest" },
{ name = "radon" },
{ name = "ruff" },
{ name = "semgrep" },
@@ -600,6 +601,7 @@ dev = [
{ name = "pre-commit", specifier = ">=4.2.0" },
{ name = "pyinstaller", specifier = ">=6.14.1" },
{ name = "pylint", specifier = ">=4.0.4" },
+ { name = "pytest", specifier = ">=9.0.2" },
{ name = "radon", specifier = ">=6.0.1" },
{ name = "ruff", specifier = ">=0.5.5" },
{ name = "semgrep", specifier = ">=1.79.0" },