diff --git a/codebase_rag/constants.py b/codebase_rag/constants.py index 14ee184c7..0f4d7e1a1 100644 --- a/codebase_rag/constants.py +++ b/codebase_rag/constants.py @@ -848,11 +848,19 @@ class TreeSitterModule(StrEnum): class EventType(StrEnum): MODIFIED = "modified" CREATED = "created" + DELETED = "deleted" CYPHER_DELETE_MODULE = "MATCH (m:Module {path: $path})-[*0..]->(c) DETACH DELETE m, c" +CYPHER_DELETE_FILE = "MATCH (f:File {path: $path}) DETACH DELETE f" +CYPHER_DELETE_FOLDER = "MATCH (f:Folder {path: $path}) DETACH DELETE f" CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r" +# (H) Queries for orphan pruning — returns all paths stored in the graph +CYPHER_ALL_FILE_PATHS = "MATCH (f:File) RETURN f.path AS path" +CYPHER_ALL_MODULE_PATHS = "MATCH (m:Module) RETURN m.path AS path" +CYPHER_ALL_FOLDER_PATHS = "MATCH (f:Folder) RETURN f.path AS path" + REALTIME_LOGGER_FORMAT = ( "{time:YYYY-MM-DD HH:mm:ss.SSS} | " "{level: <8} | " diff --git a/codebase_rag/graph_updater.py b/codebase_rag/graph_updater.py index 6a7eacbaa..47bdf60c5 100644 --- a/codebase_rag/graph_updater.py +++ b/codebase_rag/graph_updater.py @@ -327,6 +327,8 @@ def run(self, force: bool = False) -> None: logger.info(ls.ANALYSIS_COMPLETE) self.ingestor.flush_all() + self._prune_orphan_nodes() + self._generate_semantic_embeddings() def remove_file_from_state(self, file_path: Path) -> None: @@ -439,6 +441,13 @@ def _process_files(self, force: bool = False) -> None: for deleted_key in deleted_keys: deleted_path = self.repo_path / deleted_key self.remove_file_from_state(deleted_path) + if isinstance(self.ingestor, QueryProtocol): + self.ingestor.execute_write( + cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: deleted_key} + ) + self.ingestor.execute_write( + cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: deleted_key} + ) if skipped_count > 0: logger.info(ls.INCREMENTAL_SKIPPED, count=skipped_count) @@ -475,6 +484,43 @@ def _process_function_calls(self) -> None: file_path, root_node, language, self.queries ) + def _prune_orphan_nodes(self) -> None: + """Remove graph nodes whose files/folders no longer exist on disk.""" + if not isinstance(self.ingestor, QueryProtocol): + return + + logger.info(ls.PRUNE_START) + total_pruned = 0 + + prune_specs: list[tuple[str, str, str]] = [ + (cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"), + (cs.CYPHER_ALL_MODULE_PATHS, cs.CYPHER_DELETE_MODULE, "Module"), + (cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"), + ] + + for query_all, delete_query, label in prune_specs: + rows = self.ingestor.fetch_all(query_all) + orphans = [ + r["path"] + for r in rows + if r.get("path") + and not (self.repo_path / r["path"]).exists() + ] + + if orphans: + logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label) + for orphan_path in orphans: + logger.debug(ls.PRUNE_DELETING, label=label, path=orphan_path) + self.ingestor.execute_write( + delete_query, {cs.KEY_PATH: orphan_path} + ) + total_pruned += len(orphans) + + if total_pruned: + logger.info(ls.PRUNE_COMPLETE, count=total_pruned) + else: + logger.info(ls.PRUNE_SKIP) + def _generate_semantic_embeddings(self) -> None: if not has_semantic_dependencies(): logger.info(ls.SEMANTIC_NOT_AVAILABLE) diff --git a/codebase_rag/logs.py b/codebase_rag/logs.py index c997b1100..417f968d9 100644 --- a/codebase_rag/logs.py +++ b/codebase_rag/logs.py @@ -653,6 +653,13 @@ INCREMENTAL_CHANGED = "Re-indexing {count} changed files" INCREMENTAL_DELETED = "Removed state for {count} deleted files" INCREMENTAL_FORCE = "Force mode enabled, bypassing hash cache" + +# (H) Orphan pruning logs +PRUNE_START = "--- Pruning orphan nodes from graph ---" +PRUNE_FOUND = "Found {count} orphan {label} nodes to remove" +PRUNE_DELETING = "Pruning orphan {label}: {path}" +PRUNE_COMPLETE = "Pruning complete. Removed {count} orphan nodes." +PRUNE_SKIP = "No orphan nodes found. Graph is clean." FILE_HASH_UNCHANGED = "File unchanged (hash match): {path}" FILE_HASH_CHANGED = "File changed (hash mismatch): {path}" FILE_HASH_NEW = "New file detected: {path}" diff --git a/codebase_rag/tests/test_graph_updater_pruning.py b/codebase_rag/tests/test_graph_updater_pruning.py new file mode 100644 index 000000000..20f4f1858 --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_pruning.py @@ -0,0 +1,311 @@ +# (H) Tests for orphan node pruning in GraphUpdater._prune_orphan_nodes +# (H) and Cypher deletion in _process_files for hash-cache-detected deletions. +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers + + +@pytest.fixture +def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def py_project(temp_repo: Path) -> Path: + (temp_repo / "__init__.py").touch() + (temp_repo / "module_a.py").write_text("def func_a():\n pass\n") + (temp_repo / "module_b.py").write_text("def func_b():\n pass\n") + sub = temp_repo / "subpkg" + sub.mkdir() + (sub / "__init__.py").touch() + (sub / "inner.py").write_text("def inner_func():\n pass\n") + return temp_repo + + +class TestPruneOrphanNodes: + """Tests for GraphUpdater._prune_orphan_nodes.""" + + def test_prune_removes_orphan_file_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Orphan File nodes whose paths don't exist on disk are deleted.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + # (H) Simulate graph returning a file path that no longer exists + mock_ingestor.fetch_all.side_effect = [ + [{"path": "deleted_project/server.py"}, {"path": "module_a.py"}], + [], + [], + ] + updater._prune_orphan_nodes() + + # (H) Only the orphan path should be deleted + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_calls) == 1 + assert delete_calls[0].args[1] == {cs.KEY_PATH: "deleted_project/server.py"} + + def test_prune_removes_orphan_module_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Orphan Module nodes are deleted via CYPHER_DELETE_MODULE (cascading).""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [], + [{"path": "old_project/main.py"}], + [], + ] + updater._prune_orphan_nodes() + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert len(delete_calls) == 1 + assert delete_calls[0].args[1] == {cs.KEY_PATH: "old_project/main.py"} + + def test_prune_removes_orphan_folder_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Orphan Folder nodes are deleted via CYPHER_DELETE_FOLDER.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [], + [], + [{"path": "projects/mcp-openclaw-bridge"}, {"path": "subpkg"}], + ] + updater._prune_orphan_nodes() + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FOLDER + ] + # (H) Only the non-existent path is pruned; "subpkg" still exists on disk + assert len(delete_calls) == 1 + assert delete_calls[0].args[1] == { + cs.KEY_PATH: "projects/mcp-openclaw-bridge" + } + + def test_prune_no_orphans_skips_deletes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """When all graph nodes exist on disk, no delete queries are issued.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [{"path": "module_a.py"}], + [{"path": "module_a.py"}], + [{"path": "subpkg"}], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_handles_empty_graph( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Pruning on an empty graph does nothing.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.return_value = [] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_handles_none_path_gracefully( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Rows with None path values are skipped without error.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [{"path": None}, {"path": "module_a.py"}], + [], + [], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_multiple_orphans_across_types( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """Multiple orphan nodes across File, Module, Folder are all pruned.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [{"path": "gone/a.py"}, {"path": "gone/b.py"}], + [{"path": "gone/a.py"}], + [{"path": "gone"}], + ] + updater._prune_orphan_nodes() + + # (H) 2 File + 1 Module + 1 Folder = 4 deletes + assert mock_ingestor.execute_write.call_count == 4 + + +class TestProcessFilesDeletesCypherNodes: + """Tests that _process_files issues Cypher deletes for hash-cache-detected deletions.""" + + def test_deleted_file_triggers_cypher_delete( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """When a file is deleted between runs, both MODULE and FILE Cypher deletes are issued.""" + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + # (H) Stub fetch_all so _prune_orphan_nodes doesn't interfere + mock_ingestor.fetch_all.return_value = [] + updater.run() + + (py_project / "module_b.py").unlink() + mock_ingestor.reset_mock() + mock_ingestor.fetch_all.return_value = [] + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater2.run() + + # (H) Verify CYPHER_DELETE_MODULE and CYPHER_DELETE_FILE were called for module_b.py + module_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + and c.args[1].get(cs.KEY_PATH) == "module_b.py" + ] + file_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + and c.args[1].get(cs.KEY_PATH) == "module_b.py" + ] + assert len(module_deletes) >= 1 + assert len(file_deletes) >= 1 + + def test_no_deletes_when_no_files_removed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """When no files are deleted between runs, no delete queries are issued for files.""" + parsers, queries = load_parsers() + + mock_ingestor.fetch_all.return_value = [] + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + mock_ingestor.reset_mock() + mock_ingestor.fetch_all.return_value = [] + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater2.run() + + # (H) No CYPHER_DELETE_MODULE or CYPHER_DELETE_FILE for specific paths + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] in (cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FILE) + and len(c.args) > 1 + ] + assert len(path_deletes) == 0 + + +class TestPruneCalledDuringRun: + """Tests that _prune_orphan_nodes is called as part of GraphUpdater.run().""" + + def test_run_calls_prune( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + """GraphUpdater.run() invokes _prune_orphan_nodes after flush.""" + parsers, queries = load_parsers() + mock_ingestor.fetch_all.return_value = [] + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater, "_prune_orphan_nodes", wraps=updater._prune_orphan_nodes + ) as spy: + updater.run() + spy.assert_called_once() diff --git a/codebase_rag/tests/test_realtime_updater.py b/codebase_rag/tests/test_realtime_updater.py index 2061fac0e..200af6757 100644 --- a/codebase_rag/tests/test_realtime_updater.py +++ b/codebase_rag/tests/test_realtime_updater.py @@ -42,7 +42,8 @@ def test_file_creation_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -62,7 +63,8 @@ def test_file_modification_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -81,7 +83,8 @@ def test_file_deletion_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_not_called() mock_updater.ingestor.flush_all.assert_called_once() @@ -117,16 +120,22 @@ def test_directory_creation_is_ignored( mock_updater.ingestor.flush_all.assert_not_called() -def test_unsupported_file_types_are_ignored( +def test_non_code_files_create_file_nodes( event_handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path ) -> None: - """Test that changing an unsupported file type is ignored after deletion query.""" - unsupported_file = temp_repo / "document.md" - unsupported_file.write_text(encoding="utf-8", data="# Markdown file") - event = FileModifiedEvent(str(unsupported_file)) + """Test that non-code files (like .md) create File nodes but skip AST parsing.""" + non_code_file = temp_repo / "document.md" + non_code_file.write_text(encoding="utf-8", data="# Markdown file") + event = FileModifiedEvent(str(non_code_file)) event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 + # (H) AST parsing is skipped for non-code files mock_updater.factory.definition_processor.process_file.assert_not_called() + # (H) But File node creation IS called for all file types + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + non_code_file, "document.md" + ) mock_updater.ingestor.flush_all.assert_called_once() diff --git a/pyproject.toml b/pyproject.toml index 78ca119e4..5974df144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,6 +142,7 @@ dev = [ "pre-commit>=4.2.0", "pyinstaller>=6.14.1", "pylint>=4.0.4", + "pytest>=9.0.2", "radon>=6.0.1", "ruff>=0.5.5", "semgrep>=1.79.0", diff --git a/realtime_updater.py b/realtime_updater.py index 778674228..e95d9ee78 100644 --- a/realtime_updater.py +++ b/realtime_updater.py @@ -14,6 +14,7 @@ from codebase_rag.config import settings from codebase_rag.constants import ( CYPHER_DELETE_CALLS, + CYPHER_DELETE_FILE, CYPHER_DELETE_MODULE, IGNORE_PATTERNS, IGNORE_SUFFIXES, @@ -73,18 +74,33 @@ def dispatch(self, event: FileSystemEvent) -> None: path = Path(src_path) relative_path_str = str(path.relative_to(self.updater.repo_path)) + # (H) Only process events that actually change file content + # (H) Skip read-only events like "opened", "closed_no_write" that don't modify the file + relevant_events = { + EventType.MODIFIED, + EventType.CREATED, + EventType.DELETED, # (H) watchdog deletion event + } + if event.event_type not in relevant_events: + return + logger.warning( logs.CHANGE_DETECTED.format(event_type=event.event_type, path=path) ) - # (H) Step 1 + # (H) Step 1: Delete existing nodes for this file path + # (H) Delete Module node and its children (for code files) ingestor.execute_write(CYPHER_DELETE_MODULE, {KEY_PATH: relative_path_str}) + # (H) Delete File node (for all files including non-code like .md, .json) + ingestor.execute_write( + CYPHER_DELETE_FILE, {KEY_PATH: relative_path_str} + ) logger.debug(logs.DELETION_QUERY.format(path=relative_path_str)) # (H) Step 2 self.updater.remove_file_from_state(path) - # (H) Step 3 + # (H) Step 3: Re-parse code files and create File nodes for ALL files if event.event_type in (EventType.MODIFIED, EventType.CREATED): lang_config = get_language_spec(path.suffix) if ( @@ -101,6 +117,11 @@ def dispatch(self, event: FileSystemEvent) -> None: root_node, language = result self.updater.ast_cache[path] = (root_node, language) + # (H) Create File node for ALL files (code and non-code like .md, .json, etc.) + self.updater.factory.structure_processor.process_generic_file( + path, path.name + ) + # (H) Step 4 logger.info(logs.RECALC_CALLS) ingestor.execute_write(CYPHER_DELETE_CALLS) diff --git a/uv.lock b/uv.lock index 081bc1177..d1b0c09c0 100644 --- a/uv.lock +++ b/uv.lock @@ -484,7 +484,7 @@ wheels = [ [[package]] name = "code-graph-rag" -version = "0.0.100" +version = "0.0.101" source = { editable = "." } dependencies = [ { name = "click" }, @@ -539,6 +539,7 @@ dev = [ { name = "pre-commit" }, { name = "pyinstaller" }, { name = "pylint" }, + { name = "pytest" }, { name = "radon" }, { name = "ruff" }, { name = "semgrep" }, @@ -600,6 +601,7 @@ dev = [ { name = "pre-commit", specifier = ">=4.2.0" }, { name = "pyinstaller", specifier = ">=6.14.1" }, { name = "pylint", specifier = ">=4.0.4" }, + { name = "pytest", specifier = ">=9.0.2" }, { name = "radon", specifier = ">=6.0.1" }, { name = "ruff", specifier = ">=0.5.5" }, { name = "semgrep", specifier = ">=1.79.0" },