From ce4ecd9e6102ad41a36809f63f819db432ecc0c5 Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Wed, 3 Jun 2026 21:14:33 +0300
Subject: [PATCH 1/2] feat(analyzers): syntactic IMPORTS edges + derived
 OVERRIDES

Add language-agnostic File->File IMPORTS edges via per-analyzer import
resolution (Python: dotted-module index) and derive OVERRIDES edges from
the EXTENDS+DEFINES hierarchy. Wired into the analysis pipeline. Improves
the graph for all consumers (HTTP API + MCP) and feeds search_code centrality.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 api/analyzers/analyzer.py        |  33 ++++++++++
 api/analyzers/python/analyzer.py | 104 +++++++++++++++++++++++++++++++
 api/analyzers/source_analyzer.py |  30 +++++++++
 api/graph.py                     |  34 ++++++++++
 4 files changed, 201 insertions(+)

diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py
index 63202851..53c1e36a 100644
--- a/api/analyzers/analyzer.py
+++ b/api/analyzers/analyzer.py
@@ -70,6 +70,39 @@ def needs_lsp(self) -> bool:
         """
         return True
         
+    def build_import_index(self, files: dict[Path, File], root: Path) -> object:
+        """
+        Build a language-specific index used to resolve import statements to
+        in-repo files. Returns an opaque structure consumed by
+        ``resolve_imports``. Default: no import resolution for this language.
+
+        Args:
+            files (dict[Path, File]): All parsed files keyed by absolute path.
+            root (Path): The analyzed repository root.
+
+        Returns:
+            object: Opaque index, or ``None`` when unsupported.
+        """
+
+        return None
+
+    def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
+        """
+        Resolve the import statements of ``file`` to the in-repo files they
+        depend on. Purely syntactic by default (no LSP). Each returned File is
+        connected to ``file`` with an ``IMPORTS`` edge by the orchestrator.
+
+        Args:
+            file (File): The importing file (already parsed; ``file.tree`` set).
+            root (Path): The analyzed repository root.
+            index (object): The structure returned by ``build_import_index``.
+
+        Returns:
+            list[File]: In-repo files imported by ``file`` (deduped, self excluded).
+        """
+
+        return []
+
     @abstractmethod
     def add_dependencies(self, path: Path, files: list[Path]):
         """
diff --git a/api/analyzers/python/analyzer.py b/api/analyzers/python/analyzer.py
index 8cdfe96e..8f2edf8b 100644
--- a/api/analyzers/python/analyzer.py
+++ b/api/analyzers/python/analyzer.py
@@ -136,6 +136,110 @@ def add_symbols(self, entity: Entity) -> None:
     def is_dependency(self, file_path: str) -> bool:
         return "venv" in file_path
 
+    def _module_parts(self, file_path: Path, root: Path) -> Optional[list[str]]:
+        """Dotted module path components for ``file_path`` relative to ``root``."""
+        try:
+            rel = file_path.relative_to(root)
+        except ValueError:
+            return None
+        parts = list(rel.with_suffix('').parts)
+        if parts and parts[-1] == '__init__':
+            parts = parts[:-1]
+        return parts
+
+    def build_import_index(self, files: dict[Path, File], root: Path) -> object:
+        """Index in-repo files by dotted module name.
+
+        Two maps: ``exact`` keyed by the full dotted path from ``root`` and
+        ``suffix`` keyed by every trailing sub-path (first file wins). The
+        suffix map tolerates ``src/``/``lib/`` layouts where the import name
+        (``matplotlib.axes``) differs from the path-from-root
+        (``lib.matplotlib.axes``).
+        """
+        exact: dict[str, File] = {}
+        suffix: dict[str, File] = {}
+        for fpath, file in files.items():
+            if self.is_dependency(str(fpath)):
+                continue
+            parts = self._module_parts(fpath, root)
+            if not parts:
+                continue
+            exact.setdefault('.'.join(parts), file)
+            for i in range(len(parts)):
+                suffix.setdefault('.'.join(parts[i:]), file)
+        return {'exact': exact, 'suffix': suffix}
+
+    def _resolve_dotted(self, dotted: str, index: dict) -> Optional[File]:
+        if not dotted:
+            return None
+        f = index['exact'].get(dotted) or index['suffix'].get(dotted)
+        if f is None and '.' in dotted:
+            # imported name may be a symbol inside a module; drop the last part.
+            parent = dotted.rsplit('.', 1)[0]
+            f = index['exact'].get(parent) or index['suffix'].get(parent)
+        return f
+
+    def _import_requests(self, file: File) -> list[tuple[str, int]]:
+        """Extract (dotted, level) resolution requests from import statements."""
+        requests: list[tuple[str, int]] = []
+        captures = self._captures(
+            "(import_statement) @i (import_from_statement) @f",
+            file.tree.root_node,
+        )
+        for node in captures.get('i', []):
+            for child in node.named_children:
+                target = child
+                if child.type == 'aliased_import':
+                    target = child.child_by_field_name('name')
+                if target is not None and target.type == 'dotted_name':
+                    requests.append((target.text.decode('utf-8'), 0))
+        for node in captures.get('f', []):
+            module = node.child_by_field_name('module_name')
+            level = 0
+            base = ''
+            if module is not None:
+                if module.type == 'relative_import':
+                    prefix = next((c for c in module.children if c.type == 'import_prefix'), None)
+                    level = len(prefix.text.decode('utf-8')) if prefix is not None else 1
+                    dotted_part = next((c for c in module.named_children if c.type == 'dotted_name'), None)
+                    base = dotted_part.text.decode('utf-8') if dotted_part is not None else ''
+                else:
+                    base = module.text.decode('utf-8')
+            requests.append((base, level))
+            for name_node in node.children_by_field_name('name'):
+                leaf = name_node
+                if name_node.type == 'aliased_import':
+                    leaf = name_node.child_by_field_name('name')
+                if leaf is not None:
+                    name_txt = leaf.text.decode('utf-8')
+                    requests.append((f"{base}.{name_txt}" if base else name_txt, level))
+        return requests
+
+    def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
+        if not index:
+            return []
+        package_parts = self._module_parts(file.path, root)
+        if package_parts is None:
+            return []
+        # Package of the importing file = its parent dotted path.
+        package_parts = package_parts[:-1] if package_parts else []
+        seen: set[Path] = set()
+        targets: list[File] = []
+        for dotted, level in self._import_requests(file):
+            if level:
+                base = package_parts[: len(package_parts) - (level - 1)] if level > 1 else list(package_parts)
+                full = '.'.join([*base, dotted]) if dotted else '.'.join(base)
+            else:
+                full = dotted
+            resolved = self._resolve_dotted(full, index)
+            if resolved is None or resolved.path == file.path or resolved.path in seen:
+                continue
+            if self.is_dependency(str(resolved.path)):
+                continue
+            seen.add(resolved.path)
+            targets.append(resolved)
+        return targets
+
     def _extract_type_target(self, node: Node) -> Optional[Node]:
         if node.type == 'attribute':
             return node.child_by_field_name('attribute')
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
index 49dd00b3..d22367d3 100644
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -183,9 +183,33 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
                             elif key == "parameters":
                                 graph.connect_entities("PARAMETERS", entity.id, resolved.id)
 
+    def link_imports(self, graph: Graph, root: Path) -> None:
+        """Add ``IMPORTS`` edges (File -> File) via per-language resolution.
+
+        Purely syntactic for Python (no LSP), so this runs after ``first_pass``
+        once every file has a graph id. Languages whose analyzer does not
+        implement import resolution are silently skipped.
+        """
+        indices: dict[str, object] = {}
+        for file_path, file in self.files.items():
+            analyzer = analyzers.get(file_path.suffix)
+            if analyzer is None:
+                continue
+            if file_path.suffix not in indices:
+                indices[file_path.suffix] = analyzer.build_import_index(self.files, root)
+            index = indices[file_path.suffix]
+            if not index:
+                continue
+            for target in analyzer.resolve_imports(file, root, index):
+                if getattr(file, "id", None) is None or getattr(target, "id", None) is None:
+                    continue
+                graph.connect_entities("IMPORTS", file.id, target.id)
+
     def analyze_files(self, files: list[Path], path: Path, graph: Graph) -> None:
         self.first_pass(path, files, [], graph)
+        self.link_imports(graph, path)
         self.second_pass(graph, files, path)
+        graph.derive_overrides()
 
     def analyze_sources(self, path: Path, ignore: list[str], graph: Graph) -> None:
         path = path.resolve()
@@ -193,9 +217,15 @@ def analyze_sources(self, path: Path, ignore: list[str], graph: Graph) -> None:
         # First pass analysis of the source code
         self.first_pass(path, files, ignore, graph)
 
+        # Link import edges (syntactic, language-specific, no LSP)
+        self.link_imports(graph, path)
+
         # Second pass analysis of the source code
         self.second_pass(graph, files, path)
 
+        # Derive override edges from the resolved class hierarchy
+        graph.derive_overrides()
+
     def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
         """
         Analyze path.
diff --git a/api/graph.py b/api/graph.py
index eda72e63..8b5000a0 100644
--- a/api/graph.py
+++ b/api/graph.py
@@ -485,6 +485,40 @@ def connect_entities(self, relation: str, src_id: int, dest_id: int, properties:
         params = {'src_id': src_id, 'dest_id': dest_id, "properties": properties}
         self._query(q, params)
 
+    def derive_overrides(self, max_depth: int = 3) -> int:
+        """
+        Derive ``OVERRIDES`` edges from the existing class hierarchy.
+
+        A method ``m`` on a subclass overrides method ``m2`` on an ancestor
+        class when they share a name. Pure graph derivation over existing
+        ``EXTENDS`` + ``DEFINES`` edges, so it is language-agnostic. The edge
+        carries ``depth`` (inheritance distance) for downstream filtering.
+
+        Args:
+            max_depth (int): Maximum inheritance distance to bridge.
+
+        Returns:
+            int: Number of OVERRIDES edges after derivation.
+        """
+
+        q = f"""MATCH (sub:Class)-[x:EXTENDS*1..{int(max_depth)}]->(sup:Class)
+                WHERE ID(sub) <> ID(sup)
+                WITH DISTINCT sub, sup, length(x) AS depth
+                MATCH (sub)-[:DEFINES]->(m:Function)
+                MATCH (sup)-[:DEFINES]->(m2:Function)
+                WHERE m.name = m2.name AND ID(m) <> ID(m2)
+                MERGE (m)-[e:OVERRIDES]->(m2)
+                ON CREATE SET e.depth = depth"""
+
+        try:
+            self._query(q)
+        except Exception as exc:  # noqa: BLE001 — derivation is best-effort
+            logging.warning("derive_overrides failed: %s", exc)
+            return 0
+
+        res = self._query("MATCH ()-[e:OVERRIDES]->() RETURN count(e)").result_set
+        return int(res[0][0]) if res else 0
+
     def function_calls_function(self, caller_id: int, callee_id: int, pos: int) -> None:
         """
         Establish a 'CALLS' relationship between two function nodes.

From 8fa2a43aae2bdf12d7e5b128cbfed9392ef3f452 Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Fri, 5 Jun 2026 17:46:24 +0300
Subject: [PATCH 2/2] =?UTF-8?q?fix(analyzer):=20correct=20tree-sitter=20re?=
 =?UTF-8?q?solver=20name=E2=86=92def=20pairing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-module symbol table in `_index_file` was built by zipping two
independently-grouped `QueryCursor.captures()` lists (`@name` and `@def`).
When `@def` positions shift relative to `@name` (e.g. decorated defs), the
zip mis-pairs names with definitions, so imported-call resolution attaches
CALLS edges to the wrong target — producing phantom edges to functions whose
token never appears at the call site.

Fix: iterate per-match via a `_matches()` helper wrapping
`QueryCursor.matches()`, which guarantees each match's `@name`/`@def`
captures belong together. Applied across all four indexing loops
(top-level funcs, classes, assigns, class methods).

Impact (deterministic graph-vs-jedi-oracle caller bench, n=40, paired,
identical harness — only the resolver differs):
  uxarray CALLS macro-F1 0.178 → 0.713 (median 0.0 → 0.94)
  arkouda CALLS macro-F1 0.031 → 0.262

Adds two regression tests asserting each imported call resolves to the def
whose name matches exactly (10 top-level functions, 8 classes).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 api/analyzers/python/ts_resolver.py        | 71 ++++++++++++++--------
 tests/analyzers/test_ts_python_resolver.py | 52 ++++++++++++++++
 2 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/api/analyzers/python/ts_resolver.py b/api/analyzers/python/ts_resolver.py
index d6b60c79..e20fd89b 100644
--- a/api/analyzers/python/ts_resolver.py
+++ b/api/analyzers/python/ts_resolver.py
@@ -167,6 +167,21 @@ def _captures(query, root: Node) -> dict[str, list[Node]]:
     return cursor.captures(root)
 
 
+def _matches(query, root: Node) -> list[tuple[int, dict[str, list[Node]]]]:
+    """Return per-match capture groups.
+
+    Unlike :func:`_captures` (which groups *all* nodes by capture name into
+    parallel lists that are **not** guaranteed to be index-aligned across
+    different capture names), this yields one dict per match so that, e.g.,
+    a ``@name`` capture is always paired with the ``@def`` capture from the
+    *same* match. Zipping the two independent lists from ``captures()`` mis-
+    aligns names and definitions whenever the per-capture node orderings
+    diverge, scrambling the module symbol table.
+    """
+    cursor = QueryCursor(query)
+    return cursor.matches(root)
+
+
 # ---------------------------------------------------------------------------
 # Public resolver
 # ---------------------------------------------------------------------------
@@ -217,46 +232,50 @@ def _ensure_built(self, files: dict[Path, File], project_root: Path) -> None:
 
     def _index_file(self, mi: _ModuleIndex, root: Node) -> None:
         # Top-level functions
-        caps = _captures(self._queries.top_level_func, root)
-        names = caps.get("name", [])
-        defs = caps.get("def", [])
-        for name_node, def_node in zip(names, defs):
-            name = name_node.text.decode("utf-8")
-            d = _Definition(mi.file_path, _strip_decorator(def_node), "func")
+        for _, caps in _matches(self._queries.top_level_func, root):
+            name_nodes = caps.get("name", [])
+            def_nodes = caps.get("def", [])
+            if not name_nodes or not def_nodes:
+                continue
+            name = name_nodes[0].text.decode("utf-8")
+            d = _Definition(mi.file_path, _strip_decorator(def_nodes[0]), "func")
             mi.top_level[name] = d
             self._by_name[name].append(d)
 
         # Top-level classes
-        caps = _captures(self._queries.top_level_class, root)
-        names = caps.get("name", [])
-        defs = caps.get("def", [])
-        for name_node, def_node in zip(names, defs):
-            name = name_node.text.decode("utf-8")
-            d = _Definition(mi.file_path, _strip_decorator(def_node), "class")
+        for _, caps in _matches(self._queries.top_level_class, root):
+            name_nodes = caps.get("name", [])
+            def_nodes = caps.get("def", [])
+            if not name_nodes or not def_nodes:
+                continue
+            name = name_nodes[0].text.decode("utf-8")
+            d = _Definition(mi.file_path, _strip_decorator(def_nodes[0]), "class")
             mi.top_level[name] = d
             self._by_name[name].append(d)
 
         # Top-level assignments (for class aliases like ``Foo = OtherFoo``)
-        caps = _captures(self._queries.top_level_assign, root)
-        names = caps.get("name", [])
-        defs = caps.get("def", [])
-        for name_node, def_node in zip(names, defs):
-            name = name_node.text.decode("utf-8")
+        for _, caps in _matches(self._queries.top_level_assign, root):
+            name_nodes = caps.get("name", [])
+            def_nodes = caps.get("def", [])
+            if not name_nodes or not def_nodes:
+                continue
+            name = name_nodes[0].text.decode("utf-8")
             if name in mi.top_level:
                 continue
-            d = _Definition(mi.file_path, def_node, "var")
+            d = _Definition(mi.file_path, def_nodes[0], "var")
             mi.top_level[name] = d
             self._by_name[name].append(d)
 
         # Class methods
-        caps = _captures(self._queries.class_methods, root)
-        class_names = caps.get("class_name", [])
-        method_names = caps.get("method_name", [])
-        method_defs = caps.get("method_def", [])
-        for cls_node, mname_node, mdef_node in zip(class_names, method_names, method_defs):
-            class_name = cls_node.text.decode("utf-8")
-            method_name = mname_node.text.decode("utf-8")
-            d = _Definition(mi.file_path, _strip_decorator(mdef_node), "method")
+        for _, caps in _matches(self._queries.class_methods, root):
+            class_nodes = caps.get("class_name", [])
+            mname_nodes = caps.get("method_name", [])
+            mdef_nodes = caps.get("method_def", [])
+            if not class_nodes or not mname_nodes or not mdef_nodes:
+                continue
+            class_name = class_nodes[0].text.decode("utf-8")
+            method_name = mname_nodes[0].text.decode("utf-8")
+            d = _Definition(mi.file_path, _strip_decorator(mdef_nodes[0]), "method")
             mi.class_methods.setdefault(class_name, {})[method_name] = d
             self._by_name[method_name].append(d)
 
diff --git a/tests/analyzers/test_ts_python_resolver.py b/tests/analyzers/test_ts_python_resolver.py
index 2e8d3621..88d0d7e1 100644
--- a/tests/analyzers/test_ts_python_resolver.py
+++ b/tests/analyzers/test_ts_python_resolver.py
@@ -227,6 +227,58 @@ def test_resolver_unknown_name_returns_empty(tmp_path: Path):
     assert r.resolve(files, mod, tmp_path.resolve(), name) == []
 
 
+def test_resolver_many_defs_name_def_alignment(tmp_path: Path):
+    """Regression for the scrambled module symbol table.
+
+    With several top-level definitions in one module, pairing the ``@name``
+    and ``@def`` captures by zipping two independently-grouped lists mis-
+    aligned names with definitions (e.g. an imported ``arange`` call resolved
+    to the ``array`` def node). Each imported call must resolve to the def
+    whose name actually matches the call name.
+    """
+    lib_src = "".join(f"def fn_{i}():\n    return {i}\n\n" for i in range(10))
+    import_line = "from lib import " + ", ".join(f"fn_{i}" for i in range(10))
+    call_lines = "\n".join(f"    fn_{i}()" for i in range(10))
+    app_src = f"{import_line}\n\ndef use():\n{call_lines}\n"
+    files = _make_project(tmp_path, {"lib.py": lib_src, "app.py": app_src})
+    r = TreeSitterPythonResolver(_PY)
+    app_path = (tmp_path / "app.py").resolve()
+    lib_path = (tmp_path / "lib.py").resolve()
+    root = files[app_path].tree.root_node
+    for i in range(10):
+        call = _find_call_node(root, f"fn_{i}(")
+        out = r.resolve(
+            files, app_path, tmp_path.resolve(), call.child_by_field_name("function")
+        )
+        assert len(out) == 1, f"fn_{i} did not resolve uniquely"
+        file, def_node = out[0]
+        assert file.path == lib_path
+        resolved_name = def_node.child_by_field_name("name").text.decode("utf-8")
+        assert resolved_name == f"fn_{i}", (
+            f"call fn_{i} resolved to wrong def {resolved_name}"
+        )
+
+
+def test_resolver_many_classes_name_def_alignment(tmp_path: Path):
+    """Same alignment regression for top-level classes."""
+    lib_src = "".join(f"class Cls{i}:\n    pass\n\n" for i in range(8))
+    import_line = "from lib import " + ", ".join(f"Cls{i}" for i in range(8))
+    body = "\n".join(f"    Cls{i}()" for i in range(8))
+    app_src = f"{import_line}\n\ndef use():\n{body}\n"
+    files = _make_project(tmp_path, {"lib.py": lib_src, "app.py": app_src})
+    r = TreeSitterPythonResolver(_PY)
+    app_path = (tmp_path / "app.py").resolve()
+    root = files[app_path].tree.root_node
+    for i in range(8):
+        call = _find_call_node(root, f"Cls{i}(")
+        out = r.resolve(
+            files, app_path, tmp_path.resolve(), call.child_by_field_name("function")
+        )
+        assert len(out) == 1
+        resolved_name = out[0][1].child_by_field_name("name").text.decode("utf-8")
+        assert resolved_name == f"Cls{i}"
+
+
 # ---------------------------------------------------------------------------
 # PythonAnalyzer integration via env var
 # ---------------------------------------------------------------------------