From 612b04fcdb555b300052c9729db7175bff999743 Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Thu, 28 May 2026 09:16:11 +0300
Subject: [PATCH] perf(analyzers): memoise compiled tree-sitter queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AbstractAnalyzer._captures was recompiling its query string on every
call. cProfile on pytest-dev/pytest-6202 (204 files) showed
tree_sitter.Language.query consuming 3.03s of the 6.36s first_pass —
~48% of analyzer time spent rebuilding queries that never change.

Cache them on the analyzer instance, keyed by pattern string. Also
switches from the deprecated language.query() to the
Query(language, pattern) constructor.

Wall-time on pytest-6202 (CODE_GRAPH_PY_RESOLVER=tree_sitter):
  before: 6.9s
  after: 3.7s

Benefits every tree-sitter analyzer (Python, JavaScript, Kotlin),
not just the new static resolver.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 api/analyzers/analyzer.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py
index 63202851..137a4478 100644
--- a/api/analyzers/analyzer.py
+++ b/api/analyzers/analyzer.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from typing import Optional
 
-from tree_sitter import Language, Node, Parser, Point, QueryCursor
+from tree_sitter import Language, Node, Parser, Point, Query, QueryCursor
 from api.entities.entity import Entity
 from api.entities.file import File
 from abc import ABC, abstractmethod
@@ -11,11 +11,20 @@ class AbstractAnalyzer(ABC):
     def __init__(self, language: Language) -> None:
         self.language = language
         self.parser = Parser(language)
+        # Memoise compiled queries; tree-sitter query compilation is ~370us
+        # each and adds up to seconds on large repos.
+        self._query_cache: dict[str, Query] = {}
+
+    def _get_query(self, pattern: str) -> Query:
+        q = self._query_cache.get(pattern)
+        if q is None:
+            q = Query(self.language, pattern)
+            self._query_cache[pattern] = q
+        return q
 
     def _captures(self, pattern: str, node: Node) -> dict:
         """Run a tree-sitter query and return captures dict."""
-        query = self.language.query(pattern)
-        cursor = QueryCursor(query)
+        cursor = QueryCursor(self._get_query(pattern))
         return cursor.captures(node)
 
     def find_parent(self, node: Node, parent_types: list) -> Node: