mandiant · saniyafatima07 · Jun 10, 2026 · Mar 18, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini
@@ -86,3 +86,6 @@ ignore_missing_imports = True
 
 [mypy-ghidra.*]
 ignore_missing_imports = True
+
+[mypy-tree_sitter.*]
+ignore_missing_imports = True
diff --git a/.github/pyinstaller/hooks/hook-capa.features.extractors.ts.signatures.py b/.github/pyinstaller/hooks/hook-capa.features.extractors.ts.signatures.py
@@ -0,0 +1,20 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from PyInstaller.utils.hooks import collect_data_files
+
+
+# Tree-sitter signature lookups use importlib.resources, so PyInstaller must
+# bundle the JSON files alongside the package.
+#
+# `datas` is the hook-level global that PyInstaller reads to learn which
+# data files to ship with the frozen application; collect_data_files()
+# gathers the data files found in the named package.
+datas = collect_data_files("capa.features.extractors.ts.signatures")
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -145,6 +145,7 @@ This release includes Ghidra PyGhidra support, performance improvements, depende
 
 ### New Features
 
+- Tree-Sitter Script Analysis
 - ghidra: support PyGhidra @mike-hunhoff #2788
 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835
 

diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py
@@ -205,7 +205,7 @@ def find_static_capabilities(
                     match_count += len(matches_)
 
             logger.debug(
-                "analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
+                "analyzed function %s and extracted %d features, %d matches in %0.02fs",
                 f.address,
                 code_capabilities.feature_count,
                 match_count,

diff --git a/capa/features/address.py b/capa/features/address.py
@@ -159,6 +159,26 @@ def __hash__(self):
         return int.__hash__(self)
 
 
+class FileOffsetRangeAddress(Address):
+    """
+    an address range relative to the start of a file.
+
+    the range is described by two byte offsets, `start_byte` and `end_byte`.
+    instances compare, sort, and hash by the pair (start_byte, end_byte).
+    """
+
+    def __init__(self, start_byte: int, end_byte: int):
+        self.start_byte = start_byte
+        self.end_byte = end_byte
+
+    def __eq__(self, other):
+        # comparing against something that is not a range (another kind of address, None, ...)
+        # must not fail with AttributeError on the missing attributes;
+        # returning NotImplemented lets Python try the other operand instead.
+        if not isinstance(other, FileOffsetRangeAddress):
+            return NotImplemented
+        return (self.start_byte, self.end_byte) == (other.start_byte, other.end_byte)
+
+    def __lt__(self, other):
+        # ordering is only defined between two ranges; for any other type defer to
+        # the other operand, which ends in TypeError if it cannot compare either.
+        if not isinstance(other, FileOffsetRangeAddress):
+            return NotImplemented
+        return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte)
+
+    def __hash__(self):
+        # hash the same pair that __eq__ compares, so equal ranges hash equally
+        return hash((self.start_byte, self.end_byte))
+
+    def __repr__(self):
+        return f"file(0x{self.start_byte:x}, 0x{self.end_byte:x})"
+
+
 class DNTokenAddress(int, Address):
     """a .NET token"""
 

diff --git a/capa/features/common.py b/capa/features/common.py
@@ -487,10 +487,18 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
         return Result(False, self, [])
 
 
+class ScriptLanguage(Feature):
+    """feature whose string value names a script language"""
+
+    def __init__(self, value: str, description=None):
+        super().__init__(value, description=description)
+        # assigned after the base initializer has run, so this is the name that remains set
+        self.name = "script language"
+
+
 FORMAT_PE = "pe"
 FORMAT_ELF = "elf"
+FORMAT_SCRIPT = "script"
 FORMAT_DOTNET = "dotnet"
-VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
+# formats accepted as valid (the FORMAT_* names that follow this tuple are internal only)
+VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_SCRIPT)
 # internal only, not to be used in rules
 FORMAT_AUTO = "auto"
 FORMAT_SC32 = "sc32"
@@ -508,6 +516,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
     FORMAT_PE,
     FORMAT_ELF,
     FORMAT_DOTNET,
+    FORMAT_SCRIPT,
     FORMAT_FREEZE,
     FORMAT_RESULT,
     FORMAT_BINEXPORT2,

diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py
@@ -21,6 +21,7 @@
 
 import pefile
 
+import capa.features
 import capa.features.extractors.elf
 import capa.features.extractors.pefile
 import capa.features.extractors.strings
@@ -29,20 +30,20 @@
     OS_ANY,
     OS_AUTO,
     ARCH_ANY,
-    VALID_OS,
     FORMAT_PE,
     FORMAT_ELF,
     OS_WINDOWS,
-    VALID_ARCH,
     FORMAT_FREEZE,
     FORMAT_RESULT,
+    FORMAT_SCRIPT,
     Arch,
     Format,
     String,
     Feature,
 )
 from capa.features.freeze import is_freeze
 from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress
+from capa.features.extractors.ts.autodetect import is_script
 
 logger = logging.getLogger(__name__)
 
@@ -53,7 +54,7 @@
 MATCH_JSON_OBJECT = b'{"'
 
 
-def extract_file_strings(buf: bytes) -> Iterator[tuple[String, Address]]:
+def extract_file_strings(buf: bytes, **kwargs) -> Iterator[tuple[String, Address]]:
     """
     extract ASCII and UTF-16 LE strings from file
     """
@@ -78,6 +79,8 @@ def extract_format(buf: bytes) -> Iterator[tuple[Feature, Address]]:
         # we don't know what it is exactly, but may support it (e.g. a dynamic CAPE sandbox report)
         # skip verdict here and let subsequent code analyze this further
         return
+    elif is_script(buf):
+        yield Format(FORMAT_SCRIPT), NO_ADDRESS
     else:
         # we likely end up here:
         #  1. handling a file format (e.g. macho)
@@ -98,7 +101,7 @@ def extract_arch(buf) -> Iterator[tuple[Feature, Address]]:
         with contextlib.closing(io.BytesIO(buf)) as f:
             arch = capa.features.extractors.elf.detect_elf_arch(f)
 
-        if arch not in VALID_ARCH:
+        if arch not in capa.features.common.VALID_ARCH:
             logger.debug("unsupported arch: %s", arch)
             return
 
@@ -115,10 +118,7 @@ def extract_arch(buf) -> Iterator[tuple[Feature, Address]]:
         # rules that rely on arch conditions will fail to match on shellcode.
         #
         # for (2), this logic will need to be updated as the format is implemented.
-        logger.debug(
-            "unsupported file format: %s, will not guess Arch",
-            binascii.hexlify(buf[:4]).decode("ascii"),
-        )
+        logger.debug("unsupported file format: %s, will not guess Arch", binascii.hexlify(buf[:4]).decode("ascii"))
         return
 
 
@@ -135,7 +135,7 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[tuple[Feature, Address]]:
         with contextlib.closing(io.BytesIO(buf)) as f:
             os = capa.features.extractors.elf.detect_elf_os(f)
 
-        if os not in VALID_OS:
+        if os not in capa.features.common.VALID_OS:
             logger.debug("unsupported os: %s", os)
             return
 
@@ -150,8 +150,5 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[tuple[Feature, Address]]:
         # rules that rely on OS conditions will fail to match on shellcode.
         #
         # for (2), this logic will need to be updated as the format is implemented.
-        logger.debug(
-            "unsupported file format: %s, will not guess OS",
-            binascii.hexlify(buf[:4]).decode("ascii"),
-        )
+        logger.debug("unsupported file format: %s, will not guess OS", binascii.hexlify(buf[:4]).decode("ascii"))
         return
diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py
@@ -0,0 +1,55 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple, Iterator
+
+from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage
+from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress
+
+# Language identifiers.
+# Can be used to instantiate tree_sitter Language objects (see ts/query.py)
+LANG_CS = "c_sharp"
+LANG_HTML = "html"
+LANG_JS = "javascript"
+LANG_PY = "python"
+LANG_TEM = "embedded_template"
+
+# File-name endings associated with a language, written without the leading dot.
+# NOTE(review): the trailing-underscore variants presumably cover renamed/defanged
+# samples - confirm.
+EXT_ASPX = ("aspx", "aspx_")
+EXT_CS = ("cs", "cs_")
+EXT_HTML = ("html", "html_")
+EXT_PY = ("py", "py_")
+
+
+# Human-readable language names keyed by the LANG_* identifiers above;
+# extract_language() uses these as the value of the ScriptLanguage feature.
+LANGUAGE_FEATURE_FORMAT = {
+    LANG_CS: "C#",
+    LANG_HTML: "HTML",
+    LANG_JS: "JavaScript",
+    LANG_PY: "Python",
+    LANG_TEM: "Embedded Template",
+}
+
+
+def extract_arch() -> Iterator[Tuple[Feature, Address]]:
+    """yield the single arch feature for a script: ARCH_ANY, located at NO_ADDRESS"""
+    arch_feature = Arch(ARCH_ANY)
+    yield arch_feature, NO_ADDRESS
+
+
+def extract_language(language: str, addr: FileOffsetRangeAddress) -> Iterator[Tuple[Feature, Address]]:
+    """
+    yield a ScriptLanguage feature for `language`, located at `addr`.
+
+    `language` must be a key of LANGUAGE_FEATURE_FORMAT; the feature value is the
+    display name stored there. an unknown key raises KeyError when the generator is consumed.
+    """
+    display_name = LANGUAGE_FEATURE_FORMAT[language]
+    yield ScriptLanguage(display_name), addr
+
+
+def extract_os() -> Iterator[Tuple[Feature, Address]]:
+    """yield the single OS feature for a script: OS_ANY, located at NO_ADDRESS"""
+    os_feature = OS(OS_ANY)
+    yield os_feature, NO_ADDRESS
+
+
+def extract_format() -> Iterator[Tuple[Feature, Address]]:
+    """yield the single format feature for a script: FORMAT_SCRIPT, located at NO_ADDRESS"""
+    format_feature = Format(FORMAT_SCRIPT)
+    yield format_feature, NO_ADDRESS
diff --git a/capa/features/extractors/ts/__init__.py b/capa/features/extractors/ts/__init__.py
diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py
@@ -0,0 +1,80 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+from pathlib import Path
+
+from tree_sitter import Node, Tree, Query, Parser, Language, QueryCursor
+
+from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML
+from capa.features.extractors.ts.query import TS_LANGUAGES
+
+
+def is_script(buf: bytes) -> bool:
+    """
+    report whether `buf` is recognized as a script.
+
+    True when get_language_ts() identifies a language for the buffer,
+    False when it raises ValueError.
+    """
+    try:
+        language = get_language_ts(buf)
+    except ValueError:
+        return False
+    return bool(language)
+
+
+def _parse(ts_language: Language, buf: bytes) -> Optional[Tree]:
+    """
+    parse `buf` with a parser built for `ts_language`.
+
+    returns the resulting tree, or None when constructing the parser or parsing raises ValueError.
+    """
+    try:
+        tree = Parser(ts_language).parse(buf)
+    except ValueError:
+        return None
+    return tree
+
+
+def _contains_errors(ts_language, node: Node) -> bool:
+    """
+    report whether the query `(ERROR) @error` captures anything when run against `node`.
+
+    NOTE(review): this pattern names ERROR nodes only; whether MISSING nodes should
+    also count as a parse failure is not decided here - confirm.
+    """
+    error_query = Query(ts_language, "(ERROR) @error")
+    captured = QueryCursor(error_query).captures(node)
+    return bool(captured)
+
+
+def _first_error_free_language(buf: bytes, skip: tuple = ()) -> str:
+    """
+    return the first language in TS_LANGUAGES whose grammar parses `buf` without errors.
+
+    languages are tried in the iteration order of TS_LANGUAGES, so when more than one
+    grammar accepts the buffer the earliest entry wins. identifiers listed in `skip`
+    are not tried at all.
+
+    raises ValueError when no candidate grammar parses the buffer cleanly.
+    """
+    for language, ts_language in TS_LANGUAGES.items():
+        if language in skip:
+            continue
+        tree = _parse(ts_language, buf)
+        if tree and not _contains_errors(ts_language, tree.root_node):
+            return language
+    raise ValueError("failed to parse the language")
+
+
+def get_language_ts(buf: bytes) -> str:
+    """
+    detect the language of `buf` by trying every grammar in TS_LANGUAGES.
+
+    raises ValueError when none of them parses the buffer without errors.
+    """
+    return _first_error_free_language(buf)
+
+
+def get_template_language_ts(buf: bytes) -> str:
+    """
+    detect the language of `buf`, never answering LANG_TEM or LANG_HTML.
+
+    same detection as get_language_ts(), with those two grammars left out of the candidates.
+    raises ValueError when none of the remaining grammars parses the buffer without errors.
+    """
+    return _first_error_free_language(buf, skip=(LANG_TEM, LANG_HTML))
+
+
+def get_language_from_ext(path: str) -> str:
+    """
+    map a file path to a LANG_* identifier using only its file extension.
+
+    the extension groups are checked in a fixed order: EXT_ASPX, EXT_CS, EXT_HTML, EXT_PY.
+
+    raises ValueError when the path does not end in one of the known extensions.
+    """
+    known_extensions = (
+        (EXT_ASPX, LANG_TEM),
+        (EXT_CS, LANG_CS),
+        (EXT_HTML, LANG_HTML),
+        (EXT_PY, LANG_PY),
+    )
+    for extensions, language in known_extensions:
+        # the EXT_* entries carry no leading dot; require the dot here so that only a
+        # real extension matches, not any name that merely ends in the same letters
+        # (otherwise "docs" would be taken for C# and "happy" for Python).
+        if path.endswith(tuple("." + ext for ext in extensions)):
+            return language
+    raise ValueError(f"{path} has an unrecognized or an unsupported extension.")
+
+
+def get_language(path: Path) -> str:
+    """
+    determine the language of the file at `path`.
+
+    the file content is read as bytes and handed to get_language_ts(); when reading
+    or detection raises ValueError, the answer is taken from the file extension via
+    get_language_from_ext() instead, whose own ValueError is not caught here.
+    """
+    try:
+        return get_language_ts(path.read_bytes())
+    except ValueError:
+        return get_language_from_ext(str(path))