NVIDIA · wernerkasselman-au · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/README.md b/README.md
@@ -483,6 +483,7 @@ The tool requires outbound HTTPS access to `api.osv.dev` for live vulnerability
 - **Encrypted/binary code**: Cannot analyze compiled or encrypted content
 - **Runtime behavior**: Static analysis only, no dynamic execution
 - **Offline SC4**: Without network access to `api.osv.dev`, SC4 uses a small static fallback list
+- **Per-file size limit**: Files larger than 50 MiB fail the scan closed rather than being silently skipped, so oversized content cannot hide from analysis. This is a per-file *analysis* limit, not an ingest limit: the upstream url/zip/git fetch in `input_handler` is not yet bounded, so a very large download or a decompression bomb can exhaust memory or disk before the per-file gate runs (tracked separately). Raising the limit from the previous 1 MB means mid-size files that used to be skipped are now analyzed and, with `--llm`, chunked and sent to the model, which improves detection but increases resource and token cost.
 
 ## Research Background
 

diff --git a/src/skillspector/llm_analyzer_base.py b/src/skillspector/llm_analyzer_base.py
@@ -276,9 +276,22 @@ def get_batches(
             for f in findings:
                 findings_by_file[f.file].append(f)
 
+        # Fail closed on oversized cached content. A programmatic caller can
+        # build a file_cache directly (the semantic_* analyzers call this with
+        # the build_context cache, but nothing stops a caller injecting their
+        # own), so without this guard content above the per-file byte limit
+        # would be chunked and sent to the LLM, bypassing the same fail-closed
+        # gate the static/AST/taint/YARA entry points enforce. Imported lazily:
+        # a module-level import would cycle, since the analyzers package __init__
+        # imports the semantic_* analyzers, which import this module.
+        from skillspector.nodes.analyzers.static_runner import raise_if_content_exceeds_limit
+
         batches: list[Batch] = []
         for path in file_paths:
-            content = file_cache.get(path) or "No content available for this file."
+            cached = file_cache.get(path)
+            if cached is not None:
+                raise_if_content_exceeds_limit(path, cached)
+            content = cached or "No content available for this file."
             file_findings = findings_by_file.get(path, [])
 
             extra = self._estimate_extra_overhead(file_findings)

diff --git a/src/skillspector/nodes/analyzers/behavioral_ast.py b/src/skillspector/nodes/analyzers/behavioral_ast.py
@@ -24,7 +24,7 @@
 from skillspector.state import AnalyzerNodeResponse, SkillspectorState
 
 from .common import get_context_from_lines, get_source_segment, resolve_call_name
-from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
+from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit
 
 ANALYZER_ID = "behavioral_ast"
 logger = get_logger(__name__)
@@ -216,8 +216,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
         if not path.endswith(".py"):
             continue
         content = file_cache.get(path)
-        if content is None or len(content) > MAX_FILE_BYTES:
+        if content is None:
             continue
+        raise_if_content_exceeds_limit(path, content)
         raw = _analyze_python(content, path)
         all_findings.extend(analyzer_finding_to_finding(af) for af in raw)
 

diff --git a/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py b/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py
@@ -36,7 +36,7 @@
     resolve_call_name_typed,
     resolve_dotted_name,
 )
-from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
+from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit
 
 ANALYZER_ID = "behavioral_taint_tracking"
 logger = get_logger(__name__)
@@ -400,8 +400,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
         if not path.endswith(".py"):
             continue
         content = file_cache.get(path)
-        if content is None or len(content) > MAX_FILE_BYTES:
+        if content is None:
             continue
+        raise_if_content_exceeds_limit(path, content)
         raw = _analyze_python(content, path)
         all_findings.extend(analyzer_finding_to_finding(af) for af in raw)
 

diff --git a/src/skillspector/nodes/analyzers/static_runner.py b/src/skillspector/nodes/analyzers/static_runner.py
@@ -46,7 +46,7 @@
     ".rs": "rust",
 }
 
-MAX_FILE_BYTES = 1_000_000
+MAX_FILE_BYTES = 50 * 1024 * 1024
 _EVAL_DATASET_FILES = {
     "evals/evals.json",
     "evals/evals.jsonl",
@@ -59,6 +59,31 @@
 }
 
 
+def raise_if_content_exceeds_limit(path: str, content: str) -> None:
+    """Fail closed if analyzer input exceeds the per-file analysis limit.
+
+    Measures characters, not re-encoded bytes. The authoritative size gate is
+    on-disk bytes (build_context._validate_file_sizes, via stat), which runs
+    before any content is cached; this is a defense-in-depth guard for callers
+    that build a file_cache directly and bypass that gate. A decoded string's
+    character count is always <= its source UTF-8 byte length (every code point,
+    including the U+FFFD that errors="replace" emits for an undecodable byte,
+    consumes at least one byte), so content the on-disk gate admitted can never
+    trip this, while a programmatically injected oversized string still does.
+
+    Re-encoding the content to count bytes here would instead falsely abort a
+    legitimate sub-limit binary file: errors="replace" turns each undecodable
+    on-disk byte into a 3-byte U+FFFD, so a ~17 MiB binary that passed the
+    on-disk gate inflates past 50 MiB on re-encode and aborts the whole scan.
+    """
+    size = len(content)
+    if size > MAX_FILE_BYTES:
+        raise ValueError(
+            "Scan aborted: file content exceeds the per-file analysis limit "
+            f"({MAX_FILE_BYTES} characters): {path} ({size} characters)"
+        )
+
+
 def _infer_file_type(path: str) -> str:
     """Infer file type from path (extension)."""
     idx = path.rfind(".")
@@ -125,14 +150,7 @@ def run_static_patterns(
         if content is None:
             logger.debug("Skipping %s: no content in file_cache", path)
             continue
-        if len(content) > MAX_FILE_BYTES:
-            logger.debug(
-                "Skipping %s: size %d exceeds MAX_FILE_BYTES (%d)",
-                path,
-                len(content),
-                MAX_FILE_BYTES,
-            )
-            continue
+        raise_if_content_exceeds_limit(path, content)
         file_type = _infer_file_type(path)
         for module in pattern_modules:
             raw = module.analyze(content=content, file_path=path, file_type=file_type)

diff --git a/src/skillspector/nodes/analyzers/static_yara.py b/src/skillspector/nodes/analyzers/static_yara.py
@@ -33,7 +33,11 @@
 
 from .common import get_context, get_line_number
 from .pattern_defaults import PatternCategory
-from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
+from .static_runner import (
+    MAX_FILE_BYTES,
+    analyzer_finding_to_finding,
+    raise_if_content_exceeds_limit,
+)
 
 ANALYZER_ID = "static_yara"
 logger = get_logger(__name__)
@@ -66,7 +70,31 @@ def _collect_rule_files(*dirs: Path) -> list[Path]:
             continue
         for ext in _RULE_EXTENSIONS:
             files.update(d.rglob(ext))
-    return sorted(files)
+
+    # Cap operator-supplied rule files (the --yara-rules-dir tree) at the same
+    # per-file byte limit the analyzers enforce, so an oversized or malicious
+    # rule file cannot exhaust memory at yara.compile() time, bypassing the gate
+    # that bounds scanned content. Built-in rules are small and trusted; an
+    # over-limit file is skipped (with a warning) rather than failing the scan,
+    # since rule files are operator config, not the artifact under analysis.
+    bounded: list[Path] = []
+    for p in sorted(files):
+        try:
+            size_bytes = p.stat().st_size
+        except OSError:
+            logger.debug("%s: could not stat rule file, skipping: %s", ANALYZER_ID, p)
+            continue
+        if size_bytes > MAX_FILE_BYTES:
+            logger.warning(
+                "%s: skipping rule file over the %d-byte limit: %s (%d bytes)",
+                ANALYZER_ID,
+                MAX_FILE_BYTES,
+                p,
+                size_bytes,
+            )
+            continue
+        bounded.append(p)
+    return bounded
 
 
 def _content_hash(rule_files: list[Path]) -> str:
@@ -243,9 +271,7 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
         content = file_cache.get(path)
         if content is None:
             continue
-        if len(content) > MAX_FILE_BYTES:
-            logger.debug("%s: skipping %s (exceeds size limit)", ANALYZER_ID, path)
-            continue
+        raise_if_content_exceeds_limit(path, content)
         for af in _match_file(rules, content, path):
             findings.append(analyzer_finding_to_finding(af))
 

diff --git a/src/skillspector/nodes/build_context.py b/src/skillspector/nodes/build_context.py
@@ -60,6 +60,11 @@
     {".py", ".sh", ".bash", ".zsh", ".js", ".ts", ".rb", ".go", ".rs", ".pl"}
 )
 
+# Per-file read cap. Files larger than this fail the scan rather than being
+# skipped, because skipping would let malicious content hide in an oversized
+# file. Aligned with static_runner.MAX_FILE_BYTES.
+_MAX_READ_BYTES = 50 * 1024 * 1024
+
 
 def _resolve_skill_dir(state: SkillspectorState) -> Path:
     """Resolve state skill_path to an existing directory Path."""
@@ -106,10 +111,26 @@ def _infer_file_type(path: str) -> str:
 
 
 def _count_lines(file_path: Path) -> int:
-    """Count lines in a file, handling binary and errors gracefully."""
+    """Count lines in a file, handling binary and errors gracefully.
+
+    Reads the file in fixed-size binary chunks and counts newline bytes so
+    peak memory stays bounded regardless of file size *or* line length — a
+    multi-gigabyte file with no newlines must not be materialized in memory
+    (consistent with the _MAX_READ_BYTES cap in _read_file_cache). The count
+    matches ``len(text.splitlines())`` for the common ``\\n`` / ``\\r\\n`` cases:
+    number of newline bytes, plus one for a final line without a trailing
+    newline.
+    """
     try:
-        content = file_path.read_text(encoding="utf-8", errors="replace")
-        return len(content.splitlines())
+        newline_count = 0
+        last_byte = b""
+        with file_path.open("rb") as fh:
+            while chunk := fh.read(65536):
+                newline_count += chunk.count(b"\n")
+                last_byte = chunk[-1:]
+        if last_byte and last_byte != b"\n":
+            newline_count += 1  # final line with no trailing newline
+        return newline_count
     except OSError:
         logger.debug("Could not read file for line count: %s", file_path)
         return 0
@@ -164,6 +185,33 @@ def _read_file_cache(skill_dir: Path, components: list[str]) -> dict[str, str]:
     return file_cache
 
 
+def _validate_file_sizes(skill_dir: Path, components: list[str]) -> None:
+    """Fail the scan if any discovered file exceeds the per-file read cap."""
+    oversized: list[tuple[str, int]] = []
+    for path in components:
+        full = skill_dir / path
+        if not full.is_file():
+            continue
+        try:
+            size_bytes = full.stat().st_size
+        except OSError:
+            logger.debug("Could not stat file for size validation: %s", path)
+            continue
+        if size_bytes > _MAX_READ_BYTES:
+            oversized.append((path, size_bytes))
+
+    if not oversized:
+        return
+
+    details = ", ".join(f"{path} ({size_bytes} bytes)" for path, size_bytes in oversized[:5])
+    if len(oversized) > 5:
+        details += f", and {len(oversized) - 5} more"
+    raise ValueError(
+        "Scan aborted: file size exceeds the per-file analysis limit "
+        f"({_MAX_READ_BYTES} bytes): {details}"
+    )
+
+
 def _parse_manifest(skill_dir: Path) -> dict[str, object]:
     """Parse SKILL.md or skill.md YAML frontmatter into a manifest dict.
 
@@ -175,7 +223,17 @@ def _parse_manifest(skill_dir: Path) -> dict[str, object]:
         if not path.is_file():
             continue
         try:
-            content = path.read_text(encoding="utf-8", errors="replace")
+            # Only the leading YAML frontmatter is needed, and it sits at the
+            # top of the file. Read a bounded *byte* prefix (binary mode, then
+            # decode) so an oversized SKILL.md (a huge body, or a malicious
+            # multi-GB file) is never materialized whole. Reading bytes rather
+            # than text makes the cap a true byte ceiling: a text-mode
+            # read(_MAX_READ_BYTES) bounds characters, which multibyte content
+            # could inflate well past _MAX_READ_BYTES of memory. If a skill's
+            # frontmatter somehow exceeds this, the closing delimiter falls
+            # outside the prefix and parsing degrades to {} below.
+            with path.open("rb") as fh:
+                content = fh.read(_MAX_READ_BYTES).decode("utf-8", errors="replace")
         except OSError:
             logger.debug("Could not read manifest file: %s", name)
             return {}
@@ -225,6 +283,7 @@ def build_context(state: SkillspectorState) -> dict[str, object]:
     skill_dir = _resolve_skill_dir(state)
 
     components = _walk_skill_files(skill_dir)
+    _validate_file_sizes(skill_dir, components)
     file_cache = _read_file_cache(skill_dir, components)
     manifest = _parse_manifest(skill_dir)
     component_metadata, has_executable_scripts = _build_component_metadata(skill_dir, components)

diff --git a/tests/nodes/analyzers/test_behavioral_taint_tracking.py b/tests/nodes/analyzers/test_behavioral_taint_tracking.py
@@ -17,6 +17,8 @@
 
 from __future__ import annotations
 
+import pytest
+
 from skillspector.nodes.analyzers import behavioral_taint_tracking
 
 
@@ -258,13 +260,13 @@ def test_missing_file_in_cache(self):
         result = behavioral_taint_tracking.node(state)
         assert result["findings"] == []
 
-    def test_oversized_file_skipped(self):
+    def test_oversized_file_fails_scan(self):
         from skillspector.nodes.analyzers.static_runner import MAX_FILE_BYTES
 
         big = 'import os\nexec(os.environ.get("KEY"))\n' + ("x = 1\n" * MAX_FILE_BYTES)
         state = {"components": ["big.py"], "file_cache": {"big.py": big}}
-        result = behavioral_taint_tracking.node(state)
-        assert result["findings"] == []
+        with pytest.raises(ValueError, match="big\\.py"):
+            behavioral_taint_tracking.node(state)
 
     def test_multiple_files_produce_findings(self):
         state = {

diff --git a/tests/nodes/analyzers/test_static_patterns.py b/tests/nodes/analyzers/test_static_patterns.py
@@ -17,6 +17,8 @@
 
 from __future__ import annotations
 
+import pytest
+
 from skillspector.nodes.analyzers import (
     static_patterns_data_exfiltration as data_exfiltration_module,
 )
@@ -172,3 +174,39 @@ def test_empty_components_returns_empty(self):
         state = {"components": [], "file_cache": {}}
         findings = static_runner.run_static_patterns(state, [prompt_injection_module])
         assert findings == []
+
+
+class TestStaticRunnerSizeLimit:
+    def test_content_limit_is_enforced_in_characters(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """Content whose character count exceeds the cap fails closed."""
+        monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)
+
+        with pytest.raises(ValueError, match=r"multi\.txt .*5 characters"):
+            static_runner.raise_if_content_exceeds_limit("multi.txt", "xxxxx")
+
+    def test_sub_limit_multibyte_content_does_not_falsely_abort(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """A string under the char cap must not abort even if its UTF-8 byte
+        length is over it.
+
+        Regression guard: measuring re-encoded bytes here falsely aborts a
+        legitimate sub-limit binary file, whose errors="replace" decode inflates
+        each undecodable byte into a 3-byte U+FFFD. "ééé" is 3 chars but 6 bytes;
+        with a 4-unit cap the byte measure would raise, the char measure must not.
+        """
+        monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)
+
+        static_runner.raise_if_content_exceeds_limit("multi.txt", "ééé")  # must not raise
+
+    def test_run_static_patterns_fails_on_oversized_content(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)
+        state = {"components": ["multi.md"], "file_cache": {"multi.md": "xxxxx"}}
+
+        with pytest.raises(ValueError, match=r"multi\.md"):
+            static_runner.run_static_patterns(state, [prompt_injection_module])