Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ The tool requires outbound HTTPS access to `api.osv.dev` for live vulnerability
- **Encrypted/binary code**: Cannot analyze compiled or encrypted content
- **Runtime behavior**: Static analysis only, no dynamic execution
- **Offline SC4**: Without network access to `api.osv.dev`, SC4 uses a small static fallback list
- **Per-file size limit**: Files larger than 50 MiB fail the scan closed rather than being silently skipped, so oversized content cannot hide from analysis. This is a per-file *analysis* limit, not an ingest limit: the upstream url/zip/git fetch in `input_handler` is not yet bounded, so a very large download or a decompression bomb can exhaust memory or disk before the per-file gate runs (tracked separately). Raising the limit from the previous 1 MB means mid-size files that used to be skipped are now analyzed and, with `--llm`, chunked and sent to the model, which improves detection but increases resource and token cost.

## Research Background

Expand Down
15 changes: 14 additions & 1 deletion src/skillspector/llm_analyzer_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,22 @@ def get_batches(
for f in findings:
findings_by_file[f.file].append(f)

# Fail closed on oversized cached content. A programmatic caller can
# build a file_cache directly (the semantic_* analyzers call this with
# the build_context cache, but nothing stops a caller injecting their
# own), so without this guard content above the per-file byte limit
# would be chunked and sent to the LLM, bypassing the same fail-closed
# gate the static/AST/taint/YARA entry points enforce. Imported lazily:
# a module-level import would cycle, since the analyzers package __init__
# imports the semantic_* analyzers, which import this module.
from skillspector.nodes.analyzers.static_runner import raise_if_content_exceeds_limit

batches: list[Batch] = []
for path in file_paths:
content = file_cache.get(path) or "No content available for this file."
cached = file_cache.get(path)
if cached is not None:
raise_if_content_exceeds_limit(path, cached)
content = cached or "No content available for this file."
file_findings = findings_by_file.get(path, [])

extra = self._estimate_extra_overhead(file_findings)
Expand Down
5 changes: 3 additions & 2 deletions src/skillspector/nodes/analyzers/behavioral_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from skillspector.state import AnalyzerNodeResponse, SkillspectorState

from .common import get_context_from_lines, get_source_segment, resolve_call_name
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit

ANALYZER_ID = "behavioral_ast"
logger = get_logger(__name__)
Expand Down Expand Up @@ -216,8 +216,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
if not path.endswith(".py"):
continue
content = file_cache.get(path)
if content is None or len(content) > MAX_FILE_BYTES:
if content is None:
continue
raise_if_content_exceeds_limit(path, content)
raw = _analyze_python(content, path)
all_findings.extend(analyzer_finding_to_finding(af) for af in raw)

Expand Down
5 changes: 3 additions & 2 deletions src/skillspector/nodes/analyzers/behavioral_taint_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
resolve_call_name_typed,
resolve_dotted_name,
)
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit

ANALYZER_ID = "behavioral_taint_tracking"
logger = get_logger(__name__)
Expand Down Expand Up @@ -400,8 +400,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
if not path.endswith(".py"):
continue
content = file_cache.get(path)
if content is None or len(content) > MAX_FILE_BYTES:
if content is None:
continue
raise_if_content_exceeds_limit(path, content)
raw = _analyze_python(content, path)
all_findings.extend(analyzer_finding_to_finding(af) for af in raw)

Expand Down
36 changes: 27 additions & 9 deletions src/skillspector/nodes/analyzers/static_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
".rs": "rust",
}

MAX_FILE_BYTES = 1_000_000
MAX_FILE_BYTES = 50 * 1024 * 1024
_EVAL_DATASET_FILES = {
"evals/evals.json",
"evals/evals.jsonl",
Expand All @@ -59,6 +59,31 @@
}


def raise_if_content_exceeds_limit(path: str, content: str) -> None:
"""Fail closed if analyzer input exceeds the per-file analysis limit.

Measures characters, not re-encoded bytes. The authoritative size gate is
on-disk bytes (build_context._validate_file_sizes, via stat), which runs
before any content is cached; this is a defense-in-depth guard for callers
that build a file_cache directly and bypass that gate. A decoded string's
character count is always <= its source UTF-8 byte length (every code point,
including the U+FFFD that errors="replace" emits for an undecodable byte,
consumes at least one byte), so content the on-disk gate admitted can never
trip this, while a programmatically injected oversized string still does.

Re-encoding the content to count bytes here would instead falsely abort a
legitimate sub-limit binary file: errors="replace" turns each undecodable
on-disk byte into a 3-byte U+FFFD, so a ~17 MiB binary that passed the
on-disk gate inflates past 50 MiB on re-encode and aborts the whole scan.
"""
size = len(content)
if size > MAX_FILE_BYTES:
raise ValueError(
"Scan aborted: file content exceeds the per-file analysis limit "
f"({MAX_FILE_BYTES} characters): {path} ({size} characters)"
)


def _infer_file_type(path: str) -> str:
"""Infer file type from path (extension)."""
idx = path.rfind(".")
Expand Down Expand Up @@ -125,14 +150,7 @@ def run_static_patterns(
if content is None:
logger.debug("Skipping %s: no content in file_cache", path)
continue
if len(content) > MAX_FILE_BYTES:
logger.debug(
"Skipping %s: size %d exceeds MAX_FILE_BYTES (%d)",
path,
len(content),
MAX_FILE_BYTES,
)
continue
raise_if_content_exceeds_limit(path, content)
file_type = _infer_file_type(path)
for module in pattern_modules:
raw = module.analyze(content=content, file_path=path, file_type=file_type)
Expand Down
36 changes: 31 additions & 5 deletions src/skillspector/nodes/analyzers/static_yara.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@

from .common import get_context, get_line_number
from .pattern_defaults import PatternCategory
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import (
MAX_FILE_BYTES,
analyzer_finding_to_finding,
raise_if_content_exceeds_limit,
)

ANALYZER_ID = "static_yara"
logger = get_logger(__name__)
Expand Down Expand Up @@ -66,7 +70,31 @@ def _collect_rule_files(*dirs: Path) -> list[Path]:
continue
for ext in _RULE_EXTENSIONS:
files.update(d.rglob(ext))
return sorted(files)

# Cap operator-supplied rule files (the --yara-rules-dir tree) at the same
# per-file byte limit the analyzers enforce, so an oversized or malicious
# rule file cannot exhaust memory at yara.compile() time, bypassing the gate
# that bounds scanned content. Built-in rules are small and trusted; an
# over-limit file is skipped (with a warning) rather than failing the scan,
# since rule files are operator config, not the artifact under analysis.
bounded: list[Path] = []
for p in sorted(files):
try:
size_bytes = p.stat().st_size
except OSError:
logger.debug("%s: could not stat rule file, skipping: %s", ANALYZER_ID, p)
continue
if size_bytes > MAX_FILE_BYTES:
logger.warning(
"%s: skipping rule file over the %d-byte limit: %s (%d bytes)",
ANALYZER_ID,
MAX_FILE_BYTES,
p,
size_bytes,
)
continue
bounded.append(p)
return bounded


def _content_hash(rule_files: list[Path]) -> str:
Expand Down Expand Up @@ -243,9 +271,7 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
content = file_cache.get(path)
if content is None:
continue
if len(content) > MAX_FILE_BYTES:
logger.debug("%s: skipping %s (exceeds size limit)", ANALYZER_ID, path)
continue
raise_if_content_exceeds_limit(path, content)
for af in _match_file(rules, content, path):
findings.append(analyzer_finding_to_finding(af))

Expand Down
67 changes: 63 additions & 4 deletions src/skillspector/nodes/build_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@
{".py", ".sh", ".bash", ".zsh", ".js", ".ts", ".rb", ".go", ".rs", ".pl"}
)

# Per-file read cap. Files larger than this fail the scan rather than being
# skipped, because skipping would let malicious content hide in an oversized
# file. Aligned with static_runner.MAX_FILE_BYTES.
_MAX_READ_BYTES = 50 * 1024 * 1024


def _resolve_skill_dir(state: SkillspectorState) -> Path:
"""Resolve state skill_path to an existing directory Path."""
Expand Down Expand Up @@ -106,10 +111,26 @@ def _infer_file_type(path: str) -> str:


def _count_lines(file_path: Path) -> int:
"""Count lines in a file, handling binary and errors gracefully."""
"""Count lines in a file, handling binary and errors gracefully.

Reads the file in fixed-size binary chunks and counts newline bytes so
peak memory stays bounded regardless of file size *or* line length — a
multi-gigabyte file with no newlines must not be materialized in memory
(consistent with the _MAX_READ_BYTES cap in _read_file_cache). The count
matches ``len(text.splitlines())`` for the common ``\\n`` / ``\\r\\n`` cases:
number of newline bytes, plus one for a final line without a trailing
newline.
"""
try:
content = file_path.read_text(encoding="utf-8", errors="replace")
return len(content.splitlines())
newline_count = 0
last_byte = b""
with file_path.open("rb") as fh:
while chunk := fh.read(65536):
newline_count += chunk.count(b"\n")
last_byte = chunk[-1:]
if last_byte and last_byte != b"\n":
newline_count += 1 # final line with no trailing newline
return newline_count
except OSError:
logger.debug("Could not read file for line count: %s", file_path)
return 0
Expand Down Expand Up @@ -164,6 +185,33 @@ def _read_file_cache(skill_dir: Path, components: list[str]) -> dict[str, str]:
return file_cache


def _validate_file_sizes(skill_dir: Path, components: list[str]) -> None:
"""Fail the scan if any discovered file exceeds the per-file read cap."""
oversized: list[tuple[str, int]] = []
for path in components:
full = skill_dir / path
if not full.is_file():
continue
try:
size_bytes = full.stat().st_size
except OSError:
logger.debug("Could not stat file for size validation: %s", path)
continue
if size_bytes > _MAX_READ_BYTES:
oversized.append((path, size_bytes))

if not oversized:
return

details = ", ".join(f"{path} ({size_bytes} bytes)" for path, size_bytes in oversized[:5])
if len(oversized) > 5:
details += f", and {len(oversized) - 5} more"
raise ValueError(
"Scan aborted: file size exceeds the per-file analysis limit "
f"({_MAX_READ_BYTES} bytes): {details}"
)


def _parse_manifest(skill_dir: Path) -> dict[str, object]:
"""Parse SKILL.md or skill.md YAML frontmatter into a manifest dict.

Expand All @@ -175,7 +223,17 @@ def _parse_manifest(skill_dir: Path) -> dict[str, object]:
if not path.is_file():
continue
try:
content = path.read_text(encoding="utf-8", errors="replace")
# Only the leading YAML frontmatter is needed, and it sits at the
# top of the file. Read a bounded *byte* prefix (binary mode, then
# decode) so an oversized SKILL.md (a huge body, or a malicious
# multi-GB file) is never materialized whole. Reading bytes rather
# than text makes the cap a true byte ceiling: a text-mode
# read(_MAX_READ_BYTES) bounds characters, which multibyte content
# could inflate well past _MAX_READ_BYTES of memory. If a skill's
# frontmatter somehow exceeds this, the closing delimiter falls
# outside the prefix and parsing degrades to {} below.
with path.open("rb") as fh:
content = fh.read(_MAX_READ_BYTES).decode("utf-8", errors="replace")
except OSError:
logger.debug("Could not read manifest file: %s", name)
return {}
Expand Down Expand Up @@ -225,6 +283,7 @@ def build_context(state: SkillspectorState) -> dict[str, object]:
skill_dir = _resolve_skill_dir(state)

components = _walk_skill_files(skill_dir)
_validate_file_sizes(skill_dir, components)
file_cache = _read_file_cache(skill_dir, components)
manifest = _parse_manifest(skill_dir)
component_metadata, has_executable_scripts = _build_component_metadata(skill_dir, components)
Expand Down
8 changes: 5 additions & 3 deletions tests/nodes/analyzers/test_behavioral_taint_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from __future__ import annotations

import pytest

from skillspector.nodes.analyzers import behavioral_taint_tracking


Expand Down Expand Up @@ -258,13 +260,13 @@ def test_missing_file_in_cache(self):
result = behavioral_taint_tracking.node(state)
assert result["findings"] == []

def test_oversized_file_skipped(self):
def test_oversized_file_fails_scan(self):
from skillspector.nodes.analyzers.static_runner import MAX_FILE_BYTES

big = 'import os\nexec(os.environ.get("KEY"))\n' + ("x = 1\n" * MAX_FILE_BYTES)
state = {"components": ["big.py"], "file_cache": {"big.py": big}}
result = behavioral_taint_tracking.node(state)
assert result["findings"] == []
with pytest.raises(ValueError, match="big\\.py"):
behavioral_taint_tracking.node(state)

def test_multiple_files_produce_findings(self):
state = {
Expand Down
38 changes: 38 additions & 0 deletions tests/nodes/analyzers/test_static_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from __future__ import annotations

import pytest

from skillspector.nodes.analyzers import (
static_patterns_data_exfiltration as data_exfiltration_module,
)
Expand Down Expand Up @@ -172,3 +174,39 @@ def test_empty_components_returns_empty(self):
state = {"components": [], "file_cache": {}}
findings = static_runner.run_static_patterns(state, [prompt_injection_module])
assert findings == []


class TestStaticRunnerSizeLimit:
def test_content_limit_is_enforced_in_characters(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
"""Content whose character count exceeds the cap fails closed."""
monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)

with pytest.raises(ValueError, match=r"multi\.txt .*5 characters"):
static_runner.raise_if_content_exceeds_limit("multi.txt", "xxxxx")

def test_sub_limit_multibyte_content_does_not_falsely_abort(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
"""A string under the char cap must not abort even if its UTF-8 byte
length is over it.

Regression guard: measuring re-encoded bytes here falsely aborts a
legitimate sub-limit binary file, whose errors="replace" decode inflates
each undecodable byte into a 3-byte U+FFFD. "ééé" is 3 chars but 6 bytes;
with a 4-unit cap the byte measure would raise, the char measure must not.
"""
monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)

static_runner.raise_if_content_exceeds_limit("multi.txt", "ééé") # must not raise

def test_run_static_patterns_fails_on_oversized_content(
self,
monkeypatch: pytest.MonkeyPatch,
) -> None:
monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4)
state = {"components": ["multi.md"], "file_cache": {"multi.md": "xxxxx"}}

with pytest.raises(ValueError, match=r"multi\.md"):
static_runner.run_static_patterns(state, [prompt_injection_module])
Loading