Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ skillspector scan https://github.com/user/my-skill
skillspector scan ./my-skill.zip
```

#### Size limits

SkillSpector enforces two independent caps on remote and archive inputs to bound the impact of oversized downloads and zip bombs:

- **Per-ingest cap**: `INGEST_MAX_BYTES` (100 MiB) — applied to streamed URL downloads, total uncompressed size of zip archives, and post-clone disk usage of Git repos.
- **Zip member cap**: `INGEST_MAX_ZIP_MEMBERS` (10,000) — caps the number of entries in a single zip.

Note that the per-file 1 MB analysis cap (`MAX_FILE_BYTES`) is a separate, downstream limit: it bounds what individual analyzers will read out of an already-ingested directory. The ingest caps above bound how much content can land on disk in the first place. A breach of either ingest cap fails closed with an `IngestLimitExceededError`.

### Output Formats

```bash
Expand Down
163 changes: 149 additions & 14 deletions src/skillspector/input_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
- Single markdown files
- Local directories

Ported from legacy implementation.
Each remote/archive ingest path is bounded by ``INGEST_MAX_BYTES`` and
``INGEST_MAX_ZIP_MEMBERS`` so that the per-file analysis caps downstream
of ``InputHandler.resolve()`` are not defeated by an oversized download,
a zip bomb, or a too-large git clone. This file fails closed on any
ingest budget breach (closes #21 / #131).
"""

from __future__ import annotations
Expand All @@ -41,6 +45,30 @@

logger = get_logger(__name__)

# Hard ceiling on what any single ingest path can pull into the temp dir.
# Sized above the per-file analysis cap (``MAX_FILE_BYTES`` = 1 MB) so a
# legitimate multi-file skill is not blocked at ingest, but tight enough
# to bound memory / disk DoS from a malicious source.
INGEST_MAX_BYTES = 100 * 1024 * 1024 # 100 MiB

# Hard ceiling on the number of members in a zip we are willing to
# extract. Catches the "many tiny files" zip-bomb variant where each
# entry is small but the entry count itself exhausts the filesystem.
INGEST_MAX_ZIP_MEMBERS = 10_000

# Chunk size for streaming HTTP downloads. Small enough that the
# byte-count breach check fires promptly; large enough to keep syscall
# overhead reasonable on legitimate inputs.
_DOWNLOAD_CHUNK_BYTES = 64 * 1024


class IngestLimitExceededError(ValueError):
"""Raised when an ingest path exceeds an ``INGEST_MAX_*`` budget.

Subclass of ``ValueError`` so existing callers that catch
``ValueError`` from ``InputHandler.resolve()`` continue to work.
"""


class InputHandler:
"""
Expand All @@ -64,8 +92,10 @@ def resolve(self, input_path: str) -> tuple[Path, str]:
source_type is one of: "git", "url", "zip", "file", "directory"

Raises:
ValueError: If input type cannot be determined
FileNotFoundError: If local path doesn't exist
ValueError: If input type cannot be determined, or if an
ingest path exceeds ``INGEST_MAX_BYTES`` /
``INGEST_MAX_ZIP_MEMBERS`` (``IngestLimitExceededError``).
FileNotFoundError: If local path doesn't exist.
"""
input_path = input_path.strip()

Expand Down Expand Up @@ -123,7 +153,7 @@ def _is_file_url(self, path: str) -> bool:
return not self._is_git_url(path)

def _clone_git(self, url: str) -> Path:
"""Clone a Git repository to a temporary directory."""
"""Clone a Git repository to a temporary directory, bounded by ``INGEST_MAX_BYTES``."""
temp_dir = self._get_temp_dir()
clone_dir = temp_dir / "repo"
try:
Expand All @@ -145,40 +175,126 @@ def _clone_git(self, url: str) -> Path:
raise ValueError(
"Git is not installed. Please install git to scan repositories."
) from None

# Post-clone size check: a successful --depth 1 clone may still
# land an arbitrarily large tree on disk. Reject (and clean up)
# if it exceeds the ingest budget.
total = _directory_size_bytes(clone_dir)
if total > INGEST_MAX_BYTES:
shutil.rmtree(clone_dir, ignore_errors=True)
logger.warning(
"Git clone of %s exceeded ingest cap: %d > %d bytes",
url,
total,
INGEST_MAX_BYTES,
)
raise IngestLimitExceededError(
f"Git clone exceeded ingest cap: {total} bytes > "
f"INGEST_MAX_BYTES ({INGEST_MAX_BYTES})"
)
return clone_dir

def _download_file(self, url: str) -> Path:
"""Download a file from URL to a temporary directory."""
"""Download a file from URL to a temporary directory.

Streams the body to disk in chunks while running a byte counter.
The cap check fires before each chunk is written, so a breach
aborts immediately without accumulating the body in memory. A
partial file produced by a mid-stream breach is removed before
the exception propagates.
"""
temp_dir = self._get_temp_dir()
parsed = urlparse(url)
filename = Path(parsed.path).name or "SKILL.md"
# Write to a stable target path inside the temp dir so we can
# rename / move it after the download succeeds without ever
# holding the body in memory. Use a sentinel name for the
# download itself; we rename / replace at the end.
download_path = temp_dir / "_download.partial"
content_type = ""
try:
with httpx.Client(follow_redirects=True, timeout=30) as client:
response = client.get(url)
response.raise_for_status()
content = response.content
with client.stream("GET", url) as response:
response.raise_for_status()
content_type = response.headers.get("content-type", "")
# Cheap up-front check: trust Content-Length when the
# server provides it, so we abort before reading any
# body bytes. Streaming check below covers the case
# where the header is missing or wrong.
declared = response.headers.get("content-length")
if declared is not None:
try:
declared_bytes = int(declared)
except ValueError:
# Malformed header — fall through to the
# streamed byte counter, which is authoritative.
declared_bytes = None
if declared_bytes is not None and declared_bytes > INGEST_MAX_BYTES:
raise IngestLimitExceededError(
f"Download exceeded ingest cap: "
f"Content-Length {declared} bytes > "
f"INGEST_MAX_BYTES ({INGEST_MAX_BYTES})"
)

received = 0
with download_path.open("wb") as out:
for chunk in response.iter_bytes(_DOWNLOAD_CHUNK_BYTES):
received += len(chunk)
if received > INGEST_MAX_BYTES:
raise IngestLimitExceededError(
f"Download exceeded ingest cap: streamed "
f"{received} bytes > INGEST_MAX_BYTES "
f"({INGEST_MAX_BYTES})"
)
out.write(chunk)
except httpx.HTTPError as e:
# Best-effort cleanup of any partial download.
download_path.unlink(missing_ok=True)
logger.warning("Download failed for %s: %s", url, e)
raise ValueError(f"Failed to download file: {e}") from e
if filename.endswith(".zip") or (
response.headers.get("content-type", "").startswith("application/zip")
):
except IngestLimitExceededError:
# Don't leave the partial bomb on disk.
download_path.unlink(missing_ok=True)
raise

is_zip = filename.endswith(".zip") or content_type.startswith("application/zip")
if is_zip:
zip_path = temp_dir / "download.zip"
zip_path.write_bytes(content)
download_path.replace(zip_path)
return self._extract_zip(zip_path)
file_path = temp_dir / filename
file_path.write_bytes(content)
download_path.replace(file_path)
return temp_dir

def _extract_zip(self, zip_path: Path) -> Path:
"""Extract a zip file to a temporary directory."""
"""Extract a zip file, bounded by ``INGEST_MAX_BYTES`` and ``INGEST_MAX_ZIP_MEMBERS``.

Sums ``ZipInfo.file_size`` (uncompressed size) across all members
before extracting and refuses to extract if either the total or
the member count exceeds the cap. This rejects classic zip
bombs (small archive, huge declared uncompressed size) without
materialising any of the bomb on disk.
"""
if not zip_path.exists():
raise FileNotFoundError(f"Zip file not found: {zip_path}") from None
temp_dir = self._get_temp_dir()
extract_dir = temp_dir / "extracted"
extract_dir.mkdir(exist_ok=True)
try:
with zipfile.ZipFile(zip_path, "r") as zf:
infos = zf.infolist()
if len(infos) > INGEST_MAX_ZIP_MEMBERS:
raise IngestLimitExceededError(
f"Zip exceeded ingest cap: {len(infos)} members > "
f"INGEST_MAX_ZIP_MEMBERS ({INGEST_MAX_ZIP_MEMBERS})"
)
total_uncompressed = sum(info.file_size for info in infos)
if total_uncompressed > INGEST_MAX_BYTES:
raise IngestLimitExceededError(
f"Zip exceeded ingest cap: uncompressed "
f"{total_uncompressed} bytes > INGEST_MAX_BYTES "
f"({INGEST_MAX_BYTES})"
)
zf.extractall(extract_dir)
except zipfile.BadZipFile:
logger.warning("Invalid zip or extract failed: %s", zip_path)
Expand All @@ -196,3 +312,22 @@ def _wrap_single_file(self, file_path: Path) -> Path:
dest = temp_dir / file_path.name
shutil.copy2(file_path, dest)
return temp_dir


def _directory_size_bytes(path: Path) -> int:
"""Return the total size of all regular files under *path*, in bytes.

Symlinks are not followed (``Path.is_file()`` returns ``True`` only
for regular files), so a malicious symlink to ``/dev/zero`` cannot
cause a runaway count.
"""
total = 0
for p in path.rglob("*"):
if p.is_file() and not p.is_symlink():
try:
total += p.stat().st_size
except OSError:
# File disappeared mid-walk (race with concurrent fs ops).
# Skip rather than fail the whole ingest.
continue
return total
Loading