Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions gitshield/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""GitShield CLI — Prevent accidental secret commits."""

import dataclasses
import re
import sys
from pathlib import Path
Expand All @@ -9,7 +10,8 @@
from . import __version__
from .config import build_custom_patterns, filter_findings, load_config, load_ignore_list, find_git_root
from .formatter import print_findings, print_json, print_blocked_message, colorize, Colors
from .scanner import scan_path, ScannerError
from .models import ScannerError
from .scanner import scan_path


@click.group()
Expand Down Expand Up @@ -49,7 +51,16 @@ def scan(path: str, staged: bool, no_git: bool, as_json: bool, sarif: bool, quie
# Output
if sarif:
from .formatter import print_sarif
print_sarif(findings)
# SARIF requires relative URIs so GitHub Code Scanning can map findings.
scan_root = Path(path).resolve()
sarif_findings = []
for f in findings:
try:
rel = str(Path(f.file).relative_to(scan_root))
except ValueError:
rel = f.file
sarif_findings.append(dataclasses.replace(f, file=rel))
print_sarif(sarif_findings)
elif as_json:
print_json(findings)
else:
Expand Down
37 changes: 34 additions & 3 deletions gitshield/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class GitShieldConfig:
entropy_threshold: float = 4.5
scan_tests: bool = False
allowlist_paths: List[str] = field(default_factory=list)
allowlist_rules: List[str] = field(default_factory=list)
allowlist_rules: Set[str] = field(default_factory=set)
allowlist_fingerprints: Set[str] = field(default_factory=set)
custom_patterns: List[Dict[str, Any]] = field(default_factory=list)

Expand Down Expand Up @@ -173,11 +173,16 @@ def load_config(path: Path) -> GitShieldConfig:
else:
fingerprints = set()

try:
entropy_threshold = float(scan.get("entropy_threshold", 4.5))
except (ValueError, TypeError):
entropy_threshold = 4.5

return GitShieldConfig(
entropy_threshold=float(scan.get("entropy_threshold", 4.5)),
entropy_threshold=entropy_threshold,
scan_tests=bool(scan.get("scan_tests", False)),
allowlist_paths=list(allowlist.get("paths", [])),
allowlist_rules=list(allowlist.get("rules", [])),
allowlist_rules=set(allowlist.get("rules", [])),
allowlist_fingerprints=fingerprints,
custom_patterns=list(data.get("custom_patterns", [])),
)
Expand Down Expand Up @@ -270,6 +275,28 @@ def filter_findings(
# Custom pattern builder
# ---------------------------------------------------------------------------

_REDOS_TEST_STRING = "a" * 100


def _regex_is_safe(compiled_re) -> bool:
"""Return True if the regex completes on a benign test string within 1 second.

Protects against catastrophic backtracking (ReDoS) in custom patterns from
.gitshield.toml. Uses a background thread so it works cross-platform.
"""
import threading

finished = threading.Event()

def _run():
compiled_re.search(_REDOS_TEST_STRING)
finished.set()

t = threading.Thread(target=_run, daemon=True)
t.start()
return finished.wait(timeout=1.0)


def build_custom_patterns(config: "GitShieldConfig") -> List[Pattern]:
"""Convert config.custom_patterns dicts into Pattern objects.

Expand Down Expand Up @@ -303,6 +330,10 @@ def build_custom_patterns(config: "GitShieldConfig") -> List[Pattern]:
print(f"gitshield: custom pattern '{pattern_id}' has invalid regex: {exc}", file=sys.stderr)
continue

if not _regex_is_safe(compiled):
print(f"gitshield: custom pattern '{pattern_id}' timed out (possible ReDoS), skipping", file=sys.stderr)
continue

try:
built.append(Pattern(
id=pattern_id,
Expand Down
19 changes: 19 additions & 0 deletions gitshield/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,25 @@ def mark_notified(
conn.commit()


def mark_notified_batch(
    repo_url: str,
    fingerprints: List[str],
    email: Optional[str] = None,
    method: str = "email",
) -> None:
    """Record notifications for many findings with a single commit.

    Returns immediately when *fingerprints* is empty. Otherwise one row per
    fingerprint is inserted, all sharing a timestamp taken once up front, and
    the connection is committed once after the batch insert.
    """
    if not fingerprints:
        return

    connection = get_connection()
    timestamp = datetime.now().isoformat()
    rows = [
        (repo_url, email, fingerprint, timestamp, method)
        for fingerprint in fingerprints
    ]
    # INSERT OR IGNORE: presumably skips rows that conflict with an existing
    # record — depends on the table's constraints, which are not visible here.
    connection.executemany("""
        INSERT OR IGNORE INTO notifications
        (repo_url, email, fingerprint, notified_at, method)
        VALUES (?, ?, ?, ?, ?)
    """, rows)
    connection.commit()


def get_notified_fingerprints(repo_url: str, fingerprints: List[str]) -> Set[str]:
"""Return the subset of *fingerprints* that have already been notified."""
if not fingerprints:
Expand Down
43 changes: 27 additions & 16 deletions gitshield/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,6 @@
# ---------------------------------------------------------------------------


def _is_binary_file(filepath: Path) -> bool:
"""Return True if *filepath* looks like a binary file (null byte in first 8 KB)."""
try:
with open(filepath, "rb") as fh:
chunk = fh.read(8192)
return b"\x00" in chunk
except (OSError, IOError):
return True # unreadable — treat as binary


def _should_skip_path(path: Path) -> bool:
"""Return True if *path* should be skipped based on directory name or extension."""
# Check only directory components for skip-listed directory names (not filename).
Expand Down Expand Up @@ -113,16 +103,19 @@ def _compile_gitignore_patterns(patterns: List[str]) -> List[tuple]:

def _matches_gitignore(rel_path: str, ignore_patterns: List[tuple]) -> bool:
"""Return True if *rel_path* matches any pre-compiled gitignore pattern."""
path_obj = Path(rel_path)
parts = path_obj.parts
name = path_obj.name
for is_dir, compiled_re in ignore_patterns:
if is_dir:
# Directory-only pattern: match against path components.
if any(compiled_re.fullmatch(part) for part in Path(rel_path).parts):
if any(compiled_re.fullmatch(part) for part in parts):
return True
else:
# Match against full relative path and also the basename.
if compiled_re.fullmatch(rel_path):
return True
if compiled_re.fullmatch(Path(rel_path).name):
if compiled_re.fullmatch(name):
return True
return False

Expand Down Expand Up @@ -155,7 +148,7 @@ def scan_text(
"""
findings: List[Finding] = []
lines = text.splitlines()
all_patterns = list(PATTERNS) + list(extra_patterns or [])
all_patterns = PATTERNS if not extra_patterns else list(PATTERNS) + list(extra_patterns)

for idx, line in enumerate(lines, start=1):
# Honour inline ignore directives.
Expand Down Expand Up @@ -297,7 +290,12 @@ def scan_directory(

# ---- staged-only mode: delegate to git for the file list ----
if staged_only:
return _scan_staged(root)
return _scan_staged(
root,
config_threshold=config_threshold,
extra_patterns=extra_patterns,
scan_tests=scan_tests,
)

# ---- full tree walk ----
ignore_patterns: List[tuple] = []
Expand Down Expand Up @@ -372,7 +370,12 @@ def scan_content(
# Internal: staged-file scanning
# ---------------------------------------------------------------------------

def _scan_staged(root: Path) -> List[Finding]:
def _scan_staged(
root: Path,
config_threshold: Optional[float] = None,
extra_patterns: Optional[List] = None,
scan_tests: bool = True,
) -> List[Finding]:
"""Scan only files staged in git inside *root*."""
try:
result = subprocess.run(
Expand Down Expand Up @@ -400,6 +403,14 @@ def _scan_staged(root: Path) -> List[Finding]:
continue
if _should_skip_path(file_path):
continue
findings.extend(scan_file(file_path))
if not scan_tests and _is_test_file(file_path.name):
continue
findings.extend(
scan_file(
file_path,
config_threshold=config_threshold,
extra_patterns=extra_patterns,
)
)

return findings
37 changes: 24 additions & 13 deletions gitshield/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
import sys
from pathlib import Path
from typing import List

from .config import build_custom_patterns, load_config
Expand Down Expand Up @@ -29,16 +30,25 @@
".key",
".p12",
".pfx",
".htpasswd",
"id_rsa",
"id_ed25519",
"id_ecdsa",
".netrc",
".pgpass",
".npmrc",
".pypirc",
]


def _is_allowed_path(filepath: str) -> bool:
    """Check if filepath is in the allowlist (example env files only).

    Matches only the basename, and only exactly, to prevent bypass via paths
    like '/app/secrets/malicious.env.example' whose name merely ends with an
    allowlisted entry.

    Args:
        filepath: Path of the file being written or edited.

    Returns:
        True when the lower-cased basename is an entry of ALLOWED_PATHS.
    """
    # NOTE(review): assumes ALLOWED_PATHS holds lower-case basenames — confirm.
    basename = Path(filepath).name.lower()
    return basename in ALLOWED_PATHS


def _is_sensitive_path(filepath: str) -> bool:
Expand Down Expand Up @@ -81,7 +91,6 @@ def handle_hook(input_data: dict) -> dict:

# Load config for custom patterns and entropy threshold.
try:
from pathlib import Path
config = load_config(Path("."))
custom = build_custom_patterns(config) or None
threshold = config.entropy_threshold
Expand All @@ -90,9 +99,9 @@ def handle_hook(input_data: dict) -> dict:
custom = None
threshold = None

# Handle Write / Edit tools
if tool_name in ("Write", "Edit"):
filepath = str(tool_input.get("file_path", tool_input.get("path", "")))
# Handle Write / Edit / NotebookEdit tools
if tool_name in ("Write", "Edit", "NotebookEdit"):
filepath = str(tool_input.get("file_path", tool_input.get("notebook_path", tool_input.get("path", ""))))

# Skip allowed paths
if _is_allowed_path(filepath):
Expand All @@ -102,6 +111,8 @@ def handle_hook(input_data: dict) -> dict:
content = str(tool_input.get("content", ""))
if not content:
content = str(tool_input.get("new_string", ""))
if not content:
content = str(tool_input.get("cell_source", ""))

if not content:
return {"result": "approve"}
Expand All @@ -117,8 +128,8 @@ def handle_hook(input_data: dict) -> dict:
"result": "block",
"reason": _format_block_reason(findings, filepath),
}
# Non-sensitive path: block on critical/high, warn on medium/low
critical = [f for f in findings if f.severity in ("critical", "high")]
# Non-sensitive path: block on critical/high/medium
critical = [f for f in findings if f.severity in ("critical", "high", "medium")]
if critical:
return {
"result": "block",
Expand All @@ -139,7 +150,7 @@ def handle_hook(input_data: dict) -> dict:
)

if findings:
critical = [f for f in findings if f.severity in ("critical", "high")]
critical = [f for f in findings if f.severity in ("critical", "high", "medium")]
if critical:
return {
"result": "block",
Expand Down
3 changes: 2 additions & 1 deletion gitshield/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

from .config import get_github_token
from .db import was_scanned_recently, mark_scanned
from .scanner import scan_path, Finding
from .models import Finding
from .scanner import scan_path

# Valid GitHub owner/name characters.
_VALID_GH_NAME = re.compile(r'^[A-Za-z0-9._-]+$')
Expand Down
18 changes: 11 additions & 7 deletions gitshield/notifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .config import get_github_token
from .models import Finding
from .monitor import RepoInfo
from .db import mark_notified, get_notified_fingerprints
from .db import mark_notified, mark_notified_batch, get_notified_fingerprints


class NotifierError(Exception):
Expand Down Expand Up @@ -76,6 +76,9 @@ def send_email(
To stop receiving these alerts, rotate your credentials and remove them from git history.
"""

if requests is None:
raise NotifierError("requests package required: pip install gitshield[patrol]")

if dry_run:
print(f"[DRY RUN] Would send email to: {to_email}")
print(f"Subject: {subject}")
Expand All @@ -98,9 +101,8 @@ def send_email(
)
response.raise_for_status()

# Mark as notified
for f in findings:
mark_notified(repo.url, f.fingerprint, to_email, "email")
# Mark all findings as notified in a single transaction
mark_notified_batch(repo.url, [f.fingerprint for f in findings], to_email, "email")

return True

Expand Down Expand Up @@ -153,6 +155,9 @@ def create_github_issue(
*Automated alert from [GitShield](https://github.com/bokiko/gitshield)*
"""

if requests is None:
raise NotifierError("requests package required: pip install gitshield[patrol]")

if dry_run:
print(f"[DRY RUN] Would create issue on: {repo.url}")
print(f"Title: {title}")
Expand All @@ -174,9 +179,8 @@ def create_github_issue(
)
response.raise_for_status()

# Mark as notified
for f in findings:
mark_notified(repo.url, f.fingerprint, method="github_issue")
# Mark all findings as notified in a single transaction
mark_notified_batch(repo.url, [f.fingerprint for f in findings], method="github_issue")

return True

Expand Down
4 changes: 2 additions & 2 deletions gitshield/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,9 +590,9 @@ def __post_init__(self) -> None:
id="cohere-api-key",
name="Cohere API Key",
regex=re.compile(
r"[A-Za-z0-9]{40}"
r"(?i)(?:cohere[_\-]?api[_\-]?key|COHERE_API_KEY)\s*[:=]\s*['\"]?([A-Za-z0-9]{40})['\"]?"
),
description="Cohere API key (40-char alphanumeric with co- context)",
description="Cohere API key (contextual keyword + 40-char alphanumeric)",
severity="medium",
entropy_threshold=4.5,
),
Expand Down
Loading
Loading