Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 61 additions & 25 deletions scripts/codelens.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,14 +268,25 @@ def _registry_exists(workspace: str) -> bool:

def _auto_setup(workspace: str) -> Dict[str, Any]:
"""Auto-run init + scan when no registry exists. Returns scan result or error info.

Includes timeout protection: if the workspace has many source files,
limits scan to --max-files 3000 to prevent long auto-setup times.

Applies a hard cap of ``_AUTO_SETUP_MAX_FILES`` (3000) files on BOTH the
subprocess path and the in-process fallback path, so auto-setup can never
silently hang on huge repos (issue #34).

Returns a dict with:
- ``auto_setup``: "ok" | "failed"
- ``capped``: True iff the 3000-file cap was reached (only on success)
- ``fallback``: True iff the in-process fallback path was taken (only on success)
- ``files_scanned``: total files scanned (only on success)
- ``hint``: human-readable note (only present when ``capped`` is True)
- ``stage`` / ``error``: failure details (only on failure)
"""
from commands.init import cmd_init
from commands.scan import cmd_scan
import subprocess

# Cap to prevent 5+ minute auto-setup on large repos
# Cap to prevent 5+ minute auto-setup on large repos.
# Applied to BOTH the subprocess path and the in-process fallback.
_AUTO_SETUP_MAX_FILES = 3000
_AUTO_SETUP_TIMEOUT_MSG = (
"Auto-setup running with --max-files 3000 to prevent timeout. "
Expand All @@ -292,37 +303,57 @@ def _auto_setup(workspace: str) -> Dict[str, Any]:
except Exception as e:
return {"auto_setup": "failed", "stage": "init", "error": str(e)}

# Step 2: Scan (with max-files cap for auto-setup)
# Step 2: Scan (with --max-files cap on BOTH paths)
try:
print(f"[CodeLens] {_AUTO_SETUP_TIMEOUT_MSG}", file=sys.stderr)
# Use subprocess to run scan with --max-files flag
# This avoids coupling to cmd_scan's internal signature
# Primary path: subprocess with --max-files flag (timeout=120s).
# This isolates the scan in a child process so we can enforce a
# hard wall-clock timeout on top of the file-count cap.
scan_cmd = [sys.executable, os.path.join(SCRIPT_DIR, "codelens.py"),
"scan", workspace, "--max-files", str(_AUTO_SETUP_MAX_FILES)]
scan_proc = __import__("subprocess").run(
scan_cmd, capture_output=True, text=True, timeout=120
)
if scan_proc.returncode != 0:
# Fallback: try without max-files
try:
scan_result = cmd_scan(workspace, incremental=False)
if scan_result.get("status") != "ok":
return {"auto_setup": "failed", "stage": "scan", "error": scan_result}
except Exception as e2:
return {"auto_setup": "failed", "stage": "scan", "error": str(e2)}
else:
scan_result = json.loads(scan_proc.stdout) if scan_proc.stdout.strip() else {"status": "ok"}
fallback_taken = False
scan_result: Optional[Dict[str, Any]] = None
try:
scan_proc = subprocess.run(
scan_cmd, capture_output=True, text=True, timeout=120
)
if scan_proc.returncode == 0:
scan_result = (
json.loads(scan_proc.stdout)
if scan_proc.stdout.strip()
else {"status": "ok"}
)
except Exception as e:
print(f"[CodeLens] Scan subprocess error: {e}; "
"falling back to in-process scan.", file=sys.stderr)

# Fallback path: in-process scan with the SAME max_files cap.
# The cap is enforced by cmd_scan(max_files=...) so huge repos
# cannot hang auto-setup even when the subprocess path fails.
if scan_result is None:
fallback_taken = True
print(f"[CodeLens] Falling back to in-process scan "
f"with max_files={_AUTO_SETUP_MAX_FILES}.", file=sys.stderr)
scan_result = cmd_scan(
workspace, incremental=False, max_files=_AUTO_SETUP_MAX_FILES
)
if scan_result.get("status") != "ok":
return {"auto_setup": "failed", "stage": "scan", "error": scan_result}

files_scanned = scan_result.get("files_scanned", {})
total_files = sum(v for v in files_scanned.values() if isinstance(v, int)) if isinstance(files_scanned, dict) else 0
print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. Registry built.", file=sys.stderr)

result_info = {
capped = total_files >= _AUTO_SETUP_MAX_FILES
print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. "
f"Registry built. (fallback={fallback_taken}, capped={capped})",
file=sys.stderr)

result_info: Dict[str, Any] = {
"auto_setup": "ok",
"files_scanned": total_files,
"capped": total_files >= _AUTO_SETUP_MAX_FILES,
"capped": capped,
"fallback": fallback_taken,
}
if total_files >= _AUTO_SETUP_MAX_FILES:
if capped:
result_info["hint"] = "Auto-setup capped at 3000 files. Run 'scan' manually for full analysis."
return result_info
except Exception as e:
Expand Down Expand Up @@ -1018,6 +1049,11 @@ def main():
auto_setup_info = {
"auto_setup": True,
"message": "Registry was auto-built. For best results, run 'scan' manually on large repos.",
# Issue #34: surface which path produced the registry and
# whether the 3000-file cap was hit, so MCP clients / agents
# can decide whether to trust the registry or re-scan.
"capped": bool(auto_setup_result.get("capped", False)),
"fallback": bool(auto_setup_result.get("fallback", False)),
}
else:
auto_setup_info = {
Expand Down
37 changes: 35 additions & 2 deletions scripts/commands/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,25 +52,33 @@
help="Only re-scan changed files")
parser.add_argument("--plugins", nargs="*", default=None,
help="Enable plugin rules: specify plugin names or 'all' for all rule_pack plugins")
parser.add_argument("--max-files", type=int, default=None,
help="Cap total files scanned (default: unlimited). "
"Used by auto-setup to prevent timeout on huge repos.")


def execute(args, workspace):
    """Execute the scan command.

    Reads the optional ``incremental``, ``plugins`` and ``max_files``
    attributes from ``args`` (falling back to ``False``, ``None`` and
    ``None`` respectively when an attribute is absent) and forwards them to
    ``cmd_scan``.

    Returns:
        Whatever ``cmd_scan`` returns for this workspace.
    """
    incremental = getattr(args, 'incremental', False)
    plugins = getattr(args, 'plugins', None)
    max_files = getattr(args, 'max_files', None)
    # Incremental mode is strictly opt-in via the explicit --incremental flag.
    # A bare "scan" always means a full scan: auto-enabling incremental when a
    # registry already existed caused confusion, because a second scan could
    # miss changes.
    #
    # max_files must be forwarded here; otherwise the --max-files flag is
    # parsed but silently ignored and the scan runs uncapped.
    return cmd_scan(workspace, incremental, plugins=plugins, max_files=max_files)


def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None) -> Dict[str, Any]:
def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None,

Check failure on line 73 in scripts/commands/scan.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 495 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=Wolfvin_CodeLens&issues=AZ8UEfoSlrQJlDPZPeWS&open=AZ8UEfoSlrQJlDPZPeWS&pullRequest=86
max_files: Optional[int] = None) -> Dict[str, Any]:
"""
Scan the workspace and build/update the registry.

If incremental=True, only re-scan changed files.
If plugins is provided, load plugin rules for the scan.
If max_files is provided and > 0, cap the total number of discovered files
that get parsed (used by auto-setup to prevent timeout on huge repos).
"""
workspace = os.path.abspath(workspace)
config = load_config(workspace)
Expand All @@ -88,6 +96,12 @@
# Discover files
files = discover_files(workspace, config)

# Apply max_files cap (auto-setup uses this to bound scan time on huge repos).
# The cap is applied AFTER discovery but BEFORE parsing, so os.walk cost is
# unchanged but parsing/registry-build cost is bounded.
if max_files is not None and max_files > 0:
files = _cap_discovered_files(files, max_files)

# Check if incremental scan is possible
changed_files = None
if incremental:
Expand Down Expand Up @@ -1172,6 +1186,25 @@
return f"Detected {', '.join(parts)} source files — these languages do not have dedicated parsers yet. CodeLens uses regex-based fallback extraction for many languages, but analysis may be less accurate than for fully supported languages (JS/TS/Python/Rust/HTML/CSS). Note: Go, Java, Kotlin, C/C++, C#, Ruby, Elixir, Dart, Swift, Scala, Shell, PHP, GDScript, Lua, and Objective-C all have fallback parsers; they are listed here only when no parser exists."


def _cap_discovered_files(files: Dict[str, List[str]], max_files: int) -> Dict[str, List[str]]:
"""Cap total files across all categories to ``max_files``.

Truncates per-category lists in dict iteration order until the budget
is exhausted; remaining categories are emptied. Used by auto-setup to
bound scan time on huge repos (issue #34).
"""
capped: Dict[str, List[str]] = {}
remaining = max_files
for key, file_list in files.items():
if not file_list or remaining <= 0:
capped[key] = []
continue
take = file_list[:remaining]
capped[key] = take
remaining -= len(take)
return capped


def discover_files(workspace: str, config: Dict) -> Dict[str, List[str]]:
"""
Discover all relevant source files in the workspace.
Expand Down
188 changes: 188 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,3 +587,191 @@
finally:
import shutil
shutil.rmtree(ws, ignore_errors=True)


class TestAutoSetupFallbackCap:
    """Regression tests for issue #34.

    ``_auto_setup`` in scripts/codelens.py runs scan via subprocess with
    ``--max-files 3000`` as a timeout guard. When the subprocess failed
    (non-zero exit / exception), the fallback called
    ``cmd_scan(workspace, incremental=False)`` with NO cap and NO timeout,
    so huge repos could hang auto-setup indefinitely, while the result
    hint still claimed "Auto-setup capped at 3000 files" even though no cap
    had been applied.

    Fix: the fallback must pass ``max_files=_AUTO_SETUP_MAX_FILES`` to
    ``cmd_scan``, and ``result["_auto_setup"]`` must surface ``capped`` and
    ``fallback`` flags so MCP clients can tell which path produced the
    registry.
    """

    @staticmethod
    def _force_subprocess_failure(monkeypatch):
        """Patch ``subprocess.run`` to raise so the fallback path is taken.

        The patch is applied in the current process, which is the same
        process ``_auto_setup`` / ``codelens.main()`` run in during these
        tests, so the scan subprocess call hits the patch.
        """
        import subprocess

        def fake_run(*args, **kwargs):
            raise subprocess.SubprocessError("simulated subprocess failure")

        monkeypatch.setattr(subprocess, "run", fake_run)

    @staticmethod
    def _run_list_cli(ws, monkeypatch, capsys):
        """Run ``codelens list <ws> --format json`` in-process.

        Returns the ``_auto_setup`` dict attached to the command's JSON
        output, asserting along the way that stdout carries a JSON object
        and that the ``_auto_setup`` key is present.
        """
        import codelens

        # monkeypatch restores sys.argv automatically at test teardown.
        monkeypatch.setattr(
            sys, "argv", ["codelens.py", "list", ws, "--format", "json"]
        )
        try:
            codelens.main()
        except SystemExit as exc:
            # main() may return normally or exit 0 on success. Any other
            # exit code is a real failure: let it propagate so the test
            # fails instead of silently swallowing the exit.
            if exc.code not in (0, None):
                raise

        captured = capsys.readouterr()
        stdout = captured.out.strip()
        assert stdout.startswith("{"), (
            f"expected JSON on stdout; got: {captured.out[:300]!r}"
        )
        result = json.loads(stdout)
        auto = result.get("_auto_setup")
        assert auto is not None, (
            "result['_auto_setup'] missing from CLI output; "
            f"got keys: {list(result.keys())}"
        )
        return auto

    def test_fallback_passes_max_files_cap(self, monkeypatch):
        """When the subprocess scan fails, the fallback ``cmd_scan`` call
        must be invoked with ``max_files=3000`` (not uncapped)."""
        from commands import scan as scan_mod
        import codelens

        ws = _create_sample_workspace()
        try:
            self._force_subprocess_failure(monkeypatch)

            # Spy on cmd_scan to capture the max_files kwarg.
            captured = {}
            real_cmd_scan = scan_mod.cmd_scan

            def spy_cmd_scan(workspace, incremental=False, plugins=None, max_files=None):
                captured["called"] = True
                captured["max_files"] = max_files
                captured["incremental"] = incremental
                # Delegate to the real cmd_scan so the registry actually
                # gets built (otherwise _auto_setup would fail downstream).
                return real_cmd_scan(
                    workspace, incremental=incremental,
                    plugins=plugins, max_files=max_files,
                )

            monkeypatch.setattr(scan_mod, "cmd_scan", spy_cmd_scan)

            result = codelens._auto_setup(ws)

            assert captured.get("called") is True, (
                "Fallback path did not call cmd_scan at all"
            )
            assert captured.get("max_files") == 3000, (
                f"Fallback must pass max_files=3000 to cmd_scan; "
                f"got: {captured.get('max_files')!r}"
            )
            assert result.get("auto_setup") == "ok"
            assert result.get("fallback") is True
        finally:
            import shutil
            shutil.rmtree(ws, ignore_errors=True)

    def test_fallback_sets_capped_and_fallback_flags(self, monkeypatch, capsys):
        """``result['_auto_setup']`` must include ``capped=True`` and
        ``fallback=True`` when the fallback path runs against a workspace
        large enough to actually hit the 3000-file cap.

        Verifies issue #34's Definition of Done item #2: the hint that
        says "capped at 3000 files" must be truthful.

        Drives the full CLI flow (``codelens.main()``) so the assertion
        is on the actual ``result["_auto_setup"]`` dict that gets attached
        to the command's JSON output, not just on ``_auto_setup()``'s
        private return value.
        """
        # Build a workspace with > 3000 source files so the cap is hit.
        ws = tempfile.mkdtemp()
        try:
            os.makedirs(os.path.join(ws, "src"), exist_ok=True)
            for i in range(3001):
                with open(os.path.join(ws, "src", f"f{i}.py"), "w") as fh:
                    fh.write(f"def f{i}():\n pass\n")

            self._force_subprocess_failure(monkeypatch)
            auto = self._run_list_cli(ws, monkeypatch, capsys)

            assert auto.get("fallback") is True, (
                f"expected fallback=True after subprocess failure; "
                f"got _auto_setup={auto!r}"
            )
            assert auto.get("capped") is True, (
                f"expected capped=True when workspace has >3000 files; "
                f"got _auto_setup={auto!r}"
            )
        finally:
            import shutil
            shutil.rmtree(ws, ignore_errors=True)

    def test_main_path_no_fallback_when_subprocess_succeeds(self, monkeypatch, capsys):
        """Sanity guard for Definition of Done item #3: when the subprocess
        path succeeds, ``fallback`` must be False and the auto-setup flags
        must still be present on ``result["_auto_setup"]``.

        Also confirms the issue #34 fix didn't break the main path: with
        ``--max-files`` now a registered scan argument, the subprocess no
        longer exits 2 on argparse rejection, so the main path actually
        runs end-to-end (previously it silently failed every time and the
        fallback was always taken).
        """
        ws = _create_sample_workspace()
        try:
            # Real subprocess (subprocess.run is NOT patched here). With the
            # fix, `--max-files` is a valid scan arg, so the subprocess
            # should succeed and the fallback should NOT be taken.
            auto = self._run_list_cli(ws, monkeypatch, capsys)

            # Flags must always be present (even when False) so MCP clients
            # can rely on the schema.
            assert "capped" in auto, "capped flag missing from _auto_setup"
            assert "fallback" in auto, "fallback flag missing from _auto_setup"
            assert auto.get("fallback") is False, (
                f"expected fallback=False on subprocess success; "
                f"got _auto_setup={auto!r}"
            )
            # Sample workspace has ~4 files (html, css, js, rs), well below
            # the 3000-file cap, so capped must be False.
            assert auto.get("capped") is False, (
                f"expected capped=False for small workspace; "
                f"got _auto_setup={auto!r}"
            )
        finally:
            import shutil
            shutil.rmtree(ws, ignore_errors=True)
Loading