diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 5469d0a..2f73721 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -26,7 +26,9 @@ jobs: - name: Check shell scripts run: bash -n skills/autoreview/scripts/test-review-harness - name: Check Python scripts - run: python3 -m py_compile skills/autoreview/scripts/autoreview skills/autoreview/scripts/test-review-harness.py + run: python3 -m py_compile skills/autoreview/scripts/autoreview skills/autoreview/scripts/test-review-harness.py skills/autoreview/scripts/autoreview_test.py + - name: Run Python tests + run: python3 -m unittest skills/autoreview/scripts/autoreview_test.py - name: Check Node scripts run: node --check skills/agent-transcript/scripts/agent-transcript - name: Run Node tests diff --git a/skills/autoreview/SKILL.md b/skills/autoreview/SKILL.md index 5364ce5..fdb26a5 100644 --- a/skills/autoreview/SKILL.md +++ b/skills/autoreview/SKILL.md @@ -1,6 +1,6 @@ --- name: autoreview -description: "Run a structured code review (Codex default, Claude optional) as a closeout check on a local or PR branch before commit or ship." +description: "Run a structured code review (Codex default, optional Claude, Droid, Copilot, or Cursor) as a closeout check on a local or PR branch before commit or ship." --- # Auto Review @@ -11,7 +11,7 @@ Codex review is the default when no engine is set. It usually delivers the best Use when: -- user asks for Codex review / Claude review / autoreview / second-model review +- user asks for Codex review / Claude review / Cursor review / autoreview / second-model review - after non-trivial code edits, before final/commit/ship - reviewing a local branch or PR branch after fixes @@ -29,7 +29,7 @@ Use when: - For security-audit suppression changes, verify accepted findings remain auditable: suppressed findings stay in structured output, active output keeps an unsuppressible suppression notice, and aggregate findings cannot hide unrelated active risk. - Never switch or override the requested review engine/model. If the review hits model capacity, retry the same command a few times with the same engine/model. - Be patient with large bundles. Structured review can take up to 30 minutes while the model call is active, especially with Codex tools or web search. -- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing. Pass `--stream-engine-output` when live engine text is useful; Codex and Claude filter tool/file chatter, other engines pass raw output through. +- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing. Pass `--stream-engine-output` when live engine text is useful; Codex, Claude, and Cursor filter noisy tool/file chatter, other engines pass raw output through. - Do not kill a review just because it has been quiet for 2-5 minutes, or because it is still running under the 30-minute window. Inspect the process only after missing multiple expected heartbeats, after 30 minutes, or after an obviously failed subprocess; prefer letting the same helper command finish. - Tools are useful in review mode. The helper allows read-only inspection tools and web search by default so reviewers can check dependency contracts, upstream docs, and current behavior. - Security perspective is always included, but it should not cripple legitimate functionality. Report security findings only when the change creates a concrete, actionable risk or removes an important safety check. @@ -144,6 +144,10 @@ Run multiple reviewers against one frozen bundle: "$AUTOREVIEW" --panel ``` +`--reviewers all` intentionally excludes Cursor. Cursor review can require +additional trust flags when workspace instructions or local MCP config are +present, so opt into Cursor explicitly. + Set reviewer models and thinking/effort explicitly: ```bash @@ -158,7 +162,41 @@ Inline syntax is also supported: Codex maps thinking to `model_reasoning_effort` and accepts `low`, `medium`, `high`, or `xhigh`. Claude maps thinking to `--effort` and also accepts `max`. -Engines without a real thinking knob reject `--thinking`. +Cursor supports `--model` but rejects `--thinking`. Engines without a real +thinking knob reject `--thinking`. + +## Cursor review + +Run Cursor explicitly: + +```bash +"$AUTOREVIEW" --engine cursor --model auto +``` + +Cursor review uses `cursor-agent` in `--mode ask` with `--workspace `, +`--trust`, and `--sandbox enabled` by default. Authenticate with +`cursor-agent login` or `CURSOR_API_KEY`. + +Optional Cursor-specific controls: + +- `CURSOR_BIN` or `--cursor-bin` to select the CLI binary +- `AUTOREVIEW_CURSOR_SANDBOX=enabled|disabled` or `--cursor-sandbox ...` +- `AUTOREVIEW_CURSOR_APPROVE_MCPS=1`, `--cursor-approve-mcps`, or `--no-cursor-approve-mcps` for trusted MCP-enabled environments +- `AUTOREVIEW_CURSOR_ALLOW_WORKSPACE_INSTRUCTIONS=1`, `--cursor-allow-workspace-instructions`, or `--no-cursor-allow-workspace-instructions` when the repo contains Cursor rules, `AGENTS.md`, `CLAUDE.md`, or local MCP config + +Cursor caveats: + +- `--thinking`, `--no-tools`, and `--no-web-search` are not supported +- Cursor CLI currently documents no `--ignore-rules` / safe-mode equivalent for + review runs +- Cursor CLI also does not document a schema flag like Codex/Claude, so the + helper validates Cursor JSON locally and retries malformed structured output a + few times before failing +- by default the helper fails closed when workspace `.cursor/rules`, + `AGENTS.md`, `CLAUDE.md`, or local MCP config would be loaded; opt in with + `--cursor-allow-workspace-instructions` only for trusted repos +- local MCP config also fails closed unless `--cursor-approve-mcps` is set +- keep MCP auto-approval deliberate, even in trusted repos ## Context Efficiency @@ -196,15 +234,15 @@ The helper: - accepts `--mode uncommitted` as an alias for `--mode local` - otherwise uses current PR base if `gh pr view` works - otherwise uses `origin/main` for non-main branches -- supports `--engine codex`, `claude`, `droid`, and `copilot`; default is `AUTOREVIEW_ENGINE` or `codex`; Codex should remain the default when nothing is set +- supports `--engine codex`, `claude`, `droid`, `copilot`, and `cursor`; default is `AUTOREVIEW_ENGINE` or `codex`; Codex should remain the default when nothing is set - resolves bare `git`, `gh`, reviewer, and PowerShell shell commands from absolute `PATH` entries only, never from the reviewed checkout; explicit relative `--*-bin` paths are resolved from the reviewed repository root - use `--mode commit --commit ` for already-committed work, especially clean `main` after landing - should be left in `--mode auto` or forced to `--mode branch` for PR/branch work; do not force `--mode local` after committing - writes only to stdout unless `--output`, `--json-output`, or live streamed engine stderr is set - supports `--dry-run`, `--parallel-tests`, `--parallel-tests-shell`, `--prompt`, `--prompt-file`, `--dataset`, `--no-tools`, `--no-web-search`, and commit refs -- supports `--stream-engine-output` or `AUTOREVIEW_STREAM_ENGINE_OUTPUT=1` for live engine text while preserving structured validation; Codex and Claude hide tool/file event details, emit compact activity summaries, and report usage at turn completion -- supports opt-in review panels with `--panel` / `--reviewers`, plus per-engine `--model` and `--thinking` -- allows read-only tools and web search by default where the selected CLI supports them; forbids nested review in the prompt; Codex is run through `codex exec` with read-only sandbox and structured output +- supports `--stream-engine-output` or `AUTOREVIEW_STREAM_ENGINE_OUTPUT=1` for live engine text while preserving structured validation; Codex, Claude, and Cursor hide noisy tool/status events, emit compact activity summaries, and report usage at turn completion +- supports opt-in review panels with `--panel` / `--reviewers`, plus per-engine `--model` and `--thinking` where the selected engine supports it +- allows read-only tools and web search by default where the selected CLI supports them; forbids nested review in the prompt; Codex is run through `codex exec` with read-only sandbox and structured output, and Cursor is run through `cursor-agent --mode ask --workspace --sandbox ...` with session/request metadata echoed for traceability and a fail-closed workspace-instruction gate - prints `review still running: elapsed=s pid=` to stderr at long-running intervals while waiting for the selected review engine, unless streamed output or compact Codex activity has been visible recently - prints `autoreview clean: no accepted/actionable findings reported` when the selected review command exits 0 - exits nonzero when accepted/actionable findings are present diff --git a/skills/autoreview/scripts/autoreview b/skills/autoreview/scripts/autoreview index a5ea1d5..f85bee3 100755 --- a/skills/autoreview/scripts/autoreview +++ b/skills/autoreview/scripts/autoreview @@ -17,12 +17,14 @@ from pathlib import Path from typing import Any, Callable -ENGINES = ("codex", "claude", "droid", "copilot") +ENGINES = ("codex", "claude", "droid", "copilot", "cursor") +ALL_REVIEWERS = ("codex", "claude", "droid", "copilot") THINKING_LEVELS_BY_ENGINE = { "codex": {"low", "medium", "high", "xhigh"}, "claude": {"low", "medium", "high", "xhigh", "max"}, "droid": set(), "copilot": set(), + "cursor": set(), } @@ -346,6 +348,10 @@ def bounded_field(text: str, limit: int) -> str: return text[: max(0, limit - len(suffix))] + suffix +def env_truthy(name: str) -> bool: + return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"} + + def read_text(path: Path, limit: int = 40_000) -> str: try: data = path.read_bytes() @@ -629,6 +635,114 @@ def run_copilot(args: argparse.Namespace, repo: Path, prompt: str) -> str: return result.stdout +def cursor_result_event(text: str) -> dict[str, Any] | None: + stripped = text.strip() + if not stripped: + return None + try: + parsed = json.loads(stripped) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, dict) and parsed.get("type") == "result": + return parsed + for line in reversed(stripped.splitlines()): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(event, dict) and event.get("type") == "result": + return event + return None + + +def cursor_rule_warnings(repo: Path) -> list[str]: + warnings: list[str] = [] + if (repo / ".cursor" / "rules").exists(): + warnings.append("workspace .cursor/rules will be loaded") + for instruction_file in ("AGENTS.md", "CLAUDE.md"): + if (repo / instruction_file).exists(): + warnings.append(f"workspace {instruction_file} will be loaded") + local_mcp_paths = [path for path in (repo / ".cursor" / "mcp.json", repo / "mcp.json") if path.exists()] + if local_mcp_paths: + warnings.append("workspace MCP config will be discovered") + return warnings + + +def print_cursor_metadata(text: str) -> None: + event = cursor_result_event(text) + if not event: + return + parts: list[str] = [] + for key in ("session_id", "request_id"): + value = event.get(key) + if isinstance(value, str) and value: + parts.append(f"{key}={value}") + if parts: + print("cursor metadata: " + " ".join(parts), file=sys.stderr) + + +def run_cursor(args: argparse.Namespace, repo: Path, prompt: str) -> str: + if args.thinking: + raise SystemExit("--thinking is not supported by the cursor engine") + if not args.tools: + raise SystemExit("--no-tools is not supported by the cursor engine; Cursor review runs in Ask mode with read-only tool access") + if not args.web_search: + raise SystemExit("--no-web-search is not supported by the cursor engine; Cursor CLI does not expose a switch to disable web search for an agent run") + warnings = cursor_rule_warnings(repo) + local_mcp_config = any(path.exists() for path in (repo / ".cursor" / "mcp.json", repo / "mcp.json")) + if warnings and not args.cursor_allow_workspace_instructions: + raise SystemExit( + "cursor engine refused workspace instructions: " + + "; ".join(warnings) + + ". Re-run with --cursor-allow-workspace-instructions only for trusted repos." + ) + if local_mcp_config and not args.cursor_approve_mcps: + raise SystemExit( + "cursor engine refused local MCP config without approval: " + + "; ".join(path.name for path in (repo / ".cursor" / "mcp.json", repo / "mcp.json") if path.exists()) + + ". Re-run with --cursor-approve-mcps only for trusted repos." + ) + if warnings: + print( + "cursor isolation warning: " + + "; ".join(warnings) + + ". Cursor CLI does not document an ignore-rules/safe-mode equivalent for review runs.", + file=sys.stderr, + ) + cmd = [ + resolve_command(args.cursor_bin, repo), + "--print", + "--trust", + "--workspace", + str(repo), + "--mode", + "ask", + "--sandbox", + args.cursor_sandbox, + "--output-format", + "stream-json" if args.stream_engine_output else "json", + ] + if args.cursor_approve_mcps: + cmd.append("--approve-mcps") + if args.model: + cmd.extend(["--model", args.model]) + result = run_with_heartbeat( + cmd, + repo, + input_text=prompt, + label="cursor", + stream_output=args.stream_engine_output, + stream_display=CursorStreamDisplay() if args.stream_engine_output else None, + ) + print_cursor_metadata(result.stdout) + if result.returncode != 0: + raise SystemExit(f"cursor engine failed ({result.returncode})\n{result.stderr or result.stdout}") + return result.stdout + + class CodexStreamDisplay: def __init__(self, *, activity_seconds: int = 20) -> None: self.activity_seconds = activity_seconds @@ -748,6 +862,68 @@ class ClaudeStreamDisplay: return text +class CursorStreamDisplay: + def __init__(self, *, activity_seconds: int = 20) -> None: + self.activity_seconds = activity_seconds + self.hidden_events = 0 + self.last_visible = time.monotonic() + self.started = False + + def __call__(self, name: str, line: str) -> str | None: + if name != "stdout": + return line + try: + event = json.loads(line) + except json.JSONDecodeError: + return self.visible(line) + if not isinstance(event, dict): + return self.hidden_activity() + event_type = event.get("type") + if event_type == "system" and not self.started: + self.started = True + model = event.get("model") + suffix = f" model={model}" if isinstance(model, str) and model else "" + return self.visible(f"cursor turn started{suffix}\n") + if event_type == "assistant": + return self.assistant_message(event) + if event_type == "result": + request_id = event.get("request_id") + suffix = f" request_id={request_id}" if isinstance(request_id, str) and request_id else "" + return self.visible(self.flush_hidden() + format_cursor_usage(event.get("usage")) + suffix + "\n") + return self.hidden_activity() + + def assistant_message(self, event: dict[str, Any]) -> str | None: + message = event.get("message") + if not isinstance(message, dict): + return self.hidden_activity() + chunks: list[str] = [] + for item in message.get("content", []): + if not isinstance(item, dict): + continue + if item.get("type") == "text" and isinstance(item.get("text"), str): + chunks.append(item["text"].rstrip()) + if chunks: + return self.visible(self.flush_hidden() + "\n".join(chunks) + "\n") + return self.hidden_activity() + + def hidden_activity(self) -> str | None: + self.hidden_events += 1 + if time.monotonic() - self.last_visible < self.activity_seconds: + return None + return self.visible(self.flush_hidden()) + + def flush_hidden(self) -> str: + if not self.hidden_events: + return "" + count = self.hidden_events + self.hidden_events = 0 + return f"cursor activity: {count} hidden tool/status events\n" + + def visible(self, text: str) -> str: + self.last_visible = time.monotonic() + return text + + def format_codex_usage(usage: dict[str, Any]) -> str: fields = [ "input_tokens", @@ -759,6 +935,19 @@ def format_codex_usage(usage: dict[str, Any]) -> str: return "codex usage: " + " ".join(parts) if parts else "codex usage: unavailable" +def format_cursor_usage(usage: Any) -> str: + if not isinstance(usage, dict): + return "cursor usage: unavailable" + fields = [ + "inputTokens", + "outputTokens", + "cacheReadTokens", + "cacheWriteTokens", + ] + parts = [f"{field}={usage[field]}" for field in fields if isinstance(usage.get(field), int)] + return "cursor usage: " + " ".join(parts) if parts else "cursor usage: unavailable" + + def claude_allowed_tools(args: argparse.Namespace) -> str: tools = [tool.strip() for tool in args.claude_allowed_tools.split(",") if tool.strip()] if not args.web_search: @@ -773,19 +962,23 @@ def extract_json(text: str) -> dict[str, Any]: try: parsed = json.loads(stripped) except json.JSONDecodeError as exc: - fenced_report = parse_json_candidate(stripped) - if isinstance(fenced_report, dict) and "findings" in fenced_report: - return fenced_report jsonl_report = extract_json_from_jsonl(stripped) if jsonl_report: return jsonl_report + fenced_report = extract_findings_json_from_text(stripped) or parse_json_candidate(stripped) + if isinstance(fenced_report, dict) and "findings" in fenced_report: + return fenced_report raise SystemExit(f"review engine returned non-JSON output: {exc}\n{stripped[:2000]}") if isinstance(parsed, dict) and "findings" in parsed: return parsed if isinstance(parsed, dict) and isinstance(parsed.get("structured_output"), dict): return parsed["structured_output"] + if isinstance(parsed, dict) and isinstance(parsed.get("result"), dict): + result_object = parsed["result"] + if "findings" in result_object: + return result_object if isinstance(parsed, dict) and isinstance(parsed.get("result"), str): - result_json = parse_json_candidate(parsed["result"]) + result_json = extract_findings_json_from_text(parsed["result"]) or parse_json_candidate(parsed["result"]) if isinstance(result_json, dict) and "findings" in result_json: return result_json raise SystemExit(f"review engine result was not structured JSON:\n{parsed['result'][:2000]}") @@ -796,7 +989,9 @@ def extract_json(text: str) -> dict[str, Any]: def extract_json_from_jsonl(text: str) -> dict[str, Any] | None: + terminal_candidates: list[str | dict[str, Any]] = [] candidates: list[str | dict[str, Any]] = [] + assistant_candidates: list[str] = [] for line in text.splitlines(): line = line.strip() if not line: @@ -813,21 +1008,58 @@ def extract_json_from_jsonl(text: str) -> dict[str, Any] | None: data = event.get("data") if isinstance(data, dict) and isinstance(data.get("content"), str): candidates.append(data["content"]) + message = event.get("message") + if isinstance(message, dict): + for item in message.get("content", []): + if isinstance(item, dict) and item.get("type") == "text" and isinstance(item.get("text"), str): + assistant_candidates.append(item["text"]) if isinstance(event.get("result"), str): - candidates.append(event["result"]) + terminal_candidates.append(event["result"]) + if isinstance(event.get("result"), dict): + terminal_candidates.append(event["result"]) if isinstance(event.get("structured_output"), dict): - candidates.append(event["structured_output"]) + terminal_candidates.append(event["structured_output"]) + for candidate in reversed(terminal_candidates): + if isinstance(candidate, dict): + if "findings" in candidate: + return candidate + continue + parsed = extract_findings_json_from_text(candidate) or parse_json_candidate(candidate) + if isinstance(parsed, dict) and "findings" in parsed: + return parsed + if terminal_candidates: + raise SystemExit("review engine result was not structured JSON:\n" + str(terminal_candidates[-1])[:2000]) for candidate in reversed(candidates): if isinstance(candidate, dict): if "findings" in candidate: return candidate continue - parsed = parse_json_candidate(candidate) + parsed = extract_findings_json_from_text(candidate) or parse_json_candidate(candidate) + if isinstance(parsed, dict) and "findings" in parsed: + return parsed + for candidate in reversed(assistant_candidates): + parsed = extract_findings_json_from_text(candidate) or parse_json_candidate(candidate) if isinstance(parsed, dict) and "findings" in parsed: return parsed return None +def extract_findings_json_from_text(text: str) -> dict[str, Any] | None: + stripped = text.strip() + decoder = json.JSONDecoder() + parsed: dict[str, Any] | None = None + for index, char in enumerate(stripped): + if char != "{": + continue + try: + candidate, _ = decoder.raw_decode(stripped[index:]) + except json.JSONDecodeError: + continue + if isinstance(candidate, dict) and "findings" in candidate: + parsed = candidate + return parsed + + def parse_json_candidate(text: str) -> Any | None: stripped = text.strip() if stripped.startswith("```"): @@ -844,6 +1076,12 @@ def parse_json_candidate(text: str) -> Any | None: return parsed +def is_structured_output_failure(message: str) -> bool: + return message.startswith("review engine returned non-JSON output") or message.startswith( + "review engine result was not structured JSON" + ) + + def validate_report(report: dict[str, Any], repo: Path, changed_paths: set[str], required: list[str]) -> None: allowed_top = {"findings", "overall_correctness", "overall_explanation", "overall_confidence"} extra_top = set(report) - allowed_top @@ -985,7 +1223,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--claude-bin", default=os.environ.get("CLAUDE_BIN", "claude")) parser.add_argument("--droid-bin", default=os.environ.get("DROID_BIN", "droid")) parser.add_argument("--copilot-bin", default=os.environ.get("COPILOT_BIN", "copilot")) - parser.add_argument("--no-tools", dest="tools", action="store_false", default=True, help="Disable tools for engines that support it. Codex and copilot reject no-tools review.") + parser.add_argument("--cursor-bin", default=os.environ.get("CURSOR_BIN", "cursor-agent")) + parser.add_argument("--no-tools", dest="tools", action="store_false", default=True, help="Disable tools for engines that support it. Codex, copilot, and Cursor reject no-tools review.") parser.add_argument("--no-web-search", dest="web_search", action="store_false", default=True) parser.add_argument( "--claude-allowed-tools", @@ -1003,7 +1242,39 @@ def parse_args() -> argparse.Namespace: "--stream-engine-output", action="store_true", default=os.environ.get("AUTOREVIEW_STREAM_ENGINE_OUTPUT") == "1", - help="Stream review engine output while preserving buffered output for validation. Codex output is filtered to hide tool/file chatter.", + help="Stream review engine output while preserving buffered output for validation. Codex, Claude, and Cursor filter noisy tool/status chatter.", + ) + parser.add_argument( + "--cursor-sandbox", + choices=["enabled", "disabled"], + default=os.environ.get("AUTOREVIEW_CURSOR_SANDBOX", "enabled"), + help="Sandbox mode for Cursor review runs. Default: enabled.", + ) + parser.add_argument( + "--cursor-approve-mcps", + dest="cursor_approve_mcps", + action="store_true", + default=None, + help="Auto-approve Cursor MCP servers for trusted review environments.", + ) + parser.add_argument( + "--no-cursor-approve-mcps", + dest="cursor_approve_mcps", + action="store_false", + help="Disable Cursor MCP auto-approval even if AUTOREVIEW_CURSOR_APPROVE_MCPS is set.", + ) + parser.add_argument( + "--cursor-allow-workspace-instructions", + dest="cursor_allow_workspace_instructions", + action="store_true", + default=None, + help="Allow Cursor review to run when the repo contains .cursor/rules, AGENTS.md, CLAUDE.md, or local MCP config. Use only for trusted repos.", + ) + parser.add_argument( + "--no-cursor-allow-workspace-instructions", + dest="cursor_allow_workspace_instructions", + action="store_false", + help="Fail closed when Cursor would load workspace instructions or local MCP config.", ) parser.add_argument("--parallel-tests", help="Run a test command concurrently with review; failure fails the helper.") parser.add_argument( @@ -1016,6 +1287,10 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--expect-findings", action="store_true", help="Treat findings as success; for harness acceptance tests.") parser.add_argument("--dry-run", action="store_true") args = parser.parse_args() + if args.cursor_approve_mcps is None: + args.cursor_approve_mcps = env_truthy("AUTOREVIEW_CURSOR_APPROVE_MCPS") + if args.cursor_allow_workspace_instructions is None: + args.cursor_allow_workspace_instructions = env_truthy("AUTOREVIEW_CURSOR_ALLOW_WORKSPACE_INSTRUCTIONS") if args.engine not in ENGINES: raise SystemExit(f"invalid --engine/AUTOREVIEW_ENGINE: {args.engine}") return args @@ -1030,6 +1305,8 @@ def run_engine(args: argparse.Namespace, repo: Path, prompt: str) -> str: return run_droid(args, repo, prompt) if args.engine == "copilot": return run_copilot(args, repo, prompt) + if args.engine == "cursor": + return run_cursor(args, repo, prompt) raise SystemExit(f"unsupported engine: {args.engine}") @@ -1077,7 +1354,7 @@ def reviewer_args(args: argparse.Namespace) -> list[argparse.Namespace]: if args.reviewers: tokens = [token.strip() for token in args.reviewers.split(",") if token.strip()] if len(tokens) == 1 and tokens[0] == "all": - tokens = list(ENGINES) + tokens = list(ALL_REVIEWERS) reviewers = [parse_reviewer_token(token) for token in tokens] elif args.panel: engines = [args.engine] @@ -1117,10 +1394,21 @@ def reviewer_label(args: argparse.Namespace) -> str: def run_reviewer(args: argparse.Namespace, repo: Path, prompt: str, changed_paths: set[str], required: list[str]) -> dict[str, Any]: - raw = run_engine(args, repo, prompt) - report = extract_json(raw) - validate_report(report, repo, changed_paths, required) - return report + attempts = 3 if args.engine == "cursor" else 1 + for attempt in range(1, attempts + 1): + raw = run_engine(args, repo, prompt) + try: + report = extract_json(raw) + validate_report(report, repo, changed_paths, required) + return report + except SystemExit as exc: + if attempt >= attempts or not is_structured_output_failure(str(exc)): + raise + print( + f"retrying {args.engine} structured output validation after attempt {attempt}: {exc}", + file=sys.stderr, + ) + raise SystemExit(f"{args.engine} structured output validation failed after {attempts} attempts") def merge_panel_reports(reports: list[tuple[str, dict[str, Any]]]) -> dict[str, Any]: diff --git a/skills/autoreview/scripts/autoreview_test.py b/skills/autoreview/scripts/autoreview_test.py new file mode 100644 index 0000000..cedaa14 --- /dev/null +++ b/skills/autoreview/scripts/autoreview_test.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import importlib.util +import json +import tempfile +import unittest +from importlib.machinery import SourceFileLoader +from pathlib import Path + + +SCRIPT_PATH = Path(__file__).with_name("autoreview") +LOADER = SourceFileLoader("autoreview_module", str(SCRIPT_PATH)) +SPEC = importlib.util.spec_from_loader(LOADER.name, LOADER) +assert SPEC is not None +AUTOREVIEW = importlib.util.module_from_spec(SPEC) +LOADER.exec_module(AUTOREVIEW) + + +FINAL_REPORT = { + "findings": [], + "overall_correctness": "patch is correct", + "overall_explanation": "clean", + "overall_confidence": 0.9, +} + +DRAFT_REPORT = { + "findings": [ + { + "title": "Draft finding", + "body": "draft", + "priority": "P3", + "confidence": 0.2, + "category": "maintainability", + "code_location": {"file_path": "draft.js", "line": 1}, + } + ], + "overall_correctness": "patch is incorrect", + "overall_explanation": "draft", + "overall_confidence": 0.2, +} + + +class AutoreviewCursorTests(unittest.TestCase): + def test_extract_json_prefers_terminal_result_event(self) -> None: + stream = "\n".join( + [ + json.dumps( + { + "type": "assistant", + "message": {"role": "assistant", "content": [{"type": "text", "text": json.dumps(DRAFT_REPORT)}]}, + } + ), + json.dumps( + { + "type": "result", + "subtype": "success", + "result": json.dumps(FINAL_REPORT), + "session_id": "session-id", + "request_id": "request-id", + } + ), + ] + ) + self.assertEqual(AUTOREVIEW.extract_json(stream), FINAL_REPORT) + + def test_extract_json_can_fallback_to_assistant_message(self) -> None: + stream = json.dumps( + { + "type": "assistant", + "message": {"role": "assistant", "content": [{"type": "text", "text": json.dumps(FINAL_REPORT)}]}, + } + ) + self.assertEqual(AUTOREVIEW.extract_json(stream), FINAL_REPORT) + + def test_extract_json_does_not_fallback_past_bad_terminal_result(self) -> None: + stream = "\n".join( + [ + json.dumps( + { + "type": "assistant", + "message": {"role": "assistant", "content": [{"type": "text", "text": json.dumps(FINAL_REPORT)}]}, + } + ), + json.dumps( + { + "type": "result", + "subtype": "success", + "result": "not json", + } + ), + ] + ) + with self.assertRaises(SystemExit) as exc_info: + AUTOREVIEW.extract_json(stream) + self.assertIn("review engine result was not structured JSON", str(exc_info.exception)) + + def test_extract_json_accepts_dict_result_payload(self) -> None: + payload = { + "type": "result", + "subtype": "success", + "result": FINAL_REPORT, + "session_id": "session-id", + "request_id": "request-id", + } + self.assertEqual(AUTOREVIEW.extract_json(json.dumps(payload)), FINAL_REPORT) + + def test_extract_json_accepts_result_string_with_preamble(self) -> None: + payload = { + "type": "result", + "subtype": "success", + "result": "Inspecting the diff first.\n" + json.dumps(FINAL_REPORT), + } + self.assertEqual(AUTOREVIEW.extract_json(json.dumps(payload)), FINAL_REPORT) + + def test_extract_findings_json_from_text_prefers_last_findings_object(self) -> None: + later_report = { + "findings": [ + { + "title": "Later finding", + "body": "later", + "priority": "P2", + "confidence": 0.8, + "category": "bug", + "code_location": {"file_path": "later.js", "line": 2}, + } + ], + "overall_correctness": "patch is incorrect", + "overall_explanation": "later", + "overall_confidence": 0.8, + } + text = f"{json.dumps(FINAL_REPORT)} separator {json.dumps(later_report)}" + self.assertEqual(AUTOREVIEW.extract_findings_json_from_text(text), later_report) + + def test_retry_filter_only_matches_parse_failures(self) -> None: + self.assertTrue(AUTOREVIEW.is_structured_output_failure("review engine returned non-JSON output: nope")) + self.assertTrue(AUTOREVIEW.is_structured_output_failure("review engine result was not structured JSON:\nnope")) + self.assertFalse(AUTOREVIEW.is_structured_output_failure("review JSON missing required key: findings")) + self.assertFalse(AUTOREVIEW.is_structured_output_failure("finding 0 has invalid priority")) + + def test_cursor_workspace_instructions_fail_closed(self) -> None: + with tempfile.TemporaryDirectory(prefix="autoreview-cursor-test.") as tmpdir: + repo = Path(tmpdir) + (repo / "AGENTS.md").write_text("fixture\n") + args = argparse.Namespace( + thinking=None, + tools=True, + web_search=True, + cursor_allow_workspace_instructions=False, + cursor_approve_mcps=False, + cursor_bin="cursor-agent", + cursor_sandbox="enabled", + model="auto", + stream_engine_output=False, + ) + with self.assertRaises(SystemExit) as exc_info: + AUTOREVIEW.run_cursor(args, repo, "prompt") + self.assertIn("cursor engine refused workspace instructions", str(exc_info.exception)) + + def test_cursor_local_mcp_requires_explicit_approval(self) -> None: + with tempfile.TemporaryDirectory(prefix="autoreview-cursor-test.") as tmpdir: + repo = Path(tmpdir) + (repo / ".cursor").mkdir() + (repo / ".cursor" / "mcp.json").write_text("{}\n") + args = argparse.Namespace( + thinking=None, + tools=True, + web_search=True, + cursor_allow_workspace_instructions=True, + cursor_approve_mcps=False, + cursor_bin="cursor-agent", + cursor_sandbox="enabled", + model="auto", + stream_engine_output=False, + ) + with self.assertRaises(SystemExit) as exc_info: + AUTOREVIEW.run_cursor(args, repo, "prompt") + self.assertIn("cursor engine refused local MCP config without approval", str(exc_info.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/skills/autoreview/scripts/test-review-harness.ps1 b/skills/autoreview/scripts/test-review-harness.ps1 index ffca686..cc65442 100644 --- a/skills/autoreview/scripts/test-review-harness.ps1 +++ b/skills/autoreview/scripts/test-review-harness.ps1 @@ -3,7 +3,7 @@ param( [ValidateSet('malicious', 'benign')] [string] $Fixture, - [ValidateSet('codex', 'claude', 'droid', 'copilot')] + [ValidateSet('codex', 'claude', 'droid', 'copilot', 'cursor')] [string[]] $Engine, [Alias('h')] diff --git a/skills/autoreview/scripts/test-review-harness.py b/skills/autoreview/scripts/test-review-harness.py index 7f1baaa..7498c6e 100644 --- a/skills/autoreview/scripts/test-review-harness.py +++ b/skills/autoreview/scripts/test-review-harness.py @@ -12,7 +12,7 @@ from pathlib import Path -ENGINES = ("codex", "claude", "droid", "copilot") +ENGINES = ("codex", "claude", "droid", "copilot", "cursor") DEFAULT_ENGINES = ("codex", "claude") MALICIOUS_INITIAL = """export function uploadPath(name) {