diff --git a/skill_eval/agent_runner.py b/skill_eval/agent_runner.py index d8da247..9628c73 100644 --- a/skill_eval/agent_runner.py +++ b/skill_eval/agent_runner.py @@ -11,12 +11,15 @@ import abc import json +import logging import shutil import subprocess import time from pathlib import Path from typing import Optional +LOG = logging.getLogger("skill_eval.agent_runner") + # --------------------------------------------------------------------------- # Abstract interface @@ -183,6 +186,9 @@ def run_prompt( if output_format == "stream-json": cmd.extend(["--output-format", "stream-json", "--verbose"]) + LOG.debug("Claude command: %s", " ".join(cmd)) + LOG.debug("Working directory: %s", workspace_dir or "(inherited)") + start = time.monotonic() try: result = subprocess.run( @@ -193,12 +199,18 @@ def run_prompt( cwd=workspace_dir, ) elapsed = time.monotonic() - start + LOG.debug("Claude exited with code %d in %.1fs", result.returncode, elapsed) + LOG.debug("Claude stdout: %s", result.stdout[:5000]) + if result.stderr: + LOG.debug("Claude stderr: %s", result.stderr[:2000]) return result.stdout, result.stderr, result.returncode, elapsed except subprocess.TimeoutExpired: elapsed = time.monotonic() - start + LOG.debug("Claude timed out after %ds", timeout) return "", f"Timed out after {timeout}s", -1, elapsed except FileNotFoundError: elapsed = time.monotonic() - start + LOG.debug("Claude CLI not found on PATH") return "", "claude CLI not found", -1, elapsed def parse_output(self, raw: str) -> dict: diff --git a/skill_eval/cli.py b/skill_eval/cli.py index fa60d57..8d43f46 100644 --- a/skill_eval/cli.py +++ b/skill_eval/cli.py @@ -6,6 +6,8 @@ import sys from pathlib import Path +from skill_eval.logging_config import configure_logging + from skill_eval.schemas import AuditReport, Finding, calculate_score, calculate_grade from skill_eval.audit.structure_check import check_structure from skill_eval.audit.security_scan import scan_security, SAFE_DOMAINS @@ -110,6 +112,11 @@ def main(argv: list[str] | None = None) -> int: prog="skill-eval", description="Agent Skill Security & Quality Evaluation", ) + # Global flags + parser.add_argument("--debug", action="store_true", + help="Emit debug logs to stderr") + parser.add_argument("--debug-log", type=str, default=None, metavar="FILE", + help="Write debug logs to FILE") subparsers = parser.add_subparsers(dest="command", help="Available commands") # audit command @@ -250,6 +257,9 @@ def main(argv: list[str] | None = None) -> int: args = parser.parse_args(argv) + # Configure logging before dispatching to any subcommand + configure_logging(debug=args.debug, debug_log=args.debug_log) + if not args.command: parser.print_help() return 0 diff --git a/skill_eval/functional.py b/skill_eval/functional.py index 3a79924..7613563 100644 --- a/skill_eval/functional.py +++ b/skill_eval/functional.py @@ -8,6 +8,7 @@ from __future__ import annotations import json +import logging import math import shutil import sys @@ -23,6 +24,8 @@ from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner from skill_eval.grading import grade_output +LOG = logging.getLogger("skill_eval.functional") + # --------------------------------------------------------------------------- # Public entry point @@ -55,6 +58,9 @@ def run_functional_eval( """ path = Path(skill_path).resolve() + LOG.debug("Starting functional eval for skill: %s", path) + LOG.debug("Runs per eval: %d, timeout: %ds, agent: %s", runs_per_eval, timeout, agent) + # Load evals evals_file = Path(evals_path) if evals_path else path / "evals" / "evals.json" try: @@ -243,6 +249,9 @@ def _execute_eval_pair( with_workspace = Path(with_tmpdir) without_workspace = Path(without_tmpdir) + LOG.debug("With-skill workspace: %s", with_workspace) + LOG.debug("Without-skill workspace: %s", without_workspace) + # Copy eval case files into both workspaces for ws in (with_workspace, without_workspace): for rel_file in eval_case.files: @@ -251,6 +260,7 @@ def _execute_eval_pair( if src.is_file(): dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) + LOG.debug("Copied eval file %s -> %s", src, dst) # Copy skill resource directories into with-skill workspace only, # so the agent can access scripts/, references/, and assets/ as @@ -262,10 +272,12 @@ def _execute_eval_pair( if src_dir.is_dir(): dst_dir = with_workspace / subdir shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True) + LOG.debug("Copied skill dir %s -> %s", src_dir, dst_dir) # Also copy SKILL.md so the agent can read it if needed _skill_md = _skill / "SKILL.md" if _skill_md.is_file(): shutil.copy2(_skill_md, with_workspace / "SKILL.md") + LOG.debug("Copied SKILL.md to workspace") # Copy .claude/ settings directory so the agent inherits # project-level configuration (e.g., AWS profile, MCP servers, # permission settings) when running from the temp workspace. @@ -274,6 +286,7 @@ def _execute_eval_pair( shutil.copytree( _claude_dir, with_workspace / ".claude", dirs_exist_ok=True ) + LOG.debug("Copied .claude/ settings to workspace") # Run WITH skill with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt( diff --git a/skill_eval/logging_config.py b/skill_eval/logging_config.py new file mode 100644 index 0000000..242b416 --- /dev/null +++ b/skill_eval/logging_config.py @@ -0,0 +1,46 @@ +"""Logging configuration for skill-eval CLI.""" + +from __future__ import annotations + +import logging +import sys + + +LOG = logging.getLogger("skill_eval") + + +def configure_logging( + debug: bool = False, + debug_log: str | None = None, +) -> None: + """Configure logging based on CLI flags. + + Args: + debug: If True, emit DEBUG logs to stderr. + debug_log: If set, write DEBUG logs to this file path. + """ + # If neither flag is set, keep logging quiet (WARNING only) + if not debug and not debug_log: + logging.basicConfig(level=logging.WARNING, force=True) + return + + handlers: list[logging.Handler] = [] + + if debug: + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.DEBUG) + stderr_handler.setFormatter(logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%H:%M:%S", + )) + handlers.append(stderr_handler) + + if debug_log: + file_handler = logging.FileHandler(debug_log, mode="w") + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", + )) + handlers.append(file_handler) + + logging.basicConfig(level=logging.DEBUG, handlers=handlers, force=True) diff --git a/skill_eval/trigger.py b/skill_eval/trigger.py index 3697347..36c8a89 100644 --- a/skill_eval/trigger.py +++ b/skill_eval/trigger.py @@ -7,6 +7,7 @@ from __future__ import annotations import json +import logging import sys from pathlib import Path from typing import Optional @@ -15,6 +16,8 @@ from skill_eval.eval_schemas import TriggerQuery, TriggerQueryResult, TriggerReport from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner +LOG = logging.getLogger("skill_eval.trigger") + # --------------------------------------------------------------------------- # Public entry point @@ -47,6 +50,9 @@ def run_trigger_eval( """ path = Path(skill_path).resolve() + LOG.debug("Starting trigger eval for skill: %s", path) + LOG.debug("Runs per query: %d, timeout: %ds, agent: %s", runs_per_query, timeout, agent) + # Load queries queries_file = Path(queries_path) if queries_path else path / "evals" / "eval_queries.json" try: @@ -163,6 +169,8 @@ def _run_trigger_query( if runner is None: runner = get_runner("claude") + LOG.debug("Running trigger query (%d runs): %s", runs, query.query[:100]) + trigger_count = 0 all_input_tokens: list[int] = [] all_output_tokens: list[int] = [] @@ -177,6 +185,17 @@ def _run_trigger_query( if rc == 0 and stdout.strip(): parsed = runner.parse_output(stdout) signal = _classify_trigger_signal(parsed, skill_path) + LOG.debug( + "Query: %s | signal=%s | tool_calls=%d | text_preview=%s", + query.query[:60], + signal, + len(parsed.get("tool_calls", [])), + parsed.get("text", "")[:200], + ) + LOG.debug("Tool calls: %s", [ + {"name": tc.get("name"), "input": tc.get("input", {})} + for tc in parsed.get("tool_calls", []) + ]) # For should_trigger=false queries, only count strong (tool-based) # signals as triggers. Text-only mentions are too noisy for # negative queries — an agent may casually mention a skill name @@ -190,6 +209,8 @@ def _run_trigger_query( tc = parsed["token_counts"] all_input_tokens.append(tc.get("input_tokens", 0)) all_output_tokens.append(tc.get("output_tokens", 0)) + else: + LOG.debug("Claude returned rc=%d, stdout empty=%s", rc, not stdout.strip()) trigger_rate = trigger_count / runs if runs > 0 else 0.0