aws-samples · melanie531 · May 19, 2026 · May 19, 2026
diff --git a/skill_eval/agent_runner.py b/skill_eval/agent_runner.py
@@ -11,12 +11,15 @@
 
 import abc
 import json
+import logging
 import shutil
 import subprocess
 import time
 from pathlib import Path
 from typing import Optional
 
+LOG = logging.getLogger("skill_eval.agent_runner")
+
 
 # ---------------------------------------------------------------------------
 # Abstract interface
@@ -183,6 +186,9 @@ def run_prompt(
         if output_format == "stream-json":
             cmd.extend(["--output-format", "stream-json", "--verbose"])
 
+        LOG.debug("Claude command: %s", " ".join(cmd))
+        LOG.debug("Working directory: %s", workspace_dir or "(inherited)")
+
         start = time.monotonic()
         try:
             result = subprocess.run(
@@ -193,12 +199,18 @@ def run_prompt(
                 cwd=workspace_dir,
             )
             elapsed = time.monotonic() - start
+            LOG.debug("Claude exited with code %d in %.1fs", result.returncode, elapsed)
+            LOG.debug("Claude stdout: %s", result.stdout[:5000])
+            if result.stderr:
+                LOG.debug("Claude stderr: %s", result.stderr[:2000])
             return result.stdout, result.stderr, result.returncode, elapsed
         except subprocess.TimeoutExpired:
             elapsed = time.monotonic() - start
+            LOG.debug("Claude timed out after %ds", timeout)
             return "", f"Timed out after {timeout}s", -1, elapsed
         except FileNotFoundError:
             elapsed = time.monotonic() - start
+            LOG.debug("Claude CLI not found on PATH")
             return "", "claude CLI not found", -1, elapsed
 
     def parse_output(self, raw: str) -> dict:

diff --git a/skill_eval/cli.py b/skill_eval/cli.py
@@ -6,6 +6,8 @@
 import sys
 from pathlib import Path
 
+from skill_eval.logging_config import configure_logging
+
 from skill_eval.schemas import AuditReport, Finding, calculate_score, calculate_grade
 from skill_eval.audit.structure_check import check_structure
 from skill_eval.audit.security_scan import scan_security, SAFE_DOMAINS
@@ -110,6 +112,11 @@ def main(argv: list[str] | None = None) -> int:
         prog="skill-eval",
         description="Agent Skill Security & Quality Evaluation",
     )
+    # Global flags
+    parser.add_argument("--debug", action="store_true",
+                        help="Emit debug logs to stderr")
+    parser.add_argument("--debug-log", type=str, default=None, metavar="FILE",
+                        help="Write debug logs to FILE")
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
 
     # audit command
@@ -250,6 +257,9 @@ def main(argv: list[str] | None = None) -> int:
 
     args = parser.parse_args(argv)
 
+    # Configure logging before dispatching to any subcommand
+    configure_logging(debug=args.debug, debug_log=args.debug_log)
+
     if not args.command:
         parser.print_help()
         return 0

diff --git a/skill_eval/functional.py b/skill_eval/functional.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import math
 import shutil
 import sys
@@ -23,6 +24,8 @@
 from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner
 from skill_eval.grading import grade_output
 
+LOG = logging.getLogger("skill_eval.functional")
+
 
 # ---------------------------------------------------------------------------
 # Public entry point
@@ -55,6 +58,9 @@ def run_functional_eval(
     """
     path = Path(skill_path).resolve()
 
+    LOG.debug("Starting functional eval for skill: %s", path)
+    LOG.debug("Runs per eval: %d, timeout: %ds, agent: %s", runs_per_eval, timeout, agent)
+
     # Load evals
     evals_file = Path(evals_path) if evals_path else path / "evals" / "evals.json"
     try:
@@ -243,6 +249,9 @@ def _execute_eval_pair(
         with_workspace = Path(with_tmpdir)
         without_workspace = Path(without_tmpdir)
 
+        LOG.debug("With-skill workspace: %s", with_workspace)
+        LOG.debug("Without-skill workspace: %s", without_workspace)
+
         # Copy eval case files into both workspaces
         for ws in (with_workspace, without_workspace):
             for rel_file in eval_case.files:
@@ -251,6 +260,7 @@ def _execute_eval_pair(
                 if src.is_file():
                     dst.parent.mkdir(parents=True, exist_ok=True)
                     shutil.copy2(src, dst)
+                    LOG.debug("Copied eval file %s -> %s", src, dst)
 
         # Copy skill resource directories into with-skill workspace only,
         # so the agent can access scripts/, references/, and assets/ as
@@ -262,10 +272,12 @@ def _execute_eval_pair(
                 if src_dir.is_dir():
                     dst_dir = with_workspace / subdir
                     shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)
+                    LOG.debug("Copied skill dir %s -> %s", src_dir, dst_dir)
             # Also copy SKILL.md so the agent can read it if needed
             _skill_md = _skill / "SKILL.md"
             if _skill_md.is_file():
                 shutil.copy2(_skill_md, with_workspace / "SKILL.md")
+                LOG.debug("Copied SKILL.md to workspace")
             # Copy .claude/ settings directory so the agent inherits
             # project-level configuration (e.g., AWS profile, MCP servers,
             # permission settings) when running from the temp workspace.
@@ -274,6 +286,7 @@ def _execute_eval_pair(
                 shutil.copytree(
                     _claude_dir, with_workspace / ".claude", dirs_exist_ok=True
                 )
+                LOG.debug("Copied .claude/ settings to workspace")
 
         # Run WITH skill
         with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt(

diff --git a/skill_eval/logging_config.py b/skill_eval/logging_config.py
@@ -0,0 +1,46 @@
+"""Logging configuration for skill-eval CLI."""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+
+LOG = logging.getLogger("skill_eval")
+
+
+def configure_logging(
+    debug: bool = False,
+    debug_log: str | None = None,
+) -> None:
+    """Configure logging based on CLI flags.
+
+    Args:
+        debug: If True, emit DEBUG logs to stderr.
+        debug_log: If set, write DEBUG logs to this file path.
+    """
+    # If neither flag is set, keep logging quiet (WARNING only)
+    if not debug and not debug_log:
+        logging.basicConfig(level=logging.WARNING, force=True)
+        return
+
+    handlers: list[logging.Handler] = []
+
+    if debug:
+        stderr_handler = logging.StreamHandler(sys.stderr)
+        stderr_handler.setLevel(logging.DEBUG)
+        stderr_handler.setFormatter(logging.Formatter(
+            "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+            datefmt="%H:%M:%S",
+        ))
+        handlers.append(stderr_handler)
+
+    if debug_log:
+        file_handler = logging.FileHandler(debug_log, mode="w")
+        file_handler.setLevel(logging.DEBUG)
+        file_handler.setFormatter(logging.Formatter(
+            "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        ))
+        handlers.append(file_handler)
+
+    logging.basicConfig(level=logging.DEBUG, handlers=handlers, force=True)
diff --git a/skill_eval/trigger.py b/skill_eval/trigger.py
@@ -7,6 +7,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import sys
 from pathlib import Path
 from typing import Optional
@@ -15,6 +16,8 @@
 from skill_eval.eval_schemas import TriggerQuery, TriggerQueryResult, TriggerReport
 from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner
 
+LOG = logging.getLogger("skill_eval.trigger")
+
 
 # ---------------------------------------------------------------------------
 # Public entry point
@@ -47,6 +50,9 @@ def run_trigger_eval(
     """
     path = Path(skill_path).resolve()
 
+    LOG.debug("Starting trigger eval for skill: %s", path)
+    LOG.debug("Runs per query: %d, timeout: %ds, agent: %s", runs_per_query, timeout, agent)
+
     # Load queries
     queries_file = Path(queries_path) if queries_path else path / "evals" / "eval_queries.json"
     try:
@@ -163,6 +169,8 @@ def _run_trigger_query(
     if runner is None:
         runner = get_runner("claude")
 
+    LOG.debug("Running trigger query (%d runs): %s", runs, query.query[:100])
+
     trigger_count = 0
     all_input_tokens: list[int] = []
     all_output_tokens: list[int] = []
@@ -177,6 +185,17 @@ def _run_trigger_query(
         if rc == 0 and stdout.strip():
             parsed = runner.parse_output(stdout)
             signal = _classify_trigger_signal(parsed, skill_path)
+            LOG.debug(
+                "Query: %s | signal=%s | tool_calls=%d | text_preview=%s",
+                query.query[:60],
+                signal,
+                len(parsed.get("tool_calls", [])),
+                parsed.get("text", "")[:200],
+            )
+            LOG.debug("Tool calls: %s", [
+                {"name": tc.get("name"), "input": tc.get("input", {})}
+                for tc in parsed.get("tool_calls", [])
+            ])
             # For should_trigger=false queries, only count strong (tool-based)
             # signals as triggers.  Text-only mentions are too noisy for
             # negative queries — an agent may casually mention a skill name
@@ -190,6 +209,8 @@ def _run_trigger_query(
             tc = parsed["token_counts"]
             all_input_tokens.append(tc.get("input_tokens", 0))
             all_output_tokens.append(tc.get("output_tokens", 0))
+        else:
+            LOG.debug("Claude returned rc=%d, stdout empty=%s", rc, not stdout.strip())
 
     trigger_rate = trigger_count / runs if runs > 0 else 0.0