Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions skill_eval/agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@

import abc
import json
import logging
import shutil
import subprocess
import time
from pathlib import Path
from typing import Optional

LOG = logging.getLogger("skill_eval.agent_runner")


# ---------------------------------------------------------------------------
# Abstract interface
Expand Down Expand Up @@ -183,6 +186,9 @@ def run_prompt(
if output_format == "stream-json":
cmd.extend(["--output-format", "stream-json", "--verbose"])

LOG.debug("Claude command: %s", " ".join(cmd))
LOG.debug("Working directory: %s", workspace_dir or "(inherited)")

start = time.monotonic()
try:
result = subprocess.run(
Expand All @@ -193,12 +199,18 @@ def run_prompt(
cwd=workspace_dir,
)
elapsed = time.monotonic() - start
LOG.debug("Claude exited with code %d in %.1fs", result.returncode, elapsed)
LOG.debug("Claude stdout: %s", result.stdout[:5000])
if result.stderr:
LOG.debug("Claude stderr: %s", result.stderr[:2000])
return result.stdout, result.stderr, result.returncode, elapsed
except subprocess.TimeoutExpired:
elapsed = time.monotonic() - start
LOG.debug("Claude timed out after %ds", timeout)
return "", f"Timed out after {timeout}s", -1, elapsed
except FileNotFoundError:
elapsed = time.monotonic() - start
LOG.debug("Claude CLI not found on PATH")
return "", "claude CLI not found", -1, elapsed

def parse_output(self, raw: str) -> dict:
Expand Down
10 changes: 10 additions & 0 deletions skill_eval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import sys
from pathlib import Path

from skill_eval.logging_config import configure_logging

from skill_eval.schemas import AuditReport, Finding, calculate_score, calculate_grade
from skill_eval.audit.structure_check import check_structure
from skill_eval.audit.security_scan import scan_security, SAFE_DOMAINS
Expand Down Expand Up @@ -110,6 +112,11 @@ def main(argv: list[str] | None = None) -> int:
prog="skill-eval",
description="Agent Skill Security & Quality Evaluation",
)
# Global flags
parser.add_argument("--debug", action="store_true",
help="Emit debug logs to stderr")
parser.add_argument("--debug-log", type=str, default=None, metavar="FILE",
help="Write debug logs to FILE")
subparsers = parser.add_subparsers(dest="command", help="Available commands")

# audit command
Expand Down Expand Up @@ -250,6 +257,9 @@ def main(argv: list[str] | None = None) -> int:

args = parser.parse_args(argv)

# Configure logging before dispatching to any subcommand
configure_logging(debug=args.debug, debug_log=args.debug_log)

if not args.command:
parser.print_help()
return 0
Expand Down
13 changes: 13 additions & 0 deletions skill_eval/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import annotations

import json
import logging
import math
import shutil
import sys
Expand All @@ -23,6 +24,8 @@
from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner
from skill_eval.grading import grade_output

LOG = logging.getLogger("skill_eval.functional")


# ---------------------------------------------------------------------------
# Public entry point
Expand Down Expand Up @@ -55,6 +58,9 @@ def run_functional_eval(
"""
path = Path(skill_path).resolve()

LOG.debug("Starting functional eval for skill: %s", path)
LOG.debug("Runs per eval: %d, timeout: %ds, agent: %s", runs_per_eval, timeout, agent)

# Load evals
evals_file = Path(evals_path) if evals_path else path / "evals" / "evals.json"
try:
Expand Down Expand Up @@ -243,6 +249,9 @@ def _execute_eval_pair(
with_workspace = Path(with_tmpdir)
without_workspace = Path(without_tmpdir)

LOG.debug("With-skill workspace: %s", with_workspace)
LOG.debug("Without-skill workspace: %s", without_workspace)

# Copy eval case files into both workspaces
for ws in (with_workspace, without_workspace):
for rel_file in eval_case.files:
Expand All @@ -251,6 +260,7 @@ def _execute_eval_pair(
if src.is_file():
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)
LOG.debug("Copied eval file %s -> %s", src, dst)

# Copy skill resource directories into with-skill workspace only,
# so the agent can access scripts/, references/, and assets/ as
Expand All @@ -262,10 +272,12 @@ def _execute_eval_pair(
if src_dir.is_dir():
dst_dir = with_workspace / subdir
shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)
LOG.debug("Copied skill dir %s -> %s", src_dir, dst_dir)
# Also copy SKILL.md so the agent can read it if needed
_skill_md = _skill / "SKILL.md"
if _skill_md.is_file():
shutil.copy2(_skill_md, with_workspace / "SKILL.md")
LOG.debug("Copied SKILL.md to workspace")
# Copy .claude/ settings directory so the agent inherits
# project-level configuration (e.g., AWS profile, MCP servers,
# permission settings) when running from the temp workspace.
Expand All @@ -274,6 +286,7 @@ def _execute_eval_pair(
shutil.copytree(
_claude_dir, with_workspace / ".claude", dirs_exist_ok=True
)
LOG.debug("Copied .claude/ settings to workspace")

# Run WITH skill
with_stdout, with_stderr, with_rc, with_elapsed = runner.run_prompt(
Expand Down
46 changes: 46 additions & 0 deletions skill_eval/logging_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Logging configuration for skill-eval CLI."""

from __future__ import annotations

import logging
import sys


LOG = logging.getLogger("skill_eval")


def configure_logging(
debug: bool = False,
debug_log: str | None = None,
) -> None:
"""Configure logging based on CLI flags.

Args:
debug: If True, emit DEBUG logs to stderr.
debug_log: If set, write DEBUG logs to this file path.
"""
# If neither flag is set, keep logging quiet (WARNING only)
if not debug and not debug_log:
logging.basicConfig(level=logging.WARNING, force=True)
return

handlers: list[logging.Handler] = []

if debug:
stderr_handler = logging.StreamHandler(sys.stderr)
stderr_handler.setLevel(logging.DEBUG)
stderr_handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s",
datefmt="%H:%M:%S",
))
handlers.append(stderr_handler)

if debug_log:
file_handler = logging.FileHandler(debug_log, mode="w")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s",
))
handlers.append(file_handler)

logging.basicConfig(level=logging.DEBUG, handlers=handlers, force=True)
21 changes: 21 additions & 0 deletions skill_eval/trigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from __future__ import annotations

import json
import logging
import sys
from pathlib import Path
from typing import Optional
Expand All @@ -15,6 +16,8 @@
from skill_eval.eval_schemas import TriggerQuery, TriggerQueryResult, TriggerReport
from skill_eval.agent_runner import AgentRunner, AgentNotAvailableError, get_runner

LOG = logging.getLogger("skill_eval.trigger")


# ---------------------------------------------------------------------------
# Public entry point
Expand Down Expand Up @@ -47,6 +50,9 @@ def run_trigger_eval(
"""
path = Path(skill_path).resolve()

LOG.debug("Starting trigger eval for skill: %s", path)
LOG.debug("Runs per query: %d, timeout: %ds, agent: %s", runs_per_query, timeout, agent)

# Load queries
queries_file = Path(queries_path) if queries_path else path / "evals" / "eval_queries.json"
try:
Expand Down Expand Up @@ -163,6 +169,8 @@ def _run_trigger_query(
if runner is None:
runner = get_runner("claude")

LOG.debug("Running trigger query (%d runs): %s", runs, query.query[:100])

trigger_count = 0
all_input_tokens: list[int] = []
all_output_tokens: list[int] = []
Expand All @@ -177,6 +185,17 @@ def _run_trigger_query(
if rc == 0 and stdout.strip():
parsed = runner.parse_output(stdout)
signal = _classify_trigger_signal(parsed, skill_path)
LOG.debug(
"Query: %s | signal=%s | tool_calls=%d | text_preview=%s",
query.query[:60],
signal,
len(parsed.get("tool_calls", [])),
parsed.get("text", "")[:200],
)
LOG.debug("Tool calls: %s", [
{"name": tc.get("name"), "input": tc.get("input", {})}
for tc in parsed.get("tool_calls", [])
])
# For should_trigger=false queries, only count strong (tool-based)
# signals as triggers. Text-only mentions are too noisy for
# negative queries — an agent may casually mention a skill name
Expand All @@ -190,6 +209,8 @@ def _run_trigger_query(
tc = parsed["token_counts"]
all_input_tokens.append(tc.get("input_tokens", 0))
all_output_tokens.append(tc.get("output_tokens", 0))
else:
LOG.debug("Claude returned rc=%d, stdout empty=%s", rc, not stdout.strip())

trigger_rate = trigger_count / runs if runs > 0 else 0.0

Expand Down
Loading