diff --git a/skill_eval/cli.py b/skill_eval/cli.py index 8d43f46..c7287d9 100644 --- a/skill_eval/cli.py +++ b/skill_eval/cli.py @@ -220,6 +220,10 @@ def main(argv: list[str] | None = None) -> int: help="Agent runner to use (default: claude)") report_parser.add_argument("--include-all", action="store_true", help="Audit scans entire directory tree instead of just skill-standard directories") + report_parser.add_argument("--runs-functional", type=int, default=1, + help="Number of runs per functional eval case (default: 1)") + report_parser.add_argument("--runs-trigger", type=int, default=3, + help="Number of runs per trigger query (default: 3)") # compare command compare_parser = subparsers.add_parser("compare", @@ -386,6 +390,8 @@ def main(argv: list[str] | None = None) -> int: timeout=args.timeout, agent=args.agent, include_all=args.include_all, + runs_functional=args.runs_functional, + runs_trigger=args.runs_trigger, ) elif args.command == "compare": diff --git a/skill_eval/unified_report.py b/skill_eval/unified_report.py index c8cf61f..b3bbf3c 100644 --- a/skill_eval/unified_report.py +++ b/skill_eval/unified_report.py @@ -81,6 +81,8 @@ def run_unified_report( timeout: int = 120, agent: str = "claude", include_all: bool = False, + runs_functional: int = 1, + runs_trigger: int = 3, ) -> int: """Run all applicable evaluations and produce a unified report. @@ -95,6 +97,8 @@ def run_unified_report( timeout: Timeout per agent invocation in seconds. agent: Name of the registered agent runner. include_all: If True, audit scans entire directory tree. + runs_functional: Number of runs per functional eval case. + runs_trigger: Number of runs per trigger query. Returns: Exit code: 0 = passed, 1 = failed, 2 = error. @@ -132,7 +136,7 @@ def run_unified_report( evals_file = path / "evals" / "evals.json" if include_functional and evals_file.is_file(): try: - func_result = _run_functional(str(path), dry_run, timeout, agent) + func_result = _run_functional(str(path), dry_run, timeout, agent, runs_functional) if func_result is not None: functional_norm = func_result["overall"] func_section = { @@ -158,7 +162,7 @@ def run_unified_report( queries_file = path / "evals" / "eval_queries.json" if include_trigger and queries_file.is_file(): try: - trigger_result = _run_trigger(str(path), dry_run, timeout, agent) + trigger_result = _run_trigger(str(path), dry_run, timeout, agent, runs_trigger) if trigger_result is not None: trigger_norm = trigger_result["pass_rate"] sections["trigger"] = { @@ -241,6 +245,7 @@ def _run_functional( dry_run: bool, timeout: int, agent: str, + runs_per_eval: int = 1, ) -> Optional[dict]: """Run functional eval and return summary dict, or None on error.""" from skill_eval.functional import run_functional_eval @@ -255,6 +260,7 @@ def _run_functional( agent=agent, output_path=str(out_file), format="json", + runs_per_eval=runs_per_eval, ) if dry_run: @@ -282,6 +288,7 @@ def _run_trigger( dry_run: bool, timeout: int, agent: str, + runs_per_query: int = 3, ) -> Optional[dict]: """Run trigger eval and return summary dict, or None on error.""" from skill_eval.trigger import run_trigger_eval @@ -296,6 +303,7 @@ def _run_trigger( agent=agent, output_path=str(out_file), format="json", + runs_per_query=runs_per_query, ) if dry_run: