diff --git a/README.md b/README.md index c5558f3..e4672c8 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,10 @@ Every evolved variant must pass: 4. **Semantic preservation** — Must not drift from original purpose 5. **PR review** — All changes go through human review, never direct commit +### Automated PR opening (opt-in) + +`--create-pr` branches the source repo, commits the evolved artifact, pushes, and opens a GitHub PR via `gh` on a deploy decision. Off by default; intended for personal-use direct-push workflows against a repo you own. Pair with `--pr-draft` for a human review gate, and `--pr-base-branch`/`--pr-branch-prefix` to control where the PR lands. The default refuses to run against a dirty source tree (escape hatch: `--pr-allow-dirty`) and against non-git-backed sources like the Claude Code plugin cache. **Do not pair with campaign loops** — every accepted run opens its own PR, so a 10-skill sweep is 10 PRs to review. + ## Full Plan See [PLAN.md](PLAN.md) for the complete architecture, evaluation data strategy, constraints, benchmarks integration, and phased timeline. diff --git a/evolution/core/config.py b/evolution/core/config.py index 81d2d51..3f22696 100644 --- a/evolution/core/config.py +++ b/evolution/core/config.py @@ -108,7 +108,11 @@ def get_lm(self, role: Role) -> ResolvedLM: enable_confusable_bucket: bool = False output_dir: Path = field(default_factory=lambda: Path("./output")) - create_pr: bool = True + # Reserved for future ergonomic-default support; the per-run boolean + # is currently carried via the `--create-pr/--no-create-pr` CLI flag, + # not this field. Kept here so users programming against + # EvolutionConfig have an obvious surface to extend. 
+ create_pr: bool = False seed: int = 42 diff --git a/evolution/core/pr_automation.py b/evolution/core/pr_automation.py new file mode 100644 index 0000000..3364bb9 --- /dev/null +++ b/evolution/core/pr_automation.py @@ -0,0 +1,338 @@ +"""Open a PR against the source repository after a successful evolve run. + +GEPA writes evolved artifacts to ``output/.../evolved_skill.md`` (or the +tool equivalent). Promoting an evolution to the source repo is otherwise a +manual copy-branch-commit-push-PR dance; this helper collapses those steps +into one function. + +Opt-in only: the CLI flag that wires this in defaults to ``False``. The +helper is intentionally artifact-agnostic — it takes a relative path and +content blob, not a skill/tool type discriminator. +""" + +import os +import re +import secrets +import subprocess +import tempfile +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Literal, Optional + +from rich.console import Console + +_STDERR_TAIL_BYTES = 1024 +_GIT_TIMEOUT_SECONDS = 60 +_GH_TIMEOUT_SECONDS = 120 +_BRANCH_SANITIZE_RE = re.compile(r"[^A-Za-z0-9._-]+") + + +@dataclass(frozen=True) +class PRResult: + status: Literal["created", "skipped", "failed"] + reason: str = "" + branch: Optional[str] = None + commit_sha: Optional[str] = None + url: Optional[str] = None + + +def disabled_pr_block() -> dict[str, Any]: + """The `pr_created` block written when `--create-pr` is off. + + Shape-stable with `pr_block_from_result` so downstream consumers can + index ``payload["pr_created"]["url"]`` without checking the status. 
+ """ + return {"status": "disabled", "reason": None, "branch": None, "commit_sha": None, "url": None} + + +def pr_block_from_result(result: PRResult) -> dict[str, Any]: + """Convert a `PRResult` into the `gate_decision.json::pr_created` block.""" + return { + "status": result.status, + "reason": result.reason, + "branch": result.branch, + "commit_sha": result.commit_sha, + "url": result.url, + } + + +def find_git_root(path: Path) -> Optional[Path]: + """Return the git worktree root for ``path``, or ``None`` if not in a repo.""" + start = path if path.is_dir() else path.parent + try: + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + cwd=str(start), + capture_output=True, + text=True, + timeout=_GIT_TIMEOUT_SECONDS, + ) + except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + return None + if result.returncode != 0: + return None + return Path(result.stdout.strip()) + + +def _tail(text: Optional[str]) -> str: + if not text: + return "" + return text[-_STDERR_TAIL_BYTES:] + + +def _run_git( + args: list[str], + *, + cwd: Path, +) -> tuple[bool, str | subprocess.CompletedProcess]: + """Run a git command. Returns ``(True, completed)`` on success + (returncode==0), or ``(False, formatted_reason)`` on any failure mode + (timeout, missing git binary, non-zero exit). Centralizing the reason + formatting keeps every callsite to a one-line ``return PRResult(... reason=res)``. 
+ """ + cmd_name = args[0] if args else "git" + try: + result = subprocess.run( + ["git", *args], + cwd=str(cwd), + capture_output=True, + text=True, + timeout=_GIT_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired: + return False, f"git {cmd_name} timed out after {_GIT_TIMEOUT_SECONDS}s" + except FileNotFoundError: + return False, "git not found on PATH" + if result.returncode != 0: + return False, f"git {cmd_name} failed: {_tail(result.stderr)}" + return True, result + + +def _branch_name(prefix: str, artifact_name: str, timestamp: datetime) -> str: + sanitized = re.sub(r"\.{2,}", ".", _BRANCH_SANITIZE_RE.sub("-", artifact_name)).strip("-.") # git refs reject ".." and path components that start with "." + ts = timestamp.strftime("%Y%m%d-%H%M%S") + suffix = secrets.token_hex(2) + return f"{prefix}{sanitized}-{ts}-{suffix}" + + +def _atomic_copy(src: Path, dst: Path) -> None: + # os.replace is atomic only when the tempfile and dst share a filesystem, + # so the tempfile is created under dst.parent. + dst.parent.mkdir(parents=True, exist_ok=True) + data = src.read_bytes() + with tempfile.NamedTemporaryFile( + delete=False, dir=str(dst.parent), prefix=".pr_atomic_" + ) as tmp: + tmp.write(data) + tmp_path = Path(tmp.name) + os.replace(tmp_path, dst) + + +def _format_pr_body(gate_decision: dict[str, Any], metrics: dict[str, Any]) -> str: + decision = gate_decision.get("decision", "unknown") + signal = gate_decision.get("decision_signal", "synthetic") + reason = gate_decision.get("reason", "") + baseline = metrics.get("baseline_mean") + evolved = metrics.get("evolved_mean") + delta = metrics.get("delta") + if delta is None and baseline is not None and evolved is not None: + delta = evolved - baseline + + lines = [f"## Evolution decision: {decision}", ""] + + if signal == "closed_loop": + gained = gate_decision.get("cl_tasks_gained") + required = gate_decision.get("cl_required_gain") + if gained is not None: + headline = f"**Closed-loop tasks gained: +{gained}**" + if required is not None: + headline += f" (required ≥ {required})" + lines += 
[headline, ""] + lines += [f"Decision signal: `closed_loop`", ""] + else: + lines += [f"Decision signal: `synthetic`", ""] + + if reason: + lines += [f"Reason: `{reason}`", ""] + + if baseline is not None and evolved is not None: + sign = "+" if (delta or 0) >= 0 else "" + lines += [ + "### Holdout score", + f"- baseline: `{baseline:.2f}`", + f"- evolved: `{evolved:.2f}`", + f"- delta: `{sign}{delta:.2f}`", + "", + ] + + bootstrap = gate_decision.get("bootstrap") + if isinstance(bootstrap, dict) and "ci_low" in bootstrap and "ci_high" in bootstrap: + lines += [ + "### Bootstrap CI", + f"- 95% CI: `[{bootstrap['ci_low']:.3f}, {bootstrap['ci_high']:.3f}]`", + "", + ] + + baseline_chars = gate_decision.get("baseline_chars") + evolved_chars = gate_decision.get("evolved_chars") + if baseline_chars is not None and evolved_chars is not None: + size_delta = evolved_chars - baseline_chars + lines += [ + "### Artifact size", + f"- baseline: `{baseline_chars}` chars", + f"- evolved: `{evolved_chars}` chars (`{size_delta:+d}`)", + "", + ] + + cost_summary = gate_decision.get("cost_summary") + if isinstance(cost_summary, dict) and "total_usd" in cost_summary: + lines += ["### Cost", f"- total: `${cost_summary['total_usd']:.4f}`", ""] + + lines += [ + "---", + "Generated by agent-self-evolution. 
Review and merge or close manually.", + ] + return "\n".join(line for line in lines if line is not None) + + +def _commit_message(artifact_name: str, metrics: dict[str, Any], signal: str, gate_decision: dict[str, Any]) -> str: + if signal == "closed_loop": + gained = gate_decision.get("cl_tasks_gained", 0) + summary = f"CL tasks +{gained}" + else: + baseline = metrics.get("baseline_mean") + evolved = metrics.get("evolved_mean") + delta = metrics.get("delta") + if delta is None and baseline is not None and evolved is not None: + delta = evolved - baseline + if baseline is not None and evolved is not None and delta is not None: + summary = f"holdout {baseline:.2f}->{evolved:.2f} ({delta:+.2f})" + else: + summary = "deploy" + return f"evolve({artifact_name}): {summary}" + + +def create_pr( + *, + source_repo_root: Optional[Path], + source_artifact_relpath: str, + evolved_artifact_path: Path, + artifact_name: str, + gate_decision: dict[str, Any], + metrics: dict[str, Any], + base_branch: str, + branch_prefix: str, + draft: bool, + allow_dirty: bool, + console: Console, +) -> PRResult: + if source_repo_root is None: + return PRResult( + status="skipped", + reason="source repo not git-backed (e.g., Claude Code plugin cache)", + ) + + # 1. Dirty-tree check + ok, res = _run_git(["status", "--porcelain"], cwd=source_repo_root) + if not ok: + return PRResult(status="failed", reason=res) # type: ignore[arg-type] + if res.stdout.strip() and not allow_dirty: + console.print("[yellow]Dirty working tree detected:[/yellow]") + console.print(res.stdout.rstrip()) + return PRResult( + status="skipped", + reason="dirty working tree (pass --pr-allow-dirty to override)", + ) + + # 2. Fetch origin + ok, res = _run_git(["fetch", "origin", base_branch], cwd=source_repo_root) + if not ok: + return PRResult(status="failed", reason=res) # type: ignore[arg-type] + + # 3. 
Branch from origin/ + branch = _branch_name(branch_prefix, artifact_name, datetime.now()) + ok, res = _run_git( + ["checkout", "-b", branch, f"origin/{base_branch}"], cwd=source_repo_root + ) + if not ok: + return PRResult(status="failed", reason=res, branch=branch) # type: ignore[arg-type] + + # 4. Atomic copy + dst = source_repo_root / source_artifact_relpath + try: + _atomic_copy(evolved_artifact_path, dst) + except OSError as exc: + return PRResult(status="failed", reason=f"atomic copy failed: {exc}", branch=branch) + + # 5. Stage + commit + ok, res = _run_git(["add", source_artifact_relpath], cwd=source_repo_root) + if not ok: + return PRResult(status="failed", reason=res, branch=branch) # type: ignore[arg-type] + + signal = gate_decision.get("decision_signal", "synthetic") + message = _commit_message(artifact_name, metrics, signal, gate_decision) + ok, res = _run_git(["commit", "-m", message], cwd=source_repo_root) + if not ok: + return PRResult(status="failed", reason=res, branch=branch) # type: ignore[arg-type] + + ok, res = _run_git(["rev-parse", "HEAD"], cwd=source_repo_root) + commit_sha: Optional[str] = None + if ok and isinstance(res, subprocess.CompletedProcess): + commit_sha = res.stdout.strip() + + # 6. Push + ok, res = _run_git(["push", "origin", branch], cwd=source_repo_root) + if not ok: + return PRResult(status="failed", reason=res, branch=branch, commit_sha=commit_sha) # type: ignore[arg-type] + + # 7. 
gh pr create + title = _commit_message(artifact_name, metrics, signal, gate_decision) + body = _format_pr_body(gate_decision, metrics) + gh_args = [ + "gh", "pr", "create", + "--base", base_branch, + "--head", branch, + "--title", title, + "--body", body, + ] + if draft: + gh_args.append("--draft") + try: + gh_res = subprocess.run( + gh_args, + cwd=str(source_repo_root), + capture_output=True, + text=True, + timeout=_GH_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired: + return PRResult( + status="failed", + reason=f"gh pr create timed out after {_GH_TIMEOUT_SECONDS}s", + branch=branch, + commit_sha=commit_sha, + ) + except FileNotFoundError: + return PRResult( + status="failed", + reason="gh not found on PATH", + branch=branch, + commit_sha=commit_sha, + ) + if gh_res.returncode != 0: + return PRResult( + status="failed", + reason=f"gh pr create failed: {_tail(gh_res.stderr)}", + branch=branch, + commit_sha=commit_sha, + ) + + url = gh_res.stdout.strip().splitlines()[-1] if gh_res.stdout.strip() else "" + return PRResult( + status="created", + reason="", + branch=branch, + commit_sha=commit_sha, + url=url, + ) diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py index 213dac6..d418efa 100644 --- a/evolution/core/run_inputs.py +++ b/evolution/core/run_inputs.py @@ -20,6 +20,7 @@ def build_run_inputs( quality_gate_preset: str, eval_source: str, gepa_acceptance: str, + create_pr: bool, fitness_profile: Optional[str] = None, enable_confusable_bucket: Optional[bool] = None, ) -> dict[str, Any]: @@ -39,6 +40,7 @@ def build_run_inputs( "quality_gate_preset": quality_gate_preset, "eval_source": eval_source, "gepa_acceptance": gepa_acceptance, + "create_pr": create_pr, } if fitness_profile is not None: run_inputs["fitness_profile"] = fitness_profile diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index cc69348..1b01948 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -40,6 
+40,12 @@ resolve_default_lm, resolved_lms_dump, ) +from evolution.core.pr_automation import ( + create_pr, + disabled_pr_block, + find_git_root, + pr_block_from_result, +) from evolution.core.quality_gate import ( QUALITY_GATE_PRESETS, _check_cl_primary_gate, @@ -651,6 +657,11 @@ def evolve( closed_loop_in_valset: bool = False, closed_loop_agent_model: Optional[str] = None, closed_loop_task_timeout_seconds: Optional[int] = None, + create_pr_flag: bool = False, + pr_base_branch: str = "main", + pr_branch_prefix: str = "evolve/", + pr_draft: bool = False, + pr_allow_dirty: bool = False, ): """Main evolution function — orchestrates the full optimization loop.""" @@ -1109,6 +1120,7 @@ def evolve( quality_gate_preset=quality_gate, eval_source=eval_source, gepa_acceptance=config.gepa_acceptance, + create_pr=create_pr_flag, ), }) console.print(f" Saved failed variant to {failed_path}") @@ -1150,6 +1162,7 @@ def evolve( quality_gate_preset=quality_gate, eval_source=eval_source, gepa_acceptance=config.gepa_acceptance, + create_pr=create_pr_flag, ) use_cl_primary = ( @@ -1412,8 +1425,57 @@ def evolve( # didn't fire even though CL may be configured. decision_payload["reason_synthetic"] = "preflight_skipped" + # Persist evolved + baseline artifacts once on the deploy path + # for both the PR hook (needs the path) and the post-table + # reporting (needs them on disk for the user). + if growth_pass: + evolved_skill_path = output_dir / "evolved_skill.md" + evolved_skill_path.write_text(evolved_full) + (output_dir / "baseline_skill.md").write_text(skill["raw"]) + + # Run PR automation BEFORE writing gate_decision.json so the PR + # outcome lands in the same single-write block — calibration + # scripts grepping pr_created don't have to special-case a + # re-write or missing key. 
+ pr_created_block: dict[str, Any] = disabled_pr_block() + if growth_pass and create_pr_flag: + source_repo_root = find_git_root(skill_path) + source_artifact_relpath = ( + str(skill_path.resolve().relative_to(source_repo_root.resolve())) # git reports an absolute toplevel; resolve both sides so relative or symlinked paths still match + if source_repo_root is not None + else str(skill_path) + ) + pr_result = create_pr( + source_repo_root=source_repo_root, + source_artifact_relpath=source_artifact_relpath, + evolved_artifact_path=evolved_skill_path, + artifact_name=skill_name, + gate_decision=decision_payload, + metrics={ + "baseline_mean": avg_baseline, + "evolved_mean": avg_evolved, + "delta": improvement, + }, + base_branch=pr_base_branch, + branch_prefix=pr_branch_prefix, + draft=pr_draft, + allow_dirty=pr_allow_dirty, + console=console, + ) + pr_created_block = pr_block_from_result(pr_result) + decision_payload["pr_created"] = pr_created_block + gate_path = write_gate_decision(output_dir, decision_payload) console.print(f" [dim]Gate decision logged to {gate_path}[/dim]") + if pr_created_block["status"] == "created": + console.print( + f" [green]✓ PR opened: {pr_created_block['url']}[/green]" + ) + elif pr_created_block["status"] in ("skipped", "failed"): + console.print( + f" [yellow]PR automation {pr_created_block['status']}: " + f"{pr_created_block['reason']}[/yellow]" + ) if not growth_pass: console.print("[red]✗ Evolved skill REJECTED by quality gate — not deploying[/red]") @@ -1479,9 +1541,6 @@ def evolve( console.print() console.print(table) - evolved_skill_path = output_dir / "evolved_skill.md" - evolved_skill_path.write_text(evolved_full) - (output_dir / "baseline_skill.md").write_text(skill["raw"]) metrics = { "skill_name": skill_name, "timestamp": timestamp, @@ -1541,6 +1600,7 @@ def evolve( quality_gate_preset=quality_gate, eval_source=eval_source, gepa_acceptance=config.gepa_acceptance, + create_pr=create_pr_flag, ), schema_version="5", ) @@ -1848,6 +1908,48 @@ def evolve( "fix for noisy LM-judge fitness where strict acceptance rejects " "~50% of true-equal 
mutations.", ) +@click.option( + "--create-pr/--no-create-pr", + "create_pr_flag", + is_flag=True, + default=False, + help="On a deploy decision, branch the source repo, commit the evolved " + "artifact, push, and open a GitHub PR. Off by default — opt in " + "per-run. No-op on reject. Skips cleanly when the source isn't " + "git-backed (e.g. Claude Code plugin cache).", +) +@click.option( + "--pr-base-branch", + "pr_base_branch", + default="main", + type=str, + help="Target branch for the PR opened by --create-pr (default: main).", +) +@click.option( + "--pr-branch-prefix", + "pr_branch_prefix", + default="evolve/", + type=str, + help="Prefix for the PR's head branch under --create-pr. Branch names " + "become '{prefix}{artifact}-{timestamp}-{hex}'.", +) +@click.option( + "--pr-draft", + "pr_draft", + is_flag=True, + default=False, + help="Open the --create-pr PR as a draft (recommended for personal " + "automation pipelines that want a human review gate before merge).", +) +@click.option( + "--pr-allow-dirty", + "pr_allow_dirty", + is_flag=True, + default=False, + help="Override --create-pr's dirty-tree refusal. 
Default behavior " + "skips PR creation when the source repo has uncommitted changes " + "to avoid sweeping unrelated edits into the evolution PR.", +) @click.option( "--closed-loop-during-evolution", "closed_loop_suite_path", @@ -1943,6 +2045,11 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti force_saturation_check, gepa_minibatch_size, gepa_acceptance, + create_pr_flag, + pr_base_branch, + pr_branch_prefix, + pr_draft, + pr_allow_dirty, closed_loop_suite_path, closed_loop_saturation_threshold, closed_loop_min_iters, @@ -2000,6 +2107,11 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti closed_loop_in_valset=closed_loop_in_valset, closed_loop_agent_model=closed_loop_agent_model, closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds, + create_pr_flag=create_pr_flag, + pr_base_branch=pr_base_branch, + pr_branch_prefix=pr_branch_prefix, + pr_draft=pr_draft, + pr_allow_dirty=pr_allow_dirty, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index b76388d..8a97d4a 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -58,6 +58,12 @@ register_litellm_cost_callback, register_litellm_failure_callback, ) +from evolution.core.pr_automation import ( + create_pr, + disabled_pr_block, + find_git_root, + pr_block_from_result, +) from evolution.core.quality_gate import ( QUALITY_GATE_PRESETS, _check_cl_primary_gate, @@ -384,6 +390,11 @@ def evolve( force_saturation_check: bool = False, gepa_minibatch_size: int = 3, gepa_acceptance: str = "improvement-or-equal", + create_pr_flag: bool = False, + pr_base_branch: str = "main", + pr_branch_prefix: str = "evolve/", + pr_draft: bool = False, + pr_allow_dirty: bool = False, ) -> dict[str, Any]: """Evolve one tool description inside a manifest. 
@@ -804,6 +815,7 @@ def evolve( quality_gate_preset=quality_gate, eval_source=eval_source, gepa_acceptance=config.gepa_acceptance, + create_pr=create_pr_flag, fitness_profile=fitness_profile, enable_confusable_bucket=config.enable_confusable_bucket, ) @@ -1103,8 +1115,63 @@ def evolve( # consumers distinguish 'preflight saw no weak_signal' from # 'preflight didn't run.' decision_payload["reason_synthetic"] = "preflight_skipped" + + # Compute evolved_manifest + persist baseline/evolved manifest + # artifacts once on the deploy path. The PR hook, the patch + # emitter, and the apply call all reference these. + if growth_pass: + evolved_manifest = manifest.replace_description(tool_name, evolved_description) + evolved_manifest_path = output_dir / "evolved_manifest.json" + evolved_manifest_path.write_text( + json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n" + ) + (output_dir / "baseline_manifest.json").write_text( + json.dumps(_manifest_to_dict(manifest), indent=2) + "\n" + ) + + # Run PR automation BEFORE writing gate_decision.json so the PR + # outcome lands in the same single-write block — calibration + # scripts grepping pr_created don't have to special-case a + # re-write or missing key. 
+ pr_created_block: dict[str, Any] = disabled_pr_block() + if growth_pass and create_pr_flag: + source_repo_root = find_git_root(manifest_path) + source_artifact_relpath = ( + str(manifest_path.resolve().relative_to(source_repo_root.resolve())) # git reports an absolute toplevel; resolve both sides so relative or symlinked paths still match + if source_repo_root is not None + else str(manifest_path) + ) + pr_result = create_pr( + source_repo_root=source_repo_root, + source_artifact_relpath=source_artifact_relpath, + evolved_artifact_path=evolved_manifest_path, + artifact_name=tool_name, + gate_decision=decision_payload, + metrics={ + "baseline_mean": avg_baseline, + "evolved_mean": avg_evolved, + "delta": improvement, + }, + base_branch=pr_base_branch, + branch_prefix=pr_branch_prefix, + draft=pr_draft, + allow_dirty=pr_allow_dirty, + console=console, + ) + pr_created_block = pr_block_from_result(pr_result) + decision_payload["pr_created"] = pr_created_block + gate_path = write_gate_decision(output_dir, decision_payload) console.print(f" [dim]Gate decision logged to {gate_path}[/dim]") + if pr_created_block["status"] == "created": + console.print( + f" [green]✓ PR opened: {pr_created_block['url']}[/green]" + ) + elif pr_created_block["status"] in ("skipped", "failed"): + console.print( + f" [yellow]PR automation {pr_created_block['status']}: " + f"{pr_created_block['reason']}[/yellow]" + ) if not growth_pass: console.print("[red]✗ Evolved description REJECTED by quality gate — not deploying[/red]") @@ -1168,13 +1235,6 @@ def evolve( console.print() console.print(table) - evolved_manifest = manifest.replace_description(tool_name, evolved_description) - (output_dir / "baseline_manifest.json").write_text( - json.dumps(_manifest_to_dict(manifest), indent=2) + "\n" - ) - (output_dir / "evolved_manifest.json").write_text( - json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n" - ) metrics = { "tool_name": tool_name, "manifest_path": str(manifest_path), @@ -1244,6 +1304,7 @@ def evolve( quality_gate_preset=quality_gate, eval_source=eval_source, 
gepa_acceptance=config.gepa_acceptance, + create_pr=create_pr_flag, fitness_profile=fitness_profile, enable_confusable_bucket=config.enable_confusable_bucket, ) @@ -1493,6 +1554,48 @@ def evolve( "fix for noisy LM-judge fitness where strict acceptance rejects " "~50% of true-equal mutations.", ) +@click.option( + "--create-pr/--no-create-pr", + "create_pr_flag", + is_flag=True, + default=False, + help="On a deploy decision, branch the source repo, commit the evolved " + "artifact, push, and open a GitHub PR. Off by default — opt in " + "per-run. No-op on reject. Skips cleanly when the source isn't " + "git-backed (e.g. Claude Code plugin cache).", +) +@click.option( + "--pr-base-branch", + "pr_base_branch", + default="main", + type=str, + help="Target branch for the PR opened by --create-pr (default: main).", +) +@click.option( + "--pr-branch-prefix", + "pr_branch_prefix", + default="evolve/", + type=str, + help="Prefix for the PR's head branch under --create-pr. Branch names " + "become '{prefix}{artifact}-{timestamp}-{hex}'.", +) +@click.option( + "--pr-draft", + "pr_draft", + is_flag=True, + default=False, + help="Open the --create-pr PR as a draft (recommended for personal " + "automation pipelines that want a human review gate before merge).", +) +@click.option( + "--pr-allow-dirty", + "pr_allow_dirty", + is_flag=True, + default=False, + help="Override --create-pr's dirty-tree refusal. 
Default behavior " + "skips PR creation when the source repo has uncommitted changes " + "to avoid sweeping unrelated edits into the evolution PR.", +) @click.option( "--closed-loop-in-valset/--no-closed-loop-in-valset", "closed_loop_in_valset", @@ -1548,6 +1651,11 @@ def main( force_saturation_check: bool, gepa_minibatch_size: int, gepa_acceptance: str, + create_pr_flag: bool, + pr_base_branch: str, + pr_branch_prefix: str, + pr_draft: bool, + pr_allow_dirty: bool, closed_loop_suite_path: Optional[Path], closed_loop_hermes_repo: Optional[Path], closed_loop_saturation_threshold: float, @@ -1602,6 +1710,11 @@ def main( force_saturation_check=force_saturation_check, gepa_minibatch_size=gepa_minibatch_size, gepa_acceptance=gepa_acceptance, + create_pr_flag=create_pr_flag, + pr_base_branch=pr_base_branch, + pr_branch_prefix=pr_branch_prefix, + pr_draft=pr_draft, + pr_allow_dirty=pr_allow_dirty, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback — diff --git a/tests/core/test_pr_automation.py b/tests/core/test_pr_automation.py new file mode 100644 index 0000000..a3d6e82 --- /dev/null +++ b/tests/core/test_pr_automation.py @@ -0,0 +1,280 @@ +"""Tests for ``evolution.core.pr_automation``. + +Covers the orchestration paths (skip / failed / created) with mocked +subprocess plus a single integration test against an ephemeral local +git pair to confirm the real `git`/file-copy choreography works. 
+""" + +import re +import subprocess +from datetime import datetime +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from rich.console import Console + +from evolution.core.pr_automation import ( + PRResult, + _atomic_copy, + _branch_name, + _format_pr_body, + create_pr, + find_git_root, +) + + +def _ok(stdout: str = "", stderr: str = "") -> subprocess.CompletedProcess: + return subprocess.CompletedProcess(args=[], returncode=0, stdout=stdout, stderr=stderr) + + +def _fail(stderr: str, stdout: str = "") -> subprocess.CompletedProcess: + return subprocess.CompletedProcess(args=[], returncode=1, stdout=stdout, stderr=stderr) + + +def _happy_path_side_effect(pr_url: str = "https://github.com/o/r/pull/42"): + """Return a side_effect that walks the happy-path subprocess sequence. + + Order mirrors create_pr's orchestration: status, fetch, checkout, add, + commit, rev-parse, push, gh pr create. + """ + responses = [ + _ok(stdout=""), # git status --porcelain (clean) + _ok(), # git fetch + _ok(), # git checkout -b + _ok(), # git add + _ok(), # git commit + _ok(stdout="abc1234\n"), # git rev-parse HEAD + _ok(), # git push + _ok(stdout=f"{pr_url}\n"), # gh pr create + ] + return responses + + +class TestPRAutomation: + def _kwargs(self, tmp_path: Path, **overrides): + evolved = tmp_path / "evolved.md" + evolved.write_text("evolved content") + source_root = tmp_path / "source" + (source_root / "skills" / "test").mkdir(parents=True) + (source_root / "skills" / "test" / "SKILL.md").write_text("baseline") + defaults = dict( + source_repo_root=source_root, + source_artifact_relpath="skills/test/SKILL.md", + evolved_artifact_path=evolved, + artifact_name="test-skill", + gate_decision={"decision_signal": "synthetic", "decision": "deploy"}, + metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15}, + base_branch="main", + branch_prefix="evolve/", + draft=False, + allow_dirty=False, + console=Console(), + ) + 
defaults.update(overrides) + return defaults + + def test_happy_path_creates_pr(self, tmp_path: Path): + responses = _happy_path_side_effect("https://github.com/o/r/pull/123") + with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses): + result = create_pr(**self._kwargs(tmp_path)) + assert result.status == "created" + assert result.url == "https://github.com/o/r/pull/123" + assert result.branch.startswith("evolve/test-skill-") + assert result.commit_sha == "abc1234" + + def test_skipped_when_source_repo_root_is_none(self, tmp_path: Path): + with patch("evolution.core.pr_automation.subprocess.run") as run: + result = create_pr(**self._kwargs(tmp_path, source_repo_root=None)) + assert result.status == "skipped" + assert "git-backed" in result.reason + assert result.branch is None + assert result.commit_sha is None + assert result.url is None + run.assert_not_called() + + def test_skipped_on_dirty_tree_when_allow_dirty_false(self, tmp_path: Path): + responses = [_ok(stdout=" M file.txt\n?? 
other.txt\n")] + with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses): + result = create_pr(**self._kwargs(tmp_path)) + assert result.status == "skipped" + assert "dirty" in result.reason.lower() + + def test_proceeds_on_dirty_tree_when_allow_dirty_true(self, tmp_path: Path): + responses = [ + _ok(stdout=" M file.txt\n"), # dirty status — but ignored + _ok(), # fetch + _ok(), # checkout + _ok(), # add + _ok(), # commit + _ok(stdout="deadbee\n"), # rev-parse + _ok(), # push + _ok(stdout="https://github.com/o/r/pull/9\n"), # gh pr create + ] + with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses): + result = create_pr(**self._kwargs(tmp_path, allow_dirty=True)) + assert result.status == "created" + assert result.url == "https://github.com/o/r/pull/9" + + def test_failed_when_gh_not_on_path(self, tmp_path: Path): + responses = [ + _ok(stdout=""), + _ok(), _ok(), _ok(), _ok(), + _ok(stdout="abc1234\n"), + _ok(), + FileNotFoundError("gh: command not found"), + ] + with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses): + result = create_pr(**self._kwargs(tmp_path)) + assert result.status == "failed" + assert "gh" in result.reason.lower() + assert result.branch is not None + assert result.commit_sha == "abc1234" + + def test_failed_when_push_fails_captures_stderr(self, tmp_path: Path): + responses = [ + _ok(stdout=""), + _ok(), _ok(), _ok(), _ok(), + _ok(stdout="abc1234\n"), + _fail(stderr="remote: Permission denied to user@example.com\n"), + ] + with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses): + result = create_pr(**self._kwargs(tmp_path)) + assert result.status == "failed" + assert "Permission denied" in result.reason + assert result.branch is not None + assert result.commit_sha == "abc1234" + + def test_branch_name_has_4char_suffix_and_sanitized(self): + ts = datetime(2026, 5, 25, 14, 30, 45) + name = _branch_name("evolve/", "some/skill:with spaces", ts) + 
assert re.match(r"^evolve/some-skill-with-spaces-\d{8}-\d{6}-[0-9a-f]{4}$", name), name + + def test_format_pr_body_synthetic_deploy(self): + body = _format_pr_body( + gate_decision={ + "decision_signal": "synthetic", + "decision": "deploy", + "reason": "growth_quality_gate_passed", + "baseline_chars": 1000, + "evolved_chars": 1100, + "cost_summary": {"total_usd": 1.23}, + }, + metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15}, + ) + assert "deploy" in body.lower() + assert "0.40" in body and "0.55" in body + assert "+0.15" in body # the delta must carry its sign; a bare "0.15" check would also pass unsigned + assert "Generated by agent-self-evolution" in body + + def test_format_pr_body_cl_primary_deploy(self): + body = _format_pr_body( + gate_decision={ + "decision_signal": "closed_loop", + "decision": "deploy", + "reason": "cl_primary_gate_passed", + "cl_tasks_gained": 3, + "cl_required_gain": 1, + "cost_summary": {"total_usd": 4.50}, + }, + metrics={"baseline_mean": 0.40, "evolved_mean": 0.42, "delta": 0.02}, + ) + # CL gain should appear prominently (in the first ~400 chars, i.e. above the fold) + assert "+3" in body[:400] or "3 task" in body[:400] or "cl_tasks_gained" in body[:400].lower() + assert "closed_loop" in body # a bare "cl" check is vacuous: the footer's "close manually" always matches + assert "Generated by agent-self-evolution" in body + + def test_integration_against_ephemeral_repo(self, tmp_path: Path): + bare = tmp_path / "remote.git" + clone = tmp_path / "clone" + subprocess.run(["git", "init", "--bare", str(bare)], check=True, capture_output=True) + subprocess.run(["git", "clone", str(bare), str(clone)], check=True, capture_output=True) + # Seed the repo so origin/main exists. 
+ subprocess.run(["git", "-C", str(clone), "config", "user.email", "t@t"], check=True) + subprocess.run(["git", "-C", str(clone), "config", "user.name", "t"], check=True) + subprocess.run(["git", "-C", str(clone), "checkout", "-b", "main"], check=True, capture_output=True) + skill_path = clone / "skills" / "test" / "SKILL.md" + skill_path.parent.mkdir(parents=True) + skill_path.write_text("baseline content\n") + subprocess.run(["git", "-C", str(clone), "add", "."], check=True) + subprocess.run(["git", "-C", str(clone), "commit", "-m", "seed"], check=True, capture_output=True) + subprocess.run(["git", "-C", str(clone), "push", "-u", "origin", "main"], check=True, capture_output=True) + + evolved = tmp_path / "evolved.md" + evolved.write_text("evolved improved content\n") + + real_run = subprocess.run + + def fake_run(cmd, *args, **kwargs): + # Only intercept the gh CLI call; everything else hits real git. + if isinstance(cmd, (list, tuple)) and len(cmd) > 0 and cmd[0] == "gh": + return subprocess.CompletedProcess( + args=cmd, returncode=0, + stdout="https://github.com/fake/repo/pull/777\n", stderr="", + ) + return real_run(cmd, *args, **kwargs) + + with patch("evolution.core.pr_automation.subprocess.run", side_effect=fake_run): + result = create_pr( + source_repo_root=clone, + source_artifact_relpath="skills/test/SKILL.md", + evolved_artifact_path=evolved, + artifact_name="test-skill", + gate_decision={"decision_signal": "synthetic", "decision": "deploy"}, + metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15}, + base_branch="main", + branch_prefix="evolve/", + draft=False, + allow_dirty=False, + console=Console(), + ) + + assert result.status == "created" + assert result.url == "https://github.com/fake/repo/pull/777" + assert result.branch.startswith("evolve/test-skill-") + assert result.commit_sha and len(result.commit_sha) >= 7 + + # Branch exists on the bare remote + ls = subprocess.run( + ["git", "-C", str(bare), "branch", "--list", 
result.branch], + check=True, capture_output=True, text=True, + ) + assert result.branch in ls.stdout + + # File content on the new branch matches evolved artifact + show = subprocess.run( + ["git", "-C", str(bare), "show", f"{result.branch}:skills/test/SKILL.md"], + check=True, capture_output=True, text=True, + ) + assert show.stdout == "evolved improved content\n" + + +class TestFindGitRoot: + def test_returns_root_inside_repo(self, tmp_path: Path): + subprocess.run(["git", "init", str(tmp_path)], check=True, capture_output=True) + nested = tmp_path / "a" / "b" / "c.md" + nested.parent.mkdir(parents=True) + nested.write_text("x") + root = find_git_root(nested) + assert root is not None and root.resolve() == tmp_path.resolve() + + def test_returns_none_outside_repo(self, tmp_path: Path): + # tmp_path is fresh and not a git repo + assert find_git_root(tmp_path / "missing.md") is None + + +class TestAtomicCopy: + def test_replaces_existing_file(self, tmp_path: Path): + src = tmp_path / "src.md" + dst = tmp_path / "dst.md" + src.write_text("new") + dst.write_text("old") + _atomic_copy(src, dst) + assert dst.read_text() == "new" + + def test_creates_when_missing(self, tmp_path: Path): + src = tmp_path / "src.md" + dst = tmp_path / "dst.md" + src.write_text("hello") + _atomic_copy(src, dst) + assert dst.read_text() == "hello" diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py index 2857575..0ed1954 100644 --- a/tests/core/test_run_inputs.py +++ b/tests/core/test_run_inputs.py @@ -28,6 +28,7 @@ def test_skill_side_shape(self): quality_gate_preset="default", eval_source="synthetic", gepa_acceptance="improvement_or_equal", + create_pr=False, ) assert set(result.keys()) == { "seed", @@ -41,8 +42,10 @@ def test_skill_side_shape(self): "quality_gate_preset", "eval_source", "gepa_acceptance", + "create_pr", } assert result["gepa_acceptance"] == "improvement_or_equal" + assert result["create_pr"] is False def 
test_tool_side_adds_fitness_profile_and_confusable_bucket(self): config = _fake_config() @@ -54,6 +57,7 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self): quality_gate_preset="default", eval_source="synthetic", gepa_acceptance="strict_improvement", + create_pr=True, fitness_profile="balanced", enable_confusable_bucket=True, ) @@ -69,12 +73,14 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self): "quality_gate_preset", "eval_source", "gepa_acceptance", + "create_pr", "fitness_profile", "enable_confusable_bucket", } assert result["gepa_acceptance"] == "strict_improvement" assert result["fitness_profile"] == "balanced" assert result["enable_confusable_bucket"] is True + assert result["create_pr"] is True def test_resolved_lms_matches_helper_output(self): config = _fake_config() @@ -85,6 +91,7 @@ def test_resolved_lms_matches_helper_output(self): quality_gate_preset="default", eval_source="synthetic", gepa_acceptance="improvement_or_equal", + create_pr=False, ) expected = resolved_lms_dump( optimizer="openai/gpt-4.1", @@ -106,6 +113,7 @@ def test_enable_confusable_bucket_round_trips_when_passed(self): quality_gate_preset="default", eval_source="synthetic", gepa_acceptance="improvement_or_equal", + create_pr=False, fitness_profile="balanced", enable_confusable_bucket=config.enable_confusable_bucket, ) diff --git a/tests/skills/test_evolve_skill_validation_flow.py b/tests/skills/test_evolve_skill_validation_flow.py index 8a49759..38e7633 100644 --- a/tests/skills/test_evolve_skill_validation_flow.py +++ b/tests/skills/test_evolve_skill_validation_flow.py @@ -965,3 +965,104 @@ def test_gepa_acceptance_strict_passes_strict_improvement(self, skill_dir): f"Expected acceptance_criterion=strict_improvement; " f"got {captured['gepa_kwargs']!r}. CLI output: {result.output}" ) + + +class TestPRAutomationWiring: + """Wiring tests for the --create-pr flag's pr_created block. 
The flag + is off by default; when on with no git-backed source, create_pr returns + a skipped PRResult that should round-trip into gate_decision.json.""" + + @pytest.fixture + def skill_dir(self, tmp_path): + skills_root = tmp_path / "skills" + skill_path = skills_root / "demo-skill" + skill_path.mkdir(parents=True) + (skill_path / "SKILL.md").write_text( + "---\nname: demo-skill\ndescription: a test skill\n---\n\nDo X.\n" + ) + return skills_root + + def _run(self, skill_dir: Path, extra_cli_args: list[str], monkeypatch, *, find_git_root_return=None): + fake_candidate = MagicMock() + fake_candidate.skill_text = "evolved skill text" + fake_module = MagicMock() + fake_module.skill_text = "evolved skill text" + fake_module.detailed_results = SimpleNamespace( + candidates=[fake_candidate], + val_aggregate_scores=[1.0], + best_idx=0, + ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + # Pin cwd to a tmp dir so the orchestrator's "output/" parent + # doesn't pile inside the repo's working tree. 
+ monkeypatch.chdir(skill_dir.parent) + with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder, + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials", + ), patch( + "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module, + ), patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric", + return_value=(0.6, [0.6] * 10), + ), patch( + "evolution.skills.evolve_skill.find_git_root", + return_value=find_git_root_return, + ): + runner = CliRunner() + return runner.invoke( + evolve_skill_cli, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight", + "--no-saturation-check", "--quality-gate", "non-inferiority", + *extra_cli_args], + catch_exceptions=False, + ) + + def _read_latest_gate_decision(self, base_dir: Path) -> dict: + # The orchestrator writes to output/{skill}/{run}/gate_decision.json + # under cwd; pull the last-sorted run under the demo-skill bucket. + runs = sorted((base_dir / "output" / "demo-skill").iterdir()) + assert runs, "no run directory produced under output/demo-skill" + path = runs[-1] / "gate_decision.json" + assert path.exists(), f"gate_decision.json missing at {path}" + return json.loads(path.read_text()) + + def test_pr_created_block_disabled_when_create_pr_false( + self, skill_dir, tmp_path, monkeypatch, + ): + # Default --no-create-pr → gate_decision.json carries pr_created with + # status "disabled". The key is always present so calibration scripts + # don't have to check for absence. + result = self._run(skill_dir, extra_cli_args=[], monkeypatch=monkeypatch) + assert result.exit_code == 0, result.output + payload = self._read_latest_gate_decision(skill_dir.parent) + # All 5 fields present even when disabled — keeps downstream + # consumers free to use payload["pr_created"]["url"] without + # special-casing the absence of the key. 
+ assert payload["pr_created"] == { + "status": "disabled", + "reason": None, + "branch": None, + "commit_sha": None, + "url": None, + } + assert payload["run_inputs"]["create_pr"] is False + + def test_pr_created_block_records_skip_when_create_pr_true_and_no_repo( + self, skill_dir, tmp_path, monkeypatch, + ): + # --create-pr on + find_git_root → None (no git-backed source) lands + # a skipped PRResult in pr_created with the right reason string. + result = self._run( + skill_dir, + extra_cli_args=["--create-pr"], + monkeypatch=monkeypatch, + find_git_root_return=None, + ) + assert result.exit_code == 0, result.output + payload = self._read_latest_gate_decision(skill_dir.parent) + assert payload["pr_created"]["status"] == "skipped" + assert "git-backed" in payload["pr_created"]["reason"] + assert payload["run_inputs"]["create_pr"] is True diff --git a/tests/tools/test_evolve_tool_validation_flow.py b/tests/tools/test_evolve_tool_validation_flow.py index 05198f2..5e21ea7 100644 --- a/tests/tools/test_evolve_tool_validation_flow.py +++ b/tests/tools/test_evolve_tool_validation_flow.py @@ -237,6 +237,85 @@ def test_gate_decision_schema_on_deploy(self, temp_manifest: Path, tmp_path: Pat assert result["baseline_score"] < result["evolved_score"] +class TestPRAutomationWiring: + """Wiring tests for the --create-pr flag's pr_created block. 
The flag + is off by default; when on with no git-backed source, create_pr returns + a skipped PRResult that should round-trip into gate_decision.json.""" + + def _run(self, temp_manifest, run_dir, *, create_pr_flag, find_git_root_return=None): + manifest = ToolManifest.from_json_file(temp_manifest) + with ( + patch.object( + SyntheticDatasetBuilder, + "_call_lm_for_bucket", + side_effect=_bucket_side_effect(15, 9, 6), + ), + patch( + "evolution.tools.evolve_tool.dspy.GEPA", + new=_make_fake_gepa( + _build_evolved_module(manifest, EVOLVED_DESCRIPTION) + ), + ), + patch.object( + ToolJudge, + "score", + new=_scripted_judge_score(target_score=0.95, regression_score=0.0), + ), + patch.object( + ToolModule, + "forward", + new=_scripted_module_forward(expected_tool_for_evolved="search_files"), + ), + patch( + "evolution.tools.evolve_tool.find_git_root", + return_value=find_git_root_return, + ), + ): + evolve( + tool_name="search_files", + manifest_path=temp_manifest, + iterations=1, + eval_dataset_size=30, + holdout_ratio=0.5, + quality_gate="non-inferiority", + enable_confusable_bucket=True, + output_dir=run_dir, + create_pr_flag=create_pr_flag, + ) + + def test_pr_created_block_disabled_when_create_pr_false( + self, temp_manifest: Path, tmp_path: Path, + ): + run_dir = tmp_path / "run" + self._run(temp_manifest, run_dir, create_pr_flag=False) + payload = json.loads((run_dir / "gate_decision.json").read_text()) + # All 5 fields present even when disabled — keeps downstream + # consumers free to use payload["pr_created"]["url"] without + # special-casing the absence of the key. 
+ assert payload["pr_created"] == { + "status": "disabled", + "reason": None, + "branch": None, + "commit_sha": None, + "url": None, + } + assert payload["run_inputs"]["create_pr"] is False + + def test_pr_created_block_records_skip_when_create_pr_true_and_no_repo( + self, temp_manifest: Path, tmp_path: Path, + ): + run_dir = tmp_path / "run" + self._run( + temp_manifest, run_dir, + create_pr_flag=True, + find_git_root_return=None, + ) + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["pr_created"]["status"] == "skipped" + assert "git-backed" in payload["pr_created"]["reason"] + assert payload["run_inputs"]["create_pr"] is True + + class TestApplyOverwritesSourceManifest: """`apply=True` writes the evolved description back to the source manifest."""