diff --git a/README.md b/README.md
index c5558f3..e4672c8 100644
--- a/README.md
+++ b/README.md
@@ -318,6 +318,10 @@ Every evolved variant must pass:
4. **Semantic preservation** — Must not drift from original purpose
5. **PR review** — All changes go through human review, never direct commit
+### Automated PR opening (opt-in)
+
+`--create-pr` branches the source repo, commits the evolved artifact, pushes, and opens a GitHub PR via `gh` on a deploy decision. It is off by default and intended for personal-use direct-push workflows against a repo you own. Pair it with `--pr-draft` for a human review gate, and use `--pr-base-branch`/`--pr-branch-prefix` to control where the PR lands. By default, PR creation is skipped when the source tree is dirty (escape hatch: `--pr-allow-dirty`) and when the source is not git-backed, such as the Claude Code plugin cache. **Do not pair with campaign loops** — every accepted run opens its own PR, so a 10-skill sweep is 10 PRs to review.
+
## Full Plan
See [PLAN.md](PLAN.md) for the complete architecture, evaluation data strategy, constraints, benchmarks integration, and phased timeline.
diff --git a/evolution/core/config.py b/evolution/core/config.py
index 81d2d51..3f22696 100644
--- a/evolution/core/config.py
+++ b/evolution/core/config.py
@@ -108,7 +108,11 @@ def get_lm(self, role: Role) -> ResolvedLM:
enable_confusable_bucket: bool = False
output_dir: Path = field(default_factory=lambda: Path("./output"))
- create_pr: bool = True
+ # Reserved for future ergonomic-default support; the per-run boolean
+ # is currently carried via the `--create-pr/--no-create-pr` CLI flag,
+ # not this field. Kept here so users programming against
+ # EvolutionConfig have an obvious surface to extend.
+ create_pr: bool = False
seed: int = 42
diff --git a/evolution/core/pr_automation.py b/evolution/core/pr_automation.py
new file mode 100644
index 0000000..3364bb9
--- /dev/null
+++ b/evolution/core/pr_automation.py
@@ -0,0 +1,338 @@
+"""Open a PR against the source repository after a successful evolve run.
+
+GEPA writes evolved artifacts to ``output/.../evolved_skill.md`` (or the
+tool equivalent). Promoting an evolution to the source repo is otherwise a
+manual copy-branch-commit-push-PR dance; this helper collapses those steps
+into one function.
+
+Opt-in only: the CLI flag that wires this in defaults to ``False``. The
+helper is intentionally artifact-agnostic — it takes a relative path and
+content blob, not a skill/tool type discriminator.
+"""
+
+import os
+import re
+import secrets
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Literal, Optional
+
+from rich.console import Console
+
+_STDERR_TAIL_BYTES = 1024
+_GIT_TIMEOUT_SECONDS = 60
+_GH_TIMEOUT_SECONDS = 120
+_BRANCH_SANITIZE_RE = re.compile(r"[^A-Za-z0-9._-]+")
+
+
+@dataclass(frozen=True)
+class PRResult:
+    """Immutable record of what one PR-automation attempt did.
+
+    Only ``status`` is mandatory. The remaining fields default to
+    empty/``None`` and are filled in by ``create_pr`` as far as the attempt
+    progressed before it finished or stopped.
+    """
+
+    # Outcome of the attempt: "created", "skipped" or "failed".
+    status: Literal["created", "skipped", "failed"]
+    # Explanation for a skip or failure; empty string when not given.
+    reason: str = ""
+    # Name of the head branch, once one has been chosen.
+    branch: Optional[str] = None
+    # Hash of the commit holding the evolved artifact, if it was read back.
+    commit_sha: Optional[str] = None
+    # URL of the opened pull request, if one was created.
+    url: Optional[str] = None
+
+
+def disabled_pr_block() -> dict[str, Any]:
+    """Build the ``pr_created`` block recorded when ``--create-pr`` is off.
+
+    The result carries the same five keys as ``pr_block_from_result`` so a
+    consumer can read ``payload["pr_created"]["url"]`` without first
+    inspecting the status. Every key except ``status`` maps to ``None``.
+    """
+    empty_fields = dict.fromkeys(("reason", "branch", "commit_sha", "url"))
+    return {"status": "disabled", **empty_fields}
+
+
+def pr_block_from_result(result: PRResult) -> dict[str, Any]:
+    """Flatten a ``PRResult`` into the ``gate_decision.json::pr_created`` block.
+
+    The five attributes are copied verbatim, in a fixed key order, into a
+    plain dict.
+    """
+    exported = ("status", "reason", "branch", "commit_sha", "url")
+    return {field_name: getattr(result, field_name) for field_name in exported}
+
+
+def find_git_root(path: Path) -> Optional[Path]:
+    """Locate the git worktree that contains ``path``.
+
+    Runs ``git rev-parse --show-toplevel`` from ``path`` itself when it is a
+    directory, otherwise from its parent. Returns the directory git reports,
+    or ``None`` when git exits non-zero, times out, or cannot be launched.
+    """
+    probe_dir = path.parent if not path.is_dir() else path
+    try:
+        completed = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            cwd=str(probe_dir),
+            capture_output=True,
+            text=True,
+            timeout=_GIT_TIMEOUT_SECONDS,
+        )
+    except (subprocess.TimeoutExpired, OSError):
+        # FileNotFoundError (no git binary, missing cwd) is an OSError too.
+        return None
+    if completed.returncode == 0:
+        return Path(completed.stdout.strip())
+    return None
+
+
+def _tail(text: Optional[str]) -> str:
+    """Return the last ``_STDERR_TAIL_BYTES`` characters of ``text``.
+
+    ``None`` and the empty string both come back as ``""``. The limit is
+    applied to the decoded string, so it counts characters rather than
+    bytes.
+    """
+    return (text or "")[-_STDERR_TAIL_BYTES:]
+
+
+def _run_git(
+    args: list[str],
+    *,
+    cwd: Path,
+) -> tuple[bool, str | subprocess.CompletedProcess]:
+    """Run ``git <args>`` in ``cwd`` and normalise every failure mode.
+
+    Returns ``(True, completed)`` when git exits 0. Otherwise returns
+    ``(False, reason)`` where ``reason`` is a short human-readable string,
+    so every callsite can stay a one-line
+    ``return PRResult(... reason=res)``.
+
+    Failure modes covered: timeout, git missing from PATH, a missing
+    working directory, any other OS-level launch error, and a non-zero
+    exit status.
+    """
+    cmd_name = args[0] if args else "git"
+    try:
+        result = subprocess.run(
+            ["git", *args],
+            cwd=str(cwd),
+            capture_output=True,
+            text=True,
+            timeout=_GIT_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired:
+        return False, f"git {cmd_name} timed out after {_GIT_TIMEOUT_SECONDS}s"
+    except FileNotFoundError:
+        # subprocess raises FileNotFoundError both for a missing executable
+        # and for a missing cwd; report the one that actually happened.
+        if not cwd.is_dir():
+            return False, f"git {cmd_name} failed: working directory {cwd} does not exist"
+        return False, "git not found on PATH"
+    except OSError as exc:
+        # e.g. PermissionError / NotADirectoryError while launching git.
+        return False, f"git {cmd_name} could not be started: {exc}"
+    if result.returncode != 0:
+        # Some failures report on stdout only (`git commit` with nothing to
+        # commit is one), so fall back to stdout when stderr is blank
+        # instead of returning an empty reason.
+        has_stderr = bool((result.stderr or "").strip())
+        detail = _tail(result.stderr) if has_stderr else _tail(result.stdout)
+        return False, f"git {cmd_name} failed: {detail}"
+    return True, result
+
+
+def _branch_name(prefix: str, artifact_name: str, timestamp: datetime) -> str:
+    """Build a unique head-branch name: ``{prefix}{artifact}-{timestamp}-{hex}``.
+
+    ``artifact_name`` is reduced to characters that are safe in a git ref:
+    anything outside ``[A-Za-z0-9._-]`` becomes ``-``. Runs of dots are then
+    collapsed and leading/trailing ``-``/``.`` removed, because git rejects
+    ref names that contain ``..`` or have a path component starting with
+    ``.``. If nothing usable is left, ``"artifact"`` is used so the name
+    never starts with a bare separator.
+
+    The four-hex-character random suffix keeps two runs within the same
+    second from colliding.
+    """
+    sanitized = _BRANCH_SANITIZE_RE.sub("-", artifact_name)
+    sanitized = re.sub(r"\.{2,}", ".", sanitized).strip("-.")
+    if not sanitized:
+        sanitized = "artifact"
+    ts = timestamp.strftime("%Y%m%d-%H%M%S")
+    suffix = secrets.token_hex(2)
+    return f"{prefix}{sanitized}-{ts}-{suffix}"
+
+
+def _atomic_copy(src: Path, dst: Path) -> None:
+    """Replace ``dst`` with the bytes of ``src`` via a same-directory temp file.
+
+    The data is written to a temp file under ``dst.parent`` and then moved
+    into place with ``os.replace``; that rename is atomic only when both
+    paths share a filesystem, which is why the temp file lives next to
+    ``dst``. Missing parent directories are created.
+
+    Raises:
+        OSError: if reading ``src``, writing the temp file, or the final
+            rename fails. The temp file is removed before the error
+            propagates, so a failed copy leaves no stray file behind.
+    """
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    data = src.read_bytes()
+    # NamedTemporaryFile creates its file as 0600. Carry over the permission
+    # bits of the file being replaced (or of src for a brand-new file) so the
+    # copy does not silently drop e.g. an executable bit.
+    mode_donor = dst if dst.exists() else src
+    mode = mode_donor.stat().st_mode & 0o777
+    tmp_path: Optional[Path] = None
+    try:
+        with tempfile.NamedTemporaryFile(
+            delete=False, dir=str(dst.parent), prefix=".pr_atomic_"
+        ) as tmp:
+            tmp_path = Path(tmp.name)
+            tmp.write(data)
+        os.chmod(tmp_path, mode)
+        os.replace(tmp_path, dst)
+        tmp_path = None  # the temp file is now dst; nothing left to clean up
+    finally:
+        if tmp_path is not None:
+            try:
+                tmp_path.unlink()
+            except OSError:
+                # Best-effort cleanup; the original error is what matters.
+                pass
+
+
+def _format_pr_body(gate_decision: dict[str, Any], metrics: dict[str, Any]) -> str:
+    """Render the Markdown body of the pull request.
+
+    Args:
+        gate_decision: the gate-decision payload. Keys read: ``decision``,
+            ``decision_signal``, ``reason``, ``cl_tasks_gained``,
+            ``cl_required_gain``, ``bootstrap`` (``ci_low``/``ci_high``),
+            ``baseline_chars``, ``evolved_chars`` and ``cost_summary``
+            (``total_usd``). All are optional.
+        metrics: ``baseline_mean``, ``evolved_mean`` and optionally
+            ``delta``; a missing or non-numeric delta is recomputed as
+            ``evolved_mean - baseline_mean``.
+
+    Returns:
+        The body as one newline-joined string. A section is emitted only
+        when its inputs are present and numeric, so a malformed payload
+        shortens the body rather than raising from a format spec.
+    """
+    import numbers
+
+    def is_number(value: Any) -> bool:
+        # bool is an int subclass but is never a meaningful metric here.
+        return isinstance(value, numbers.Real) and not isinstance(value, bool)
+
+    decision = gate_decision.get("decision", "unknown")
+    signal = gate_decision.get("decision_signal", "synthetic")
+    reason = gate_decision.get("reason", "")
+    baseline = metrics.get("baseline_mean")
+    evolved = metrics.get("evolved_mean")
+    delta = metrics.get("delta")
+
+    lines = [f"## Evolution decision: {decision}", ""]
+
+    if signal == "closed_loop":
+        gained = gate_decision.get("cl_tasks_gained")
+        required = gate_decision.get("cl_required_gain")
+        if gained is not None:
+            # Let the format spec pick the sign so a regression reads "-1",
+            # not "+-1".
+            gained_text = f"{gained:+}" if is_number(gained) else f"+{gained}"
+            headline = f"**Closed-loop tasks gained: {gained_text}**"
+            if required is not None:
+                headline += f" (required ≥ {required})"
+            lines += [headline, ""]
+        lines += ["Decision signal: `closed_loop`", ""]
+    else:
+        # Any signal other than closed_loop is reported as synthetic.
+        lines += ["Decision signal: `synthetic`", ""]
+
+    if reason:
+        lines += [f"Reason: `{reason}`", ""]
+
+    if is_number(baseline) and is_number(evolved):
+        if not is_number(delta):
+            delta = evolved - baseline
+        lines += [
+            "### Holdout score",
+            f"- baseline: `{baseline:.2f}`",
+            f"- evolved: `{evolved:.2f}`",
+            f"- delta: `{delta:+.2f}`",
+            "",
+        ]
+
+    bootstrap = gate_decision.get("bootstrap")
+    if (
+        isinstance(bootstrap, dict)
+        and is_number(bootstrap.get("ci_low"))
+        and is_number(bootstrap.get("ci_high"))
+    ):
+        lines += [
+            "### Bootstrap CI",
+            f"- 95% CI: `[{bootstrap['ci_low']:.3f}, {bootstrap['ci_high']:.3f}]`",
+            "",
+        ]
+
+    baseline_chars = gate_decision.get("baseline_chars")
+    evolved_chars = gate_decision.get("evolved_chars")
+    if is_number(baseline_chars) and is_number(evolved_chars):
+        size_delta = evolved_chars - baseline_chars
+        lines += [
+            "### Artifact size",
+            f"- baseline: `{baseline_chars}` chars",
+            f"- evolved: `{evolved_chars}` chars (`{size_delta:+}`)",
+            "",
+        ]
+
+    cost_summary = gate_decision.get("cost_summary")
+    if isinstance(cost_summary, dict) and is_number(cost_summary.get("total_usd")):
+        lines += ["### Cost", f"- total: `${cost_summary['total_usd']:.4f}`", ""]
+
+    lines += [
+        "---",
+        "Generated by agent-self-evolution. Review and merge or close manually.",
+    ]
+    return "\n".join(lines)
+
+
+def _commit_message(artifact_name: str, metrics: dict[str, Any], signal: str, gate_decision: dict[str, Any]) -> str:
+    """Build the one-line commit subject ``evolve(<artifact>): <summary>``.
+
+    For the ``closed_loop`` signal the summary is the signed number of
+    closed-loop tasks gained. For any other signal it is the holdout score
+    transition ``baseline->evolved (delta)``, or the plain word ``deploy``
+    when the scores are not available.
+    """
+    if signal == "closed_loop":
+        gained = gate_decision.get("cl_tasks_gained")
+        if gained is None:
+            # A missing key and an explicit None both mean "no recorded gain".
+            gained = 0
+        if isinstance(gained, (int, float)) and not isinstance(gained, bool):
+            # The format spec supplies the sign, so a loss reads "-2"
+            # rather than "+-2".
+            summary = f"CL tasks {gained:+}"
+        else:
+            summary = f"CL tasks +{gained}"
+        return f"evolve({artifact_name}): {summary}"
+
+    baseline = metrics.get("baseline_mean")
+    evolved = metrics.get("evolved_mean")
+    delta = metrics.get("delta")
+    if delta is None and baseline is not None and evolved is not None:
+        delta = evolved - baseline
+    if baseline is not None and evolved is not None and delta is not None:
+        summary = f"holdout {baseline:.2f}->{evolved:.2f} ({delta:+.2f})"
+    else:
+        summary = "deploy"
+    return f"evolve({artifact_name}): {summary}"
+
+
+def create_pr(
+    *,
+    source_repo_root: Optional[Path],
+    source_artifact_relpath: str,
+    evolved_artifact_path: Path,
+    artifact_name: str,
+    gate_decision: dict[str, Any],
+    metrics: dict[str, Any],
+    base_branch: str,
+    branch_prefix: str,
+    draft: bool,
+    allow_dirty: bool,
+    console: Console,
+) -> PRResult:
+    """Branch, commit, push and open a GitHub PR for one evolved artifact.
+
+    Steps, in order: dirty-tree check, ``git fetch origin <base>``,
+    ``git checkout -b <new> origin/<base>``, copy the evolved artifact over
+    the source path, ``git add`` + ``git commit``, ``git push origin <new>``
+    and finally ``gh pr create``.
+
+    Args:
+        source_repo_root: git worktree root, or ``None`` when the source is
+            not git-backed (the call is then skipped).
+        source_artifact_relpath: artifact path relative to the repo root.
+        evolved_artifact_path: file whose bytes replace the artifact.
+        artifact_name: used in the branch name and the commit subject.
+        gate_decision: gate payload used for the commit subject and PR body.
+        metrics: score metrics used for the commit subject and PR body.
+        base_branch: branch on ``origin`` to branch from and target.
+        branch_prefix: prefix of the generated head branch name.
+        draft: open the PR as a draft.
+        allow_dirty: proceed even when ``git status --porcelain`` is non-empty.
+        console: receives the dirty-tree listing when the run is skipped.
+
+    Returns:
+        A ``PRResult`` with status ``created``, ``skipped`` or ``failed``.
+        Command failures are reported through the result rather than raised;
+        ``branch`` and ``commit_sha`` are set as far as the run got.
+
+    NOTE(review): nothing here switches back after ``git checkout -b``, so
+    the source repo stays on the new branch on both success and failure.
+    Restoring the previous ref needs additional git calls.
+    """
+    if source_repo_root is None:
+        return PRResult(
+            status="skipped",
+            reason="source repo not git-backed (e.g., Claude Code plugin cache)",
+        )
+
+    # The destination is built by joining onto the repo root; an absolute
+    # path or a ".." component would write outside the repository.
+    relpath = Path(source_artifact_relpath)
+    if relpath.is_absolute() or ".." in relpath.parts:
+        return PRResult(
+            status="failed",
+            reason=f"artifact path must be relative to the repo root: {source_artifact_relpath}",
+        )
+
+    # Format the commit subject / PR text before touching the repo, so a
+    # malformed payload cannot strand a pushed branch that has no PR.
+    signal = gate_decision.get("decision_signal", "synthetic")
+    try:
+        message = _commit_message(artifact_name, metrics, signal, gate_decision)
+        body = _format_pr_body(gate_decision, metrics)
+    except (TypeError, ValueError) as exc:
+        return PRResult(status="failed", reason=f"could not format PR text: {exc}")
+
+    # 1. Dirty-tree check
+    ok, res = _run_git(["status", "--porcelain"], cwd=source_repo_root)
+    if not ok:
+        return PRResult(status="failed", reason=res)  # type: ignore[arg-type]
+    if res.stdout.strip() and not allow_dirty:  # type: ignore[union-attr]
+        console.print("[yellow]Dirty working tree detected:[/yellow]")
+        console.print(res.stdout.rstrip())  # type: ignore[union-attr]
+        return PRResult(
+            status="skipped",
+            reason="dirty working tree (pass --pr-allow-dirty to override)",
+        )
+
+    # 2. Fetch origin
+    ok, res = _run_git(["fetch", "origin", base_branch], cwd=source_repo_root)
+    if not ok:
+        return PRResult(status="failed", reason=res)  # type: ignore[arg-type]
+
+    # 3. Branch from origin/<base_branch>
+    branch = _branch_name(branch_prefix, artifact_name, datetime.now())
+    ok, res = _run_git(
+        ["checkout", "-b", branch, f"origin/{base_branch}"], cwd=source_repo_root
+    )
+    if not ok:
+        return PRResult(status="failed", reason=res, branch=branch)  # type: ignore[arg-type]
+
+    # 4. Atomic copy
+    dst = source_repo_root / relpath
+    try:
+        _atomic_copy(evolved_artifact_path, dst)
+    except OSError as exc:
+        return PRResult(status="failed", reason=f"atomic copy failed: {exc}", branch=branch)
+
+    # 5. Stage + commit ("--" keeps a path starting with "-" from being
+    # parsed as an option)
+    ok, res = _run_git(["add", "--", source_artifact_relpath], cwd=source_repo_root)
+    if not ok:
+        return PRResult(status="failed", reason=res, branch=branch)  # type: ignore[arg-type]
+
+    ok, res = _run_git(["commit", "-m", message], cwd=source_repo_root)
+    if not ok:
+        return PRResult(status="failed", reason=res, branch=branch)  # type: ignore[arg-type]
+
+    # Reading the hash back is best-effort: a failure here does not abort.
+    ok, res = _run_git(["rev-parse", "HEAD"], cwd=source_repo_root)
+    commit_sha: Optional[str] = None
+    if ok and isinstance(res, subprocess.CompletedProcess):
+        commit_sha = res.stdout.strip()
+
+    # 6. Push
+    ok, res = _run_git(["push", "origin", branch], cwd=source_repo_root)
+    if not ok:
+        return PRResult(status="failed", reason=res, branch=branch, commit_sha=commit_sha)  # type: ignore[arg-type]
+
+    # 7. gh pr create (the PR title is the commit subject)
+    gh_args = [
+        "gh", "pr", "create",
+        "--base", base_branch,
+        "--head", branch,
+        "--title", message,
+        "--body", body,
+    ]
+    if draft:
+        gh_args.append("--draft")
+    try:
+        gh_res = subprocess.run(
+            gh_args,
+            cwd=str(source_repo_root),
+            capture_output=True,
+            text=True,
+            timeout=_GH_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired:
+        return PRResult(
+            status="failed",
+            reason=f"gh pr create timed out after {_GH_TIMEOUT_SECONDS}s",
+            branch=branch,
+            commit_sha=commit_sha,
+        )
+    except FileNotFoundError:
+        return PRResult(
+            status="failed",
+            reason="gh not found on PATH",
+            branch=branch,
+            commit_sha=commit_sha,
+        )
+    except OSError as exc:
+        # Any other launch failure (e.g. permission denied on the binary).
+        return PRResult(
+            status="failed",
+            reason=f"gh pr create could not be started: {exc}",
+            branch=branch,
+            commit_sha=commit_sha,
+        )
+    if gh_res.returncode != 0:
+        return PRResult(
+            status="failed",
+            reason=f"gh pr create failed: {_tail(gh_res.stderr)}",
+            branch=branch,
+            commit_sha=commit_sha,
+        )
+
+    # The URL is taken from the last non-empty line of gh's stdout.
+    url = gh_res.stdout.strip().splitlines()[-1] if gh_res.stdout.strip() else ""
+    return PRResult(
+        status="created",
+        reason="",
+        branch=branch,
+        commit_sha=commit_sha,
+        url=url,
+    )
diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py
index 213dac6..d418efa 100644
--- a/evolution/core/run_inputs.py
+++ b/evolution/core/run_inputs.py
@@ -20,6 +20,7 @@ def build_run_inputs(
quality_gate_preset: str,
eval_source: str,
gepa_acceptance: str,
+ create_pr: bool,
fitness_profile: Optional[str] = None,
enable_confusable_bucket: Optional[bool] = None,
) -> dict[str, Any]:
@@ -39,6 +40,7 @@ def build_run_inputs(
"quality_gate_preset": quality_gate_preset,
"eval_source": eval_source,
"gepa_acceptance": gepa_acceptance,
+ "create_pr": create_pr,
}
if fitness_profile is not None:
run_inputs["fitness_profile"] = fitness_profile
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index cc69348..1b01948 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -40,6 +40,12 @@
resolve_default_lm,
resolved_lms_dump,
)
+from evolution.core.pr_automation import (
+ create_pr,
+ disabled_pr_block,
+ find_git_root,
+ pr_block_from_result,
+)
from evolution.core.quality_gate import (
QUALITY_GATE_PRESETS,
_check_cl_primary_gate,
@@ -651,6 +657,11 @@ def evolve(
closed_loop_in_valset: bool = False,
closed_loop_agent_model: Optional[str] = None,
closed_loop_task_timeout_seconds: Optional[int] = None,
+ create_pr_flag: bool = False,
+ pr_base_branch: str = "main",
+ pr_branch_prefix: str = "evolve/",
+ pr_draft: bool = False,
+ pr_allow_dirty: bool = False,
):
"""Main evolution function — orchestrates the full optimization loop."""
@@ -1109,6 +1120,7 @@ def evolve(
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
+ create_pr=create_pr_flag,
),
})
console.print(f" Saved failed variant to {failed_path}")
@@ -1150,6 +1162,7 @@ def evolve(
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
+ create_pr=create_pr_flag,
)
use_cl_primary = (
@@ -1412,8 +1425,57 @@ def evolve(
# didn't fire even though CL may be configured.
decision_payload["reason_synthetic"] = "preflight_skipped"
+ # Persist evolved + baseline artifacts once on the deploy path
+ # for both the PR hook (needs the path) and the post-table
+ # reporting (needs them on disk for the user).
+ if growth_pass:
+ evolved_skill_path = output_dir / "evolved_skill.md"
+ evolved_skill_path.write_text(evolved_full)
+ (output_dir / "baseline_skill.md").write_text(skill["raw"])
+
+ # Run PR automation BEFORE writing gate_decision.json so the PR
+ # outcome lands in the same single-write block — calibration
+ # scripts grepping pr_created don't have to special-case a
+ # re-write or missing key.
+ pr_created_block: dict[str, Any] = disabled_pr_block()
+ if growth_pass and create_pr_flag:
+ source_repo_root = find_git_root(skill_path)
+ source_artifact_relpath = (
+ str(skill_path.relative_to(source_repo_root))
+ if source_repo_root is not None
+ else str(skill_path)
+ )
+ pr_result = create_pr(
+ source_repo_root=source_repo_root,
+ source_artifact_relpath=source_artifact_relpath,
+ evolved_artifact_path=evolved_skill_path,
+ artifact_name=skill_name,
+ gate_decision=decision_payload,
+ metrics={
+ "baseline_mean": avg_baseline,
+ "evolved_mean": avg_evolved,
+ "delta": improvement,
+ },
+ base_branch=pr_base_branch,
+ branch_prefix=pr_branch_prefix,
+ draft=pr_draft,
+ allow_dirty=pr_allow_dirty,
+ console=console,
+ )
+ pr_created_block = pr_block_from_result(pr_result)
+ decision_payload["pr_created"] = pr_created_block
+
gate_path = write_gate_decision(output_dir, decision_payload)
console.print(f" [dim]Gate decision logged to {gate_path}[/dim]")
+ if pr_created_block["status"] == "created":
+ console.print(
+ f" [green]✓ PR opened: {pr_created_block['url']}[/green]"
+ )
+ elif pr_created_block["status"] in ("skipped", "failed"):
+ console.print(
+ f" [yellow]PR automation {pr_created_block['status']}: "
+ f"{pr_created_block['reason']}[/yellow]"
+ )
if not growth_pass:
console.print("[red]✗ Evolved skill REJECTED by quality gate — not deploying[/red]")
@@ -1479,9 +1541,6 @@ def evolve(
console.print()
console.print(table)
- evolved_skill_path = output_dir / "evolved_skill.md"
- evolved_skill_path.write_text(evolved_full)
- (output_dir / "baseline_skill.md").write_text(skill["raw"])
metrics = {
"skill_name": skill_name,
"timestamp": timestamp,
@@ -1541,6 +1600,7 @@ def evolve(
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
+ create_pr=create_pr_flag,
),
schema_version="5",
)
@@ -1848,6 +1908,48 @@ def evolve(
"fix for noisy LM-judge fitness where strict acceptance rejects "
"~50% of true-equal mutations.",
)
+@click.option(
+ "--create-pr/--no-create-pr",
+ "create_pr_flag",
+ is_flag=True,
+ default=False,
+ help="On a deploy decision, branch the source repo, commit the evolved "
+ "artifact, push, and open a GitHub PR. Off by default — opt in "
+ "per-run. No-op on reject. Skips cleanly when the source isn't "
+ "git-backed (e.g. Claude Code plugin cache).",
+)
+@click.option(
+ "--pr-base-branch",
+ "pr_base_branch",
+ default="main",
+ type=str,
+ help="Target branch for the PR opened by --create-pr (default: main).",
+)
+@click.option(
+ "--pr-branch-prefix",
+ "pr_branch_prefix",
+ default="evolve/",
+ type=str,
+ help="Prefix for the PR's head branch under --create-pr. Branch names "
+ "become '{prefix}{artifact}-{timestamp}-{hex}'.",
+)
+@click.option(
+ "--pr-draft",
+ "pr_draft",
+ is_flag=True,
+ default=False,
+ help="Open the --create-pr PR as a draft (recommended for personal "
+ "automation pipelines that want a human review gate before merge).",
+)
+@click.option(
+ "--pr-allow-dirty",
+ "pr_allow_dirty",
+ is_flag=True,
+ default=False,
+ help="Override --create-pr's dirty-tree refusal. Default behavior "
+ "skips PR creation when the source repo has uncommitted changes "
+ "to avoid sweeping unrelated edits into the evolution PR.",
+)
@click.option(
"--closed-loop-during-evolution",
"closed_loop_suite_path",
@@ -1943,6 +2045,11 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
force_saturation_check,
gepa_minibatch_size,
gepa_acceptance,
+ create_pr_flag,
+ pr_base_branch,
+ pr_branch_prefix,
+ pr_draft,
+ pr_allow_dirty,
closed_loop_suite_path,
closed_loop_saturation_threshold,
closed_loop_min_iters,
@@ -2000,6 +2107,11 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
closed_loop_in_valset=closed_loop_in_valset,
closed_loop_agent_model=closed_loop_agent_model,
closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
+ create_pr_flag=create_pr_flag,
+ pr_base_branch=pr_base_branch,
+ pr_branch_prefix=pr_branch_prefix,
+ pr_draft=pr_draft,
+ pr_allow_dirty=pr_allow_dirty,
)
except HermesProviderError as exc:
# Render a clean error panel instead of dumping a Python traceback
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index b76388d..8a97d4a 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -58,6 +58,12 @@
register_litellm_cost_callback,
register_litellm_failure_callback,
)
+from evolution.core.pr_automation import (
+ create_pr,
+ disabled_pr_block,
+ find_git_root,
+ pr_block_from_result,
+)
from evolution.core.quality_gate import (
QUALITY_GATE_PRESETS,
_check_cl_primary_gate,
@@ -384,6 +390,11 @@ def evolve(
force_saturation_check: bool = False,
gepa_minibatch_size: int = 3,
gepa_acceptance: str = "improvement-or-equal",
+ create_pr_flag: bool = False,
+ pr_base_branch: str = "main",
+ pr_branch_prefix: str = "evolve/",
+ pr_draft: bool = False,
+ pr_allow_dirty: bool = False,
) -> dict[str, Any]:
"""Evolve one tool description inside a manifest.
@@ -804,6 +815,7 @@ def evolve(
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
+ create_pr=create_pr_flag,
fitness_profile=fitness_profile,
enable_confusable_bucket=config.enable_confusable_bucket,
)
@@ -1103,8 +1115,63 @@ def evolve(
# consumers distinguish 'preflight saw no weak_signal' from
# 'preflight didn't run.'
decision_payload["reason_synthetic"] = "preflight_skipped"
+
+ # Compute evolved_manifest + persist baseline/evolved manifest
+ # artifacts once on the deploy path. The PR hook, the patch
+ # emitter, and the apply call all reference these.
+ if growth_pass:
+ evolved_manifest = manifest.replace_description(tool_name, evolved_description)
+ evolved_manifest_path = output_dir / "evolved_manifest.json"
+ evolved_manifest_path.write_text(
+ json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n"
+ )
+ (output_dir / "baseline_manifest.json").write_text(
+ json.dumps(_manifest_to_dict(manifest), indent=2) + "\n"
+ )
+
+ # Run PR automation BEFORE writing gate_decision.json so the PR
+ # outcome lands in the same single-write block — calibration
+ # scripts grepping pr_created don't have to special-case a
+ # re-write or missing key.
+ pr_created_block: dict[str, Any] = disabled_pr_block()
+ if growth_pass and create_pr_flag:
+ source_repo_root = find_git_root(manifest_path)
+ source_artifact_relpath = (
+ str(manifest_path.relative_to(source_repo_root))
+ if source_repo_root is not None
+ else str(manifest_path)
+ )
+ pr_result = create_pr(
+ source_repo_root=source_repo_root,
+ source_artifact_relpath=source_artifact_relpath,
+ evolved_artifact_path=evolved_manifest_path,
+ artifact_name=tool_name,
+ gate_decision=decision_payload,
+ metrics={
+ "baseline_mean": avg_baseline,
+ "evolved_mean": avg_evolved,
+ "delta": improvement,
+ },
+ base_branch=pr_base_branch,
+ branch_prefix=pr_branch_prefix,
+ draft=pr_draft,
+ allow_dirty=pr_allow_dirty,
+ console=console,
+ )
+ pr_created_block = pr_block_from_result(pr_result)
+ decision_payload["pr_created"] = pr_created_block
+
gate_path = write_gate_decision(output_dir, decision_payload)
console.print(f" [dim]Gate decision logged to {gate_path}[/dim]")
+ if pr_created_block["status"] == "created":
+ console.print(
+ f" [green]✓ PR opened: {pr_created_block['url']}[/green]"
+ )
+ elif pr_created_block["status"] in ("skipped", "failed"):
+ console.print(
+ f" [yellow]PR automation {pr_created_block['status']}: "
+ f"{pr_created_block['reason']}[/yellow]"
+ )
if not growth_pass:
console.print("[red]✗ Evolved description REJECTED by quality gate — not deploying[/red]")
@@ -1168,13 +1235,6 @@ def evolve(
console.print()
console.print(table)
- evolved_manifest = manifest.replace_description(tool_name, evolved_description)
- (output_dir / "baseline_manifest.json").write_text(
- json.dumps(_manifest_to_dict(manifest), indent=2) + "\n"
- )
- (output_dir / "evolved_manifest.json").write_text(
- json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n"
- )
metrics = {
"tool_name": tool_name,
"manifest_path": str(manifest_path),
@@ -1244,6 +1304,7 @@ def evolve(
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
+ create_pr=create_pr_flag,
fitness_profile=fitness_profile,
enable_confusable_bucket=config.enable_confusable_bucket,
)
@@ -1493,6 +1554,48 @@ def evolve(
"fix for noisy LM-judge fitness where strict acceptance rejects "
"~50% of true-equal mutations.",
)
+@click.option(
+ "--create-pr/--no-create-pr",
+ "create_pr_flag",
+ is_flag=True,
+ default=False,
+ help="On a deploy decision, branch the source repo, commit the evolved "
+ "artifact, push, and open a GitHub PR. Off by default — opt in "
+ "per-run. No-op on reject. Skips cleanly when the source isn't "
+ "git-backed (e.g. Claude Code plugin cache).",
+)
+@click.option(
+ "--pr-base-branch",
+ "pr_base_branch",
+ default="main",
+ type=str,
+ help="Target branch for the PR opened by --create-pr (default: main).",
+)
+@click.option(
+ "--pr-branch-prefix",
+ "pr_branch_prefix",
+ default="evolve/",
+ type=str,
+ help="Prefix for the PR's head branch under --create-pr. Branch names "
+ "become '{prefix}{artifact}-{timestamp}-{hex}'.",
+)
+@click.option(
+ "--pr-draft",
+ "pr_draft",
+ is_flag=True,
+ default=False,
+ help="Open the --create-pr PR as a draft (recommended for personal "
+ "automation pipelines that want a human review gate before merge).",
+)
+@click.option(
+ "--pr-allow-dirty",
+ "pr_allow_dirty",
+ is_flag=True,
+ default=False,
+ help="Override --create-pr's dirty-tree refusal. Default behavior "
+ "skips PR creation when the source repo has uncommitted changes "
+ "to avoid sweeping unrelated edits into the evolution PR.",
+)
@click.option(
"--closed-loop-in-valset/--no-closed-loop-in-valset",
"closed_loop_in_valset",
@@ -1548,6 +1651,11 @@ def main(
force_saturation_check: bool,
gepa_minibatch_size: int,
gepa_acceptance: str,
+ create_pr_flag: bool,
+ pr_base_branch: str,
+ pr_branch_prefix: str,
+ pr_draft: bool,
+ pr_allow_dirty: bool,
closed_loop_suite_path: Optional[Path],
closed_loop_hermes_repo: Optional[Path],
closed_loop_saturation_threshold: float,
@@ -1602,6 +1710,11 @@ def main(
force_saturation_check=force_saturation_check,
gepa_minibatch_size=gepa_minibatch_size,
gepa_acceptance=gepa_acceptance,
+ create_pr_flag=create_pr_flag,
+ pr_base_branch=pr_base_branch,
+ pr_branch_prefix=pr_branch_prefix,
+ pr_draft=pr_draft,
+ pr_allow_dirty=pr_allow_dirty,
)
except HermesProviderError as exc:
# Render a clean error panel instead of dumping a Python traceback —
diff --git a/tests/core/test_pr_automation.py b/tests/core/test_pr_automation.py
new file mode 100644
index 0000000..a3d6e82
--- /dev/null
+++ b/tests/core/test_pr_automation.py
@@ -0,0 +1,280 @@
+"""Tests for ``evolution.core.pr_automation``.
+
+Covers the orchestration paths (skip / failed / created) with mocked
+subprocess plus a single integration test against an ephemeral local
+git pair to confirm the real `git`/file-copy choreography works.
+"""
+
+import re
+import subprocess
+from datetime import datetime
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from rich.console import Console
+
+from evolution.core.pr_automation import (
+ PRResult,
+ _atomic_copy,
+ _branch_name,
+ _format_pr_body,
+ create_pr,
+ find_git_root,
+)
+
+
+def _ok(stdout: str = "", stderr: str = "") -> subprocess.CompletedProcess:
+    """Fake a finished subprocess that exited 0 with the given output."""
+    return subprocess.CompletedProcess([], 0, stdout=stdout, stderr=stderr)
+
+
+def _fail(stderr: str, stdout: str = "") -> subprocess.CompletedProcess:
+    """Fake a finished subprocess that exited 1 with the given output."""
+    return subprocess.CompletedProcess([], 1, stdout=stdout, stderr=stderr)
+
+
+def _happy_path_side_effect(pr_url: str = "https://github.com/o/r/pull/42"):
+    """Build the ordered subprocess results for a fully successful run.
+
+    Meant to be passed as ``side_effect`` to a patched ``subprocess.run``:
+    one successful result per call, in the order create_pr issues them.
+    """
+    stdout_per_call = [
+        "",              # git status --porcelain (clean)
+        "",              # git fetch
+        "",              # git checkout -b
+        "",              # git add
+        "",              # git commit
+        "abc1234\n",     # git rev-parse HEAD
+        "",              # git push
+        f"{pr_url}\n",   # gh pr create
+    ]
+    return [_ok(stdout=text) for text in stdout_per_call]
+
+
+class TestPRAutomation:
+ def _kwargs(self, tmp_path: Path, **overrides):
+ evolved = tmp_path / "evolved.md"
+ evolved.write_text("evolved content")
+ source_root = tmp_path / "source"
+ (source_root / "skills" / "test").mkdir(parents=True)
+ (source_root / "skills" / "test" / "SKILL.md").write_text("baseline")
+ defaults = dict(
+ source_repo_root=source_root,
+ source_artifact_relpath="skills/test/SKILL.md",
+ evolved_artifact_path=evolved,
+ artifact_name="test-skill",
+ gate_decision={"decision_signal": "synthetic", "decision": "deploy"},
+ metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15},
+ base_branch="main",
+ branch_prefix="evolve/",
+ draft=False,
+ allow_dirty=False,
+ console=Console(),
+ )
+ defaults.update(overrides)
+ return defaults
+
+ def test_happy_path_creates_pr(self, tmp_path: Path):
+ responses = _happy_path_side_effect("https://github.com/o/r/pull/123")
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses):
+ result = create_pr(**self._kwargs(tmp_path))
+ assert result.status == "created"
+ assert result.url == "https://github.com/o/r/pull/123"
+ assert result.branch.startswith("evolve/test-skill-")
+ assert result.commit_sha == "abc1234"
+
+ def test_skipped_when_source_repo_root_is_none(self, tmp_path: Path):
+ with patch("evolution.core.pr_automation.subprocess.run") as run:
+ result = create_pr(**self._kwargs(tmp_path, source_repo_root=None))
+ assert result.status == "skipped"
+ assert "git-backed" in result.reason
+ assert result.branch is None
+ assert result.commit_sha is None
+ assert result.url is None
+ run.assert_not_called()
+
+ def test_skipped_on_dirty_tree_when_allow_dirty_false(self, tmp_path: Path):
+ responses = [_ok(stdout=" M file.txt\n?? other.txt\n")]
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses):
+ result = create_pr(**self._kwargs(tmp_path))
+ assert result.status == "skipped"
+ assert "dirty" in result.reason.lower()
+
+ def test_proceeds_on_dirty_tree_when_allow_dirty_true(self, tmp_path: Path):
+ responses = [
+ _ok(stdout=" M file.txt\n"), # dirty status — but ignored
+ _ok(), # fetch
+ _ok(), # checkout
+ _ok(), # add
+ _ok(), # commit
+ _ok(stdout="deadbee\n"), # rev-parse
+ _ok(), # push
+ _ok(stdout="https://github.com/o/r/pull/9\n"), # gh pr create
+ ]
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses):
+ result = create_pr(**self._kwargs(tmp_path, allow_dirty=True))
+ assert result.status == "created"
+ assert result.url == "https://github.com/o/r/pull/9"
+
+ def test_failed_when_gh_not_on_path(self, tmp_path: Path):
+ responses = [
+ _ok(stdout=""),
+ _ok(), _ok(), _ok(), _ok(),
+ _ok(stdout="abc1234\n"),
+ _ok(),
+ FileNotFoundError("gh: command not found"),
+ ]
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses):
+ result = create_pr(**self._kwargs(tmp_path))
+ assert result.status == "failed"
+ assert "gh" in result.reason.lower()
+ assert result.branch is not None
+ assert result.commit_sha == "abc1234"
+
+ def test_failed_when_push_fails_captures_stderr(self, tmp_path: Path):
+ responses = [
+ _ok(stdout=""),
+ _ok(), _ok(), _ok(), _ok(),
+ _ok(stdout="abc1234\n"),
+ _fail(stderr="remote: Permission denied to user@example.com\n"),
+ ]
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=responses):
+ result = create_pr(**self._kwargs(tmp_path))
+ assert result.status == "failed"
+ assert "Permission denied" in result.reason
+ assert result.branch is not None
+ assert result.commit_sha == "abc1234"
+
+ def test_branch_name_has_4char_suffix_and_sanitized(self):
+ ts = datetime(2026, 5, 25, 14, 30, 45)
+ name = _branch_name("evolve/", "some/skill:with spaces", ts)
+ assert re.match(r"^evolve/some-skill-with-spaces-\d{8}-\d{6}-[0-9a-f]{4}$", name), name
+
+ def test_format_pr_body_synthetic_deploy(self):
+ body = _format_pr_body(
+ gate_decision={
+ "decision_signal": "synthetic",
+ "decision": "deploy",
+ "reason": "growth_quality_gate_passed",
+ "baseline_chars": 1000,
+ "evolved_chars": 1100,
+ "cost_summary": {"total_usd": 1.23},
+ },
+ metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15},
+ )
+ assert "deploy" in body.lower()
+ assert "0.40" in body and "0.55" in body
+ assert "+0.15" in body or "0.15" in body
+ assert "Generated by agent-self-evolution" in body
+
+ def test_format_pr_body_cl_primary_deploy(self):  # closed-loop deploy: the task gain must lead the PR body
+ body = _format_pr_body(
+ gate_decision={
+ "decision_signal": "closed_loop",
+ "decision": "deploy",
+ "reason": "cl_primary_gate_passed",
+ "cl_tasks_gained": 3,
+ "cl_required_gain": 1,
+ "cost_summary": {"total_usd": 4.50},
+ },
+ metrics={"baseline_mean": 0.40, "evolved_mean": 0.42, "delta": 0.02},
+ )
+ # CL gain should appear prominently (within the first 600 chars, i.e. above the fold)
+ assert "+3" in body[:600] or "3 task" in body[:600] or "cl_tasks_gained" in body[:600].lower()
+ assert "closed" in body.lower() or "cl" in body.lower()  # NOTE(review): "closed" contains "cl", so any "cl" substring passes
+ assert "Generated by agent-self-evolution" in body  # attribution footer text
+
+ def test_integration_against_ephemeral_repo(self, tmp_path: Path):  # real git against a local bare remote; only `gh` is faked
+ bare = tmp_path / "remote.git"
+ clone = tmp_path / "clone"
+ subprocess.run(["git", "init", "--bare", str(bare)], check=True, capture_output=True)
+ subprocess.run(["git", "clone", str(bare), str(clone)], check=True, capture_output=True)
+ # Seed the repo so origin/main exists.
+ subprocess.run(["git", "-C", str(clone), "config", "user.email", "t@t"], check=True)  # repo-local identity for the seed commit
+ subprocess.run(["git", "-C", str(clone), "config", "user.name", "t"], check=True)
+ subprocess.run(["git", "-C", str(clone), "checkout", "-b", "main"], check=True, capture_output=True)
+ skill_path = clone / "skills" / "test" / "SKILL.md"
+ skill_path.parent.mkdir(parents=True)
+ skill_path.write_text("baseline content\n")
+ subprocess.run(["git", "-C", str(clone), "add", "."], check=True)
+ subprocess.run(["git", "-C", str(clone), "commit", "-m", "seed"], check=True, capture_output=True)
+ subprocess.run(["git", "-C", str(clone), "push", "-u", "origin", "main"], check=True, capture_output=True)
+
+ evolved = tmp_path / "evolved.md"  # lives outside the clone; passed to create_pr as the evolved artifact
+ evolved.write_text("evolved improved content\n")
+
+ real_run = subprocess.run  # saved before the patch is active so fake_run can delegate to the real function
+
+ def fake_run(cmd, *args, **kwargs):
+ # Only intercept the gh CLI call; everything else hits real git.
+ if isinstance(cmd, (list, tuple)) and len(cmd) > 0 and cmd[0] == "gh":
+ return subprocess.CompletedProcess(
+ args=cmd, returncode=0,
+ stdout="https://github.com/fake/repo/pull/777\n", stderr="",
+ )
+ return real_run(cmd, *args, **kwargs)
+
+ with patch("evolution.core.pr_automation.subprocess.run", side_effect=fake_run):
+ result = create_pr(
+ source_repo_root=clone,
+ source_artifact_relpath="skills/test/SKILL.md",
+ evolved_artifact_path=evolved,
+ artifact_name="test-skill",
+ gate_decision={"decision_signal": "synthetic", "decision": "deploy"},
+ metrics={"baseline_mean": 0.40, "evolved_mean": 0.55, "delta": 0.15},
+ base_branch="main",
+ branch_prefix="evolve/",
+ draft=False,
+ allow_dirty=False,
+ console=Console(),
+ )
+
+ assert result.status == "created"
+ assert result.url == "https://github.com/fake/repo/pull/777"  # the faked gh stdout, without its trailing newline
+ assert result.branch.startswith("evolve/test-skill-")
+ assert result.commit_sha and len(result.commit_sha) >= 7
+
+ # Branch exists on the bare remote
+ ls = subprocess.run(
+ ["git", "-C", str(bare), "branch", "--list", result.branch],
+ check=True, capture_output=True, text=True,
+ )
+ assert result.branch in ls.stdout
+
+ # File content on the new branch matches evolved artifact
+ show = subprocess.run(
+ ["git", "-C", str(bare), "show", f"{result.branch}:skills/test/SKILL.md"],
+ check=True, capture_output=True, text=True,
+ )
+ assert show.stdout == "evolved improved content\n"
+
+
+class TestFindGitRoot:  # find_git_root: repo root for a path inside a git repo, None otherwise
+ def test_returns_root_inside_repo(self, tmp_path: Path):
+ subprocess.run(["git", "init", str(tmp_path)], check=True, capture_output=True)
+ nested = tmp_path / "a" / "b" / "c.md"  # file three levels below the repo root
+ nested.parent.mkdir(parents=True)
+ nested.write_text("x")
+ root = find_git_root(nested)
+ assert root is not None and root.resolve() == tmp_path.resolve()  # resolve() both sides so symlinked tmp dirs still compare equal
+
+ def test_returns_none_outside_repo(self, tmp_path: Path):
+ # tmp_path is fresh and not a git repo
+ assert find_git_root(tmp_path / "missing.md") is None  # NOTE(review): missing.md is never created; also assumes no ancestor dir is a repo
+
+
+class TestAtomicCopy:  # _atomic_copy(src, dst): dst must end up with src's content
+ def test_replaces_existing_file(self, tmp_path: Path):
+ src = tmp_path / "src.md"
+ dst = tmp_path / "dst.md"
+ src.write_text("new")
+ dst.write_text("old")  # pre-existing destination that must be overwritten
+ _atomic_copy(src, dst)
+ assert dst.read_text() == "new"
+
+ def test_creates_when_missing(self, tmp_path: Path):
+ src = tmp_path / "src.md"
+ dst = tmp_path / "dst.md"  # never written before the copy, so it must be created
+ src.write_text("hello")
+ _atomic_copy(src, dst)
+ assert dst.read_text() == "hello"
diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py
index 2857575..0ed1954 100644
--- a/tests/core/test_run_inputs.py
+++ b/tests/core/test_run_inputs.py
@@ -28,6 +28,7 @@ def test_skill_side_shape(self):
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
+ create_pr=False,
)
assert set(result.keys()) == {
"seed",
@@ -41,8 +42,10 @@ def test_skill_side_shape(self):
"quality_gate_preset",
"eval_source",
"gepa_acceptance",
+ "create_pr",
}
assert result["gepa_acceptance"] == "improvement_or_equal"
+ assert result["create_pr"] is False
def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
config = _fake_config()
@@ -54,6 +57,7 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="strict_improvement",
+ create_pr=True,
fitness_profile="balanced",
enable_confusable_bucket=True,
)
@@ -69,12 +73,14 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
"quality_gate_preset",
"eval_source",
"gepa_acceptance",
+ "create_pr",
"fitness_profile",
"enable_confusable_bucket",
}
assert result["gepa_acceptance"] == "strict_improvement"
assert result["fitness_profile"] == "balanced"
assert result["enable_confusable_bucket"] is True
+ assert result["create_pr"] is True
def test_resolved_lms_matches_helper_output(self):
config = _fake_config()
@@ -85,6 +91,7 @@ def test_resolved_lms_matches_helper_output(self):
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
+ create_pr=False,
)
expected = resolved_lms_dump(
optimizer="openai/gpt-4.1",
@@ -106,6 +113,7 @@ def test_enable_confusable_bucket_round_trips_when_passed(self):
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
+ create_pr=False,
fitness_profile="balanced",
enable_confusable_bucket=config.enable_confusable_bucket,
)
diff --git a/tests/skills/test_evolve_skill_validation_flow.py b/tests/skills/test_evolve_skill_validation_flow.py
index 8a49759..38e7633 100644
--- a/tests/skills/test_evolve_skill_validation_flow.py
+++ b/tests/skills/test_evolve_skill_validation_flow.py
@@ -965,3 +965,104 @@ def test_gepa_acceptance_strict_passes_strict_improvement(self, skill_dir):
f"Expected acceptance_criterion=strict_improvement; "
f"got {captured['gepa_kwargs']!r}. CLI output: {result.output}"
)
+
+
+class TestPRAutomationWiring:
+ """Wiring tests for the --create-pr flag's pr_created block. The flag
+ is off by default; when on with no git-backed source, create_pr returns
+ a skipped PRResult that should round-trip into gate_decision.json."""
+
+ @pytest.fixture
+ def skill_dir(self, tmp_path):  # returns the skills root (tmp_path/skills), not the demo-skill directory itself
+ skills_root = tmp_path / "skills"
+ skill_path = skills_root / "demo-skill"
+ skill_path.mkdir(parents=True)
+ (skill_path / "SKILL.md").write_text(
+ "---\nname: demo-skill\ndescription: a test skill\n---\n\nDo X.\n"
+ )
+ return skills_root
+
+ def _run(self, skill_dir: Path, extra_cli_args: list[str], monkeypatch, *, find_git_root_return=None):  # returns the CliRunner result
+ fake_candidate = MagicMock()
+ fake_candidate.skill_text = "evolved skill text"
+ fake_module = MagicMock()  # stands in for the module returned by the patched GEPA.compile
+ fake_module.skill_text = "evolved skill text"
+ fake_module.detailed_results = SimpleNamespace(
+ candidates=[fake_candidate],
+ val_aggregate_scores=[1.0],
+ best_idx=0,
+ )
+ fake_builder = MagicMock()
+ fake_builder.generate.return_value = _fake_skill_dataset()
+ # Pin cwd to a tmp dir so the orchestrator's "output/" parent
+ # doesn't pile inside the repo's working tree.
+ monkeypatch.chdir(skill_dir.parent)
+ with patch(
+ "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder,
+ ), patch(
+ "evolution.skills.evolve_skill._preflight_lm_credentials",
+ ), patch(
+ "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module,
+ ), patch(
+ "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
+ return_value=(0.6, [0.6] * 10),
+ ), patch(
+ "evolution.skills.evolve_skill.find_git_root",
+ return_value=find_git_root_return,
+ ):
+ runner = CliRunner()
+ return runner.invoke(
+ evolve_skill_cli,
+ ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+ "--iterations", "1", "--no-preflight",
+ "--no-saturation-check", "--quality-gate", "non-inferiority",
+ *extra_cli_args],
+ catch_exceptions=False,  # let exceptions from the CLI propagate into the test instead of being captured
+ )
+
+ def _read_latest_gate_decision(self, base_dir: Path) -> dict:
+ # The orchestrator writes to output/<skill>/<run>/gate_decision.json
+ # under cwd; take the last (sorted) run under the demo-skill bucket.
+ runs = sorted((base_dir / "output" / "demo-skill").iterdir())
+ assert runs, "no run directory produced under output/demo-skill"
+ path = runs[-1] / "gate_decision.json"
+ assert path.exists(), f"gate_decision.json missing at {path}"
+ return json.loads(path.read_text())
+
+ def test_pr_created_block_disabled_when_create_pr_false(
+ self, skill_dir, tmp_path, monkeypatch,
+ ):
+ # Default --no-create-pr → gate_decision.json carries pr_created with
+ # status "disabled". The key is always present so calibration scripts
+ # don't have to check for absence.
+ result = self._run(skill_dir, extra_cli_args=[], monkeypatch=monkeypatch)
+ assert result.exit_code == 0, result.output
+ payload = self._read_latest_gate_decision(skill_dir.parent)  # skill_dir.parent is the cwd pinned by _run
+ # All 5 fields present even when disabled — keeps downstream
+ # consumers free to use payload["pr_created"]["url"] without
+ # special-casing the absence of the key.
+ assert payload["pr_created"] == {
+ "status": "disabled",
+ "reason": None,
+ "branch": None,
+ "commit_sha": None,
+ "url": None,
+ }
+ assert payload["run_inputs"]["create_pr"] is False
+
+ def test_pr_created_block_records_skip_when_create_pr_true_and_no_repo(
+ self, skill_dir, tmp_path, monkeypatch,
+ ):
+ # --create-pr on + find_git_root → None (no git-backed source) lands
+ # a skipped PRResult in pr_created with the right reason string.
+ result = self._run(
+ skill_dir,
+ extra_cli_args=["--create-pr"],
+ monkeypatch=monkeypatch,
+ find_git_root_return=None,
+ )
+ assert result.exit_code == 0, result.output
+ payload = self._read_latest_gate_decision(skill_dir.parent)  # skill_dir.parent is the cwd pinned by _run
+ assert payload["pr_created"]["status"] == "skipped"
+ assert "git-backed" in payload["pr_created"]["reason"]
+ assert payload["run_inputs"]["create_pr"] is True
diff --git a/tests/tools/test_evolve_tool_validation_flow.py b/tests/tools/test_evolve_tool_validation_flow.py
index 05198f2..5e21ea7 100644
--- a/tests/tools/test_evolve_tool_validation_flow.py
+++ b/tests/tools/test_evolve_tool_validation_flow.py
@@ -237,6 +237,85 @@ def test_gate_decision_schema_on_deploy(self, temp_manifest: Path, tmp_path: Pat
assert result["baseline_score"] < result["evolved_score"]
+class TestPRAutomationWiring:
+ """Wiring tests for the --create-pr flag's pr_created block. The flag
+ is off by default; when on with no git-backed source, create_pr returns
+ a skipped PRResult that should round-trip into gate_decision.json."""
+
+ def _run(self, temp_manifest, run_dir, *, create_pr_flag, find_git_root_return=None):  # runs evolve() with LM, GEPA, judge and git lookup patched
+ manifest = ToolManifest.from_json_file(temp_manifest)
+ with (
+ patch.object(
+ SyntheticDatasetBuilder,
+ "_call_lm_for_bucket",
+ side_effect=_bucket_side_effect(15, 9, 6),
+ ),
+ patch(
+ "evolution.tools.evolve_tool.dspy.GEPA",
+ new=_make_fake_gepa(
+ _build_evolved_module(manifest, EVOLVED_DESCRIPTION)
+ ),
+ ),
+ patch.object(
+ ToolJudge,
+ "score",
+ new=_scripted_judge_score(target_score=0.95, regression_score=0.0),
+ ),
+ patch.object(
+ ToolModule,
+ "forward",
+ new=_scripted_module_forward(expected_tool_for_evolved="search_files"),
+ ),
+ patch(
+ "evolution.tools.evolve_tool.find_git_root",
+ return_value=find_git_root_return,  # None stands in for a source with no git repo behind it
+ ),
+ ):
+ evolve(
+ tool_name="search_files",
+ manifest_path=temp_manifest,
+ iterations=1,
+ eval_dataset_size=30,
+ holdout_ratio=0.5,
+ quality_gate="non-inferiority",
+ enable_confusable_bucket=True,
+ output_dir=run_dir,  # the tests read gate_decision.json back from this directory
+ create_pr_flag=create_pr_flag,
+ )
+
+ def test_pr_created_block_disabled_when_create_pr_false(
+ self, temp_manifest: Path, tmp_path: Path,
+ ):  # flag off: pr_created is still written, with status "disabled"
+ run_dir = tmp_path / "run"
+ self._run(temp_manifest, run_dir, create_pr_flag=False)
+ payload = json.loads((run_dir / "gate_decision.json").read_text())
+ # All 5 fields present even when disabled — keeps downstream
+ # consumers free to use payload["pr_created"]["url"] without
+ # special-casing the absence of the key.
+ assert payload["pr_created"] == {
+ "status": "disabled",
+ "reason": None,
+ "branch": None,
+ "commit_sha": None,
+ "url": None,
+ }
+ assert payload["run_inputs"]["create_pr"] is False
+
+ def test_pr_created_block_records_skip_when_create_pr_true_and_no_repo(
+ self, temp_manifest: Path, tmp_path: Path,
+ ):  # flag on + find_git_root returning None: a "skipped" result is recorded
+ run_dir = tmp_path / "run"
+ self._run(
+ temp_manifest, run_dir,
+ create_pr_flag=True,
+ find_git_root_return=None,
+ )
+ payload = json.loads((run_dir / "gate_decision.json").read_text())
+ assert payload["pr_created"]["status"] == "skipped"
+ assert "git-backed" in payload["pr_created"]["reason"]
+ assert payload["run_inputs"]["create_pr"] is True
+
+
class TestApplyOverwritesSourceManifest:
"""`apply=True` writes the evolved description back to the source manifest."""