From cfaa76f26ec4aed9e6a42a617e907656e9df9622 Mon Sep 17 00:00:00 2001 From: weijt606 Date: Tue, 26 May 2026 01:22:18 +0200 Subject: [PATCH 1/2] fix(adapters): update Codex & OpenCode invocations; refresh model default (v0.2.3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLI agents moved on; two adapters were calling removed flags: - Codex: codex --quiet --auto-edit → codex exec --skip-git-repo-check --sandbox workspace-write (non-interactive is now 'codex exec'; old flags removed upstream; workspace isn't a git repo) - OpenCode: opencode -p → opencode run (top-level -p no longer supported upstream) Verified claude-code (claude -p) and hermes (hermes chat -q) are still current; claw-code mirrors Claude Code. Also bump default proposer model claude-sonnet-4-20250514 → claude-sonnet-4-6 (api/openai backends). Docs + CHANGELOG synced, version 0.2.3. 206 tests, lint clean. --- CHANGELOG.md | 18 ++++++++++++++++++ README.md | 18 +++++++++--------- README_CN.md | 18 +++++++++--------- docs/development/technical-architecture.md | 6 +++--- package.json | 2 +- pyproject.toml | 2 +- src/polyharness/__init__.py | 2 +- src/polyharness/config.py | 3 ++- src/polyharness/proposer/adapters/codex.py | 9 ++++++--- src/polyharness/proposer/adapters/opencode.py | 6 ++++-- src/polyharness/proposer/api_proposer.py | 2 +- tests/test_cli_adapters.py | 6 +++++- 12 files changed, 60 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32723fb..d0c4d8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.2.3] - 2026-05-26 + +### Fixed +- **Codex adapter** — switched to `codex exec` headless mode; the old + `codex --quiet --auto-edit` invocation was removed upstream. Also adds + `--skip-git-repo-check` (the optimization workspace isn't a git repo) and + `--sandbox workspace-write` (lets the agent edit within the workspace). +- **OpenCode adapter** — switched to the `opencode run` subcommand; the + top-level `-p` flag is no longer supported upstream. + +### Changed +- Default proposer model `claude-sonnet-4-20250514` → `claude-sonnet-4-6` + (affects `api`/`openai` backends; CLI backends use their own model). +- Verified `claude-code` (`claude -p`) and `hermes` (`hermes chat -q`) are still + current; `claw-code` mirrors Claude Code (unverified, low usage). +- Docs (README / README_CN / technical-architecture) updated to the current CLI + invocations. + ## [0.2.2] - 2026-05-24 ### Added diff --git a/README.md b/README.md index 1699c26..b927adf 100644 --- a/README.md +++ b/README.md @@ -303,9 +303,9 @@ Just add `ph wrap --auto-evolve` in front of your agent command (pick the one ma # CLI agent backends — wrap the agent you already use ph wrap --auto-evolve claude -p "Refactor the auth module to use JWT" # Claude Code ph wrap --auto-evolve claw -p "Write integration tests for payments" # Claw Code -ph wrap --auto-evolve codex "Add retry logic to the API client" # Codex +ph wrap --auto-evolve codex exec "Add retry logic to the API client" # Codex ph wrap --auto-evolve hermes chat -q "Refactor the DB connection pool" # Hermes Agent -ph wrap --auto-evolve opencode -p "Fix the flaky parser test" # OpenCode +ph wrap --auto-evolve opencode run "Fix the flaky parser test" # OpenCode # Local models — wrap the CLI command directly ph wrap --auto-evolve ollama run gemma3 "Summarize this document" # Ollama @@ -373,9 +373,9 @@ After that, just use your agent as usual: ```bash claude -p "Refactor auth to JWT" # automatically becomes: ph wrap --auto-evolve claude -p ... claw -p "Write payment tests" # same — auto-wrapped -codex "Add retry logic" # same +codex exec "Add retry logic" # same hermes chat -q "Refactor pool" # same -opencode -p "Fix flaky test" # same +opencode run "Fix flaky test" # same ``` How it works: a `preexec` hook in your shell detects `claude`/`claw`/`codex`/`hermes`/`opencode` commands and transparently redirects them through `ph wrap --auto-evolve`. Your output is unchanged. @@ -466,9 +466,9 @@ The Proposer reads **all of this** before generating the next candidate. It can | `openai` | — | OpenAI-compatible API (Ollama, vLLM, LM Studio, etc). Needs `OPENAI_API_KEY` | | `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) | | `claw-code` | `claw -p` | Open-source Claw Code CLI | -| `codex` | `codex --quiet` | OpenAI Codex CLI | +| `codex` | `codex exec` | OpenAI Codex CLI | | `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI | -| `opencode` | `opencode -p` | OpenCode CLI | +| `opencode` | `opencode run` | OpenCode CLI | | `local` | — | Offline rule-based engine for development & testing | `ph doctor` auto-detects all available backends and shows their status. @@ -543,7 +543,7 @@ proposer: backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local ensemble: [] # If non-empty, pick among these backends per iteration via a UCB bandit bandit_c: 1.41421356 # UCB exploration constant (higher = more exploration) - model: claude-sonnet-4-20250514 # Model name (for api/openai backends) + model: claude-sonnet-4-6 # Model name (for api/openai backends) base_url: null # Custom API endpoint (for openai backend) api_key: null # API key override (null = use env var) max_tokens: 16384 # Max output tokens per proposer turn @@ -772,9 +772,9 @@ polyharness/ │ │ └── adapters/ # Per-agent CLI adapters │ │ ├── claude_code.py # claude -p │ │ ├── claw_code.py # claw -p -│ │ ├── codex.py # codex --quiet --auto-edit +│ │ ├── codex.py # codex exec │ │ ├── hermes.py # hermes chat -q -│ │ └── opencode.py # opencode -p +│ │ └── opencode.py # opencode run │ └── templates/ # 5 built-in task templates │ ├── text-classification/ │ ├── math-word-problems/ diff --git a/README_CN.md b/README_CN.md index f9a8581..ff2a381 100644 --- a/README_CN.md +++ b/README_CN.md @@ -303,9 +303,9 @@ ph clean --keep-best # 清理候选目录释放磁盘空间 # CLI agent 后端 —— 直接包裹你已经在用的 agent ph wrap --auto-evolve claude -p "把 auth 模块重构为 JWT 方案" # Claude Code ph wrap --auto-evolve claw -p "给支付服务写集成测试" # Claw Code -ph wrap --auto-evolve codex "给 API 客户端加上重试逻辑" # Codex +ph wrap --auto-evolve codex exec "给 API 客户端加上重试逻辑" # Codex ph wrap --auto-evolve hermes chat -q "重构数据库连接池" # Hermes Agent -ph wrap --auto-evolve opencode -p "修复不稳定的 parser 测试" # OpenCode +ph wrap --auto-evolve opencode run "修复不稳定的 parser 测试" # OpenCode # 本地模型 —— 直接包裹 CLI 命令 ph wrap --auto-evolve ollama run gemma3 "总结这篇文档" # Ollama @@ -373,9 +373,9 @@ ph shell-hook install # 一次性设置,写入 ~/.zshrc ```bash claude -p "把 auth 重构为 JWT" # 自动变为:ph wrap --auto-evolve claude -p ... claw -p "写支付测试" # 同理——自动包裹 -codex "加重试逻辑" # 同理 +codex exec "加重试逻辑" # 同理 hermes chat -q "重构连接池" # 同理 -opencode -p "修复不稳定测试" # 同理 +opencode run "修复不稳定测试" # 同理 ``` 原理:shell 的 `preexec` 钩子检测到 `claude`/`claw`/`codex`/`hermes`/`opencode` 命令后,透明地通过 `ph wrap --auto-evolve` 转发。你的输出不会变。 @@ -466,9 +466,9 @@ Proposer 在生成下一个候选之前会读取**所有这些信息**。它能 | `openai` | — | 兼容 OpenAI 格式的本地/云端模型直连 (Ollama, vLLM, LM Studio 等),需配置 `OPENAI_API_KEY` | | `claude-code` | `claude -p` | 官方 Claude Code CLI(Pro/Teams 订阅) | | `claw-code` | `claw -p` | 开源 Claw Code CLI | -| `codex` | `codex --quiet` | OpenAI Codex CLI | +| `codex` | `codex exec` | OpenAI Codex CLI | | `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI | -| `opencode` | `opencode -p` | OpenCode CLI | +| `opencode` | `opencode run` | OpenCode CLI | | `local` | — | 离线规则引擎,用于开发和测试 | `ph doctor` 会自动检测所有可用后端并显示状态。 @@ -543,7 +543,7 @@ proposer: backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local ensemble: [] # 非空时,每轮用 UCB bandit 在这些后端中择优 bandit_c: 1.41421356 # UCB 探索常数(越大越偏探索) - model: claude-sonnet-4-20250514 # 模型名称(api/openai 后端使用) + model: claude-sonnet-4-6 # 模型名称(api/openai 后端使用) base_url: null # 自定义 API 端点(openai 后端使用) api_key: null # API 密钥覆盖(null = 使用环境变量) max_tokens: 16384 # 每轮 proposer 最大输出 token 数 @@ -772,9 +772,9 @@ polyharness/ │ │ └── adapters/ # 逐 agent CLI 适配器 │ │ ├── claude_code.py # claude -p │ │ ├── claw_code.py # claw -p -│ │ ├── codex.py # codex --quiet --auto-edit +│ │ ├── codex.py # codex exec │ │ ├── hermes.py # hermes chat -q -│ │ └── opencode.py # opencode -p +│ │ └── opencode.py # opencode run │ └── templates/ # 5 个内置任务模板 │ ├── text-classification/ │ ├── math-word-problems/ diff --git a/docs/development/technical-architecture.md b/docs/development/technical-architecture.md index a1a5cef..4d329a6 100644 --- a/docs/development/technical-architecture.md +++ b/docs/development/technical-architecture.md @@ -159,7 +159,7 @@ search: # Proposer 配置 proposer: - model: "claude-sonnet-4-20250514" # Proposer 模型 + model: "claude-sonnet-4-6" # Proposer 模型 max_tokens: 16384 # 单次输出上限 temperature: 0.7 # 生成温度 backend: "api" # api | claude-code | claw-code @@ -222,7 +222,7 @@ harness: { "iteration": 3, "parent": "iter_1", - "proposer_model": "claude-sonnet-4-20250514", + "proposer_model": "claude-sonnet-4-6", "proposer_reasoning": "iter_1 在 task_002 上失败因为缺少 retry 逻辑...", "changes_summary": "添加了 exponential backoff retry 到 API 调用层", "timestamp": "2026-04-02T14:30:00Z" @@ -1254,7 +1254,7 @@ ablation: ```yaml proposer: model: "claude-haiku-4-20250414" # 低成本模式 - # model: "claude-sonnet-4-20250514" # 平衡模式 + # model: "claude-sonnet-4-6" # 平衡模式 # model: "claude-opus-4-20250514" # 最强模式(论文设置) ``` diff --git a/package.json b/package.json index 095061e..81c718e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "polyharness", - "version": "0.2.2", + "version": "0.2.3", "description": "Make your AI agent evolve automatically through iterative harness optimization.", "keywords": [ "agent", diff --git a/pyproject.toml b/pyproject.toml index a93374c..889f635 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "polyharness" -version = "0.2.2" +version = "0.2.3" description = "Automated harness optimization for AI agents — make your agent evolve." readme = "README.md" license = "MIT" diff --git a/src/polyharness/__init__.py b/src/polyharness/__init__.py index e932175..fedd9b3 100644 --- a/src/polyharness/__init__.py +++ b/src/polyharness/__init__.py @@ -1,3 +1,3 @@ """PolyHarness — Automated harness optimization for AI agents.""" -__version__ = "0.2.2" +__version__ = "0.2.3" diff --git a/src/polyharness/config.py b/src/polyharness/config.py index 92041ff..f845f52 100644 --- a/src/polyharness/config.py +++ b/src/polyharness/config.py @@ -84,7 +84,8 @@ class ProposerConfig(BaseModel): description="UCB exploration constant for ensemble selection. Higher = more exploration.", ) model: str = Field( - default="claude-sonnet-4-20250514", description="Model for the Proposer agent." + default="claude-sonnet-4-6", + description="Model for the Proposer agent (api/openai backends; CLI backends use their own).", ) base_url: str | None = Field( default=None, description="Optional base URL for the API (useful for local models)." diff --git a/src/polyharness/proposer/adapters/codex.py b/src/polyharness/proposer/adapters/codex.py index 06f976a..7658e55 100644 --- a/src/polyharness/proposer/adapters/codex.py +++ b/src/polyharness/proposer/adapters/codex.py @@ -1,6 +1,8 @@ """Codex CLI adapter. -Invokes OpenAI's `codex` CLI agent in quiet/non-interactive mode. +Invokes OpenAI's `codex` CLI agent in headless/non-interactive mode via +`codex exec` (the old `--quiet`/`--auto-edit` flags were removed upstream). +See: developers.openai.com/codex/noninteractive """ from __future__ import annotations @@ -23,7 +25,8 @@ def build_command(self, prompt: str, *, cli_path: str | None = None) -> list[str binary = cli_path or self.default_binary return [ binary, - "--quiet", - "--auto-edit", # allow file edits without confirmation + "exec", # headless, non-interactive mode + "--skip-git-repo-check", # the workspace is not a git repo + "--sandbox", "workspace-write", # allow edits within the workspace cwd prompt, ] diff --git a/src/polyharness/proposer/adapters/opencode.py b/src/polyharness/proposer/adapters/opencode.py index fa20f29..c350b89 100644 --- a/src/polyharness/proposer/adapters/opencode.py +++ b/src/polyharness/proposer/adapters/opencode.py @@ -1,6 +1,8 @@ """OpenCode CLI adapter. -Invokes the open-source `opencode` CLI agent. +Invokes the open-source `opencode` CLI agent in non-interactive mode via the +`run` subcommand (the old top-level `-p` flag is no longer supported upstream). +See: opencode.ai/docs/cli """ from __future__ import annotations @@ -23,6 +25,6 @@ def build_command(self, prompt: str, *, cli_path: str | None = None) -> list[str binary = cli_path or self.default_binary return [ binary, - "-p", # prompt mode + "run", # non-interactive mode (replaces old -p) prompt, ] diff --git a/src/polyharness/proposer/api_proposer.py b/src/polyharness/proposer/api_proposer.py index 2e4efe4..f5b10c5 100644 --- a/src/polyharness/proposer/api_proposer.py +++ b/src/polyharness/proposer/api_proposer.py @@ -123,7 +123,7 @@ class APIProposer(BaseProposer): def __init__( self, - model: str = "claude-sonnet-4-20250514", + model: str = "claude-sonnet-4-6", max_tokens: int = 16384, temperature: float = 0.7, ): diff --git a/tests/test_cli_adapters.py b/tests/test_cli_adapters.py index 401454e..6d00c65 100644 --- a/tests/test_cli_adapters.py +++ b/tests/test_cli_adapters.py @@ -70,7 +70,9 @@ def test_codex_command(): adapter = CodexAdapter() cmd = adapter.build_command("fix it") assert cmd[0] == "codex" - assert "--quiet" in cmd + assert "exec" in cmd # headless mode (replaces old --quiet) + assert "--skip-git-repo-check" in cmd # workspace isn't a git repo + assert "--quiet" not in cmd # removed upstream assert "fix it" in cmd @@ -78,6 +80,8 @@ def test_opencode_command(): adapter = OpenCodeAdapter() cmd = adapter.build_command("optimize") assert cmd[0] == "opencode" + assert "run" in cmd # non-interactive subcommand (replaces old -p) + assert "-p" not in cmd # no longer supported upstream assert "optimize" in cmd From 678b9ca68d39ba26809539a8585a3b71533379ab Mon Sep 17 00:00:00 2001 From: weijt606 Date: Tue, 26 May 2026 01:30:20 +0200 Subject: [PATCH 2/2] fix(claude-code): pin Opus 4.7 + auto-accept edits in headless mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verified against current Claude Code (May 2026): - Add --permission-mode acceptEdits — REQUIRED for the agent to write candidate files in headless -p mode; recent Claude Code blocks edits without it. Still gates arbitrary Bash/network (least-privilege for the isolated workspace). - Pin --model claude-opus-4-7 (highest-capability Proposer). - Drop --verbose (noise in print mode; not needed). claude -p / --output-format text confirmed still current. --- CHANGELOG.md | 7 ++++++- src/polyharness/proposer/adapters/claude_code.py | 16 ++++++++++++++-- tests/test_cli_adapters.py | 8 ++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0c4d8d..2028a97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,10 +13,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). `--sandbox workspace-write` (lets the agent edit within the workspace). - **OpenCode adapter** — switched to the `opencode run` subcommand; the top-level `-p` flag is no longer supported upstream. +- **Claude Code adapter** — add `--permission-mode acceptEdits` so the agent can + actually write candidate files in headless `-p` mode (recent Claude Code blocks + edits without it); drop `--verbose` (noise in print mode). ### Changed +- **Claude Code adapter** now pins `--model claude-opus-4-7` (Opus 4.7, + highest-capability) for the Proposer. - Default proposer model `claude-sonnet-4-20250514` → `claude-sonnet-4-6` - (affects `api`/`openai` backends; CLI backends use their own model). + (affects `api`/`openai` backends; other CLI backends use their own model). - Verified `claude-code` (`claude -p`) and `hermes` (`hermes chat -q`) are still current; `claw-code` mirrors Claude Code (unverified, low usage). - Docs (README / README_CN / technical-architecture) updated to the current CLI diff --git a/src/polyharness/proposer/adapters/claude_code.py b/src/polyharness/proposer/adapters/claude_code.py index 5841b31..927daec 100644 --- a/src/polyharness/proposer/adapters/claude_code.py +++ b/src/polyharness/proposer/adapters/claude_code.py @@ -2,12 +2,23 @@ Invokes the official `claude` CLI in print mode (-p). Requires an active Claude Code subscription. + +Verified against Claude Code (May 2026): +- `-p` headless mode and `--output-format text` are current. +- `--permission-mode acceptEdits` is REQUIRED for the agent to write files + non-interactively (auto-approves Read/Edit/Write); without it, headless edits + are blocked. `acceptEdits` still gates arbitrary Bash/network (least-privilege, + appropriate for the isolated workspace). +- `--model claude-opus-4-7` pins to Opus 4.7 (full name for reproducibility). """ from __future__ import annotations from polyharness.proposer.adapters.base import CLIAdapter +# Pinned Proposer model for the Claude Code backend (highest-capability). +CLAUDE_CODE_MODEL = "claude-opus-4-7" + class ClaudeCodeAdapter(CLIAdapter): """Adapter for the Claude Code CLI (`claude`).""" @@ -24,8 +35,9 @@ def build_command(self, prompt: str, *, cli_path: str | None = None) -> list[str binary = cli_path or self.default_binary return [ binary, - "-p", # print mode (non-interactive, stdout output) + "-p", # print mode (non-interactive) prompt, + "--model", CLAUDE_CODE_MODEL, # pin to Opus 4.7 + "--permission-mode", "acceptEdits", # auto-approve file edits (headless) "--output-format", "text", - "--verbose", ] diff --git a/tests/test_cli_adapters.py b/tests/test_cli_adapters.py index 6d00c65..974fc93 100644 --- a/tests/test_cli_adapters.py +++ b/tests/test_cli_adapters.py @@ -50,6 +50,14 @@ def test_claude_code_command(): assert cmd[0] == "claude" assert "-p" in cmd assert "do stuff" in cmd + # Pinned to Opus 4.7 + assert "--model" in cmd + assert "claude-opus-4-7" in cmd + # Headless edits must be auto-approved or the agent can't write candidates + assert "--permission-mode" in cmd + assert "acceptEdits" in cmd + # --verbose is noise in print mode; should be gone + assert "--verbose" not in cmd def test_claude_code_custom_path():