From d19a9e8b80e45f2492eee55324f83b5c84e80d96 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 11:54:01 +0800 Subject: [PATCH 1/8] =?UTF-8?q?feat(goal):=20=E2=9C=A8=20replace=20self-at?= =?UTF-8?q?testation=20goal=5Fscored=20with=20independent=20Judge=20accept?= =?UTF-8?q?ance=20agent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the `goal_scored` tool that allowed the main agent to self-attest goal completion, replacing it with an `agent_judge` built-in subagent that independently verifies goal attainment against the project's current state. Key changes: - Add `SubagentProfile::Judge` with read-only file tools and diagnostic-only shell (soft constraint via prompt) - Add `JudgeReport` structured contract (passed, completeness_pct, findings, summary) with safe fallback parsing - Add `agent_judge` tool injection only for the main agent when an unverified goal exists; runtime gate blocks subagent/parallel recursion into Judge - Add DB migration for `judge_passed`, `judge_completeness`, `judge_findings`, `judge_summary`, `judge_evaluated_run_id` columns with backfill for legacy `status='complete'` goals - Replace continuation stop condition: `Complete && judge_passed` instead of `goal_scored`-driven status flip - Rewrite continuation prompt to instruct main agent to call `agent_judge` and follow findings on rejection - Add Judge prompt surface, templates, and output contract - Update `active_goal.tpl.md` to reflect Judge acceptance flow - Extend goal lifecycle tests for Judge pass/fail/legacy compat --- docs/goal-judge-evaluation-refactor.md | 346 +++++++++++++ .../20260607000000_goal_judge_fields.sql | 17 + src-tauri/src/core/agent_session.rs | 37 +- src-tauri/src/core/agent_session_execution.rs | 453 +++++++++--------- src-tauri/src/core/agent_session_tools.rs | 28 +- src-tauri/src/core/goal_manager.rs | 124 +++-- .../prompt/sources/custom_subagent_body.rs | 19 + .../sources/subagent_output_contract.rs | 
4 + src-tauri/src/core/prompt/surface.rs | 4 + .../src/core/prompt/surface_extensions.rs | 5 + .../core/prompt/templates/active_goal.tpl.md | 20 +- .../core/prompt/templates/subagent/judge.md | 24 + .../subagent/output_contract.judge.md | 21 + src-tauri/src/core/subagent/judge_contract.rs | 287 +++++++++++ src-tauri/src/core/subagent/mod.rs | 2 + src-tauri/src/core/subagent/orchestrator.rs | 12 + .../core/subagent/runtime_orchestration.rs | 147 ++++++ src-tauri/src/gateway/gateway_runner.rs | 2 +- src-tauri/src/ipc/frontend_channels.rs | 7 +- src-tauri/src/model/goal.rs | 43 +- src-tauri/src/model/subagent.rs | 2 +- src-tauri/src/persistence/repo/goal_repo.rs | 80 +++- src-tauri/tests/goal_lifecycle.rs | 145 +++++- src/i18n/locales/en.ts | 1 + src/i18n/locales/zh-CN.ts | 1 + .../workbench-shell/model/thread-store.ts | 5 + .../workbench-shell/ui/goal-status-bar.tsx | 2 +- .../ui/runtime-thread-surface.tsx | 6 +- src/services/bridge/agent-commands.ts | 7 +- 29 files changed, 1517 insertions(+), 334 deletions(-) create mode 100644 docs/goal-judge-evaluation-refactor.md create mode 100644 src-tauri/migrations/20260607000000_goal_judge_fields.sql create mode 100644 src-tauri/src/core/prompt/templates/subagent/judge.md create mode 100644 src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md create mode 100644 src-tauri/src/core/subagent/judge_contract.rs diff --git a/docs/goal-judge-evaluation-refactor.md b/docs/goal-judge-evaluation-refactor.md new file mode 100644 index 00000000..ebea6c5f --- /dev/null +++ b/docs/goal-judge-evaluation-refactor.md @@ -0,0 +1,346 @@ +# Goal 评估与续行重构方案:引入 Judge 验收 Agent + +> 状态:设计方案(待评审) +> 关联模块:`src-tauri/src/core/goal_manager.rs`、`src-tauri/src/core/subagent/`、`src-tauri/src/core/agent_run_event_handler.rs`、`src-tauri/src/model/goal.rs` +> 决策基线(已澄清): +> 1. **保留全部现有护栏**(idle 空转、clarify/update_plan 暂停、token/turn 预算上限),仅把“是否完成”的判定从自主声明改为 Judge 验收。 +> 2. 
**复用 `GoalStatus::Complete` 状态** 表达“通过验收”,并在 `goals` 表新增 Judge 评估字段持久化最近一次裁决;迁移需把存量 `status='complete'` goal 回填为 `judge_passed=1`。 +> 3. **由主 agent 主动调用 `agent_judge`**,系统在 run 终止后通过续行 prompt 引导主 agent 先验收、未通过则修复后重验。 +> 4. **`agent_judge` 是主 agent 专属工具**:只在有未完成 goal 时注入主 agent,且运行时必须硬性拒绝任何 subagent 递归调用 Judge,即使工具名被 `RuntimeOrchestrationTool::parse()` 解析出来也不能放行。 +> 5. **Judge 使用诊断型 shell 软约束**:Judge 的文件工具保持只读;允许 `shell` 仅用于测试、type-check、lint、只读检查等诊断验证,并通过 Judge prompt 明确禁止用 shell 修改文件、删除数据、安装依赖或改变全局状态。首版不新增受限 shell 沙箱。 +> 6. **Judge 默认使用 primary 模型角色**,优先保证验收质量;首版不把 Judge/subagent 的 token 单独计入 goal token budget,也不新增 Judge 专属硬超时,沿用现有 helper run 的 turn/取消机制。 +> 7. **删除失效的自主完成路径**:移除 `goal_scored`、`GoalVerdict::Complete` 的旧自证语义,以及由 `goal_scored` 空 evidence 触发的 `NoEvidence` / `MISSING_EVIDENCE_PROMPT` 分支。 + +--- + +## 1. 背景与问题 + +当前 goal 的"完成"判定依赖主 agent 自主调用 `goal_scored(status, evidence, pledge)` 工具来声明达成。这是一种**自证式(self-attestation)**设计: + +- 工具内部只校验 `status == "complete"`、`pledge` 文本逐字匹配、`evidence` 非空(见 `agent_session_execution.rs` 的 `execute_goal_tool()`)。 +- 它**无法验证 evidence 的真伪**,也无法核对结果是否真的满足 goal 的一致性与完整性。 + +实测发现部分模型即便明知仍有未完成项,也会照抄 pledge 文本、编造 evidence 来调用 `goal_scored` 并提前结束任务。pledge + evidence 非空这类形式化护栏对"不诚实声明"无效,这是自主声明方式的**设计缺陷**。 + +**核心思路**:把"完成判定权"从被评估者(主 agent)手中移交给独立的评估者(Judge Agent)。主 agent 不能再自己宣布通过;只有 Judge 基于 goal 内容对项目当前状态做出"通过"裁决,goal 记录才会扭转为通过验收状态。续行监督也随之改为以"是否通过验收"为准。 + +--- + +## 2. 
现状梳理(已确认事实) + +### 2.1 Goal 数据模型与持久化 + +- `GoalStatus`(`src-tauri/src/model/goal.rs`):`Active` / `Paused` / `BudgetLimited` / `Complete` 四态。 +- `goals` 表(`migrations/20260530000000_goals.sql` 及后续迁移):每 `thread_id` 唯一一条 goal;含 `status`、`evidence`、`tokens_used`、`turns_used`、`max_turns`、`pause_reason`、`last_evaluated_run_id` 等列。 +- `GoalManager`(`src-tauri/src/core/goal_manager.rs`)封装 CRUD + 评估 + prompt 生成。关键方法:`mark_complete(goal_id, evidence)`、`evaluate_after_turn(response, goal) -> GoalVerdict`(同步 CPU 启发式)、`evaluate_after_run(run_id, response) -> GoalEvaluationOutcome`(异步、含去重 CAS)。 + +### 2.2 `goal_scored` 工具链路 + +- 工具定义在 `agent_session_tools.rs` 的 `runtime_tools_for_profile()`,常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE` 在 `goal_manager.rs`。 +- 调用分派在 `agent_session_execution.rs::execute_tool_call()` → `execute_goal_tool()`:校验 status/pledge/evidence → `mark_complete()` → 发送 `GoalCompleted` + `GoalStateUpdated` 事件。 + +### 2.3 续行监督逻辑 + +- run 终止后,`agent_run_event_handler.rs::maybe_continue_goal_after_terminal_run()` 是入口。 +- 前置条件:`goal_continuation_enabled == true`、`final_status ∈ {Completed, Interrupted}`。 +- 调用 `evaluate_after_run()` 内部走 `evaluate_after_turn()` 分层启发式: + - **Layer 1** 工具阻塞:`clarify` → `Paused(ClarifyPending)`;`update_plan` → `Paused(PlanPending)`;`goal_scored` 放行。 + - **Layer 2** idle/完成声明:连续 idle ≥ `MAX_IDLE_TURNS(3)` → `Paused(IdleBlocked)`;检测到完成关键词但未调工具 → `ChallengeEvidence`(反复声称达上限 → `IdleBlocked`)。 + - **Layer 3** 预算:tokens 超 budget → `BudgetLimited`;turns 超 `max_turns` → `Paused(BudgetExhausted)`。 + - 默认 → `Continue`。 +- verdict 为 `Continue` / `ChallengeEvidence` 时,用 continuation prompt 启动新 run;`Paused` / `BudgetLimited` / `skipped` 时不续行。 +- **关键现状**:续行从不查询 goal 的 `Complete` 状态。它实际依靠"模型没有再触发任何阻塞/完成信号 + goal 仍 `Active`"间接推断。一旦 `goal_scored` 被调用,`mark_complete()` 把 status 写成 `Complete`,下一轮 `evaluate_after_run()` 因 goal 非 `Active` 返回 `skipped`,从而停止续行。 + +### 2.4 Subagent 机制 + +- 内建 subagent:`Explore`、`Review`、`Parallel`,定义在 
`subagent/runtime_orchestration.rs` 的 `RuntimeOrchestrationTool` / `SubagentProfile`。 +- 深度模型:主 agent = depth 1;主 agent 直接子代理 = depth 2(`MAIN_AGENT_CHILD_DEPTH`);`GLOBAL_MAX_DELEGATION_DEPTH = 5`;内建默认 `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH = 3`。 +- 委派校验:`orchestrator.rs::validate_delegation_capability(caller, target_tool, target_profile, child_depth)`,三重检查(调用方 `can_delegate`、全局上限、目标 `max_delegation_depth`)。 +- 权限模型:`Explore` 只读(read/list/find/search/web_search,`can_delegate=false`);`Review` 只读 + 诊断 shell + git/term 只读(`can_delegate=true`);`Custom` 按 `allowed_tools` 白名单。 +- 工具注入:主 agent 在 `agent_session_tools.rs::runtime_tools_for_profile()` 中 `tools.extend(runtime_orchestration_tools())`;自定义在 `agent_session.rs::build_session_spec()` 注入。 +- Prompt 注入:`build_helper_system_prompt()` 按 `PromptSurface`(`prompt/surface.rs`)选择 section;task 通过 `agent.prompt(request.task)` 注入为 user message。 + +--- + +## 3. 设计目标 + +1. 新增内建 **Judge** subagent,对项目当前状态做 goal 达成度评估,结构化返回:通过与否(bool)、完整度百分比、判定依据(未达成/不符合点描述)。 +2. Judge 通过时**扭转 goal 记录为通过验收状态**(复用 `Complete` + 持久化 Judge 字段)。 +3. Judge 上下文注入 goal 内容,评估重点是 goal 要求的**一致性**与**完整性**。 +4. Judge 文件工具保持**只读**,允许 `read` / `list` / `find` / `search` / `web_search`;允许 `shell` 但仅作为诊断型软约束工具用于测试、type-check、lint、只读检查;允许再发起 subagent(含并行,如 explore/review 协助),**自身最大被委派深度为 2**。 +5. **删除 `goal_scored` 工具**。完成判定不再由主 agent 自证。 +6. 续行监督改为:判定 goal 记录是否“通过验收”;未通过且 goal 仍 Active 则续行,并在 continuation prompt 中明确要求主 agent 调用 `agent_judge` 验收并遵循验收结果。 +7. **按需注入**:仅当 thread 有未通过验收的 goal 时,才向**主 agent**注入 `agent_judge` 工具;所有 subagent 均不注入且运行时拒绝递归调用 `agent_judge`;无 goal 或已验收通过时不注入。 + +--- + +## 4. 
总体设计 + +### 4.1 角色与职责重划 + +| 角色 | 重构前 | 重构后 | +|------|--------|--------| +| 主 agent | 自己调 `goal_scored` 声明完成 | 干活 + 自认为完成后调 `agent_judge` 申请验收;不能自证完成 | +| Judge agent | 不存在 | 独立验收者,文件工具只读且 shell 仅诊断软约束,基于 goal 评估项目当前状态,产出结构化裁决;通过则扭转 goal 状态 | +| 续行监督 | 间接依赖 goal 非 Active 停续行 | 显式以"goal 是否通过验收(Complete + judge_passed)"为停续行依据 | + +### 4.2 端到端数据流 + +``` +用户 /goal + └─ goal_set() → create_goal(status=Active) + └─ 注入 ActiveGoalSource 到主 agent system prompt(更新文案:完成须经 agent_judge 验收) + └─ 按需向主 agent 注入 agent_judge 工具(goal 存在且尚未通过验收) + +主 agent run:工作 → 自认为达成 → 调用 agent_judge(task) + └─ execute_tool_call() 路由到 Judge 编排 + └─ HelperAgentOrchestrator::run_helper(SubagentProfile::Judge) + ├─ build_helper_system_prompt(PromptSurface::SubagentJudge) + 注入 goal objective 到上下文 + ├─ Judge 工具集:read/list/find/search/web_search/shell(仅诊断软约束) + (depth 允许时)agent_explore/agent_review/agent_parallel + ├─ Judge 调研验证:读代码、搜索、运行测试/type-check/lint 等诊断命令、并行 explore/review + └─ 产出结构化 JudgeReport { passed, completeness_pct, findings, summary } + └─ Judge 编排回写 goal 记录: + ├─ 总是:persist 最近一次 judge_passed / judge_completeness / judge_findings / judge_summary / judge_evaluated_run_id + └─ passed == true:事务写入 status=Complete + judge_passed=true + evidence=summary + 发送 GoalCompleted + GoalStateUpdated 事件 + └─ agent_judge 工具结果(JudgeReport 文本)返回给主 agent + +run 终止 + └─ maybe_continue_goal_after_terminal_run() + └─ evaluate_after_run() + ├─ 若 goal.status == Complete && goal.judge_passed == true(已通过验收)→ skipped(停续行)✅ + ├─ 若 goal.status != Active → skipped(非活跃 goal 不自动续行,保留现有暂停/预算语义) + ├─ 否则保留现有护栏:clarify/update_plan/idle/预算 → Paused/BudgetLimited + └─ 否则 → Continue:注入新版 continuation prompt + "你尚未通过验收。请先用 agent_judge 验收;若上次验收未通过, + 按 findings 修复后再次调用 agent_judge。" + └─ Continue → 启动新 run(回到主 agent run) +``` + +### 4.3 为什么选择这套方案(与备选对比) + +- **复用 `Complete` 而非新增 `Verified` 枚举**:`Complete` 在 DDL CHECK 约束、`GoalStatus` 枚举、前端状态条、gateway 文案中均已铺开。新增枚举值需要同步迁移、前端、序列化多处,收益有限。改为复用 `Complete` 并以 `judge_passed` 布尔列区分"是否经 
Judge 验收",改动面最小且语义清晰(通过验收 = `Complete` 且 `judge_passed=true`)。 +- **保留全部护栏**:Judge 解决的是"完成判定的可信度",而 idle 空转、clarify/update_plan 暂停、预算上限解决的是"防止无限续行/资源失控/阻塞等待"。两者正交,移除护栏会让无 goal 评估能力时的兜底消失,引入失控风险。 +- **主 agent 主动调用 + 续行引导**(而非系统自动发起 Judge):保持与现有 subagent 调用模型一致(主 agent 通过工具调用委派),实现侵入小;系统侧只需在续行 prompt 中“催”主 agent 去验收,无需在 run 终止后再隐式拉起一个评估 run 改变运行时调度。续行 prompt 会持续施压,直到 goal 被 Judge 标记通过,规避了“主 agent 不调 Judge 就永远不验收”的死角。 +- **Judge 作为主 agent 专属内建工具**:虽然 `agent_judge` 会加入 `RuntimeOrchestrationTool::parse()`,但它不进入 `builtin_all()` 和 `delegation_tools_for_helper()`,也不允许 subagent 递归调用。这样保留统一工具解析与 helper 编排复用,同时避免 explore/review/custom/Judge 自己绕过“主 agent 申请验收”的职责边界。 +- **诊断型 shell 软约束而非新沙箱**:Judge 需要能运行测试、type-check、lint 等验证命令,因此首版复用现有 `shell` 工具;但该工具能力本身不是硬只读,必须在 Judge prompt 中明确限制为诊断用途,禁止修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。新建受限 shell/test-runner 工具会扩大改动面,首版暂不引入。 +- **Judge 使用 primary 模型角色**:验收质量优先于成本,Judge 默认走 `model_plan.primary`。Explore/Review 继续保持现有模型策略,Judge 内部再委派时由各子代理自己的模型映射决定。 + +### 4.4 首版范围边界 + +首版目标是打通后端 Judge 验收闭环:工具注入、subagent 运行、结构化解析、goal 回写、续行停止、迁移兼容和测试覆盖。前端仅同步类型并在现有状态条显示“已验收通过”这一最小信息;`judge_completeness` 的精细 UI、额外事件、ACP/gateway 的详细状态展示、Judge token 单独计入 goal budget、Judge 专属超时或受限 shell 沙箱均作为后续增强,不进入首版。 + +--- + +## 5. 
详细实现 + +### 5.1 Judge subagent profile(`subagent/runtime_orchestration.rs`) + +- `RuntimeOrchestrationTool` 新增变体 `Judge`,工具名映射 `agent_judge`;`parse("agent_judge") -> Some(Judge)`。同时补齐 `tool_name()`、`title()`、`description()`、`profile()`、`as_agent_tool()` 的 match 分支,`as_agent_tool()` 的 schema 只需要 `task: string`。 +- `SubagentProfile` 新增 `Judge` 变体,并补齐 `helper_kind()`(固定返回 `helper_judge`)、`system_prompt()`、`can_delegate()`、`max_delegation_depth()`、`helper_tools()` 等 match 分支。 +- `resolve_helper_profile()` 增加 `RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge)`;`resolve_helper_model_role()` 增加 Judge 分支,默认使用 `model_plan.primary`,不要复用 Explore/Review 的 auxiliary 映射。 +- `helper_tools()` for `Judge`:`read` / `list` / `find` / `search` / `web_search`(条件启用)/ `shell`(仅诊断验证)。**不含** `edit` / `write` / `term_write` / `term_restart` / `term_close`。需要在工具描述和 Judge prompt 中明确:`shell` 只能运行测试、type-check、lint、只读检查等诊断命令,不能修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。这是 prompt 软约束,不是硬沙箱。 +- `can_delegate()` for `Judge`:`true`(允许 explore/review/parallel 协助)。 +- `max_delegation_depth()` for `Judge`:`2`(即 Judge **自身最大被委派深度为 2**——主 agent depth 1 直接委派 Judge 得到 depth 2,符合 `MAIN_AGENT_CHILD_DEPTH=2`;同时这意味着 Judge 内部委派的子级会是 depth 3,需在 `delegation_tools_for_helper()` 中据此过滤)。 + > 注意:需求所述“自身最大被委派深度为2”指 Judge 作为被委派目标时允许出现在 depth ≤ 2。为了让 Judge 仍能发起 explore/review/parallel(depth 3 子级),`delegation_tools_for_helper(child_depth)` 对内建目标的过滤阈值需复核:Judge 在 depth 2 调用子级时 `child_depth=3`,仍 ≤ `GLOBAL_MAX_DELEGATION_DEPTH(5)` 且 ≤ explore/review 的 `max_delegation_depth(3)`,故可注入。实现时确保 `validate_delegation_capability` 对 Judge→explore/review 放行。 +- `delegation_tools_for_helper()` 仍只注入 Explore / Review / Custom / Parallel,**不得注入 Judge**。这使 Judge 可以委派其他 helper,但任何 helper 不能委派 Judge。 +- `RESERVED_SUBAGENT_SLUGS` 增加 `"judge"`,防止自定义 subagent 占用该 slug。由于 `RuntimeOrchestrationTool::parse()` 对 `agent_{slug}` 有通配解析,保留 slug 能避免 `agent_judge` 与自定义工具名冲突。 +- `runtime_orchestration_tools()` **不无条件包含 Judge**:Judge 改为按需注入(见 
5.6),`builtin_all()` 保持仅含 explore/review/parallel,Judge 单独由主 agent 工具组装处按 goal 条件 push。 + +### 5.2 Judge 结构化协议(新增 `subagent/judge_contract.rs`) + +参照 `review_contract.rs` / `parallel_contract.rs` 模式新增: + +```rust +/// agent_judge 工具的入参(主 agent 传入)。 +pub struct JudgeRequest { + pub task: String, // 主 agent 对"为何认为达成"的说明 / 关注点 +} + +/// Judge 评估结构化产出。 +#[derive(Serialize, Deserialize)] +pub struct JudgeReport { + pub passed: bool, // 是否通过验收 + pub completeness_pct: u8, // 0-100 完整度百分比 + pub findings: Vec<String>, // 未达成 / 不符合 goal 的具体点(passed=false 时必填) + pub summary: String, // 判定依据总述,作为通过时的 evidence +} +``` + +- Judge 的 system prompt(模板 `prompt/templates/subagent/judge.md`)强制要求最终以可解析的结构化形式(JSON 块或约定字段)返回上述四项。 +- `passed=true` 时 `summary` 必须非空,作为 `mark_complete()` 的 evidence;如果 Judge 输出 `passed=true` 但 `summary` 为空,解析层必须降级为 `passed=false`,避免无证据完成。 +- `completeness_pct` 解析后必须 clamp 到 0-100;`passed=false` 时 `findings` 必须非空,若模型未给出 findings,则把原始输出或“Judge did not provide actionable findings”写入 findings。 +- Judge 编排在拿到 Judge 文本输出后解析为 `JudgeReport`;解析失败按 `passed=false` 处理并把原始文本塞入 `findings`,避免误判通过。 + +### 5.3 Judge prompt surface 与上下文注入 + +- `prompt/surface.rs::PromptSurface` 新增 `SubagentJudge { inherited_run_mode }`。 +- `SurfacePattern::matches()` 同步更新:`AnySubagent` 必须匹配 `SubagentJudge`;`BuiltinSubagent` 也必须匹配 `SubagentJudge`,因为 Judge 是内建 subagent。若某些 prompt section 只应给 Explore/Review 而不应给 Judge,应改用更精确的 matcher 或新增 pattern,避免误注入。 +- `build_helper_system_prompt()` 增加 `SubagentProfile::Judge` → `PromptSurface::SubagentJudge { inherited_run_mode }` 映射。 +- `prompt/sources/custom_subagent_body.rs` 增加 Judge 模板映射:Judge → `templates/subagent/judge.md`。 +- `prompt/templates/subagent/judge.md`:定义 Judge 角色——独立验收员,只读评估,重点核对 goal 的一致性与完整性;说明可用工具(含诊断型 `shell`、可委派 explore/review/parallel);要求输出结构化 `JudgeReport`;明确禁止修改文件。`shell` 约束必须写成硬性行为指令:只能运行测试、type-check、lint、只读检查;不得通过 shell 编辑/删除文件、安装依赖、改变全局状态、启动交互式或长期驻留进程。 +- `prompt/sources/subagent_output_contract.rs` 增加 Judge 的输出契约 
`output_contract.judge.md`,并在 contract 中重复 `passed` / `completeness_pct` / `findings` / `summary` 的字段要求和失败兜底规则。 +- **goal 内容注入采用 task 前缀方案**:Judge 上下文必须包含 goal objective,且由 `agent_session_execution.rs` 的 Judge 分支在构造 helper task 时注入,不新增 DB 读取型 prompt source。 +- task 前缀必须包含:objective、当前 goal id/status、最近一次 Judge findings/summary(若有)、主 agent 传入的 `task` 说明。这样 Judge 不依赖主 agent 自述即可核对目标。 + +### 5.4 Judge 编排与 goal 回写(`agent_session_execution.rs` + `goal_manager.rs`) + +- `execute_tool_call()`:`RuntimeOrchestrationTool::parse()` 命中 `Judge` 时进入 Judge 专用分支,不直接走普通 `execute_helper_tool()` 返回路径。该分支可复用 `resolve_helper_delegate()` / `HelperAgentOrchestrator::run_helper()`,但必须在 helper 完成后追加 JudgeReport 解析和 goal 回写。 +- Judge 分支额外步骤: + 1. 调用前从 DB 加载当前 thread 的未完成 goal;无 goal 或 goal 已 `Complete && judge_passed=true` 则返回错误(agent_judge 仅在有 goal 时可用,理论上不会被注入)。 + 2. 把 `goal.objective`、goal id/status、最近一次 judge findings/summary、主 agent 传入的 `task` 拼成 Judge task 上下文。 + 3. 以 `SubagentProfile::Judge`、`RuntimeOrchestrationTool::Judge`、depth 2 启动 helper run;模型角色使用 `model_plan.primary`。 + 4. Judge run 结束后解析 `JudgeReport`;解析失败或字段非法按 `passed=false` 处理。 + 5. 调用新增 `GoalManager::record_judge_verdict(goal_id, run_id, &report)` 持久化最近裁决;若 `report.passed`,该方法在同一事务内写入 `status=complete`、`evidence=report.summary` 与 `judge_passed=true`。 + 6. 若通过验收,发送 `GoalCompleted` + `GoalStateUpdated` 事件;若未通过,也发送 `GoalStateUpdated`,让前端/后续续行能拿到最新 findings。 + 7. 
把 `JudgeReport` 文本作为工具结果返回主 agent;通过时结果中明确提示“goal 已通过验收,请停止修改并总结”,降低同一 run 后续继续改动的风险。 +- `GoalManager` 新增方法: + - `record_judge_verdict(&self, goal_id: &str, run_id: &str, report: &JudgeReport) -> Result`:写 `judge_passed` / `judge_completeness` / `judge_findings`(JSON) / `judge_summary` / `judge_evaluated_run_id`,并返回更新后的 record 供事件 payload 使用;passed 时同一事务同步写 `status=complete` 与 `evidence=report.summary`。 +- 原子性要求:`goal_repo.rs` 增加 `record_judge_verdict()` repo 方法,在事务内更新 judge_* 字段;passed 时同事务写 `status='complete'` 与 `evidence=summary`,确保 `status=complete` 与 `judge_passed=1` 不出现半更新;未通过时保持原 status(通常 Active)不变。 +- 预算边界:首版 Judge helper run 的 token 不单独计入 goal `tokens_used`。这是明确取舍;后续若要计入,需要扩展 `HelperRunResult` 携带 usage 并在 Judge 分支回写。 +- 同轮继续修改边界:系统不强行锁定 goal 后的写工具,因为主 agent 仍处于同一 run;通过验收后的工具结果和 `active_goal.tpl.md` prompt 必须要求停止修改。若未来需要硬约束,可在 `execute_tool_call()` 中对 `Complete && judge_passed` 后的 mutating tools 增加拒绝策略,首版不做。 + +### 5.5 删除 `goal_scored` 工具 + +- 删除工具定义(`agent_session_tools.rs` 中的 `goal_scored` `AgentTool::new(...)`)。 +- 删除分派分支与 `execute_goal_tool()`(`agent_session_execution.rs`)。 +- 移除常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE`(`goal_manager.rs`),以及 `evaluate_after_turn()` 中 `detect_tool_based_blocking` 对 `goal_scored` 的放行分支。 +- 删除旧自证语义:`GoalVerdict::Complete { evidence }` 当前没有有效生产者,删除 `goal_scored` 后一并移除,并删除 `evaluate_after_run()` 中的旧 match 分支,减少死代码。 +- 删除 `ChallengePromptVariant::NoEvidence` 与 `MISSING_EVIDENCE_PROMPT`,因为它们只服务于“调用 `goal_scored` 但 evidence 为空”的旧路径;保留 completion-claim 检测对应的 `ChallengeEvidence` / `NoTool` 语义,并把文案改为“声称完成但尚未调用 `agent_judge` 验收”。 +- 护栏保留但需改写文案:`ChallengeEvidence` 与 completion-claim 检测仍作为“提醒主 agent 去验收”的软提示,引导语从“调用 goal_scored”改为“调用 agent_judge 验收”。`GUIDANCE_PROMPT` 同步更新。 +- `agent_judge` 会被 `record_tool_call()` 记录到 goal runtime tool calls;`detect_tool_based_blocking()` 不应把它视为阻塞工具,也不应触发 pause。它与普通工具调用一样表示 agent 有行动,能重置 idle 倾向。 +- 全局检索并清理 `goal_scored` 引用:系统 prompt、`active_goal.tpl.md`、gateway 文案、前端 hardcoded kickoff 
prompt、测试(`tests/goal_lifecycle.rs`)等。 + +### 5.6 按需注入 `agent_judge`(仅主 agent,仅有未完成 goal 时) + +- 注入点在主 agent 工具组装处。`runtime_tools_for_profile()` 当前是纯 profile 函数,不知道 thread goal 状态;推荐在其调用方 `build_session_spec()`(`agent_session.rs`)查询并追加 Judge 工具,避免把 DB 依赖塞进纯工具构造函数。 + - 在 `build_session_spec()` 已能访问 `pool` 与 `thread_id`,查询 `goal_repo::find_by_thread_id`,若存在且尚未通过验收,则 push `RuntimeOrchestrationTool::Judge.as_agent_tool()`。 + - “尚未通过验收”的判定为:goal 存在且不是 `status == Complete && judge_passed == true`。实际自动续行仍只对 `Active` 生效;但工具注入可允许用户在恢复/继续场景中对 `Paused` 或 `BudgetLimited` goal 重新申请验收。 + - goal 不存在或已 `Complete && judge_passed`(已验收)则不注入。 +- `runtime_tools_with_custom_subagents()` 与 extension tool 合并时需维持内建工具名优先级,防止 extension/custom 工具覆盖 `agent_judge`。 +- **subagent 不注入**:Judge 工具只在主 agent 工具集 push,不进入 `delegation_tools_for_helper()` 的候选;任何 subagent(含 Judge 自身、explore/review/custom)的可委派目标列表都不包含 `agent_judge`。 +- **运行时硬门禁**:仅“不注入”不足够,因为模型或测试仍可能构造 `agent_judge` 调用,且 `RuntimeOrchestrationTool::parse()` 会命中。必须在 subagent 递归委派路径(例如 `HelperDelegationContext::handle_delegation()` / `resolve_delegation()`)中显式拒绝 `RuntimeOrchestrationTool::Judge`,返回“agent_judge can only be called by the main agent for the current goal”之类错误。 +- `agent_parallel` 的任务列表也必须拒绝 `agent_judge`。`validate_parallel_delegate_safety()` 或解析 parallel task 的位置应把 Judge 视为非法 batch target,避免通过 parallel 间接调用 Judge。 +- 主 agent 侧 `execute_tool_call()` 的 Judge 分支也要重新查询 goal 状态,不能只依赖工具注入时的状态;这是防止 race / stale tool set 的后端 backstop。 + +### 5.7 续行监督改造(`agent_run_event_handler.rs` + `goal_manager.rs`) + +- `evaluate_after_run()` / `evaluate_after_turn()` 开头新增**显式终止判定**:若 goal 已“通过验收”(`status == Complete && judge_passed == true`)→ 返回 `skipped`(停续行)。这是停续行的**主依据**。 +- 存量兼容依赖迁移回填:迁移后不应出现旧路径产生的 `status=Complete && judge_passed=false`。如果运行时遇到该组合,按异常兼容处理并停续行或记录 warning;不要把旧 complete goal 重新拉起续行。 +- 对 `Paused` / `BudgetLimited` 仍按现有语义返回 skipped,不自动续行。只有 `Active` goal 会继续进入护栏评估。 +- 其余护栏(clarify/update_plan/idle/预算)保留,作用不变。 +- `Continue` / 
`ChallengeEvidence` verdict 的 continuation prompt 改写为新模板(替换 `CONTINUATION_PROMPT_TEMPLATE`): + +``` +[Goal continuation — turns {turns_used}/{max_turns}] + +**Objective:** {objective} + +继续推进该目标,执行下一个具体步骤。 + +⚠️ 完成判定已改为独立验收:当你认为目标已达成时,必须调用 + agent_judge(task="说明为何认为已达成 / 需重点核对的点") +由 Judge 评估项目是否满足目标的一致性与完整性。 +- 仅当 Judge 裁决 passed=true 时,目标才会被标记为通过验收并停止续行。 +- 若上一次 Judge 验收未通过,请阅读其 findings,逐项修复后再次调用 agent_judge。 +你无法自行声明完成;只有通过 Judge 验收才算达成。 + +如果你被阻塞、需要用户输入,请使用 clarify 工具。 +``` + +- 若最近一次 Judge 未通过,必须把 `judge_findings` 摘要拼接进 continuation prompt,提升修复指向性;摘要可限制长度,避免 prompt 过长。 + +### 5.8 数据库迁移 + +新增迁移 `migrations/2026XXXXXXXXXX_goal_judge_fields.sql`: + +```sql +ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool +ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable +ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable +ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable +ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable + +-- 兼容旧版本 goal_scored 已完成的 goal,避免升级后被误判为未验收。 +UPDATE goals +SET judge_passed = 1, + judge_summary = COALESCE(judge_summary, evidence), + judge_completeness = COALESCE(judge_completeness, 100) +WHERE status = 'complete'; +``` + +- `GoalRecord` / `GoalDto` / `GoalPayload`(`model/goal.rs`)同步新增字段:`judge_passed: bool`、`judge_completeness: Option`(DB 读写时校验 0-100)、`judge_findings: Option`(JSON 文本,DTO 透传字符串,前端按 string/null 接收)、`judge_summary: Option`、`judge_evaluated_run_id: Option`。 +- `goal_repo.rs` 同步更新 `SELECT_COLUMNS`、`GoalRow`、`into_record()`、`insert()`。新增 `record_judge_verdict()` repo 方法,负责写 judge_* 字段;passed 时同一事务同步写 `status='complete'` 与 `evidence=summary`。 +- 若 `judge_findings` 以 JSON array 字符串存储,写入前由 `serde_json::to_string(&report.findings)` 生成;读取失败时不要 panic,DTO 可原样返回或置为 `None` 并记录 warning。 + +### 5.9 前端、IPC、gateway 与 ACP + +- `ThreadStreamEvent` 首版复用现有 `GoalCompleted` / `GoalStateUpdated`,不新增 Judge 专属事件。`GoalPayload` 增加 judge 字段后,现有事件 payload 
即可携带最新裁决。 +- 前端 `GoalPayload` 类型(如 `src/services/bridge/agent-commands.ts`)与 store 类型(如 `src/modules/workbench-shell/model/thread-store.ts`)补充 judge 字段;状态条在 `Complete && judgePassed` 时显示“已验收通过”。`judge_completeness` 的进度/百分比 UI 为二阶段增强。 +- `goal-status-bar.tsx` 只做最小展示;若未实现详细展示,也必须保证新增字段不会破坏类型检查。 +- gateway / ACP 首版只要求文案与行为不再引用 `goal_scored`,并确保这些入口启动主 agent 时使用同一 `build_session_spec()` 注入逻辑,因此有未完成 goal 时也能拿到 `agent_judge`。详细展示 Judge findings/completeness 可后续增强。 + +--- + +## 6. 影响文件清单 + +| 文件 | 改动 | +|------|------| +| `src-tauri/src/model/goal.rs` | `GoalRecord`/`GoalDto`/`GoalPayload` 新增 judge_* 字段;删除 `GoalVerdict::Complete` 旧自证变体 | +| `src-tauri/src/core/goal_manager.rs` | 删除 `GOAL_SCORED_*` 常量与放行分支;删除 `MISSING_EVIDENCE_PROMPT` / `NoEvidence` 旧路径;新增 `record_judge_verdict()`;续行终止判定改为 `Complete && judge_passed`;改写 continuation/guidance 文案并拼接最近 findings | +| `src-tauri/src/core/subagent/runtime_orchestration.rs` | `RuntimeOrchestrationTool::Judge` + `SubagentProfile::Judge`(工具集/can_delegate/max_delegation_depth=2);`parse`/`profile`/`as_agent_tool`/`helper_kind` 等 match 补齐;保留 slug;`builtin_all()` 不含 Judge | +| `src-tauri/src/core/subagent/judge_contract.rs`(新增) | `JudgeRequest` / `JudgeReport` 结构化协议、JSON 解析、字段校验、失败兜底 | +| `src-tauri/src/core/subagent/orchestrator.rs` | `build_helper_system_prompt()` 支持 Judge surface;subagent 递归委派路径硬性拒绝 `agent_judge`;保持 Judge→explore/review/parallel 放行 | +| `src-tauri/src/core/subagent/parallel_contract.rs` / 相关 parallel 校验 | `agent_parallel` task 拒绝 `agent_judge` 作为子任务 | +| `src-tauri/src/core/agent_session_execution.rs` | 删除 `goal_scored` 分派与 `execute_goal_tool()`;新增 Judge 专用分支(加载 goal → task 前缀注入 → helper run → 解析 JudgeReport → 回写 goal → 发送事件) | +| `src-tauri/src/core/agent_session_tools.rs` | 删除 `goal_scored` 工具定义;保持基础 runtime tools 不含 Judge;如新增 helper 函数则提供 `agent_judge` 工具构造 | +| `src-tauri/src/core/agent_session.rs` | `build_session_spec()` 查询 goal,按“未通过验收”条件向主 agent 追加 `agent_judge`;`resolve_helper_model_role()` 将 Judge 映射到 
primary | +| `src-tauri/src/core/prompt/surface.rs` | `PromptSurface::SubagentJudge`;`SurfacePattern::AnySubagent` / `BuiltinSubagent` 匹配 Judge | +| `src-tauri/src/core/prompt/sources/custom_subagent_body.rs` | Judge → `templates/subagent/judge.md` | +| `src-tauri/src/core/prompt/sources/subagent_output_contract.rs` | Judge 输出契约 | +| `src-tauri/src/core/prompt/templates/subagent/judge.md`(新增) | Judge 角色、诊断型 shell 软约束、委派说明与结构化输出要求 | +| `src-tauri/src/core/prompt/templates/active_goal.tpl.md` | 完成判定改为经 agent_judge 验收,并提示通过后停止修改 | +| `src-tauri/src/core/prompt/sources/active_goal.rs` | 文案同步(如有引用) | +| `src-tauri/src/persistence/repo/goal_repo.rs` | judge_* 列读写;新增 `record_judge_verdict()`;passed 时原子写 status/evidence/judge_* | +| `src-tauri/migrations/2026XXXXXXXXXX_goal_judge_fields.sql`(新增) | judge_* 列迁移,并回填旧 `status='complete'` 为 `judge_passed=1` | +| `src-tauri/src/gateway/gateway_runner.rs` | 移除 `goal_scored` 引导文案,改为 agent_judge 验收说明 | +| `src-tauri/src/acp/**`(如有 goal 文案/事件映射) | 确认不引用 `goal_scored`;复用 GoalStateUpdated payload 的 judge 字段 | +| `src-tauri/tests/goal_lifecycle.rs` | 重写:覆盖 Judge 通过→Complete+judge_passed→停续行;未通过→续行;旧 complete 回填兼容 | +| `src-tauri/src/core/agent_session_tests.rs` / subagent tests | 覆盖 Judge profile、模型角色、工具注入、递归拒绝、parallel 拒绝、prompt surface 匹配 | +| `src/services/bridge/agent-commands.ts` | 前端 `GoalPayload` 类型新增 judge 字段 | +| `src/modules/workbench-shell/model/thread-store.ts` | `GoalStoreState` 新增 judge 字段 | +| `src/modules/workbench-shell/ui/goal-status-bar.tsx` | 最小展示 `Complete && judgePassed` 为“已验收通过” | +| `src/modules/workbench-shell/ui/runtime-thread-surface.tsx` | 清理 goal kickoff prompt 中的 `goal_scored` 示例,改为 agent_judge 验收说明 | + +--- + +## 7. 
验证计划 + +- **Rust 格式**:`cargo fmt --check --manifest-path src-tauri/Cargo.toml`。 +- **Rust 行为**:`cargo test --locked --manifest-path src-tauri/Cargo.toml`,重点 `goal_lifecycle`、subagent 委派、prompt surface 与迁移相关测试。新增/重写用例: + - Judge `passed=true` → goal 变 `Complete` 且 `judge_passed=true`,`judge_summary/evidence` 非空,下一轮 `evaluate_after_run` 返回 skipped(停续行)。 + - Judge `passed=false` → goal 仍进行中,写入 `judge_findings`,`evaluate_after_run` 返回 `Continue` 且 continuation prompt 包含最近 findings 并引导调用 `agent_judge`。 + - 存量 `status='complete'` 迁移后 `judge_passed=1`、`judge_completeness=100`,不会被新续行逻辑重新拉起。 + - `agent_judge` 仅在有未通过验收 goal 时注入主 agent;无 goal 或已验收通过时主 agent 工具集不含 `agent_judge`;任何 subagent 工具集不含 `agent_judge`。 + - 运行时门禁:subagent 直接调用 `agent_judge` 被拒绝;`agent_parallel` task 使用 `agent_judge` 被拒绝;主 agent→Judge 合法(depth 2);Judge→explore/review 合法(depth 3)。 + - Judge 模型角色使用 primary;Explore/Review 仍保持既有模型映射。 + - Prompt surface:`SubagentJudge` 能构建 system prompt;`AnySubagent` / `BuiltinSubagent` 匹配 Judge;Judge 模板包含诊断型 shell 软约束和结构化输出契约。 + - JudgeReport 解析失败、`passed=true` 但 summary 空、completeness 越界、`passed=false` findings 空 → 均视为未通过或安全兜底,不误标完成。 + - `goal_scored` 工具与常量已删除(编译期 + 检索为 0 个非历史设计文档引用)。 +- **前端**:`npm run typecheck`;若改动前端测试则 `npm run test:unit`。重点验证 `GoalPayload` / `GoalStoreState` 新字段不会破坏事件处理,`goal-status-bar.tsx` 能显示已验收通过。 +- **文案检索**:全局搜索 `goal_scored`,除历史文档/迁移注释外不应有运行时 prompt、前端提示或 gateway 文案引用。 +- **手动冒烟**:创建 goal → 主 agent 工作 → 调 agent_judge 未通过(findings)→ 续行修复 → 再次 agent_judge 通过 → goal 状态条显示已验收、续行停止。 + +--- + +## 8. 风险与边界 + +1. **主 agent 始终不调用 `agent_judge`**:goal 永远不被验收,续行会持续注入 prompt 直至护栏触发(idle/预算上限)。这正是护栏保留的价值——兜底防止无限续行。需在 prompt 中强力引导主 agent 调用 agent_judge。 +2. **Judge 误判**:Judge 也是 LLM,可能误通过或误拒。误通过风险通过“独立上下文 + 文件工具只读 + primary 模型 + 重点核对一致性/完整性 + 可跑诊断验证”降低;误拒会触发续行修复,代价是额外轮次。 +3. **诊断型 shell 不是硬只读**:Judge 可用 `shell` 意味着理论上能执行修改性命令。首版通过 Judge prompt 进行软约束,要求只运行测试、type-check、lint、只读检查,并禁止修改文件、删除数据、安装依赖、改变全局状态。若后续发现模型不稳定,应新增受限 test-runner 或 shell allowlist。 +4. 
**Judge 成本**:每次验收会拉起一个可委派的 subagent run,可能再并行 explore/review,token/时间开销不小。首版不把 Judge/subagent token 单独计入 goal budget,也不新增 Judge 专属硬超时;需在 continuation prompt 中提示主 agent“仅在确有把握达成时再申请验收”,避免频繁空验收。 +5. **深度语义边界**:Judge `max_delegation_depth=2` 必须与 `MAIN_AGENT_CHILD_DEPTH=2` 一致,且要确保 Judge 在 depth 2 仍能委派 depth 3 的 explore/review(受 `GLOBAL_MAX_DELEGATION_DEPTH=5` 与 explore/review 自身上限 3 约束,合法)。同时必须在递归委派和 parallel 路径拒绝任何 helper→Judge 调用,避免职责边界被绕过。 +6. **迁移兼容**:迁移必须回填 `UPDATE goals SET judge_passed=1, judge_completeness=100 ... WHERE status='complete'`。运行时若遇到 `Complete && !judge_passed`,应记录 warning 并停续行,不能把存量已完成 goal 重新拉起。 +7. **gateway / ACP 路径**:微信/企微与 ACP 同样依赖 goal 续行,首版需确认这些入口创建主 agent run 时走同一 `build_session_spec()` 注入逻辑,且 prompt/gateway 文案不再提 `goal_scored`。 +8. **同轮继续修改**:Judge 通过后主 agent 仍可能在同一 run 继续调用其他工具。首版不做写工具硬锁,通过 Judge 工具结果和 `active_goal.tpl.md` prompt 要求停止修改;若后续发现问题,再加 `Complete && judge_passed` 后 mutating tools 拒绝策略。 +9. **跨平台**:主体为 Rust/SQLite/prompt/TypeScript 类型改动,应保持跨平台兼容;shell 诊断命令由 Judge 根据项目现有命令选择,prompt 中需提醒避免平台特定假设。 diff --git a/src-tauri/migrations/20260607000000_goal_judge_fields.sql b/src-tauri/migrations/20260607000000_goal_judge_fields.sql new file mode 100644 index 00000000..11dd6954 --- /dev/null +++ b/src-tauri/migrations/20260607000000_goal_judge_fields.sql @@ -0,0 +1,17 @@ +-- Goal Judge verification fields: persist the most recent independent Judge +-- verdict for a goal. Acceptance is expressed as status='complete' AND +-- judge_passed=1 (the main agent can no longer self-attest completion). 
+ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool +ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable +ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable +ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable +ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable + +-- Backfill goals already completed via the legacy goal_scored path so that an +-- upgrade does not treat them as un-verified (which would otherwise let goal +-- continuation re-open them). +UPDATE goals +SET judge_passed = 1, + judge_summary = COALESCE(judge_summary, evidence), + judge_completeness = COALESCE(judge_completeness, 100) +WHERE status = 'complete'; diff --git a/src-tauri/src/core/agent_session.rs b/src-tauri/src/core/agent_session.rs index 7a5eb557..2c7be27e 100644 --- a/src-tauri/src/core/agent_session.rs +++ b/src-tauri/src/core/agent_session.rs @@ -601,6 +601,34 @@ pub async fn build_session_spec( .await .map(|settings| settings.is_ready()) .unwrap_or(false); + + let mut runtime_tools = runtime_tools_with_custom_subagents( + runtime_tools_with_web_search( + runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), + &tool_profile_name, + web_search_enabled, + ), + custom_subagent_tools, + ); + + // Inject the main-agent-only `agent_judge` acceptance tool on demand: only + // when this thread has a goal that has not yet passed Judge acceptance + // (acceptance = status Complete AND judge_passed). It is appended after the + // custom/extension merge so that the built-in tool name always wins and + // cannot be shadowed by a custom or extension tool. 
+ if let Ok(Some(goal)) = + crate::persistence::repo::goal_repo::find_by_thread_id(pool, thread_id).await + { + let already_verified = + goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed; + if !already_verified { + let judge_tool = crate::core::subagent::RuntimeOrchestrationTool::Judge.as_agent_tool(); + if !runtime_tools.iter().any(|t| t.name == judge_tool.name) { + runtime_tools.push(judge_tool); + } + } + } + let initial_context_calibration = build_initial_context_token_calibration( latest_historical_run.as_ref(), &history_messages, @@ -615,14 +643,7 @@ pub async fn build_session_spec( workspace_path: workspace_path.to_string(), run_mode: run_mode.to_string(), tool_profile_name: tool_profile_name.clone(), - runtime_tools: runtime_tools_with_custom_subagents( - runtime_tools_with_web_search( - runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), - &tool_profile_name, - web_search_enabled, - ), - custom_subagent_tools, - ), + runtime_tools, system_prompt, history_messages, history_tool_calls, diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index e6a2b913..f1dbb980 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -16,10 +16,10 @@ use crate::core::plan_checkpoint::{ build_plan_message_metadata, plan_markdown, write_plan_file, }; use crate::core::subagent::{ - extract_review_report, render_parallel_summary, HelperRunRequest, HelperRunResult, - ParallelSubagentBatchStatus, ParallelSubagentRequest, ParallelSubagentSummary, - ParallelSubagentTask, ParallelSubagentTaskResult, ParallelSubagentTaskStatus, ReviewRequest, - RuntimeOrchestrationTool, SubagentProfile, + extract_judge_report, extract_review_report, render_parallel_summary, HelperRunRequest, + HelperRunResult, JudgeReport, ParallelSubagentBatchStatus, ParallelSubagentRequest, + ParallelSubagentSummary, ParallelSubagentTask, 
ParallelSubagentTaskResult, + ParallelSubagentTaskStatus, ReviewRequest, RuntimeOrchestrationTool, SubagentProfile, }; use crate::core::tool_gateway::{ ApprovalRequest, ToolExecutionOptions, ToolExecutionRequest, ToolGatewayResult, @@ -294,33 +294,6 @@ impl AgentSession { .await; } - // Goal tools — handle before the main tool gateway - if tool_name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME { - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); - let insert_result = tool_call_repo::insert( - &self.pool, - &tool_call_repo::ToolCallInsert { - id: tool_call_storage_id.clone(), - tool_call_id: tool_call_id.to_string(), - run_id: self.spec.run_id.clone(), - thread_id: self.spec.thread_id.clone(), - helper_id: None, - tool_name: tool_name.to_string(), - tool_input_json: tool_input.to_string(), - status: "requested".to_string(), - }, - ) - .await; - - if let Err(error) = insert_result { - return agent_error_result(format!("failed to persist tool call: {error}")); - } - - return self - .execute_goal_tool(tool_name, tool_call_id, &tool_call_storage_id, tool_input) - .await; - } - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); let insert_result = tool_call_repo::insert( &self.pool, @@ -351,6 +324,10 @@ impl AgentSession { ) .await } + RuntimeOrchestrationTool::Judge => { + self.execute_judge_tool(tool_call_id, &tool_call_storage_id, tool_input) + .await + } _ => { self.execute_helper_tool(tool, tool_call_id, &tool_call_storage_id, tool_input) .await @@ -886,6 +863,16 @@ impl AgentSession { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + if tool == RuntimeOrchestrationTool::Judge { + // agent_judge is a main-agent-only acceptance tool: it must not be + // reachable as a generic helper delegate or as an agent_parallel + // batch target. 
+ return Err( + "agent_judge can only be called directly by the main agent for the current goal" + .to_string(), + ); + } + let HelperToolTask { task, review_request, @@ -1614,202 +1601,238 @@ impl AgentSession { } } - // ── Goal tool handlers ── + // ── Goal acceptance Judge handler ── - async fn execute_goal_tool( + /// Run the main-agent-only `agent_judge` acceptance flow: build a Judge task + /// with the current goal injected, run the Judge helper, parse its structured + /// verdict, persist it, and (on pass) flip the goal to verified/complete. + async fn execute_judge_tool( &self, - tool_name: &str, - _tool_call_id: &str, + tool_call_id: &str, tool_call_storage_id: &str, tool_input: &serde_json::Value, ) -> AgentToolResult { - let pool = self.pool.clone(); - let thread_id = self.spec.thread_id.clone(); - - match tool_name { - name if name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME => { - let status = tool_input - .get("status") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let evidence = tool_input - .get("evidence") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let pledge = tool_input - .get("pledge") - .and_then(|v| v.as_str()) - .unwrap_or(""); - - // Only support marking as complete - if status != "complete" { - let err_msg = "goal_scored only supports status='complete'. Use /goal pause|resume|clear from the UI for other lifecycle operations."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Parse the main agent's task / rationale. 
+ let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + Ok(request) => request, + Err(error) => { + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &error }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(error); + } + }; - // The pledge must match the required text exactly. - if pledge.trim() != crate::core::goal_manager::GOAL_SCORED_PLEDGE { - let err_msg = format!( - "goal_scored rejected: the 'pledge' parameter must be passed verbatim as: \"{}\"", - crate::core::goal_manager::GOAL_SCORED_PLEDGE - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Backstop: re-query goal state. agent_judge is injected only when an + // un-verified goal exists, but a stale tool set or a direct call must be + // rejected here too. + let goal = match crate::persistence::repo::goal_repo::find_by_thread_id( + &self.pool, + &self.spec.thread_id, + ) + .await + { + Ok(Some(goal)) => goal, + Ok(None) => { + let err_msg = + "agent_judge cannot run: no goal exists for this thread. 
Create one with the /goal command first."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + Err(e) => { + let err_msg = format!("Failed to load goal: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; - if evidence.trim().is_empty() { - // Evidence is empty — reject the completion and challenge - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - let challenge = mgr.render_challenge_prompt( - crate::core::goal_manager::ChallengePromptVariant::NoEvidence, - ); - let result_text = - format!("Goal completion rejected: evidence is required. {challenge}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - return AgentToolResult::text(result_text); - } + if goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed { + let err_msg = + "The goal has already passed acceptance. No further verification is needed."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - match mgr.get_active().await { - Ok(Some(goal)) => { - if goal.status != crate::model::goal::GoalStatus::Active { - let err_msg = format!( - "Goal is not active (current status: {:?}). 
Cannot mark as complete.", - goal.status - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } - let paused_seconds = { - let mut guard = self.goal_runtime.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_scored: goal_runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&self.spec.run_id).max(0) - }; - let active_run_seconds = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &self.pool, - &self.spec.thread_id, - ) - .await - .unwrap_or(None) - .map(|seconds| (seconds - paused_seconds).max(0)); - - match mgr.mark_complete(&goal.id, evidence).await { - Ok(()) => { - if let Some(run_seconds) = active_run_seconds { - if run_seconds > 0 { - mgr.account_usage(&goal.id, 0, run_seconds).await.ok(); - } - } - - let updated = mgr.get_active().await.ok().flatten(); - if let Some(ref record) = updated { - let payload = - crate::core::goal_manager::GoalManager::to_payload(record); - let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { - thread_id: record.thread_id.clone(), - evidence: evidence.to_string(), - }); - let _ = - self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: record.thread_id.clone(), - goal: Some(payload), - }); - } - let result_text = - format!("Goal marked as complete. 
Evidence: {evidence}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - AgentToolResult::text(result_text) - } - Err(e) => { - let err_msg = format!("Failed to complete goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } + // Build the Judge task: inject the goal objective + status + last verdict + // so the Judge does not rely on the main agent's self-report. + let mut prior_verdict = String::new(); + if goal.judge_evaluated_run_id.is_some() { + if let Some(summary) = goal.judge_summary.as_deref() { + if !summary.trim().is_empty() { + prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}")); + } + } + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + if !findings.is_empty() { + prior_verdict.push_str("\nPrevious Judge findings:"); + for finding in findings { + prior_verdict.push_str(&format!("\n- {finding}")); } } - Ok(None) => { - let err_msg = "No active goal found. 
Create one first with /goal command."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } - Err(e) => { - let err_msg = format!("Failed to load goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } } } - _ => agent_error_result(format!("Unknown goal tool: {tool_name}")), + } + + let judge_task = format!( + "You are verifying acceptance of the following goal for the current project.\n\n\ +Goal id: {goal_id}\n\ +Goal status: {status:?}\n\ +Goal objective:\n{objective}\n\ +{prior_verdict}\n\n\ +The main agent's note for this verification request:\n{task}\n\n\ +Independently inspect the project's current state and decide whether it satisfies the goal. \ +Return your structured JudgeReport verdict.", + goal_id = goal.id, + status = goal.status, + objective = goal.objective, + prior_verdict = prior_verdict, + task = request.task, + ); + + // Build a Judge delegate (depth 2, primary model) and run it. 
+ let tool = RuntimeOrchestrationTool::Judge; + let helper_profile = resolve_helper_profile(&tool); + let model_role = match resolve_helper_model_role( + &self.spec.model_plan, + &tool, + helper_profile.as_ref(), + ) { + Some(role) => role, + None => { + let err_msg = "Failed to resolve a model for agent_judge.".to_string(); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + let delegate = ResolvedHelperDelegate { + tool: tool.clone(), + agent_name: tool.tool_name(), + task: judge_task, + review_request: None, + helper_profile, + model_role, + }; + + let report: JudgeReport = match self.run_helper_for_delegate(&delegate, tool_call_id).await + { + Ok(summary) => extract_judge_report( + summary + .raw_summary + .as_deref() + .unwrap_or(summary.summary.as_str()), + ), + Err(error) => { + let err_msg = format!("agent_judge failed to run: {error}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + // Persist the verdict (atomically flips to complete + judge_passed on pass). + let findings_json = + serde_json::to_string(&report.findings).unwrap_or_else(|_| "[]".to_string()); + let recorded = crate::persistence::repo::goal_repo::record_judge_verdict( + &self.pool, + &goal.id, + &self.spec.run_id, + report.passed, + report.completeness_pct as i64, + &findings_json, + &report.summary, + ) + .await; + + if let Err(e) = recorded { + let err_msg = format!("Failed to persist Judge verdict: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + + // Emit goal events with the freshly updated record. 
+ if let Ok(Some(record)) = + crate::persistence::repo::goal_repo::find_by_thread_id(&self.pool, &self.spec.thread_id) + .await + { + let payload = crate::core::goal_manager::GoalManager::to_payload(&record); + if report.passed { + let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { + thread_id: record.thread_id.clone(), + evidence: record.evidence.clone().unwrap_or_default(), + }); + } + let _ = self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { + thread_id: record.thread_id.clone(), + goal: Some(payload), + }); + } + + let result_text = crate::core::subagent::judge_contract::render_parent_summary(&report); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "output": &result_text, "passed": report.passed }).to_string(), + "completed", + ) + .await + .ok(); + + AgentToolResult { + content: vec![ContentBlock::Text(TextContent::new(result_text))], + details: Some(serde_json::json!({ + "passed": report.passed, + "completenessPct": report.completeness_pct, + "findings": report.findings, + "summary": report.summary, + })), } } } diff --git a/src-tauri/src/core/agent_session_tools.rs b/src-tauri/src/core/agent_session_tools.rs index abef8647..97966181 100644 --- a/src-tauri/src/core/agent_session_tools.rs +++ b/src-tauri/src/core/agent_session_tools.rs @@ -534,31 +534,6 @@ You may call this tool multiple times in a run to incrementally refine the plan. }), )); - // Goal tool — persistent cross-turn task completion - tools.push(AgentTool::new( - crate::core::goal_manager::GOAL_SCORED_TOOL_NAME, - "Goal Scored", - "Mark the current goal as fully achieved (score the goal). You MUST provide evidence — run tests, check file contents, or verify command output to prove the goal is truly achieved. Without evidence, the completion will be challenged. You MUST also pass the exact required pledge text. 
Do NOT call this tool unless you have actually verified the goal is complete with no remaining or follow-up work.", - serde_json::json!({ - "type": "object", - "properties": { - "status": { - "type": "string", - "enum": ["complete"], - "description": "Must be 'complete' to mark the goal as achieved." - }, - "evidence": { - "type": "string", - "description": "Concrete evidence that the goal is complete — test output, file change summary, command results, or verification steps. Required." - }, - "pledge": { - "type": "string", - "description": "You MUST pass this exact pledge text verbatim: \"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\"" - } - }, - "required": ["status", "evidence", "pledge"] - }), - )); // Render artifact tool (always available) — supports charts, HTML, and SVG tools.push(AgentTool::new( "render", @@ -670,6 +645,7 @@ pub(crate) fn resolve_helper_profile(tool: &RuntimeOrchestrationTool) -> Option< match tool { RuntimeOrchestrationTool::Explore => Some(SubagentProfile::Explore), RuntimeOrchestrationTool::Review => Some(SubagentProfile::Review), + RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } @@ -690,6 +666,8 @@ pub(crate) fn resolve_helper_model_role( .clone() .unwrap_or_else(|| model_plan.primary.clone()), ), + // Judge prioritizes acceptance quality over cost: always use primary. 
+ RuntimeOrchestrationTool::Judge => Some(model_plan.primary.clone()), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index fa6d1101..7486e39b 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -19,12 +19,6 @@ pub struct GoalEvaluationOutcome { /// Default maximum turns for a goal before auto-pausing. const DEFAULT_MAX_TURNS: i64 = 50; -/// Tool name used to mark a goal as fully achieved ("score" the goal). -pub const GOAL_SCORED_TOOL_NAME: &str = "goal_scored"; - -/// Exact pledge text the agent must pass verbatim when calling `goal_scored`. -pub const GOAL_SCORED_PLEDGE: &str = "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output."; - /// Continuation prompt injected when the goal is still active. const CONTINUATION_PROMPT_TEMPLATE: &str = "\ [Goal continuation — turns {turns_used}/{max_turns}] @@ -33,26 +27,28 @@ const CONTINUATION_PROMPT_TEMPLATE: &str = "\ Continue working toward this objective. Take the next concrete step. -⚠️ When the goal is fully achieved, you MUST call: - goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") -Without this call, the system will keep injecting continuation prompts. +⚠️ Completion is now decided by independent verification. When you believe the +goal is achieved, you MUST call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +A Judge will evaluate whether the project satisfies the goal's consistency and +completeness. +- The goal is only marked verified when the Judge returns passed=true. +- If a previous Judge verification did not pass, read its findings, fix each one, + then call agent_judge again. 
+You cannot declare completion yourself; only a passing Judge verdict counts. If you are blocked and need user input, use the clarify tool."; -/// Challenge prompt when the model claimed completion but did not use the tool. +/// Challenge prompt when the model claimed completion but has not requested +/// Judge verification yet. const CHALLENGE_EVIDENCE_PROMPT: &str = "\ -Before claiming the goal is complete, please provide concrete evidence: - -1. What verification commands did you run? What was the output? -2. What files did you modify? What was the purpose of each change? - -Once you have evidence, call goal_scored(status=\"complete\", evidence=\"...\", pledge=\"...\") . -If the goal is not actually complete, ignore this prompt and continue working."; +You appear to believe the goal is complete, but you have not requested independent +verification. You cannot self-declare completion. -/// Challenge prompt when the model claimed completion but evidence was empty. -const MISSING_EVIDENCE_PROMPT: &str = "\ -You called goal_scored(complete) but did not provide evidence. -Please provide completion evidence and call goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") again."; +When you are confident the goal is achieved, call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +The goal is only marked verified when the Judge returns passed=true. If the goal +is not actually complete, ignore this prompt and continue working."; /// Guidance prompt when the agent appears stuck. 
const GUIDANCE_PROMPT: &str = "\ @@ -159,6 +155,11 @@ impl GoalManager { pause_detail: None, evidence: None, last_evaluated_run_id: None, + judge_passed: false, + judge_completeness: None, + judge_findings: None, + judge_summary: None, + judge_evaluated_run_id: None, created_at: Utc::now(), updated_at: Utc::now(), }; @@ -353,7 +354,7 @@ impl GoalManager { .remove(&self.thread_id); return GoalVerdict::Paused { reason: PauseReason::IdleBlocked, - detail: Some("agent repeatedly claimed completion without providing evidence via goal_scored".into()), + detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()), }; } return GoalVerdict::ChallengeEvidence; @@ -412,11 +413,11 @@ impl GoalManager { detail: Some("agent published a plan, awaiting approval".into()), }); } - // goal_scored is handled by the tool execution pipeline - // (agent_session_execution) which validates pledge/evidence - // and marks the goal complete. Evaluation should not - // interfere — let it pass through to idle reset and budget - // checks. + // agent_judge is the main-agent-only acceptance request. It is + // handled by the tool execution pipeline (execute_judge_tool), + // which runs the Judge and records the verdict. Evaluation must + // not treat it as a blocking tool — like any tool call it shows + // the agent acted and should reset idle tendencies. _ => {} } } @@ -486,20 +487,44 @@ impl GoalManager { // ── Prompt generation ── - /// Generate the continuation prompt for the next turn. + /// Generate the continuation prompt for the next turn. When a prior Judge + /// verification did not pass, the most recent findings are appended so the + /// agent can fix them before re-requesting verification. 
pub fn render_continuation_prompt(&self, goal: &GoalRecord) -> String { - CONTINUATION_PROMPT_TEMPLATE + let mut prompt = CONTINUATION_PROMPT_TEMPLATE .replace("{objective}", &goal.objective) .replace("{turns_used}", &goal.turns_used.to_string()) - .replace("{max_turns}", &goal.max_turns.to_string()) + .replace("{max_turns}", &goal.max_turns.to_string()); + + if goal.judge_evaluated_run_id.is_some() && !goal.judge_passed { + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + let findings: Vec<String> = findings + .into_iter() + .filter(|f| !f.trim().is_empty()) + .take(10) + .collect(); + if !findings.is_empty() { + prompt.push_str( + "\n\nMost recent Judge findings to address before re-verifying:", + ); + for finding in findings { + let trimmed = finding.trim(); + let truncated: String = trimmed.chars().take(500).collect(); + prompt.push_str(&format!("\n- {truncated}")); + } + } + } + } + } + + prompt } - /// Generate a challenge-evidence prompt when the model failed to provide evidence. - pub fn render_challenge_prompt(&self, variant: ChallengePromptVariant) -> String { - match variant { - ChallengePromptVariant::NoEvidence => MISSING_EVIDENCE_PROMPT.to_string(), - ChallengePromptVariant::NoTool => CHALLENGE_EVIDENCE_PROMPT.to_string(), - } + /// Generate a challenge prompt nudging the agent to request Judge + /// verification when it claims completion without calling `agent_judge`. + pub fn render_challenge_prompt(&self) -> String { + CHALLENGE_EVIDENCE_PROMPT.to_string() } /// Generate a guidance prompt when the agent appears stuck. @@ -517,7 +542,18 @@ impl GoalManager { None => return Ok(None), }; + // Acceptance is now decided exclusively by the Judge: a verified goal is + // `Complete && judge_passed`. Any non-Active goal stops continuation, + // preserving existing pause/budget semantics. 
The legacy combination + // `Complete && !judge_passed` should not occur after migration backfill; + // if it does, log it and still stop continuation rather than re-opening. if goal.status != GoalStatus::Active { + if goal.status == GoalStatus::Complete && !goal.judge_passed { + tracing::warn!( + goal_id = %goal.id, + "goal is Complete without judge_passed; treating as terminal and not re-opening" + ); + } return Ok(Some(GoalEvaluationOutcome { goal: Self::to_payload(&goal), verdict: "skipped".to_string(), @@ -594,7 +630,6 @@ impl GoalManager { GoalVerdict::BudgetLimited => { self.mark_budget_limited(&current.id).await?; } - GoalVerdict::Complete { .. } => {} } if let Some(run_seconds) = @@ -626,11 +661,14 @@ impl GoalManager { ), GoalVerdict::ChallengeEvidence => ( "challenge_evidence", - Some(self.render_challenge_prompt(ChallengePromptVariant::NoTool)), + Some(format!( + "{}\n\n{}", + self.render_challenge_prompt(), + self.render_continuation_prompt(updated.as_ref().unwrap_or(&goal)) + )), ), GoalVerdict::Paused { reason: _, detail } => ("paused", detail.clone()), GoalVerdict::BudgetLimited => ("budget_limited", None), - GoalVerdict::Complete { .. } => ("complete", None), }; Ok(Some(GoalEvaluationOutcome { @@ -640,11 +678,3 @@ impl GoalManager { })) } } - -/// Variants for challenge prompts. -pub enum ChallengePromptVariant { - /// Model called goal_scored(complete) but evidence was empty. - NoEvidence, - /// Model claimed completion in text but didn't use the tool. 
- NoTool, -} diff --git a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs index 3e7c334a..ae7a2fb3 100644 --- a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs +++ b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs @@ -61,6 +61,25 @@ impl SectionSource for SubagentBodySource { }, })) } + Some(SubagentProfile::Judge) => { + let template = include_str!("../templates/subagent/judge.md"); + let (_tmpl, body) = + super::super::templates::parse_front_matter(template).map_err(|e| { + FatalError::new("template.parse", format!("subagent/judge.md: {e}")) + })?; + let vars = super::super::templates::TemplateVars::new(); + let rendered = super::super::templates::render_template_strict(&body, &[], &vars) + .map_err(|e| { + FatalError::new("template.render", format!("subagent/judge.md: {e}")) + })?; + Ok(SectionOutcome::Produced(SectionBody { + markdown: rendered, + meta: SectionMeta { + template_path: Some("templates/subagent/judge.md"), + ..Default::default() + }, + })) + } Some(SubagentProfile::Custom { system_prompt, .. 
}) => { if system_prompt.trim().is_empty() { return Ok(SectionOutcome::Skip); diff --git a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs index 1ad848d0..c055e5c3 100644 --- a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs +++ b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs @@ -16,6 +16,9 @@ const EXPLORE_TEMPLATE_EMBEDDED: &str = const REVIEW_TEMPLATE_REL_PATH: &str = "subagent/output_contract.review.md"; const REVIEW_TEMPLATE_EMBEDDED: &str = include_str!("../templates/subagent/output_contract.review.md"); +const JUDGE_TEMPLATE_REL_PATH: &str = "subagent/output_contract.judge.md"; +const JUDGE_TEMPLATE_EMBEDDED: &str = + include_str!("../templates/subagent/output_contract.judge.md"); const DECLARED_KEYS: &[&'static str] = &[]; /// Template-backed SectionSource for the SubagentOutputContract section. @@ -42,6 +45,7 @@ impl SectionSource for SubagentOutputContractSource { (EXPLORE_TEMPLATE_REL_PATH, EXPLORE_TEMPLATE_EMBEDDED) } Some(SubagentProfile::Review) => (REVIEW_TEMPLATE_REL_PATH, REVIEW_TEMPLATE_EMBEDDED), + Some(SubagentProfile::Judge) => (JUDGE_TEMPLATE_REL_PATH, JUDGE_TEMPLATE_EMBEDDED), Some(SubagentProfile::Custom { .. 
}) => { // Custom subagents get a generic output contract return Ok(SectionOutcome::Produced(SectionBody::markdown( diff --git a/src-tauri/src/core/prompt/surface.rs b/src-tauri/src/core/prompt/surface.rs index 009aef9b..92b6ef76 100644 --- a/src-tauri/src/core/prompt/surface.rs +++ b/src-tauri/src/core/prompt/surface.rs @@ -10,6 +10,8 @@ pub enum PromptSurface { SubagentExplore { inherited_run_mode: RunMode }, /// Built-in review subagent SubagentReview { inherited_run_mode: RunMode }, + /// Built-in goal acceptance Judge subagent + SubagentJudge { inherited_run_mode: RunMode }, /// User-defined custom subagent SubagentCustom { slug: String, @@ -70,9 +72,11 @@ impl SurfacePattern { } (SurfacePattern::AnySubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::AnySubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::CustomSubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::Compaction(k), PromptSurface::Compaction { kind }) => k == kind, (SurfacePattern::AnyCompaction, PromptSurface::Compaction { .. }) => true, diff --git a/src-tauri/src/core/prompt/surface_extensions.rs b/src-tauri/src/core/prompt/surface_extensions.rs index c8ebfb68..4d7f245b 100644 --- a/src-tauri/src/core/prompt/surface_extensions.rs +++ b/src-tauri/src/core/prompt/surface_extensions.rs @@ -27,6 +27,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { run_mode } => SurfacePattern::MainAgent(*run_mode), PromptSurface::SubagentExplore { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. 
} => SurfacePattern::AnySubagent, + PromptSurface::SubagentJudge { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. } => SurfacePattern::CustomSubagent, PromptSurface::Compaction { kind } => SurfacePattern::Compaction(*kind), PromptSurface::Title => SurfacePattern::Title, @@ -43,6 +44,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { .. } | PromptSurface::SubagentExplore { .. } | PromptSurface::SubagentReview { .. } + | PromptSurface::SubagentJudge { .. } | PromptSurface::SubagentCustom { .. } ) } @@ -76,6 +78,9 @@ mod tests { PromptSurface::SubagentReview { inherited_run_mode: RunMode::Default, }, + PromptSurface::SubagentJudge { + inherited_run_mode: RunMode::Default, + }, PromptSurface::SubagentCustom { slug: "test".into(), inherited_run_mode: RunMode::Default, diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md index 0bf5ffa4..c36eb3dd 100644 --- a/src-tauri/src/core/prompt/templates/active_goal.tpl.md +++ b/src-tauri/src/core/prompt/templates/active_goal.tpl.md @@ -8,15 +8,15 @@ declared_keys: [max_turns, objective, turns_used] Objective: {{objective}} Turns used: {{turns_used}}/{{max_turns}} -**Completion requirements — ALL must be met before calling goal_scored(complete):** -1. Every subtask implied by the objective is done. No remaining work, no dangling follow-ups. -2. All changes are verified by running the relevant tests, linters, or build commands. -3. Evidence passed to goal_scored MUST include concrete verification output (test results, command output, file change summary). -Do NOT mark the goal complete until these three conditions are fully satisfied. +**Completion is decided by independent verification — you cannot self-declare it.** +1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups. +2. Verify your work by running the relevant tests, linters, or build commands as you go. +3. 
When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`. Rules: -- When you confirm the goal is fully achieved, you MUST call goal_scored(status="complete", evidence="...", pledge="...") to mark it as scored. This is the only way to mark the goal as achieved. -- The goal_scored tool requires a 'pledge' parameter. You MUST pass this exact text verbatim: "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output." -- Do NOT claim completion without verifiable evidence -- If blocked and need user input, use clarify tool -- The system will automatically continue this goal across turns +- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness. +- The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself. +- If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again. +- Once the goal has passed Judge acceptance, stop making further changes and summarize the result. +- If blocked and you need user input, use the clarify tool. +- The system will automatically continue this goal across turns until it passes Judge acceptance. diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md new file mode 100644 index 00000000..0cab1d63 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -0,0 +1,24 @@ +--- +section_id: SubagentJudge +version: 1 +declared_keys: [] +--- +You are the **Goal Acceptance Judge** — an independent verifier. 
The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. + +You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. + +## What to evaluate +- Read the goal objective injected into your task and treat it as the acceptance contract. +- Inspect the relevant code, configuration, tests, and docs to confirm each requirement of the goal is actually met. +- Run diagnostic verification when it strengthens your judgment: tests, type-checks, linters, builds, and read-only inspection commands. Adapt the commands to this repository (infer them from instructions, scripts, and manifests) instead of assuming a stack. +- You may delegate to `agent_explore`, `agent_review`, or `agent_parallel` to gather evidence in parallel when the goal is broad. + +## Hard constraints (read-only acceptance) +- Your file tools are read-only. Do **not** modify, create, or delete any files. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, and read-only inspection. You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. + +## Verdict rules +- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps. When you pass, `summary` must clearly state the verified evidence — it becomes the goal's completion evidence. 
+- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings`. +- Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md new file mode 100644 index 00000000..a695dd71 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -0,0 +1,21 @@ +--- +section_id: SubagentOutputContractJudge +version: 1 +declared_keys: [] +--- +Your output will be consumed by the parent agent and the goal acceptance pipeline, not the user. Follow any response language instructions inherited above for natural-language fields (`findings`, `summary`). + +Return exactly one JSON object with this contract and nothing else (no markdown fences, headings, or prose before or after it): + +{ + "passed": true, + "completenessPct": 100, + "findings": [], + "summary": "Concise but specific evidence for the verdict (verified requirements, commands run and their results)." +} + +Field rules: +- `passed` (boolean): true only when the project genuinely satisfies the goal. +- `completenessPct` (integer 0-100): your honest estimate of how complete the work is against the goal. +- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken point. REQUIRED and non-empty when `passed=false`. +- `summary` (string): rationale for the verdict. REQUIRED and non-empty when `passed=true` — it becomes the goal's completion evidence. If you cannot provide real evidence, set `passed=false`. 
diff --git a/src-tauri/src/core/subagent/judge_contract.rs b/src-tauri/src/core/subagent/judge_contract.rs new file mode 100644 index 00000000..2a673d93 --- /dev/null +++ b/src-tauri/src/core/subagent/judge_contract.rs @@ -0,0 +1,287 @@ +use serde::{Deserialize, Serialize}; + +/// Input for the `agent_judge` tool (provided by the main agent). +#[derive(Debug, Clone)] +pub struct JudgeRequest { + /// The main agent's explanation of why it believes the goal is achieved, + /// and/or points it wants the Judge to focus on. + pub task: String, +} + +impl JudgeRequest { + pub fn from_tool_input(tool_input: &serde_json::Value) -> Result<Self, String> { + let task = tool_input + .get("task") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + .trim() + .to_string(); + + if task.is_empty() { + return Err("missing agent_judge task".to_string()); + } + + Ok(Self { task }) + } +} + +/// Structured verdict produced by the Judge subagent. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct JudgeReport { + /// Whether the project currently satisfies the goal (acceptance passes). + pub passed: bool, + /// Completeness percentage 0-100. + pub completeness_pct: u8, + /// Specific unmet / non-conforming points. Required when `passed=false`. + #[serde(default)] + pub findings: Vec<String>, + /// Rationale for the verdict. Used as completion evidence when `passed=true`. + #[serde(default)] + pub summary: String, +} + +impl JudgeReport { + /// Build a failed report carrying a single finding (used as a safe fallback + /// when the Judge output cannot be parsed). + fn failed_with_finding(finding: String) -> Self { + Self { + passed: false, + completeness_pct: 0, + findings: vec![finding], + summary: String::new(), + } + } + + /// Normalize a parsed report so it can never represent an unverifiable + /// acceptance: + /// - `completeness_pct` is clamped to 0-100. + /// - `passed=true` with an empty `summary` is downgraded to `passed=false`. 
+ /// - `passed=false` with no findings gets a placeholder finding. + fn normalized(mut self) -> Self { + if self.completeness_pct > 100 { + self.completeness_pct = 100; + } + + if self.passed && self.summary.trim().is_empty() { + self.passed = false; + self.findings + .push("Judge reported passed=true but provided no summary/evidence; downgraded to not passed.".to_string()); + } + + if !self.passed && self.findings.is_empty() { + self.findings + .push("Judge did not provide actionable findings.".to_string()); + } + + self + } +} + +/// Parse the Judge's textual output into a `JudgeReport`. On any parse failure +/// the result is a *failed* report carrying the raw text as a finding, so a +/// malformed Judge response can never be mistaken for acceptance. +pub fn extract_judge_report(text: &str) -> JudgeReport { + let trimmed = text.trim(); + if trimmed.is_empty() { + return JudgeReport::failed_with_finding("Judge produced no output.".to_string()); + } + + if let Ok(report) = serde_json::from_str::<JudgeReport>(trimmed) { + return report.normalized(); + } + + let stripped = strip_code_fence(trimmed); + if let Ok(report) = serde_json::from_str::<JudgeReport>(stripped) { + return report.normalized(); + } + + if let Some(report) = extract_embedded_json(trimmed) { + return report.normalized(); + } + + JudgeReport::failed_with_finding(format!( + "Judge output could not be parsed as a JudgeReport. Raw output: {trimmed}" + )) +} + +/// Render a parent-facing summary of the verdict for the main agent. 
+pub fn render_parent_summary(report: &JudgeReport) -> String { + let mut lines = vec![format!( + "Judge verdict: {} (completeness {}%)", + if report.passed { + "PASSED" + } else { + "NOT PASSED" + }, + report.completeness_pct + )]; + + if !report.summary.trim().is_empty() { + lines.push(format!("Summary: {}", report.summary.trim())); + } + + if report.findings.is_empty() { + lines.push("Findings:\n- none".to_string()); + } else { + let rendered = report + .findings + .iter() + .map(|f| format!("- {}", f.trim())) + .collect::<Vec<_>>() + .join("\n"); + lines.push(format!("Findings:\n{rendered}")); + } + + if report.passed { + lines.push( + "✅ The goal has passed acceptance and is now marked complete. Stop making further changes and summarize the result.".to_string(), + ); + } else { + lines.push( + "❌ The goal has NOT passed acceptance. Fix the findings above, then call agent_judge again to re-verify.".to_string(), + ); + } + + lines.join("\n\n") +} + +fn strip_code_fence(text: &str) -> &str { + text.strip_prefix("```json") + .and_then(|value| value.strip_suffix("```")) + .map(str::trim) + .or_else(|| { + text.strip_prefix("```") + .and_then(|value| value.strip_suffix("```")) + .map(str::trim) + }) + .unwrap_or(text) +} + +/// Best-effort: pull the first balanced `{...}` JSON object out of mixed prose +/// and try to parse it as a `JudgeReport`. 
+fn extract_embedded_json(text: &str) -> Option<JudgeReport> { + let start = text.find('{')?; + let bytes = text.as_bytes(); + let mut depth = 0usize; + let mut in_string = false; + let mut escaped = false; + for (idx, &b) in bytes.iter().enumerate().skip(start) { + if in_string { + if escaped { + escaped = false; + } else if b == b'\\' { + escaped = true; + } else if b == b'"' { + in_string = false; + } + continue; + } + match b { + b'"' => in_string = true, + b'{' => depth += 1, + b'}' => { + depth -= 1; + if depth == 0 { + let candidate = &text[start..=idx]; + return serde_json::from_str::<JudgeReport>(candidate).ok(); + } + } + _ => {} + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn judge_request_requires_task() { + assert!(JudgeRequest::from_tool_input(&serde_json::json!({})).is_err()); + let req = JudgeRequest::from_tool_input(&serde_json::json!({ "task": " verify it " })) + .expect("parses"); + assert_eq!(req.task, "verify it"); + } + + #[test] + fn extract_parses_plain_json() { + let report = extract_judge_report( + r#"{"passed":true,"completenessPct":100,"findings":[],"summary":"All tests pass."}"#, + ); + assert!(report.passed); + assert_eq!(report.completeness_pct, 100); + assert_eq!(report.summary, "All tests pass."); + } + + #[test] + fn extract_parses_json_fence() { + let report = extract_judge_report( + "```json\n{\"passed\":false,\"completenessPct\":40,\"findings\":[\"missing tests\"],\"summary\":\"\"}\n```", + ); + assert!(!report.passed); + assert_eq!(report.completeness_pct, 40); + assert_eq!(report.findings, vec!["missing tests"]); + } + + #[test] + fn extract_parses_embedded_json() { + let report = extract_judge_report( + "Here is my verdict:\n{\"passed\":true,\"completenessPct\":90,\"findings\":[],\"summary\":\"Looks good\"}\nThanks!", + ); + assert!(report.passed); + assert_eq!(report.summary, "Looks good"); + } + + #[test] + fn malformed_output_is_not_passed() { + let report = extract_judge_report("I think it's done, looks fine to 
me."); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn empty_output_is_not_passed() { + let report = extract_judge_report(" "); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn passed_with_empty_summary_is_downgraded() { + let report = extract_judge_report( + r#"{"passed":true,"completenessPct":100,"findings":[],"summary":" "}"#, + ); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn completeness_is_clamped() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":250,"findings":["x"],"summary":""}"#, + ); + assert_eq!(report.completeness_pct, 100); + } + + #[test] + fn failed_with_no_findings_gets_placeholder() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":10,"findings":[],"summary":"incomplete"}"#, + ); + assert!(!report.passed); + assert_eq!(report.findings.len(), 1); + } + + #[test] + fn render_summary_includes_verdict_and_findings() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":30,"findings":["A","B"],"summary":"not yet"}"#, + ); + let summary = render_parent_summary(&report); + assert!(summary.contains("NOT PASSED")); + assert!(summary.contains("- A")); + assert!(summary.contains("agent_judge again")); + } +} diff --git a/src-tauri/src/core/subagent/mod.rs b/src-tauri/src/core/subagent/mod.rs index 22760953..5bbd87f4 100644 --- a/src-tauri/src/core/subagent/mod.rs +++ b/src-tauri/src/core/subagent/mod.rs @@ -1,8 +1,10 @@ +pub mod judge_contract; pub mod orchestrator; pub mod parallel_contract; pub mod review_contract; pub mod runtime_orchestration; +pub use judge_contract::{extract_judge_report, JudgeReport, JudgeRequest}; pub use orchestrator::{ HelperAgentOrchestrator, HelperRunRequest, HelperRunResult, SubagentActivityStatus, SubagentProgressSnapshot, diff --git a/src-tauri/src/core/subagent/orchestrator.rs b/src-tauri/src/core/subagent/orchestrator.rs 
index 3641e7dd..cb890700 100644 --- a/src-tauri/src/core/subagent/orchestrator.rs +++ b/src-tauri/src/core/subagent/orchestrator.rs @@ -1127,6 +1127,15 @@ impl HelperDelegationContext { RuntimeOrchestrationTool::Parallel => { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + RuntimeOrchestrationTool::Judge => { + // Hard gate: agent_judge is a main-agent-only tool. A subagent + // (including Judge itself) must never recursively request goal + // acceptance, even if the tool name was parsed successfully. + return Err( + "agent_judge can only be called by the main agent for the current goal" + .to_string(), + ); + } RuntimeOrchestrationTool::Custom(slug) => { crate::core::agent_session_tools::resolve_custom_subagent_profile_from_pool( &self.orchestrator.pool, @@ -1472,6 +1481,9 @@ async fn build_helper_system_prompt( SubagentProfile::Review => PromptSurface::SubagentReview { inherited_run_mode: rm, }, + SubagentProfile::Judge => PromptSurface::SubagentJudge { + inherited_run_mode: rm, + }, SubagentProfile::Custom { slug, .. } => PromptSurface::SubagentCustom { slug: slug.clone(), inherited_run_mode: rm, diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index 27150c30..25c72e8a 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -34,6 +34,10 @@ pub enum RuntimeOrchestrationTool { Explore, Review, Parallel, + /// Goal acceptance Judge. Main-agent-only tool (`agent_judge`): it is parsed + /// here for unified dispatch but is never part of `builtin_all()` nor any + /// helper's delegation tool set. 
+ Judge, Custom(String), // slug of the custom subagent } @@ -41,6 +45,7 @@ pub enum RuntimeOrchestrationTool { pub enum SubagentProfile { Explore, Review, + Judge, Custom { slug: String, name: String, @@ -130,6 +135,7 @@ impl RuntimeOrchestrationTool { "agent_explore" => Some(Self::Explore), "agent_review" => Some(Self::Review), "agent_parallel" => Some(Self::Parallel), + "agent_judge" => Some(Self::Judge), _ => { // Match custom subagent pattern: "agent_{slug}" if let Some(slug) = tool_name.strip_prefix("agent_") { @@ -151,6 +157,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "agent_explore".to_string(), Self::Review => "agent_review".to_string(), Self::Parallel => "agent_parallel".to_string(), + Self::Judge => "agent_judge".to_string(), Self::Custom(slug) => format!("agent_{slug}"), } } @@ -160,6 +167,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "Agent Explore".to_string(), Self::Review => "Agent Review".to_string(), Self::Parallel => "Agent Parallel".to_string(), + Self::Judge => "Agent Judge".to_string(), Self::Custom(slug) => format!("Agent {slug}"), } } @@ -175,6 +183,9 @@ impl RuntimeOrchestrationTool { Self::Parallel => { "Delegate 1-5 independent subtasks to subagents with bounded concurrency. Use this for parallel exploration or review work only when tasks are independent and low side-effect; results are aggregated for the parent agent." } + Self::Judge => { + "Request independent acceptance verification of the current goal. The Judge inspects the project's current state (read-only, with diagnostic shell for tests/type-check/lint) against the goal and returns a structured verdict. You cannot self-declare completion — only a passing Judge verdict marks the goal verified. Call this when you believe the goal is achieved, or to re-verify after fixing prior findings." + } Self::Custom(_) => { // Custom subagents have their description set externally via custom_subagent_as_tool "Custom subagent." 
@@ -188,6 +199,7 @@ impl RuntimeOrchestrationTool { match self { Self::Explore => Some(SubagentProfile::Explore), Self::Review => Some(SubagentProfile::Review), + Self::Judge => Some(SubagentProfile::Judge), Self::Parallel | Self::Custom(_) => None, } } @@ -339,6 +351,16 @@ impl RuntimeOrchestrationTool { }, "required": ["task"] }), + Self::Judge => serde_json::json!({ + "type": "object", + "properties": { + "task": { + "type": "string", + "description": "Explain why you believe the goal is achieved and call out anything the Judge should focus on (e.g. acceptance criteria, areas you are unsure about). If you are re-verifying after fixing earlier findings, summarize what you changed." + } + }, + "required": ["task"] + }), }; let name = self.tool_name(); @@ -353,6 +375,7 @@ impl SubagentProfile { match self { Self::Explore => "helper_explore".to_string(), Self::Review => "helper_review".to_string(), + Self::Judge => "helper_judge".to_string(), Self::Custom { slug, .. } => format!("helper_custom_{slug}"), } } @@ -364,6 +387,8 @@ impl SubagentProfile { match self { Self::Explore => false, Self::Review => true, + // Judge may delegate explore/review/parallel to gather evidence. + Self::Judge => true, Self::Custom { can_delegate, .. } => *can_delegate, } } @@ -374,6 +399,11 @@ impl SubagentProfile { pub fn max_delegation_depth(&self) -> u32 { match self { Self::Explore | Self::Review => BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH, + // Judge is delegated by the main agent (depth 1) and must be + // accepted at depth 2 (the main agent's child depth). It may itself + // delegate explore/review at depth 3, which remains within + // BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH and GLOBAL_MAX_DELEGATION_DEPTH. + Self::Judge => 2, Self::Custom { max_delegation_depth, .. 
@@ -435,6 +465,7 @@ impl SubagentProfile { match self { Self::Explore => include_str!("../prompt/templates/subagent/explore.md").to_string(), Self::Review => include_str!("../prompt/templates/subagent/review.md").to_string(), + Self::Judge => include_str!("../prompt/templates/subagent/judge.md").to_string(), Self::Custom { system_prompt, .. } => system_prompt.clone(), } } @@ -632,6 +663,74 @@ impl SubagentProfile { ]); } + if *self == Self::Judge { + // Judge keeps file tools read-only but is allowed a diagnostic-only + // shell plus read-only git/terminal inspection for verification. + tools.extend([ + AgentTool::new( + "git_status", + "Git Status", + "Inspect repository status in the current workspace without modifying anything.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to narrow the status query." } + } + }), + ), + AgentTool::new( + "git_diff", + "Git Diff", + "Read the current Git diff in the workspace, optionally scoped to a path or staged changes.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to inspect." }, + "staged": { "type": "boolean", "description": "Set true to inspect staged changes instead of working tree changes." }, + "contextLines": { + "type": "integer", + "minimum": 1, + "maximum": 20, + "description": "Optional number of unified diff context lines. Defaults to 3 and is capped for safety." + } + } + }), + ), + AgentTool::new( + "term_status", + "Terminal Status", + TERM_STATUS_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "term_output", + "Terminal Output", + TERM_OUTPUT_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "shell", + "Run Command", + "Run a non-interactive shell command inside the current workspace. 
Judge may use this ONLY for diagnostic and verification commands such as tests, type-checks, linters, and read-only inspection. Never use it to modify files, delete data, install dependencies, start long-running or interactive processes, or change global state.", + serde_json::json!({ + "type": "object", + "properties": { + "command": { "type": "string" }, + "cwd": { "type": "string" }, + "timeout": { "type": "number" } + }, + "required": ["command"] + }), + ), + ]); + } + tools } @@ -902,6 +1001,54 @@ mod tests { ); } + #[test] + fn judge_tool_parses_but_is_not_in_builtin_catalog() { + assert_eq!( + RuntimeOrchestrationTool::parse("agent_judge"), + Some(RuntimeOrchestrationTool::Judge) + ); + // Judge is main-agent-only: it must NOT be part of the built-in + // delegation catalog that subagents can reach. + let catalog = runtime_orchestration_tools(); + assert!(!catalog.iter().any(|tool| tool.name == "agent_judge")); + } + + #[test] + fn judge_profile_is_read_only_with_diagnostic_shell() { + let tools = SubagentProfile::Judge.helper_tools(false); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + + assert!(tool_names.contains(&"read")); + assert!(tool_names.contains(&"list")); + assert!(tool_names.contains(&"find")); + assert!(tool_names.contains(&"search")); + assert!(tool_names.contains(&"shell")); + // Read-only: no file mutation or interactive terminal tools. 
+ assert!(!tool_names.contains(&"edit")); + assert!(!tool_names.contains(&"write")); + assert!(!tool_names.contains(&"term_write")); + assert!(!tool_names.contains(&"term_restart")); + assert!(!tool_names.contains(&"term_close")); + } + + #[test] + fn judge_can_delegate_at_depth_two() { + assert!(SubagentProfile::Judge.can_delegate()); + assert_eq!(SubagentProfile::Judge.max_delegation_depth(), 2); + assert_eq!(SubagentProfile::Judge.helper_kind(), "helper_judge"); + } + + #[test] + fn judge_is_never_a_delegation_target_for_helpers() { + // Even a Judge that can delegate only receives explore/review/parallel, + // never agent_judge. + let tools = SubagentProfile::Judge.delegation_tools_for_helper(3, &[]); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + assert!(!tool_names.contains(&"agent_judge")); + assert!(tool_names.contains(&"agent_explore")); + assert!(tool_names.contains(&"agent_review")); + } + #[test] fn agent_parallel_tool_schema_has_bounded_tasks() { let tool = RuntimeOrchestrationTool::Parallel.as_agent_tool(); diff --git a/src-tauri/src/gateway/gateway_runner.rs b/src-tauri/src/gateway/gateway_runner.rs index 62889d44..42f4167b 100644 --- a/src-tauri/src/gateway/gateway_runner.rs +++ b/src-tauri/src/gateway/gateway_runner.rs @@ -920,7 +920,7 @@ async fn dispatch_command( .await?; // Build a kickoff prompt similar to the GUI /goal path let kickoff = format!( - "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. Work toward it.\nWhen the goal is fully achieved, you MUST call:\n```json\ngoal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. 
I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")\n```\nDo NOT mark complete without verified evidence.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", + "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. Work toward it.\nCompletion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:\n```json\nagent_judge(task=\"explain why you believe the goal is achieved / what to verify\")\n```\nAn independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. If a verification does not pass, fix the reported findings and call agent_judge again.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", objective, ); run_agent_prompt( diff --git a/src-tauri/src/ipc/frontend_channels.rs b/src-tauri/src/ipc/frontend_channels.rs index 9778222e..48990b7d 100644 --- a/src-tauri/src/ipc/frontend_channels.rs +++ b/src-tauri/src/ipc/frontend_channels.rs @@ -223,9 +223,10 @@ pub enum ThreadStreamEvent { error: Option, }, // ── Goal events ── - // GoalStateUpdated and GoalCompleted are emitted by execute_goal_tool - // (create_goal, goal_scored tools in AgentSession). GoalContinuation and - // GoalPaused are emitted by backend run-lifecycle goal orchestration after + // GoalStateUpdated and GoalCompleted are emitted by the agent_judge + // acceptance flow (execute_judge_tool in AgentSession) when the Judge + // records a verdict. GoalContinuation and GoalPaused are emitted by backend + // run-lifecycle goal orchestration after // terminal runs are evaluated. 
The frontend also consumes goal state via // goal_get_state / goal_evaluate command APIs. GoalStateUpdated { diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index cbef21a4..1868fb28 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -98,10 +98,9 @@ impl PauseReason { pub enum GoalVerdict { /// Goal is still active — inject continuation prompt Continue, - /// Model claimed completion but evidence is missing — inject challenge + /// Model claimed completion but has not yet requested Judge verification — + /// inject a challenge nudging it to call `agent_judge`. ChallengeEvidence, - /// Goal achieved with evidence - Complete { evidence: String }, /// Goal paused for a specific reason Paused { reason: PauseReason, @@ -127,6 +126,16 @@ pub struct GoalRecord { pub pause_detail: Option, pub evidence: Option, pub last_evaluated_run_id: Option, + /// Whether the most recent Judge verdict passed acceptance. + pub judge_passed: bool, + /// Latest Judge completeness percentage (0-100), if evaluated. + pub judge_completeness: Option, + /// Latest Judge findings as a JSON array string, if evaluated. + pub judge_findings: Option, + /// Latest Judge summary / acceptance rationale, if evaluated. + pub judge_summary: Option, + /// Run id of the run during which the latest Judge verdict was recorded. 
+ pub judge_evaluated_run_id: Option, pub created_at: DateTime, pub updated_at: DateTime, } @@ -153,6 +162,15 @@ pub struct GoalDto { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, pub created_at: String, pub updated_at: String, } @@ -173,6 +191,11 @@ impl From for GoalDto { pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, created_at: r.created_at.to_rfc3339(), updated_at: r.updated_at.to_rfc3339(), } @@ -208,6 +231,15 @@ pub struct GoalPayload { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, } impl From for GoalPayload { @@ -226,6 +258,11 @@ impl From for GoalPayload { pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, } } } diff --git 
a/src-tauri/src/model/subagent.rs b/src-tauri/src/model/subagent.rs index d7207ab8..5f6ae6c4 100644 --- a/src-tauri/src/model/subagent.rs +++ b/src-tauri/src/model/subagent.rs @@ -137,7 +137,7 @@ pub struct ProfileSubagentAccessRecord { // Reserved slugs that cannot be used for custom subagents // --------------------------------------------------------------------------- -pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review"]; +pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review", "judge"]; /// Validate that a slug is well-formed and not reserved. pub fn validate_slug(slug: &str) -> Result<(), &'static str> { diff --git a/src-tauri/src/persistence/repo/goal_repo.rs b/src-tauri/src/persistence/repo/goal_repo.rs index 9b758c84..3424104a 100644 --- a/src-tauri/src/persistence/repo/goal_repo.rs +++ b/src-tauri/src/persistence/repo/goal_repo.rs @@ -6,7 +6,8 @@ use crate::model::goal::{GoalRecord, GoalStatus, PauseReason}; const SELECT_COLUMNS: &str = "id, thread_id, objective, status, token_budget, tokens_used, \ time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ - last_evaluated_run_id, created_at, updated_at"; + last_evaluated_run_id, judge_passed, judge_completeness, judge_findings, judge_summary, \ + judge_evaluated_run_id, created_at, updated_at"; // ── Database row (raw sqlx types) ── @@ -25,6 +26,11 @@ struct GoalRow { pause_detail: Option, evidence: Option, last_evaluated_run_id: Option, + judge_passed: i64, + judge_completeness: Option, + judge_findings: Option, + judge_summary: Option, + judge_evaluated_run_id: Option, created_at: String, updated_at: String, } @@ -45,6 +51,11 @@ impl GoalRow { pause_detail: self.pause_detail, evidence: self.evidence, last_evaluated_run_id: self.last_evaluated_run_id, + judge_passed: self.judge_passed != 0, + judge_completeness: self.judge_completeness, + judge_findings: self.judge_findings, + judge_summary: self.judge_summary, + judge_evaluated_run_id: 
self.judge_evaluated_run_id, created_at: DateTime::parse_from_rfc3339(&self.created_at) .map(|dt| dt.with_timezone(&Utc)) .unwrap_or_else(|_| Utc::now()), @@ -80,6 +91,10 @@ pub async fn find_by_id(pool: &SqlitePool, id: &str) -> Result Result<(), AppError> { + // Note: the judge_* columns are intentionally omitted here and rely on the + // DDL defaults (judge_passed=0, others NULL) set by the goal_judge_fields + // migration. New goals always start un-verified, and the Judge verdict is + // written later via record_judge_verdict(). let now = Utc::now().to_rfc3339(); sqlx::query( "INSERT INTO goals (id, thread_id, objective, status, token_budget, tokens_used, \ @@ -196,3 +211,66 @@ pub async fn delete_by_thread_id(pool: &SqlitePool, thread_id: &str) -> Result 0) } + +/// Persist the most recent Judge verdict for a goal. Always updates the +/// `judge_*` columns. When `passed` is true, the same transaction also writes +/// `status='complete'` and `evidence=summary` so that acceptance +/// (`status=complete` AND `judge_passed=1`) can never be observed as a +/// half-applied state. When `passed` is false the goal's `status` is left +/// unchanged (typically still `active`). +#[allow(clippy::too_many_arguments)] +pub async fn record_judge_verdict( + pool: &SqlitePool, + id: &str, + run_id: &str, + passed: bool, + completeness: i64, + findings_json: &str, + summary: &str, +) -> Result { + let now = Utc::now().to_rfc3339(); + let mut tx = pool.begin().await?; + + let updated = sqlx::query( + "UPDATE goals SET \ + judge_passed = ?, \ + judge_completeness = ?, \ + judge_findings = ?, \ + judge_summary = ?, \ + judge_evaluated_run_id = ?, \ + updated_at = ? 
\ + WHERE id = ?", + ) + .bind(if passed { 1_i64 } else { 0_i64 }) + .bind(completeness) + .bind(findings_json) + .bind(summary) + .bind(run_id) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + + if updated.rows_affected() == 0 { + tx.rollback().await?; + return Ok(false); + } + + if passed { + sqlx::query( + "UPDATE goals SET \ + status = 'complete', \ + evidence = COALESCE(NULLIF(?, ''), evidence), \ + updated_at = ? \ + WHERE id = ?", + ) + .bind(summary) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + } + + tx.commit().await?; + Ok(true) +} diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 48b91647..20ead9f9 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -3,7 +3,7 @@ mod tests { use sqlx::sqlite::{SqliteConnectOptions, SqlitePool, SqlitePoolOptions}; use std::str::FromStr; use tiycode_lib::core::app_state::GoalRuntimeState; - use tiycode_lib::core::goal_manager::{ChallengePromptVariant, GoalManager}; + use tiycode_lib::core::goal_manager::GoalManager; use tiycode_lib::model::goal::{GoalStatus, GoalVerdict, PauseReason}; use tiycode_lib::persistence::repo::goal_repo; @@ -258,7 +258,7 @@ mod tests { let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // Model says "done" but doesn't call goal_scored + // Model says "done" but doesn't call agent_judge let verdict = mgr.evaluate_after_turn( "All done! 
The goal is complete and everything is finished.", &goal, @@ -371,23 +371,20 @@ mod tests { let prompt = mgr.render_continuation_prompt(&goal); assert!(prompt.contains("Build feature X")); - assert!(prompt.contains("goal_scored")); + assert!(prompt.contains("agent_judge")); assert!(prompt.contains("clarify")); } #[tokio::test] - async fn challenge_prompt_renders_variants() { + async fn challenge_prompt_guides_to_judge() { let mgr = GoalManager::new(setup_pool().await, "thread-1".into(), test_runtime()); - let no_evidence = mgr.render_challenge_prompt(ChallengePromptVariant::NoEvidence); - assert!(no_evidence.contains("did not provide evidence")); - - let no_tool = mgr.render_challenge_prompt(ChallengePromptVariant::NoTool); - assert!(no_tool.contains("provide concrete evidence")); - assert!(no_tool.contains("goal_scored")); + let prompt = mgr.render_challenge_prompt(); + assert!(prompt.contains("agent_judge")); + assert!(prompt.contains("cannot self-declare")); } - // ── #1 / #8: Tests for goal_scored validation logic & test gap coverage ── + // ── mark_complete validation & test gap coverage ── #[tokio::test] async fn mark_complete_rejects_empty_evidence() { @@ -456,17 +453,135 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_goal_scored_not_blocking() { + async fn evaluate_after_turn_agent_judge_not_blocking() { let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // goal_scored should NOT trigger a pause in evaluation - mgr.record_tool_call("goal_scored"); - let verdict = mgr.evaluate_after_turn("Calling goal_scored", &goal); + // agent_judge should NOT trigger a pause in evaluation + mgr.record_tool_call("agent_judge"); + let verdict = mgr.evaluate_after_turn("Calling agent_judge", &goal); assert!(matches!(verdict, GoalVerdict::Continue)); } + // ── Judge verdict persistence (record_judge_verdict) ── + + #[tokio::test] + async fn 
record_judge_verdict_pass_marks_complete_and_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + true, + 100, + "[]", + "All requirements verified; tests pass.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + assert_eq!( + updated.evidence.as_deref(), + Some("All requirements verified; tests pass.") + ); + assert_eq!( + updated.judge_evaluated_run_id.as_deref(), + Some("run-judge-1") + ); + + // A verified goal stops continuation. + let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + assert!(outcome.continuation_prompt.is_none()); + } + + #[tokio::test] + async fn record_judge_verdict_fail_keeps_active_and_persists_findings() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let findings = serde_json::to_string(&vec![ + "Missing unit tests for module X".to_string(), + "Build fails on Windows".to_string(), + ]) + .unwrap(); + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + false, + 60, + &findings, + "Not yet complete.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Active); + assert!(!updated.judge_passed); + assert!(updated.judge_findings.is_some()); + + // Continuation prompt should surface the latest findings. 
+ let prompt = mgr.render_continuation_prompt(&updated); + assert!(prompt.contains("Missing unit tests for module X")); + assert!(prompt.contains("agent_judge")); + } + + #[tokio::test] + async fn migration_backfills_legacy_complete_goal_as_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Legacy goal", None).await.unwrap(); + + // Simulate a legacy completed goal (no judge fields set yet). + sqlx::query( + "UPDATE goals SET status = 'complete', evidence = 'legacy evidence' WHERE id = ?", + ) + .bind(&goal.id) + .execute(&pool) + .await + .unwrap(); + // Apply the same backfill the migration performs. + sqlx::query( + "UPDATE goals SET judge_passed = 1, \ + judge_summary = COALESCE(judge_summary, evidence), \ + judge_completeness = COALESCE(judge_completeness, 100) \ + WHERE status = 'complete'", + ) + .execute(&pool) + .await + .unwrap(); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + + // It must not be re-opened by continuation. 
+ let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + } + #[tokio::test] async fn evaluate_after_turn_chinese_idle_phrase_pauses() { let pool = setup_pool().await; diff --git a/src/i18n/locales/en.ts b/src/i18n/locales/en.ts index 9dd01685..f3fd2940 100644 --- a/src/i18n/locales/en.ts +++ b/src/i18n/locales/en.ts @@ -1094,6 +1094,7 @@ const en: Record = { "goal.status.paused": "Paused", "goal.status.budgetLimited": "Budget Exhausted", "goal.status.complete": "Complete", + "goal.status.verified": "Verified", "goal.time.elapsed": "Running for {{time}}", "goal.time.hoursMinutes": "{{hours}}h {{minutes}}m", "goal.time.minutesSeconds": "{{minutes}}m {{seconds}}s", diff --git a/src/i18n/locales/zh-CN.ts b/src/i18n/locales/zh-CN.ts index 5b3272d2..3f5c5164 100644 --- a/src/i18n/locales/zh-CN.ts +++ b/src/i18n/locales/zh-CN.ts @@ -1133,6 +1133,7 @@ const zhCN = { "goal.status.paused": "已暂停", "goal.status.budgetLimited": "预算耗尽", "goal.status.complete": "已完成", + "goal.status.verified": "已验收通过", "goal.time.elapsed": "已持续运行{{time}}", "goal.time.hoursMinutes": "{{hours}}小时{{minutes}}分", "goal.time.minutesSeconds": "{{minutes}}分{{seconds}}秒", diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index 4c083f85..97ec8701 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -101,6 +101,11 @@ export interface GoalStoreState { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: string | null; + judgeEvaluatedRunId?: string | null; } // --------------------------------------------------------------------------- diff --git a/src/modules/workbench-shell/ui/goal-status-bar.tsx 
b/src/modules/workbench-shell/ui/goal-status-bar.tsx index 45c42922..83c2bbf7 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.tsx @@ -35,7 +35,7 @@ export function GoalStatusBar({ threadId }: Props) { case "active": return "goal.status.active"; case "paused": return "goal.status.paused"; case "budget_limited": return "goal.status.budgetLimited"; - case "complete": return "goal.status.complete"; + case "complete": return goal.judgePassed ? "goal.status.verified" : "goal.status.complete"; default: return "goal.status.active"; } })(); diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx index 6fbfb3e3..db893329 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx +++ b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx @@ -1689,11 +1689,11 @@ export function RuntimeThreadSurface({ "**" + argText + "**", "", "This goal has been created and is now **active**. Work toward it.", - "When the goal is fully achieved, you MUST call:", + "Completion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:", "```json", - "goal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")", + "agent_judge(task=\"explain why you believe the goal is achieved / what to verify\")", "```", - "Do NOT mark complete without verified evidence.", + "An independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. 
If a verification does not pass, fix the reported findings and call agent_judge again.", "", "If you need user input before proceeding, use the clarify tool.", "The goal will automatically pause and resume when the user responds.", diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index 6e9ed990..d43262ca 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -735,6 +735,11 @@ export type GoalPayload = { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: string | null; + judgeEvaluatedRunId?: string | null; }; export async function goalGetState(threadId: string): Promise { @@ -768,7 +773,7 @@ export async function goalClear(threadId: string): Promise { export type GoalEvaluateResult = { goal: GoalPayload; - verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited"; + verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited" | "skipped"; continuationPrompt?: string | null; }; From 3b77dd12c5c03c62091be6bef1e25ea2f8ea9a2b Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:39:05 +0800 Subject: [PATCH 2/8] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20remo?= =?UTF-8?q?ve=20mark=5Fcomplete=20and=20complete=20verdict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the mark_complete pathway from goals as completion will be handled through a different mechanism: - Remove mark_complete method from GoalManager - Remove "complete" from GoalEvaluateResult verdict type - Remove mark_complete test cases (evidence validation, etc.) 
- Update subagent surface comments to include judge BREAKING CHANGE: GoalEvaluateResult.verdict no longer includes "complete" --- src-tauri/src/core/goal_manager.rs | 23 --------------- src-tauri/src/core/prompt/surface.rs | 4 +-- src-tauri/tests/goal_lifecycle.rs | 40 --------------------------- src/services/bridge/agent-commands.ts | 2 +- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index 7486e39b..bd7298db 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -213,29 +213,6 @@ impl GoalManager { Ok(()) } - /// Mark the goal as complete with evidence. - pub async fn mark_complete(&self, goal_id: &str, evidence: &str) -> Result<(), AppError> { - if evidence.trim().is_empty() { - return Err(AppError::validation( - ErrorSource::Settings, - "evidence is required to mark a goal as complete", - )); - } - let updated = goal_repo::update_status( - &self.pool, - goal_id, - GoalStatus::Complete, - None, - None, - Some(evidence), - ) - .await?; - if !updated { - return Err(AppError::not_found(ErrorSource::Settings, "goal")); - } - Ok(()) - } - /// Mark the goal as budget-limited. 
pub async fn mark_budget_limited(&self, goal_id: &str) -> Result<(), AppError> { let updated = goal_repo::update_status( diff --git a/src-tauri/src/core/prompt/surface.rs b/src-tauri/src/core/prompt/surface.rs index 92b6ef76..554bb0b4 100644 --- a/src-tauri/src/core/prompt/surface.rs +++ b/src-tauri/src/core/prompt/surface.rs @@ -48,9 +48,9 @@ pub enum SurfacePattern { AnyMainAgent, /// Matches a specific MainAgent run_mode MainAgent(RunMode), - /// Matches any subagent surface (explore, review, custom) + /// Matches any subagent surface (explore, review, judge, custom) AnySubagent, - /// Matches built-in explore + review subagents only + /// Matches built-in explore + review + judge subagents only BuiltinSubagent, /// Matches any custom subagent regardless of slug CustomSubagent, diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 20ead9f9..ecff3198 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -320,24 +320,6 @@ mod tests { assert_eq!(paused.status, GoalStatus::Paused); } - #[tokio::test] - async fn mark_complete_with_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - mgr.mark_complete(&goal.id, "All tests pass, files created") - .await - .unwrap(); - - let completed = mgr.get_active().await.unwrap().unwrap(); - assert_eq!(completed.status, GoalStatus::Complete); - assert_eq!( - completed.evidence.as_deref(), - Some("All tests pass, files created") - ); - } - #[tokio::test] async fn mark_budget_limited() { let pool = setup_pool().await; @@ -384,28 +366,6 @@ mod tests { assert!(prompt.contains("cannot self-declare")); } - // ── mark_complete validation & test gap coverage ── - - #[tokio::test] - async fn mark_complete_rejects_empty_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), 
test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, "").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); - } - - #[tokio::test] - async fn mark_complete_rejects_whitespace_only_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, " ").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); - } - #[tokio::test] async fn evaluate_after_turn_token_budget_exhausted_returns_budget_limited() { let pool = setup_pool().await; diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index d43262ca..d6ec3012 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -773,7 +773,7 @@ export async function goalClear(threadId: string): Promise { export type GoalEvaluateResult = { goal: GoalPayload; - verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited" | "skipped"; + verdict: "continue" | "challenge_evidence" | "paused" | "budget_limited" | "skipped"; continuationPrompt?: string | null; }; From b204d9b92d6ae7f4b9cc7c7a4edeb9dfe60a420e Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:47:23 +0800 Subject: [PATCH 3/8] =?UTF-8?q?docs:=20=F0=9F=93=9D=20update=20and=20reord?= =?UTF-8?q?er=20README=20feature=20list?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the feature descriptions and reorder the bullet points in both README.md and README_zh.md to better reflect the current product capabilities and improve readability. 
Changes include: - Reordering features to highlight persistent goal management, real-time streaming, and extensibility earlier in the list - Updating descriptions for several features to be more accurate - Maintaining consistency between English and Chinese versions - Keeping the overall structure while improving flow These are documentation-only changes that do not affect functionality. --- README.md | 12 ++++++------ README_zh.md | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d7d030bf..a99948d1 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,22 @@ Around that collaboration model, TiyCode brings together Agent Profiles, workspa - **AI-first coding collaboration.** TiyCode is designed around the idea that humans express intent through conversation while agents take the lead in execution. - **Agent Profiles.** Mix models from different providers, tune response style, language, and custom instructions, and switch profiles flexibly for different kinds of work. +- **Persistent goal management.** Define long-running objectives for agents to pursue across multiple turns. An independent Judge subagent evaluates completion against actual file changes, command outputs, and commit history — eliminating self-attestation bias. - **Custom Agents.** Create purpose-built sub-agents in Settings — each with its own name, system prompt, model tier, and allowed tools — then grant per-profile access and delegate work from the composer. - **Three-tier model architecture.** Each profile supports a Primary model for core reasoning, an Auxiliary model for helper tasks, and a Lightweight model for fast operations — with automatic fallback chains across tiers. - **Multi-provider support.** Connect to 13+ LLM providers out of the box — OpenAI, Anthropic, Google, Ollama, xAI, Groq, OpenRouter, DeepSeek, MiniMax, Kimi, and more — or add any OpenAI-compatible endpoint as a custom provider. 
- **Workspace-centered execution.** Threads stay grounded in the local workspace and connect naturally to code review, version control, repository inspection, Git worktrees, and terminal workflows. - **Task-aware execution.** Thread-scoped task boards, plan checkpoints, tool status events, and subagent progress make longer runs easier to follow and review. -- **Persistent goal management.** Set long-running objectives for agents to pursue across multiple turns, with automatic continuation, budget controls, and progress tracking. +- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. - **Rich composer inputs.** Prompt input supports text, file/photo attachments, screenshots, slash command structured argument interpolation (`--key=value`, positional args, `{{placeholder}}` templates), and large-paste handling. - **Steer & Queue.** While the agent is running, choose to steer the conversation mid-execution or queue a follow-up message for the next round — keeping you in control without interrupting the workflow. -- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. -- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. -- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. 
-- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. +- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. - **ACP Server support.** TiyCode can run as a headless ACP (Agent Client Protocol) server via `tiycode acp --stdio` or `tiycode acp --http `, letting external tools and IDE plugins drive the agent runtime through a standard JSON-RPC protocol without the desktop GUI. - **IM channel gateway.** Connect TiyCode to WeChat or WeCom so you can chat with the agent directly from your messaging app — scan a QR code to log in, send messages and attachments, and receive streaming responses without opening the desktop GUI. -- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. +- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. +- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. - **Built-in runtime path.** The main execution flow is `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`. +- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. 
## Tech Stack diff --git a/README_zh.md b/README_zh.md index c9bbdcde..dc615077 100644 --- a/README_zh.md +++ b/README_zh.md @@ -28,22 +28,22 @@ TiyCode 面向的是希望以 AI 时代的方式进行编码协作的用户。 - **AI First 的编码协作。** TiyCode 围绕"通过对话表达意图,Agent 全面执行"这一理念来设计产品形态。 - **Agent Profile。** 支持自由组合不同服务商的模型,并可配置回复风格、回复语言、自定义指令等设定,且能在不同 Profile 之间灵活切换。 +- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,由独立的 Judge 验收 Agent 基于实际文件变更、命令输出和提交历史进行完成判定——杜绝"自说自话"的信任缺陷。 - **Custom Agents。** 在设置中心创建专用子 Agent——每个拥有独立的名称、系统提示、模型层级和可用工具——按 Profile 授权后即可从 composer 委派任务。 - **三层模型架构。** 每个 Profile 支持配置 Primary 主力模型、Auxiliary 辅助模型和 Lightweight 轻量模型三个层级,层级之间具备自动回退链路。 - **多服务商接入。** 开箱支持 13+ 家 LLM 服务商 —— OpenAI、Anthropic、Google、Ollama、xAI、Groq、OpenRouter、DeepSeek、MiniMax、Kimi 等,也可将任何 OpenAI 兼容端点作为自定义 Provider 接入。 - **以工作区为中心的执行体验。** 对话线程扎根本地工作区,并与代码审阅、版本控制、仓库状态读取、Git worktree 和 Terminal 工作流自然衔接。 - **面向任务的执行可观测性。** Thread 级任务板、Plan checkpoint、工具状态事件和子 Agent 进度让长任务更容易跟踪和复查。 -- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,支持自动延续、预算控制和进度跟踪。 +- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 - **更丰富的输入能力。** Prompt 输入支持文本、文件 / 图片附件、截图、Slash Command 结构化参数插值(`--key=value`、位置参数、`{{placeholder}}` 模板变量)以及大段文本粘贴处理。 - **Steer 与 Queue。** Agent 运行中可选择「引导」即时插入消息调整方向,或「排队」将消息留待当前运行结束后再发起下一轮——无需中断工作流即可保持掌控。 -- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 -- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 -- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 -- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 +- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 `Extensions Center` 形成统一的扩展入口与产品模型。 - **ACP Server 支持。** TiyCode 可作为无头 ACP(Agent Client Protocol)服务器运行,通过 `tiycode acp --stdio` 或 `tiycode acp --http ` 启动,让外部工具和 IDE 插件通过标准 JSON-RPC 协议驱动 Agent 运行时,无需启动桌面 GUI。 - **IM 通道网关。** 将 TiyCode 接入微信或企业微信,扫码登录后即可在聊天应用中直接与 Agent 对话——发送消息和附件、接收流式回复,无需打开桌面 GUI。 -- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 
`Extensions Center` 形成统一的扩展入口与产品模型。 +- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 +- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 - **内置 Runtime。** 主执行链路 `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`。 +- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 ## 技术栈 From e284fbeb808919994d112a38bbd02a443e226221 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:57:00 +0800 Subject: [PATCH 4/8] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20extr?= =?UTF-8?q?act=20resolveGoalStatusKey=20for=20testability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract inline status key resolution into a pure exported function so the complete→verified (judgePassed) branch can be unit-tested without mounting the component - Add unit tests covering all status mappings and judgePassed variants - Add test for skipped verdict passthrough in goalEvaluate --- .../ui/goal-status-bar.test.tsx | 21 +++++++++++ .../workbench-shell/ui/goal-status-bar.tsx | 35 +++++++++++++------ src/services/bridge/agent-commands.test.ts | 10 ++++++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx index 24976d9e..6c049117 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx @@ -1,4 +1,5 @@ import { describe, expect, it } from "vitest"; +import { resolveGoalStatusKey } from "./goal-status-bar"; const source = await import("./goal-status-bar?raw").then((module) => module.default as string); @@ -21,3 +22,23 @@ describe("GoalStatusBar layout contract", () => { expect(source).not.toContain("goal.time.hoursMinutes"); }); }); + +describe("resolveGoalStatusKey", () => { + it("maps non-complete statuses to their own keys", () => { + expect(resolveGoalStatusKey("active", 
undefined)).toBe("goal.status.active"); + expect(resolveGoalStatusKey("paused", undefined)).toBe("goal.status.paused"); + expect(resolveGoalStatusKey("budget_limited", undefined)).toBe("goal.status.budgetLimited"); + }); + + it("shows the verified label only when a complete goal passed Judge acceptance", () => { + expect(resolveGoalStatusKey("complete", true)).toBe("goal.status.verified"); + }); + + it("falls back to the plain complete label when judge has not passed", () => { + expect(resolveGoalStatusKey("complete", false)).toBe("goal.status.complete"); + }); + + it("treats an undefined judgePassed as not verified", () => { + expect(resolveGoalStatusKey("complete", undefined)).toBe("goal.status.complete"); + }); +}); diff --git a/src/modules/workbench-shell/ui/goal-status-bar.tsx b/src/modules/workbench-shell/ui/goal-status-bar.tsx index 83c2bbf7..86ace721 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.tsx @@ -2,13 +2,36 @@ import { useCallback, useState } from "react"; import { goalGetState, goalPause, goalResume, goalClear } from "@/services/bridge/agent-commands"; -import { threadStore, useStore, shallowEqual } from "@/modules/workbench-shell/model/thread-store"; +import { threadStore, useStore, shallowEqual, type GoalStoreState } from "@/modules/workbench-shell/model/thread-store"; import { useT } from "@/i18n"; type Props = { threadId: string; }; +/** + * Resolve the i18n key for the goal status label. Extracted as a pure function + * so the `complete` → `verified` (judgePassed) branch can be unit-tested without + * mounting the component. 
+ */ +export function resolveGoalStatusKey( + status: GoalStoreState["status"], + judgePassed: GoalStoreState["judgePassed"], +): + | "goal.status.active" + | "goal.status.paused" + | "goal.status.budgetLimited" + | "goal.status.verified" + | "goal.status.complete" { + switch (status) { + case "active": return "goal.status.active"; + case "paused": return "goal.status.paused"; + case "budget_limited": return "goal.status.budgetLimited"; + case "complete": return judgePassed ? "goal.status.verified" : "goal.status.complete"; + default: return "goal.status.active"; + } +} + export function GoalStatusBar({ threadId }: Props) { const t = useT(); const goal = useStore(threadStore, (s) => s.goalState[threadId] ?? null, shallowEqual); @@ -30,15 +53,7 @@ export function GoalStatusBar({ threadId }: Props) { if (!goal) return null; - const statusKey = (() => { - switch (goal.status) { - case "active": return "goal.status.active"; - case "paused": return "goal.status.paused"; - case "budget_limited": return "goal.status.budgetLimited"; - case "complete": return goal.judgePassed ? "goal.status.verified" : "goal.status.complete"; - default: return "goal.status.active"; - } - })(); + const statusKey = resolveGoalStatusKey(goal.status, goal.judgePassed); const statusColor = goal.status === "active" ? 
"bg-blue-500" diff --git a/src/services/bridge/agent-commands.test.ts b/src/services/bridge/agent-commands.test.ts index 4ffb735a..25695b82 100644 --- a/src/services/bridge/agent-commands.test.ts +++ b/src/services/bridge/agent-commands.test.ts @@ -608,6 +608,16 @@ describe("goalEvaluate", () => { expect(result).toBeNull(); }); + it("passes through the skipped verdict for already-accepted goals", async () => { + isTauriMock.mockReturnValue(true); + const result = makeGoalEvaluateResult({ verdict: "skipped", continuationPrompt: null }); + invokeMock.mockResolvedValueOnce(result); + + const outcome = await goalEvaluate("thread-1"); + expect(outcome!.verdict).toBe("skipped"); + expect(outcome!.continuationPrompt).toBeNull(); + }); + it("requires Tauri runtime", async () => { isTauriMock.mockReturnValue(false); From e8a58f2767fb3ecaa6cb2486cf9cc3e2c80c9971 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 14:59:06 +0800 Subject: [PATCH 5/8] =?UTF-8?q?refactor(subagent):=20=F0=9F=94=A7=20increa?= =?UTF-8?q?se=20builtin=20default=20max=20delegation=20depth=20to=205?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raise `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH` from 3 to 5 to match the existing `GLOBAL_MAX_DELEGATION_DEPTH`, allowing built-in subagents (explore/review) to be delegated to the same depth as custom profiles. Update delegation validation tests to reflect the new depth limits. 
--- src-tauri/src/core/subagent/orchestrator.rs | 16 +++++++++------- .../src/core/subagent/runtime_orchestration.rs | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src-tauri/src/core/subagent/orchestrator.rs b/src-tauri/src/core/subagent/orchestrator.rs index cb890700..0ee2c210 100644 --- a/src-tauri/src/core/subagent/orchestrator.rs +++ b/src-tauri/src/core/subagent/orchestrator.rs @@ -1909,7 +1909,7 @@ mod tests { #[test] fn validate_delegation_allows_review_to_explore_at_depth_2() { - // Main(1) → review(2): review can delegate, explore.max=3 >= 2. + // Main(1) → review(2): review can delegate, explore.max=5 >= 2. validate_delegation_capability( &SubagentProfile::Review, &RuntimeOrchestrationTool::Explore, @@ -1921,15 +1921,17 @@ mod tests { #[test] fn validate_delegation_rejects_when_child_depth_exceeds_target_max() { - // child_depth 4 exceeds explore.max_delegation_depth (3). + // Custom target with max=4 cannot be reached at depth 5 (exceeds its config but + // still within GLOBAL_MAX_DELEGATION_DEPTH). + let target = custom_profile(true, 4); let err = validate_delegation_capability( &SubagentProfile::Review, - &RuntimeOrchestrationTool::Explore, - &SubagentProfile::Explore, - 4, + &RuntimeOrchestrationTool::Custom("shallow".to_string()), + &target, + 5, ) - .expect_err("depth 4 must exceed explore max depth 3"); - assert!(err.contains("max delegation depth is 3")); + .expect_err("depth 5 must exceed custom max depth 4"); + assert!(err.contains("max delegation depth is 4")); } #[test] diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index 25c72e8a..c458e098 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -14,7 +14,7 @@ pub const GLOBAL_MAX_DELEGATION_DEPTH: u32 = 5; /// Built-in default for the maximum delegation depth a built-in subagent /// (explore / review) may be delegated to. 
-pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 3; +pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 5; pub const TERM_STATUS_TOOL_DESCRIPTION: &str = "Inspect the status of the desktop app's embedded Terminal panel session for the current thread. Use this to check that panel's session state without mutating it. It does not inspect the agent runtime, CLI process, or host shell outside the panel."; @@ -1189,8 +1189,8 @@ mod tests { #[test] fn review_profile_omits_delegation_tools_beyond_builtin_depth() { - // child_depth 4 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (3). - let tools = SubagentProfile::Review.delegation_tools_for_helper(4, &[]); + // child_depth 6 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (5). + let tools = SubagentProfile::Review.delegation_tools_for_helper(6, &[]); assert!(tools.is_empty()); } From c15e885a9c41a4c774f7c7023155f7b6ef6e4d5f Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 15:01:11 +0800 Subject: [PATCH 6/8] =?UTF-8?q?docs:=20=F0=9F=93=9D=20remove=20obsolete=20?= =?UTF-8?q?design=20document?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/goal-judge-evaluation-refactor.md | 346 ------------------------- 1 file changed, 346 deletions(-) delete mode 100644 docs/goal-judge-evaluation-refactor.md diff --git a/docs/goal-judge-evaluation-refactor.md b/docs/goal-judge-evaluation-refactor.md deleted file mode 100644 index ebea6c5f..00000000 --- a/docs/goal-judge-evaluation-refactor.md +++ /dev/null @@ -1,346 +0,0 @@ -# Goal 评估与续行重构方案:引入 Judge 验收 Agent - -> 状态:设计方案(待评审) -> 关联模块:`src-tauri/src/core/goal_manager.rs`、`src-tauri/src/core/subagent/`、`src-tauri/src/core/agent_run_event_handler.rs`、`src-tauri/src/model/goal.rs` -> 决策基线(已澄清): -> 1. **保留全部现有护栏**(idle 空转、clarify/update_plan 暂停、token/turn 预算上限),仅把“是否完成”的判定从自主声明改为 Judge 验收。 -> 2. 
**复用 `GoalStatus::Complete` 状态** 表达“通过验收”,并在 `goals` 表新增 Judge 评估字段持久化最近一次裁决;迁移需把存量 `status='complete'` goal 回填为 `judge_passed=1`。 -> 3. **由主 agent 主动调用 `agent_judge`**,系统在 run 终止后通过续行 prompt 引导主 agent 先验收、未通过则修复后重验。 -> 4. **`agent_judge` 是主 agent 专属工具**:只在有未完成 goal 时注入主 agent,且运行时必须硬性拒绝任何 subagent 递归调用 Judge,即使工具名被 `RuntimeOrchestrationTool::parse()` 解析出来也不能放行。 -> 5. **Judge 使用诊断型 shell 软约束**:Judge 的文件工具保持只读;允许 `shell` 仅用于测试、type-check、lint、只读检查等诊断验证,并通过 Judge prompt 明确禁止用 shell 修改文件、删除数据、安装依赖或改变全局状态。首版不新增受限 shell 沙箱。 -> 6. **Judge 默认使用 primary 模型角色**,优先保证验收质量;首版不把 Judge/subagent 的 token 单独计入 goal token budget,也不新增 Judge 专属硬超时,沿用现有 helper run 的 turn/取消机制。 -> 7. **删除失效的自主完成路径**:移除 `goal_scored`、`GoalVerdict::Complete` 的旧自证语义,以及由 `goal_scored` 空 evidence 触发的 `NoEvidence` / `MISSING_EVIDENCE_PROMPT` 分支。 - ---- - -## 1. 背景与问题 - -当前 goal 的"完成"判定依赖主 agent 自主调用 `goal_scored(status, evidence, pledge)` 工具来声明达成。这是一种**自证式(self-attestation)**设计: - -- 工具内部只校验 `status == "complete"`、`pledge` 文本逐字匹配、`evidence` 非空(见 `agent_session_execution.rs` 的 `execute_goal_tool()`)。 -- 它**无法验证 evidence 的真伪**,也无法核对结果是否真的满足 goal 的一致性与完整性。 - -实测发现部分模型即便明知仍有未完成项,也会照抄 pledge 文本、编造 evidence 来调用 `goal_scored` 并提前结束任务。pledge + evidence 非空这类形式化护栏对"不诚实声明"无效,这是自主声明方式的**设计缺陷**。 - -**核心思路**:把"完成判定权"从被评估者(主 agent)手中移交给独立的评估者(Judge Agent)。主 agent 不能再自己宣布通过;只有 Judge 基于 goal 内容对项目当前状态做出"通过"裁决,goal 记录才会扭转为通过验收状态。续行监督也随之改为以"是否通过验收"为准。 - ---- - -## 2. 
现状梳理(已确认事实) - -### 2.1 Goal 数据模型与持久化 - -- `GoalStatus`(`src-tauri/src/model/goal.rs`):`Active` / `Paused` / `BudgetLimited` / `Complete` 四态。 -- `goals` 表(`migrations/20260530000000_goals.sql` 及后续迁移):每 `thread_id` 唯一一条 goal;含 `status`、`evidence`、`tokens_used`、`turns_used`、`max_turns`、`pause_reason`、`last_evaluated_run_id` 等列。 -- `GoalManager`(`src-tauri/src/core/goal_manager.rs`)封装 CRUD + 评估 + prompt 生成。关键方法:`mark_complete(goal_id, evidence)`、`evaluate_after_turn(response, goal) -> GoalVerdict`(同步 CPU 启发式)、`evaluate_after_run(run_id, response) -> GoalEvaluationOutcome`(异步、含去重 CAS)。 - -### 2.2 `goal_scored` 工具链路 - -- 工具定义在 `agent_session_tools.rs` 的 `runtime_tools_for_profile()`,常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE` 在 `goal_manager.rs`。 -- 调用分派在 `agent_session_execution.rs::execute_tool_call()` → `execute_goal_tool()`:校验 status/pledge/evidence → `mark_complete()` → 发送 `GoalCompleted` + `GoalStateUpdated` 事件。 - -### 2.3 续行监督逻辑 - -- run 终止后,`agent_run_event_handler.rs::maybe_continue_goal_after_terminal_run()` 是入口。 -- 前置条件:`goal_continuation_enabled == true`、`final_status ∈ {Completed, Interrupted}`。 -- 调用 `evaluate_after_run()` 内部走 `evaluate_after_turn()` 分层启发式: - - **Layer 1** 工具阻塞:`clarify` → `Paused(ClarifyPending)`;`update_plan` → `Paused(PlanPending)`;`goal_scored` 放行。 - - **Layer 2** idle/完成声明:连续 idle ≥ `MAX_IDLE_TURNS(3)` → `Paused(IdleBlocked)`;检测到完成关键词但未调工具 → `ChallengeEvidence`(反复声称达上限 → `IdleBlocked`)。 - - **Layer 3** 预算:tokens 超 budget → `BudgetLimited`;turns 超 `max_turns` → `Paused(BudgetExhausted)`。 - - 默认 → `Continue`。 -- verdict 为 `Continue` / `ChallengeEvidence` 时,用 continuation prompt 启动新 run;`Paused` / `BudgetLimited` / `skipped` 时不续行。 -- **关键现状**:续行从不查询 goal 的 `Complete` 状态。它实际依靠"模型没有再触发任何阻塞/完成信号 + goal 仍 `Active`"间接推断。一旦 `goal_scored` 被调用,`mark_complete()` 把 status 写成 `Complete`,下一轮 `evaluate_after_run()` 因 goal 非 `Active` 返回 `skipped`,从而停止续行。 - -### 2.4 Subagent 机制 - -- 内建 subagent:`Explore`、`Review`、`Parallel`,定义在 
`subagent/runtime_orchestration.rs` 的 `RuntimeOrchestrationTool` / `SubagentProfile`。 -- 深度模型:主 agent = depth 1;主 agent 直接子代理 = depth 2(`MAIN_AGENT_CHILD_DEPTH`);`GLOBAL_MAX_DELEGATION_DEPTH = 5`;内建默认 `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH = 3`。 -- 委派校验:`orchestrator.rs::validate_delegation_capability(caller, target_tool, target_profile, child_depth)`,三重检查(调用方 `can_delegate`、全局上限、目标 `max_delegation_depth`)。 -- 权限模型:`Explore` 只读(read/list/find/search/web_search,`can_delegate=false`);`Review` 只读 + 诊断 shell + git/term 只读(`can_delegate=true`);`Custom` 按 `allowed_tools` 白名单。 -- 工具注入:主 agent 在 `agent_session_tools.rs::runtime_tools_for_profile()` 中 `tools.extend(runtime_orchestration_tools())`;自定义在 `agent_session.rs::build_session_spec()` 注入。 -- Prompt 注入:`build_helper_system_prompt()` 按 `PromptSurface`(`prompt/surface.rs`)选择 section;task 通过 `agent.prompt(request.task)` 注入为 user message。 - ---- - -## 3. 设计目标 - -1. 新增内建 **Judge** subagent,对项目当前状态做 goal 达成度评估,结构化返回:通过与否(bool)、完整度百分比、判定依据(未达成/不符合点描述)。 -2. Judge 通过时**扭转 goal 记录为通过验收状态**(复用 `Complete` + 持久化 Judge 字段)。 -3. Judge 上下文注入 goal 内容,评估重点是 goal 要求的**一致性**与**完整性**。 -4. Judge 文件工具保持**只读**,允许 `read` / `list` / `find` / `search` / `web_search`;允许 `shell` 但仅作为诊断型软约束工具用于测试、type-check、lint、只读检查;允许再发起 subagent(含并行,如 explore/review 协助),**自身最大被委派深度为 2**。 -5. **删除 `goal_scored` 工具**。完成判定不再由主 agent 自证。 -6. 续行监督改为:判定 goal 记录是否“通过验收”;未通过且 goal 仍 Active 则续行,并在 continuation prompt 中明确要求主 agent 调用 `agent_judge` 验收并遵循验收结果。 -7. **按需注入**:仅当 thread 有未通过验收的 goal 时,才向**主 agent**注入 `agent_judge` 工具;所有 subagent 均不注入且运行时拒绝递归调用 `agent_judge`;无 goal 或已验收通过时不注入。 - ---- - -## 4. 
总体设计 - -### 4.1 角色与职责重划 - -| 角色 | 重构前 | 重构后 | -|------|--------|--------| -| 主 agent | 自己调 `goal_scored` 声明完成 | 干活 + 自认为完成后调 `agent_judge` 申请验收;不能自证完成 | -| Judge agent | 不存在 | 独立验收者,文件工具只读且 shell 仅诊断软约束,基于 goal 评估项目当前状态,产出结构化裁决;通过则扭转 goal 状态 | -| 续行监督 | 间接依赖 goal 非 Active 停续行 | 显式以"goal 是否通过验收(Complete + judge_passed)"为停续行依据 | - -### 4.2 端到端数据流 - -``` -用户 /goal - └─ goal_set() → create_goal(status=Active) - └─ 注入 ActiveGoalSource 到主 agent system prompt(更新文案:完成须经 agent_judge 验收) - └─ 按需向主 agent 注入 agent_judge 工具(goal 存在且尚未通过验收) - -主 agent run:工作 → 自认为达成 → 调用 agent_judge(task) - └─ execute_tool_call() 路由到 Judge 编排 - └─ HelperAgentOrchestrator::run_helper(SubagentProfile::Judge) - ├─ build_helper_system_prompt(PromptSurface::SubagentJudge) + 注入 goal objective 到上下文 - ├─ Judge 工具集:read/list/find/search/web_search/shell(仅诊断软约束) + (depth 允许时)agent_explore/agent_review/agent_parallel - ├─ Judge 调研验证:读代码、搜索、运行测试/type-check/lint 等诊断命令、并行 explore/review - └─ 产出结构化 JudgeReport { passed, completeness_pct, findings, summary } - └─ Judge 编排回写 goal 记录: - ├─ 总是:persist 最近一次 judge_passed / judge_completeness / judge_findings / judge_summary / judge_evaluated_run_id - └─ passed == true:事务写入 status=Complete + judge_passed=true + evidence=summary - 发送 GoalCompleted + GoalStateUpdated 事件 - └─ agent_judge 工具结果(JudgeReport 文本)返回给主 agent - -run 终止 - └─ maybe_continue_goal_after_terminal_run() - └─ evaluate_after_run() - ├─ 若 goal.status == Complete && goal.judge_passed == true(已通过验收)→ skipped(停续行)✅ - ├─ 若 goal.status != Active → skipped(非活跃 goal 不自动续行,保留现有暂停/预算语义) - ├─ 否则保留现有护栏:clarify/update_plan/idle/预算 → Paused/BudgetLimited - └─ 否则 → Continue:注入新版 continuation prompt - "你尚未通过验收。请先用 agent_judge 验收;若上次验收未通过, - 按 findings 修复后再次调用 agent_judge。" - └─ Continue → 启动新 run(回到主 agent run) -``` - -### 4.3 为什么选择这套方案(与备选对比) - -- **复用 `Complete` 而非新增 `Verified` 枚举**:`Complete` 在 DDL CHECK 约束、`GoalStatus` 枚举、前端状态条、gateway 文案中均已铺开。新增枚举值需要同步迁移、前端、序列化多处,收益有限。改为复用 `Complete` 并以 `judge_passed` 布尔列区分"是否经 
Judge 验收",改动面最小且语义清晰(通过验收 = `Complete` 且 `judge_passed=true`)。 -- **保留全部护栏**:Judge 解决的是"完成判定的可信度",而 idle 空转、clarify/update_plan 暂停、预算上限解决的是"防止无限续行/资源失控/阻塞等待"。两者正交,移除护栏会让无 goal 评估能力时的兜底消失,引入失控风险。 -- **主 agent 主动调用 + 续行引导**(而非系统自动发起 Judge):保持与现有 subagent 调用模型一致(主 agent 通过工具调用委派),实现侵入小;系统侧只需在续行 prompt 中“催”主 agent 去验收,无需在 run 终止后再隐式拉起一个评估 run 改变运行时调度。续行 prompt 会持续施压,直到 goal 被 Judge 标记通过,规避了“主 agent 不调 Judge 就永远不验收”的死角。 -- **Judge 作为主 agent 专属内建工具**:虽然 `agent_judge` 会加入 `RuntimeOrchestrationTool::parse()`,但它不进入 `builtin_all()` 和 `delegation_tools_for_helper()`,也不允许 subagent 递归调用。这样保留统一工具解析与 helper 编排复用,同时避免 explore/review/custom/Judge 自己绕过“主 agent 申请验收”的职责边界。 -- **诊断型 shell 软约束而非新沙箱**:Judge 需要能运行测试、type-check、lint 等验证命令,因此首版复用现有 `shell` 工具;但该工具能力本身不是硬只读,必须在 Judge prompt 中明确限制为诊断用途,禁止修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。新建受限 shell/test-runner 工具会扩大改动面,首版暂不引入。 -- **Judge 使用 primary 模型角色**:验收质量优先于成本,Judge 默认走 `model_plan.primary`。Explore/Review 继续保持现有模型策略,Judge 内部再委派时由各子代理自己的模型映射决定。 - -### 4.4 首版范围边界 - -首版目标是打通后端 Judge 验收闭环:工具注入、subagent 运行、结构化解析、goal 回写、续行停止、迁移兼容和测试覆盖。前端仅同步类型并在现有状态条显示“已验收通过”这一最小信息;`judge_completeness` 的精细 UI、额外事件、ACP/gateway 的详细状态展示、Judge token 单独计入 goal budget、Judge 专属超时或受限 shell 沙箱均作为后续增强,不进入首版。 - ---- - -## 5. 
详细实现 - -### 5.1 Judge subagent profile(`subagent/runtime_orchestration.rs`) - -- `RuntimeOrchestrationTool` 新增变体 `Judge`,工具名映射 `agent_judge`;`parse("agent_judge") -> Some(Judge)`。同时补齐 `tool_name()`、`title()`、`description()`、`profile()`、`as_agent_tool()` 的 match 分支,`as_agent_tool()` 的 schema 只需要 `task: string`。 -- `SubagentProfile` 新增 `Judge` 变体,并补齐 `helper_kind()`(固定返回 `helper_judge`)、`system_prompt()`、`can_delegate()`、`max_delegation_depth()`、`helper_tools()` 等 match 分支。 -- `resolve_helper_profile()` 增加 `RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge)`;`resolve_helper_model_role()` 增加 Judge 分支,默认使用 `model_plan.primary`,不要复用 Explore/Review 的 auxiliary 映射。 -- `helper_tools()` for `Judge`:`read` / `list` / `find` / `search` / `web_search`(条件启用)/ `shell`(仅诊断验证)。**不含** `edit` / `write` / `term_write` / `term_restart` / `term_close`。需要在工具描述和 Judge prompt 中明确:`shell` 只能运行测试、type-check、lint、只读检查等诊断命令,不能修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。这是 prompt 软约束,不是硬沙箱。 -- `can_delegate()` for `Judge`:`true`(允许 explore/review/parallel 协助)。 -- `max_delegation_depth()` for `Judge`:`2`(即 Judge **自身最大被委派深度为 2**——主 agent depth 1 直接委派 Judge 得到 depth 2,符合 `MAIN_AGENT_CHILD_DEPTH=2`;同时这意味着 Judge 内部委派的子级会是 depth 3,需在 `delegation_tools_for_helper()` 中据此过滤)。 - > 注意:需求所述“自身最大被委派深度为2”指 Judge 作为被委派目标时允许出现在 depth ≤ 2。为了让 Judge 仍能发起 explore/review/parallel(depth 3 子级),`delegation_tools_for_helper(child_depth)` 对内建目标的过滤阈值需复核:Judge 在 depth 2 调用子级时 `child_depth=3`,仍 ≤ `GLOBAL_MAX_DELEGATION_DEPTH(5)` 且 ≤ explore/review 的 `max_delegation_depth(3)`,故可注入。实现时确保 `validate_delegation_capability` 对 Judge→explore/review 放行。 -- `delegation_tools_for_helper()` 仍只注入 Explore / Review / Custom / Parallel,**不得注入 Judge**。这使 Judge 可以委派其他 helper,但任何 helper 不能委派 Judge。 -- `RESERVED_SUBAGENT_SLUGS` 增加 `"judge"`,防止自定义 subagent 占用该 slug。由于 `RuntimeOrchestrationTool::parse()` 对 `agent_{slug}` 有通配解析,保留 slug 能避免 `agent_judge` 与自定义工具名冲突。 -- `runtime_orchestration_tools()` **不无条件包含 Judge**:Judge 改为按需注入(见 
5.6),`builtin_all()` 保持仅含 explore/review/parallel,Judge 单独由主 agent 工具组装处按 goal 条件 push。 - -### 5.2 Judge 结构化协议(新增 `subagent/judge_contract.rs`) - -参照 `review_contract.rs` / `parallel_contract.rs` 模式新增: - -```rust -/// agent_judge 工具的入参(主 agent 传入)。 -pub struct JudgeRequest { - pub task: String, // 主 agent 对"为何认为达成"的说明 / 关注点 -} - -/// Judge 评估结构化产出。 -#[derive(Serialize, Deserialize)] -pub struct JudgeReport { - pub passed: bool, // 是否通过验收 - pub completeness_pct: u8, // 0-100 完整度百分比 - pub findings: Vec, // 未达成 / 不符合 goal 的具体点(passed=false 时必填) - pub summary: String, // 判定依据总述,作为通过时的 evidence -} -``` - -- Judge 的 system prompt(模板 `prompt/templates/subagent/judge.md`)强制要求最终以可解析的结构化形式(JSON 块或约定字段)返回上述四项。 -- `passed=true` 时 `summary` 必须非空,作为 `mark_complete()` 的 evidence;如果 Judge 输出 `passed=true` 但 `summary` 为空,解析层必须降级为 `passed=false`,避免无证据完成。 -- `completeness_pct` 解析后必须 clamp 到 0-100;`passed=false` 时 `findings` 必须非空,若模型未给出 findings,则把原始输出或“Judge did not provide actionable findings”写入 findings。 -- Judge 编排在拿到 Judge 文本输出后解析为 `JudgeReport`;解析失败按 `passed=false` 处理并把原始文本塞入 `findings`,避免误判通过。 - -### 5.3 Judge prompt surface 与上下文注入 - -- `prompt/surface.rs::PromptSurface` 新增 `SubagentJudge { inherited_run_mode }`。 -- `SurfacePattern::matches()` 同步更新:`AnySubagent` 必须匹配 `SubagentJudge`;`BuiltinSubagent` 也必须匹配 `SubagentJudge`,因为 Judge 是内建 subagent。若某些 prompt section 只应给 Explore/Review 而不应给 Judge,应改用更精确的 matcher 或新增 pattern,避免误注入。 -- `build_helper_system_prompt()` 增加 `SubagentProfile::Judge` → `PromptSurface::SubagentJudge { inherited_run_mode }` 映射。 -- `prompt/sources/custom_subagent_body.rs` 增加 Judge 模板映射:Judge → `templates/subagent/judge.md`。 -- `prompt/templates/subagent/judge.md`:定义 Judge 角色——独立验收员,只读评估,重点核对 goal 的一致性与完整性;说明可用工具(含诊断型 `shell`、可委派 explore/review/parallel);要求输出结构化 `JudgeReport`;明确禁止修改文件。`shell` 约束必须写成硬性行为指令:只能运行测试、type-check、lint、只读检查;不得通过 shell 编辑/删除文件、安装依赖、改变全局状态、启动交互式或长期驻留进程。 -- `prompt/sources/subagent_output_contract.rs` 增加 Judge 的输出契约 
`output_contract.judge.md`,并在 contract 中重复 `passed` / `completeness_pct` / `findings` / `summary` 的字段要求和失败兜底规则。 -- **goal 内容注入采用 task 前缀方案**:Judge 上下文必须包含 goal objective,且由 `agent_session_execution.rs` 的 Judge 分支在构造 helper task 时注入,不新增 DB 读取型 prompt source。 -- task 前缀必须包含:objective、当前 goal id/status、最近一次 Judge findings/summary(若有)、主 agent 传入的 `task` 说明。这样 Judge 不依赖主 agent 自述即可核对目标。 - -### 5.4 Judge 编排与 goal 回写(`agent_session_execution.rs` + `goal_manager.rs`) - -- `execute_tool_call()`:`RuntimeOrchestrationTool::parse()` 命中 `Judge` 时进入 Judge 专用分支,不直接走普通 `execute_helper_tool()` 返回路径。该分支可复用 `resolve_helper_delegate()` / `HelperAgentOrchestrator::run_helper()`,但必须在 helper 完成后追加 JudgeReport 解析和 goal 回写。 -- Judge 分支额外步骤: - 1. 调用前从 DB 加载当前 thread 的未完成 goal;无 goal 或 goal 已 `Complete && judge_passed=true` 则返回错误(agent_judge 仅在有 goal 时可用,理论上不会被注入)。 - 2. 把 `goal.objective`、goal id/status、最近一次 judge findings/summary、主 agent 传入的 `task` 拼成 Judge task 上下文。 - 3. 以 `SubagentProfile::Judge`、`RuntimeOrchestrationTool::Judge`、depth 2 启动 helper run;模型角色使用 `model_plan.primary`。 - 4. Judge run 结束后解析 `JudgeReport`;解析失败或字段非法按 `passed=false` 处理。 - 5. 调用新增 `GoalManager::record_judge_verdict(goal_id, run_id, &report)` 持久化最近裁决;若 `report.passed`,该方法在同一事务内写入 `status=complete`、`evidence=report.summary` 与 `judge_passed=true`。 - 6. 若通过验收,发送 `GoalCompleted` + `GoalStateUpdated` 事件;若未通过,也发送 `GoalStateUpdated`,让前端/后续续行能拿到最新 findings。 - 7. 
把 `JudgeReport` 文本作为工具结果返回主 agent;通过时结果中明确提示“goal 已通过验收,请停止修改并总结”,降低同一 run 后续继续改动的风险。 -- `GoalManager` 新增方法: - - `record_judge_verdict(&self, goal_id: &str, run_id: &str, report: &JudgeReport) -> Result`:写 `judge_passed` / `judge_completeness` / `judge_findings`(JSON) / `judge_summary` / `judge_evaluated_run_id`,并返回更新后的 record 供事件 payload 使用;passed 时同一事务同步写 `status=complete` 与 `evidence=report.summary`。 -- 原子性要求:`goal_repo.rs` 增加 `record_judge_verdict()` repo 方法,在事务内更新 judge_* 字段;passed 时同事务写 `status='complete'` 与 `evidence=summary`,确保 `status=complete` 与 `judge_passed=1` 不出现半更新;未通过时保持原 status(通常 Active)不变。 -- 预算边界:首版 Judge helper run 的 token 不单独计入 goal `tokens_used`。这是明确取舍;后续若要计入,需要扩展 `HelperRunResult` 携带 usage 并在 Judge 分支回写。 -- 同轮继续修改边界:系统不强行锁定 goal 后的写工具,因为主 agent 仍处于同一 run;通过验收后的工具结果和 `active_goal.tpl.md` prompt 必须要求停止修改。若未来需要硬约束,可在 `execute_tool_call()` 中对 `Complete && judge_passed` 后的 mutating tools 增加拒绝策略,首版不做。 - -### 5.5 删除 `goal_scored` 工具 - -- 删除工具定义(`agent_session_tools.rs` 中的 `goal_scored` `AgentTool::new(...)`)。 -- 删除分派分支与 `execute_goal_tool()`(`agent_session_execution.rs`)。 -- 移除常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE`(`goal_manager.rs`),以及 `evaluate_after_turn()` 中 `detect_tool_based_blocking` 对 `goal_scored` 的放行分支。 -- 删除旧自证语义:`GoalVerdict::Complete { evidence }` 当前没有有效生产者,删除 `goal_scored` 后一并移除,并删除 `evaluate_after_run()` 中的旧 match 分支,减少死代码。 -- 删除 `ChallengePromptVariant::NoEvidence` 与 `MISSING_EVIDENCE_PROMPT`,因为它们只服务于“调用 `goal_scored` 但 evidence 为空”的旧路径;保留 completion-claim 检测对应的 `ChallengeEvidence` / `NoTool` 语义,并把文案改为“声称完成但尚未调用 `agent_judge` 验收”。 -- 护栏保留但需改写文案:`ChallengeEvidence` 与 completion-claim 检测仍作为“提醒主 agent 去验收”的软提示,引导语从“调用 goal_scored”改为“调用 agent_judge 验收”。`GUIDANCE_PROMPT` 同步更新。 -- `agent_judge` 会被 `record_tool_call()` 记录到 goal runtime tool calls;`detect_tool_based_blocking()` 不应把它视为阻塞工具,也不应触发 pause。它与普通工具调用一样表示 agent 有行动,能重置 idle 倾向。 -- 全局检索并清理 `goal_scored` 引用:系统 prompt、`active_goal.tpl.md`、gateway 文案、前端 hardcoded kickoff 
prompt、测试(`tests/goal_lifecycle.rs`)等。 - -### 5.6 按需注入 `agent_judge`(仅主 agent,仅有未完成 goal 时) - -- 注入点在主 agent 工具组装处。`runtime_tools_for_profile()` 当前是纯 profile 函数,不知道 thread goal 状态;推荐在其调用方 `build_session_spec()`(`agent_session.rs`)查询并追加 Judge 工具,避免把 DB 依赖塞进纯工具构造函数。 - - 在 `build_session_spec()` 已能访问 `pool` 与 `thread_id`,查询 `goal_repo::find_by_thread_id`,若存在且尚未通过验收,则 push `RuntimeOrchestrationTool::Judge.as_agent_tool()`。 - - “尚未通过验收”的判定为:goal 存在且不是 `status == Complete && judge_passed == true`。实际自动续行仍只对 `Active` 生效;但工具注入可允许用户在恢复/继续场景中对 `Paused` 或 `BudgetLimited` goal 重新申请验收。 - - goal 不存在或已 `Complete && judge_passed`(已验收)则不注入。 -- `runtime_tools_with_custom_subagents()` 与 extension tool 合并时需维持内建工具名优先级,防止 extension/custom 工具覆盖 `agent_judge`。 -- **subagent 不注入**:Judge 工具只在主 agent 工具集 push,不进入 `delegation_tools_for_helper()` 的候选;任何 subagent(含 Judge 自身、explore/review/custom)的可委派目标列表都不包含 `agent_judge`。 -- **运行时硬门禁**:仅“不注入”不足够,因为模型或测试仍可能构造 `agent_judge` 调用,且 `RuntimeOrchestrationTool::parse()` 会命中。必须在 subagent 递归委派路径(例如 `HelperDelegationContext::handle_delegation()` / `resolve_delegation()`)中显式拒绝 `RuntimeOrchestrationTool::Judge`,返回“agent_judge can only be called by the main agent for the current goal”之类错误。 -- `agent_parallel` 的任务列表也必须拒绝 `agent_judge`。`validate_parallel_delegate_safety()` 或解析 parallel task 的位置应把 Judge 视为非法 batch target,避免通过 parallel 间接调用 Judge。 -- 主 agent 侧 `execute_tool_call()` 的 Judge 分支也要重新查询 goal 状态,不能只依赖工具注入时的状态;这是防止 race / stale tool set 的后端 backstop。 - -### 5.7 续行监督改造(`agent_run_event_handler.rs` + `goal_manager.rs`) - -- `evaluate_after_run()` / `evaluate_after_turn()` 开头新增**显式终止判定**:若 goal 已“通过验收”(`status == Complete && judge_passed == true`)→ 返回 `skipped`(停续行)。这是停续行的**主依据**。 -- 存量兼容依赖迁移回填:迁移后不应出现旧路径产生的 `status=Complete && judge_passed=false`。如果运行时遇到该组合,按异常兼容处理并停续行或记录 warning;不要把旧 complete goal 重新拉起续行。 -- 对 `Paused` / `BudgetLimited` 仍按现有语义返回 skipped,不自动续行。只有 `Active` goal 会继续进入护栏评估。 -- 其余护栏(clarify/update_plan/idle/预算)保留,作用不变。 -- `Continue` / 
`ChallengeEvidence` verdict 的 continuation prompt 改写为新模板(替换 `CONTINUATION_PROMPT_TEMPLATE`): - -``` -[Goal continuation — turns {turns_used}/{max_turns}] - -**Objective:** {objective} - -继续推进该目标,执行下一个具体步骤。 - -⚠️ 完成判定已改为独立验收:当你认为目标已达成时,必须调用 - agent_judge(task="说明为何认为已达成 / 需重点核对的点") -由 Judge 评估项目是否满足目标的一致性与完整性。 -- 仅当 Judge 裁决 passed=true 时,目标才会被标记为通过验收并停止续行。 -- 若上一次 Judge 验收未通过,请阅读其 findings,逐项修复后再次调用 agent_judge。 -你无法自行声明完成;只有通过 Judge 验收才算达成。 - -如果你被阻塞、需要用户输入,请使用 clarify 工具。 -``` - -- 若最近一次 Judge 未通过,必须把 `judge_findings` 摘要拼接进 continuation prompt,提升修复指向性;摘要可限制长度,避免 prompt 过长。 - -### 5.8 数据库迁移 - -新增迁移 `migrations/2026XXXXXXXXXX_goal_judge_fields.sql`: - -```sql -ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool -ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable -ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable -ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable -ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable - --- 兼容旧版本 goal_scored 已完成的 goal,避免升级后被误判为未验收。 -UPDATE goals -SET judge_passed = 1, - judge_summary = COALESCE(judge_summary, evidence), - judge_completeness = COALESCE(judge_completeness, 100) -WHERE status = 'complete'; -``` - -- `GoalRecord` / `GoalDto` / `GoalPayload`(`model/goal.rs`)同步新增字段:`judge_passed: bool`、`judge_completeness: Option`(DB 读写时校验 0-100)、`judge_findings: Option`(JSON 文本,DTO 透传字符串,前端按 string/null 接收)、`judge_summary: Option`、`judge_evaluated_run_id: Option`。 -- `goal_repo.rs` 同步更新 `SELECT_COLUMNS`、`GoalRow`、`into_record()`、`insert()`。新增 `record_judge_verdict()` repo 方法,负责写 judge_* 字段;passed 时同一事务同步写 `status='complete'` 与 `evidence=summary`。 -- 若 `judge_findings` 以 JSON array 字符串存储,写入前由 `serde_json::to_string(&report.findings)` 生成;读取失败时不要 panic,DTO 可原样返回或置为 `None` 并记录 warning。 - -### 5.9 前端、IPC、gateway 与 ACP - -- `ThreadStreamEvent` 首版复用现有 `GoalCompleted` / `GoalStateUpdated`,不新增 Judge 专属事件。`GoalPayload` 增加 judge 字段后,现有事件 payload 
即可携带最新裁决。 -- 前端 `GoalPayload` 类型(如 `src/services/bridge/agent-commands.ts`)与 store 类型(如 `src/modules/workbench-shell/model/thread-store.ts`)补充 judge 字段;状态条在 `Complete && judgePassed` 时显示“已验收通过”。`judge_completeness` 的进度/百分比 UI 为二阶段增强。 -- `goal-status-bar.tsx` 只做最小展示;若未实现详细展示,也必须保证新增字段不会破坏类型检查。 -- gateway / ACP 首版只要求文案与行为不再引用 `goal_scored`,并确保这些入口启动主 agent 时使用同一 `build_session_spec()` 注入逻辑,因此有未完成 goal 时也能拿到 `agent_judge`。详细展示 Judge findings/completeness 可后续增强。 - ---- - -## 6. 影响文件清单 - -| 文件 | 改动 | -|------|------| -| `src-tauri/src/model/goal.rs` | `GoalRecord`/`GoalDto`/`GoalPayload` 新增 judge_* 字段;删除 `GoalVerdict::Complete` 旧自证变体 | -| `src-tauri/src/core/goal_manager.rs` | 删除 `GOAL_SCORED_*` 常量与放行分支;删除 `MISSING_EVIDENCE_PROMPT` / `NoEvidence` 旧路径;新增 `record_judge_verdict()`;续行终止判定改为 `Complete && judge_passed`;改写 continuation/guidance 文案并拼接最近 findings | -| `src-tauri/src/core/subagent/runtime_orchestration.rs` | `RuntimeOrchestrationTool::Judge` + `SubagentProfile::Judge`(工具集/can_delegate/max_delegation_depth=2);`parse`/`profile`/`as_agent_tool`/`helper_kind` 等 match 补齐;保留 slug;`builtin_all()` 不含 Judge | -| `src-tauri/src/core/subagent/judge_contract.rs`(新增) | `JudgeRequest` / `JudgeReport` 结构化协议、JSON 解析、字段校验、失败兜底 | -| `src-tauri/src/core/subagent/orchestrator.rs` | `build_helper_system_prompt()` 支持 Judge surface;subagent 递归委派路径硬性拒绝 `agent_judge`;保持 Judge→explore/review/parallel 放行 | -| `src-tauri/src/core/subagent/parallel_contract.rs` / 相关 parallel 校验 | `agent_parallel` task 拒绝 `agent_judge` 作为子任务 | -| `src-tauri/src/core/agent_session_execution.rs` | 删除 `goal_scored` 分派与 `execute_goal_tool()`;新增 Judge 专用分支(加载 goal → task 前缀注入 → helper run → 解析 JudgeReport → 回写 goal → 发送事件) | -| `src-tauri/src/core/agent_session_tools.rs` | 删除 `goal_scored` 工具定义;保持基础 runtime tools 不含 Judge;如新增 helper 函数则提供 `agent_judge` 工具构造 | -| `src-tauri/src/core/agent_session.rs` | `build_session_spec()` 查询 goal,按“未通过验收”条件向主 agent 追加 `agent_judge`;`resolve_helper_model_role()` 将 Judge 映射到 
primary | -| `src-tauri/src/core/prompt/surface.rs` | `PromptSurface::SubagentJudge`;`SurfacePattern::AnySubagent` / `BuiltinSubagent` 匹配 Judge | -| `src-tauri/src/core/prompt/sources/custom_subagent_body.rs` | Judge → `templates/subagent/judge.md` | -| `src-tauri/src/core/prompt/sources/subagent_output_contract.rs` | Judge 输出契约 | -| `src-tauri/src/core/prompt/templates/subagent/judge.md`(新增) | Judge 角色、诊断型 shell 软约束、委派说明与结构化输出要求 | -| `src-tauri/src/core/prompt/templates/active_goal.tpl.md` | 完成判定改为经 agent_judge 验收,并提示通过后停止修改 | -| `src-tauri/src/core/prompt/sources/active_goal.rs` | 文案同步(如有引用) | -| `src-tauri/src/persistence/repo/goal_repo.rs` | judge_* 列读写;新增 `record_judge_verdict()`;passed 时原子写 status/evidence/judge_* | -| `src-tauri/migrations/2026XXXXXXXXXX_goal_judge_fields.sql`(新增) | judge_* 列迁移,并回填旧 `status='complete'` 为 `judge_passed=1` | -| `src-tauri/src/gateway/gateway_runner.rs` | 移除 `goal_scored` 引导文案,改为 agent_judge 验收说明 | -| `src-tauri/src/acp/**`(如有 goal 文案/事件映射) | 确认不引用 `goal_scored`;复用 GoalStateUpdated payload 的 judge 字段 | -| `src-tauri/tests/goal_lifecycle.rs` | 重写:覆盖 Judge 通过→Complete+judge_passed→停续行;未通过→续行;旧 complete 回填兼容 | -| `src-tauri/src/core/agent_session_tests.rs` / subagent tests | 覆盖 Judge profile、模型角色、工具注入、递归拒绝、parallel 拒绝、prompt surface 匹配 | -| `src/services/bridge/agent-commands.ts` | 前端 `GoalPayload` 类型新增 judge 字段 | -| `src/modules/workbench-shell/model/thread-store.ts` | `GoalStoreState` 新增 judge 字段 | -| `src/modules/workbench-shell/ui/goal-status-bar.tsx` | 最小展示 `Complete && judgePassed` 为“已验收通过” | -| `src/modules/workbench-shell/ui/runtime-thread-surface.tsx` | 清理 goal kickoff prompt 中的 `goal_scored` 示例,改为 agent_judge 验收说明 | - ---- - -## 7. 
验证计划 - -- **Rust 格式**:`cargo fmt --check --manifest-path src-tauri/Cargo.toml`。 -- **Rust 行为**:`cargo test --locked --manifest-path src-tauri/Cargo.toml`,重点 `goal_lifecycle`、subagent 委派、prompt surface 与迁移相关测试。新增/重写用例: - - Judge `passed=true` → goal 变 `Complete` 且 `judge_passed=true`,`judge_summary/evidence` 非空,下一轮 `evaluate_after_run` 返回 skipped(停续行)。 - - Judge `passed=false` → goal 仍进行中,写入 `judge_findings`,`evaluate_after_run` 返回 `Continue` 且 continuation prompt 包含最近 findings 并引导调用 `agent_judge`。 - - 存量 `status='complete'` 迁移后 `judge_passed=1`、`judge_completeness=100`,不会被新续行逻辑重新拉起。 - - `agent_judge` 仅在有未通过验收 goal 时注入主 agent;无 goal 或已验收通过时主 agent 工具集不含 `agent_judge`;任何 subagent 工具集不含 `agent_judge`。 - - 运行时门禁:subagent 直接调用 `agent_judge` 被拒绝;`agent_parallel` task 使用 `agent_judge` 被拒绝;主 agent→Judge 合法(depth 2);Judge→explore/review 合法(depth 3)。 - - Judge 模型角色使用 primary;Explore/Review 仍保持既有模型映射。 - - Prompt surface:`SubagentJudge` 能构建 system prompt;`AnySubagent` / `BuiltinSubagent` 匹配 Judge;Judge 模板包含诊断型 shell 软约束和结构化输出契约。 - - JudgeReport 解析失败、`passed=true` 但 summary 空、completeness 越界、`passed=false` findings 空 → 均视为未通过或安全兜底,不误标完成。 - - `goal_scored` 工具与常量已删除(编译期 + 检索为 0 个非历史设计文档引用)。 -- **前端**:`npm run typecheck`;若改动前端测试则 `npm run test:unit`。重点验证 `GoalPayload` / `GoalStoreState` 新字段不会破坏事件处理,`goal-status-bar.tsx` 能显示已验收通过。 -- **文案检索**:全局搜索 `goal_scored`,除历史文档/迁移注释外不应有运行时 prompt、前端提示或 gateway 文案引用。 -- **手动冒烟**:创建 goal → 主 agent 工作 → 调 agent_judge 未通过(findings)→ 续行修复 → 再次 agent_judge 通过 → goal 状态条显示已验收、续行停止。 - ---- - -## 8. 风险与边界 - -1. **主 agent 始终不调用 `agent_judge`**:goal 永远不被验收,续行会持续注入 prompt 直至护栏触发(idle/预算上限)。这正是护栏保留的价值——兜底防止无限续行。需在 prompt 中强力引导主 agent 调用 agent_judge。 -2. **Judge 误判**:Judge 也是 LLM,可能误通过或误拒。误通过风险通过“独立上下文 + 文件工具只读 + primary 模型 + 重点核对一致性/完整性 + 可跑诊断验证”降低;误拒会触发续行修复,代价是额外轮次。 -3. **诊断型 shell 不是硬只读**:Judge 可用 `shell` 意味着理论上能执行修改性命令。首版通过 Judge prompt 进行软约束,要求只运行测试、type-check、lint、只读检查,并禁止修改文件、删除数据、安装依赖、改变全局状态。若后续发现模型不稳定,应新增受限 test-runner 或 shell allowlist。 -4. 
**Judge 成本**:每次验收会拉起一个可委派的 subagent run,可能再并行 explore/review,token/时间开销不小。首版不把 Judge/subagent token 单独计入 goal budget,也不新增 Judge 专属硬超时;需在 continuation prompt 中提示主 agent“仅在确有把握达成时再申请验收”,避免频繁空验收。 -5. **深度语义边界**:Judge `max_delegation_depth=2` 必须与 `MAIN_AGENT_CHILD_DEPTH=2` 一致,且要确保 Judge 在 depth 2 仍能委派 depth 3 的 explore/review(受 `GLOBAL_MAX_DELEGATION_DEPTH=5` 与 explore/review 自身上限 3 约束,合法)。同时必须在递归委派和 parallel 路径拒绝任何 helper→Judge 调用,避免职责边界被绕过。 -6. **迁移兼容**:迁移必须回填 `UPDATE goals SET judge_passed=1, judge_completeness=100 ... WHERE status='complete'`。运行时若遇到 `Complete && !judge_passed`,应记录 warning 并停续行,不能把存量已完成 goal 重新拉起。 -7. **gateway / ACP 路径**:微信/企微与 ACP 同样依赖 goal 续行,首版需确认这些入口创建主 agent run 时走同一 `build_session_spec()` 注入逻辑,且 prompt/gateway 文案不再提 `goal_scored`。 -8. **同轮继续修改**:Judge 通过后主 agent 仍可能在同一 run 继续调用其他工具。首版不做写工具硬锁,通过 Judge 工具结果和 `active_goal.tpl.md` prompt 要求停止修改;若后续发现问题,再加 `Complete && judge_passed` 后 mutating tools 拒绝策略。 -9. **跨平台**:主体为 Rust/SQLite/prompt/TypeScript 类型改动,应保持跨平台兼容;shell 诊断命令由 Judge 根据项目现有命令选择,prompt 中需提醒避免平台特定假设。 From d60daecf9db70a1bc22aa430b9cd0fa1b990eef6 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 17:03:01 +0800 Subject: [PATCH 7/8] =?UTF-8?q?docs(judge):=20=F0=9F=93=9D=20add=20size-fi?= =?UTF-8?q?rst=20verification=20strategy=20and=20delegation=20guidelines?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/prompt/templates/subagent/judge.md | 58 ++++++++++++++++--- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md index 0cab1d63..58f8b873 100644 --- a/src-tauri/src/core/prompt/templates/subagent/judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -5,20 +5,60 @@ declared_keys: [] --- You are the **Goal Acceptance Judge** — an independent verifier. 
The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. -You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. +You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. Goal tasks are typically long-horizon with broad change surfaces, so your evaluation must scale: be thorough enough to catch real gaps, efficient enough to converge in one pass, and honest about what you actually verified. -## What to evaluate -- Read the goal objective injected into your task and treat it as the acceptance contract. -- Inspect the relevant code, configuration, tests, and docs to confirm each requirement of the goal is actually met. -- Run diagnostic verification when it strengthens your judgment: tests, type-checks, linters, builds, and read-only inspection commands. Adapt the commands to this repository (infer them from instructions, scripts, and manifests) instead of assuming a stack. -- You may delegate to `agent_explore`, `agent_review`, or `agent_parallel` to gather evidence in parallel when the goal is broad. +## Operating principle: size first, then verify + +Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. + +### Step 1 — Size the change (always do this first) +- Run `git_status` and `git_diff --stat` (or the project's equivalent) to enumerate changed files, additions/deletions, and the rough surface area. 
+- Cross-reference with the goal objective: identify which subsystems / layers / acceptance criteria each cluster of changes maps to. +- Form an explicit mental model before any deep reading: + - **Small** — ≤ ~5 files changed, single module/layer, narrow concern. One linear pass is enough. + - **Medium** — ~6–20 files, 2–3 subsystems or layers touched, multiple acceptance criteria. + - **Large** — > 20 files, cross-cutting changes, multiple independent topics (e.g. backend + frontend + tests + config + docs), or the goal lists many distinct subtasks. +- Use these as guidance, not hard rules: a 3-file change that touches a security boundary may still warrant Large-style scrutiny; a 40-file rename may collapse to Small. +- If the change scope is genuinely tiny relative to the goal (e.g. goal asks for a feature but the diff shows trivial edits), that itself is strong evidence of incompleteness — record it and probe further before concluding. + +### Step 2 — Pick a verification strategy that matches the size +- **Small change** — verify directly. Read the changed files yourself, confirm each goal requirement against the actual code, run the targeted tests/type-checks. Do not delegate; the coordination overhead is not worth it. +- **Medium change** — split logically. Use one or two `agent_explore` / `agent_review` calls when a coherent slice (e.g. "review the new module + its consumers", "explore how config plumbing was wired") is too large to inspect in line without losing context. Run diagnostic commands (typecheck, targeted tests, lint) yourself. +- **Large change** — fan out with `agent_parallel`. Break the goal's acceptance surface into 2–5 independent topics and dispatch them in parallel. Good split axes: + - **By layer** — backend / frontend / persistence / config. + - **By subsystem** — auth / billing / notifications. + - **By concern** — functional correctness / regression risk / tests & docs / migration & compatibility. 
+ - **By goal subtask** — one helper per acceptance criterion when the goal is itemized. + Keep each subtask independent (no shared write state), bounded in scope, and concretely scoped to file lists or topics inferred from the diff. After the parallel batch returns, **synthesize the results yourself** — reconcile conflicts, call out failures or skipped items, and form one coherent verdict. Do not just concatenate helper outputs. + +### Step 3 — Run the verification commands the project actually uses +- Adapt commands to this repository (infer from manifests, scripts, CI config, and workspace instructions). Do not assume a stack. +- Prefer the *narrowest* command that still covers the changed surface (e.g. test only the affected package) before falling back to repo-wide runs. For Large changes a repo-wide build/typecheck is usually still warranted. +- When `agent_review` is delegated, treat its verification output as authoritative — do not rerun the same commands unless its results were inconclusive. + +## Delegation guidelines +- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does the codebase still reference Z?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. +- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='diff'` when the helper should look at the workspace changes; provide an explicit changed-file list when you already have one. +- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Prefer this over sequential helper calls whenever the topics are genuinely independent. Never recurse parallel into parallel. +- Do **not** delegate when: + - The change is small enough to inspect inline. + - The subtasks are interdependent (later ones need earlier results). + - You only need one shell command — just run it. 
+- Always tell each delegate explicitly: the goal text, which slice they own, what evidence to return, and that they are read-only. ## Hard constraints (read-only acceptance) - Your file tools are read-only. Do **not** modify, create, or delete any files. -- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, and read-only inspection. You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, builds, and read-only inspection (`git_status`, `git_diff`, `git_log`, `cat`, `ls`, etc.). You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. - Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. +- Helpers you delegate to inherit the same read-only constraint; remind them in the task text when relevant. + +## Coverage honesty +- Track what you actually verified vs. what you sampled vs. what you skipped. A Large change you only spot-checked is **not** the same as a Large change you fully covered. +- When delegating, if any helper failed, returned inconclusive results, or could not run a command, treat that area as **not verified** — record it explicitly and let it influence the verdict. +- Never imply a check passed without trustworthy evidence. If your `summary` cannot point to specific files, commands, or behaviors you confirmed, you do not have a basis to pass. ## Verdict rules -- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps. When you pass, `summary` must clearly state the verified evidence — it becomes the goal's completion evidence. 
-- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings`. +- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps **and** your verification covered the full change surface (directly or via successful delegates). When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. +- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. - Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. +- Calibrate `completenessPct` to actual coverage and remaining gaps, not to effort spent. A change that does 80% of the goal correctly is 80, not 100, even if the implemented parts are flawless. 
From dc8fca0ce5a6ea8de067ff368804fe9c19bfc370 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 17:49:49 +0800 Subject: [PATCH 8/8] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20remo?= =?UTF-8?q?ve=20goal-level=20time=5Fused=5Fseconds=20in=20favor=20of=20run?= =?UTF-8?q?-level=20elapsed=20tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../20260607000001_drop_goal_time_used.sql | 5 + src-tauri/src/commands/agent.rs | 41 ----- src-tauri/src/core/agent_run_event_handler.rs | 34 ---- src-tauri/src/core/agent_run_manager.rs | 54 +------ src-tauri/src/core/app_state.rs | 148 ------------------ src-tauri/src/core/goal_manager.rs | 27 ++-- src-tauri/src/model/goal.rs | 5 - src-tauri/src/persistence/repo/goal_repo.rs | 12 +- src-tauri/src/persistence/repo/run_repo.rs | 78 --------- src-tauri/tests/goal_lifecycle.rs | 9 +- .../workbench-shell/model/thread-store.ts | 1 - src/services/bridge/agent-commands.test.ts | 1 - src/services/bridge/agent-commands.ts | 1 - 13 files changed, 21 insertions(+), 395 deletions(-) create mode 100644 src-tauri/migrations/20260607000001_drop_goal_time_used.sql diff --git a/src-tauri/migrations/20260607000001_drop_goal_time_used.sql b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql new file mode 100644 index 00000000..06d0c76e --- /dev/null +++ b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql @@ -0,0 +1,5 @@ +-- Drop goal-level time accounting. Time-tracking moved to thread_runs.elapsed_running_secs +-- (added by 20260604000000_run_elapsed_tracking.sql), which is summed across all of a thread's +-- runs (planning + implementation) and rendered by the workbench-shell timer. The goal-level +-- time_used_seconds column was write-only with no readers in budget enforcement, UI, or logging. 
+ALTER TABLE goals DROP COLUMN time_used_seconds; diff --git a/src-tauri/src/commands/agent.rs b/src-tauri/src/commands/agent.rs index f9e9f067..0b73853a 100644 --- a/src-tauri/src/commands/agent.rs +++ b/src-tauri/src/commands/agent.rs @@ -624,47 +624,6 @@ pub async fn goal_pause( match goal { Some(g) => { if g.status == crate::model::goal::GoalStatus::Active { - // Account elapsed time of any currently active run before pausing - if let Some(run_seconds) = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &state.pool, - &thread_id, - ) - .await - .unwrap_or(None) - { - let active_run_id = crate::persistence::repo::run_repo::find_latest_by_thread( - &state.pool, - &thread_id, - ) - .await - .ok() - .flatten() - .and_then(|run| { - matches!( - run.status.as_str(), - "running" | "waiting_approval" | "needs_reply" - ) - .then_some(run.id) - }); - let paused_seconds = active_run_id - .as_deref() - .map(|run_id| { - let mut guard = - state.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_pause: goal runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(run_id).max(0) - }) - .unwrap_or(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - mgr.account_usage(&g.id, 0, billable_seconds).await.ok(); - } - } mgr.pause(&g.id, crate::model::goal::PauseReason::UserRequested, None) .await?; } diff --git a/src-tauri/src/core/agent_run_event_handler.rs b/src-tauri/src/core/agent_run_event_handler.rs index 076c324d..6107563c 100644 --- a/src-tauri/src/core/agent_run_event_handler.rs +++ b/src-tauri/src/core/agent_run_event_handler.rs @@ -184,33 +184,6 @@ pub(crate) fn sidebar_status_for_runtime_event( } impl AgentRunManager { - fn start_goal_run_pause(&self, thread_id: &str, run_id: &str) { - if thread_id.is_empty() { - return; - } - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause 
runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.start_run_pause(thread_id, run_id); - } - - fn finish_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.finish_run_pause(run_id); - } - - fn cleanup_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.cleanup_run_pause(run_id); - } - pub(crate) async fn handle_runtime_channel_closed( self: &Arc, run_id: &str, @@ -410,26 +383,22 @@ impl AgentRunManager { } ThreadStreamEvent::ApprovalRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; } ThreadStreamEvent::ClarifyRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::NeedsReply).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::NeedsReply) .await?; } ThreadStreamEvent::ApprovalResolved { .. } => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; } ThreadStreamEvent::ClarifyResolved { .. 
} => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; @@ -458,7 +427,6 @@ impl AgentRunManager { } ThreadStreamEvent::RunCheckpointed { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; @@ -476,7 +444,6 @@ impl AgentRunManager { | ThreadStreamEvent::RunFailed { error, .. } => Some(error.as_str()), _ => None, }; - self.finish_goal_run_pause(run_id); self.finish_run(run_id, final_status, error_message).await?; let thread_id = self.get_thread_id(run_id).await; if let Some(frontend_tx) = self.frontend_tx_for_run(run_id).await { @@ -567,7 +534,6 @@ impl AgentRunManager { ); } } - self.cleanup_goal_run_pause(run_id); } Ok(()) diff --git a/src-tauri/src/core/agent_run_manager.rs b/src-tauri/src/core/agent_run_manager.rs index eefc0a1d..e6c8a6ad 100644 --- a/src-tauri/src/core/agent_run_manager.rs +++ b/src-tauri/src/core/agent_run_manager.rs @@ -24,7 +24,7 @@ use crate::core::sleep_manager::SleepManager; use crate::ipc::frontend_channels::ThreadStreamEvent; use crate::model::errors::{AppError, ErrorSource}; use crate::model::thread::{MessageAttachmentDto, MessageRecord, RunStatus}; -use crate::persistence::repo::{goal_repo, message_repo, run_repo, thread_repo, workspace_repo}; +use crate::persistence::repo::{message_repo, run_repo, thread_repo, workspace_repo}; pub(crate) use crate::core::agent_run_event_handler::build_orphaned_run_terminal_event; #[cfg(test)] @@ -433,44 +433,6 @@ impl AgentRunManager { let (profile_id, provider_id, model_id) = extract_run_model_refs(&model_plan_value); - // Account the planning run's billable time to the active goal so 
the - // frontend timer displays the correct accumulated time when the new - // implementation run starts (the frontend resets its local elapsed on - // every run_id change, so time_used_seconds must include the full - // planning-phase cost). - { - let planning_elapsed = run_repo::get_run_elapsed_seconds(&self.pool, &planning_run_id) - .await? - .unwrap_or(0); - let paused_seconds = { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&planning_run_id).max(0) - }; - let billable = (planning_elapsed - paused_seconds).max(0); - if billable > 0 { - if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - if let Err(error) = goal_repo::account_usage( - &self.pool, &goal.id, - 0, // tokens_delta: planning turns were already counted - billable, 0, // turns_delta - ) - .await - { - tracing::warn!( - planning_run_id = %planning_run_id, - goal_id = %goal.id, - billable_seconds = billable, - error = %error, - "failed to account planning run time to goal" - ); - } - } - } - } - let mut approval_metadata = approval_metadata; approval_metadata.state = IMPLEMENTATION_PLAN_APPROVED_STATE.to_string(); approval_metadata.approved_action = Some(action.clone()); @@ -512,20 +474,6 @@ impl AgentRunManager { ) .await?; - // Emit the updated goal state through the new run's event channel so - // the frontend sees the accumulated time_used_seconds (which now - // includes the planning-run time) before it starts the real-time timer - // for the new implementation run. 
- if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - let runs = self.active_runs.lock().await; - if let Some(run) = runs.get(&result.0) { - let _ = run.frontend_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: thread_id.to_string(), - goal: Some(crate::model::goal::GoalPayload::from(goal)), - }); - } - } - if let Some(seed_messages) = context_seed_messages.as_ref() { self.persist_messages(seed_messages).await?; } diff --git a/src-tauri/src/core/app_state.rs b/src-tauri/src/core/app_state.rs index 00e2f166..ac3d81a7 100644 --- a/src-tauri/src/core/app_state.rs +++ b/src-tauri/src/core/app_state.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; -use chrono::{DateTime, Utc}; use sqlx::SqlitePool; use tauri::AppHandle; @@ -31,12 +30,6 @@ pub struct GoalRuntimeState { pub idle_turn_count: HashMap, /// Consecutive completion claim counter per thread. pub completion_claim_count: HashMap, - /// Pause start timestamp per run while it waits for user action. - pub run_pause_started_at: HashMap>, - /// Accumulated user-wait pause seconds per run. - pub run_paused_seconds: HashMap, - /// Thread ID for each run with pause accounting state. - pub run_pause_thread_ids: HashMap, } impl GoalRuntimeState { @@ -47,66 +40,6 @@ impl GoalRuntimeState { self.thread_tool_calls.remove(thread_id); self.idle_turn_count.remove(thread_id); self.completion_claim_count.remove(thread_id); - - let run_ids: Vec = self - .run_pause_thread_ids - .iter() - .filter_map(|(run_id, stored_thread_id)| { - (stored_thread_id == thread_id).then(|| run_id.clone()) - }) - .collect(); - for run_id in run_ids { - self.cleanup_run_pause(&run_id); - } - } - - /// Begin timing a run's user-action pause. Repeated starts are ignored so - /// nested or duplicate waiting events do not lose the original start time. 
- pub fn start_run_pause(&mut self, thread_id: &str, run_id: &str) { - self.run_pause_thread_ids - .entry(run_id.to_string()) - .or_insert_with(|| thread_id.to_string()); - self.start_run_pause_at(run_id, Utc::now()); - } - - fn start_run_pause_at(&mut self, run_id: &str, started_at: DateTime) { - self.run_pause_started_at - .entry(run_id.to_string()) - .or_insert(started_at); - } - - /// Finish the current pause interval for a run and accumulate whole seconds. - pub fn finish_run_pause(&mut self, run_id: &str) -> i64 { - self.finish_run_pause_at(run_id, Utc::now()) - } - - fn finish_run_pause_at(&mut self, run_id: &str, finished_at: DateTime) -> i64 { - let Some(started_at) = self.run_pause_started_at.remove(run_id) else { - return *self.run_paused_seconds.get(run_id).unwrap_or(&0); - }; - - let paused_seconds = (finished_at - started_at).num_seconds().max(0); - let total = self - .run_paused_seconds - .entry(run_id.to_string()) - .or_insert(0); - *total += paused_seconds; - *total - } - - /// Take and clear the accumulated pause seconds for a run. - pub fn take_run_paused_seconds(&mut self, run_id: &str) -> i64 { - self.finish_run_pause(run_id); - let seconds = self.run_paused_seconds.remove(run_id).unwrap_or(0); - self.run_pause_thread_ids.remove(run_id); - seconds - } - - /// Clear all pause accounting state for a run. 
- pub fn cleanup_run_pause(&mut self, run_id: &str) { - self.run_pause_started_at.remove(run_id); - self.run_paused_seconds.remove(run_id); - self.run_pause_thread_ids.remove(run_id); } } @@ -186,84 +119,3 @@ impl AppState { } } } - -#[cfg(test)] -mod tests { - use super::GoalRuntimeState; - use chrono::{Duration, TimeZone, Utc}; - - #[test] - fn run_pause_tracking_is_idempotent_accumulative_and_cleared_on_take() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state.start_run_pause_at("run-1", start + Duration::seconds(10)); - - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(5)), - 5, - ); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(20)), - 5, - ); - - state.start_run_pause_at("run-1", start + Duration::seconds(30)); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(37)), - 12, - ); - - assert_eq!(state.take_run_paused_seconds("run-1"), 12); - assert_eq!(state.take_run_paused_seconds("run-1"), 0); - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - } - - #[test] - fn cleanup_thread_removes_run_pause_state_for_that_thread() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state - .run_pause_thread_ids - .insert("run-2".to_string(), "thread-2".to_string()); - state.start_run_pause_at("run-2", start); - state.run_paused_seconds.insert("run-1".to_string(), 3); - state.run_paused_seconds.insert("run-2".to_string(), 5); - - 
state.cleanup_thread("thread-1"); - - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - assert!(state.run_pause_started_at.contains_key("run-2")); - assert_eq!(state.run_paused_seconds.get("run-2"), Some(&5)); - assert_eq!( - state.run_pause_thread_ids.get("run-2").map(String::as_str), - Some("thread-2"), - ); - } - - #[test] - fn run_pause_tracking_clamps_negative_intervals() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state.start_run_pause_at("run-1", start); - - assert_eq!( - state.finish_run_pause_at("run-1", start - Duration::seconds(5)), - 0, - ); - } -} diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index bd7298db..f3d3ff2b 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -148,7 +148,6 @@ impl GoalManager { status: GoalStatus::Active, token_budget, tokens_used: 0, - time_used_seconds: 0, turns_used: 0, max_turns: DEFAULT_MAX_TURNS, pause_reason: None, @@ -251,14 +250,9 @@ impl GoalManager { goal_repo::delete_by_thread_id(&self.pool, &self.thread_id).await } - /// Account usage after a turn. Increments turn count, tokens, and time. - pub async fn account_usage( - &self, - goal_id: &str, - tokens: i64, - time_seconds: i64, - ) -> Result<(), AppError> { - goal_repo::account_usage(&self.pool, goal_id, tokens, time_seconds, 1).await + /// Account usage after a turn. Increments turn count and tokens. + pub async fn account_usage(&self, goal_id: &str, tokens: i64) -> Result<(), AppError> { + goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await } // ── Auto-resume ── @@ -609,20 +603,19 @@ impl GoalManager { } } + // Bump goal turn counter for any run that did real work. We still consult + // run duration to filter out zero-work runs (e.g. 
an immediately-interrupted + // run shouldn't burn a turn against max_turns); active running time is + // tracked separately on thread_runs.elapsed_running_secs and is no longer + // billed against the goal here. if let Some(run_seconds) = crate::persistence::repo::run_repo::get_run_duration(&self.pool, run_id) .await .unwrap_or(None) { - let paused_seconds = self.lock_runtime().take_run_paused_seconds(run_id).max(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - self.account_usage(&current.id, 0, billable_seconds) - .await - .ok(); + if run_seconds > 0 { + self.account_usage(&current.id, 0).await.ok(); } - } else { - self.lock_runtime().take_run_paused_seconds(run_id); } let updated = self.get_active().await?; diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index 1868fb28..62f129d2 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -119,7 +119,6 @@ pub struct GoalRecord { pub status: GoalStatus, pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, pub pause_reason: Option, @@ -151,7 +150,6 @@ pub struct GoalDto { #[serde(skip_serializing_if = "Option::is_none")] pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -184,7 +182,6 @@ impl From for GoalDto { status: r.status, token_budget: r.token_budget, tokens_used: r.tokens_used, - time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, pause_reason: r.pause_reason, @@ -218,7 +215,6 @@ pub struct GoalPayload { pub objective: String, pub status: GoalStatus, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -250,7 +246,6 @@ impl From for GoalPayload { objective: r.objective, status: r.status, tokens_used: r.tokens_used, -
time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, token_budget: r.token_budget, diff --git a/src-tauri/src/persistence/repo/goal_repo.rs b/src-tauri/src/persistence/repo/goal_repo.rs index 3424104a..72a3c53a 100644 --- a/src-tauri/src/persistence/repo/goal_repo.rs +++ b/src-tauri/src/persistence/repo/goal_repo.rs @@ -5,7 +5,7 @@ use crate::model::errors::AppError; use crate::model::goal::{GoalRecord, GoalStatus, PauseReason}; const SELECT_COLUMNS: &str = "id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ + turns_used, max_turns, pause_reason, pause_detail, evidence, \ last_evaluated_run_id, judge_passed, judge_completeness, judge_findings, judge_summary, \ judge_evaluated_run_id, created_at, updated_at"; @@ -19,7 +19,6 @@ struct GoalRow { status: String, token_budget: Option, tokens_used: i64, - time_used_seconds: i64, turns_used: i64, max_turns: i64, pause_reason: Option, @@ -44,7 +43,6 @@ impl GoalRow { status: GoalStatus::from_str(&self.status), token_budget: self.token_budget, tokens_used: self.tokens_used, - time_used_seconds: self.time_used_seconds, turns_used: self.turns_used, max_turns: self.max_turns, pause_reason: self.pause_reason.map(|s| PauseReason::from_str(&s)), @@ -98,9 +96,9 @@ pub async fn insert(pool: &SqlitePool, record: &GoalRecord) -> Result<(), AppErr let now = Utc::now().to_rfc3339(); sqlx::query( "INSERT INTO goals (id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ + turns_used, max_turns, pause_reason, pause_detail, evidence, \ last_evaluated_run_id, created_at, updated_at) \ - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ) .bind(&record.id) .bind(&record.thread_id) @@ -108,7 +106,6 @@ pub async fn insert(pool: &SqlitePool, record: &GoalRecord) -> 
Result<(), AppErr .bind(record.status.as_str()) .bind(record.token_budget) .bind(record.tokens_used) - .bind(record.time_used_seconds) .bind(record.turns_used) .bind(record.max_turns) .bind(record.pause_reason.as_ref().map(|r| r.as_str())) @@ -151,19 +148,16 @@ pub async fn account_usage( pool: &SqlitePool, id: &str, tokens_delta: i64, - time_delta_seconds: i64, turns_delta: i64, ) -> Result<(), AppError> { sqlx::query( "UPDATE goals SET \ tokens_used = tokens_used + ?, \ - time_used_seconds = time_used_seconds + ?, \ turns_used = turns_used + ?, \ updated_at = ? \ WHERE id = ?", ) .bind(tokens_delta) - .bind(time_delta_seconds) .bind(turns_delta) .bind(Utc::now().to_rfc3339()) .bind(id) diff --git a/src-tauri/src/persistence/repo/run_repo.rs b/src-tauri/src/persistence/repo/run_repo.rs index 29265a43..ad783f65 100644 --- a/src-tauri/src/persistence/repo/run_repo.rs +++ b/src-tauri/src/persistence/repo/run_repo.rs @@ -1152,44 +1152,6 @@ mod tests { "expected running segment to be added, got {elapsed}" ); } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_returns_positive_for_running() { - let pool = setup_test_pool().await; - // Insert a running run with a past started_at so elapsed > 0 - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-active', 't1', 'default', 'running', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap() - .expect("should return elapsed seconds for running run"); - // With started_at in the past, elapsed should be > 0 - assert!(duration > 0, "expected positive elapsed, got {duration}"); - } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_skips_terminal_runs() { - let pool = setup_test_pool().await; - // Insert a completed run (should be skipped) - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, 
run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-done', 't1', 'default', 'completed', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap(); - assert!(duration.is_none(), "should skip completed runs"); - } } /// Get the duration in seconds of the last completed run for a thread. @@ -1231,46 +1193,6 @@ pub async fn get_run_duration(pool: &SqlitePool, run_id: &str) -> Result Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE id = ? - LIMIT 1", - ) - .bind(run_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - -/// Get the elapsed seconds of any currently active (non-terminal) run for a thread. -/// Returns None if no active run exists. -pub async fn get_active_run_elapsed_seconds( - pool: &SqlitePool, - thread_id: &str, -) -> Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE thread_id = ? - AND status NOT IN ('completed','failed','denied','interrupted','cancelled','limit_reached') - ORDER BY started_at DESC - LIMIT 1", - ) - .bind(thread_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - /// Bulk-fetch the Unix-millisecond start timestamp of the currently active /// (non-terminal) run for each thread in `thread_ids`. Threads without an /// active run are simply absent from the returned map. 
Used by the sidebar diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index ecff3198..157d7704 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -138,7 +138,6 @@ mod tests { let after_first = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_first.turns_used, goal.turns_used + 1); - assert_eq!(after_first.time_used_seconds, 42); assert_eq!(after_first.last_evaluated_run_id.as_deref(), Some("run-1")); let second = mgr @@ -150,10 +149,6 @@ mod tests { let after_second = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_second.turns_used, after_first.turns_used); - assert_eq!( - after_second.time_used_seconds, - after_first.time_used_seconds - ); } #[tokio::test] @@ -233,7 +228,7 @@ mod tests { let goal = mgr.create_goal("Test goal", None).await.unwrap(); // Set turns_used to at least max_turns via account_usage - goal_repo::account_usage(&pool, &goal.id, 0, 0, goal.max_turns) + goal_repo::account_usage(&pool, &goal.id, 0, goal.max_turns) .await .unwrap(); @@ -373,7 +368,7 @@ mod tests { let goal = mgr.create_goal("Test goal", Some(500)).await.unwrap(); // Accumulate tokens to reach the budget - goal_repo::account_usage(&pool, &goal.id, 500, 0, 0) + goal_repo::account_usage(&pool, &goal.id, 500, 0) .await .unwrap(); diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index 97ec8701..f87adfc6 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -93,7 +93,6 @@ export interface GoalStoreState { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null; diff --git a/src/services/bridge/agent-commands.test.ts b/src/services/bridge/agent-commands.test.ts index 25695b82..eb521ca7 100644 --- 
a/src/services/bridge/agent-commands.test.ts +++ b/src/services/bridge/agent-commands.test.ts @@ -366,7 +366,6 @@ function makeGoalPayload(overrides: Partial = {}): GoalPayload { objective: "Build a todo app", status: "active", tokensUsed: 0, - timeUsedSeconds: 0, turnsUsed: 0, maxTurns: 50, tokenBudget: null, diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index d6ec3012..9e1c17ac 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -727,7 +727,6 @@ export type GoalPayload = { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null;