From d19a9e8b80e45f2492eee55324f83b5c84e80d96 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 11:54:01 +0800 Subject: [PATCH 01/16] =?UTF-8?q?feat(goal):=20=E2=9C=A8=20replace=20self-?= =?UTF-8?q?attestation=20goal=5Fscored=20with=20independent=20Judge=20acce?= =?UTF-8?q?ptance=20agent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the `goal_scored` tool that allowed the main agent to self-attest goal completion, replacing it with an `agent_judge` built-in subagent that independently verifies goal attainment against the project's current state. Key changes: - Add `SubagentProfile::Judge` with read-only file tools and diagnostic-only shell (soft constraint via prompt) - Add `JudgeReport` structured contract (passed, completeness_pct, findings, summary) with safe fallback parsing - Add `agent_judge` tool injection only for the main agent when an unverified goal exists; runtime gate blocks subagent/parallel recursion into Judge - Add DB migration for `judge_passed`, `judge_completeness`, `judge_findings`, `judge_summary`, `judge_evaluated_run_id` columns with backfill for legacy `status='complete'` goals - Replace continuation stop condition: `Complete && judge_passed` instead of `goal_scored`-driven status flip - Rewrite continuation prompt to instruct main agent to call `agent_judge` and follow findings on rejection - Add Judge prompt surface, templates, and output contract - Update `active_goal.tpl.md` to reflect Judge acceptance flow - Extend goal lifecycle tests for Judge pass/fail/legacy compat --- docs/goal-judge-evaluation-refactor.md | 346 +++++++++++++ .../20260607000000_goal_judge_fields.sql | 17 + src-tauri/src/core/agent_session.rs | 37 +- src-tauri/src/core/agent_session_execution.rs | 453 +++++++++--------- src-tauri/src/core/agent_session_tools.rs | 28 +- src-tauri/src/core/goal_manager.rs | 124 +++-- .../prompt/sources/custom_subagent_body.rs | 19 + .../sources/subagent_output_contract.rs 
| 4 + src-tauri/src/core/prompt/surface.rs | 4 + .../src/core/prompt/surface_extensions.rs | 5 + .../core/prompt/templates/active_goal.tpl.md | 20 +- .../core/prompt/templates/subagent/judge.md | 24 + .../subagent/output_contract.judge.md | 21 + src-tauri/src/core/subagent/judge_contract.rs | 287 +++++++++++ src-tauri/src/core/subagent/mod.rs | 2 + src-tauri/src/core/subagent/orchestrator.rs | 12 + .../core/subagent/runtime_orchestration.rs | 147 ++++++ src-tauri/src/gateway/gateway_runner.rs | 2 +- src-tauri/src/ipc/frontend_channels.rs | 7 +- src-tauri/src/model/goal.rs | 43 +- src-tauri/src/model/subagent.rs | 2 +- src-tauri/src/persistence/repo/goal_repo.rs | 80 +++- src-tauri/tests/goal_lifecycle.rs | 145 +++++- src/i18n/locales/en.ts | 1 + src/i18n/locales/zh-CN.ts | 1 + .../workbench-shell/model/thread-store.ts | 5 + .../workbench-shell/ui/goal-status-bar.tsx | 2 +- .../ui/runtime-thread-surface.tsx | 6 +- src/services/bridge/agent-commands.ts | 7 +- 29 files changed, 1517 insertions(+), 334 deletions(-) create mode 100644 docs/goal-judge-evaluation-refactor.md create mode 100644 src-tauri/migrations/20260607000000_goal_judge_fields.sql create mode 100644 src-tauri/src/core/prompt/templates/subagent/judge.md create mode 100644 src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md create mode 100644 src-tauri/src/core/subagent/judge_contract.rs diff --git a/docs/goal-judge-evaluation-refactor.md b/docs/goal-judge-evaluation-refactor.md new file mode 100644 index 00000000..ebea6c5f --- /dev/null +++ b/docs/goal-judge-evaluation-refactor.md @@ -0,0 +1,346 @@ +# Goal 评估与续行重构方案:引入 Judge 验收 Agent + +> 状态:设计方案(待评审) +> 关联模块:`src-tauri/src/core/goal_manager.rs`、`src-tauri/src/core/subagent/`、`src-tauri/src/core/agent_run_event_handler.rs`、`src-tauri/src/model/goal.rs` +> 决策基线(已澄清): +> 1. **保留全部现有护栏**(idle 空转、clarify/update_plan 暂停、token/turn 预算上限),仅把“是否完成”的判定从自主声明改为 Judge 验收。 +> 2. 
**复用 `GoalStatus::Complete` 状态** 表达“通过验收”,并在 `goals` 表新增 Judge 评估字段持久化最近一次裁决;迁移需把存量 `status='complete'` goal 回填为 `judge_passed=1`。 +> 3. **由主 agent 主动调用 `agent_judge`**,系统在 run 终止后通过续行 prompt 引导主 agent 先验收、未通过则修复后重验。 +> 4. **`agent_judge` 是主 agent 专属工具**:只在有未完成 goal 时注入主 agent,且运行时必须硬性拒绝任何 subagent 递归调用 Judge,即使工具名被 `RuntimeOrchestrationTool::parse()` 解析出来也不能放行。 +> 5. **Judge 使用诊断型 shell 软约束**:Judge 的文件工具保持只读;允许 `shell` 仅用于测试、type-check、lint、只读检查等诊断验证,并通过 Judge prompt 明确禁止用 shell 修改文件、删除数据、安装依赖或改变全局状态。首版不新增受限 shell 沙箱。 +> 6. **Judge 默认使用 primary 模型角色**,优先保证验收质量;首版不把 Judge/subagent 的 token 单独计入 goal token budget,也不新增 Judge 专属硬超时,沿用现有 helper run 的 turn/取消机制。 +> 7. **删除失效的自主完成路径**:移除 `goal_scored`、`GoalVerdict::Complete` 的旧自证语义,以及由 `goal_scored` 空 evidence 触发的 `NoEvidence` / `MISSING_EVIDENCE_PROMPT` 分支。 + +--- + +## 1. 背景与问题 + +当前 goal 的"完成"判定依赖主 agent 自主调用 `goal_scored(status, evidence, pledge)` 工具来声明达成。这是一种**自证式(self-attestation)**设计: + +- 工具内部只校验 `status == "complete"`、`pledge` 文本逐字匹配、`evidence` 非空(见 `agent_session_execution.rs` 的 `execute_goal_tool()`)。 +- 它**无法验证 evidence 的真伪**,也无法核对结果是否真的满足 goal 的一致性与完整性。 + +实测发现部分模型即便明知仍有未完成项,也会照抄 pledge 文本、编造 evidence 来调用 `goal_scored` 并提前结束任务。pledge + evidence 非空这类形式化护栏对"不诚实声明"无效,这是自主声明方式的**设计缺陷**。 + +**核心思路**:把"完成判定权"从被评估者(主 agent)手中移交给独立的评估者(Judge Agent)。主 agent 不能再自己宣布通过;只有 Judge 基于 goal 内容对项目当前状态做出"通过"裁决,goal 记录才会扭转为通过验收状态。续行监督也随之改为以"是否通过验收"为准。 + +--- + +## 2. 
现状梳理(已确认事实) + +### 2.1 Goal 数据模型与持久化 + +- `GoalStatus`(`src-tauri/src/model/goal.rs`):`Active` / `Paused` / `BudgetLimited` / `Complete` 四态。 +- `goals` 表(`migrations/20260530000000_goals.sql` 及后续迁移):每 `thread_id` 唯一一条 goal;含 `status`、`evidence`、`tokens_used`、`turns_used`、`max_turns`、`pause_reason`、`last_evaluated_run_id` 等列。 +- `GoalManager`(`src-tauri/src/core/goal_manager.rs`)封装 CRUD + 评估 + prompt 生成。关键方法:`mark_complete(goal_id, evidence)`、`evaluate_after_turn(response, goal) -> GoalVerdict`(同步 CPU 启发式)、`evaluate_after_run(run_id, response) -> GoalEvaluationOutcome`(异步、含去重 CAS)。 + +### 2.2 `goal_scored` 工具链路 + +- 工具定义在 `agent_session_tools.rs` 的 `runtime_tools_for_profile()`,常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE` 在 `goal_manager.rs`。 +- 调用分派在 `agent_session_execution.rs::execute_tool_call()` → `execute_goal_tool()`:校验 status/pledge/evidence → `mark_complete()` → 发送 `GoalCompleted` + `GoalStateUpdated` 事件。 + +### 2.3 续行监督逻辑 + +- run 终止后,`agent_run_event_handler.rs::maybe_continue_goal_after_terminal_run()` 是入口。 +- 前置条件:`goal_continuation_enabled == true`、`final_status ∈ {Completed, Interrupted}`。 +- 调用 `evaluate_after_run()` 内部走 `evaluate_after_turn()` 分层启发式: + - **Layer 1** 工具阻塞:`clarify` → `Paused(ClarifyPending)`;`update_plan` → `Paused(PlanPending)`;`goal_scored` 放行。 + - **Layer 2** idle/完成声明:连续 idle ≥ `MAX_IDLE_TURNS(3)` → `Paused(IdleBlocked)`;检测到完成关键词但未调工具 → `ChallengeEvidence`(反复声称达上限 → `IdleBlocked`)。 + - **Layer 3** 预算:tokens 超 budget → `BudgetLimited`;turns 超 `max_turns` → `Paused(BudgetExhausted)`。 + - 默认 → `Continue`。 +- verdict 为 `Continue` / `ChallengeEvidence` 时,用 continuation prompt 启动新 run;`Paused` / `BudgetLimited` / `skipped` 时不续行。 +- **关键现状**:续行从不查询 goal 的 `Complete` 状态。它实际依靠"模型没有再触发任何阻塞/完成信号 + goal 仍 `Active`"间接推断。一旦 `goal_scored` 被调用,`mark_complete()` 把 status 写成 `Complete`,下一轮 `evaluate_after_run()` 因 goal 非 `Active` 返回 `skipped`,从而停止续行。 + +### 2.4 Subagent 机制 + +- 内建 subagent:`Explore`、`Review`、`Parallel`,定义在 
`subagent/runtime_orchestration.rs` 的 `RuntimeOrchestrationTool` / `SubagentProfile`。 +- 深度模型:主 agent = depth 1;主 agent 直接子代理 = depth 2(`MAIN_AGENT_CHILD_DEPTH`);`GLOBAL_MAX_DELEGATION_DEPTH = 5`;内建默认 `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH = 3`。 +- 委派校验:`orchestrator.rs::validate_delegation_capability(caller, target_tool, target_profile, child_depth)`,三重检查(调用方 `can_delegate`、全局上限、目标 `max_delegation_depth`)。 +- 权限模型:`Explore` 只读(read/list/find/search/web_search,`can_delegate=false`);`Review` 只读 + 诊断 shell + git/term 只读(`can_delegate=true`);`Custom` 按 `allowed_tools` 白名单。 +- 工具注入:主 agent 在 `agent_session_tools.rs::runtime_tools_for_profile()` 中 `tools.extend(runtime_orchestration_tools())`;自定义在 `agent_session.rs::build_session_spec()` 注入。 +- Prompt 注入:`build_helper_system_prompt()` 按 `PromptSurface`(`prompt/surface.rs`)选择 section;task 通过 `agent.prompt(request.task)` 注入为 user message。 + +--- + +## 3. 设计目标 + +1. 新增内建 **Judge** subagent,对项目当前状态做 goal 达成度评估,结构化返回:通过与否(bool)、完整度百分比、判定依据(未达成/不符合点描述)。 +2. Judge 通过时**扭转 goal 记录为通过验收状态**(复用 `Complete` + 持久化 Judge 字段)。 +3. Judge 上下文注入 goal 内容,评估重点是 goal 要求的**一致性**与**完整性**。 +4. Judge 文件工具保持**只读**,允许 `read` / `list` / `find` / `search` / `web_search`;允许 `shell` 但仅作为诊断型软约束工具用于测试、type-check、lint、只读检查;允许再发起 subagent(含并行,如 explore/review 协助),**自身最大被委派深度为 2**。 +5. **删除 `goal_scored` 工具**。完成判定不再由主 agent 自证。 +6. 续行监督改为:判定 goal 记录是否“通过验收”;未通过且 goal 仍 Active 则续行,并在 continuation prompt 中明确要求主 agent 调用 `agent_judge` 验收并遵循验收结果。 +7. **按需注入**:仅当 thread 有未通过验收的 goal 时,才向**主 agent**注入 `agent_judge` 工具;所有 subagent 均不注入且运行时拒绝递归调用 `agent_judge`;无 goal 或已验收通过时不注入。 + +--- + +## 4. 
总体设计 + +### 4.1 角色与职责重划 + +| 角色 | 重构前 | 重构后 | +|------|--------|--------| +| 主 agent | 自己调 `goal_scored` 声明完成 | 干活 + 自认为完成后调 `agent_judge` 申请验收;不能自证完成 | +| Judge agent | 不存在 | 独立验收者,文件工具只读且 shell 仅诊断软约束,基于 goal 评估项目当前状态,产出结构化裁决;通过则扭转 goal 状态 | +| 续行监督 | 间接依赖 goal 非 Active 停续行 | 显式以"goal 是否通过验收(Complete + judge_passed)"为停续行依据 | + +### 4.2 端到端数据流 + +``` +用户 /goal + └─ goal_set() → create_goal(status=Active) + └─ 注入 ActiveGoalSource 到主 agent system prompt(更新文案:完成须经 agent_judge 验收) + └─ 按需向主 agent 注入 agent_judge 工具(goal 存在且尚未通过验收) + +主 agent run:工作 → 自认为达成 → 调用 agent_judge(task) + └─ execute_tool_call() 路由到 Judge 编排 + └─ HelperAgentOrchestrator::run_helper(SubagentProfile::Judge) + ├─ build_helper_system_prompt(PromptSurface::SubagentJudge) + 注入 goal objective 到上下文 + ├─ Judge 工具集:read/list/find/search/web_search/shell(仅诊断软约束) + (depth 允许时)agent_explore/agent_review/agent_parallel + ├─ Judge 调研验证:读代码、搜索、运行测试/type-check/lint 等诊断命令、并行 explore/review + └─ 产出结构化 JudgeReport { passed, completeness_pct, findings, summary } + └─ Judge 编排回写 goal 记录: + ├─ 总是:persist 最近一次 judge_passed / judge_completeness / judge_findings / judge_summary / judge_evaluated_run_id + └─ passed == true:事务写入 status=Complete + judge_passed=true + evidence=summary + 发送 GoalCompleted + GoalStateUpdated 事件 + └─ agent_judge 工具结果(JudgeReport 文本)返回给主 agent + +run 终止 + └─ maybe_continue_goal_after_terminal_run() + └─ evaluate_after_run() + ├─ 若 goal.status == Complete && goal.judge_passed == true(已通过验收)→ skipped(停续行)✅ + ├─ 若 goal.status != Active → skipped(非活跃 goal 不自动续行,保留现有暂停/预算语义) + ├─ 否则保留现有护栏:clarify/update_plan/idle/预算 → Paused/BudgetLimited + └─ 否则 → Continue:注入新版 continuation prompt + "你尚未通过验收。请先用 agent_judge 验收;若上次验收未通过, + 按 findings 修复后再次调用 agent_judge。" + └─ Continue → 启动新 run(回到主 agent run) +``` + +### 4.3 为什么选择这套方案(与备选对比) + +- **复用 `Complete` 而非新增 `Verified` 枚举**:`Complete` 在 DDL CHECK 约束、`GoalStatus` 枚举、前端状态条、gateway 文案中均已铺开。新增枚举值需要同步迁移、前端、序列化多处,收益有限。改为复用 `Complete` 并以 `judge_passed` 布尔列区分"是否经 
Judge 验收",改动面最小且语义清晰(通过验收 = `Complete` 且 `judge_passed=true`)。 +- **保留全部护栏**:Judge 解决的是"完成判定的可信度",而 idle 空转、clarify/update_plan 暂停、预算上限解决的是"防止无限续行/资源失控/阻塞等待"。两者正交,移除护栏会让无 goal 评估能力时的兜底消失,引入失控风险。 +- **主 agent 主动调用 + 续行引导**(而非系统自动发起 Judge):保持与现有 subagent 调用模型一致(主 agent 通过工具调用委派),实现侵入小;系统侧只需在续行 prompt 中“催”主 agent 去验收,无需在 run 终止后再隐式拉起一个评估 run 改变运行时调度。续行 prompt 会持续施压,直到 goal 被 Judge 标记通过,规避了“主 agent 不调 Judge 就永远不验收”的死角。 +- **Judge 作为主 agent 专属内建工具**:虽然 `agent_judge` 会加入 `RuntimeOrchestrationTool::parse()`,但它不进入 `builtin_all()` 和 `delegation_tools_for_helper()`,也不允许 subagent 递归调用。这样保留统一工具解析与 helper 编排复用,同时避免 explore/review/custom/Judge 自己绕过“主 agent 申请验收”的职责边界。 +- **诊断型 shell 软约束而非新沙箱**:Judge 需要能运行测试、type-check、lint 等验证命令,因此首版复用现有 `shell` 工具;但该工具能力本身不是硬只读,必须在 Judge prompt 中明确限制为诊断用途,禁止修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。新建受限 shell/test-runner 工具会扩大改动面,首版暂不引入。 +- **Judge 使用 primary 模型角色**:验收质量优先于成本,Judge 默认走 `model_plan.primary`。Explore/Review 继续保持现有模型策略,Judge 内部再委派时由各子代理自己的模型映射决定。 + +### 4.4 首版范围边界 + +首版目标是打通后端 Judge 验收闭环:工具注入、subagent 运行、结构化解析、goal 回写、续行停止、迁移兼容和测试覆盖。前端仅同步类型并在现有状态条显示“已验收通过”这一最小信息;`judge_completeness` 的精细 UI、额外事件、ACP/gateway 的详细状态展示、Judge token 单独计入 goal budget、Judge 专属超时或受限 shell 沙箱均作为后续增强,不进入首版。 + +--- + +## 5. 
详细实现 + +### 5.1 Judge subagent profile(`subagent/runtime_orchestration.rs`) + +- `RuntimeOrchestrationTool` 新增变体 `Judge`,工具名映射 `agent_judge`;`parse("agent_judge") -> Some(Judge)`。同时补齐 `tool_name()`、`title()`、`description()`、`profile()`、`as_agent_tool()` 的 match 分支,`as_agent_tool()` 的 schema 只需要 `task: string`。 +- `SubagentProfile` 新增 `Judge` 变体,并补齐 `helper_kind()`(固定返回 `helper_judge`)、`system_prompt()`、`can_delegate()`、`max_delegation_depth()`、`helper_tools()` 等 match 分支。 +- `resolve_helper_profile()` 增加 `RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge)`;`resolve_helper_model_role()` 增加 Judge 分支,默认使用 `model_plan.primary`,不要复用 Explore/Review 的 auxiliary 映射。 +- `helper_tools()` for `Judge`:`read` / `list` / `find` / `search` / `web_search`(条件启用)/ `shell`(仅诊断验证)。**不含** `edit` / `write` / `term_write` / `term_restart` / `term_close`。需要在工具描述和 Judge prompt 中明确:`shell` 只能运行测试、type-check、lint、只读检查等诊断命令,不能修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。这是 prompt 软约束,不是硬沙箱。 +- `can_delegate()` for `Judge`:`true`(允许 explore/review/parallel 协助)。 +- `max_delegation_depth()` for `Judge`:`2`(即 Judge **自身最大被委派深度为 2**——主 agent depth 1 直接委派 Judge 得到 depth 2,符合 `MAIN_AGENT_CHILD_DEPTH=2`;同时这意味着 Judge 内部委派的子级会是 depth 3,需在 `delegation_tools_for_helper()` 中据此过滤)。 + > 注意:需求所述“自身最大被委派深度为2”指 Judge 作为被委派目标时允许出现在 depth ≤ 2。为了让 Judge 仍能发起 explore/review/parallel(depth 3 子级),`delegation_tools_for_helper(child_depth)` 对内建目标的过滤阈值需复核:Judge 在 depth 2 调用子级时 `child_depth=3`,仍 ≤ `GLOBAL_MAX_DELEGATION_DEPTH(5)` 且 ≤ explore/review 的 `max_delegation_depth(3)`,故可注入。实现时确保 `validate_delegation_capability` 对 Judge→explore/review 放行。 +- `delegation_tools_for_helper()` 仍只注入 Explore / Review / Custom / Parallel,**不得注入 Judge**。这使 Judge 可以委派其他 helper,但任何 helper 不能委派 Judge。 +- `RESERVED_SUBAGENT_SLUGS` 增加 `"judge"`,防止自定义 subagent 占用该 slug。由于 `RuntimeOrchestrationTool::parse()` 对 `agent_{slug}` 有通配解析,保留 slug 能避免 `agent_judge` 与自定义工具名冲突。 +- `runtime_orchestration_tools()` **不无条件包含 Judge**:Judge 改为按需注入(见 
5.6),`builtin_all()` 保持仅含 explore/review/parallel,Judge 单独由主 agent 工具组装处按 goal 条件 push。 + +### 5.2 Judge 结构化协议(新增 `subagent/judge_contract.rs`) + +参照 `review_contract.rs` / `parallel_contract.rs` 模式新增: + +```rust +/// agent_judge 工具的入参(主 agent 传入)。 +pub struct JudgeRequest { + pub task: String, // 主 agent 对"为何认为达成"的说明 / 关注点 +} + +/// Judge 评估结构化产出。 +#[derive(Serialize, Deserialize)] +pub struct JudgeReport { + pub passed: bool, // 是否通过验收 + pub completeness_pct: u8, // 0-100 完整度百分比 + pub findings: Vec<String>, // 未达成 / 不符合 goal 的具体点(passed=false 时必填) + pub summary: String, // 判定依据总述,作为通过时的 evidence +} +``` + +- Judge 的 system prompt(模板 `prompt/templates/subagent/judge.md`)强制要求最终以可解析的结构化形式(JSON 块或约定字段)返回上述四项。 +- `passed=true` 时 `summary` 必须非空,作为 `mark_complete()` 的 evidence;如果 Judge 输出 `passed=true` 但 `summary` 为空,解析层必须降级为 `passed=false`,避免无证据完成。 +- `completeness_pct` 解析后必须 clamp 到 0-100;`passed=false` 时 `findings` 必须非空,若模型未给出 findings,则把原始输出或“Judge did not provide actionable findings”写入 findings。 +- Judge 编排在拿到 Judge 文本输出后解析为 `JudgeReport`;解析失败按 `passed=false` 处理并把原始文本塞入 `findings`,避免误判通过。 + +### 5.3 Judge prompt surface 与上下文注入 + +- `prompt/surface.rs::PromptSurface` 新增 `SubagentJudge { inherited_run_mode }`。 +- `SurfacePattern::matches()` 同步更新:`AnySubagent` 必须匹配 `SubagentJudge`;`BuiltinSubagent` 也必须匹配 `SubagentJudge`,因为 Judge 是内建 subagent。若某些 prompt section 只应给 Explore/Review 而不应给 Judge,应改用更精确的 matcher 或新增 pattern,避免误注入。 +- `build_helper_system_prompt()` 增加 `SubagentProfile::Judge` → `PromptSurface::SubagentJudge { inherited_run_mode }` 映射。 +- `prompt/sources/custom_subagent_body.rs` 增加 Judge 模板映射:Judge → `templates/subagent/judge.md`。 +- `prompt/templates/subagent/judge.md`:定义 Judge 角色——独立验收员,只读评估,重点核对 goal 的一致性与完整性;说明可用工具(含诊断型 `shell`、可委派 explore/review/parallel);要求输出结构化 `JudgeReport`;明确禁止修改文件。`shell` 约束必须写成硬性行为指令:只能运行测试、type-check、lint、只读检查;不得通过 shell 编辑/删除文件、安装依赖、改变全局状态、启动交互式或长期驻留进程。 +- `prompt/sources/subagent_output_contract.rs` 增加 Judge 的输出契约 
`output_contract.judge.md`,并在 contract 中重复 `passed` / `completeness_pct` / `findings` / `summary` 的字段要求和失败兜底规则。 +- **goal 内容注入采用 task 前缀方案**:Judge 上下文必须包含 goal objective,且由 `agent_session_execution.rs` 的 Judge 分支在构造 helper task 时注入,不新增 DB 读取型 prompt source。 +- task 前缀必须包含:objective、当前 goal id/status、最近一次 Judge findings/summary(若有)、主 agent 传入的 `task` 说明。这样 Judge 不依赖主 agent 自述即可核对目标。 + +### 5.4 Judge 编排与 goal 回写(`agent_session_execution.rs` + `goal_manager.rs`) + +- `execute_tool_call()`:`RuntimeOrchestrationTool::parse()` 命中 `Judge` 时进入 Judge 专用分支,不直接走普通 `execute_helper_tool()` 返回路径。该分支可复用 `resolve_helper_delegate()` / `HelperAgentOrchestrator::run_helper()`,但必须在 helper 完成后追加 JudgeReport 解析和 goal 回写。 +- Judge 分支额外步骤: + 1. 调用前从 DB 加载当前 thread 的未完成 goal;无 goal 或 goal 已 `Complete && judge_passed=true` 则返回错误(agent_judge 仅在有 goal 时可用,理论上不会被注入)。 + 2. 把 `goal.objective`、goal id/status、最近一次 judge findings/summary、主 agent 传入的 `task` 拼成 Judge task 上下文。 + 3. 以 `SubagentProfile::Judge`、`RuntimeOrchestrationTool::Judge`、depth 2 启动 helper run;模型角色使用 `model_plan.primary`。 + 4. Judge run 结束后解析 `JudgeReport`;解析失败或字段非法按 `passed=false` 处理。 + 5. 调用新增 `GoalManager::record_judge_verdict(goal_id, run_id, &report)` 持久化最近裁决;若 `report.passed`,该方法在同一事务内写入 `status=complete`、`evidence=report.summary` 与 `judge_passed=true`。 + 6. 若通过验收,发送 `GoalCompleted` + `GoalStateUpdated` 事件;若未通过,也发送 `GoalStateUpdated`,让前端/后续续行能拿到最新 findings。 + 7. 
把 `JudgeReport` 文本作为工具结果返回主 agent;通过时结果中明确提示“goal 已通过验收,请停止修改并总结”,降低同一 run 后续继续改动的风险。 +- `GoalManager` 新增方法: + - `record_judge_verdict(&self, goal_id: &str, run_id: &str, report: &JudgeReport) -> Result`:写 `judge_passed` / `judge_completeness` / `judge_findings`(JSON) / `judge_summary` / `judge_evaluated_run_id`,并返回更新后的 record 供事件 payload 使用;passed 时同一事务同步写 `status=complete` 与 `evidence=report.summary`。 +- 原子性要求:`goal_repo.rs` 增加 `record_judge_verdict()` repo 方法,在事务内更新 judge_* 字段;passed 时同事务写 `status='complete'` 与 `evidence=summary`,确保 `status=complete` 与 `judge_passed=1` 不出现半更新;未通过时保持原 status(通常 Active)不变。 +- 预算边界:首版 Judge helper run 的 token 不单独计入 goal `tokens_used`。这是明确取舍;后续若要计入,需要扩展 `HelperRunResult` 携带 usage 并在 Judge 分支回写。 +- 同轮继续修改边界:系统不强行锁定 goal 后的写工具,因为主 agent 仍处于同一 run;通过验收后的工具结果和 `active_goal.tpl.md` prompt 必须要求停止修改。若未来需要硬约束,可在 `execute_tool_call()` 中对 `Complete && judge_passed` 后的 mutating tools 增加拒绝策略,首版不做。 + +### 5.5 删除 `goal_scored` 工具 + +- 删除工具定义(`agent_session_tools.rs` 中的 `goal_scored` `AgentTool::new(...)`)。 +- 删除分派分支与 `execute_goal_tool()`(`agent_session_execution.rs`)。 +- 移除常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE`(`goal_manager.rs`),以及 `evaluate_after_turn()` 中 `detect_tool_based_blocking` 对 `goal_scored` 的放行分支。 +- 删除旧自证语义:`GoalVerdict::Complete { evidence }` 当前没有有效生产者,删除 `goal_scored` 后一并移除,并删除 `evaluate_after_run()` 中的旧 match 分支,减少死代码。 +- 删除 `ChallengePromptVariant::NoEvidence` 与 `MISSING_EVIDENCE_PROMPT`,因为它们只服务于“调用 `goal_scored` 但 evidence 为空”的旧路径;保留 completion-claim 检测对应的 `ChallengeEvidence` / `NoTool` 语义,并把文案改为“声称完成但尚未调用 `agent_judge` 验收”。 +- 护栏保留但需改写文案:`ChallengeEvidence` 与 completion-claim 检测仍作为“提醒主 agent 去验收”的软提示,引导语从“调用 goal_scored”改为“调用 agent_judge 验收”。`GUIDANCE_PROMPT` 同步更新。 +- `agent_judge` 会被 `record_tool_call()` 记录到 goal runtime tool calls;`detect_tool_based_blocking()` 不应把它视为阻塞工具,也不应触发 pause。它与普通工具调用一样表示 agent 有行动,能重置 idle 倾向。 +- 全局检索并清理 `goal_scored` 引用:系统 prompt、`active_goal.tpl.md`、gateway 文案、前端 hardcoded kickoff 
prompt、测试(`tests/goal_lifecycle.rs`)等。 + +### 5.6 按需注入 `agent_judge`(仅主 agent,仅有未完成 goal 时) + +- 注入点在主 agent 工具组装处。`runtime_tools_for_profile()` 当前是纯 profile 函数,不知道 thread goal 状态;推荐在其调用方 `build_session_spec()`(`agent_session.rs`)查询并追加 Judge 工具,避免把 DB 依赖塞进纯工具构造函数。 + - 在 `build_session_spec()` 已能访问 `pool` 与 `thread_id`,查询 `goal_repo::find_by_thread_id`,若存在且尚未通过验收,则 push `RuntimeOrchestrationTool::Judge.as_agent_tool()`。 + - “尚未通过验收”的判定为:goal 存在且不是 `status == Complete && judge_passed == true`。实际自动续行仍只对 `Active` 生效;但工具注入可允许用户在恢复/继续场景中对 `Paused` 或 `BudgetLimited` goal 重新申请验收。 + - goal 不存在或已 `Complete && judge_passed`(已验收)则不注入。 +- `runtime_tools_with_custom_subagents()` 与 extension tool 合并时需维持内建工具名优先级,防止 extension/custom 工具覆盖 `agent_judge`。 +- **subagent 不注入**:Judge 工具只在主 agent 工具集 push,不进入 `delegation_tools_for_helper()` 的候选;任何 subagent(含 Judge 自身、explore/review/custom)的可委派目标列表都不包含 `agent_judge`。 +- **运行时硬门禁**:仅“不注入”不足够,因为模型或测试仍可能构造 `agent_judge` 调用,且 `RuntimeOrchestrationTool::parse()` 会命中。必须在 subagent 递归委派路径(例如 `HelperDelegationContext::handle_delegation()` / `resolve_delegation()`)中显式拒绝 `RuntimeOrchestrationTool::Judge`,返回“agent_judge can only be called by the main agent for the current goal”之类错误。 +- `agent_parallel` 的任务列表也必须拒绝 `agent_judge`。`validate_parallel_delegate_safety()` 或解析 parallel task 的位置应把 Judge 视为非法 batch target,避免通过 parallel 间接调用 Judge。 +- 主 agent 侧 `execute_tool_call()` 的 Judge 分支也要重新查询 goal 状态,不能只依赖工具注入时的状态;这是防止 race / stale tool set 的后端 backstop。 + +### 5.7 续行监督改造(`agent_run_event_handler.rs` + `goal_manager.rs`) + +- `evaluate_after_run()` / `evaluate_after_turn()` 开头新增**显式终止判定**:若 goal 已“通过验收”(`status == Complete && judge_passed == true`)→ 返回 `skipped`(停续行)。这是停续行的**主依据**。 +- 存量兼容依赖迁移回填:迁移后不应出现旧路径产生的 `status=Complete && judge_passed=false`。如果运行时遇到该组合,按异常兼容处理并停续行或记录 warning;不要把旧 complete goal 重新拉起续行。 +- 对 `Paused` / `BudgetLimited` 仍按现有语义返回 skipped,不自动续行。只有 `Active` goal 会继续进入护栏评估。 +- 其余护栏(clarify/update_plan/idle/预算)保留,作用不变。 +- `Continue` / 
`ChallengeEvidence` verdict 的 continuation prompt 改写为新模板(替换 `CONTINUATION_PROMPT_TEMPLATE`): + +``` +[Goal continuation — turns {turns_used}/{max_turns}] + +**Objective:** {objective} + +继续推进该目标,执行下一个具体步骤。 + +⚠️ 完成判定已改为独立验收:当你认为目标已达成时,必须调用 + agent_judge(task="说明为何认为已达成 / 需重点核对的点") +由 Judge 评估项目是否满足目标的一致性与完整性。 +- 仅当 Judge 裁决 passed=true 时,目标才会被标记为通过验收并停止续行。 +- 若上一次 Judge 验收未通过,请阅读其 findings,逐项修复后再次调用 agent_judge。 +你无法自行声明完成;只有通过 Judge 验收才算达成。 + +如果你被阻塞、需要用户输入,请使用 clarify 工具。 +``` + +- 若最近一次 Judge 未通过,必须把 `judge_findings` 摘要拼接进 continuation prompt,提升修复指向性;摘要可限制长度,避免 prompt 过长。 + +### 5.8 数据库迁移 + +新增迁移 `migrations/2026XXXXXXXXXX_goal_judge_fields.sql`: + +```sql +ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool +ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable +ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable +ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable +ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable + +-- 兼容旧版本 goal_scored 已完成的 goal,避免升级后被误判为未验收。 +UPDATE goals +SET judge_passed = 1, + judge_summary = COALESCE(judge_summary, evidence), + judge_completeness = COALESCE(judge_completeness, 100) +WHERE status = 'complete'; +``` + +- `GoalRecord` / `GoalDto` / `GoalPayload`(`model/goal.rs`)同步新增字段:`judge_passed: bool`、`judge_completeness: Option`(DB 读写时校验 0-100)、`judge_findings: Option`(JSON 文本,DTO 透传字符串,前端按 string/null 接收)、`judge_summary: Option`、`judge_evaluated_run_id: Option`。 +- `goal_repo.rs` 同步更新 `SELECT_COLUMNS`、`GoalRow`、`into_record()`、`insert()`。新增 `record_judge_verdict()` repo 方法,负责写 judge_* 字段;passed 时同一事务同步写 `status='complete'` 与 `evidence=summary`。 +- 若 `judge_findings` 以 JSON array 字符串存储,写入前由 `serde_json::to_string(&report.findings)` 生成;读取失败时不要 panic,DTO 可原样返回或置为 `None` 并记录 warning。 + +### 5.9 前端、IPC、gateway 与 ACP + +- `ThreadStreamEvent` 首版复用现有 `GoalCompleted` / `GoalStateUpdated`,不新增 Judge 专属事件。`GoalPayload` 增加 judge 字段后,现有事件 payload 
即可携带最新裁决。 +- 前端 `GoalPayload` 类型(如 `src/services/bridge/agent-commands.ts`)与 store 类型(如 `src/modules/workbench-shell/model/thread-store.ts`)补充 judge 字段;状态条在 `Complete && judgePassed` 时显示“已验收通过”。`judge_completeness` 的进度/百分比 UI 为二阶段增强。 +- `goal-status-bar.tsx` 只做最小展示;若未实现详细展示,也必须保证新增字段不会破坏类型检查。 +- gateway / ACP 首版只要求文案与行为不再引用 `goal_scored`,并确保这些入口启动主 agent 时使用同一 `build_session_spec()` 注入逻辑,因此有未完成 goal 时也能拿到 `agent_judge`。详细展示 Judge findings/completeness 可后续增强。 + +--- + +## 6. 影响文件清单 + +| 文件 | 改动 | +|------|------| +| `src-tauri/src/model/goal.rs` | `GoalRecord`/`GoalDto`/`GoalPayload` 新增 judge_* 字段;删除 `GoalVerdict::Complete` 旧自证变体 | +| `src-tauri/src/core/goal_manager.rs` | 删除 `GOAL_SCORED_*` 常量与放行分支;删除 `MISSING_EVIDENCE_PROMPT` / `NoEvidence` 旧路径;新增 `record_judge_verdict()`;续行终止判定改为 `Complete && judge_passed`;改写 continuation/guidance 文案并拼接最近 findings | +| `src-tauri/src/core/subagent/runtime_orchestration.rs` | `RuntimeOrchestrationTool::Judge` + `SubagentProfile::Judge`(工具集/can_delegate/max_delegation_depth=2);`parse`/`profile`/`as_agent_tool`/`helper_kind` 等 match 补齐;保留 slug;`builtin_all()` 不含 Judge | +| `src-tauri/src/core/subagent/judge_contract.rs`(新增) | `JudgeRequest` / `JudgeReport` 结构化协议、JSON 解析、字段校验、失败兜底 | +| `src-tauri/src/core/subagent/orchestrator.rs` | `build_helper_system_prompt()` 支持 Judge surface;subagent 递归委派路径硬性拒绝 `agent_judge`;保持 Judge→explore/review/parallel 放行 | +| `src-tauri/src/core/subagent/parallel_contract.rs` / 相关 parallel 校验 | `agent_parallel` task 拒绝 `agent_judge` 作为子任务 | +| `src-tauri/src/core/agent_session_execution.rs` | 删除 `goal_scored` 分派与 `execute_goal_tool()`;新增 Judge 专用分支(加载 goal → task 前缀注入 → helper run → 解析 JudgeReport → 回写 goal → 发送事件) | +| `src-tauri/src/core/agent_session_tools.rs` | 删除 `goal_scored` 工具定义;保持基础 runtime tools 不含 Judge;如新增 helper 函数则提供 `agent_judge` 工具构造 | +| `src-tauri/src/core/agent_session.rs` | `build_session_spec()` 查询 goal,按“未通过验收”条件向主 agent 追加 `agent_judge`;`resolve_helper_model_role()` 将 Judge 映射到 
primary | +| `src-tauri/src/core/prompt/surface.rs` | `PromptSurface::SubagentJudge`;`SurfacePattern::AnySubagent` / `BuiltinSubagent` 匹配 Judge | +| `src-tauri/src/core/prompt/sources/custom_subagent_body.rs` | Judge → `templates/subagent/judge.md` | +| `src-tauri/src/core/prompt/sources/subagent_output_contract.rs` | Judge 输出契约 | +| `src-tauri/src/core/prompt/templates/subagent/judge.md`(新增) | Judge 角色、诊断型 shell 软约束、委派说明与结构化输出要求 | +| `src-tauri/src/core/prompt/templates/active_goal.tpl.md` | 完成判定改为经 agent_judge 验收,并提示通过后停止修改 | +| `src-tauri/src/core/prompt/sources/active_goal.rs` | 文案同步(如有引用) | +| `src-tauri/src/persistence/repo/goal_repo.rs` | judge_* 列读写;新增 `record_judge_verdict()`;passed 时原子写 status/evidence/judge_* | +| `src-tauri/migrations/2026XXXXXXXXXX_goal_judge_fields.sql`(新增) | judge_* 列迁移,并回填旧 `status='complete'` 为 `judge_passed=1` | +| `src-tauri/src/gateway/gateway_runner.rs` | 移除 `goal_scored` 引导文案,改为 agent_judge 验收说明 | +| `src-tauri/src/acp/**`(如有 goal 文案/事件映射) | 确认不引用 `goal_scored`;复用 GoalStateUpdated payload 的 judge 字段 | +| `src-tauri/tests/goal_lifecycle.rs` | 重写:覆盖 Judge 通过→Complete+judge_passed→停续行;未通过→续行;旧 complete 回填兼容 | +| `src-tauri/src/core/agent_session_tests.rs` / subagent tests | 覆盖 Judge profile、模型角色、工具注入、递归拒绝、parallel 拒绝、prompt surface 匹配 | +| `src/services/bridge/agent-commands.ts` | 前端 `GoalPayload` 类型新增 judge 字段 | +| `src/modules/workbench-shell/model/thread-store.ts` | `GoalStoreState` 新增 judge 字段 | +| `src/modules/workbench-shell/ui/goal-status-bar.tsx` | 最小展示 `Complete && judgePassed` 为“已验收通过” | +| `src/modules/workbench-shell/ui/runtime-thread-surface.tsx` | 清理 goal kickoff prompt 中的 `goal_scored` 示例,改为 agent_judge 验收说明 | + +--- + +## 7. 
验证计划 + +- **Rust 格式**:`cargo fmt --check --manifest-path src-tauri/Cargo.toml`。 +- **Rust 行为**:`cargo test --locked --manifest-path src-tauri/Cargo.toml`,重点 `goal_lifecycle`、subagent 委派、prompt surface 与迁移相关测试。新增/重写用例: + - Judge `passed=true` → goal 变 `Complete` 且 `judge_passed=true`,`judge_summary/evidence` 非空,下一轮 `evaluate_after_run` 返回 skipped(停续行)。 + - Judge `passed=false` → goal 仍进行中,写入 `judge_findings`,`evaluate_after_run` 返回 `Continue` 且 continuation prompt 包含最近 findings 并引导调用 `agent_judge`。 + - 存量 `status='complete'` 迁移后 `judge_passed=1`、`judge_completeness=100`,不会被新续行逻辑重新拉起。 + - `agent_judge` 仅在有未通过验收 goal 时注入主 agent;无 goal 或已验收通过时主 agent 工具集不含 `agent_judge`;任何 subagent 工具集不含 `agent_judge`。 + - 运行时门禁:subagent 直接调用 `agent_judge` 被拒绝;`agent_parallel` task 使用 `agent_judge` 被拒绝;主 agent→Judge 合法(depth 2);Judge→explore/review 合法(depth 3)。 + - Judge 模型角色使用 primary;Explore/Review 仍保持既有模型映射。 + - Prompt surface:`SubagentJudge` 能构建 system prompt;`AnySubagent` / `BuiltinSubagent` 匹配 Judge;Judge 模板包含诊断型 shell 软约束和结构化输出契约。 + - JudgeReport 解析失败、`passed=true` 但 summary 空、completeness 越界、`passed=false` findings 空 → 均视为未通过或安全兜底,不误标完成。 + - `goal_scored` 工具与常量已删除(编译期 + 检索为 0 个非历史设计文档引用)。 +- **前端**:`npm run typecheck`;若改动前端测试则 `npm run test:unit`。重点验证 `GoalPayload` / `GoalStoreState` 新字段不会破坏事件处理,`goal-status-bar.tsx` 能显示已验收通过。 +- **文案检索**:全局搜索 `goal_scored`,除历史文档/迁移注释外不应有运行时 prompt、前端提示或 gateway 文案引用。 +- **手动冒烟**:创建 goal → 主 agent 工作 → 调 agent_judge 未通过(findings)→ 续行修复 → 再次 agent_judge 通过 → goal 状态条显示已验收、续行停止。 + +--- + +## 8. 风险与边界 + +1. **主 agent 始终不调用 `agent_judge`**:goal 永远不被验收,续行会持续注入 prompt 直至护栏触发(idle/预算上限)。这正是护栏保留的价值——兜底防止无限续行。需在 prompt 中强力引导主 agent 调用 agent_judge。 +2. **Judge 误判**:Judge 也是 LLM,可能误通过或误拒。误通过风险通过“独立上下文 + 文件工具只读 + primary 模型 + 重点核对一致性/完整性 + 可跑诊断验证”降低;误拒会触发续行修复,代价是额外轮次。 +3. **诊断型 shell 不是硬只读**:Judge 可用 `shell` 意味着理论上能执行修改性命令。首版通过 Judge prompt 进行软约束,要求只运行测试、type-check、lint、只读检查,并禁止修改文件、删除数据、安装依赖、改变全局状态。若后续发现模型不稳定,应新增受限 test-runner 或 shell allowlist。 +4. 
**Judge 成本**:每次验收会拉起一个可委派的 subagent run,可能再并行 explore/review,token/时间开销不小。首版不把 Judge/subagent token 单独计入 goal budget,也不新增 Judge 专属硬超时;需在 continuation prompt 中提示主 agent“仅在确有把握达成时再申请验收”,避免频繁空验收。 +5. **深度语义边界**:Judge `max_delegation_depth=2` 必须与 `MAIN_AGENT_CHILD_DEPTH=2` 一致,且要确保 Judge 在 depth 2 仍能委派 depth 3 的 explore/review(受 `GLOBAL_MAX_DELEGATION_DEPTH=5` 与 explore/review 自身上限 3 约束,合法)。同时必须在递归委派和 parallel 路径拒绝任何 helper→Judge 调用,避免职责边界被绕过。 +6. **迁移兼容**:迁移必须回填 `UPDATE goals SET judge_passed=1, judge_completeness=100 ... WHERE status='complete'`。运行时若遇到 `Complete && !judge_passed`,应记录 warning 并停续行,不能把存量已完成 goal 重新拉起。 +7. **gateway / ACP 路径**:微信/企微与 ACP 同样依赖 goal 续行,首版需确认这些入口创建主 agent run 时走同一 `build_session_spec()` 注入逻辑,且 prompt/gateway 文案不再提 `goal_scored`。 +8. **同轮继续修改**:Judge 通过后主 agent 仍可能在同一 run 继续调用其他工具。首版不做写工具硬锁,通过 Judge 工具结果和 `active_goal.tpl.md` prompt 要求停止修改;若后续发现问题,再加 `Complete && judge_passed` 后 mutating tools 拒绝策略。 +9. **跨平台**:主体为 Rust/SQLite/prompt/TypeScript 类型改动,应保持跨平台兼容;shell 诊断命令由 Judge 根据项目现有命令选择,prompt 中需提醒避免平台特定假设。 diff --git a/src-tauri/migrations/20260607000000_goal_judge_fields.sql b/src-tauri/migrations/20260607000000_goal_judge_fields.sql new file mode 100644 index 00000000..11dd6954 --- /dev/null +++ b/src-tauri/migrations/20260607000000_goal_judge_fields.sql @@ -0,0 +1,17 @@ +-- Goal Judge verification fields: persist the most recent independent Judge +-- verdict for a goal. Acceptance is expressed as status='complete' AND +-- judge_passed=1 (the main agent can no longer self-attest completion). 
+ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool +ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable +ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable +ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable +ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable + +-- Backfill goals already completed via the legacy goal_scored path so that an +-- upgrade does not treat them as un-verified (which would otherwise let goal +-- continuation re-open them). +UPDATE goals +SET judge_passed = 1, + judge_summary = COALESCE(judge_summary, evidence), + judge_completeness = COALESCE(judge_completeness, 100) +WHERE status = 'complete'; diff --git a/src-tauri/src/core/agent_session.rs b/src-tauri/src/core/agent_session.rs index 7a5eb557..2c7be27e 100644 --- a/src-tauri/src/core/agent_session.rs +++ b/src-tauri/src/core/agent_session.rs @@ -601,6 +601,34 @@ pub async fn build_session_spec( .await .map(|settings| settings.is_ready()) .unwrap_or(false); + + let mut runtime_tools = runtime_tools_with_custom_subagents( + runtime_tools_with_web_search( + runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), + &tool_profile_name, + web_search_enabled, + ), + custom_subagent_tools, + ); + + // Inject the main-agent-only `agent_judge` acceptance tool on demand: only + // when this thread has a goal that has not yet passed Judge acceptance + // (acceptance = status Complete AND judge_passed). It is appended after the + // custom/extension merge so that the built-in tool name always wins and + // cannot be shadowed by a custom or extension tool. 
+ if let Ok(Some(goal)) = + crate::persistence::repo::goal_repo::find_by_thread_id(pool, thread_id).await + { + let already_verified = + goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed; + if !already_verified { + let judge_tool = crate::core::subagent::RuntimeOrchestrationTool::Judge.as_agent_tool(); + if !runtime_tools.iter().any(|t| t.name == judge_tool.name) { + runtime_tools.push(judge_tool); + } + } + } + let initial_context_calibration = build_initial_context_token_calibration( latest_historical_run.as_ref(), &history_messages, @@ -615,14 +643,7 @@ pub async fn build_session_spec( workspace_path: workspace_path.to_string(), run_mode: run_mode.to_string(), tool_profile_name: tool_profile_name.clone(), - runtime_tools: runtime_tools_with_custom_subagents( - runtime_tools_with_web_search( - runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), - &tool_profile_name, - web_search_enabled, - ), - custom_subagent_tools, - ), + runtime_tools, system_prompt, history_messages, history_tool_calls, diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index e6a2b913..f1dbb980 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -16,10 +16,10 @@ use crate::core::plan_checkpoint::{ build_plan_message_metadata, plan_markdown, write_plan_file, }; use crate::core::subagent::{ - extract_review_report, render_parallel_summary, HelperRunRequest, HelperRunResult, - ParallelSubagentBatchStatus, ParallelSubagentRequest, ParallelSubagentSummary, - ParallelSubagentTask, ParallelSubagentTaskResult, ParallelSubagentTaskStatus, ReviewRequest, - RuntimeOrchestrationTool, SubagentProfile, + extract_judge_report, extract_review_report, render_parallel_summary, HelperRunRequest, + HelperRunResult, JudgeReport, ParallelSubagentBatchStatus, ParallelSubagentRequest, + ParallelSubagentSummary, ParallelSubagentTask, 
ParallelSubagentTaskResult, + ParallelSubagentTaskStatus, ReviewRequest, RuntimeOrchestrationTool, SubagentProfile, }; use crate::core::tool_gateway::{ ApprovalRequest, ToolExecutionOptions, ToolExecutionRequest, ToolGatewayResult, @@ -294,33 +294,6 @@ impl AgentSession { .await; } - // Goal tools — handle before the main tool gateway - if tool_name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME { - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); - let insert_result = tool_call_repo::insert( - &self.pool, - &tool_call_repo::ToolCallInsert { - id: tool_call_storage_id.clone(), - tool_call_id: tool_call_id.to_string(), - run_id: self.spec.run_id.clone(), - thread_id: self.spec.thread_id.clone(), - helper_id: None, - tool_name: tool_name.to_string(), - tool_input_json: tool_input.to_string(), - status: "requested".to_string(), - }, - ) - .await; - - if let Err(error) = insert_result { - return agent_error_result(format!("failed to persist tool call: {error}")); - } - - return self - .execute_goal_tool(tool_name, tool_call_id, &tool_call_storage_id, tool_input) - .await; - } - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); let insert_result = tool_call_repo::insert( &self.pool, @@ -351,6 +324,10 @@ impl AgentSession { ) .await } + RuntimeOrchestrationTool::Judge => { + self.execute_judge_tool(tool_call_id, &tool_call_storage_id, tool_input) + .await + } _ => { self.execute_helper_tool(tool, tool_call_id, &tool_call_storage_id, tool_input) .await @@ -886,6 +863,16 @@ impl AgentSession { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + if tool == RuntimeOrchestrationTool::Judge { + // agent_judge is a main-agent-only acceptance tool: it must not be + // reachable as a generic helper delegate or as an agent_parallel + // batch target. 
+ return Err( + "agent_judge can only be called directly by the main agent for the current goal" + .to_string(), + ); + } + let HelperToolTask { task, review_request, @@ -1614,202 +1601,238 @@ impl AgentSession { } } - // ── Goal tool handlers ── + // ── Goal acceptance Judge handler ── - async fn execute_goal_tool( + /// Run the main-agent-only `agent_judge` acceptance flow: build a Judge task + /// with the current goal injected, run the Judge helper, parse its structured + /// verdict, persist it, and (on pass) flip the goal to verified/complete. + async fn execute_judge_tool( &self, - tool_name: &str, - _tool_call_id: &str, + tool_call_id: &str, tool_call_storage_id: &str, tool_input: &serde_json::Value, ) -> AgentToolResult { - let pool = self.pool.clone(); - let thread_id = self.spec.thread_id.clone(); - - match tool_name { - name if name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME => { - let status = tool_input - .get("status") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let evidence = tool_input - .get("evidence") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let pledge = tool_input - .get("pledge") - .and_then(|v| v.as_str()) - .unwrap_or(""); - - // Only support marking as complete - if status != "complete" { - let err_msg = "goal_scored only supports status='complete'. Use /goal pause|resume|clear from the UI for other lifecycle operations."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Parse the main agent's task / rationale. 
+ let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + Ok(request) => request, + Err(error) => { + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &error }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(error); + } + }; - // The pledge must match the required text exactly. - if pledge.trim() != crate::core::goal_manager::GOAL_SCORED_PLEDGE { - let err_msg = format!( - "goal_scored rejected: the 'pledge' parameter must be passed verbatim as: \"{}\"", - crate::core::goal_manager::GOAL_SCORED_PLEDGE - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Backstop: re-query goal state. agent_judge is injected only when an + // un-verified goal exists, but a stale tool set or a direct call must be + // rejected here too. + let goal = match crate::persistence::repo::goal_repo::find_by_thread_id( + &self.pool, + &self.spec.thread_id, + ) + .await + { + Ok(Some(goal)) => goal, + Ok(None) => { + let err_msg = + "agent_judge cannot run: no goal exists for this thread. 
Create one with the /goal command first."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + Err(e) => { + let err_msg = format!("Failed to load goal: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; - if evidence.trim().is_empty() { - // Evidence is empty — reject the completion and challenge - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - let challenge = mgr.render_challenge_prompt( - crate::core::goal_manager::ChallengePromptVariant::NoEvidence, - ); - let result_text = - format!("Goal completion rejected: evidence is required. {challenge}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - return AgentToolResult::text(result_text); - } + if goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed { + let err_msg = + "The goal has already passed acceptance. No further verification is needed."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - match mgr.get_active().await { - Ok(Some(goal)) => { - if goal.status != crate::model::goal::GoalStatus::Active { - let err_msg = format!( - "Goal is not active (current status: {:?}). 
Cannot mark as complete.", - goal.status - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } - let paused_seconds = { - let mut guard = self.goal_runtime.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_scored: goal_runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&self.spec.run_id).max(0) - }; - let active_run_seconds = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &self.pool, - &self.spec.thread_id, - ) - .await - .unwrap_or(None) - .map(|seconds| (seconds - paused_seconds).max(0)); - - match mgr.mark_complete(&goal.id, evidence).await { - Ok(()) => { - if let Some(run_seconds) = active_run_seconds { - if run_seconds > 0 { - mgr.account_usage(&goal.id, 0, run_seconds).await.ok(); - } - } - - let updated = mgr.get_active().await.ok().flatten(); - if let Some(ref record) = updated { - let payload = - crate::core::goal_manager::GoalManager::to_payload(record); - let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { - thread_id: record.thread_id.clone(), - evidence: evidence.to_string(), - }); - let _ = - self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: record.thread_id.clone(), - goal: Some(payload), - }); - } - let result_text = - format!("Goal marked as complete. 
Evidence: {evidence}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - AgentToolResult::text(result_text) - } - Err(e) => { - let err_msg = format!("Failed to complete goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } + // Build the Judge task: inject the goal objective + status + last verdict + // so the Judge does not rely on the main agent's self-report. + let mut prior_verdict = String::new(); + if goal.judge_evaluated_run_id.is_some() { + if let Some(summary) = goal.judge_summary.as_deref() { + if !summary.trim().is_empty() { + prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}")); + } + } + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + if !findings.is_empty() { + prior_verdict.push_str("\nPrevious Judge findings:"); + for finding in findings { + prior_verdict.push_str(&format!("\n- {finding}")); } } - Ok(None) => { - let err_msg = "No active goal found. 
Create one first with /goal command."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } - Err(e) => { - let err_msg = format!("Failed to load goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } } } - _ => agent_error_result(format!("Unknown goal tool: {tool_name}")), + } + + let judge_task = format!( + "You are verifying acceptance of the following goal for the current project.\n\n\ +Goal id: {goal_id}\n\ +Goal status: {status:?}\n\ +Goal objective:\n{objective}\n\ +{prior_verdict}\n\n\ +The main agent's note for this verification request:\n{task}\n\n\ +Independently inspect the project's current state and decide whether it satisfies the goal. \ +Return your structured JudgeReport verdict.", + goal_id = goal.id, + status = goal.status, + objective = goal.objective, + prior_verdict = prior_verdict, + task = request.task, + ); + + // Build a Judge delegate (depth 2, primary model) and run it. 
+ let tool = RuntimeOrchestrationTool::Judge; + let helper_profile = resolve_helper_profile(&tool); + let model_role = match resolve_helper_model_role( + &self.spec.model_plan, + &tool, + helper_profile.as_ref(), + ) { + Some(role) => role, + None => { + let err_msg = "Failed to resolve a model for agent_judge.".to_string(); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + let delegate = ResolvedHelperDelegate { + tool: tool.clone(), + agent_name: tool.tool_name(), + task: judge_task, + review_request: None, + helper_profile, + model_role, + }; + + let report: JudgeReport = match self.run_helper_for_delegate(&delegate, tool_call_id).await + { + Ok(summary) => extract_judge_report( + summary + .raw_summary + .as_deref() + .unwrap_or(summary.summary.as_str()), + ), + Err(error) => { + let err_msg = format!("agent_judge failed to run: {error}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + // Persist the verdict (atomically flips to complete + judge_passed on pass). + let findings_json = + serde_json::to_string(&report.findings).unwrap_or_else(|_| "[]".to_string()); + let recorded = crate::persistence::repo::goal_repo::record_judge_verdict( + &self.pool, + &goal.id, + &self.spec.run_id, + report.passed, + report.completeness_pct as i64, + &findings_json, + &report.summary, + ) + .await; + + if let Err(e) = recorded { + let err_msg = format!("Failed to persist Judge verdict: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + + // Emit goal events with the freshly updated record. 
+ if let Ok(Some(record)) = + crate::persistence::repo::goal_repo::find_by_thread_id(&self.pool, &self.spec.thread_id) + .await + { + let payload = crate::core::goal_manager::GoalManager::to_payload(&record); + if report.passed { + let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { + thread_id: record.thread_id.clone(), + evidence: record.evidence.clone().unwrap_or_default(), + }); + } + let _ = self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { + thread_id: record.thread_id.clone(), + goal: Some(payload), + }); + } + + let result_text = crate::core::subagent::judge_contract::render_parent_summary(&report); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "output": &result_text, "passed": report.passed }).to_string(), + "completed", + ) + .await + .ok(); + + AgentToolResult { + content: vec![ContentBlock::Text(TextContent::new(result_text))], + details: Some(serde_json::json!({ + "passed": report.passed, + "completenessPct": report.completeness_pct, + "findings": report.findings, + "summary": report.summary, + })), } } } diff --git a/src-tauri/src/core/agent_session_tools.rs b/src-tauri/src/core/agent_session_tools.rs index abef8647..97966181 100644 --- a/src-tauri/src/core/agent_session_tools.rs +++ b/src-tauri/src/core/agent_session_tools.rs @@ -534,31 +534,6 @@ You may call this tool multiple times in a run to incrementally refine the plan. }), )); - // Goal tool — persistent cross-turn task completion - tools.push(AgentTool::new( - crate::core::goal_manager::GOAL_SCORED_TOOL_NAME, - "Goal Scored", - "Mark the current goal as fully achieved (score the goal). You MUST provide evidence — run tests, check file contents, or verify command output to prove the goal is truly achieved. Without evidence, the completion will be challenged. You MUST also pass the exact required pledge text. 
Do NOT call this tool unless you have actually verified the goal is complete with no remaining or follow-up work.", - serde_json::json!({ - "type": "object", - "properties": { - "status": { - "type": "string", - "enum": ["complete"], - "description": "Must be 'complete' to mark the goal as achieved." - }, - "evidence": { - "type": "string", - "description": "Concrete evidence that the goal is complete — test output, file change summary, command results, or verification steps. Required." - }, - "pledge": { - "type": "string", - "description": "You MUST pass this exact pledge text verbatim: \"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\"" - } - }, - "required": ["status", "evidence", "pledge"] - }), - )); // Render artifact tool (always available) — supports charts, HTML, and SVG tools.push(AgentTool::new( "render", @@ -670,6 +645,7 @@ pub(crate) fn resolve_helper_profile(tool: &RuntimeOrchestrationTool) -> Option< match tool { RuntimeOrchestrationTool::Explore => Some(SubagentProfile::Explore), RuntimeOrchestrationTool::Review => Some(SubagentProfile::Review), + RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } @@ -690,6 +666,8 @@ pub(crate) fn resolve_helper_model_role( .clone() .unwrap_or_else(|| model_plan.primary.clone()), ), + // Judge prioritizes acceptance quality over cost: always use primary. 
+ RuntimeOrchestrationTool::Judge => Some(model_plan.primary.clone()), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index fa6d1101..7486e39b 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -19,12 +19,6 @@ pub struct GoalEvaluationOutcome { /// Default maximum turns for a goal before auto-pausing. const DEFAULT_MAX_TURNS: i64 = 50; -/// Tool name used to mark a goal as fully achieved ("score" the goal). -pub const GOAL_SCORED_TOOL_NAME: &str = "goal_scored"; - -/// Exact pledge text the agent must pass verbatim when calling `goal_scored`. -pub const GOAL_SCORED_PLEDGE: &str = "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output."; - /// Continuation prompt injected when the goal is still active. const CONTINUATION_PROMPT_TEMPLATE: &str = "\ [Goal continuation — turns {turns_used}/{max_turns}] @@ -33,26 +27,28 @@ const CONTINUATION_PROMPT_TEMPLATE: &str = "\ Continue working toward this objective. Take the next concrete step. -⚠️ When the goal is fully achieved, you MUST call: - goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") -Without this call, the system will keep injecting continuation prompts. +⚠️ Completion is now decided by independent verification. When you believe the +goal is achieved, you MUST call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +A Judge will evaluate whether the project satisfies the goal's consistency and +completeness. +- The goal is only marked verified when the Judge returns passed=true. +- If a previous Judge verification did not pass, read its findings, fix each one, + then call agent_judge again. 
+You cannot declare completion yourself; only a passing Judge verdict counts. If you are blocked and need user input, use the clarify tool."; -/// Challenge prompt when the model claimed completion but did not use the tool. +/// Challenge prompt when the model claimed completion but has not requested +/// Judge verification yet. const CHALLENGE_EVIDENCE_PROMPT: &str = "\ -Before claiming the goal is complete, please provide concrete evidence: - -1. What verification commands did you run? What was the output? -2. What files did you modify? What was the purpose of each change? - -Once you have evidence, call goal_scored(status=\"complete\", evidence=\"...\", pledge=\"...\") . -If the goal is not actually complete, ignore this prompt and continue working."; +You appear to believe the goal is complete, but you have not requested independent +verification. You cannot self-declare completion. -/// Challenge prompt when the model claimed completion but evidence was empty. -const MISSING_EVIDENCE_PROMPT: &str = "\ -You called goal_scored(complete) but did not provide evidence. -Please provide completion evidence and call goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") again."; +When you are confident the goal is achieved, call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +The goal is only marked verified when the Judge returns passed=true. If the goal +is not actually complete, ignore this prompt and continue working."; /// Guidance prompt when the agent appears stuck. 
const GUIDANCE_PROMPT: &str = "\ @@ -159,6 +155,11 @@ impl GoalManager { pause_detail: None, evidence: None, last_evaluated_run_id: None, + judge_passed: false, + judge_completeness: None, + judge_findings: None, + judge_summary: None, + judge_evaluated_run_id: None, created_at: Utc::now(), updated_at: Utc::now(), }; @@ -353,7 +354,7 @@ impl GoalManager { .remove(&self.thread_id); return GoalVerdict::Paused { reason: PauseReason::IdleBlocked, - detail: Some("agent repeatedly claimed completion without providing evidence via goal_scored".into()), + detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()), }; } return GoalVerdict::ChallengeEvidence; @@ -412,11 +413,11 @@ impl GoalManager { detail: Some("agent published a plan, awaiting approval".into()), }); } - // goal_scored is handled by the tool execution pipeline - // (agent_session_execution) which validates pledge/evidence - // and marks the goal complete. Evaluation should not - // interfere — let it pass through to idle reset and budget - // checks. + // agent_judge is the main-agent-only acceptance request. It is + // handled by the tool execution pipeline (execute_judge_tool), + // which runs the Judge and records the verdict. Evaluation must + // not treat it as a blocking tool — like any tool call it shows + // the agent acted and should reset idle tendencies. _ => {} } } @@ -486,20 +487,44 @@ impl GoalManager { // ── Prompt generation ── - /// Generate the continuation prompt for the next turn. + /// Generate the continuation prompt for the next turn. When a prior Judge + /// verification did not pass, the most recent findings are appended so the + /// agent can fix them before re-requesting verification. 
pub fn render_continuation_prompt(&self, goal: &GoalRecord) -> String { - CONTINUATION_PROMPT_TEMPLATE + let mut prompt = CONTINUATION_PROMPT_TEMPLATE .replace("{objective}", &goal.objective) .replace("{turns_used}", &goal.turns_used.to_string()) - .replace("{max_turns}", &goal.max_turns.to_string()) + .replace("{max_turns}", &goal.max_turns.to_string()); + + if goal.judge_evaluated_run_id.is_some() && !goal.judge_passed { + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + let findings: Vec<String> = findings + .into_iter() + .filter(|f| !f.trim().is_empty()) + .take(10) + .collect(); + if !findings.is_empty() { + prompt.push_str( + "\n\nMost recent Judge findings to address before re-verifying:", + ); + for finding in findings { + let trimmed = finding.trim(); + let truncated: String = trimmed.chars().take(500).collect(); + prompt.push_str(&format!("\n- {truncated}")); + } + } + } + } + } + + prompt } - /// Generate a challenge-evidence prompt when the model failed to provide evidence. - pub fn render_challenge_prompt(&self, variant: ChallengePromptVariant) -> String { - match variant { - ChallengePromptVariant::NoEvidence => MISSING_EVIDENCE_PROMPT.to_string(), - ChallengePromptVariant::NoTool => CHALLENGE_EVIDENCE_PROMPT.to_string(), - } + /// Generate a challenge prompt nudging the agent to request Judge + /// verification when it claims completion without calling `agent_judge`. + pub fn render_challenge_prompt(&self) -> String { + CHALLENGE_EVIDENCE_PROMPT.to_string() } /// Generate a guidance prompt when the agent appears stuck. @@ -517,7 +542,18 @@ impl GoalManager { None => return Ok(None), }; + // Acceptance is now decided exclusively by the Judge: a verified goal is + // `Complete && judge_passed`. Any non-Active goal stops continuation, + // preserving existing pause/budget semantics. 
The legacy combination + // `Complete && !judge_passed` should not occur after migration backfill; + // if it does, log it and still stop continuation rather than re-opening. if goal.status != GoalStatus::Active { + if goal.status == GoalStatus::Complete && !goal.judge_passed { + tracing::warn!( + goal_id = %goal.id, + "goal is Complete without judge_passed; treating as terminal and not re-opening" + ); + } return Ok(Some(GoalEvaluationOutcome { goal: Self::to_payload(&goal), verdict: "skipped".to_string(), @@ -594,7 +630,6 @@ impl GoalManager { GoalVerdict::BudgetLimited => { self.mark_budget_limited(&current.id).await?; } - GoalVerdict::Complete { .. } => {} } if let Some(run_seconds) = @@ -626,11 +661,14 @@ impl GoalManager { ), GoalVerdict::ChallengeEvidence => ( "challenge_evidence", - Some(self.render_challenge_prompt(ChallengePromptVariant::NoTool)), + Some(format!( + "{}\n\n{}", + self.render_challenge_prompt(), + self.render_continuation_prompt(updated.as_ref().unwrap_or(&goal)) + )), ), GoalVerdict::Paused { reason: _, detail } => ("paused", detail.clone()), GoalVerdict::BudgetLimited => ("budget_limited", None), - GoalVerdict::Complete { .. } => ("complete", None), }; Ok(Some(GoalEvaluationOutcome { @@ -640,11 +678,3 @@ impl GoalManager { })) } } - -/// Variants for challenge prompts. -pub enum ChallengePromptVariant { - /// Model called goal_scored(complete) but evidence was empty. - NoEvidence, - /// Model claimed completion in text but didn't use the tool. 
- NoTool, -} diff --git a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs index 3e7c334a..ae7a2fb3 100644 --- a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs +++ b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs @@ -61,6 +61,25 @@ impl SectionSource for SubagentBodySource { }, })) } + Some(SubagentProfile::Judge) => { + let template = include_str!("../templates/subagent/judge.md"); + let (_tmpl, body) = + super::super::templates::parse_front_matter(template).map_err(|e| { + FatalError::new("template.parse", format!("subagent/judge.md: {e}")) + })?; + let vars = super::super::templates::TemplateVars::new(); + let rendered = super::super::templates::render_template_strict(&body, &[], &vars) + .map_err(|e| { + FatalError::new("template.render", format!("subagent/judge.md: {e}")) + })?; + Ok(SectionOutcome::Produced(SectionBody { + markdown: rendered, + meta: SectionMeta { + template_path: Some("templates/subagent/judge.md"), + ..Default::default() + }, + })) + } Some(SubagentProfile::Custom { system_prompt, .. 
}) => { if system_prompt.trim().is_empty() { return Ok(SectionOutcome::Skip); diff --git a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs index 1ad848d0..c055e5c3 100644 --- a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs +++ b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs @@ -16,6 +16,9 @@ const EXPLORE_TEMPLATE_EMBEDDED: &str = const REVIEW_TEMPLATE_REL_PATH: &str = "subagent/output_contract.review.md"; const REVIEW_TEMPLATE_EMBEDDED: &str = include_str!("../templates/subagent/output_contract.review.md"); +const JUDGE_TEMPLATE_REL_PATH: &str = "subagent/output_contract.judge.md"; +const JUDGE_TEMPLATE_EMBEDDED: &str = + include_str!("../templates/subagent/output_contract.judge.md"); const DECLARED_KEYS: &[&'static str] = &[]; /// Template-backed SectionSource for the SubagentOutputContract section. @@ -42,6 +45,7 @@ impl SectionSource for SubagentOutputContractSource { (EXPLORE_TEMPLATE_REL_PATH, EXPLORE_TEMPLATE_EMBEDDED) } Some(SubagentProfile::Review) => (REVIEW_TEMPLATE_REL_PATH, REVIEW_TEMPLATE_EMBEDDED), + Some(SubagentProfile::Judge) => (JUDGE_TEMPLATE_REL_PATH, JUDGE_TEMPLATE_EMBEDDED), Some(SubagentProfile::Custom { .. 
}) => { // Custom subagents get a generic output contract return Ok(SectionOutcome::Produced(SectionBody::markdown( diff --git a/src-tauri/src/core/prompt/surface.rs b/src-tauri/src/core/prompt/surface.rs index 009aef9b..92b6ef76 100644 --- a/src-tauri/src/core/prompt/surface.rs +++ b/src-tauri/src/core/prompt/surface.rs @@ -10,6 +10,8 @@ pub enum PromptSurface { SubagentExplore { inherited_run_mode: RunMode }, /// Built-in review subagent SubagentReview { inherited_run_mode: RunMode }, + /// Built-in goal acceptance Judge subagent + SubagentJudge { inherited_run_mode: RunMode }, /// User-defined custom subagent SubagentCustom { slug: String, @@ -70,9 +72,11 @@ impl SurfacePattern { } (SurfacePattern::AnySubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::AnySubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::CustomSubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::Compaction(k), PromptSurface::Compaction { kind }) => k == kind, (SurfacePattern::AnyCompaction, PromptSurface::Compaction { .. }) => true, diff --git a/src-tauri/src/core/prompt/surface_extensions.rs b/src-tauri/src/core/prompt/surface_extensions.rs index c8ebfb68..4d7f245b 100644 --- a/src-tauri/src/core/prompt/surface_extensions.rs +++ b/src-tauri/src/core/prompt/surface_extensions.rs @@ -27,6 +27,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { run_mode } => SurfacePattern::MainAgent(*run_mode), PromptSurface::SubagentExplore { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. 
} => SurfacePattern::AnySubagent, + PromptSurface::SubagentJudge { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. } => SurfacePattern::CustomSubagent, PromptSurface::Compaction { kind } => SurfacePattern::Compaction(*kind), PromptSurface::Title => SurfacePattern::Title, @@ -43,6 +44,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { .. } | PromptSurface::SubagentExplore { .. } | PromptSurface::SubagentReview { .. } + | PromptSurface::SubagentJudge { .. } | PromptSurface::SubagentCustom { .. } ) } @@ -76,6 +78,9 @@ mod tests { PromptSurface::SubagentReview { inherited_run_mode: RunMode::Default, }, + PromptSurface::SubagentJudge { + inherited_run_mode: RunMode::Default, + }, PromptSurface::SubagentCustom { slug: "test".into(), inherited_run_mode: RunMode::Default, diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md index 0bf5ffa4..c36eb3dd 100644 --- a/src-tauri/src/core/prompt/templates/active_goal.tpl.md +++ b/src-tauri/src/core/prompt/templates/active_goal.tpl.md @@ -8,15 +8,15 @@ declared_keys: [max_turns, objective, turns_used] Objective: {{objective}} Turns used: {{turns_used}}/{{max_turns}} -**Completion requirements — ALL must be met before calling goal_scored(complete):** -1. Every subtask implied by the objective is done. No remaining work, no dangling follow-ups. -2. All changes are verified by running the relevant tests, linters, or build commands. -3. Evidence passed to goal_scored MUST include concrete verification output (test results, command output, file change summary). -Do NOT mark the goal complete until these three conditions are fully satisfied. +**Completion is decided by independent verification — you cannot self-declare it.** +1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups. +2. Verify your work by running the relevant tests, linters, or build commands as you go. +3. 
When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`. Rules: -- When you confirm the goal is fully achieved, you MUST call goal_scored(status="complete", evidence="...", pledge="...") to mark it as scored. This is the only way to mark the goal as achieved. -- The goal_scored tool requires a 'pledge' parameter. You MUST pass this exact text verbatim: "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output." -- Do NOT claim completion without verifiable evidence -- If blocked and need user input, use clarify tool -- The system will automatically continue this goal across turns +- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness. +- The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself. +- If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again. +- Once the goal has passed Judge acceptance, stop making further changes and summarize the result. +- If blocked and you need user input, use the clarify tool. +- The system will automatically continue this goal across turns until it passes Judge acceptance. diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md new file mode 100644 index 00000000..0cab1d63 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -0,0 +1,24 @@ +--- +section_id: SubagentJudge +version: 1 +declared_keys: [] +--- +You are the **Goal Acceptance Judge** — an independent verifier. 
The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. + +You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. + +## What to evaluate +- Read the goal objective injected into your task and treat it as the acceptance contract. +- Inspect the relevant code, configuration, tests, and docs to confirm each requirement of the goal is actually met. +- Run diagnostic verification when it strengthens your judgment: tests, type-checks, linters, builds, and read-only inspection commands. Adapt the commands to this repository (infer them from instructions, scripts, and manifests) instead of assuming a stack. +- You may delegate to `agent_explore`, `agent_review`, or `agent_parallel` to gather evidence in parallel when the goal is broad. + +## Hard constraints (read-only acceptance) +- Your file tools are read-only. Do **not** modify, create, or delete any files. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, and read-only inspection. You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. + +## Verdict rules +- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps. When you pass, `summary` must clearly state the verified evidence — it becomes the goal's completion evidence. 
+- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings`. +- Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md new file mode 100644 index 00000000..a695dd71 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -0,0 +1,21 @@ +--- +section_id: SubagentOutputContractJudge +version: 1 +declared_keys: [] +--- +Your output will be consumed by the parent agent and the goal acceptance pipeline, not the user. Follow any response language instructions inherited above for natural-language fields (`findings`, `summary`). + +Return exactly one JSON object with this contract and nothing else (no markdown fences, headings, or prose before or after it): + +{ + "passed": true, + "completenessPct": 100, + "findings": [], + "summary": "Concise but specific evidence for the verdict (verified requirements, commands run and their results)." +} + +Field rules: +- `passed` (boolean): true only when the project genuinely satisfies the goal. +- `completenessPct` (integer 0-100): your honest estimate of how complete the work is against the goal. +- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken point. REQUIRED and non-empty when `passed=false`. +- `summary` (string): rationale for the verdict. REQUIRED and non-empty when `passed=true` — it becomes the goal's completion evidence. If you cannot provide real evidence, set `passed=false`. 
diff --git a/src-tauri/src/core/subagent/judge_contract.rs b/src-tauri/src/core/subagent/judge_contract.rs
new file mode 100644
index 00000000..2a673d93
--- /dev/null
+++ b/src-tauri/src/core/subagent/judge_contract.rs
@@ -0,0 +1,287 @@
+use serde::{Deserialize, Serialize};
+
+/// Input for the `agent_judge` tool (provided by the main agent).
+#[derive(Debug, Clone)]
+pub struct JudgeRequest {
+    /// The main agent's explanation of why it believes the goal is achieved,
+    /// and/or points it wants the Judge to focus on.
+    pub task: String,
+}
+
+impl JudgeRequest {
+    pub fn from_tool_input(tool_input: &serde_json::Value) -> Result<Self, String> {
+        let task = tool_input
+            .get("task")
+            .and_then(serde_json::Value::as_str)
+            .unwrap_or_default()
+            .trim()
+            .to_string();
+
+        if task.is_empty() {
+            return Err("missing agent_judge task".to_string());
+        }
+
+        Ok(Self { task })
+    }
+}
+
+/// Structured verdict produced by the Judge subagent.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct JudgeReport {
+    /// Whether the project currently satisfies the goal (acceptance passes).
+    pub passed: bool,
+    /// Completeness percentage 0-100.
+    pub completeness_pct: u8,
+    /// Specific unmet / non-conforming points. Required when `passed=false`.
+    #[serde(default)]
+    pub findings: Vec<String>,
+    /// Rationale for the verdict. Used as completion evidence when `passed=true`.
+    #[serde(default)]
+    pub summary: String,
+}
+
+impl JudgeReport {
+    /// Build a failed report carrying a single finding (used as a safe fallback
+    /// when the Judge output cannot be parsed).
+    fn failed_with_finding(finding: String) -> Self {
+        Self {
+            passed: false,
+            completeness_pct: 0,
+            findings: vec![finding],
+            summary: String::new(),
+        }
+    }
+
+    /// Normalize a parsed report so it can never represent an unverifiable
+    /// acceptance:
+    /// - `completeness_pct` is clamped to 0-100.
+    /// - `passed=true` with an empty `summary` is downgraded to `passed=false`.
+    /// - `passed=false` with no findings gets a placeholder finding.
+    fn normalized(mut self) -> Self {
+        if self.completeness_pct > 100 {
+            self.completeness_pct = 100;
+        }
+
+        if self.passed && self.summary.trim().is_empty() {
+            self.passed = false;
+            self.findings
+                .push("Judge reported passed=true but provided no summary/evidence; downgraded to not passed.".to_string());
+        }
+
+        if !self.passed && self.findings.is_empty() {
+            self.findings
+                .push("Judge did not provide actionable findings.".to_string());
+        }
+
+        self
+    }
+}
+
+/// Parse the Judge's textual output into a `JudgeReport`. On any parse failure
+/// the result is a *failed* report carrying the raw text as a finding, so a
+/// malformed Judge response can never be mistaken for acceptance.
+pub fn extract_judge_report(text: &str) -> JudgeReport {
+    let trimmed = text.trim();
+    if trimmed.is_empty() {
+        return JudgeReport::failed_with_finding("Judge produced no output.".to_string());
+    }
+
+    if let Ok(report) = serde_json::from_str::<JudgeReport>(trimmed) {
+        return report.normalized();
+    }
+
+    let stripped = strip_code_fence(trimmed);
+    if let Ok(report) = serde_json::from_str::<JudgeReport>(stripped) {
+        return report.normalized();
+    }
+
+    if let Some(report) = extract_embedded_json(trimmed) {
+        return report.normalized();
+    }
+
+    JudgeReport::failed_with_finding(format!(
+        "Judge output could not be parsed as a JudgeReport. Raw output: {trimmed}"
+    ))
+}
+
+/// Render a parent-facing summary of the verdict for the main agent.
+pub fn render_parent_summary(report: &JudgeReport) -> String {
+    let mut lines = vec![format!(
+        "Judge verdict: {} (completeness {}%)",
+        if report.passed {
+            "PASSED"
+        } else {
+            "NOT PASSED"
+        },
+        report.completeness_pct
+    )];
+
+    if !report.summary.trim().is_empty() {
+        lines.push(format!("Summary: {}", report.summary.trim()));
+    }
+
+    if report.findings.is_empty() {
+        lines.push("Findings:\n- none".to_string());
+    } else {
+        let rendered = report
+            .findings
+            .iter()
+            .map(|f| format!("- {}", f.trim()))
+            .collect::<Vec<_>>()
+            .join("\n");
+        lines.push(format!("Findings:\n{rendered}"));
+    }
+
+    if report.passed {
+        lines.push(
+            "✅ The goal has passed acceptance and is now marked complete. Stop making further changes and summarize the result.".to_string(),
+        );
+    } else {
+        lines.push(
+            "❌ The goal has NOT passed acceptance. Fix the findings above, then call agent_judge again to re-verify.".to_string(),
+        );
+    }
+
+    lines.join("\n\n")
+}
+
+fn strip_code_fence(text: &str) -> &str {
+    text.strip_prefix("```json")
+        .and_then(|value| value.strip_suffix("```"))
+        .map(str::trim)
+        .or_else(|| {
+            text.strip_prefix("```")
+                .and_then(|value| value.strip_suffix("```"))
+                .map(str::trim)
+        })
+        .unwrap_or(text)
+}
+
+/// Best-effort: pull the first balanced `{...}` JSON object out of mixed prose
+/// and try to parse it as a `JudgeReport`.
+fn extract_embedded_json(text: &str) -> Option<JudgeReport> {
+    let start = text.find('{')?;
+    let bytes = text.as_bytes();
+    let mut depth = 0usize;
+    let mut in_string = false;
+    let mut escaped = false;
+    for (idx, &b) in bytes.iter().enumerate().skip(start) {
+        if in_string {
+            if escaped {
+                escaped = false;
+            } else if b == b'\\' {
+                escaped = true;
+            } else if b == b'"' {
+                in_string = false;
+            }
+            continue;
+        }
+        match b {
+            b'"' => in_string = true,
+            b'{' => depth += 1,
+            b'}' => {
+                depth -= 1;
+                if depth == 0 {
+                    let candidate = &text[start..=idx];
+                    return serde_json::from_str::<JudgeReport>(candidate).ok();
+                }
+            }
+            _ => {}
+        }
+    }
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn judge_request_requires_task() {
+        assert!(JudgeRequest::from_tool_input(&serde_json::json!({})).is_err());
+        let req = JudgeRequest::from_tool_input(&serde_json::json!({ "task": " verify it " }))
+            .expect("parses");
+        assert_eq!(req.task, "verify it");
+    }
+
+    #[test]
+    fn extract_parses_plain_json() {
+        let report = extract_judge_report(
+            r#"{"passed":true,"completenessPct":100,"findings":[],"summary":"All tests pass."}"#,
+        );
+        assert!(report.passed);
+        assert_eq!(report.completeness_pct, 100);
+        assert_eq!(report.summary, "All tests pass.");
+    }
+
+    #[test]
+    fn extract_parses_json_fence() {
+        let report = extract_judge_report(
+            "```json\n{\"passed\":false,\"completenessPct\":40,\"findings\":[\"missing tests\"],\"summary\":\"\"}\n```",
+        );
+        assert!(!report.passed);
+        assert_eq!(report.completeness_pct, 40);
+        assert_eq!(report.findings, vec!["missing tests"]);
+    }
+
+    #[test]
+    fn extract_parses_embedded_json() {
+        let report = extract_judge_report(
+            "Here is my verdict:\n{\"passed\":true,\"completenessPct\":90,\"findings\":[],\"summary\":\"Looks good\"}\nThanks!",
+        );
+        assert!(report.passed);
+        assert_eq!(report.summary, "Looks good");
+    }
+
+    #[test]
+    fn malformed_output_is_not_passed() {
+        let report = extract_judge_report("I think it's done, looks fine to me.");
+        assert!(!report.passed);
+        assert!(!report.findings.is_empty());
+    }
+
+    #[test]
+    fn empty_output_is_not_passed() {
+        let report = extract_judge_report(" ");
+        assert!(!report.passed);
+        assert!(!report.findings.is_empty());
+    }
+
+    #[test]
+    fn passed_with_empty_summary_is_downgraded() {
+        let report = extract_judge_report(
+            r#"{"passed":true,"completenessPct":100,"findings":[],"summary":" "}"#,
+        );
+        assert!(!report.passed);
+        assert!(!report.findings.is_empty());
+    }
+
+    #[test]
+    fn completeness_is_clamped() {
+        let report = extract_judge_report(
+            r#"{"passed":false,"completenessPct":250,"findings":["x"],"summary":""}"#,
+        );
+        assert_eq!(report.completeness_pct, 100);
+    }
+
+    #[test]
+    fn failed_with_no_findings_gets_placeholder() {
+        let report = extract_judge_report(
+            r#"{"passed":false,"completenessPct":10,"findings":[],"summary":"incomplete"}"#,
+        );
+        assert!(!report.passed);
+        assert_eq!(report.findings.len(), 1);
+    }
+
+    #[test]
+    fn render_summary_includes_verdict_and_findings() {
+        let report = extract_judge_report(
+            r#"{"passed":false,"completenessPct":30,"findings":["A","B"],"summary":"not yet"}"#,
+        );
+        let summary = render_parent_summary(&report);
+        assert!(summary.contains("NOT PASSED"));
+        assert!(summary.contains("- A"));
+        assert!(summary.contains("agent_judge again"));
+    }
+}
diff --git a/src-tauri/src/core/subagent/mod.rs b/src-tauri/src/core/subagent/mod.rs
index 22760953..5bbd87f4 100644
--- a/src-tauri/src/core/subagent/mod.rs
+++ b/src-tauri/src/core/subagent/mod.rs
@@ -1,8 +1,10 @@
+pub mod judge_contract;
 pub mod orchestrator;
 pub mod parallel_contract;
 pub mod review_contract;
 pub mod runtime_orchestration;
 
+pub use judge_contract::{extract_judge_report, JudgeReport, JudgeRequest};
 pub use orchestrator::{
     HelperAgentOrchestrator, HelperRunRequest, HelperRunResult, SubagentActivityStatus,
     SubagentProgressSnapshot,
diff --git a/src-tauri/src/core/subagent/orchestrator.rs b/src-tauri/src/core/subagent/orchestrator.rs
index 3641e7dd..cb890700 100644 --- a/src-tauri/src/core/subagent/orchestrator.rs +++ b/src-tauri/src/core/subagent/orchestrator.rs @@ -1127,6 +1127,15 @@ impl HelperDelegationContext { RuntimeOrchestrationTool::Parallel => { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + RuntimeOrchestrationTool::Judge => { + // Hard gate: agent_judge is a main-agent-only tool. A subagent + // (including Judge itself) must never recursively request goal + // acceptance, even if the tool name was parsed successfully. + return Err( + "agent_judge can only be called by the main agent for the current goal" + .to_string(), + ); + } RuntimeOrchestrationTool::Custom(slug) => { crate::core::agent_session_tools::resolve_custom_subagent_profile_from_pool( &self.orchestrator.pool, @@ -1472,6 +1481,9 @@ async fn build_helper_system_prompt( SubagentProfile::Review => PromptSurface::SubagentReview { inherited_run_mode: rm, }, + SubagentProfile::Judge => PromptSurface::SubagentJudge { + inherited_run_mode: rm, + }, SubagentProfile::Custom { slug, .. } => PromptSurface::SubagentCustom { slug: slug.clone(), inherited_run_mode: rm, diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index 27150c30..25c72e8a 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -34,6 +34,10 @@ pub enum RuntimeOrchestrationTool { Explore, Review, Parallel, + /// Goal acceptance Judge. Main-agent-only tool (`agent_judge`): it is parsed + /// here for unified dispatch but is never part of `builtin_all()` nor any + /// helper's delegation tool set. 
+ Judge, Custom(String), // slug of the custom subagent } @@ -41,6 +45,7 @@ pub enum RuntimeOrchestrationTool { pub enum SubagentProfile { Explore, Review, + Judge, Custom { slug: String, name: String, @@ -130,6 +135,7 @@ impl RuntimeOrchestrationTool { "agent_explore" => Some(Self::Explore), "agent_review" => Some(Self::Review), "agent_parallel" => Some(Self::Parallel), + "agent_judge" => Some(Self::Judge), _ => { // Match custom subagent pattern: "agent_{slug}" if let Some(slug) = tool_name.strip_prefix("agent_") { @@ -151,6 +157,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "agent_explore".to_string(), Self::Review => "agent_review".to_string(), Self::Parallel => "agent_parallel".to_string(), + Self::Judge => "agent_judge".to_string(), Self::Custom(slug) => format!("agent_{slug}"), } } @@ -160,6 +167,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "Agent Explore".to_string(), Self::Review => "Agent Review".to_string(), Self::Parallel => "Agent Parallel".to_string(), + Self::Judge => "Agent Judge".to_string(), Self::Custom(slug) => format!("Agent {slug}"), } } @@ -175,6 +183,9 @@ impl RuntimeOrchestrationTool { Self::Parallel => { "Delegate 1-5 independent subtasks to subagents with bounded concurrency. Use this for parallel exploration or review work only when tasks are independent and low side-effect; results are aggregated for the parent agent." } + Self::Judge => { + "Request independent acceptance verification of the current goal. The Judge inspects the project's current state (read-only, with diagnostic shell for tests/type-check/lint) against the goal and returns a structured verdict. You cannot self-declare completion — only a passing Judge verdict marks the goal verified. Call this when you believe the goal is achieved, or to re-verify after fixing prior findings." + } Self::Custom(_) => { // Custom subagents have their description set externally via custom_subagent_as_tool "Custom subagent." 
@@ -188,6 +199,7 @@ impl RuntimeOrchestrationTool { match self { Self::Explore => Some(SubagentProfile::Explore), Self::Review => Some(SubagentProfile::Review), + Self::Judge => Some(SubagentProfile::Judge), Self::Parallel | Self::Custom(_) => None, } } @@ -339,6 +351,16 @@ impl RuntimeOrchestrationTool { }, "required": ["task"] }), + Self::Judge => serde_json::json!({ + "type": "object", + "properties": { + "task": { + "type": "string", + "description": "Explain why you believe the goal is achieved and call out anything the Judge should focus on (e.g. acceptance criteria, areas you are unsure about). If you are re-verifying after fixing earlier findings, summarize what you changed." + } + }, + "required": ["task"] + }), }; let name = self.tool_name(); @@ -353,6 +375,7 @@ impl SubagentProfile { match self { Self::Explore => "helper_explore".to_string(), Self::Review => "helper_review".to_string(), + Self::Judge => "helper_judge".to_string(), Self::Custom { slug, .. } => format!("helper_custom_{slug}"), } } @@ -364,6 +387,8 @@ impl SubagentProfile { match self { Self::Explore => false, Self::Review => true, + // Judge may delegate explore/review/parallel to gather evidence. + Self::Judge => true, Self::Custom { can_delegate, .. } => *can_delegate, } } @@ -374,6 +399,11 @@ impl SubagentProfile { pub fn max_delegation_depth(&self) -> u32 { match self { Self::Explore | Self::Review => BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH, + // Judge is delegated by the main agent (depth 1) and must be + // accepted at depth 2 (the main agent's child depth). It may itself + // delegate explore/review at depth 3, which remains within + // BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH and GLOBAL_MAX_DELEGATION_DEPTH. + Self::Judge => 2, Self::Custom { max_delegation_depth, .. 
@@ -435,6 +465,7 @@ impl SubagentProfile { match self { Self::Explore => include_str!("../prompt/templates/subagent/explore.md").to_string(), Self::Review => include_str!("../prompt/templates/subagent/review.md").to_string(), + Self::Judge => include_str!("../prompt/templates/subagent/judge.md").to_string(), Self::Custom { system_prompt, .. } => system_prompt.clone(), } } @@ -632,6 +663,74 @@ impl SubagentProfile { ]); } + if *self == Self::Judge { + // Judge keeps file tools read-only but is allowed a diagnostic-only + // shell plus read-only git/terminal inspection for verification. + tools.extend([ + AgentTool::new( + "git_status", + "Git Status", + "Inspect repository status in the current workspace without modifying anything.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to narrow the status query." } + } + }), + ), + AgentTool::new( + "git_diff", + "Git Diff", + "Read the current Git diff in the workspace, optionally scoped to a path or staged changes.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to inspect." }, + "staged": { "type": "boolean", "description": "Set true to inspect staged changes instead of working tree changes." }, + "contextLines": { + "type": "integer", + "minimum": 1, + "maximum": 20, + "description": "Optional number of unified diff context lines. Defaults to 3 and is capped for safety." + } + } + }), + ), + AgentTool::new( + "term_status", + "Terminal Status", + TERM_STATUS_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "term_output", + "Terminal Output", + TERM_OUTPUT_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "shell", + "Run Command", + "Run a non-interactive shell command inside the current workspace. 
Judge may use this ONLY for diagnostic and verification commands such as tests, type-checks, linters, and read-only inspection. Never use it to modify files, delete data, install dependencies, start long-running or interactive processes, or change global state.", + serde_json::json!({ + "type": "object", + "properties": { + "command": { "type": "string" }, + "cwd": { "type": "string" }, + "timeout": { "type": "number" } + }, + "required": ["command"] + }), + ), + ]); + } + tools } @@ -902,6 +1001,54 @@ mod tests { ); } + #[test] + fn judge_tool_parses_but_is_not_in_builtin_catalog() { + assert_eq!( + RuntimeOrchestrationTool::parse("agent_judge"), + Some(RuntimeOrchestrationTool::Judge) + ); + // Judge is main-agent-only: it must NOT be part of the built-in + // delegation catalog that subagents can reach. + let catalog = runtime_orchestration_tools(); + assert!(!catalog.iter().any(|tool| tool.name == "agent_judge")); + } + + #[test] + fn judge_profile_is_read_only_with_diagnostic_shell() { + let tools = SubagentProfile::Judge.helper_tools(false); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + + assert!(tool_names.contains(&"read")); + assert!(tool_names.contains(&"list")); + assert!(tool_names.contains(&"find")); + assert!(tool_names.contains(&"search")); + assert!(tool_names.contains(&"shell")); + // Read-only: no file mutation or interactive terminal tools. 
+ assert!(!tool_names.contains(&"edit")); + assert!(!tool_names.contains(&"write")); + assert!(!tool_names.contains(&"term_write")); + assert!(!tool_names.contains(&"term_restart")); + assert!(!tool_names.contains(&"term_close")); + } + + #[test] + fn judge_can_delegate_at_depth_two() { + assert!(SubagentProfile::Judge.can_delegate()); + assert_eq!(SubagentProfile::Judge.max_delegation_depth(), 2); + assert_eq!(SubagentProfile::Judge.helper_kind(), "helper_judge"); + } + + #[test] + fn judge_is_never_a_delegation_target_for_helpers() { + // Even a Judge that can delegate only receives explore/review/parallel, + // never agent_judge. + let tools = SubagentProfile::Judge.delegation_tools_for_helper(3, &[]); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + assert!(!tool_names.contains(&"agent_judge")); + assert!(tool_names.contains(&"agent_explore")); + assert!(tool_names.contains(&"agent_review")); + } + #[test] fn agent_parallel_tool_schema_has_bounded_tasks() { let tool = RuntimeOrchestrationTool::Parallel.as_agent_tool(); diff --git a/src-tauri/src/gateway/gateway_runner.rs b/src-tauri/src/gateway/gateway_runner.rs index 62889d44..42f4167b 100644 --- a/src-tauri/src/gateway/gateway_runner.rs +++ b/src-tauri/src/gateway/gateway_runner.rs @@ -920,7 +920,7 @@ async fn dispatch_command( .await?; // Build a kickoff prompt similar to the GUI /goal path let kickoff = format!( - "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. Work toward it.\nWhen the goal is fully achieved, you MUST call:\n```json\ngoal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. 
I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")\n```\nDo NOT mark complete without verified evidence.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", + "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. Work toward it.\nCompletion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:\n```json\nagent_judge(task=\"explain why you believe the goal is achieved / what to verify\")\n```\nAn independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. If a verification does not pass, fix the reported findings and call agent_judge again.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", objective, ); run_agent_prompt( diff --git a/src-tauri/src/ipc/frontend_channels.rs b/src-tauri/src/ipc/frontend_channels.rs index 9778222e..48990b7d 100644 --- a/src-tauri/src/ipc/frontend_channels.rs +++ b/src-tauri/src/ipc/frontend_channels.rs @@ -223,9 +223,10 @@ pub enum ThreadStreamEvent { error: Option, }, // ── Goal events ── - // GoalStateUpdated and GoalCompleted are emitted by execute_goal_tool - // (create_goal, goal_scored tools in AgentSession). GoalContinuation and - // GoalPaused are emitted by backend run-lifecycle goal orchestration after + // GoalStateUpdated and GoalCompleted are emitted by the agent_judge + // acceptance flow (execute_judge_tool in AgentSession) when the Judge + // records a verdict. GoalContinuation and GoalPaused are emitted by backend + // run-lifecycle goal orchestration after // terminal runs are evaluated. 
The frontend also consumes goal state via // goal_get_state / goal_evaluate command APIs. GoalStateUpdated { diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index cbef21a4..1868fb28 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -98,10 +98,9 @@ impl PauseReason { pub enum GoalVerdict { /// Goal is still active — inject continuation prompt Continue, - /// Model claimed completion but evidence is missing — inject challenge + /// Model claimed completion but has not yet requested Judge verification — + /// inject a challenge nudging it to call `agent_judge`. ChallengeEvidence, - /// Goal achieved with evidence - Complete { evidence: String }, /// Goal paused for a specific reason Paused { reason: PauseReason, @@ -127,6 +126,16 @@ pub struct GoalRecord { pub pause_detail: Option, pub evidence: Option, pub last_evaluated_run_id: Option, + /// Whether the most recent Judge verdict passed acceptance. + pub judge_passed: bool, + /// Latest Judge completeness percentage (0-100), if evaluated. + pub judge_completeness: Option, + /// Latest Judge findings as a JSON array string, if evaluated. + pub judge_findings: Option, + /// Latest Judge summary / acceptance rationale, if evaluated. + pub judge_summary: Option, + /// Run id of the run during which the latest Judge verdict was recorded. 
+ pub judge_evaluated_run_id: Option, pub created_at: DateTime, pub updated_at: DateTime, } @@ -153,6 +162,15 @@ pub struct GoalDto { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, pub created_at: String, pub updated_at: String, } @@ -173,6 +191,11 @@ impl From for GoalDto { pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, created_at: r.created_at.to_rfc3339(), updated_at: r.updated_at.to_rfc3339(), } @@ -208,6 +231,15 @@ pub struct GoalPayload { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, } impl From for GoalPayload { @@ -226,6 +258,11 @@ impl From for GoalPayload { pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, } } } diff --git 
a/src-tauri/src/model/subagent.rs b/src-tauri/src/model/subagent.rs index d7207ab8..5f6ae6c4 100644 --- a/src-tauri/src/model/subagent.rs +++ b/src-tauri/src/model/subagent.rs @@ -137,7 +137,7 @@ pub struct ProfileSubagentAccessRecord { // Reserved slugs that cannot be used for custom subagents // --------------------------------------------------------------------------- -pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review"]; +pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review", "judge"]; /// Validate that a slug is well-formed and not reserved. pub fn validate_slug(slug: &str) -> Result<(), &'static str> { diff --git a/src-tauri/src/persistence/repo/goal_repo.rs b/src-tauri/src/persistence/repo/goal_repo.rs index 9b758c84..3424104a 100644 --- a/src-tauri/src/persistence/repo/goal_repo.rs +++ b/src-tauri/src/persistence/repo/goal_repo.rs @@ -6,7 +6,8 @@ use crate::model::goal::{GoalRecord, GoalStatus, PauseReason}; const SELECT_COLUMNS: &str = "id, thread_id, objective, status, token_budget, tokens_used, \ time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ - last_evaluated_run_id, created_at, updated_at"; + last_evaluated_run_id, judge_passed, judge_completeness, judge_findings, judge_summary, \ + judge_evaluated_run_id, created_at, updated_at"; // ── Database row (raw sqlx types) ── @@ -25,6 +26,11 @@ struct GoalRow { pause_detail: Option, evidence: Option, last_evaluated_run_id: Option, + judge_passed: i64, + judge_completeness: Option, + judge_findings: Option, + judge_summary: Option, + judge_evaluated_run_id: Option, created_at: String, updated_at: String, } @@ -45,6 +51,11 @@ impl GoalRow { pause_detail: self.pause_detail, evidence: self.evidence, last_evaluated_run_id: self.last_evaluated_run_id, + judge_passed: self.judge_passed != 0, + judge_completeness: self.judge_completeness, + judge_findings: self.judge_findings, + judge_summary: self.judge_summary, + judge_evaluated_run_id: 
self.judge_evaluated_run_id, created_at: DateTime::parse_from_rfc3339(&self.created_at) .map(|dt| dt.with_timezone(&Utc)) .unwrap_or_else(|_| Utc::now()), @@ -80,6 +91,10 @@ pub async fn find_by_id(pool: &SqlitePool, id: &str) -> Result Result<(), AppError> { + // Note: the judge_* columns are intentionally omitted here and rely on the + // DDL defaults (judge_passed=0, others NULL) set by the goal_judge_fields + // migration. New goals always start un-verified, and the Judge verdict is + // written later via record_judge_verdict(). let now = Utc::now().to_rfc3339(); sqlx::query( "INSERT INTO goals (id, thread_id, objective, status, token_budget, tokens_used, \ @@ -196,3 +211,66 @@ pub async fn delete_by_thread_id(pool: &SqlitePool, thread_id: &str) -> Result 0) } + +/// Persist the most recent Judge verdict for a goal. Always updates the +/// `judge_*` columns. When `passed` is true, the same transaction also writes +/// `status='complete'` and `evidence=summary` so that acceptance +/// (`status=complete` AND `judge_passed=1`) can never be observed as a +/// half-applied state. When `passed` is false the goal's `status` is left +/// unchanged (typically still `active`). +#[allow(clippy::too_many_arguments)] +pub async fn record_judge_verdict( + pool: &SqlitePool, + id: &str, + run_id: &str, + passed: bool, + completeness: i64, + findings_json: &str, + summary: &str, +) -> Result { + let now = Utc::now().to_rfc3339(); + let mut tx = pool.begin().await?; + + let updated = sqlx::query( + "UPDATE goals SET \ + judge_passed = ?, \ + judge_completeness = ?, \ + judge_findings = ?, \ + judge_summary = ?, \ + judge_evaluated_run_id = ?, \ + updated_at = ? 
\ + WHERE id = ?", + ) + .bind(if passed { 1_i64 } else { 0_i64 }) + .bind(completeness) + .bind(findings_json) + .bind(summary) + .bind(run_id) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + + if updated.rows_affected() == 0 { + tx.rollback().await?; + return Ok(false); + } + + if passed { + sqlx::query( + "UPDATE goals SET \ + status = 'complete', \ + evidence = COALESCE(NULLIF(?, ''), evidence), \ + updated_at = ? \ + WHERE id = ?", + ) + .bind(summary) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + } + + tx.commit().await?; + Ok(true) +} diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 48b91647..20ead9f9 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -3,7 +3,7 @@ mod tests { use sqlx::sqlite::{SqliteConnectOptions, SqlitePool, SqlitePoolOptions}; use std::str::FromStr; use tiycode_lib::core::app_state::GoalRuntimeState; - use tiycode_lib::core::goal_manager::{ChallengePromptVariant, GoalManager}; + use tiycode_lib::core::goal_manager::GoalManager; use tiycode_lib::model::goal::{GoalStatus, GoalVerdict, PauseReason}; use tiycode_lib::persistence::repo::goal_repo; @@ -258,7 +258,7 @@ mod tests { let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // Model says "done" but doesn't call goal_scored + // Model says "done" but doesn't call agent_judge let verdict = mgr.evaluate_after_turn( "All done! 
The goal is complete and everything is finished.", &goal, @@ -371,23 +371,20 @@ mod tests { let prompt = mgr.render_continuation_prompt(&goal); assert!(prompt.contains("Build feature X")); - assert!(prompt.contains("goal_scored")); + assert!(prompt.contains("agent_judge")); assert!(prompt.contains("clarify")); } #[tokio::test] - async fn challenge_prompt_renders_variants() { + async fn challenge_prompt_guides_to_judge() { let mgr = GoalManager::new(setup_pool().await, "thread-1".into(), test_runtime()); - let no_evidence = mgr.render_challenge_prompt(ChallengePromptVariant::NoEvidence); - assert!(no_evidence.contains("did not provide evidence")); - - let no_tool = mgr.render_challenge_prompt(ChallengePromptVariant::NoTool); - assert!(no_tool.contains("provide concrete evidence")); - assert!(no_tool.contains("goal_scored")); + let prompt = mgr.render_challenge_prompt(); + assert!(prompt.contains("agent_judge")); + assert!(prompt.contains("cannot self-declare")); } - // ── #1 / #8: Tests for goal_scored validation logic & test gap coverage ── + // ── mark_complete validation & test gap coverage ── #[tokio::test] async fn mark_complete_rejects_empty_evidence() { @@ -456,17 +453,135 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_goal_scored_not_blocking() { + async fn evaluate_after_turn_agent_judge_not_blocking() { let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // goal_scored should NOT trigger a pause in evaluation - mgr.record_tool_call("goal_scored"); - let verdict = mgr.evaluate_after_turn("Calling goal_scored", &goal); + // agent_judge should NOT trigger a pause in evaluation + mgr.record_tool_call("agent_judge"); + let verdict = mgr.evaluate_after_turn("Calling agent_judge", &goal); assert!(matches!(verdict, GoalVerdict::Continue)); } + // ── Judge verdict persistence (record_judge_verdict) ── + + #[tokio::test] + async fn 
record_judge_verdict_pass_marks_complete_and_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + true, + 100, + "[]", + "All requirements verified; tests pass.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + assert_eq!( + updated.evidence.as_deref(), + Some("All requirements verified; tests pass.") + ); + assert_eq!( + updated.judge_evaluated_run_id.as_deref(), + Some("run-judge-1") + ); + + // A verified goal stops continuation. + let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + assert!(outcome.continuation_prompt.is_none()); + } + + #[tokio::test] + async fn record_judge_verdict_fail_keeps_active_and_persists_findings() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let findings = serde_json::to_string(&vec![ + "Missing unit tests for module X".to_string(), + "Build fails on Windows".to_string(), + ]) + .unwrap(); + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + false, + 60, + &findings, + "Not yet complete.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Active); + assert!(!updated.judge_passed); + assert!(updated.judge_findings.is_some()); + + // Continuation prompt should surface the latest findings. 
+ let prompt = mgr.render_continuation_prompt(&updated); + assert!(prompt.contains("Missing unit tests for module X")); + assert!(prompt.contains("agent_judge")); + } + + #[tokio::test] + async fn migration_backfills_legacy_complete_goal_as_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Legacy goal", None).await.unwrap(); + + // Simulate a legacy completed goal (no judge fields set yet). + sqlx::query( + "UPDATE goals SET status = 'complete', evidence = 'legacy evidence' WHERE id = ?", + ) + .bind(&goal.id) + .execute(&pool) + .await + .unwrap(); + // Apply the same backfill the migration performs. + sqlx::query( + "UPDATE goals SET judge_passed = 1, \ + judge_summary = COALESCE(judge_summary, evidence), \ + judge_completeness = COALESCE(judge_completeness, 100) \ + WHERE status = 'complete'", + ) + .execute(&pool) + .await + .unwrap(); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + + // It must not be re-opened by continuation. 
+ let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + } + #[tokio::test] async fn evaluate_after_turn_chinese_idle_phrase_pauses() { let pool = setup_pool().await; diff --git a/src/i18n/locales/en.ts b/src/i18n/locales/en.ts index 9dd01685..f3fd2940 100644 --- a/src/i18n/locales/en.ts +++ b/src/i18n/locales/en.ts @@ -1094,6 +1094,7 @@ const en: Record = { "goal.status.paused": "Paused", "goal.status.budgetLimited": "Budget Exhausted", "goal.status.complete": "Complete", + "goal.status.verified": "Verified", "goal.time.elapsed": "Running for {{time}}", "goal.time.hoursMinutes": "{{hours}}h {{minutes}}m", "goal.time.minutesSeconds": "{{minutes}}m {{seconds}}s", diff --git a/src/i18n/locales/zh-CN.ts b/src/i18n/locales/zh-CN.ts index 5b3272d2..3f5c5164 100644 --- a/src/i18n/locales/zh-CN.ts +++ b/src/i18n/locales/zh-CN.ts @@ -1133,6 +1133,7 @@ const zhCN = { "goal.status.paused": "已暂停", "goal.status.budgetLimited": "预算耗尽", "goal.status.complete": "已完成", + "goal.status.verified": "已验收通过", "goal.time.elapsed": "已持续运行{{time}}", "goal.time.hoursMinutes": "{{hours}}小时{{minutes}}分", "goal.time.minutesSeconds": "{{minutes}}分{{seconds}}秒", diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index 4c083f85..97ec8701 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -101,6 +101,11 @@ export interface GoalStoreState { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: string | null; + judgeEvaluatedRunId?: string | null; } // --------------------------------------------------------------------------- diff --git a/src/modules/workbench-shell/ui/goal-status-bar.tsx 
b/src/modules/workbench-shell/ui/goal-status-bar.tsx index 45c42922..83c2bbf7 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.tsx @@ -35,7 +35,7 @@ export function GoalStatusBar({ threadId }: Props) { case "active": return "goal.status.active"; case "paused": return "goal.status.paused"; case "budget_limited": return "goal.status.budgetLimited"; - case "complete": return "goal.status.complete"; + case "complete": return goal.judgePassed ? "goal.status.verified" : "goal.status.complete"; default: return "goal.status.active"; } })(); diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx index 6fbfb3e3..db893329 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx +++ b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx @@ -1689,11 +1689,11 @@ export function RuntimeThreadSurface({ "**" + argText + "**", "", "This goal has been created and is now **active**. Work toward it.", - "When the goal is fully achieved, you MUST call:", + "Completion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:", "```json", - "goal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")", + "agent_judge(task=\"explain why you believe the goal is achieved / what to verify\")", "```", - "Do NOT mark complete without verified evidence.", + "An independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. 
If a verification does not pass, fix the reported findings and call agent_judge again.", "", "If you need user input before proceeding, use the clarify tool.", "The goal will automatically pause and resume when the user responds.", diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index 6e9ed990..d43262ca 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -735,6 +735,11 @@ export type GoalPayload = { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: string | null; + judgeEvaluatedRunId?: string | null; }; export async function goalGetState(threadId: string): Promise { @@ -768,7 +773,7 @@ export async function goalClear(threadId: string): Promise { export type GoalEvaluateResult = { goal: GoalPayload; - verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited"; + verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited" | "skipped"; continuationPrompt?: string | null; }; From 3b77dd12c5c03c62091be6bef1e25ea2f8ea9a2b Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:39:05 +0800 Subject: [PATCH 02/16] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20re?= =?UTF-8?q?move=20mark=5Fcomplete=20and=20complete=20verdict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the mark_complete pathway from goals as completion will be handled through a different mechanism: - Remove mark_complete method from GoalManager - Remove "complete" from GoalEvaluateResult verdict type - Remove mark_complete test cases (evidence validation, etc.) 
- Update subagent surface comments to include judge BREAKING CHANGE: GoalEvaluateResult.verdict no longer includes "complete" --- src-tauri/src/core/goal_manager.rs | 23 --------------- src-tauri/src/core/prompt/surface.rs | 4 +-- src-tauri/tests/goal_lifecycle.rs | 40 --------------------------- src/services/bridge/agent-commands.ts | 2 +- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index 7486e39b..bd7298db 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -213,29 +213,6 @@ impl GoalManager { Ok(()) } - /// Mark the goal as complete with evidence. - pub async fn mark_complete(&self, goal_id: &str, evidence: &str) -> Result<(), AppError> { - if evidence.trim().is_empty() { - return Err(AppError::validation( - ErrorSource::Settings, - "evidence is required to mark a goal as complete", - )); - } - let updated = goal_repo::update_status( - &self.pool, - goal_id, - GoalStatus::Complete, - None, - None, - Some(evidence), - ) - .await?; - if !updated { - return Err(AppError::not_found(ErrorSource::Settings, "goal")); - } - Ok(()) - } - /// Mark the goal as budget-limited. 
pub async fn mark_budget_limited(&self, goal_id: &str) -> Result<(), AppError> { let updated = goal_repo::update_status( diff --git a/src-tauri/src/core/prompt/surface.rs b/src-tauri/src/core/prompt/surface.rs index 92b6ef76..554bb0b4 100644 --- a/src-tauri/src/core/prompt/surface.rs +++ b/src-tauri/src/core/prompt/surface.rs @@ -48,9 +48,9 @@ pub enum SurfacePattern { AnyMainAgent, /// Matches a specific MainAgent run_mode MainAgent(RunMode), - /// Matches any subagent surface (explore, review, custom) + /// Matches any subagent surface (explore, review, judge, custom) AnySubagent, - /// Matches built-in explore + review subagents only + /// Matches built-in explore + review + judge subagents only BuiltinSubagent, /// Matches any custom subagent regardless of slug CustomSubagent, diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 20ead9f9..ecff3198 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -320,24 +320,6 @@ mod tests { assert_eq!(paused.status, GoalStatus::Paused); } - #[tokio::test] - async fn mark_complete_with_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - mgr.mark_complete(&goal.id, "All tests pass, files created") - .await - .unwrap(); - - let completed = mgr.get_active().await.unwrap().unwrap(); - assert_eq!(completed.status, GoalStatus::Complete); - assert_eq!( - completed.evidence.as_deref(), - Some("All tests pass, files created") - ); - } - #[tokio::test] async fn mark_budget_limited() { let pool = setup_pool().await; @@ -384,28 +366,6 @@ mod tests { assert!(prompt.contains("cannot self-declare")); } - // ── mark_complete validation & test gap coverage ── - - #[tokio::test] - async fn mark_complete_rejects_empty_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), 
test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, "").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); - } - - #[tokio::test] - async fn mark_complete_rejects_whitespace_only_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, " ").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); - } - #[tokio::test] async fn evaluate_after_turn_token_budget_exhausted_returns_budget_limited() { let pool = setup_pool().await; diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index d43262ca..d6ec3012 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -773,7 +773,7 @@ export async function goalClear(threadId: string): Promise { export type GoalEvaluateResult = { goal: GoalPayload; - verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited" | "skipped"; + verdict: "continue" | "challenge_evidence" | "paused" | "budget_limited" | "skipped"; continuationPrompt?: string | null; }; From b204d9b92d6ae7f4b9cc7c7a4edeb9dfe60a420e Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:47:23 +0800 Subject: [PATCH 03/16] =?UTF-8?q?docs:=20=F0=9F=93=9D=20update=20and=20reo?= =?UTF-8?q?rder=20README=20feature=20list?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the feature descriptions and reorder the bullet points in both README.md and README_zh.md to better reflect the current product capabilities and improve readability. 
Changes include: - Reordering features to highlight persistent goal management, real-time streaming, and extensibility earlier in the list - Updating descriptions for several features to be more accurate - Maintaining consistency between English and Chinese versions - Keeping the overall structure while improving flow These are documentation-only changes that do not affect functionality. --- README.md | 12 ++++++------ README_zh.md | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d7d030bf..a99948d1 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,22 @@ Around that collaboration model, TiyCode brings together Agent Profiles, workspa - **AI-first coding collaboration.** TiyCode is designed around the idea that humans express intent through conversation while agents take the lead in execution. - **Agent Profiles.** Mix models from different providers, tune response style, language, and custom instructions, and switch profiles flexibly for different kinds of work. +- **Persistent goal management.** Define long-running objectives for agents to pursue across multiple turns. An independent Judge subagent evaluates completion against actual file changes, command outputs, and commit history — eliminating self-attestation bias. - **Custom Agents.** Create purpose-built sub-agents in Settings — each with its own name, system prompt, model tier, and allowed tools — then grant per-profile access and delegate work from the composer. - **Three-tier model architecture.** Each profile supports a Primary model for core reasoning, an Auxiliary model for helper tasks, and a Lightweight model for fast operations — with automatic fallback chains across tiers. - **Multi-provider support.** Connect to 13+ LLM providers out of the box — OpenAI, Anthropic, Google, Ollama, xAI, Groq, OpenRouter, DeepSeek, MiniMax, Kimi, and more — or add any OpenAI-compatible endpoint as a custom provider. 
- **Workspace-centered execution.** Threads stay grounded in the local workspace and connect naturally to code review, version control, repository inspection, Git worktrees, and terminal workflows. - **Task-aware execution.** Thread-scoped task boards, plan checkpoints, tool status events, and subagent progress make longer runs easier to follow and review. -- **Persistent goal management.** Set long-running objectives for agents to pursue across multiple turns, with automatic continuation, budget controls, and progress tracking. +- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. - **Rich composer inputs.** Prompt input supports text, file/photo attachments, screenshots, slash command structured argument interpolation (`--key=value`, positional args, `{{placeholder}}` templates), and large-paste handling. - **Steer & Queue.** While the agent is running, choose to steer the conversation mid-execution or queue a follow-up message for the next round — keeping you in control without interrupting the workflow. -- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. -- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. -- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. 
-- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. +- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. - **ACP Server support.** TiyCode can run as a headless ACP (Agent Client Protocol) server via `tiycode acp --stdio` or `tiycode acp --http `, letting external tools and IDE plugins drive the agent runtime through a standard JSON-RPC protocol without the desktop GUI. - **IM channel gateway.** Connect TiyCode to WeChat or WeCom so you can chat with the agent directly from your messaging app — scan a QR code to log in, send messages and attachments, and receive streaming responses without opening the desktop GUI. -- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. +- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. +- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. - **Built-in runtime path.** The main execution flow is `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`. +- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. 
## Tech Stack diff --git a/README_zh.md b/README_zh.md index c9bbdcde..dc615077 100644 --- a/README_zh.md +++ b/README_zh.md @@ -28,22 +28,22 @@ TiyCode 面向的是希望以 AI 时代的方式进行编码协作的用户。 - **AI First 的编码协作。** TiyCode 围绕"通过对话表达意图,Agent 全面执行"这一理念来设计产品形态。 - **Agent Profile。** 支持自由组合不同服务商的模型,并可配置回复风格、回复语言、自定义指令等设定,且能在不同 Profile 之间灵活切换。 +- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,由独立的 Judge 验收 Agent 基于实际文件变更、命令输出和提交历史进行完成判定——杜绝"自说自话"的信任缺陷。 - **Custom Agents。** 在设置中心创建专用子 Agent——每个拥有独立的名称、系统提示、模型层级和可用工具——按 Profile 授权后即可从 composer 委派任务。 - **三层模型架构。** 每个 Profile 支持配置 Primary 主力模型、Auxiliary 辅助模型和 Lightweight 轻量模型三个层级,层级之间具备自动回退链路。 - **多服务商接入。** 开箱支持 13+ 家 LLM 服务商 —— OpenAI、Anthropic、Google、Ollama、xAI、Groq、OpenRouter、DeepSeek、MiniMax、Kimi 等,也可将任何 OpenAI 兼容端点作为自定义 Provider 接入。 - **以工作区为中心的执行体验。** 对话线程扎根本地工作区,并与代码审阅、版本控制、仓库状态读取、Git worktree 和 Terminal 工作流自然衔接。 - **面向任务的执行可观测性。** Thread 级任务板、Plan checkpoint、工具状态事件和子 Agent 进度让长任务更容易跟踪和复查。 -- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,支持自动延续、预算控制和进度跟踪。 +- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 - **更丰富的输入能力。** Prompt 输入支持文本、文件 / 图片附件、截图、Slash Command 结构化参数插值(`--key=value`、位置参数、`{{placeholder}}` 模板变量)以及大段文本粘贴处理。 - **Steer 与 Queue。** Agent 运行中可选择「引导」即时插入消息调整方向,或「排队」将消息留待当前运行结束后再发起下一轮——无需中断工作流即可保持掌控。 -- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 -- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 -- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 -- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 +- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 `Extensions Center` 形成统一的扩展入口与产品模型。 - **ACP Server 支持。** TiyCode 可作为无头 ACP(Agent Client Protocol)服务器运行,通过 `tiycode acp --stdio` 或 `tiycode acp --http ` 启动,让外部工具和 IDE 插件通过标准 JSON-RPC 协议驱动 Agent 运行时,无需启动桌面 GUI。 - **IM 通道网关。** 将 TiyCode 接入微信或企业微信,扫码登录后即可在聊天应用中直接与 Agent 对话——发送消息和附件、接收流式回复,无需打开桌面 GUI。 -- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 
`Extensions Center` 形成统一的扩展入口与产品模型。 +- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 +- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 - **内置 Runtime。** 主执行链路 `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`。 +- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 ## 技术栈 From e284fbeb808919994d112a38bbd02a443e226221 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 12:57:00 +0800 Subject: [PATCH 04/16] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20ex?= =?UTF-8?q?tract=20resolveGoalStatusKey=20for=20testability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract inline status key resolution into a pure exported function so the complete→verified (judgePassed) branch can be unit-tested without mounting the component - Add unit tests covering all status mappings and judgePassed variants - Add test for skipped verdict passthrough in goalEvaluate --- .../ui/goal-status-bar.test.tsx | 21 +++++++++++ .../workbench-shell/ui/goal-status-bar.tsx | 35 +++++++++++++------ src/services/bridge/agent-commands.test.ts | 10 ++++++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx index 24976d9e..6c049117 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx @@ -1,4 +1,5 @@ import { describe, expect, it } from "vitest"; +import { resolveGoalStatusKey } from "./goal-status-bar"; const source = await import("./goal-status-bar?raw").then((module) => module.default as string); @@ -21,3 +22,23 @@ describe("GoalStatusBar layout contract", () => { expect(source).not.toContain("goal.time.hoursMinutes"); }); }); + +describe("resolveGoalStatusKey", () => { + it("maps non-complete statuses to their own keys", () => { + expect(resolveGoalStatusKey("active", 
undefined)).toBe("goal.status.active"); + expect(resolveGoalStatusKey("paused", undefined)).toBe("goal.status.paused"); + expect(resolveGoalStatusKey("budget_limited", undefined)).toBe("goal.status.budgetLimited"); + }); + + it("shows the verified label only when a complete goal passed Judge acceptance", () => { + expect(resolveGoalStatusKey("complete", true)).toBe("goal.status.verified"); + }); + + it("falls back to the plain complete label when judge has not passed", () => { + expect(resolveGoalStatusKey("complete", false)).toBe("goal.status.complete"); + }); + + it("treats an undefined judgePassed as not verified", () => { + expect(resolveGoalStatusKey("complete", undefined)).toBe("goal.status.complete"); + }); +}); diff --git a/src/modules/workbench-shell/ui/goal-status-bar.tsx b/src/modules/workbench-shell/ui/goal-status-bar.tsx index 83c2bbf7..86ace721 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.tsx @@ -2,13 +2,36 @@ import { useCallback, useState } from "react"; import { goalGetState, goalPause, goalResume, goalClear } from "@/services/bridge/agent-commands"; -import { threadStore, useStore, shallowEqual } from "@/modules/workbench-shell/model/thread-store"; +import { threadStore, useStore, shallowEqual, type GoalStoreState } from "@/modules/workbench-shell/model/thread-store"; import { useT } from "@/i18n"; type Props = { threadId: string; }; +/** + * Resolve the i18n key for the goal status label. Extracted as a pure function + * so the `complete` → `verified` (judgePassed) branch can be unit-tested without + * mounting the component. 
+ */ +export function resolveGoalStatusKey( + status: GoalStoreState["status"], + judgePassed: GoalStoreState["judgePassed"], +): + | "goal.status.active" + | "goal.status.paused" + | "goal.status.budgetLimited" + | "goal.status.verified" + | "goal.status.complete" { + switch (status) { + case "active": return "goal.status.active"; + case "paused": return "goal.status.paused"; + case "budget_limited": return "goal.status.budgetLimited"; + case "complete": return judgePassed ? "goal.status.verified" : "goal.status.complete"; + default: return "goal.status.active"; + } +} + export function GoalStatusBar({ threadId }: Props) { const t = useT(); const goal = useStore(threadStore, (s) => s.goalState[threadId] ?? null, shallowEqual); @@ -30,15 +53,7 @@ export function GoalStatusBar({ threadId }: Props) { if (!goal) return null; - const statusKey = (() => { - switch (goal.status) { - case "active": return "goal.status.active"; - case "paused": return "goal.status.paused"; - case "budget_limited": return "goal.status.budgetLimited"; - case "complete": return goal.judgePassed ? "goal.status.verified" : "goal.status.complete"; - default: return "goal.status.active"; - } - })(); + const statusKey = resolveGoalStatusKey(goal.status, goal.judgePassed); const statusColor = goal.status === "active" ? 
"bg-blue-500" diff --git a/src/services/bridge/agent-commands.test.ts b/src/services/bridge/agent-commands.test.ts index 4ffb735a..25695b82 100644 --- a/src/services/bridge/agent-commands.test.ts +++ b/src/services/bridge/agent-commands.test.ts @@ -608,6 +608,16 @@ describe("goalEvaluate", () => { expect(result).toBeNull(); }); + it("passes through the skipped verdict for already-accepted goals", async () => { + isTauriMock.mockReturnValue(true); + const result = makeGoalEvaluateResult({ verdict: "skipped", continuationPrompt: null }); + invokeMock.mockResolvedValueOnce(result); + + const outcome = await goalEvaluate("thread-1"); + expect(outcome!.verdict).toBe("skipped"); + expect(outcome!.continuationPrompt).toBeNull(); + }); + it("requires Tauri runtime", async () => { isTauriMock.mockReturnValue(false); From e8a58f2767fb3ecaa6cb2486cf9cc3e2c80c9971 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 14:59:06 +0800 Subject: [PATCH 05/16] =?UTF-8?q?refactor(subagent):=20=F0=9F=94=A7=20incr?= =?UTF-8?q?ease=20builtin=20default=20max=20delegation=20depth=20to=205?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raise `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH` from 3 to 5 to match the existing `GLOBAL_MAX_DELEGATION_DEPTH`, allowing built-in subagents (explore/review) to be delegated to the same depth as custom profiles. Update delegation validation tests to reflect the new depth limits. 
--- src-tauri/src/core/subagent/orchestrator.rs | 16 +++++++++------- .../src/core/subagent/runtime_orchestration.rs | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src-tauri/src/core/subagent/orchestrator.rs b/src-tauri/src/core/subagent/orchestrator.rs index cb890700..0ee2c210 100644 --- a/src-tauri/src/core/subagent/orchestrator.rs +++ b/src-tauri/src/core/subagent/orchestrator.rs @@ -1909,7 +1909,7 @@ mod tests { #[test] fn validate_delegation_allows_review_to_explore_at_depth_2() { - // Main(1) → review(2): review can delegate, explore.max=3 >= 2. + // Main(1) → review(2): review can delegate, explore.max=5 >= 2. validate_delegation_capability( &SubagentProfile::Review, &RuntimeOrchestrationTool::Explore, @@ -1921,15 +1921,17 @@ mod tests { #[test] fn validate_delegation_rejects_when_child_depth_exceeds_target_max() { - // child_depth 4 exceeds explore.max_delegation_depth (3). + // Custom target with max=4 cannot be reached at depth 5 (exceeds its config but + // still within GLOBAL_MAX_DELEGATION_DEPTH). + let target = custom_profile(true, 4); let err = validate_delegation_capability( &SubagentProfile::Review, - &RuntimeOrchestrationTool::Explore, - &SubagentProfile::Explore, - 4, + &RuntimeOrchestrationTool::Custom("shallow".to_string()), + &target, + 5, ) - .expect_err("depth 4 must exceed explore max depth 3"); - assert!(err.contains("max delegation depth is 3")); + .expect_err("depth 5 must exceed custom max depth 4"); + assert!(err.contains("max delegation depth is 4")); } #[test] diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index 25c72e8a..c458e098 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -14,7 +14,7 @@ pub const GLOBAL_MAX_DELEGATION_DEPTH: u32 = 5; /// Built-in default for the maximum delegation depth a built-in subagent /// (explore / review) may be delegated to. 
-pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 3; +pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 5; pub const TERM_STATUS_TOOL_DESCRIPTION: &str = "Inspect the status of the desktop app's embedded Terminal panel session for the current thread. Use this to check that panel's session state without mutating it. It does not inspect the agent runtime, CLI process, or host shell outside the panel."; @@ -1189,8 +1189,8 @@ mod tests { #[test] fn review_profile_omits_delegation_tools_beyond_builtin_depth() { - // child_depth 4 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (3). - let tools = SubagentProfile::Review.delegation_tools_for_helper(4, &[]); + // child_depth 6 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (5). + let tools = SubagentProfile::Review.delegation_tools_for_helper(6, &[]); assert!(tools.is_empty()); } From c15e885a9c41a4c774f7c7023155f7b6ef6e4d5f Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 15:01:11 +0800 Subject: [PATCH 06/16] =?UTF-8?q?docs:=20=F0=9F=93=9D=20remove=20obsolete?= =?UTF-8?q?=20design=20document?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/goal-judge-evaluation-refactor.md | 346 ------------------------- 1 file changed, 346 deletions(-) delete mode 100644 docs/goal-judge-evaluation-refactor.md diff --git a/docs/goal-judge-evaluation-refactor.md b/docs/goal-judge-evaluation-refactor.md deleted file mode 100644 index ebea6c5f..00000000 --- a/docs/goal-judge-evaluation-refactor.md +++ /dev/null @@ -1,346 +0,0 @@ -# Goal 评估与续行重构方案:引入 Judge 验收 Agent - -> 状态:设计方案(待评审) -> 关联模块:`src-tauri/src/core/goal_manager.rs`、`src-tauri/src/core/subagent/`、`src-tauri/src/core/agent_run_event_handler.rs`、`src-tauri/src/model/goal.rs` -> 决策基线(已澄清): -> 1. **保留全部现有护栏**(idle 空转、clarify/update_plan 暂停、token/turn 预算上限),仅把“是否完成”的判定从自主声明改为 Judge 验收。 -> 2. 
**复用 `GoalStatus::Complete` 状态** 表达“通过验收”,并在 `goals` 表新增 Judge 评估字段持久化最近一次裁决;迁移需把存量 `status='complete'` goal 回填为 `judge_passed=1`。 -> 3. **由主 agent 主动调用 `agent_judge`**,系统在 run 终止后通过续行 prompt 引导主 agent 先验收、未通过则修复后重验。 -> 4. **`agent_judge` 是主 agent 专属工具**:只在有未完成 goal 时注入主 agent,且运行时必须硬性拒绝任何 subagent 递归调用 Judge,即使工具名被 `RuntimeOrchestrationTool::parse()` 解析出来也不能放行。 -> 5. **Judge 使用诊断型 shell 软约束**:Judge 的文件工具保持只读;允许 `shell` 仅用于测试、type-check、lint、只读检查等诊断验证,并通过 Judge prompt 明确禁止用 shell 修改文件、删除数据、安装依赖或改变全局状态。首版不新增受限 shell 沙箱。 -> 6. **Judge 默认使用 primary 模型角色**,优先保证验收质量;首版不把 Judge/subagent 的 token 单独计入 goal token budget,也不新增 Judge 专属硬超时,沿用现有 helper run 的 turn/取消机制。 -> 7. **删除失效的自主完成路径**:移除 `goal_scored`、`GoalVerdict::Complete` 的旧自证语义,以及由 `goal_scored` 空 evidence 触发的 `NoEvidence` / `MISSING_EVIDENCE_PROMPT` 分支。 - ---- - -## 1. 背景与问题 - -当前 goal 的"完成"判定依赖主 agent 自主调用 `goal_scored(status, evidence, pledge)` 工具来声明达成。这是一种**自证式(self-attestation)**设计: - -- 工具内部只校验 `status == "complete"`、`pledge` 文本逐字匹配、`evidence` 非空(见 `agent_session_execution.rs` 的 `execute_goal_tool()`)。 -- 它**无法验证 evidence 的真伪**,也无法核对结果是否真的满足 goal 的一致性与完整性。 - -实测发现部分模型即便明知仍有未完成项,也会照抄 pledge 文本、编造 evidence 来调用 `goal_scored` 并提前结束任务。pledge + evidence 非空这类形式化护栏对"不诚实声明"无效,这是自主声明方式的**设计缺陷**。 - -**核心思路**:把"完成判定权"从被评估者(主 agent)手中移交给独立的评估者(Judge Agent)。主 agent 不能再自己宣布通过;只有 Judge 基于 goal 内容对项目当前状态做出"通过"裁决,goal 记录才会扭转为通过验收状态。续行监督也随之改为以"是否通过验收"为准。 - ---- - -## 2. 
现状梳理(已确认事实) - -### 2.1 Goal 数据模型与持久化 - -- `GoalStatus`(`src-tauri/src/model/goal.rs`):`Active` / `Paused` / `BudgetLimited` / `Complete` 四态。 -- `goals` 表(`migrations/20260530000000_goals.sql` 及后续迁移):每 `thread_id` 唯一一条 goal;含 `status`、`evidence`、`tokens_used`、`turns_used`、`max_turns`、`pause_reason`、`last_evaluated_run_id` 等列。 -- `GoalManager`(`src-tauri/src/core/goal_manager.rs`)封装 CRUD + 评估 + prompt 生成。关键方法:`mark_complete(goal_id, evidence)`、`evaluate_after_turn(response, goal) -> GoalVerdict`(同步 CPU 启发式)、`evaluate_after_run(run_id, response) -> GoalEvaluationOutcome`(异步、含去重 CAS)。 - -### 2.2 `goal_scored` 工具链路 - -- 工具定义在 `agent_session_tools.rs` 的 `runtime_tools_for_profile()`,常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE` 在 `goal_manager.rs`。 -- 调用分派在 `agent_session_execution.rs::execute_tool_call()` → `execute_goal_tool()`:校验 status/pledge/evidence → `mark_complete()` → 发送 `GoalCompleted` + `GoalStateUpdated` 事件。 - -### 2.3 续行监督逻辑 - -- run 终止后,`agent_run_event_handler.rs::maybe_continue_goal_after_terminal_run()` 是入口。 -- 前置条件:`goal_continuation_enabled == true`、`final_status ∈ {Completed, Interrupted}`。 -- 调用 `evaluate_after_run()` 内部走 `evaluate_after_turn()` 分层启发式: - - **Layer 1** 工具阻塞:`clarify` → `Paused(ClarifyPending)`;`update_plan` → `Paused(PlanPending)`;`goal_scored` 放行。 - - **Layer 2** idle/完成声明:连续 idle ≥ `MAX_IDLE_TURNS(3)` → `Paused(IdleBlocked)`;检测到完成关键词但未调工具 → `ChallengeEvidence`(反复声称达上限 → `IdleBlocked`)。 - - **Layer 3** 预算:tokens 超 budget → `BudgetLimited`;turns 超 `max_turns` → `Paused(BudgetExhausted)`。 - - 默认 → `Continue`。 -- verdict 为 `Continue` / `ChallengeEvidence` 时,用 continuation prompt 启动新 run;`Paused` / `BudgetLimited` / `skipped` 时不续行。 -- **关键现状**:续行从不查询 goal 的 `Complete` 状态。它实际依靠"模型没有再触发任何阻塞/完成信号 + goal 仍 `Active`"间接推断。一旦 `goal_scored` 被调用,`mark_complete()` 把 status 写成 `Complete`,下一轮 `evaluate_after_run()` 因 goal 非 `Active` 返回 `skipped`,从而停止续行。 - -### 2.4 Subagent 机制 - -- 内建 subagent:`Explore`、`Review`、`Parallel`,定义在 
`subagent/runtime_orchestration.rs` 的 `RuntimeOrchestrationTool` / `SubagentProfile`。 -- 深度模型:主 agent = depth 1;主 agent 直接子代理 = depth 2(`MAIN_AGENT_CHILD_DEPTH`);`GLOBAL_MAX_DELEGATION_DEPTH = 5`;内建默认 `BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH = 3`。 -- 委派校验:`orchestrator.rs::validate_delegation_capability(caller, target_tool, target_profile, child_depth)`,三重检查(调用方 `can_delegate`、全局上限、目标 `max_delegation_depth`)。 -- 权限模型:`Explore` 只读(read/list/find/search/web_search,`can_delegate=false`);`Review` 只读 + 诊断 shell + git/term 只读(`can_delegate=true`);`Custom` 按 `allowed_tools` 白名单。 -- 工具注入:主 agent 在 `agent_session_tools.rs::runtime_tools_for_profile()` 中 `tools.extend(runtime_orchestration_tools())`;自定义在 `agent_session.rs::build_session_spec()` 注入。 -- Prompt 注入:`build_helper_system_prompt()` 按 `PromptSurface`(`prompt/surface.rs`)选择 section;task 通过 `agent.prompt(request.task)` 注入为 user message。 - ---- - -## 3. 设计目标 - -1. 新增内建 **Judge** subagent,对项目当前状态做 goal 达成度评估,结构化返回:通过与否(bool)、完整度百分比、判定依据(未达成/不符合点描述)。 -2. Judge 通过时**扭转 goal 记录为通过验收状态**(复用 `Complete` + 持久化 Judge 字段)。 -3. Judge 上下文注入 goal 内容,评估重点是 goal 要求的**一致性**与**完整性**。 -4. Judge 文件工具保持**只读**,允许 `read` / `list` / `find` / `search` / `web_search`;允许 `shell` 但仅作为诊断型软约束工具用于测试、type-check、lint、只读检查;允许再发起 subagent(含并行,如 explore/review 协助),**自身最大被委派深度为 2**。 -5. **删除 `goal_scored` 工具**。完成判定不再由主 agent 自证。 -6. 续行监督改为:判定 goal 记录是否“通过验收”;未通过且 goal 仍 Active 则续行,并在 continuation prompt 中明确要求主 agent 调用 `agent_judge` 验收并遵循验收结果。 -7. **按需注入**:仅当 thread 有未通过验收的 goal 时,才向**主 agent**注入 `agent_judge` 工具;所有 subagent 均不注入且运行时拒绝递归调用 `agent_judge`;无 goal 或已验收通过时不注入。 - ---- - -## 4. 
总体设计 - -### 4.1 角色与职责重划 - -| 角色 | 重构前 | 重构后 | -|------|--------|--------| -| 主 agent | 自己调 `goal_scored` 声明完成 | 干活 + 自认为完成后调 `agent_judge` 申请验收;不能自证完成 | -| Judge agent | 不存在 | 独立验收者,文件工具只读且 shell 仅诊断软约束,基于 goal 评估项目当前状态,产出结构化裁决;通过则扭转 goal 状态 | -| 续行监督 | 间接依赖 goal 非 Active 停续行 | 显式以"goal 是否通过验收(Complete + judge_passed)"为停续行依据 | - -### 4.2 端到端数据流 - -``` -用户 /goal - └─ goal_set() → create_goal(status=Active) - └─ 注入 ActiveGoalSource 到主 agent system prompt(更新文案:完成须经 agent_judge 验收) - └─ 按需向主 agent 注入 agent_judge 工具(goal 存在且尚未通过验收) - -主 agent run:工作 → 自认为达成 → 调用 agent_judge(task) - └─ execute_tool_call() 路由到 Judge 编排 - └─ HelperAgentOrchestrator::run_helper(SubagentProfile::Judge) - ├─ build_helper_system_prompt(PromptSurface::SubagentJudge) + 注入 goal objective 到上下文 - ├─ Judge 工具集:read/list/find/search/web_search/shell(仅诊断软约束) + (depth 允许时)agent_explore/agent_review/agent_parallel - ├─ Judge 调研验证:读代码、搜索、运行测试/type-check/lint 等诊断命令、并行 explore/review - └─ 产出结构化 JudgeReport { passed, completeness_pct, findings, summary } - └─ Judge 编排回写 goal 记录: - ├─ 总是:persist 最近一次 judge_passed / judge_completeness / judge_findings / judge_summary / judge_evaluated_run_id - └─ passed == true:事务写入 status=Complete + judge_passed=true + evidence=summary - 发送 GoalCompleted + GoalStateUpdated 事件 - └─ agent_judge 工具结果(JudgeReport 文本)返回给主 agent - -run 终止 - └─ maybe_continue_goal_after_terminal_run() - └─ evaluate_after_run() - ├─ 若 goal.status == Complete && goal.judge_passed == true(已通过验收)→ skipped(停续行)✅ - ├─ 若 goal.status != Active → skipped(非活跃 goal 不自动续行,保留现有暂停/预算语义) - ├─ 否则保留现有护栏:clarify/update_plan/idle/预算 → Paused/BudgetLimited - └─ 否则 → Continue:注入新版 continuation prompt - "你尚未通过验收。请先用 agent_judge 验收;若上次验收未通过, - 按 findings 修复后再次调用 agent_judge。" - └─ Continue → 启动新 run(回到主 agent run) -``` - -### 4.3 为什么选择这套方案(与备选对比) - -- **复用 `Complete` 而非新增 `Verified` 枚举**:`Complete` 在 DDL CHECK 约束、`GoalStatus` 枚举、前端状态条、gateway 文案中均已铺开。新增枚举值需要同步迁移、前端、序列化多处,收益有限。改为复用 `Complete` 并以 `judge_passed` 布尔列区分"是否经 
Judge 验收",改动面最小且语义清晰(通过验收 = `Complete` 且 `judge_passed=true`)。 -- **保留全部护栏**:Judge 解决的是"完成判定的可信度",而 idle 空转、clarify/update_plan 暂停、预算上限解决的是"防止无限续行/资源失控/阻塞等待"。两者正交,移除护栏会让无 goal 评估能力时的兜底消失,引入失控风险。 -- **主 agent 主动调用 + 续行引导**(而非系统自动发起 Judge):保持与现有 subagent 调用模型一致(主 agent 通过工具调用委派),实现侵入小;系统侧只需在续行 prompt 中“催”主 agent 去验收,无需在 run 终止后再隐式拉起一个评估 run 改变运行时调度。续行 prompt 会持续施压,直到 goal 被 Judge 标记通过,规避了“主 agent 不调 Judge 就永远不验收”的死角。 -- **Judge 作为主 agent 专属内建工具**:虽然 `agent_judge` 会加入 `RuntimeOrchestrationTool::parse()`,但它不进入 `builtin_all()` 和 `delegation_tools_for_helper()`,也不允许 subagent 递归调用。这样保留统一工具解析与 helper 编排复用,同时避免 explore/review/custom/Judge 自己绕过“主 agent 申请验收”的职责边界。 -- **诊断型 shell 软约束而非新沙箱**:Judge 需要能运行测试、type-check、lint 等验证命令,因此首版复用现有 `shell` 工具;但该工具能力本身不是硬只读,必须在 Judge prompt 中明确限制为诊断用途,禁止修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。新建受限 shell/test-runner 工具会扩大改动面,首版暂不引入。 -- **Judge 使用 primary 模型角色**:验收质量优先于成本,Judge 默认走 `model_plan.primary`。Explore/Review 继续保持现有模型策略,Judge 内部再委派时由各子代理自己的模型映射决定。 - -### 4.4 首版范围边界 - -首版目标是打通后端 Judge 验收闭环:工具注入、subagent 运行、结构化解析、goal 回写、续行停止、迁移兼容和测试覆盖。前端仅同步类型并在现有状态条显示“已验收通过”这一最小信息;`judge_completeness` 的精细 UI、额外事件、ACP/gateway 的详细状态展示、Judge token 单独计入 goal budget、Judge 专属超时或受限 shell 沙箱均作为后续增强,不进入首版。 - ---- - -## 5. 
详细实现 - -### 5.1 Judge subagent profile(`subagent/runtime_orchestration.rs`) - -- `RuntimeOrchestrationTool` 新增变体 `Judge`,工具名映射 `agent_judge`;`parse("agent_judge") -> Some(Judge)`。同时补齐 `tool_name()`、`title()`、`description()`、`profile()`、`as_agent_tool()` 的 match 分支,`as_agent_tool()` 的 schema 只需要 `task: string`。 -- `SubagentProfile` 新增 `Judge` 变体,并补齐 `helper_kind()`(固定返回 `helper_judge`)、`system_prompt()`、`can_delegate()`、`max_delegation_depth()`、`helper_tools()` 等 match 分支。 -- `resolve_helper_profile()` 增加 `RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge)`;`resolve_helper_model_role()` 增加 Judge 分支,默认使用 `model_plan.primary`,不要复用 Explore/Review 的 auxiliary 映射。 -- `helper_tools()` for `Judge`:`read` / `list` / `find` / `search` / `web_search`(条件启用)/ `shell`(仅诊断验证)。**不含** `edit` / `write` / `term_write` / `term_restart` / `term_close`。需要在工具描述和 Judge prompt 中明确:`shell` 只能运行测试、type-check、lint、只读检查等诊断命令,不能修改文件、删除数据、安装依赖、启动交互式长进程或改变全局状态。这是 prompt 软约束,不是硬沙箱。 -- `can_delegate()` for `Judge`:`true`(允许 explore/review/parallel 协助)。 -- `max_delegation_depth()` for `Judge`:`2`(即 Judge **自身最大被委派深度为 2**——主 agent depth 1 直接委派 Judge 得到 depth 2,符合 `MAIN_AGENT_CHILD_DEPTH=2`;同时这意味着 Judge 内部委派的子级会是 depth 3,需在 `delegation_tools_for_helper()` 中据此过滤)。 - > 注意:需求所述“自身最大被委派深度为2”指 Judge 作为被委派目标时允许出现在 depth ≤ 2。为了让 Judge 仍能发起 explore/review/parallel(depth 3 子级),`delegation_tools_for_helper(child_depth)` 对内建目标的过滤阈值需复核:Judge 在 depth 2 调用子级时 `child_depth=3`,仍 ≤ `GLOBAL_MAX_DELEGATION_DEPTH(5)` 且 ≤ explore/review 的 `max_delegation_depth(3)`,故可注入。实现时确保 `validate_delegation_capability` 对 Judge→explore/review 放行。 -- `delegation_tools_for_helper()` 仍只注入 Explore / Review / Custom / Parallel,**不得注入 Judge**。这使 Judge 可以委派其他 helper,但任何 helper 不能委派 Judge。 -- `RESERVED_SUBAGENT_SLUGS` 增加 `"judge"`,防止自定义 subagent 占用该 slug。由于 `RuntimeOrchestrationTool::parse()` 对 `agent_{slug}` 有通配解析,保留 slug 能避免 `agent_judge` 与自定义工具名冲突。 -- `runtime_orchestration_tools()` **不无条件包含 Judge**:Judge 改为按需注入(见 
5.6),`builtin_all()` 保持仅含 explore/review/parallel,Judge 单独由主 agent 工具组装处按 goal 条件 push。 - -### 5.2 Judge 结构化协议(新增 `subagent/judge_contract.rs`) - -参照 `review_contract.rs` / `parallel_contract.rs` 模式新增: - -```rust -/// agent_judge 工具的入参(主 agent 传入)。 -pub struct JudgeRequest { - pub task: String, // 主 agent 对"为何认为达成"的说明 / 关注点 -} - -/// Judge 评估结构化产出。 -#[derive(Serialize, Deserialize)] -pub struct JudgeReport { - pub passed: bool, // 是否通过验收 - pub completeness_pct: u8, // 0-100 完整度百分比 - pub findings: Vec, // 未达成 / 不符合 goal 的具体点(passed=false 时必填) - pub summary: String, // 判定依据总述,作为通过时的 evidence -} -``` - -- Judge 的 system prompt(模板 `prompt/templates/subagent/judge.md`)强制要求最终以可解析的结构化形式(JSON 块或约定字段)返回上述四项。 -- `passed=true` 时 `summary` 必须非空,作为 `mark_complete()` 的 evidence;如果 Judge 输出 `passed=true` 但 `summary` 为空,解析层必须降级为 `passed=false`,避免无证据完成。 -- `completeness_pct` 解析后必须 clamp 到 0-100;`passed=false` 时 `findings` 必须非空,若模型未给出 findings,则把原始输出或“Judge did not provide actionable findings”写入 findings。 -- Judge 编排在拿到 Judge 文本输出后解析为 `JudgeReport`;解析失败按 `passed=false` 处理并把原始文本塞入 `findings`,避免误判通过。 - -### 5.3 Judge prompt surface 与上下文注入 - -- `prompt/surface.rs::PromptSurface` 新增 `SubagentJudge { inherited_run_mode }`。 -- `SurfacePattern::matches()` 同步更新:`AnySubagent` 必须匹配 `SubagentJudge`;`BuiltinSubagent` 也必须匹配 `SubagentJudge`,因为 Judge 是内建 subagent。若某些 prompt section 只应给 Explore/Review 而不应给 Judge,应改用更精确的 matcher 或新增 pattern,避免误注入。 -- `build_helper_system_prompt()` 增加 `SubagentProfile::Judge` → `PromptSurface::SubagentJudge { inherited_run_mode }` 映射。 -- `prompt/sources/custom_subagent_body.rs` 增加 Judge 模板映射:Judge → `templates/subagent/judge.md`。 -- `prompt/templates/subagent/judge.md`:定义 Judge 角色——独立验收员,只读评估,重点核对 goal 的一致性与完整性;说明可用工具(含诊断型 `shell`、可委派 explore/review/parallel);要求输出结构化 `JudgeReport`;明确禁止修改文件。`shell` 约束必须写成硬性行为指令:只能运行测试、type-check、lint、只读检查;不得通过 shell 编辑/删除文件、安装依赖、改变全局状态、启动交互式或长期驻留进程。 -- `prompt/sources/subagent_output_contract.rs` 增加 Judge 的输出契约 
`output_contract.judge.md`,并在 contract 中重复 `passed` / `completeness_pct` / `findings` / `summary` 的字段要求和失败兜底规则。 -- **goal 内容注入采用 task 前缀方案**:Judge 上下文必须包含 goal objective,且由 `agent_session_execution.rs` 的 Judge 分支在构造 helper task 时注入,不新增 DB 读取型 prompt source。 -- task 前缀必须包含:objective、当前 goal id/status、最近一次 Judge findings/summary(若有)、主 agent 传入的 `task` 说明。这样 Judge 不依赖主 agent 自述即可核对目标。 - -### 5.4 Judge 编排与 goal 回写(`agent_session_execution.rs` + `goal_manager.rs`) - -- `execute_tool_call()`:`RuntimeOrchestrationTool::parse()` 命中 `Judge` 时进入 Judge 专用分支,不直接走普通 `execute_helper_tool()` 返回路径。该分支可复用 `resolve_helper_delegate()` / `HelperAgentOrchestrator::run_helper()`,但必须在 helper 完成后追加 JudgeReport 解析和 goal 回写。 -- Judge 分支额外步骤: - 1. 调用前从 DB 加载当前 thread 的未完成 goal;无 goal 或 goal 已 `Complete && judge_passed=true` 则返回错误(agent_judge 仅在有 goal 时可用,理论上不会被注入)。 - 2. 把 `goal.objective`、goal id/status、最近一次 judge findings/summary、主 agent 传入的 `task` 拼成 Judge task 上下文。 - 3. 以 `SubagentProfile::Judge`、`RuntimeOrchestrationTool::Judge`、depth 2 启动 helper run;模型角色使用 `model_plan.primary`。 - 4. Judge run 结束后解析 `JudgeReport`;解析失败或字段非法按 `passed=false` 处理。 - 5. 调用新增 `GoalManager::record_judge_verdict(goal_id, run_id, &report)` 持久化最近裁决;若 `report.passed`,该方法在同一事务内写入 `status=complete`、`evidence=report.summary` 与 `judge_passed=true`。 - 6. 若通过验收,发送 `GoalCompleted` + `GoalStateUpdated` 事件;若未通过,也发送 `GoalStateUpdated`,让前端/后续续行能拿到最新 findings。 - 7. 
把 `JudgeReport` 文本作为工具结果返回主 agent;通过时结果中明确提示“goal 已通过验收,请停止修改并总结”,降低同一 run 后续继续改动的风险。 -- `GoalManager` 新增方法: - - `record_judge_verdict(&self, goal_id: &str, run_id: &str, report: &JudgeReport) -> Result`:写 `judge_passed` / `judge_completeness` / `judge_findings`(JSON) / `judge_summary` / `judge_evaluated_run_id`,并返回更新后的 record 供事件 payload 使用;passed 时同一事务同步写 `status=complete` 与 `evidence=report.summary`。 -- 原子性要求:`goal_repo.rs` 增加 `record_judge_verdict()` repo 方法,在事务内更新 judge_* 字段;passed 时同事务写 `status='complete'` 与 `evidence=summary`,确保 `status=complete` 与 `judge_passed=1` 不出现半更新;未通过时保持原 status(通常 Active)不变。 -- 预算边界:首版 Judge helper run 的 token 不单独计入 goal `tokens_used`。这是明确取舍;后续若要计入,需要扩展 `HelperRunResult` 携带 usage 并在 Judge 分支回写。 -- 同轮继续修改边界:系统不强行锁定 goal 后的写工具,因为主 agent 仍处于同一 run;通过验收后的工具结果和 `active_goal.tpl.md` prompt 必须要求停止修改。若未来需要硬约束,可在 `execute_tool_call()` 中对 `Complete && judge_passed` 后的 mutating tools 增加拒绝策略,首版不做。 - -### 5.5 删除 `goal_scored` 工具 - -- 删除工具定义(`agent_session_tools.rs` 中的 `goal_scored` `AgentTool::new(...)`)。 -- 删除分派分支与 `execute_goal_tool()`(`agent_session_execution.rs`)。 -- 移除常量 `GOAL_SCORED_TOOL_NAME` / `GOAL_SCORED_PLEDGE`(`goal_manager.rs`),以及 `evaluate_after_turn()` 中 `detect_tool_based_blocking` 对 `goal_scored` 的放行分支。 -- 删除旧自证语义:`GoalVerdict::Complete { evidence }` 当前没有有效生产者,删除 `goal_scored` 后一并移除,并删除 `evaluate_after_run()` 中的旧 match 分支,减少死代码。 -- 删除 `ChallengePromptVariant::NoEvidence` 与 `MISSING_EVIDENCE_PROMPT`,因为它们只服务于“调用 `goal_scored` 但 evidence 为空”的旧路径;保留 completion-claim 检测对应的 `ChallengeEvidence` / `NoTool` 语义,并把文案改为“声称完成但尚未调用 `agent_judge` 验收”。 -- 护栏保留但需改写文案:`ChallengeEvidence` 与 completion-claim 检测仍作为“提醒主 agent 去验收”的软提示,引导语从“调用 goal_scored”改为“调用 agent_judge 验收”。`GUIDANCE_PROMPT` 同步更新。 -- `agent_judge` 会被 `record_tool_call()` 记录到 goal runtime tool calls;`detect_tool_based_blocking()` 不应把它视为阻塞工具,也不应触发 pause。它与普通工具调用一样表示 agent 有行动,能重置 idle 倾向。 -- 全局检索并清理 `goal_scored` 引用:系统 prompt、`active_goal.tpl.md`、gateway 文案、前端 hardcoded kickoff 
prompt、测试(`tests/goal_lifecycle.rs`)等。 - -### 5.6 按需注入 `agent_judge`(仅主 agent,仅有未完成 goal 时) - -- 注入点在主 agent 工具组装处。`runtime_tools_for_profile()` 当前是纯 profile 函数,不知道 thread goal 状态;推荐在其调用方 `build_session_spec()`(`agent_session.rs`)查询并追加 Judge 工具,避免把 DB 依赖塞进纯工具构造函数。 - - 在 `build_session_spec()` 已能访问 `pool` 与 `thread_id`,查询 `goal_repo::find_by_thread_id`,若存在且尚未通过验收,则 push `RuntimeOrchestrationTool::Judge.as_agent_tool()`。 - - “尚未通过验收”的判定为:goal 存在且不是 `status == Complete && judge_passed == true`。实际自动续行仍只对 `Active` 生效;但工具注入可允许用户在恢复/继续场景中对 `Paused` 或 `BudgetLimited` goal 重新申请验收。 - - goal 不存在或已 `Complete && judge_passed`(已验收)则不注入。 -- `runtime_tools_with_custom_subagents()` 与 extension tool 合并时需维持内建工具名优先级,防止 extension/custom 工具覆盖 `agent_judge`。 -- **subagent 不注入**:Judge 工具只在主 agent 工具集 push,不进入 `delegation_tools_for_helper()` 的候选;任何 subagent(含 Judge 自身、explore/review/custom)的可委派目标列表都不包含 `agent_judge`。 -- **运行时硬门禁**:仅“不注入”不足够,因为模型或测试仍可能构造 `agent_judge` 调用,且 `RuntimeOrchestrationTool::parse()` 会命中。必须在 subagent 递归委派路径(例如 `HelperDelegationContext::handle_delegation()` / `resolve_delegation()`)中显式拒绝 `RuntimeOrchestrationTool::Judge`,返回“agent_judge can only be called by the main agent for the current goal”之类错误。 -- `agent_parallel` 的任务列表也必须拒绝 `agent_judge`。`validate_parallel_delegate_safety()` 或解析 parallel task 的位置应把 Judge 视为非法 batch target,避免通过 parallel 间接调用 Judge。 -- 主 agent 侧 `execute_tool_call()` 的 Judge 分支也要重新查询 goal 状态,不能只依赖工具注入时的状态;这是防止 race / stale tool set 的后端 backstop。 - -### 5.7 续行监督改造(`agent_run_event_handler.rs` + `goal_manager.rs`) - -- `evaluate_after_run()` / `evaluate_after_turn()` 开头新增**显式终止判定**:若 goal 已“通过验收”(`status == Complete && judge_passed == true`)→ 返回 `skipped`(停续行)。这是停续行的**主依据**。 -- 存量兼容依赖迁移回填:迁移后不应出现旧路径产生的 `status=Complete && judge_passed=false`。如果运行时遇到该组合,按异常兼容处理并停续行或记录 warning;不要把旧 complete goal 重新拉起续行。 -- 对 `Paused` / `BudgetLimited` 仍按现有语义返回 skipped,不自动续行。只有 `Active` goal 会继续进入护栏评估。 -- 其余护栏(clarify/update_plan/idle/预算)保留,作用不变。 -- `Continue` / 
`ChallengeEvidence` verdict 的 continuation prompt 改写为新模板(替换 `CONTINUATION_PROMPT_TEMPLATE`): - -``` -[Goal continuation — turns {turns_used}/{max_turns}] - -**Objective:** {objective} - -继续推进该目标,执行下一个具体步骤。 - -⚠️ 完成判定已改为独立验收:当你认为目标已达成时,必须调用 - agent_judge(task="说明为何认为已达成 / 需重点核对的点") -由 Judge 评估项目是否满足目标的一致性与完整性。 -- 仅当 Judge 裁决 passed=true 时,目标才会被标记为通过验收并停止续行。 -- 若上一次 Judge 验收未通过,请阅读其 findings,逐项修复后再次调用 agent_judge。 -你无法自行声明完成;只有通过 Judge 验收才算达成。 - -如果你被阻塞、需要用户输入,请使用 clarify 工具。 -``` - -- 若最近一次 Judge 未通过,必须把 `judge_findings` 摘要拼接进 continuation prompt,提升修复指向性;摘要可限制长度,避免 prompt 过长。 - -### 5.8 数据库迁移 - -新增迁移 `migrations/2026XXXXXXXXXX_goal_judge_fields.sql`: - -```sql -ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool -ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable -ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable -ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable -ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable - --- 兼容旧版本 goal_scored 已完成的 goal,避免升级后被误判为未验收。 -UPDATE goals -SET judge_passed = 1, - judge_summary = COALESCE(judge_summary, evidence), - judge_completeness = COALESCE(judge_completeness, 100) -WHERE status = 'complete'; -``` - -- `GoalRecord` / `GoalDto` / `GoalPayload`(`model/goal.rs`)同步新增字段:`judge_passed: bool`、`judge_completeness: Option`(DB 读写时校验 0-100)、`judge_findings: Option`(JSON 文本,DTO 透传字符串,前端按 string/null 接收)、`judge_summary: Option`、`judge_evaluated_run_id: Option`。 -- `goal_repo.rs` 同步更新 `SELECT_COLUMNS`、`GoalRow`、`into_record()`、`insert()`。新增 `record_judge_verdict()` repo 方法,负责写 judge_* 字段;passed 时同一事务同步写 `status='complete'` 与 `evidence=summary`。 -- 若 `judge_findings` 以 JSON array 字符串存储,写入前由 `serde_json::to_string(&report.findings)` 生成;读取失败时不要 panic,DTO 可原样返回或置为 `None` 并记录 warning。 - -### 5.9 前端、IPC、gateway 与 ACP - -- `ThreadStreamEvent` 首版复用现有 `GoalCompleted` / `GoalStateUpdated`,不新增 Judge 专属事件。`GoalPayload` 增加 judge 字段后,现有事件 payload 
即可携带最新裁决。 -- 前端 `GoalPayload` 类型(如 `src/services/bridge/agent-commands.ts`)与 store 类型(如 `src/modules/workbench-shell/model/thread-store.ts`)补充 judge 字段;状态条在 `Complete && judgePassed` 时显示“已验收通过”。`judge_completeness` 的进度/百分比 UI 为二阶段增强。 -- `goal-status-bar.tsx` 只做最小展示;若未实现详细展示,也必须保证新增字段不会破坏类型检查。 -- gateway / ACP 首版只要求文案与行为不再引用 `goal_scored`,并确保这些入口启动主 agent 时使用同一 `build_session_spec()` 注入逻辑,因此有未完成 goal 时也能拿到 `agent_judge`。详细展示 Judge findings/completeness 可后续增强。 - ---- - -## 6. 影响文件清单 - -| 文件 | 改动 | -|------|------| -| `src-tauri/src/model/goal.rs` | `GoalRecord`/`GoalDto`/`GoalPayload` 新增 judge_* 字段;删除 `GoalVerdict::Complete` 旧自证变体 | -| `src-tauri/src/core/goal_manager.rs` | 删除 `GOAL_SCORED_*` 常量与放行分支;删除 `MISSING_EVIDENCE_PROMPT` / `NoEvidence` 旧路径;新增 `record_judge_verdict()`;续行终止判定改为 `Complete && judge_passed`;改写 continuation/guidance 文案并拼接最近 findings | -| `src-tauri/src/core/subagent/runtime_orchestration.rs` | `RuntimeOrchestrationTool::Judge` + `SubagentProfile::Judge`(工具集/can_delegate/max_delegation_depth=2);`parse`/`profile`/`as_agent_tool`/`helper_kind` 等 match 补齐;保留 slug;`builtin_all()` 不含 Judge | -| `src-tauri/src/core/subagent/judge_contract.rs`(新增) | `JudgeRequest` / `JudgeReport` 结构化协议、JSON 解析、字段校验、失败兜底 | -| `src-tauri/src/core/subagent/orchestrator.rs` | `build_helper_system_prompt()` 支持 Judge surface;subagent 递归委派路径硬性拒绝 `agent_judge`;保持 Judge→explore/review/parallel 放行 | -| `src-tauri/src/core/subagent/parallel_contract.rs` / 相关 parallel 校验 | `agent_parallel` task 拒绝 `agent_judge` 作为子任务 | -| `src-tauri/src/core/agent_session_execution.rs` | 删除 `goal_scored` 分派与 `execute_goal_tool()`;新增 Judge 专用分支(加载 goal → task 前缀注入 → helper run → 解析 JudgeReport → 回写 goal → 发送事件) | -| `src-tauri/src/core/agent_session_tools.rs` | 删除 `goal_scored` 工具定义;保持基础 runtime tools 不含 Judge;如新增 helper 函数则提供 `agent_judge` 工具构造 | -| `src-tauri/src/core/agent_session.rs` | `build_session_spec()` 查询 goal,按“未通过验收”条件向主 agent 追加 `agent_judge`;`resolve_helper_model_role()` 将 Judge 映射到 
primary | -| `src-tauri/src/core/prompt/surface.rs` | `PromptSurface::SubagentJudge`;`SurfacePattern::AnySubagent` / `BuiltinSubagent` 匹配 Judge | -| `src-tauri/src/core/prompt/sources/custom_subagent_body.rs` | Judge → `templates/subagent/judge.md` | -| `src-tauri/src/core/prompt/sources/subagent_output_contract.rs` | Judge 输出契约 | -| `src-tauri/src/core/prompt/templates/subagent/judge.md`(新增) | Judge 角色、诊断型 shell 软约束、委派说明与结构化输出要求 | -| `src-tauri/src/core/prompt/templates/active_goal.tpl.md` | 完成判定改为经 agent_judge 验收,并提示通过后停止修改 | -| `src-tauri/src/core/prompt/sources/active_goal.rs` | 文案同步(如有引用) | -| `src-tauri/src/persistence/repo/goal_repo.rs` | judge_* 列读写;新增 `record_judge_verdict()`;passed 时原子写 status/evidence/judge_* | -| `src-tauri/migrations/2026XXXXXXXXXX_goal_judge_fields.sql`(新增) | judge_* 列迁移,并回填旧 `status='complete'` 为 `judge_passed=1` | -| `src-tauri/src/gateway/gateway_runner.rs` | 移除 `goal_scored` 引导文案,改为 agent_judge 验收说明 | -| `src-tauri/src/acp/**`(如有 goal 文案/事件映射) | 确认不引用 `goal_scored`;复用 GoalStateUpdated payload 的 judge 字段 | -| `src-tauri/tests/goal_lifecycle.rs` | 重写:覆盖 Judge 通过→Complete+judge_passed→停续行;未通过→续行;旧 complete 回填兼容 | -| `src-tauri/src/core/agent_session_tests.rs` / subagent tests | 覆盖 Judge profile、模型角色、工具注入、递归拒绝、parallel 拒绝、prompt surface 匹配 | -| `src/services/bridge/agent-commands.ts` | 前端 `GoalPayload` 类型新增 judge 字段 | -| `src/modules/workbench-shell/model/thread-store.ts` | `GoalStoreState` 新增 judge 字段 | -| `src/modules/workbench-shell/ui/goal-status-bar.tsx` | 最小展示 `Complete && judgePassed` 为“已验收通过” | -| `src/modules/workbench-shell/ui/runtime-thread-surface.tsx` | 清理 goal kickoff prompt 中的 `goal_scored` 示例,改为 agent_judge 验收说明 | - ---- - -## 7. 
验证计划 - -- **Rust 格式**:`cargo fmt --check --manifest-path src-tauri/Cargo.toml`。 -- **Rust 行为**:`cargo test --locked --manifest-path src-tauri/Cargo.toml`,重点 `goal_lifecycle`、subagent 委派、prompt surface 与迁移相关测试。新增/重写用例: - - Judge `passed=true` → goal 变 `Complete` 且 `judge_passed=true`,`judge_summary/evidence` 非空,下一轮 `evaluate_after_run` 返回 skipped(停续行)。 - - Judge `passed=false` → goal 仍进行中,写入 `judge_findings`,`evaluate_after_run` 返回 `Continue` 且 continuation prompt 包含最近 findings 并引导调用 `agent_judge`。 - - 存量 `status='complete'` 迁移后 `judge_passed=1`、`judge_completeness=100`,不会被新续行逻辑重新拉起。 - - `agent_judge` 仅在有未通过验收 goal 时注入主 agent;无 goal 或已验收通过时主 agent 工具集不含 `agent_judge`;任何 subagent 工具集不含 `agent_judge`。 - - 运行时门禁:subagent 直接调用 `agent_judge` 被拒绝;`agent_parallel` task 使用 `agent_judge` 被拒绝;主 agent→Judge 合法(depth 2);Judge→explore/review 合法(depth 3)。 - - Judge 模型角色使用 primary;Explore/Review 仍保持既有模型映射。 - - Prompt surface:`SubagentJudge` 能构建 system prompt;`AnySubagent` / `BuiltinSubagent` 匹配 Judge;Judge 模板包含诊断型 shell 软约束和结构化输出契约。 - - JudgeReport 解析失败、`passed=true` 但 summary 空、completeness 越界、`passed=false` findings 空 → 均视为未通过或安全兜底,不误标完成。 - - `goal_scored` 工具与常量已删除(编译期 + 检索为 0 个非历史设计文档引用)。 -- **前端**:`npm run typecheck`;若改动前端测试则 `npm run test:unit`。重点验证 `GoalPayload` / `GoalStoreState` 新字段不会破坏事件处理,`goal-status-bar.tsx` 能显示已验收通过。 -- **文案检索**:全局搜索 `goal_scored`,除历史文档/迁移注释外不应有运行时 prompt、前端提示或 gateway 文案引用。 -- **手动冒烟**:创建 goal → 主 agent 工作 → 调 agent_judge 未通过(findings)→ 续行修复 → 再次 agent_judge 通过 → goal 状态条显示已验收、续行停止。 - ---- - -## 8. 风险与边界 - -1. **主 agent 始终不调用 `agent_judge`**:goal 永远不被验收,续行会持续注入 prompt 直至护栏触发(idle/预算上限)。这正是护栏保留的价值——兜底防止无限续行。需在 prompt 中强力引导主 agent 调用 agent_judge。 -2. **Judge 误判**:Judge 也是 LLM,可能误通过或误拒。误通过风险通过“独立上下文 + 文件工具只读 + primary 模型 + 重点核对一致性/完整性 + 可跑诊断验证”降低;误拒会触发续行修复,代价是额外轮次。 -3. **诊断型 shell 不是硬只读**:Judge 可用 `shell` 意味着理论上能执行修改性命令。首版通过 Judge prompt 进行软约束,要求只运行测试、type-check、lint、只读检查,并禁止修改文件、删除数据、安装依赖、改变全局状态。若后续发现模型不稳定,应新增受限 test-runner 或 shell allowlist。 -4. 
**Judge 成本**:每次验收会拉起一个可委派的 subagent run,可能再并行 explore/review,token/时间开销不小。首版不把 Judge/subagent token 单独计入 goal budget,也不新增 Judge 专属硬超时;需在 continuation prompt 中提示主 agent“仅在确有把握达成时再申请验收”,避免频繁空验收。 -5. **深度语义边界**:Judge `max_delegation_depth=2` 必须与 `MAIN_AGENT_CHILD_DEPTH=2` 一致,且要确保 Judge 在 depth 2 仍能委派 depth 3 的 explore/review(受 `GLOBAL_MAX_DELEGATION_DEPTH=5` 与 explore/review 自身上限 3 约束,合法)。同时必须在递归委派和 parallel 路径拒绝任何 helper→Judge 调用,避免职责边界被绕过。 -6. **迁移兼容**:迁移必须回填 `UPDATE goals SET judge_passed=1, judge_completeness=100 ... WHERE status='complete'`。运行时若遇到 `Complete && !judge_passed`,应记录 warning 并停续行,不能把存量已完成 goal 重新拉起。 -7. **gateway / ACP 路径**:微信/企微与 ACP 同样依赖 goal 续行,首版需确认这些入口创建主 agent run 时走同一 `build_session_spec()` 注入逻辑,且 prompt/gateway 文案不再提 `goal_scored`。 -8. **同轮继续修改**:Judge 通过后主 agent 仍可能在同一 run 继续调用其他工具。首版不做写工具硬锁,通过 Judge 工具结果和 `active_goal.tpl.md` prompt 要求停止修改;若后续发现问题,再加 `Complete && judge_passed` 后 mutating tools 拒绝策略。 -9. **跨平台**:主体为 Rust/SQLite/prompt/TypeScript 类型改动,应保持跨平台兼容;shell 诊断命令由 Judge 根据项目现有命令选择,prompt 中需提醒避免平台特定假设。 From d60daecf9db70a1bc22aa430b9cd0fa1b990eef6 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 17:03:01 +0800 Subject: [PATCH 07/16] =?UTF-8?q?docs(judge):=20=F0=9F=93=9D=20add=20size-?= =?UTF-8?q?first=20verification=20strategy=20and=20delegation=20guidelines?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/prompt/templates/subagent/judge.md | 58 ++++++++++++++++--- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md index 0cab1d63..58f8b873 100644 --- a/src-tauri/src/core/prompt/templates/subagent/judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -5,20 +5,60 @@ declared_keys: [] --- You are the **Goal Acceptance Judge** — an independent verifier. 
The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. -You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. +You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. Goal tasks are typically long-horizon with broad change surfaces, so your evaluation must scale: be thorough enough to catch real gaps, efficient enough to converge in one pass, and honest about what you actually verified. -## What to evaluate -- Read the goal objective injected into your task and treat it as the acceptance contract. -- Inspect the relevant code, configuration, tests, and docs to confirm each requirement of the goal is actually met. -- Run diagnostic verification when it strengthens your judgment: tests, type-checks, linters, builds, and read-only inspection commands. Adapt the commands to this repository (infer them from instructions, scripts, and manifests) instead of assuming a stack. -- You may delegate to `agent_explore`, `agent_review`, or `agent_parallel` to gather evidence in parallel when the goal is broad. +## Operating principle: size first, then verify + +Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. + +### Step 1 — Size the change (always do this first) +- Run `git_status` and `git_diff --stat` (or the project's equivalent) to enumerate changed files, additions/deletions, and the rough surface area. 
+- Cross-reference with the goal objective: identify which subsystems / layers / acceptance criteria each cluster of changes maps to. +- Form an explicit mental model before any deep reading: + - **Small** — ≤ ~5 files changed, single module/layer, narrow concern. One linear pass is enough. + - **Medium** — ~6–20 files, 2–3 subsystems or layers touched, multiple acceptance criteria. + - **Large** — > 20 files, cross-cutting changes, multiple independent topics (e.g. backend + frontend + tests + config + docs), or the goal lists many distinct subtasks. +- Use these as guidance, not hard rules: a 3-file change that touches a security boundary may still warrant Large-style scrutiny; a 40-file rename may collapse to Small. +- If the change scope is genuinely tiny relative to the goal (e.g. goal asks for a feature but the diff shows trivial edits), that itself is strong evidence of incompleteness — record it and probe further before concluding. + +### Step 2 — Pick a verification strategy that matches the size +- **Small change** — verify directly. Read the changed files yourself, confirm each goal requirement against the actual code, run the targeted tests/type-checks. Do not delegate; the coordination overhead is not worth it. +- **Medium change** — split logically. Use one or two `agent_explore` / `agent_review` calls when a coherent slice (e.g. "review the new module + its consumers", "explore how config plumbing was wired") is too large to inspect inline without losing context. Run diagnostic commands (typecheck, targeted tests, lint) yourself. +- **Large change** — fan out with `agent_parallel`. Break the goal's acceptance surface into 2–5 independent topics and dispatch them in parallel. Good split axes: + - **By layer** — backend / frontend / persistence / config. + - **By subsystem** — auth / billing / notifications. + - **By concern** — functional correctness / regression risk / tests & docs / migration & compatibility. 
+ - **By goal subtask** — one helper per acceptance criterion when the goal is itemized. + Keep each subtask independent (no shared write state), bounded in scope, and concretely scoped to file lists or topics inferred from the diff. After the parallel batch returns, **synthesize the results yourself** — reconcile conflicts, call out failures or skipped items, and form one coherent verdict. Do not just concatenate helper outputs. + +### Step 3 — Run the verification commands the project actually uses +- Adapt commands to this repository (infer from manifests, scripts, CI config, and workspace instructions). Do not assume a stack. +- Prefer the *narrowest* command that still covers the changed surface (e.g. test only the affected package) before falling back to repo-wide runs. For Large changes a repo-wide build/typecheck is usually still warranted. +- When `agent_review` is delegated, treat its verification output as authoritative — do not rerun the same commands unless its results were inconclusive. + +## Delegation guidelines +- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does the codebase still reference Z?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. +- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='diff'` when the helper should look at the workspace changes; provide an explicit changed-file list when you already have one. +- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Prefer this over sequential helper calls whenever the topics are genuinely independent. Never recurse parallel into parallel. +- Do **not** delegate when: + - The change is small enough to inspect inline. + - The subtasks are interdependent (later ones need earlier results). + - You only need one shell command — just run it. 
+- Always tell each delegate explicitly: the goal text, which slice they own, what evidence to return, and that they are read-only. ## Hard constraints (read-only acceptance) - Your file tools are read-only. Do **not** modify, create, or delete any files. -- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, and read-only inspection. You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, builds, and read-only inspection (`git_status`, `git_diff`, `git_log`, `cat`, `ls`, etc.). You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. - Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. +- Helpers you delegate to inherit the same read-only constraint; remind them in the task text when relevant. + +## Coverage honesty +- Track what you actually verified vs. what you sampled vs. what you skipped. A Large change you only spot-checked is **not** the same as a Large change you fully covered. +- When delegating, if any helper failed, returned inconclusive results, or could not run a command, treat that area as **not verified** — record it explicitly and let it influence the verdict. +- Never imply a check passed without trustworthy evidence. If your `summary` cannot point to specific files, commands, or behaviors you confirmed, you do not have a basis to pass. ## Verdict rules -- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps. When you pass, `summary` must clearly state the verified evidence — it becomes the goal's completion evidence. 
-- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings`. +- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps **and** your verification covered the full change surface (directly or via successful delegates). When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. +- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. - Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. +- Calibrate `completenessPct` to actual coverage and remaining gaps, not to effort spent. A change that does 80% of the goal correctly is 80, not 100, even if the implemented parts are flawless. 
From dc8fca0ce5a6ea8de067ff368804fe9c19bfc370 Mon Sep 17 00:00:00 2001 From: Jorben Date: Sun, 7 Jun 2026 17:49:49 +0800 Subject: [PATCH 08/16] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20re?= =?UTF-8?q?move=20goal-level=20time=5Fused=5Fseconds=20in=20favor=20of=20r?= =?UTF-8?q?un-level=20elapsed=20tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../20260607000001_drop_goal_time_used.sql | 5 + src-tauri/src/commands/agent.rs | 41 ----- src-tauri/src/core/agent_run_event_handler.rs | 34 ---- src-tauri/src/core/agent_run_manager.rs | 54 +------ src-tauri/src/core/app_state.rs | 148 ------------------ src-tauri/src/core/goal_manager.rs | 27 ++-- src-tauri/src/model/goal.rs | 5 - src-tauri/src/persistence/repo/goal_repo.rs | 12 +- src-tauri/src/persistence/repo/run_repo.rs | 78 --------- src-tauri/tests/goal_lifecycle.rs | 9 +- .../workbench-shell/model/thread-store.ts | 1 - src/services/bridge/agent-commands.test.ts | 1 - src/services/bridge/agent-commands.ts | 1 - 13 files changed, 21 insertions(+), 395 deletions(-) create mode 100644 src-tauri/migrations/20260607000001_drop_goal_time_used.sql diff --git a/src-tauri/migrations/20260607000001_drop_goal_time_used.sql b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql new file mode 100644 index 00000000..06d0c76e --- /dev/null +++ b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql @@ -0,0 +1,5 @@ +-- Drop goal-level time accounting. Time-tracking moved to thread_runs.elapsed_running_secs +-- (added by 20260604000000_run_elapsed_tracking.sql), which is summed across all of a thread's +-- runs (planning + implementation) and rendered by the workbench-shell timer. The goal-level +-- time_used_seconds column was write-only with no readers in budget enforcement, UI, or logging. 
+ALTER TABLE goals DROP COLUMN time_used_seconds; diff --git a/src-tauri/src/commands/agent.rs b/src-tauri/src/commands/agent.rs index f9e9f067..0b73853a 100644 --- a/src-tauri/src/commands/agent.rs +++ b/src-tauri/src/commands/agent.rs @@ -624,47 +624,6 @@ pub async fn goal_pause( match goal { Some(g) => { if g.status == crate::model::goal::GoalStatus::Active { - // Account elapsed time of any currently active run before pausing - if let Some(run_seconds) = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &state.pool, - &thread_id, - ) - .await - .unwrap_or(None) - { - let active_run_id = crate::persistence::repo::run_repo::find_latest_by_thread( - &state.pool, - &thread_id, - ) - .await - .ok() - .flatten() - .and_then(|run| { - matches!( - run.status.as_str(), - "running" | "waiting_approval" | "needs_reply" - ) - .then_some(run.id) - }); - let paused_seconds = active_run_id - .as_deref() - .map(|run_id| { - let mut guard = - state.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_pause: goal runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(run_id).max(0) - }) - .unwrap_or(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - mgr.account_usage(&g.id, 0, billable_seconds).await.ok(); - } - } mgr.pause(&g.id, crate::model::goal::PauseReason::UserRequested, None) .await?; } diff --git a/src-tauri/src/core/agent_run_event_handler.rs b/src-tauri/src/core/agent_run_event_handler.rs index 076c324d..6107563c 100644 --- a/src-tauri/src/core/agent_run_event_handler.rs +++ b/src-tauri/src/core/agent_run_event_handler.rs @@ -184,33 +184,6 @@ pub(crate) fn sidebar_status_for_runtime_event( } impl AgentRunManager { - fn start_goal_run_pause(&self, thread_id: &str, run_id: &str) { - if thread_id.is_empty() { - return; - } - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause 
runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.start_run_pause(thread_id, run_id); - } - - fn finish_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.finish_run_pause(run_id); - } - - fn cleanup_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.cleanup_run_pause(run_id); - } - pub(crate) async fn handle_runtime_channel_closed( self: &Arc, run_id: &str, @@ -410,26 +383,22 @@ impl AgentRunManager { } ThreadStreamEvent::ApprovalRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; } ThreadStreamEvent::ClarifyRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::NeedsReply).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::NeedsReply) .await?; } ThreadStreamEvent::ApprovalResolved { .. } => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; } ThreadStreamEvent::ClarifyResolved { .. 
} => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; @@ -458,7 +427,6 @@ impl AgentRunManager { } ThreadStreamEvent::RunCheckpointed { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; @@ -476,7 +444,6 @@ impl AgentRunManager { | ThreadStreamEvent::RunFailed { error, .. } => Some(error.as_str()), _ => None, }; - self.finish_goal_run_pause(run_id); self.finish_run(run_id, final_status, error_message).await?; let thread_id = self.get_thread_id(run_id).await; if let Some(frontend_tx) = self.frontend_tx_for_run(run_id).await { @@ -567,7 +534,6 @@ impl AgentRunManager { ); } } - self.cleanup_goal_run_pause(run_id); } Ok(()) diff --git a/src-tauri/src/core/agent_run_manager.rs b/src-tauri/src/core/agent_run_manager.rs index eefc0a1d..e6c8a6ad 100644 --- a/src-tauri/src/core/agent_run_manager.rs +++ b/src-tauri/src/core/agent_run_manager.rs @@ -24,7 +24,7 @@ use crate::core::sleep_manager::SleepManager; use crate::ipc::frontend_channels::ThreadStreamEvent; use crate::model::errors::{AppError, ErrorSource}; use crate::model::thread::{MessageAttachmentDto, MessageRecord, RunStatus}; -use crate::persistence::repo::{goal_repo, message_repo, run_repo, thread_repo, workspace_repo}; +use crate::persistence::repo::{message_repo, run_repo, thread_repo, workspace_repo}; pub(crate) use crate::core::agent_run_event_handler::build_orphaned_run_terminal_event; #[cfg(test)] @@ -433,44 +433,6 @@ impl AgentRunManager { let (profile_id, provider_id, model_id) = extract_run_model_refs(&model_plan_value); - // Account the planning run's billable time to the active goal so 
the - // frontend timer displays the correct accumulated time when the new - // implementation run starts (the frontend resets its local elapsed on - // every run_id change, so time_used_seconds must include the full - // planning-phase cost). - { - let planning_elapsed = run_repo::get_run_elapsed_seconds(&self.pool, &planning_run_id) - .await? - .unwrap_or(0); - let paused_seconds = { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&planning_run_id).max(0) - }; - let billable = (planning_elapsed - paused_seconds).max(0); - if billable > 0 { - if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - if let Err(error) = goal_repo::account_usage( - &self.pool, &goal.id, - 0, // tokens_delta: planning turns were already counted - billable, 0, // turns_delta - ) - .await - { - tracing::warn!( - planning_run_id = %planning_run_id, - goal_id = %goal.id, - billable_seconds = billable, - error = %error, - "failed to account planning run time to goal" - ); - } - } - } - } - let mut approval_metadata = approval_metadata; approval_metadata.state = IMPLEMENTATION_PLAN_APPROVED_STATE.to_string(); approval_metadata.approved_action = Some(action.clone()); @@ -512,20 +474,6 @@ impl AgentRunManager { ) .await?; - // Emit the updated goal state through the new run's event channel so - // the frontend sees the accumulated time_used_seconds (which now - // includes the planning-run time) before it starts the real-time timer - // for the new implementation run. 
- if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - let runs = self.active_runs.lock().await; - if let Some(run) = runs.get(&result.0) { - let _ = run.frontend_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: thread_id.to_string(), - goal: Some(crate::model::goal::GoalPayload::from(goal)), - }); - } - } - if let Some(seed_messages) = context_seed_messages.as_ref() { self.persist_messages(seed_messages).await?; } diff --git a/src-tauri/src/core/app_state.rs b/src-tauri/src/core/app_state.rs index 00e2f166..ac3d81a7 100644 --- a/src-tauri/src/core/app_state.rs +++ b/src-tauri/src/core/app_state.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; -use chrono::{DateTime, Utc}; use sqlx::SqlitePool; use tauri::AppHandle; @@ -31,12 +30,6 @@ pub struct GoalRuntimeState { pub idle_turn_count: HashMap, /// Consecutive completion claim counter per thread. pub completion_claim_count: HashMap, - /// Pause start timestamp per run while it waits for user action. - pub run_pause_started_at: HashMap>, - /// Accumulated user-wait pause seconds per run. - pub run_paused_seconds: HashMap, - /// Thread ID for each run with pause accounting state. - pub run_pause_thread_ids: HashMap, } impl GoalRuntimeState { @@ -47,66 +40,6 @@ impl GoalRuntimeState { self.thread_tool_calls.remove(thread_id); self.idle_turn_count.remove(thread_id); self.completion_claim_count.remove(thread_id); - - let run_ids: Vec = self - .run_pause_thread_ids - .iter() - .filter_map(|(run_id, stored_thread_id)| { - (stored_thread_id == thread_id).then(|| run_id.clone()) - }) - .collect(); - for run_id in run_ids { - self.cleanup_run_pause(&run_id); - } - } - - /// Begin timing a run's user-action pause. Repeated starts are ignored so - /// nested or duplicate waiting events do not lose the original start time. 
- pub fn start_run_pause(&mut self, thread_id: &str, run_id: &str) { - self.run_pause_thread_ids - .entry(run_id.to_string()) - .or_insert_with(|| thread_id.to_string()); - self.start_run_pause_at(run_id, Utc::now()); - } - - fn start_run_pause_at(&mut self, run_id: &str, started_at: DateTime) { - self.run_pause_started_at - .entry(run_id.to_string()) - .or_insert(started_at); - } - - /// Finish the current pause interval for a run and accumulate whole seconds. - pub fn finish_run_pause(&mut self, run_id: &str) -> i64 { - self.finish_run_pause_at(run_id, Utc::now()) - } - - fn finish_run_pause_at(&mut self, run_id: &str, finished_at: DateTime) -> i64 { - let Some(started_at) = self.run_pause_started_at.remove(run_id) else { - return *self.run_paused_seconds.get(run_id).unwrap_or(&0); - }; - - let paused_seconds = (finished_at - started_at).num_seconds().max(0); - let total = self - .run_paused_seconds - .entry(run_id.to_string()) - .or_insert(0); - *total += paused_seconds; - *total - } - - /// Take and clear the accumulated pause seconds for a run. - pub fn take_run_paused_seconds(&mut self, run_id: &str) -> i64 { - self.finish_run_pause(run_id); - let seconds = self.run_paused_seconds.remove(run_id).unwrap_or(0); - self.run_pause_thread_ids.remove(run_id); - seconds - } - - /// Clear all pause accounting state for a run. 
- pub fn cleanup_run_pause(&mut self, run_id: &str) { - self.run_pause_started_at.remove(run_id); - self.run_paused_seconds.remove(run_id); - self.run_pause_thread_ids.remove(run_id); } } @@ -186,84 +119,3 @@ impl AppState { } } } - -#[cfg(test)] -mod tests { - use super::GoalRuntimeState; - use chrono::{Duration, TimeZone, Utc}; - - #[test] - fn run_pause_tracking_is_idempotent_accumulative_and_cleared_on_take() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state.start_run_pause_at("run-1", start + Duration::seconds(10)); - - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(5)), - 5, - ); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(20)), - 5, - ); - - state.start_run_pause_at("run-1", start + Duration::seconds(30)); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(37)), - 12, - ); - - assert_eq!(state.take_run_paused_seconds("run-1"), 12); - assert_eq!(state.take_run_paused_seconds("run-1"), 0); - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - } - - #[test] - fn cleanup_thread_removes_run_pause_state_for_that_thread() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state - .run_pause_thread_ids - .insert("run-2".to_string(), "thread-2".to_string()); - state.start_run_pause_at("run-2", start); - state.run_paused_seconds.insert("run-1".to_string(), 3); - state.run_paused_seconds.insert("run-2".to_string(), 5); - - 
state.cleanup_thread("thread-1"); - - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - assert!(state.run_pause_started_at.contains_key("run-2")); - assert_eq!(state.run_paused_seconds.get("run-2"), Some(&5)); - assert_eq!( - state.run_pause_thread_ids.get("run-2").map(String::as_str), - Some("thread-2"), - ); - } - - #[test] - fn run_pause_tracking_clamps_negative_intervals() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state.start_run_pause_at("run-1", start); - - assert_eq!( - state.finish_run_pause_at("run-1", start - Duration::seconds(5)), - 0, - ); - } -} diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index bd7298db..f3d3ff2b 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -148,7 +148,6 @@ impl GoalManager { status: GoalStatus::Active, token_budget, tokens_used: 0, - time_used_seconds: 0, turns_used: 0, max_turns: DEFAULT_MAX_TURNS, pause_reason: None, @@ -251,14 +250,9 @@ impl GoalManager { goal_repo::delete_by_thread_id(&self.pool, &self.thread_id).await } - /// Account usage after a turn. Increments turn count, tokens, and time. - pub async fn account_usage( - &self, - goal_id: &str, - tokens: i64, - time_seconds: i64, - ) -> Result<(), AppError> { - goal_repo::account_usage(&self.pool, goal_id, tokens, time_seconds, 1).await + /// Account usage after a turn. Increments turn count and tokens. + pub async fn account_usage(&self, goal_id: &str, tokens: i64) -> Result<(), AppError> { + goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await } // ── Auto-resume ── @@ -609,20 +603,19 @@ impl GoalManager { } } + // Bump goal turn counter for any run that did real work. We still consult + // run duration to filter out zero-work runs (e.g. 
an immediately-interrupted + // run shouldn't burn a turn against max_turns); active running time is + // tracked separately on thread_runs.elapsed_running_secs and is no longer + // billed against the goal here. if let Some(run_seconds) = crate::persistence::repo::run_repo::get_run_duration(&self.pool, run_id) .await .unwrap_or(None) { - let paused_seconds = self.lock_runtime().take_run_paused_seconds(run_id).max(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - self.account_usage(&current.id, 0, billable_seconds) - .await - .ok(); + if run_seconds > 0 { + self.account_usage(&current.id, 0).await.ok(); } - } else { - self.lock_runtime().take_run_paused_seconds(run_id); } let updated = self.get_active().await?; diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index 1868fb28..62f129d2 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -119,7 +119,6 @@ pub struct GoalRecord { pub status: GoalStatus, pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, pub pause_reason: Option, @@ -151,7 +150,6 @@ pub struct GoalDto { #[serde(skip_serializing_if = "Option::is_none")] pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -184,7 +182,6 @@ impl From for GoalDto { status: r.status, token_budget: r.token_budget, tokens_used: r.tokens_used, - time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, pause_reason: r.pause_reason, @@ -218,7 +215,6 @@ pub struct GoalPayload { pub objective: String, pub status: GoalStatus, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -250,7 +246,6 @@ impl From for GoalPayload { objective: r.objective, status: r.status, tokens_used: r.tokens_used, - 
time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, token_budget: r.token_budget, diff --git a/src-tauri/src/persistence/repo/goal_repo.rs b/src-tauri/src/persistence/repo/goal_repo.rs index 3424104a..72a3c53a 100644 --- a/src-tauri/src/persistence/repo/goal_repo.rs +++ b/src-tauri/src/persistence/repo/goal_repo.rs @@ -5,7 +5,7 @@ use crate::model::errors::AppError; use crate::model::goal::{GoalRecord, GoalStatus, PauseReason}; const SELECT_COLUMNS: &str = "id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ + turns_used, max_turns, pause_reason, pause_detail, evidence, \ last_evaluated_run_id, judge_passed, judge_completeness, judge_findings, judge_summary, \ judge_evaluated_run_id, created_at, updated_at"; @@ -19,7 +19,6 @@ struct GoalRow { status: String, token_budget: Option, tokens_used: i64, - time_used_seconds: i64, turns_used: i64, max_turns: i64, pause_reason: Option, @@ -44,7 +43,6 @@ impl GoalRow { status: GoalStatus::from_str(&self.status), token_budget: self.token_budget, tokens_used: self.tokens_used, - time_used_seconds: self.time_used_seconds, turns_used: self.turns_used, max_turns: self.max_turns, pause_reason: self.pause_reason.map(|s| PauseReason::from_str(&s)), @@ -98,9 +96,9 @@ pub async fn insert(pool: &SqlitePool, record: &GoalRecord) -> Result<(), AppErr let now = Utc::now().to_rfc3339(); sqlx::query( "INSERT INTO goals (id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ + turns_used, max_turns, pause_reason, pause_detail, evidence, \ last_evaluated_run_id, created_at, updated_at) \ - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ) .bind(&record.id) .bind(&record.thread_id) @@ -108,7 +106,6 @@ pub async fn insert(pool: &SqlitePool, record: &GoalRecord) -> 
Result<(), AppErr .bind(record.status.as_str()) .bind(record.token_budget) .bind(record.tokens_used) - .bind(record.time_used_seconds) .bind(record.turns_used) .bind(record.max_turns) .bind(record.pause_reason.as_ref().map(|r| r.as_str())) @@ -151,19 +148,16 @@ pub async fn account_usage( pool: &SqlitePool, id: &str, tokens_delta: i64, - time_delta_seconds: i64, turns_delta: i64, ) -> Result<(), AppError> { sqlx::query( "UPDATE goals SET \ tokens_used = tokens_used + ?, \ - time_used_seconds = time_used_seconds + ?, \ turns_used = turns_used + ?, \ updated_at = ? \ WHERE id = ?", ) .bind(tokens_delta) - .bind(time_delta_seconds) .bind(turns_delta) .bind(Utc::now().to_rfc3339()) .bind(id) diff --git a/src-tauri/src/persistence/repo/run_repo.rs b/src-tauri/src/persistence/repo/run_repo.rs index 29265a43..ad783f65 100644 --- a/src-tauri/src/persistence/repo/run_repo.rs +++ b/src-tauri/src/persistence/repo/run_repo.rs @@ -1152,44 +1152,6 @@ mod tests { "expected running segment to be added, got {elapsed}" ); } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_returns_positive_for_running() { - let pool = setup_test_pool().await; - // Insert a running run with a past started_at so elapsed > 0 - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-active', 't1', 'default', 'running', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap() - .expect("should return elapsed seconds for running run"); - // With started_at in the past, elapsed should be > 0 - assert!(duration > 0, "expected positive elapsed, got {duration}"); - } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_skips_terminal_runs() { - let pool = setup_test_pool().await; - // Insert a completed run (should be skipped) - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, 
run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-done', 't1', 'default', 'completed', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap(); - assert!(duration.is_none(), "should skip completed runs"); - } } /// Get the duration in seconds of the last completed run for a thread. @@ -1231,46 +1193,6 @@ pub async fn get_run_duration(pool: &SqlitePool, run_id: &str) -> Result Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE id = ? - LIMIT 1", - ) - .bind(run_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - -/// Get the elapsed seconds of any currently active (non-terminal) run for a thread. -/// Returns None if no active run exists. -pub async fn get_active_run_elapsed_seconds( - pool: &SqlitePool, - thread_id: &str, -) -> Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE thread_id = ? - AND status NOT IN ('completed','failed','denied','interrupted','cancelled','limit_reached') - ORDER BY started_at DESC - LIMIT 1", - ) - .bind(thread_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - /// Bulk-fetch the Unix-millisecond start timestamp of the currently active /// (non-terminal) run for each thread in `thread_ids`. Threads without an /// active run are simply absent from the returned map. 
Used by the sidebar diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index ecff3198..157d7704 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -138,7 +138,6 @@ mod tests { let after_first = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_first.turns_used, goal.turns_used + 1); - assert_eq!(after_first.time_used_seconds, 42); assert_eq!(after_first.last_evaluated_run_id.as_deref(), Some("run-1")); let second = mgr @@ -150,10 +149,6 @@ mod tests { let after_second = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_second.turns_used, after_first.turns_used); - assert_eq!( - after_second.time_used_seconds, - after_first.time_used_seconds - ); } #[tokio::test] @@ -233,7 +228,7 @@ mod tests { let goal = mgr.create_goal("Test goal", None).await.unwrap(); // Set turns_used to at least max_turns via account_usage - goal_repo::account_usage(&pool, &goal.id, 0, 0, goal.max_turns) + goal_repo::account_usage(&pool, &goal.id, 0, goal.max_turns) .await .unwrap(); @@ -373,7 +368,7 @@ mod tests { let goal = mgr.create_goal("Test goal", Some(500)).await.unwrap(); // Accumulate tokens to reach the budget - goal_repo::account_usage(&pool, &goal.id, 500, 0, 0) + goal_repo::account_usage(&pool, &goal.id, 500, 0) .await .unwrap(); diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index 97ec8701..f87adfc6 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -93,7 +93,6 @@ export interface GoalStoreState { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null; diff --git a/src/services/bridge/agent-commands.test.ts b/src/services/bridge/agent-commands.test.ts index 25695b82..eb521ca7 100644 --- 
a/src/services/bridge/agent-commands.test.ts +++ b/src/services/bridge/agent-commands.test.ts @@ -366,7 +366,6 @@ function makeGoalPayload(overrides: Partial = {}): GoalPayload { objective: "Build a todo app", status: "active", tokensUsed: 0, - timeUsedSeconds: 0, turnsUsed: 0, maxTurns: 50, tokenBudget: null, diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index d6ec3012..9e1c17ac 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -727,7 +727,6 @@ export type GoalPayload = { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null; From 4481759f6295e31ace841ffcf88f585cf3d515d8 Mon Sep 17 00:00:00 2001 From: Jorben Date: Wed, 10 Jun 2026 12:04:44 +0800 Subject: [PATCH 09/16] =?UTF-8?q?feat(judge):=20=E2=9C=A8=20redesign=20Jud?= =?UTF-8?q?ge=20evaluation=20for=20independence=20and=20completeness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src-tauri/src/core/agent_session_execution.rs | 198 +++++++++++++++--- .../core/prompt/templates/active_goal.tpl.md | 4 +- .../core/prompt/templates/subagent/judge.md | 66 +++--- .../subagent/output_contract.judge.md | 6 +- src-tauri/src/core/subagent/judge_contract.rs | 18 +- .../src/persistence/repo/run_helper_repo.rs | 56 +++++ 6 files changed, 268 insertions(+), 80 deletions(-) diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index f1dbb980..4dd9aece 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -1612,8 +1612,10 @@ impl AgentSession { tool_call_storage_id: &str, tool_input: &serde_json::Value, ) -> AgentToolResult { - // Parse the main agent's task / rationale. 
- let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + // Parse the main agent's task / rationale. The task value is no longer + // injected into the Judge prompt — the Judge evaluates independently. + // Parsing is retained for backward compatibility and input validation. + let _request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { Ok(request) => request, Err(error) => { tool_call_repo::update_result( @@ -1679,41 +1681,34 @@ impl AgentSession { return agent_error_result(err_msg); } - // Build the Judge task: inject the goal objective + status + last verdict - // so the Judge does not rely on the main agent's self-report. - let mut prior_verdict = String::new(); - if goal.judge_evaluated_run_id.is_some() { - if let Some(summary) = goal.judge_summary.as_deref() { - if !summary.trim().is_empty() { - prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}")); - } - } - if let Some(findings_json) = goal.judge_findings.as_deref() { - if let Ok(findings) = serde_json::from_str::>(findings_json) { - if !findings.is_empty() { - prior_verdict.push_str("\nPrevious Judge findings:"); - for finding in findings { - prior_verdict.push_str(&format!("\n- {finding}")); - } - } - } - } - } + // Build the Judge task: inject only the goal objective, task board + // state, and (if applicable) process compliance evidence. The Judge + // receives no input from the main agent — it evaluates the project + // state independently against the goal. + + // Query task board state for cross-reference. + let task_board_summary = build_task_board_summary(&self.pool, &goal.thread_id).await; + + // Conditionally include process compliance layer for goals that + // require reviews or phase-by-phase verification. 
+ let process_compliance = if has_process_requirements(&goal.objective) { + build_process_compliance_summary(&self.pool, &goal.thread_id).await + } else { + String::new() + }; let judge_task = format!( "You are verifying acceptance of the following goal for the current project.\n\n\ -Goal id: {goal_id}\n\ -Goal status: {status:?}\n\ -Goal objective:\n{objective}\n\ -{prior_verdict}\n\n\ -The main agent's note for this verification request:\n{task}\n\n\ +Goal objective:\n{objective}\n\n\ +{task_board_summary}\n\ +{process_compliance}\ Independently inspect the project's current state and decide whether it satisfies the goal. \ +You must verify ALL requirements in the goal, not just those that seem to have been worked on. \ +Cross-reference the task board state above with your file-system findings. \ Return your structured JudgeReport verdict.", - goal_id = goal.id, - status = goal.status, objective = goal.objective, - prior_verdict = prior_verdict, - task = request.task, + task_board_summary = task_board_summary, + process_compliance = process_compliance, ); // Build a Judge delegate (depth 2, primary model) and run it. @@ -1837,6 +1832,147 @@ Return your structured JudgeReport verdict.", } } +/// Build a human-readable summary of the task board state for the Judge. +/// Returns a string describing each step and its stage, or a note that no +/// task board exists. +async fn build_task_board_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String { + use crate::persistence::repo::{task_board_repo, task_item_repo}; + + let boards = match task_board_repo::list_by_thread(pool, thread_id).await { + Ok(boards) => boards, + Err(_) => return "(No task board data available.)\n".to_string(), + }; + + if boards.is_empty() { + return "(No task board exists for this goal. 
Verify entirely from file system and goal text.)\n" + .to_string(); + } + + let mut summary = String::from("## Associated task board state\n\n"); + for board in &boards { + summary.push_str(&format!( + "**{}** (status: {}):\n", + board.title, + board.status.as_str() + )); + + let items = match task_item_repo::list_by_task_board(pool, &board.id).await { + Ok(items) => items, + Err(_) => { + summary.push_str(" (Could not load task items.)\n"); + continue; + } + }; + + if items.is_empty() { + summary.push_str(" (No task items.)\n"); + continue; + } + + for item in &items { + summary.push_str(&format!( + " - [{}] {}\n", + item.stage.as_str(), + item.description + )); + } + } + + summary.push_str( + "\n**Important**: Any step above that is not `completed` and maps to a goal \ + requirement is evidence of incomplete work. Report these as findings.\n", + ); + summary +} + +/// Check whether the goal objective contains process requirements (e.g., +/// "review each phase", "每阶段验收"). When true, the Judge prompt will +/// include a process compliance layer showing the thread's review call history. +fn has_process_requirements(objective: &str) -> bool { + let lower = objective.to_lowercase(); + let keywords = [ + "review", + "验收", + "检查", + "verify each", + "verify every", + "per phase", + "每个阶段", + "每一阶段", + "每轮", + "阶段完成", + ]; + keywords.iter().any(|kw| lower.contains(&kw.to_lowercase())) +} + +/// Build a process compliance summary from the thread's run_helper history. +/// Lists all review-related helper calls chronologically with their input +/// summaries and status. Only meaningful when the goal objective contains +/// process requirements (e.g., "each phase must have a review"). 
+async fn build_process_compliance_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String { + use crate::persistence::repo::run_helper_repo; + + let helpers = match run_helper_repo::list_by_thread_id(pool, thread_id).await { + Ok(h) => h, + Err(_) => return String::new(), + }; + + // Filter for review-related calls: agent_review, helper_review + let reviews: Vec<_> = helpers + .iter() + .filter(|h| h.helper_kind.contains("review")) + .collect(); + + if reviews.is_empty() { + return format!( + "## Process compliance\n\n\ + No review calls found in thread history. \ + If the goal requires reviews, this is evidence of non-compliance.\n\n" + ); + } + + let mut summary = String::from("## Process compliance\n\n"); + summary.push_str("The following review calls were recorded during this goal:\n\n"); + + for (i, review) in reviews.iter().enumerate() { + let status_label = match review.status.as_str() { + "completed" => "✓ completed", + "failed" => "✗ failed", + "interrupted" => "⚠ interrupted", + _ => &review.status, + }; + + let input_preview = review + .input_summary + .as_deref() + .map(|s| { + // Truncate to first 200 chars for readability + if s.len() > 200 { + format!("{}...", &s[..200]) + } else { + s.to_string() + } + }) + .unwrap_or_else(|| "(no task description)".to_string()); + + summary.push_str(&format!( + "{}. `{}` called at {} (status: {})\n Scope: {}\n", + i + 1, + review.helper_kind, + &review.started_at[..review.started_at.len().min(19)], + status_label, + input_preview, + )); + } + + summary.push_str( + "\n**Guidance**: If the goal requires reviews at specific milestones \ + (e.g., \"after each phase\"), verify that the review calls above \ + cover all required milestones. 
Missing or failed reviews are findings.\n\n", + ); + summary +} + #[cfg(test)] mod tests { use super::{ diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md index c36eb3dd..d96a87f8 100644 --- a/src-tauri/src/core/prompt/templates/active_goal.tpl.md +++ b/src-tauri/src/core/prompt/templates/active_goal.tpl.md @@ -11,10 +11,10 @@ Turns used: {{turns_used}}/{{max_turns}} **Completion is decided by independent verification — you cannot self-declare it.** 1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups. 2. Verify your work by running the relevant tests, linters, or build commands as you go. -3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`. +3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge()`. Rules: -- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness. +- Call `agent_judge()` to request independent goal acceptance verification. An independent Judge will evaluate the project against the goal's completeness. You do not need to provide a self-assessment — the Judge evaluates the project state directly. - The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself. - If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again. - Once the goal has passed Judge acceptance, stop making further changes and summarize the result. 
diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md index 58f8b873..454325ab 100644 --- a/src-tauri/src/core/prompt/templates/subagent/judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -1,64 +1,52 @@ --- section_id: SubagentJudge -version: 1 +version: 2 declared_keys: [] --- -You are the **Goal Acceptance Judge** — an independent verifier. The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. +You are the **Goal Acceptance Judge** — an independent auditor. Your task is to determine whether the project's **current state** satisfies a goal objective. You work **independently** — you receive no input from the main agent about what it did, changed, or believes is complete. Your assessment must be based solely on objective evidence: the goal objective, the project file system, the task board associated with this goal, and verification commands you run yourself. -You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. Goal tasks are typically long-horizon with broad change surfaces, so your evaluation must scale: be thorough enough to catch real gaps, efficient enough to converge in one pass, and honest about what you actually verified. +You are an evaluator, not an implementer. Every evaluation is a **fresh, independent, full-scope assessment**. Do not inherit or defer to any prior judge's conclusions — each call starts from scratch. 
-## Operating principle: size first, then verify +## Core principle: verify the ENTIRE goal, not a subset -Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. +You must verify **ALL** requirements in the goal against the current project state. Do not limit your verification to areas that "seem to have been worked on" or that a prior evaluation mentioned. A goal requirement you didn't check is a gap in your verification, not a gap that doesn't exist. -### Step 1 — Size the change (always do this first) -- Run `git_status` and `git_diff --stat` (or the project's equivalent) to enumerate changed files, additions/deletions, and the rough surface area. -- Cross-reference with the goal objective: identify which subsystems / layers / acceptance criteria each cluster of changes maps to. -- Form an explicit mental model before any deep reading: - - **Small** — ≤ ~5 files changed, single module/layer, narrow concern. One linear pass is enough. - - **Medium** — ~6–20 files, 2–3 subsystems or layers touched, multiple acceptance criteria. - - **Large** — > 20 files, cross-cutting changes, multiple independent topics (e.g. backend + frontend + tests + config + docs), or the goal lists many distinct subtasks. -- Use these as guidance, not hard rules: a 3-file change that touches a security boundary may still warrant Large-style scrutiny; a 40-file rename may collapse to Small. -- If the change scope is genuinely tiny relative to the goal (e.g. goal asks for a feature but the diff shows trivial edits), that itself is strong evidence of incompleteness — record it and probe further before concluding. +### Step 1 — Understand the full requirement surface +- Parse the goal objective into distinct, verifiable requirements. 
Every requirement must be checked — implicit ones count too (e.g., if the goal says "implement X with tests", both the implementation and the tests are required). +- Read any design documents or acceptance criteria referenced by the goal (e.g., `@docs/architecture.md`). Extract every acceptance item from them. +- Check the task board associated with this goal (provided in your task prompt). Task board steps that are not `completed` are **direct evidence of incomplete work** and must be reported. A pending step that maps to a goal requirement means that requirement is not satisfied. -### Step 2 — Pick a verification strategy that matches the size -- **Small change** — verify directly. Read the changed files yourself, confirm each goal requirement against the actual code, run the targeted tests/type-checks. Do not delegate; the coordination overhead is not worth it. -- **Medium change** — split logically. Use one or two `agent_explore` / `agent_review` calls when a coherent slice (e.g. "review the new module + its consumers", "explore how config plumbing was wired") is too large to inspect in line without losing context. Run diagnostic commands (typecheck, targeted tests, lint) yourself. -- **Large change** — fan out with `agent_parallel`. Break the goal's acceptance surface into 2–5 independent topics and dispatch them in parallel. Good split axes: - - **By layer** — backend / frontend / persistence / config. - - **By subsystem** — auth / billing / notifications. - - **By concern** — functional correctness / regression risk / tests & docs / migration & compatibility. - - **By goal subtask** — one helper per acceptance criterion when the goal is itemized. - Keep each subtask independent (no shared write state), bounded in scope, and concretely scoped to file lists or topics inferred from the diff. After the parallel batch returns, **synthesize the results yourself** — reconcile conflicts, call out failures or skipped items, and form one coherent verdict. 
Do not just concatenate helper outputs. +### Step 2 — Verify against the actual project state +- Read the relevant source files yourself. Do not assume code exists just because a task board step claims to have created it. +- **Call-chain verification**: for every type, function, or module you find defined, verify it is **actually wired into the runtime path** — called, consumed, or registered. A struct defined but never instantiated, a semaphore created but never acquired, or a policy trait implemented but never invoked in the request handler is **not** evidence of completion. Report these as findings. +- Run the verification commands the project uses (infer from manifests, CI config, workspace instructions): type-checks, tests, linters, formatters. Adapt to the actual project stack. +- When a protocol, endpoint, or feature is declared in code, verify its **routing** — is it reachable by an actual HTTP handler or equivalent entry point? A codec registered via `inventory::submit!` but never consumed by `inventory::iter` is half-finished work. -### Step 3 — Run the verification commands the project actually uses -- Adapt commands to this repository (infer from manifests, scripts, CI config, and workspace instructions). Do not assume a stack. -- Prefer the *narrowest* command that still covers the changed surface (e.g. test only the affected package) before falling back to repo-wide runs. For Large changes a repo-wide build/typecheck is usually still warranted. -- When `agent_review` is delegated, treat its verification output as authoritative — do not rerun the same commands unless its results were inconclusive. +### Step 3 — Cross-reference with the task board +- Compare the task board state against your file-system findings. If the board says a step is `completed` but the files don't back it up, that is a finding. If a step is `pending` that directly maps to a goal requirement, that requirement is not met — report it. 
+- If no task board exists for this goal, note it but do not fail on that basis alone — verify entirely from the file system and goal text. ## Delegation guidelines -- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does the codebase still reference Z?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. -- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='diff'` when the helper should look at the workspace changes; provide an explicit changed-file list when you already have one. -- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Prefer this over sequential helper calls whenever the topics are genuinely independent. Never recurse parallel into parallel. -- Do **not** delegate when: - - The change is small enough to inspect inline. - - The subtasks are interdependent (later ones need earlier results). - - You only need one shell command — just run it. +- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does Z actually get called in the request path?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. +- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='code'` or `target='diff'` as appropriate. +- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Use when the goal's requirements can be split into independent topics (by layer, subsystem, or acceptance criterion). Prefer this over sequential helper calls whenever topics are genuinely independent. +- Do **not** delegate when the goal is small enough to inspect inline or the subtasks are interdependent. - Always tell each delegate explicitly: the goal text, which slice they own, what evidence to return, and that they are read-only. 
## Hard constraints (read-only acceptance) - Your file tools are read-only. Do **not** modify, create, or delete any files. -- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, builds, and read-only inspection (`git_status`, `git_diff`, `git_log`, `cat`, `ls`, etc.). You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, builds, and read-only inspection. You must **never** use shell to edit or delete files, install dependencies, or start daemon processes. - Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. -- Helpers you delegate to inherit the same read-only constraint; remind them in the task text when relevant. +- Helpers you delegate to inherit the same read-only constraint. ## Coverage honesty -- Track what you actually verified vs. what you sampled vs. what you skipped. A Large change you only spot-checked is **not** the same as a Large change you fully covered. +- Track what you actually verified vs. what you sampled vs. what you skipped. A goal you only spot-checked is **not** the same as one you fully covered. - When delegating, if any helper failed, returned inconclusive results, or could not run a command, treat that area as **not verified** — record it explicitly and let it influence the verdict. - Never imply a check passed without trustworthy evidence. If your `summary` cannot point to specific files, commands, or behaviors you confirmed, you do not have a basis to pass. ## Verdict rules -- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps **and** your verification covered the full change surface (directly or via successful delegates). 
When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. -- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. +- Pass (`passed=true`) only when **every** requirement in the goal is genuinely satisfied with no material gaps, **and** your verification covered the full requirement surface. When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. +- If anything required by the goal is missing, inconsistent, untested, broken, or **defined but not wired**, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. - Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. - Calibrate `completenessPct` to actual coverage and remaining gaps, not to effort spent. A change that does 80% of the goal correctly is 80, not 100, even if the implemented parts are flawless. +- You must never use "pre-existing" or "accepted by prior judge" as a reason to pass a finding. Each finding stands or falls on its own merit against the goal requirements. 
diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md index a695dd71..5b87ccc2 100644 --- a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -1,6 +1,6 @@ --- section_id: SubagentOutputContractJudge -version: 1 +version: 2 declared_keys: [] --- Your output will be consumed by the parent agent and the goal acceptance pipeline, not the user. Follow any response language instructions inherited above for natural-language fields (`findings`, `summary`). @@ -15,7 +15,7 @@ Return exactly one JSON object with this contract and nothing else (no markdown } Field rules: -- `passed` (boolean): true only when the project genuinely satisfies the goal. +- `passed` (boolean): true only when the project genuinely satisfies **every** goal requirement. - `completenessPct` (integer 0-100): your honest estimate of how complete the work is against the goal. -- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken point. REQUIRED and non-empty when `passed=false`. +- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken / not-wired point. REQUIRED and non-empty when `passed=false`. Each finding must reference a concrete file path and/or a specific goal requirement it violates. Do not accept vague descriptions — state exactly what file, what is missing, and what goal requirement is violated. - `summary` (string): rationale for the verdict. REQUIRED and non-empty when `passed=true` — it becomes the goal's completion evidence. If you cannot provide real evidence, set `passed=false`. 
diff --git a/src-tauri/src/core/subagent/judge_contract.rs b/src-tauri/src/core/subagent/judge_contract.rs index 2a673d93..dbd42f47 100644 --- a/src-tauri/src/core/subagent/judge_contract.rs +++ b/src-tauri/src/core/subagent/judge_contract.rs @@ -3,8 +3,10 @@ use serde::{Deserialize, Serialize}; /// Input for the `agent_judge` tool (provided by the main agent). #[derive(Debug, Clone)] pub struct JudgeRequest { - /// The main agent's explanation of why it believes the goal is achieved, - /// and/or points it wants the Judge to focus on. + /// The main agent's note for this verification request. No longer + /// injected into the Judge prompt — the Judge evaluates independently + /// against goal + file system + task board. Parsed for backward + /// compatibility but the value is discarded by execute_judge_tool. pub task: String, } @@ -17,8 +19,12 @@ impl JudgeRequest { .trim() .to_string(); + // Task is now optional; an empty task string is valid. + // The Judge does not receive the main agent's self-assessment. if task.is_empty() { - return Err("missing agent_judge task".to_string()); + return Ok(Self { + task: "Goal acceptance verification".to_string(), + }); } Ok(Self { task }) @@ -198,8 +204,10 @@ mod tests { use super::*; #[test] - fn judge_request_requires_task() { - assert!(JudgeRequest::from_tool_input(&serde_json::json!({})).is_err()); + fn judge_request_empty_task_returns_default() { + // Empty task is now valid; returns a default task string. 
+ let req = JudgeRequest::from_tool_input(&serde_json::json!({})).expect("empty task parses"); + assert_eq!(req.task, "Goal acceptance verification"); let req = JudgeRequest::from_tool_input(&serde_json::json!({ "task": " verify it " })) .expect("parses"); assert_eq!(req.task, "verify it"); diff --git a/src-tauri/src/persistence/repo/run_helper_repo.rs b/src-tauri/src/persistence/repo/run_helper_repo.rs index 0eb59481..4fdcceeb 100644 --- a/src-tauri/src/persistence/repo/run_helper_repo.rs +++ b/src-tauri/src/persistence/repo/run_helper_repo.rs @@ -235,6 +235,27 @@ pub async fn list_by_run_ids( Ok(rows.into_iter().map(RunHelperRow::into_dto).collect()) } +/// List all run_helpers for a given thread. Used by the Judge's process +/// compliance layer to inspect review call history. +pub async fn list_by_thread_id( + pool: &SqlitePool, + thread_id: &str, +) -> Result, AppError> { + let rows = sqlx::query_as::<_, RunHelperRow>( + "SELECT id, run_id, thread_id, helper_kind, parent_tool_call_id, status, + input_summary, output_summary, error_summary, started_at, finished_at, + input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, total_tokens + FROM run_helpers + WHERE thread_id = ? 
+ ORDER BY started_at ASC, id ASC", + ) + .bind(thread_id) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(RunHelperRow::into_dto).collect()) +} + #[cfg(test)] mod tests { use super::*; @@ -694,6 +715,41 @@ mod tests { assert!(kinds.contains(&"explore".into())); } + #[tokio::test] + async fn list_by_thread_id_returns_helpers_for_thread() { + let pool = setup_test_pool().await; + + // Insert helpers for the test thread + for (id, kind) in &[("h-1", "helper_review"), ("h-2", "helper_explore")] { + let helper = RunHelperInsert { + id: id.to_string(), + run_id: "run-1".into(), + thread_id: "t1".into(), + helper_kind: kind.to_string(), + parent_tool_call_id: None, + status: "completed".into(), + model_role: "auxiliary".into(), + provider_id: None, + model_id: None, + input_summary: Some(format!("{kind} task")), + }; + insert(&pool, &helper).await.unwrap(); + } + + let result = list_by_thread_id(&pool, "t1").await.unwrap(); + assert_eq!(result.len(), 2); + let kinds: Vec = result.iter().map(|h| h.helper_kind.clone()).collect(); + assert!(kinds.contains(&"helper_review".into())); + assert!(kinds.contains(&"helper_explore".into())); + } + + #[tokio::test] + async fn list_by_thread_id_returns_empty_for_unknown_thread() { + let pool = setup_test_pool().await; + let result = list_by_thread_id(&pool, "t-unknown").await.unwrap(); + assert!(result.is_empty()); + } + #[tokio::test] async fn list_by_run_ids_returns_empty_for_empty_input() { let pool = setup_test_pool().await; From 0e8b153249d4afe7895dffe566da739684b29796 Mon Sep 17 00:00:00 2001 From: Jorben Date: Wed, 10 Jun 2026 12:36:39 +0800 Subject: [PATCH 10/16] =?UTF-8?q?fix(subagent):=20=F0=9F=90=9B=20make=20ta?= =?UTF-8?q?sk=20field=20optional=20and=20fix=20UTF-8=20safe=20truncation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Downgrade Judge prompt versions from 2 to 1 (likely a revert of unintended bump) - Change `task` field from required to optional in Judge 
tool schema, with updated description clarifying it is an optional note - Replace byte-based truncation with character-safe truncation to avoid panicking on multi-byte UTF-8 in process compliance summary - Simplify Judge request validation to only check input validity, discarding the parsed result used only for backward compatibility - Skip abandoned task boards when building summary to focus on relevant goal state --- src-tauri/src/core/agent_session_execution.rs | 39 ++++++++++--------- .../core/prompt/templates/subagent/judge.md | 2 +- .../subagent/output_contract.judge.md | 2 +- .../core/subagent/runtime_orchestration.rs | 5 +-- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index 4dd9aece..153cae61 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -1614,21 +1614,19 @@ impl AgentSession { ) -> AgentToolResult { // Parse the main agent's task / rationale. The task value is no longer // injected into the Judge prompt — the Judge evaluates independently. - // Parsing is retained for backward compatibility and input validation. - let _request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { - Ok(request) => request, - Err(error) => { - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &error }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(error); - } - }; + // Parsing is retained for input validation only (rejects non-string + // task values that would violate the tool JSON schema). 
+ if let Err(error) = crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &error }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(error); + } // Backstop: re-query goal state. agent_judge is injected only when an // un-verified goal exists, but a stale tool set or a direct call must be @@ -1850,6 +1848,10 @@ async fn build_task_board_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> S let mut summary = String::from("## Associated task board state\n\n"); for board in &boards { + // Skip abandoned boards — they are not relevant to the current goal. + if board.status.as_str() == "abandoned" { + continue; + } summary.push_str(&format!( "**{}** (status: {}):\n", board.title, @@ -1946,9 +1948,10 @@ async fn build_process_compliance_summary(pool: &sqlx::SqlitePool, thread_id: &s .input_summary .as_deref() .map(|s| { - // Truncate to first 200 chars for readability - if s.len() > 200 { - format!("{}...", &s[..200]) + // Truncate to first 200 chars for readability (character-safe, + // avoids panicking on multi-byte UTF-8 sequences). + if s.chars().count() > 200 { + format!("{}...", s.chars().take(200).collect::()) } else { s.to_string() } diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md index 454325ab..a8cb4ebf 100644 --- a/src-tauri/src/core/prompt/templates/subagent/judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -1,6 +1,6 @@ --- section_id: SubagentJudge -version: 2 +version: 1 declared_keys: [] --- You are the **Goal Acceptance Judge** — an independent auditor. Your task is to determine whether the project's **current state** satisfies a goal objective. You work **independently** — you receive no input from the main agent about what it did, changed, or believes is complete. 
Your assessment must be based solely on objective evidence: the goal objective, the project file system, the task board associated with this goal, and verification commands you run yourself. diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md index 5b87ccc2..e144a2da 100644 --- a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -1,6 +1,6 @@ --- section_id: SubagentOutputContractJudge -version: 2 +version: 1 declared_keys: [] --- Your output will be consumed by the parent agent and the goal acceptance pipeline, not the user. Follow any response language instructions inherited above for natural-language fields (`findings`, `summary`). diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index c458e098..326e748b 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -356,10 +356,9 @@ impl RuntimeOrchestrationTool { "properties": { "task": { "type": "string", - "description": "Explain why you believe the goal is achieved and call out anything the Judge should focus on (e.g. acceptance criteria, areas you are unsure about). If you are re-verifying after fixing earlier findings, summarize what you changed." + "description": "Optional note for the Judge. The Judge evaluates the project state independently against the goal and does not rely on your self-assessment." 
} - }, - "required": ["task"] + } }), }; From 539005cecd18fc6fa3186ab4a8cdb0b633c6b0b9 Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 12:12:08 +0800 Subject: [PATCH 11/16] =?UTF-8?q?chore(deps):=20=F0=9F=94=A7=20align=20tiy?= =?UTF-8?q?core=20to=200.2.10-rc.2=20and=20adopt=20Usage::context=5Fsize()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-pick the master commit (a03d9ba) that bumps tiycore from 0.2.9 to 0.2.10-rc.2 and unifies context_size semantics across RunUsageDto / frontend badge / auto-compression, removing the old initial_context_calibration heuristic path. No file conflict with the Judge work in this branch — the 25 files touched here do not overlap with the 6 Judge files resolved in the previous merge. --- src-tauri/Cargo.lock | 4 +- src-tauri/Cargo.toml | 2 +- src-tauri/src/core/agent_run_event_handler.rs | 11 + src-tauri/src/core/agent_session.rs | 173 +++++------ .../src/core/agent_session_compression.rs | 176 +++-------- src-tauri/src/core/agent_session_events.rs | 36 ++- src-tauri/src/core/agent_session_tests.rs | 285 ++++++------------ src-tauri/src/core/agent_session_types.rs | 2 - src-tauri/src/core/context_compression.rs | 150 ++++++--- src-tauri/src/model/thread.rs | 20 ++ .../src/persistence/repo/run_helper_repo.rs | 9 + src-tauri/src/persistence/repo/run_repo.rs | 11 + src-tauri/tests/frontend_integration.rs | 4 + .../workbench-shell/model/thread-store.ts | 8 + .../ui/dashboard-workbench-logic.ts | 27 +- .../ui/dashboard-workbench.test.ts | 43 ++- .../ui/dashboard-workbench.tsx | 2 +- .../ui/runtime-thread-surface-state.ts | 15 + .../ui/runtime-thread-surface.test.tsx | 4 + .../ui/runtime-thread-surface.tsx | 11 + .../workbench-shell/ui/workbench-top-bar.tsx | 6 + src/services/bridge/agent-commands.ts | 61 ++-- .../thread-stream/thread-stream.test.ts | 3 + src/shared/types/api.ts | 13 + 24 files changed, 559 insertions(+), 517 deletions(-) diff --git a/src-tauri/Cargo.lock 
b/src-tauri/Cargo.lock index be76f106..1428aa51 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -6282,9 +6282,9 @@ dependencies = [ [[package]] name = "tiycore" -version = "0.2.9" +version = "0.2.10-rc.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d619c75f2e2f61b57f2fcd81e7ce7e714556b75b2fda8ee932450c657de9ac29" +checksum = "809ab84f3da03ccbfc74e6676f1c99820fe8b31ff44cb4489ab5a8d5ba782172" dependencies = [ "anyhow", "arc-swap", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index eeef8cdd..4eb566da 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -60,7 +60,7 @@ keepawake = "0.6" portable-pty = "0.9" git2 = { version = "0.20", features = ["vendored-libgit2", "vendored-openssl"] } similar = "2" -tiycore = "0.2.9" +tiycore = "0.2.10-rc.2" # Gateway (IM channel support) async-trait = "0.1" diff --git a/src-tauri/src/core/agent_run_event_handler.rs b/src-tauri/src/core/agent_run_event_handler.rs index 6107563c..b2bf3228 100644 --- a/src-tauri/src/core/agent_run_event_handler.rs +++ b/src-tauri/src/core/agent_run_event_handler.rs @@ -415,6 +415,17 @@ impl AgentRunManager { } } ThreadStreamEvent::ThreadUsageUpdated { usage, .. } => { + // `usage` here is a `RunUsageDto` populated by tiycore via + // `From`. Its `context_size` field is + // the cross-protocol unified "context occupancy" value + // (`Usage::context_size()` = input + output + cache_read + + // cache_write), NOT the wire-level `total_tokens` (which is + // per-response and provider-dependent). We round-trip the + // whole struct (input/output/cache fields + total_tokens) + // through `tiycore::types::Usage` because the persistence + // layer is typed against `Usage`; `context_size` is + // re-derived inside the `From` impl on read, so storing only + // the raw fields is sufficient and forward-compatible. 
let usage = tiycore::types::Usage { input: usage.input_tokens, output: usage.output_tokens, diff --git a/src-tauri/src/core/agent_session.rs b/src-tauri/src/core/agent_session.rs index 2c7be27e..abe51a88 100644 --- a/src-tauri/src/core/agent_session.rs +++ b/src-tauri/src/core/agent_session.rs @@ -523,9 +523,7 @@ fn append_runtime_queue_message( } use crate::core::agent_session_compression::{ - build_initial_context_token_calibration, current_context_token_calibration, - persist_compression_markers_to_pool, record_pending_prompt_estimate, run_auto_compression, - ContextCompressionRuntimeState, + persist_compression_markers_to_pool, run_auto_compression, }; use crate::core::agent_session_events::handle_agent_event; pub(crate) use crate::core::agent_session_history::*; @@ -562,7 +560,12 @@ pub async fn build_session_spec( .collect(); let history_tool_calls = tool_call_repo::list_parent_visible_by_run_ids(pool, &history_run_ids).await?; - let latest_historical_run = + // No longer read after the calibration seed was removed; the + // `find_latest_with_prompt_usage_by_thread_excluding_run` call + // below is kept as documentation that historical run usage is + // intentionally not used by the new `Usage::context_size()` + // trigger (the first LLM response provides the source of truth). + let _latest_historical_run = run_repo::find_latest_with_prompt_usage_by_thread_excluding_run(pool, thread_id, run_id) .await?; @@ -629,13 +632,11 @@ pub async fn build_session_spec( } } - let initial_context_calibration = build_initial_context_token_calibration( - latest_historical_run.as_ref(), - &history_messages, - &history_tool_calls, - &resolved_plan.primary, - &system_prompt, - ); + // The previous initial-context calibration seed was removed when we + // switched auto-compression to the unified `Usage::context_size()` + // trigger (see `should_compress_via_context_size`). 
The trigger no + // longer needs a per-history-run calibration — it compares the most + // recent LLM usage against the budget directly. Ok(AgentSessionSpec { run_id: run_id.to_string(), @@ -649,7 +650,6 @@ pub async fn build_session_spec( history_tool_calls, model_plan: resolved_plan, initial_prompt: None, - initial_context_calibration, cache_arbiter, }) } @@ -664,7 +664,13 @@ pub struct AgentSession { cancel_requested: Arc, pub(crate) checkpoint_requested: AtomicBool, pub(crate) abort_signal: tiycore::agent::AbortSignal, - context_compression_state: Arc>, + /// Most recent `tiycore::types::Usage` observed from the last LLM + /// call, shared with the `set_transform_context` closure. The + /// closure compares `usage.context_size()` against the configured + /// `CompressionSettings::budget()` to decide whether the next turn + /// needs auto-compression. Shared with the event handler so the + /// trigger stays in lockstep with the stream of usage updates. + last_observed_usage: Arc>>, runtime_queue_state: Arc>, /// Shared goal runtime state for tool call recording across command invocations. pub(crate) goal_runtime: Arc>, @@ -682,9 +688,10 @@ impl AgentSession { ) -> Arc { Arc::new_cyclic(|weak_self| { let agent = Arc::new(Agent::with_model(spec.model_plan.primary.model.clone())); - let context_compression_state = Arc::new(StdMutex::new( - ContextCompressionRuntimeState::new(spec.initial_context_calibration), - )); + // Unified trigger: start with no observed usage; the + // `set_transform_context` closure defers its first decision + // until the first LLM response reports `context_size`. 
+ let last_observed_usage = Arc::new(StdMutex::new(None::)); let runtime_queue_state = Arc::new(StdMutex::new(RuntimeQueueState::default())); agent.set_max_turns(max_turns); agent.set_max_retries(Some(TIYCORE_REQUEST_MAX_RETRIES)); @@ -692,7 +699,7 @@ impl AgentSession { &agent, &spec, weak_self.clone(), - Arc::clone(&context_compression_state), + Arc::clone(&last_observed_usage), Arc::clone(&runtime_queue_state), event_tx.clone(), ); @@ -707,7 +714,7 @@ impl AgentSession { cancel_requested: Arc::new(AtomicBool::new(false)), checkpoint_requested: AtomicBool::new(false), abort_signal: tiycore::agent::AbortSignal::new(), - context_compression_state, + last_observed_usage, runtime_queue_state, goal_runtime, } @@ -934,8 +941,8 @@ impl AgentSession { let last_completed_message_id_ref = Arc::clone(&last_completed_message_id); let reasoning_message_id_ref = Arc::clone(&current_reasoning_message_id); let last_usage_ref = Arc::clone(&last_usage); + let last_observed_usage_ref = Arc::clone(&self.last_observed_usage); let reasoning_ref = Arc::clone(&reasoning_buffer); - let context_compression_state_ref = Arc::clone(&self.context_compression_state); let current_turn_index: Arc>> = Arc::new(StdMutex::new(None)); let turn_index_ref = Arc::clone(&current_turn_index); let last_text_delta = Arc::new(StdMutex::new(None::)); @@ -948,7 +955,7 @@ impl AgentSession { &last_completed_message_id_ref, &reasoning_message_id_ref, &last_usage_ref, - &context_compression_state_ref, + &last_observed_usage_ref, &reasoning_ref, &turn_index_ref, &last_text_delta_ref, @@ -1084,7 +1091,7 @@ fn configure_agent( agent: &Arc, spec: &AgentSessionSpec, weak_self: Weak, - context_compression_state: Arc>, + last_observed_usage: Arc>>, runtime_queue_state: Arc>, event_tx: mpsc::UnboundedSender, ) { @@ -1100,27 +1107,39 @@ fn configure_agent( agent.set_transport(spec.model_plan.transport); agent.set_security_config(main_agent_security_config()); - // Context compression: when messages exceed the token budget, generate 
- // a summary with the primary model, persist markers to DB, and keep only - // recent messages. Falls back to pure truncation if the LLM call fails. + // Context compression: when the most recent LLM call reported a + // unified context size above the input budget, summarise the older + // messages with the primary model, persist markers to DB, and keep + // only recent messages. Falls back to pure truncation if the LLM + // call fails. + // + // The trigger is `Usage::context_size()` from tiycore 0.2.10-rc.2 — + // the cross-protocol unified "context occupancy" value + // (input + output + cache_read + cache_write). The previous heuristic + // (`estimate_total_tokens(messages) + system_prompt_estimated_tokens` + // scaled by an observed `ContextTokenCalibration` ratio) was removed + // because it could disagree with the actual provider usage and let an + // over-budget call slip through, and because the heuristic + CJK + // weight drift made the trigger point unpredictable. // - // Two correctness hazards addressed here: + // Two correctness hazards addressed in [`run_auto_compression`]: // - // 1. UUID v7 timing. The reset marker we write to DB uses `now_v7()`, but - // `cut_point` in-memory points at a slice that includes messages the - // current run persisted EARLIER than this call. A naive - // `list_since_last_reset WHERE id >= reset_id` would exclude those - // earlier messages and effectively "lose" the current user prompt on - // the next reload. We therefore resolve a DB-backed boundary id - // conservatively covering all recent messages and attach it to the - // reset marker's metadata. + // 1. UUID v7 timing. The reset marker we write to DB uses `now_v7()`, + // but `cut_point` in-memory points at a slice that includes + // messages the current run persisted EARLIER than this call. A + // naive `list_since_last_reset WHERE id >= reset_id` would exclude + // those earlier messages and effectively "lose" the current user + // prompt on the next reload. 
We therefore resolve a DB-backed + // boundary id conservatively covering all recent messages and + // attach it to the reset marker's metadata. // // 2. Summary-of-summary decay. If a previous auto-compression already // injected a `` as the head of `messages`, naively // re-summarising `old_messages` would re-summarise an already- - // summarised prefix, losing detail each pass. Instead we detect the - // prior summary, treat it as a pinned prefix, and ask the model to - // **merge** the prior summary with the delta of messages since then. + // summarised prefix, losing detail each pass. Instead we detect + // the prior summary, treat it as a pinned prefix, and ask the + // model to **merge** the prior summary with the delta of messages + // since then. let compression_settings = crate::core::context_compression::CompressionSettings::new( spec.model_plan.primary.model.context_window, ); @@ -1129,33 +1148,26 @@ fn configure_agent( let compression_thread_id = spec.thread_id.clone(); let compression_run_id = spec.run_id.clone(); let compression_response_language = spec.model_plan.raw.response_language.clone(); - let compression_state = Arc::clone(&context_compression_state); - // Pre-compute the system prompt's estimated token count so the - // compression check includes fixed overhead that the provider counts - // against the context window but `estimate_total_tokens(messages)` - // does not see. This narrows the gap the calibration ratio has to - // cover, making the trigger point more predictable. - let system_prompt_estimated_tokens = - crate::core::context_compression::estimate_tokens(&spec.system_prompt); + let last_observed_usage_for_trigger = Arc::clone(&last_observed_usage); agent.set_transform_context(move |messages| { - // Cheap pass-through check first: only clone the heavy captured state - // (ResolvedModelRole, String ids, Weak) when compression will actually - // run. 
For long sessions with many turns this avoids per-turn heap - // allocations when the thread is still well under budget. + // Cheap pass-through check first: only clone the heavy captured + // state (ResolvedModelRole, String ids, Weak) when compression + // will actually run. For long sessions with many turns this + // avoids per-turn heap allocations when the thread is still + // well under budget. let settings = compression_settings.clone(); - let raw_estimated_tokens = - crate::core::context_compression::estimate_total_tokens(&messages) - .saturating_add(system_prompt_estimated_tokens); - let calibration = current_context_token_calibration(&compression_state); - let calibrated_total_tokens = crate::core::context_compression::calibrate_total_tokens( - raw_estimated_tokens, - Some(calibration), - ); - let needs_compression = !messages.is_empty() - && crate::core::context_compression::should_compress_total_tokens( - calibrated_total_tokens, - &settings, - ); + let snapshot: Option = last_observed_usage_for_trigger + .lock() + .ok() + .and_then(|guard| *guard); + // `should_compress_via_context_size` returns `false` for + // `None` (no observed usage yet) — we defer the first decision + // until the first LLM response reports its `context_size`, + // matching the design note in `context_compression`. + let needs_compression = crate::core::context_compression::should_compress_via_context_size( + snapshot.as_ref(), + &settings, + ) && !messages.is_empty(); let model_role = if needs_compression { Some(primary_model_role.clone()) @@ -1182,31 +1194,24 @@ fn configure_agent( } else { None }; - let compression_state = Arc::clone(&compression_state); async move { - let transformed_messages = if !needs_compression { - messages - } else { - // Unwraps are sound: all `Some(_)` are populated together under - // `needs_compression`, so either all four are `Some` (compression - // path) or we returned above. 
- run_auto_compression( - messages, - settings, - model_role.expect("model_role populated when compressing"), - weak.expect("weak populated when compressing"), - thread_id.expect("thread_id populated when compressing"), - run_id.expect("run_id populated when compressing"), - response_language.expect("response_language populated when compressing"), - ) - .await - }; - let sent_estimated_tokens = - crate::core::context_compression::estimate_total_tokens(&transformed_messages) - .saturating_add(system_prompt_estimated_tokens); - record_pending_prompt_estimate(&compression_state, sent_estimated_tokens); - transformed_messages + if !needs_compression { + return messages; + } + // Unwraps are sound: all `Some(_)` are populated together + // under `needs_compression`, so either all five are `Some` + // (compression path) or we returned above. + run_auto_compression( + messages, + settings, + model_role.expect("model_role populated when compressing"), + weak.expect("weak populated when compressing"), + thread_id.expect("thread_id populated when compressing"), + run_id.expect("run_id populated when compressing"), + response_language.expect("response_language populated when compressing"), + ) + .await } }); diff --git a/src-tauri/src/core/agent_session_compression.rs b/src-tauri/src/core/agent_session_compression.rs index 9b03a01f..af0b5427 100644 --- a/src-tauri/src/core/agent_session_compression.rs +++ b/src-tauri/src/core/agent_session_compression.rs @@ -1,125 +1,12 @@ use std::panic::{catch_unwind, AssertUnwindSafe}; -use std::sync::{Mutex as StdMutex, Weak}; +use std::sync::Weak; use sqlx::SqlitePool; use tiycore::agent::AgentMessage; -use tiycore::types::Usage; use crate::core::agent_session::AgentSession; -use crate::core::agent_session_history::convert_history_messages; use crate::core::agent_session_types::ResolvedModelRole; -use crate::core::context_compression::ContextTokenCalibration; use crate::ipc::frontend_channels::ThreadStreamEvent; -use 
crate::model::thread::{MessageRecord, RunSummaryDto, ToolCallDto}; - -pub(crate) fn effective_prompt_tokens(input_tokens: u64, cache_read_tokens: u64) -> u64 { - input_tokens.saturating_add(cache_read_tokens) -} - -#[derive(Debug, Default)] -pub(crate) struct ContextCompressionRuntimeState { - calibration: ContextTokenCalibration, - pub(crate) pending_prompt_estimate: Option, -} - -impl ContextCompressionRuntimeState { - pub(crate) fn new(initial_calibration: ContextTokenCalibration) -> Self { - Self { - calibration: initial_calibration, - pending_prompt_estimate: None, - } - } - - fn calibration(&self) -> ContextTokenCalibration { - self.calibration - } - - pub(crate) fn record_pending_prompt_estimate(&mut self, estimated_tokens: u32) { - self.pending_prompt_estimate = Some(estimated_tokens); - } - - fn observe_prompt_usage(&mut self, actual_prompt_tokens: u64) { - let Some(estimated_tokens) = self.pending_prompt_estimate.take() else { - return; - }; - - self.calibration = self - .calibration - .observe(estimated_tokens, actual_prompt_tokens); - } -} - -pub(crate) fn current_context_token_calibration( - state: &StdMutex, -) -> ContextTokenCalibration { - state - .lock() - .map(|state| state.calibration()) - .unwrap_or_default() -} - -pub(crate) fn record_pending_prompt_estimate( - state: &StdMutex, - estimated_tokens: u32, -) { - if let Ok(mut state) = state.lock() { - state.record_pending_prompt_estimate(estimated_tokens); - } -} - -pub(crate) fn observe_context_usage_calibration( - state: &StdMutex, - usage: &Usage, -) { - let actual_prompt_tokens = effective_prompt_tokens(usage.input, usage.cache_read); - if actual_prompt_tokens == 0 { - return; - } - - if let Ok(mut state) = state.lock() { - state.observe_prompt_usage(actual_prompt_tokens); - } -} - -pub(crate) fn build_initial_context_token_calibration( - latest_historical_run: Option<&RunSummaryDto>, - history_messages: &[MessageRecord], - history_tool_calls: &[ToolCallDto], - primary_model: 
&ResolvedModelRole, - system_prompt: &str, -) -> ContextTokenCalibration { - let Some(latest_historical_run) = latest_historical_run else { - return ContextTokenCalibration::default(); - }; - - let historical_prompt_tokens = effective_prompt_tokens( - latest_historical_run.usage.input_tokens, - latest_historical_run.usage.cache_read_tokens, - ); - if historical_prompt_tokens == 0 - || !run_summary_matches_primary_model(latest_historical_run, primary_model) - { - return ContextTokenCalibration::default(); - } - - let history = - convert_history_messages(history_messages, history_tool_calls, &primary_model.model); - let estimated_tokens = crate::core::context_compression::estimate_total_tokens(&history) - .saturating_add(crate::core::context_compression::estimate_tokens( - system_prompt, - )); - - ContextTokenCalibration::from_observation(estimated_tokens, historical_prompt_tokens) - .unwrap_or_default() -} - -fn run_summary_matches_primary_model( - run_summary: &RunSummaryDto, - primary_model: &ResolvedModelRole, -) -> bool { - run_summary.model_id.as_deref() == Some(primary_model.model_id.as_str()) - || run_summary.model_id.as_deref() == Some(primary_model.model.id.as_str()) -} /// Auto-compression hook body, extracted from the `set_transform_context` /// closure in [`configure_agent`] so the control flow is testable in isolation @@ -150,14 +37,20 @@ pub(crate) async fn run_auto_compression( run_id: String, response_language: Option, ) -> Vec { - // Phase 1: check if compression is needed. + // Phase 1: trust the caller. The `set_transform_context` closure in + // `configure_agent` is the single source of truth for "should we + // compress right now?" — it compares the most recent + // `tiycore::types::Usage::context_size()` from the last LLM call + // against `settings.budget()`. The previous heuristic + // (`should_compress` over `messages`) was removed because it could + // disagree with the unified-usage trigger (e.g. 
a long prompt with + // a small historical thread) and let an over-budget call slip + // through. // - // The hot-path caller (the `set_transform_context` closure) already gates - // on `should_compress` before cloning the heavy state, so in production - // this branch should never hit. It stays here defensively so direct - // callers (e.g. unit tests) still get correct behaviour for under-budget - // inputs without having to duplicate the check. - if !crate::core::context_compression::should_compress(&messages, &settings) { + // We still keep the cheap empty-input fast path so direct unit + // callers (and tests) get the same pass-through behaviour as before + // for the degenerate empty case. + if messages.is_empty() { return messages; } @@ -942,11 +835,23 @@ mod tests { } #[tokio::test] - async fn returns_messages_unchanged_when_under_budget() { - // With a generous budget, should_compress is false and the function - // is a pure pass-through — no clone of messages, no LLM call, no - // DB access. This exercises the most common hot-path behaviour. - let messages = vec![make_user("hi"), make_assistant("hello")]; + async fn returns_messages_unchanged_when_empty_or_dangling_weak() { + // The trigger ("should we auto-compress?") is now owned by + // `set_transform_context` — it gates on + // `should_compress_via_context_size` before calling + // `run_auto_compression`. So in production, by the time we + // get here, the caller has already decided to compress. The + // only fast-path still inside `run_auto_compression` is the + // `messages.is_empty()` empty-input guard, so we exercise + // that. + // + // With a dangling `Weak`, an empty input + // returns immediately via the fast-path: no LLM call, no + // DB access, no side effects. The test asserts only that + // the result is empty: an empty input has no head message + // whose content could be compared, so emptiness is all + // there is to check. 
+ let messages = vec![]; let settings = settings_for_test(128_000, 1_024, 1_024); let result = run_auto_compression( @@ -960,22 +865,9 @@ mod tests { ) .await; - assert_eq!(result.len(), messages.len()); - // Content should be byte-identical — no summary was injected. - match (&result[0], &messages[0]) { - (AgentMessage::User(a), AgentMessage::User(b)) => { - let at = match &a.content { - tiycore::types::UserContent::Text(t) => t.as_str(), - _ => panic!("expected text"), - }; - let bt = match &b.content { - tiycore::types::UserContent::Text(t) => t.as_str(), - _ => panic!("expected text"), - }; - assert_eq!(at, bt); - } - _ => panic!("expected user message at head"), - } + assert_eq!(result.len(), 0); + // Empty input ⇒ empty output (pass-through fast path). + assert!(result.is_empty()); } #[tokio::test] diff --git a/src-tauri/src/core/agent_session_events.rs b/src-tauri/src/core/agent_session_events.rs index 17d88b06..7cae0c58 100644 --- a/src-tauri/src/core/agent_session_events.rs +++ b/src-tauri/src/core/agent_session_events.rs @@ -4,9 +4,6 @@ use tiycore::agent::AgentMessage; use tiycore::types::{AssistantMessageEvent, Usage}; use tokio::sync::mpsc; -use crate::core::agent_session_compression::{ - observe_context_usage_calibration, ContextCompressionRuntimeState, -}; use crate::ipc::frontend_channels::ThreadStreamEvent; use crate::model::thread::RunUsageDto; @@ -17,7 +14,7 @@ pub(crate) fn handle_agent_event( last_completed_message_id: &StdMutex>, current_reasoning_message_id: &StdMutex>, last_usage: &StdMutex>, - context_compression_state: &StdMutex, + last_observed_usage: &StdMutex>, reasoning_buffer: &StdMutex, current_turn_index: &StdMutex>, last_text_delta: &StdMutex>, @@ -177,7 +174,7 @@ pub(crate) fn handle_agent_event( run_id, event_tx, last_usage, - context_compression_state, + last_observed_usage, &partial.usage, context_window, model_display_name, @@ -208,7 +205,7 @@ pub(crate) fn handle_agent_event( run_id, event_tx, last_usage, - 
context_compression_state, + last_observed_usage, &assistant.usage, context_window, model_display_name, @@ -223,7 +220,7 @@ pub(crate) fn handle_agent_event( run_id, event_tx, last_usage, - context_compression_state, + last_observed_usage, &assistant.usage, context_window, model_display_name, @@ -259,7 +256,7 @@ fn emit_usage_update_if_changed( run_id: &str, event_tx: &mpsc::UnboundedSender, last_usage: &StdMutex>, - context_compression_state: &StdMutex, + last_observed_usage: &StdMutex>, usage: &Usage, context_window: &str, model_display_name: &str, @@ -287,7 +284,12 @@ fn emit_usage_update_if_changed( return; } - observe_context_usage_calibration(context_compression_state, usage); + // Record the freshly-observed usage into the shared + // `last_observed_usage` slot. The `set_transform_context` closure + // (configured by `configure_agent`) reads this same slot to decide + // whether the next turn needs auto-compression — see + // `should_compress_via_context_size` in `context_compression`. + record_observed_usage(last_observed_usage, usage); let _ = event_tx.send(ThreadStreamEvent::ThreadUsageUpdated { run_id: run_id.to_string(), @@ -297,6 +299,22 @@ fn emit_usage_update_if_changed( }); } +/// Record the most recent `tiycore::types::Usage` into the shared +/// compression-trigger slot. +/// +/// The slot is shared with `set_transform_context`: the closure reads it +/// to compare the latest unified `context_size()` against the configured +/// `CompressionSettings::budget()`. A poisoned mutex is not recovered +/// here: the write is skipped and a warning is logged, so the slot +/// keeps its previous value. +fn record_observed_usage(last_observed_usage: &StdMutex>, usage: &Usage) { + if let Ok(mut guard) = last_observed_usage.lock() { + *guard = Some(*usage); + } else { + tracing::warn!("record_observed_usage: mutex poisoned, recovering"); + } +} + /// Helper to recover a poisoned `StdMutex`. 
All mutex helpers below use this /// pattern so that a panic in an unrelated thread never silently corrupts /// message-tracking state. diff --git a/src-tauri/src/core/agent_session_tests.rs b/src-tauri/src/core/agent_session_tests.rs index 4500c5b3..93c33137 100644 --- a/src-tauri/src/core/agent_session_tests.rs +++ b/src-tauri/src/core/agent_session_tests.rs @@ -1,23 +1,20 @@ #[cfg(test)] pub(super) mod tests { use super::super::{ - build_initial_context_token_calibration, build_profile_response_prompt_parts, - build_system_prompt, convert_history_messages, current_context_token_calibration, + build_profile_response_prompt_parts, build_system_prompt, convert_history_messages, handle_agent_event, main_agent_security_config, mark_runtime_queue_message_by_id, normalize_profile_response_language, normalize_profile_response_style, - plan_mode_missing_checkpoint_error, record_pending_prompt_estimate, - resolve_helper_model_role, resolve_helper_profile, resolve_model_plan, - resolve_runtime_model_role, response_style_system_instruction, + plan_mode_missing_checkpoint_error, resolve_helper_model_role, resolve_helper_profile, + resolve_model_plan, resolve_runtime_model_role, response_style_system_instruction, runtime_queue_message_display_content, runtime_security_config, runtime_tools_for_profile, runtime_tools_for_profile_with_extensions, standard_tool_timeout, trim_history_to_current_context, trim_runtime_queue_state, update_runtime_queue_state_for_event, AgentQueueMessageKind, AgentSession, - AgentSessionSpec, ContextCompressionRuntimeState, ProfileResponseStyle, ResolvedModelRole, - ResolvedRuntimeModelPlan, RuntimeModelPlan, RuntimeQueueEventAction, RuntimeQueueEventDto, - RuntimeQueueMessageDto, RuntimeQueueMessageStatus, RuntimeQueueState, SortKey, - DEFAULT_FULL_TOOL_PROFILE, MAIN_AGENT_TOOL_TIMEOUT_SECS, - PLAN_MODE_MISSING_CHECKPOINT_ERROR, PLAN_READ_ONLY_TOOL_PROFILE, - STANDARD_TOOL_TIMEOUT_SECS, SUBAGENT_TOOL_TIMEOUT_SECS, + AgentSessionSpec, 
ProfileResponseStyle, ResolvedModelRole, ResolvedRuntimeModelPlan, + RuntimeModelPlan, RuntimeQueueEventAction, RuntimeQueueEventDto, RuntimeQueueMessageDto, + RuntimeQueueMessageStatus, RuntimeQueueState, SortKey, DEFAULT_FULL_TOOL_PROFILE, + MAIN_AGENT_TOOL_TIMEOUT_SECS, PLAN_MODE_MISSING_CHECKPOINT_ERROR, + PLAN_READ_ONLY_TOOL_PROFILE, STANDARD_TOOL_TIMEOUT_SECS, SUBAGENT_TOOL_TIMEOUT_SECS, }; use std::fs; use std::sync::{Arc, Mutex as StdMutex}; @@ -43,7 +40,7 @@ pub(super) mod tests { use crate::ipc::frontend_channels::ThreadStreamEvent; use crate::model::provider::{AgentProfileRecord, ProviderKind, ProviderRecord}; use crate::model::subagent::CustomSubagentModelRole; - use crate::model::thread::{MessageRecord, RunSummaryDto, RunUsageDto, ToolCallDto}; + use crate::model::thread::{MessageRecord, ToolCallDto}; use crate::persistence::init_database; use crate::persistence::repo::provider_repo; @@ -196,22 +193,6 @@ pub(super) mod tests { } } - fn make_history_message(id: &str, run_id: &str, role: &str, content: &str) -> MessageRecord { - MessageRecord { - id: id.to_string(), - thread_id: "thread-1".to_string(), - run_id: Some(run_id.to_string()), - role: role.to_string(), - content_markdown: content.to_string(), - parts_json: None, - message_type: "plain_message".to_string(), - status: "completed".to_string(), - metadata_json: None, - attachments_json: None, - created_at: "2026-01-01T00:00:00.000Z".to_string(), - } - } - #[test] fn update_runtime_queue_state_for_consumed_returns_pending_messages() { let mut state = RuntimeQueueState::default(); @@ -322,7 +303,6 @@ pub(super) mod tests { history_tool_calls: Vec::new(), model_plan: sample_resolved_runtime_model_plan(None), initial_prompt: None, - initial_context_calibration: Default::default(), cache_arbiter: None, }; @@ -397,7 +377,6 @@ pub(super) mod tests { history_tool_calls: Vec::new(), model_plan: sample_resolved_runtime_model_plan(None), initial_prompt: None, - initial_context_calibration: 
Default::default(), cache_arbiter: None, }; let session = AgentSession::new( @@ -678,35 +657,6 @@ pub(super) mod tests { assert_eq!(state.messages[1].updated_at, now); } - fn make_run_summary(model_id: &str, input_tokens: u64) -> RunSummaryDto { - make_run_summary_with_cache(model_id, input_tokens, 0) - } - - fn make_run_summary_with_cache( - model_id: &str, - input_tokens: u64, - cache_read_tokens: u64, - ) -> RunSummaryDto { - RunSummaryDto { - id: "run-prev".to_string(), - thread_id: "thread-1".to_string(), - run_mode: "default".to_string(), - status: "completed".to_string(), - model_id: Some(model_id.to_string()), - model_display_name: Some(model_id.to_string()), - context_window: Some(TEST_CONTEXT_WINDOW.to_string()), - error_message: None, - started_at: "2026-01-01T00:00:00.000Z".to_string(), - usage: RunUsageDto { - input_tokens, - output_tokens: 128, - cache_read_tokens, - cache_write_tokens: 0, - total_tokens: input_tokens + cache_read_tokens + 128, - }, - } - } - fn message_text(message: &AgentMessage) -> String { match message { AgentMessage::User(user) => match &user.content { @@ -744,26 +694,29 @@ pub(super) mod tests { reasoning_buffer: &StdMutex, event: &AgentEvent, ) { - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); - handle_test_agent_event_with_context_state( + // The unified-trigger slot is created fresh per test; tests that + // care about compression-trigger state should pass their own + // shared mutex via `handle_test_agent_event_with_observed_usage`. 
+ let last_observed_usage = StdMutex::new(None::); + handle_test_agent_event_with_observed_usage( run_id, event_tx, current_message_id, current_reasoning_message_id, last_usage, - &context_compression_state, + &last_observed_usage, reasoning_buffer, event, ); } - fn handle_test_agent_event_with_context_state( + fn handle_test_agent_event_with_observed_usage( run_id: &str, event_tx: &mpsc::UnboundedSender, current_message_id: &StdMutex>, current_reasoning_message_id: &StdMutex>, last_usage: &StdMutex>, - context_compression_state: &StdMutex, + last_observed_usage: &StdMutex>, reasoning_buffer: &StdMutex, event: &AgentEvent, ) { @@ -777,7 +730,7 @@ pub(super) mod tests { &last_completed_message_id, current_reasoning_message_id, last_usage, - context_compression_state, + last_observed_usage, reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1300,12 +1253,20 @@ Used for prompt assembly coverage. } #[test] - fn message_end_usage_updates_consume_pending_prompt_estimate_once() { + fn message_end_usage_updates_record_observed_usage_once_per_change() { + // Replaces the legacy `message_end_usage_updates_consume_pending_prompt_estimate_once` + // test. The new compression trigger (`Usage::context_size()`) does + // NOT depend on a pending prompt estimate — it just records the + // latest `Usage` into the shared `last_observed_usage` slot on + // every changed `MessageEnd` event. Sending the same event twice + // should still emit the `ThreadUsageUpdated` exactly once (the + // dedup happens via the `last_usage` comparison in + // `emit_usage_update_if_changed`). 
let (event_tx, mut event_rx) = mpsc::unbounded_channel(); let current_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let assistant = AssistantMessage::builder() .api(Api::OpenAICompletions) @@ -1315,14 +1276,13 @@ Used for prompt assembly coverage. .build() .expect("assistant message with usage"); - record_pending_prompt_estimate(&context_compression_state, 1_000); - handle_test_agent_event_with_context_state( - "run-usage-calibration", + handle_test_agent_event_with_observed_usage( + "run-usage-record", &event_tx, ¤t_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, &AgentEvent::MessageEnd { turn_index: 0, @@ -1330,13 +1290,13 @@ Used for prompt assembly coverage. message: AgentMessage::Assistant(assistant.clone()), }, ); - handle_test_agent_event_with_context_state( - "run-usage-calibration", + handle_test_agent_event_with_observed_usage( + "run-usage-record", &event_tx, ¤t_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, &AgentEvent::MessageEnd { turn_index: 0, @@ -1348,25 +1308,43 @@ Used for prompt assembly coverage. let usage_events = std::iter::from_fn(|| event_rx.try_recv().ok()) .filter(|event| matches!(event, ThreadStreamEvent::ThreadUsageUpdated { .. })) .count(); - let calibration = current_context_token_calibration(&context_compression_state); + let observed = last_observed_usage + .lock() + .expect("last_observed_usage mutex") + .clone(); + // The dedup still fires: identical MessageEnd ⇒ 1 emit, even + // though we wrote the same Usage into the trigger slot twice. 
assert_eq!(usage_events, 1); - assert_eq!(calibration.ratio_basis_points(), 15_000); - assert!(context_compression_state - .lock() - .expect("context compression state") - .pending_prompt_estimate - .is_none()); + let observed = observed.expect("observed usage should be recorded"); + assert_eq!(observed.input, 1_500); + assert_eq!(observed.output, 32); + // The unified context_size sums input + output = 1532. + assert_eq!(observed.context_size(), 1_532); } #[test] - fn usage_calibration_counts_cache_read_when_input_is_zero() { + fn usage_record_observed_includes_cache_read_for_context_size() { + // Replaces the legacy `usage_calibration_counts_cache_read_when_input_is_zero` + // test. The new unified `Usage::context_size()` already adds + // `cache_read` (and `cache_write`) to the context footprint, so + // the trigger sees the true "tokens in the context window" figure + // even when the wire-level `input` is 0 — a configuration that + // happens e.g. on Anthropic when the entire prompt was served + // from the prompt cache. We assert both: + // + // 1. The `last_observed_usage` slot is populated with the full + // `Usage` (including cache_read), so the next + // `set_transform_context` call can compare the unified + // `context_size()` against the budget. + // 2. `Usage::context_size()` adds `cache_read` to the total — + // input=0, output=32, cache_read=1500 ⇒ context_size=1532. 
let (event_tx, mut event_rx) = mpsc::unbounded_channel(); let current_message_id = StdMutex::new(None::); let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1385,15 +1363,14 @@ Used for prompt assembly coverage. .build() .expect("assistant message with cache-read usage"); - record_pending_prompt_estimate(&context_compression_state, 1_000); handle_agent_event( - "run-cache-read-calibration", + "run-cache-read-record", &event_tx, ¤t_message_id, &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1409,100 +1386,18 @@ Used for prompt assembly coverage. let usage_events = std::iter::from_fn(|| event_rx.try_recv().ok()) .filter(|event| matches!(event, ThreadStreamEvent::ThreadUsageUpdated { .. 
})) .count(); - let calibration = current_context_token_calibration(&context_compression_state); - - assert_eq!(usage_events, 1); - assert_eq!(calibration.ratio_basis_points(), 15_000); - assert!(context_compression_state + let observed = last_observed_usage .lock() - .expect("context compression state") - .pending_prompt_estimate - .is_none()); - } + .expect("last_observed_usage mutex") + .clone(); - #[test] - fn build_initial_context_token_calibration_seeds_from_matching_historical_run() { - let primary_model = sample_resolved_model_role("primary-model"); - let history_messages = vec![ - make_history_message("msg-1", "run-prev", "user", &"x".repeat(600)), - make_history_message("msg-2", "run-prev", "assistant", &"y".repeat(600)), - ]; - let history = convert_history_messages(&history_messages, &[], &primary_model.model); - let estimated_tokens = crate::core::context_compression::estimate_total_tokens(&history); - let run_summary = make_run_summary("primary-model", (estimated_tokens as u64) * 2); - - let calibration = build_initial_context_token_calibration( - Some(&run_summary), - &history_messages, - &[], - &primary_model, - "", - ); - - assert_eq!(calibration.ratio_basis_points(), 20_000); - assert_eq!( - calibration.apply_to_estimate(estimated_tokens), - estimated_tokens * 2 - ); - } - - #[test] - fn build_initial_context_token_calibration_counts_cache_read_tokens() { - let primary_model = sample_resolved_model_role("primary-model"); - let history_messages = vec![ - make_history_message("msg-1", "run-prev", "user", &"x".repeat(600)), - make_history_message("msg-2", "run-prev", "assistant", &"y".repeat(600)), - ]; - let history = convert_history_messages(&history_messages, &[], &primary_model.model); - let estimated_tokens = crate::core::context_compression::estimate_total_tokens(&history); - let run_summary = make_run_summary_with_cache( - "primary-model", - estimated_tokens as u64 / 2, - estimated_tokens as u64 * 3 / 2, - ); - - let calibration = 
build_initial_context_token_calibration( - Some(&run_summary), - &history_messages, - &[], - &primary_model, - "", - ); - - assert_eq!(calibration.ratio_basis_points(), 20_000); - assert_eq!( - calibration.apply_to_estimate(estimated_tokens), - estimated_tokens * 2 - ); - } - - #[test] - fn build_initial_context_token_calibration_ignores_mismatched_models_and_zero_usage() { - let primary_model = sample_resolved_model_role("primary-model"); - let history_messages = vec![make_history_message( - "msg-1", - "run-prev", - "user", - &"x".repeat(400), - )]; - - let mismatched = build_initial_context_token_calibration( - Some(&make_run_summary("other-model", 4_096)), - &history_messages, - &[], - &primary_model, - "", - ); - let zero_usage = build_initial_context_token_calibration( - Some(&make_run_summary("primary-model", 0)), - &history_messages, - &[], - &primary_model, - "", - ); - - assert_eq!(mismatched.ratio_basis_points(), 10_000); - assert_eq!(zero_usage.ratio_basis_points(), 10_000); + assert_eq!(usage_events, 1); + let observed = observed.expect("observed usage should be recorded"); + assert_eq!(observed.input, 0); + assert_eq!(observed.output, 32); + assert_eq!(observed.cache_read, 1_500); + // Unified context_size adds cache_read even when input is 0. + assert_eq!(observed.context_size(), 1_532); } #[test] @@ -1512,7 +1407,7 @@ Used for prompt assembly coverage. let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1524,7 +1419,7 @@ Used for prompt assembly coverage. 
&last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1558,7 +1453,7 @@ Used for prompt assembly coverage. let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1570,7 +1465,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1617,7 +1512,7 @@ Used for prompt assembly coverage. let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1640,7 +1535,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1657,7 +1552,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1674,7 +1569,7 @@ Used for prompt assembly coverage. 
&last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1706,7 +1601,7 @@ Used for prompt assembly coverage. let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1719,7 +1614,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1750,7 +1645,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1814,7 +1709,7 @@ Used for prompt assembly coverage. let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1829,7 +1724,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1861,7 +1756,7 @@ Used for prompt assembly coverage. 
let last_completed_message_id = StdMutex::new(None::); let current_reasoning_message_id = StdMutex::new(None::); let last_usage = StdMutex::new(None::); - let context_compression_state = StdMutex::new(ContextCompressionRuntimeState::default()); + let last_observed_usage = StdMutex::new(None::); let reasoning_buffer = StdMutex::new(String::new()); let current_turn_index = StdMutex::new(None::); let last_text_delta = StdMutex::new(None::); @@ -1884,7 +1779,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -1903,7 +1798,7 @@ Used for prompt assembly coverage. &last_completed_message_id, ¤t_reasoning_message_id, &last_usage, - &context_compression_state, + &last_observed_usage, &reasoning_buffer, ¤t_turn_index, &last_text_delta, @@ -4904,7 +4799,6 @@ Used for prompt assembly coverage. history_tool_calls: Vec::new(), model_plan: sample_resolved_runtime_model_plan(None), initial_prompt: None, - initial_context_calibration: Default::default(), cache_arbiter: None, }; let session = AgentSession::new( @@ -4979,7 +4873,6 @@ Used for prompt assembly coverage. 
history_tool_calls: Vec::new(), model_plan: sample_resolved_runtime_model_plan(None), initial_prompt: None, - initial_context_calibration: Default::default(), cache_arbiter: None, }; let session = AgentSession::new( diff --git a/src-tauri/src/core/agent_session_types.rs b/src-tauri/src/core/agent_session_types.rs index f689cd42..56e9be10 100644 --- a/src-tauri/src/core/agent_session_types.rs +++ b/src-tauri/src/core/agent_session_types.rs @@ -5,7 +5,6 @@ use tiycore::agent::AgentTool; use tiycore::thinking::ThinkingLevel; use tiycore::types::{Model, OpenAICompletionsCompat, Transport}; -use crate::core::context_compression::ContextTokenCalibration; use crate::core::prompt::CacheMarkerArbiter; use crate::model::provider::AgentProfileRecord; use crate::model::thread::{MessageRecord, ToolCallDto}; @@ -168,7 +167,6 @@ pub struct AgentSessionSpec { pub history_tool_calls: Vec, pub model_plan: ResolvedRuntimeModelPlan, pub initial_prompt: Option, - pub initial_context_calibration: ContextTokenCalibration, /// Global cache marker arbiter for the request lifecycle. /// Records system prompt markers and allocates message-layer quota. /// Must be reset after each LLM call (§ 3.7.1). diff --git a/src-tauri/src/core/context_compression.rs b/src-tauri/src/core/context_compression.rs index d63279a1..a0f7efcc 100644 --- a/src-tauri/src/core/context_compression.rs +++ b/src-tauri/src/core/context_compression.rs @@ -253,14 +253,27 @@ pub fn estimate_total_tokens(messages: &[AgentMessage]) -> u32 { messages.iter().map(estimate_message_tokens).sum() } -/// Apply an optional conservative calibration to a heuristic token estimate. -pub(crate) fn calibrate_total_tokens( - total_tokens: u32, - calibration: Option, -) -> u32 { - calibration - .unwrap_or_default() - .apply_to_estimate(total_tokens) +/// Trigger compression when the most recently observed unified context +/// occupancy (`tiycore::types::Usage::context_size()`) is over budget. 
+/// +/// This is the canonical "should we auto-compress right now?" predicate +/// used by the `set_transform_context` hook in `agent_session`. It takes +/// the last observed usage (the `Some(usage)` branch) and the configured +/// `CompressionSettings`, returning `true` iff the **cross-protocol +/// unified** context size exceeds `context_window - reserve_tokens`. +/// +/// The first time we see a thread we have no observed usage yet; the +/// closure passes `None` to defer the decision until the first LLM +/// response reports its `context_size`. In that case the function +/// returns `false` — never trigger on a missing observation. +pub fn should_compress_via_context_size( + last_usage: Option<&tiycore::types::Usage>, + settings: &CompressionSettings, +) -> bool { + match last_usage { + Some(usage) => usage.context_size() > u64::from(settings.budget()), + None => false, + } } /// Check whether a token total exceeds the compression input budget. @@ -271,25 +284,20 @@ pub(crate) fn should_compress_total_tokens( total_tokens > settings.budget() } -/// Check whether compression is needed for the given messages and settings, -/// applying an optional conservative calibration derived from real provider -/// `usage.input` samples. -pub fn should_compress_with_calibration( - messages: &[AgentMessage], - settings: &CompressionSettings, - calibration: Option, -) -> bool { +/// Check whether compression is needed for the given messages and settings. +/// +/// This is a heuristic estimator kept for direct callers (e.g. fallback +/// paths, debug tooling) and for the `compress_context_fallback` safety +/// net. The hot path in `set_transform_context` deliberately does **not** +/// use this — it uses [`should_compress_via_context_size`] so the trigger +/// reflects real provider-reported context occupancy +/// (`Usage::context_size()`) rather than a chars/4 estimate. 
+pub fn should_compress(messages: &[AgentMessage], settings: &CompressionSettings) -> bool { if messages.is_empty() { return false; } let total_tokens = estimate_total_tokens(messages); - let calibrated_total_tokens = calibrate_total_tokens(total_tokens, calibration); - should_compress_total_tokens(calibrated_total_tokens, settings) -} - -/// Check whether compression is needed for the given messages and settings. -pub fn should_compress(messages: &[AgentMessage], settings: &CompressionSettings) -> bool { - should_compress_with_calibration(messages, settings, None) + should_compress_total_tokens(total_tokens, settings) } /// Find the cut-point index: messages before this index are "old" (to be @@ -904,33 +912,89 @@ mod tests { } #[test] - fn should_compress_with_calibration_triggers_when_raw_estimate_is_under_budget() { - let mut messages = Vec::new(); - for i in 0..4 { - messages.push(make_user(&format!("Question {}: {}", i, "x".repeat(400)))); - messages.push(make_assistant(&format!( - "Answer {}: {}", - i, - "y".repeat(400) - ))); - } - + fn should_compress_via_context_size_triggers_when_last_usage_exceeds_budget() { + // The unified `context_size` (= input + output + cache_read + + // cache_write) is the canonical "context occupancy" source of + // truth from tiycore 0.2.10-rc.2. When the most recent LLM call + // reports a `context_size` over `context_window - reserve_tokens`, + // compression must trigger on the NEXT `set_transform_context` + // invocation. let settings = CompressionSettings { context_window: 4_000, reserve_tokens: 2_000, + // budget = 2_000 keep_recent_tokens: 500, }; - let raw_total = estimate_total_tokens(&messages); - assert!(raw_total < settings.budget()); - - let calibration = ContextTokenCalibration::from_observation(raw_total, 2_500) - .expect("non-zero observation should produce calibration"); + // 2_500 > 2_000 → trigger. 
+ let over_budget = tiycore::types::Usage { + input: 2_000, + output: 500, + cache_read: 0, + cache_write: 0, + total_tokens: 2_500, + cost: tiycore::types::UsageCost::default(), + }; + assert!(should_compress_via_context_size( + Some(&over_budget), + &settings, + )); - assert!(!should_compress(&messages, &settings)); - assert!(should_compress_with_calibration( - &messages, + // 1_500 ≤ 2_000 → pass-through. + let under_budget = tiycore::types::Usage { + input: 1_000, + output: 500, + cache_read: 0, + cache_write: 0, + total_tokens: 1_500, + cost: tiycore::types::UsageCost::default(), + }; + assert!(!should_compress_via_context_size( + Some(&under_budget), &settings, - Some(calibration), + )); + + // First request: no observed usage yet → pass-through. + assert!(!should_compress_via_context_size(None, &settings)); + } + + #[test] + fn should_compress_via_context_size_uses_cache_read_and_cache_write() { + // The unified context size includes cache_read and cache_write + // because those tokens still occupy the provider's context window. + // A wire-level `total_tokens` may exclude them on some providers + // (e.g. Anthropic) but the unified figure must add them back. + let settings = CompressionSettings { + context_window: 4_000, + reserve_tokens: 2_000, + keep_recent_tokens: 500, + }; + let usage = tiycore::types::Usage { + input: 800, + output: 200, + cache_read: 800, + cache_write: 200, + // Wire-level: input + output = 1000. With cache: 2000. + // The Anthropic wire-level total_tokens is sometimes reported + // as `input + output + cache_read + cache_write` already + // (= 2000 here), but the unified figure is always computed + // by the sum so it doesn't depend on the provider. + total_tokens: 1_000, + cost: tiycore::types::UsageCost::default(), + }; + // context_size = 800 + 200 + 800 + 200 = 2000 = budget → NOT over. + assert!(!should_compress_via_context_size(Some(&usage), &settings)); + // Just-above: 2001 > 2000. 
+ let just_above = tiycore::types::Usage { + input: 801, + output: 200, + cache_read: 800, + cache_write: 200, + total_tokens: 1_001, + cost: tiycore::types::UsageCost::default(), + }; + assert!(should_compress_via_context_size( + Some(&just_above), + &settings )); } diff --git a/src-tauri/src/model/thread.rs b/src-tauri/src/model/thread.rs index 4137fa27..14bfd690 100644 --- a/src-tauri/src/model/thread.rs +++ b/src-tauri/src/model/thread.rs @@ -296,6 +296,20 @@ impl From for MessageDto { // RunSummary — lightweight run info for snapshots // --------------------------------------------------------------------------- +/// Per-run token usage snapshot sent to the frontend. +/// +/// `total_tokens` carries the **wire-level** value reported by the provider +/// (e.g. OpenAI/Google: `prompt + completion`; Anthropic: `input + output`). +/// It is preserved unchanged for backwards compatibility with downstream +/// billing/reporting consumers that need the wire-level total. +/// +/// `context_size` is the **cross-protocol unified** "context occupancy" +/// value, derived from `tiycore::types::Usage::context_size()` = +/// `input + output + cache_read + cache_write`. Frontend code that asks +/// "how much of the context window are we using?" MUST read `context_size`, +/// not `total_tokens` — the latter no longer represents occupancy once we +/// accumulate usage across turns (the wire-level field is per-response, not +/// cumulative). #[derive(Debug, Clone, Default, Serialize)] #[serde(rename_all = "camelCase")] pub struct RunUsageDto { @@ -303,7 +317,12 @@ pub struct RunUsageDto { pub output_tokens: u64, pub cache_read_tokens: u64, pub cache_write_tokens: u64, + /// Wire-level total reported by the provider (per-response, not cumulative). pub total_tokens: u64, + /// Cross-protocol unified context occupancy: + /// `input + output + cache_read + cache_write`. Use this for any + /// "context used" display or trigger logic. 
+ pub context_size: u64, } impl From for RunUsageDto { @@ -314,6 +333,7 @@ impl From for RunUsageDto { cache_read_tokens: value.cache_read, cache_write_tokens: value.cache_write, total_tokens: value.total_tokens, + context_size: value.context_size(), } } } diff --git a/src-tauri/src/persistence/repo/run_helper_repo.rs b/src-tauri/src/persistence/repo/run_helper_repo.rs index 4fdcceeb..b7d69127 100644 --- a/src-tauri/src/persistence/repo/run_helper_repo.rs +++ b/src-tauri/src/persistence/repo/run_helper_repo.rs @@ -46,6 +46,15 @@ impl RunHelperRow { cache_read_tokens: self.cache_read_tokens.max(0) as u64, cache_write_tokens: self.cache_write_tokens.max(0) as u64, total_tokens: self.total_tokens.max(0) as u64, + // Reconstruct the cross-protocol unified context size + // from the persisted per-bucket fields, clamping each at 0 + // like the fields above so the sum matches them. The DB + // schema doesn't store `context_size`; we derive it on read. + context_size: (self.input_tokens.max(0) + + self.output_tokens.max(0) + + self.cache_read_tokens.max(0) + + self.cache_write_tokens.max(0)) + as u64, }, } } diff --git a/src-tauri/src/persistence/repo/run_repo.rs b/src-tauri/src/persistence/repo/run_repo.rs index ad783f65..52c9782b 100644 --- a/src-tauri/src/persistence/repo/run_repo.rs +++ b/src-tauri/src/persistence/repo/run_repo.rs @@ -364,6 +364,17 @@ fn map_run_summary(row: RunRow) -> RunSummaryDto { cache_read_tokens: row.cache_read_tokens.max(0) as u64, cache_write_tokens: row.cache_write_tokens.max(0) as u64, total_tokens: row.total_tokens.max(0) as u64, + // Derive the cross-protocol unified context size on read — + // see the doc-comment in the `RunUsageDto` declaration. + // The DB schema stores the four per-bucket token columns; + // `context_size` is a derived projection that we re-emit + // on every read so the frontend always has a unified + // "context occupancy" figure to display.
+ context_size: (row.input_tokens.max(0) + + row.output_tokens.max(0) + + row.cache_read_tokens.max(0) + + row.cache_write_tokens.max(0)) + as u64, }, } } diff --git a/src-tauri/tests/frontend_integration.rs b/src-tauri/tests/frontend_integration.rs index f24ba175..966c9b3b 100644 --- a/src-tauri/tests/frontend_integration.rs +++ b/src-tauri/tests/frontend_integration.rs @@ -558,6 +558,10 @@ fn test_all_events_have_type_field() { cache_read_tokens: 0, cache_write_tokens: 0, total_tokens: 22, + // Mirrors `Usage::context_size()` from tiycore 0.2.10-rc.2 + // (= input + output + cache_read + cache_write = 10 + 12 + // + 0 + 0 = 22). + context_size: 22, }, }, ThreadStreamEvent::RunCheckpointed { run_id: "r".into() }, diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index f87adfc6..8089e08f 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -20,6 +20,14 @@ export type ThreadContextUsage = { outputTokens: number; cacheReadTokens: number; cacheWriteTokens: number; + /** + * Cross-protocol unified context occupancy + * (`input + output + cache_read + cache_write`). The badge uses this for + * percentage and "exceeded" detection. `totalTokens` is retained for + * wire-level reporting but is no longer used as the "used" figure. + */ + contextSize: number; + /** Wire-level total reported by the provider. Preserved for parity with the backend DTO. */ totalTokens: number; modelDisplayName: string | null; runId: string; diff --git a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts index 9dfc20f1..30996970 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts @@ -138,18 +138,31 @@ export function buildThreadContextBadgeData(options: { const contextWindow = parseTokenCount(options.fallbackContextWindow) ??
parseTokenCount(options.runtimeUsage?.contextWindow); - const totalTokens = options.runtimeUsage?.totalTokens ?? 0; + // Use the cross-protocol unified `contextSize` (= input + output + + // cache_read + cache_write) as the "context occupancy" figure for the + // badge. This is what tiycore 0.2.10-rc.2 exposes via + // `Usage::context_size()` and works consistently across OpenAI / + // Anthropic / Google. `totalTokens` is intentionally NOT used here — it + // is the wire-level per-response total and is provider-dependent + // (OpenAI/Google: prompt+completion; Anthropic: input+output+cache). + const contextSize = options.runtimeUsage?.contextSize ?? 0; const inputTokens = options.runtimeUsage?.inputTokens ?? 0; const outputTokens = options.runtimeUsage?.outputTokens ?? 0; const cacheReadTokens = options.runtimeUsage?.cacheReadTokens ?? 0; const cacheWriteTokens = options.runtimeUsage?.cacheWriteTokens ?? 0; + const totalTokens = options.runtimeUsage?.totalTokens ?? 0; + // Anthropic / ZenMux(Anthropic) report cache reads as a separate bucket, but + // they still count against the prompt context window and the provider's input + // billing. Surface the combined "input" figure (raw input + cache hits) so + // the header's `In … · Out …` numbers match the `used / total` total above. + const effectiveInputTokens = inputTokens + cacheReadTokens; const rawUsedPercent = contextWindow && contextWindow > 0 - ? Math.round((totalTokens / contextWindow) * 100) + ? Math.round((contextSize / contextWindow) * 100) : 0; const usageRatio = contextWindow && contextWindow > 0 - ? Math.min(totalTokens / contextWindow, 1) + ? 
Math.min(contextSize / contextWindow, 1) : 0; const usedPercent = contextWindow && contextWindow > 0 @@ -157,7 +170,7 @@ export function buildThreadContextBadgeData(options: { : 0; const leftPercent = Math.max(0, 100 - rawUsedPercent); const isExceeded = Boolean( - contextWindow && contextWindow > 0 && totalTokens > contextWindow, + contextWindow && contextWindow > 0 && contextSize > contextWindow, ); return { @@ -166,6 +179,7 @@ export function buildThreadContextBadgeData(options: { outputTokens, cacheReadTokens, cacheWriteTokens, + effectiveInputTokens, isExceeded, leftPercent, modelDisplayName: @@ -174,8 +188,11 @@ export function buildThreadContextBadgeData(options: { null, rawUsedPercent, totalTokens, + // New: expose the source of truth for the percentage so consumers can + // label the figure precisely. + contextSize, usageRatio, - usedLabel: formatCompactTokenCount(totalTokens), + usedLabel: formatCompactTokenCount(contextSize), totalLabel: contextWindow ? formatCompactTokenCount(contextWindow) : "N/A", usedPercent, }; diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts index d3a157c5..e09f9413 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts @@ -58,6 +58,12 @@ describe("buildThreadContextBadgeData", () => { modelDisplayName: "Old Runtime Model", outputTokens: 300, runId: "run-1", + // 1200 (input) + 300 (output) + 10 (cache_read) + 5 (cache_write) = 1515. + // Mirrors `Usage::context_size()` from tiycore 0.2.10-rc.2 + // (= input + output + cache_read + cache_write). Tests can override + // contextSize / totalTokens independently to assert the cross-protocol + // unified semantics. 
+ contextSize: 1_515, totalTokens: 1_500, ...overrides, }; @@ -75,6 +81,8 @@ describe("buildThreadContextBadgeData", () => { expect(badge.contextWindow).toBe(16_000); expect(badge.modelDisplayName).toBe("Selected Model"); + // contextSize is the new "used" figure, distinct from totalTokens. + expect(badge.contextSize).toBe(1_515); expect(badge.totalTokens).toBe(1_500); expect(badge.isExceeded).toBe(false); }); @@ -94,11 +102,19 @@ describe("buildThreadContextBadgeData", () => { expect(badge.totalLabel).toBe("32K"); }); - it("marks usage as exceeded when used tokens are over the current context window", () => { + it("uses contextSize (not totalTokens) for the percentage when contextSize is larger", () => { + // The cross-protocol unified `contextSize` is the badge's "used" figure. + // Even when the wire-level `totalTokens` is below the context window, + // `contextSize` above the window should mark the badge as exceeded. + // This mirrors Anthropic: total_tokens (wire) excludes cache_read, but + // `context_size` adds it back. const badge = buildThreadContextBadgeData({ fallbackContextWindow: "1000", fallbackModelDisplayName: "Small Model", - runtimeUsage: makeRuntimeUsage({ totalTokens: 1_250 }), + runtimeUsage: makeRuntimeUsage({ + contextSize: 1_250, // exceeds 1000 + totalTokens: 900, // under 1000 (wire-level) + }), }); expect(badge.isExceeded).toBe(true); @@ -108,6 +124,29 @@ describe("buildThreadContextBadgeData", () => { expect(badge.usageRatio).toBe(1); }); + it("uses contextSize as the percentage source when it diverges from totalTokens", () => { + // When wire-level `totalTokens` exceeds the window but the unified + // `contextSize` does not, the badge reflects the unified value + // (the new "context occupancy" source of truth from + // `Usage::context_size()`). Wire-level `totalTokens` is retained on + // the DTO for downstream reporting; it is NOT used for the badge + // percentage anymore. 
+ const badge = buildThreadContextBadgeData({ + fallbackContextWindow: "1000", + fallbackModelDisplayName: "Small Model", + runtimeUsage: makeRuntimeUsage({ + contextSize: 800, // under 1000 + totalTokens: 1_250, // over 1000 (wire-level) + }), + }); + + expect(badge.isExceeded).toBe(false); + expect(badge.rawUsedPercent).toBe(80); + // The DTO still carries the wire-level total for consumers that + // want it; the badge just doesn't use it for percentages. + expect(badge.totalTokens).toBe(1_250); + }); + it("does not exceed when no valid context window is available", () => { const badge = buildThreadContextBadgeData({ fallbackContextWindow: null, diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.tsx b/src/modules/workbench-shell/ui/dashboard-workbench.tsx index f2dd227c..8173430e 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.tsx +++ b/src/modules/workbench-shell/ui/dashboard-workbench.tsx @@ -1097,7 +1097,7 @@ const drawerWidth = useStore(uiLayoutStore, (s) => s.drawerWidth);

In{" "} {formatCompactTokenCount( - contextBadge.inputTokens, + contextBadge.effectiveInputTokens, )}{" "} · Out{" "} {formatCompactTokenCount( diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface-state.ts b/src/modules/workbench-shell/ui/runtime-thread-surface-state.ts index dab0e8be..dc8a284b 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface-state.ts +++ b/src/modules/workbench-shell/ui/runtime-thread-surface-state.ts @@ -517,12 +517,27 @@ export function mapRunSummaryToContextUsage(run: RunSummaryDto | null) { return null; } + // Prefer the cross-protocol unified `contextSize` from tiycore + // 0.2.10-rc.2 (set in the Rust DTO via `Usage::context_size()` = + // input + output + cache_read + cache_write). Fall back to that sum + // when the field is missing — e.g. older persisted snapshots + // written before the upgrade. + const explicitContextSize = run.usage.contextSize ?? 0; + const contextSize = + explicitContextSize > 0 + ? explicitContextSize + : run.usage.inputTokens + + run.usage.outputTokens + + run.usage.cacheReadTokens + + run.usage.cacheWriteTokens; + return { contextWindow: run.contextWindow, inputTokens: run.usage.inputTokens, outputTokens: run.usage.outputTokens, cacheReadTokens: run.usage.cacheReadTokens, cacheWriteTokens: run.usage.cacheWriteTokens, + contextSize, totalTokens: run.usage.totalTokens, modelDisplayName: run.modelDisplayName, runId: run.id, diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface.test.tsx b/src/modules/workbench-shell/ui/runtime-thread-surface.test.tsx index d2370961..1fd5f8fb 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface.test.tsx +++ b/src/modules/workbench-shell/ui/runtime-thread-surface.test.tsx @@ -57,6 +57,10 @@ function makeSnapshot(activeStatus: RunStatus | null): ThreadSnapshotDto { cacheReadTokens: 0, cacheWriteTokens: 0, totalTokens: 0, + // Cross-protocol unified context occupancy, defaulting to 0 + // for an empty/seed snapshot. 
tiycore 0.2.10-rc.2 derives this + // as input + output + cache_read + cache_write. + contextSize: 0, }, } : null, diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx index cb357f46..b53b4eb9 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx +++ b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx @@ -1157,6 +1157,17 @@ export function RuntimeThreadSurface({ outputTokens: event.usage.outputTokens, cacheReadTokens: event.usage.cacheReadTokens, cacheWriteTokens: event.usage.cacheWriteTokens, + // Prefer the cross-protocol unified `contextSize` from + // tiycore 0.2.10-rc.2 (= input + output + cache_read + + // cache_write). Fall back to that sum when the field is missing + // (older payloads or hand-crafted events). + contextSize: + event.usage.contextSize > 0 + ? event.usage.contextSize + : event.usage.inputTokens + + event.usage.outputTokens + + event.usage.cacheReadTokens + + event.usage.cacheWriteTokens, totalTokens: event.usage.totalTokens, modelDisplayName: event.modelDisplayName, runId: event.runId, diff --git a/src/modules/workbench-shell/ui/workbench-top-bar.tsx b/src/modules/workbench-shell/ui/workbench-top-bar.tsx index 92a00140..b5b8c7ff 100644 --- a/src/modules/workbench-shell/ui/workbench-top-bar.tsx +++ b/src/modules/workbench-shell/ui/workbench-top-bar.tsx @@ -138,6 +138,12 @@ export function WorkbenchTopBar({ return (

+ {isMacOS ? ( +
+ ) : null}
diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index 9e1c17ac..9159bc83 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -178,40 +178,41 @@ function readActivity( function readUsage(event: RawThreadStreamEvent): RunUsageDto { const value = readValue(event, "usage", "usage") as Record | null | undefined; + const inputTokens = readUsageField(value, "inputTokens", "input_tokens"); + const outputTokens = readUsageField(value, "outputTokens", "output_tokens"); + const cacheReadTokens = readUsageField(value, "cacheReadTokens", "cache_read_tokens"); + const cacheWriteTokens = readUsageField(value, "cacheWriteTokens", "cache_write_tokens"); + const totalTokens = readUsageField(value, "totalTokens", "total_tokens"); + // Prefer the unified `context_size` field sent by tiycore >= 0.2.10-rc.2 + // (it sums input + output + cache_read + cache_write consistently across + // OpenAI / Anthropic / Google). Fall back to that sum when the field is + // missing — e.g. older persisted snapshots or partial hand-crafted events. + const explicitContextSize = readUsageField(value, "contextSize", "context_size"); + const contextSize = + explicitContextSize > 0 + ? explicitContextSize + : inputTokens + outputTokens + cacheReadTokens + cacheWriteTokens; return { - inputTokens: - typeof value?.inputTokens === "number" - ? value.inputTokens - : typeof value?.input_tokens === "number" - ? value.input_tokens - : 0, - outputTokens: - typeof value?.outputTokens === "number" - ? value.outputTokens - : typeof value?.output_tokens === "number" - ? value.output_tokens - : 0, - cacheReadTokens: - typeof value?.cacheReadTokens === "number" - ? value.cacheReadTokens - : typeof value?.cache_read_tokens === "number" - ? value.cache_read_tokens - : 0, - cacheWriteTokens: - typeof value?.cacheWriteTokens === "number" - ? value.cacheWriteTokens - : typeof value?.cache_write_tokens === "number" - ? 
value.cache_write_tokens - : 0, - totalTokens: - typeof value?.totalTokens === "number" - ? value.totalTokens - : typeof value?.total_tokens === "number" - ? value.total_tokens - : 0, + inputTokens, + outputTokens, + cacheReadTokens, + cacheWriteTokens, + totalTokens, + contextSize, }; } +function readUsageField( + value: Record | null | undefined, + camelKey: string, + snakeKey: string, +): number { + if (!value) return 0; + if (typeof value[camelKey] === "number") return value[camelKey]; + if (typeof value[snakeKey] === "number") return value[snakeKey]; + return 0; +} + function readRuntimeQueueSnapshot(value: unknown): RuntimeQueueSnapshotDto { const fallbackId = () => Math.random().toString(36).slice(2); const raw = value && typeof value === "object" ? value as Record : {}; diff --git a/src/services/thread-stream/thread-stream.test.ts b/src/services/thread-stream/thread-stream.test.ts index 1b9a2140..51aa3a59 100644 --- a/src/services/thread-stream/thread-stream.test.ts +++ b/src/services/thread-stream/thread-stream.test.ts @@ -40,6 +40,9 @@ const usage = { cacheReadTokens: 3, cacheWriteTokens: 4, totalTokens: 10, + // Cross-protocol unified context occupancy from tiycore 0.2.10-rc.2: + // input + output + cache_read + cache_write = 1 + 2 + 3 + 4 = 10. + contextSize: 10, }; const helperSnapshot = { diff --git a/src/shared/types/api.ts b/src/shared/types/api.ts index 754934f1..1f9056a9 100644 --- a/src/shared/types/api.ts +++ b/src/shared/types/api.ts @@ -382,7 +382,20 @@ export interface RunUsageDto { outputTokens: number; cacheReadTokens: number; cacheWriteTokens: number; + /** + * Wire-level total reported by the provider. This is **per-response** and + * is NOT a reliable "context occupancy" value (different providers sum + * different buckets). Prefer {@link contextSize} for any "context used" + * display or trigger logic. 
+ */ totalTokens: number; + /** + * Cross-protocol unified context occupancy, derived from + * `tiycore::types::Usage::context_size()` = + * `input + output + cache_read + cache_write`. Use this for the context + * badge, percentage calculations, and compression triggers. + */ + contextSize: number; } export interface ToolCallDto { From afd221e106edbbe21f9a9785a22efa2daa831f91 Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 16:56:28 +0800 Subject: [PATCH 12/16] =?UTF-8?q?refactor(goal):=20=E2=99=BB=EF=B8=8F=20ce?= =?UTF-8?q?ntralize=20status=20transitions=20to=20explicit=20commands=20an?= =?UTF-8?q?d=20Judge=20verdicts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src-tauri/src/core/goal_manager.rs | 143 ++++++++--------------------- src-tauri/src/model/goal.rs | 11 --- src-tauri/tests/goal_lifecycle.rs | 133 ++++++++++++++++----------- 3 files changed, 117 insertions(+), 170 deletions(-) diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index f3d3ff2b..88e6239e 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -255,34 +255,6 @@ impl GoalManager { goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await } - // ── Auto-resume ── - - /// Check if a paused goal should auto-resume when the user sends a new message. - /// Returns Some(()) if the goal was auto-resumed, None if it shouldn't. - pub async fn try_auto_resume(&self) -> Result { - let goal = match self.get_active().await? 
{ - Some(g) => g, - None => return Ok(false), - }; - - if goal.status != GoalStatus::Paused { - return Ok(false); - } - - let should_resume = goal - .pause_reason - .as_ref() - .map(|r| r.auto_resume_on_user_message()) - .unwrap_or(false); - - if should_resume { - goal_repo::update_status(&self.pool, &goal.id, GoalStatus::Active, None, None, None) - .await?; - } - - Ok(should_resume) - } - // ── Evaluation ── /// Evaluate whether the goal should continue, pause, or complete after a turn. @@ -307,27 +279,12 @@ impl GoalManager { return verdict; } - // Completion claim without tool call + // Completion claim without tool call — keep nudging agent toward + // Judge verification; no DB status change. The counter is still + // cleared on tool activity / resume / clear, but reaching 3 no + // longer auto-pauses (status transitions are reserved for user + // commands and Judge verdicts). if self.detect_completion_claim(response) { - let should_pause = { - let mut guard = self.lock_runtime(); - let count = guard - .completion_claim_count - .entry(self.thread_id.clone()) - .or_default(); - *count += 1; - *count >= 3 - }; - if should_pause { - // Reset counter before pausing - self.lock_runtime() - .completion_claim_count - .remove(&self.thread_id); - return GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()), - }; - } return GoalVerdict::ChallengeEvidence; } @@ -342,13 +299,12 @@ impl GoalManager { // Reset idle counters since tools were called self.reset_idle_counters(); - // ── Layer 4: Budget checks ── - if let Some(budget) = goal.token_budget { - if goal.tokens_used >= budget { - return GoalVerdict::BudgetLimited; - } - } - + // ── Layer 4: Turn budget + token budget checks ── + // `turns_used >= max_turns` auto-pauses (explicitly approved path). 
+ // `tokens_used >= token_budget` is reported via the `budget_limited` + // verdict string (no DB status change) so the run loop can stop + // continuation. Status transitions are reserved for explicit user + // commands and Judge verdicts. if goal.turns_used >= goal.max_turns { return GoalVerdict::Paused { reason: PauseReason::BudgetExhausted, @@ -359,6 +315,12 @@ impl GoalManager { }; } + if let Some(budget) = goal.token_budget { + if goal.tokens_used >= budget { + return GoalVerdict::BudgetLimited; + } + } + // ── Default: continue ── GoalVerdict::Continue } @@ -370,35 +332,23 @@ impl GoalManager { tool_calls: &[String], _response: &str, ) -> Option { - for tool_name in tool_calls { - match tool_name.as_str() { - "clarify" => { - return Some(GoalVerdict::Paused { - reason: PauseReason::ClarifyPending, - detail: Some("agent requested clarification".into()), - }); - } - "update_plan" => { - return Some(GoalVerdict::Paused { - reason: PauseReason::PlanPending, - detail: Some("agent published a plan, awaiting approval".into()), - }); - } - // agent_judge is the main-agent-only acceptance request. It is - // handled by the tool execution pipeline (execute_judge_tool), - // which runs the Judge and records the verdict. Evaluation must - // not treat it as a blocking tool — like any tool call it shows - // the agent acted and should reset idle tendencies. - _ => {} - } - } + // Tool-based auto-pausing has been removed: status transitions are + // reserved for explicit user commands and Judge verdicts. `clarify` + // and `update_plan` no longer flip the goal to paused; they fall + // through to the continuation path. `agent_judge` is the main-agent + // acceptance request and is handled by the tool execution pipeline + // (execute_judge_tool), which runs the Judge and records the + // verdict — it is never a blocking tool here. 
+ let _ = tool_calls; None } fn detect_idle_block(&self, response: &str) -> Option { let idle_count = self.increment_idle_count(); - let trimmed = response.trim().to_lowercase(); - + // Heuristic question detection has been removed; status transitions + // are reserved for explicit user commands and Judge verdicts, so idle + // detection is purely a turn-count trigger. `response` is unused. + let _ = response; if idle_count >= MAX_IDLE_TURNS { return Some(GoalVerdict::Paused { reason: PauseReason::IdleBlocked, @@ -407,31 +357,6 @@ impl GoalManager { )), }); } - - // Lightweight heuristic: short question-like response + no tools - if idle_count >= 2 { - let blockers = [ - "should i", - "do you want", - "would you like", - "请确认", - "需要你决定", - "which approach", - "which option", - "can you confirm", - "let me know if", - "before i proceed", - "你的选择是", - "你确认吗", - "需要你同意", - ]; - if trimmed.len() < 500 && blockers.iter().any(|b| trimmed.contains(b)) { - return Some(GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - detail: Some("agent appears blocked, may need user input".into()), - }); - } - } None } @@ -599,7 +524,15 @@ impl GoalManager { .await?; } GoalVerdict::BudgetLimited => { - self.mark_budget_limited(¤t.id).await?; + // Advisory: token budget exhausted — do NOT write to DB. + // The verdict string still propagates as "budget_limited" so + // the run loop can stop continuation. Goal status remains + // `active` and is only changed by explicit user commands + // (`/goal budget-limit`) or Judge verdicts. + tracing::info!( + goal_id = %current.id, + "token budget exhausted: emitting budget_limited verdict without DB status change" + ); } } diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index 62f129d2..ddc59393 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -80,17 +80,6 @@ impl PauseReason { } } } - - /// Whether the goal should auto-resume when the user sends a new message. 
- pub fn auto_resume_on_user_message(&self) -> bool { - matches!( - self, - PauseReason::ClarifyPending - | PauseReason::PlanPending - | PauseReason::IdleBlocked - | PauseReason::Interrupted - ) - } } /// Verdict from the post-turn evaluation. diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 157d7704..42925c91 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -165,7 +165,11 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_clarify_triggers_pause() { + async fn evaluate_after_turn_clarify_no_longer_pauses() { + // Tool-based auto-pausing has been removed: `clarify` no longer + // returns a `Paused(ClarifyPending)` verdict. Status transitions are + // reserved for explicit user commands and Judge verdicts, so the + // evaluate path falls through to `Continue`. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -174,17 +178,22 @@ mod tests { mgr.record_tool_call("clarify"); let verdict = mgr.evaluate_after_turn("What do you think?", &goal); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::ClarifyPending, - .. - } - )); + assert!( + matches!(verdict, GoalVerdict::Continue), + "clarify should no longer pause the goal; got {verdict:?}" + ); + + // DB status must remain active — no pause was written. + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] - async fn evaluate_after_turn_update_plan_triggers_pause() { + async fn evaluate_after_turn_update_plan_no_longer_pauses() { + // Tool-based auto-pausing has been removed: `update_plan` no longer + // returns a `Paused(PlanPending)` verdict. The plan tool's approval + // flow is handled outside the goal manager now. 
let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -192,13 +201,14 @@ mod tests { mgr.record_tool_call("update_plan"); let verdict = mgr.evaluate_after_turn("Here is the plan", &goal); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::PlanPending, - .. - } - )); + assert!( + matches!(verdict, GoalVerdict::Continue), + "update_plan should no longer pause the goal; got {verdict:?}" + ); + + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] @@ -283,6 +293,9 @@ mod tests { #[tokio::test] async fn auto_resume_clarify_pending() { + // Auto-resume on user message has been removed. A paused goal — even + // one paused for a `ClarifyPending` reason — must stay paused until + // an explicit `resume()` is issued. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -291,15 +304,21 @@ mod tests { .await .unwrap(); - let resumed = mgr.try_auto_resume().await.unwrap(); - assert!(resumed, "ClarifyPending should auto-resume"); + // No auto-resume path exists; status stays paused. + let paused = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(paused.status, GoalStatus::Paused); + // Explicit resume still works. + mgr.resume(&goal.id).await.unwrap(); let active = mgr.get_active().await.unwrap().unwrap(); assert_eq!(active.status, GoalStatus::Active); } #[tokio::test] async fn auto_resume_skips_user_requested() { + // Auto-resume on user message has been removed. A `UserRequested` + // pause is therefore equivalent to every other pause from the + // auto-resume perspective: only explicit `resume()` will reopen it. 
let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -308,9 +327,6 @@ mod tests { .await .unwrap(); - let resumed = mgr.try_auto_resume().await.unwrap(); - assert!(!resumed, "UserRequested should NOT auto-resume"); - let paused = mgr.get_active().await.unwrap().unwrap(); assert_eq!(paused.status, GoalStatus::Paused); } @@ -379,32 +395,33 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_completion_claim_thrice_pauses() { + async fn evaluate_after_turn_completion_claim_keeps_challenging() { + // Repeated self-claimed completion no longer auto-pauses. The + // challenge prompt keeps nudging the agent toward `agent_judge`; the + // DB status remains `active` until a Judge verdict lands. + // + // The independent `MAX_IDLE_TURNS` path still pauses after three + // consecutive tool-less turns, so we exercise only two tool-less + // claim turns — that is enough to confirm the completion-claim + // branch returns `ChallengeEvidence` (and not a `Paused(IdleBlocked)` + // triggered by the former three-claim counter). 
let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - // First claim: challenge only - let v1 = mgr.evaluate_after_turn("All done!", &goal); - assert!(matches!(v1, GoalVerdict::ChallengeEvidence)); - - let fresh1 = mgr.get_active().await.unwrap().unwrap(); - - // Second claim: challenge only - let v2 = mgr.evaluate_after_turn("Everything is complete!", &fresh1); - assert!(matches!(v2, GoalVerdict::ChallengeEvidence)); + mgr.create_goal("Test goal", None).await.unwrap(); - let fresh2 = mgr.get_active().await.unwrap().unwrap(); + for claim in ["All done!", "Everything is complete!"] { + let fresh = mgr.get_active().await.unwrap().unwrap(); + let verdict = mgr.evaluate_after_turn(claim, &fresh); + assert!( + matches!(verdict, GoalVerdict::ChallengeEvidence), + "completion claim `{claim}` should keep producing ChallengeEvidence; got {verdict:?}" + ); + } - // Third claim: should pause (IdleBlocked) - let v3 = mgr.evaluate_after_turn("Finished everything!", &fresh2); - assert!(matches!( - v3, - GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - .. - } - )); + // No pause was ever written to the DB. + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] @@ -538,23 +555,31 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_chinese_idle_phrase_pauses() { + async fn evaluate_after_turn_chinese_idle_phrase_no_longer_pauses() { + // Heuristic question-phrase detection has been removed. Short + // Chinese question-like responses must not flip the goal to paused; + // status transitions are reserved for explicit user commands and + // Judge verdicts. The independent `MAX_IDLE_TURNS` path still + // pauses after three consecutive tool-less turns, but the heuristic + // branch is gone. 
let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); + mgr.create_goal("Test goal", None).await.unwrap(); - // One idle turn first, then short Chinese question-like response - mgr.evaluate_after_turn("随便聊聊", &goal); + // Reset the idle counter so MAX_IDLE_TURNS does not fire on the + // single-tool-less turn we care about. + mgr.record_tool_call("read"); let fresh = mgr.get_active().await.unwrap().unwrap(); let verdict = mgr.evaluate_after_turn("请确认这个方案是否可以?", &fresh); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - .. - } - )); + assert!( + !matches!(verdict, GoalVerdict::Paused { .. }), + "heuristic Chinese idle phrase should no longer pause the goal; got {verdict:?}" + ); + + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] From f80d652a628345d775b37399587dca36cf0ea3df Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 18:20:13 +0800 Subject: [PATCH 13/16] =?UTF-8?q?fix(agent):=20=F0=9F=90=9B=20fix=20timest?= =?UTF-8?q?amp=20slicing=20panic=20and=20add=20has=5Fprocess=5Frequirement?= =?UTF-8?q?s=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace byte-index slicing with char-aware truncation to prevent panics on multi-byte UTF-8 boundaries in timestamp formatting. Add unit tests for `has_process_requirements()` covering English and CJK keywords, substring match behaviour, edge cases, and case-insensitive matching. 
--- src-tauri/src/core/agent_session_execution.rs | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index 1b37be73..993eb0ab 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -1998,7 +1998,11 @@ async fn build_process_compliance_summary(pool: &sqlx::SqlitePool, thread_id: &s "{}. `{}` called at {} (status: {})\n Scope: {}\n", i + 1, review.helper_kind, - &review.started_at[..review.started_at.len().min(19)], + // Truncate to first 19 chars (RFC3339 timestamp prefix) using + // char-aware slicing to avoid panicking on multi-byte UTF-8 + // boundaries — mirrors the 200-char limit used on + // `input_preview` above. + review.started_at.chars().take(19).collect::(), status_label, input_preview, )); @@ -2316,3 +2320,79 @@ mod tests { } } } + +#[cfg(test)] +mod has_process_requirements_tests { + use super::has_process_requirements; + + #[test] + fn detects_english_keywords() { + for objective in [ + "Each phase needs a code review before merge.", + "Verify every change against the spec.", + "Run a per phase smoke test.", + ] { + assert!( + has_process_requirements(objective), + "expected keyword match in: {objective}" + ); + } + } + + #[test] + fn detects_cjk_keywords() { + for objective in [ + "每个阶段都需要验收", + "每一阶段检查通过", + "需要你每轮 review", + "完成所有阶段完成的任务", + ] { + assert!( + has_process_requirements(objective), + "expected CJK keyword match in: {objective}" + ); + } + } + + #[test] + fn records_substring_match_semantics() { + // The implementation is a plain case-insensitive substring match + // over a fixed keyword list. These cases pin down the current + // behaviour, including the substring-match quirk where "review" + // hits inside "preview". They are not assertions about an ideal + // matcher — they are regression guards against accidental keyword + // list changes. 
If the keyword list is later tightened, update + // this test alongside it. + assert!( + has_process_requirements("Preview the rendered HTML before shipping."), + "current implementation matches 'review' inside 'preview' (substring match)" + ); + assert!( + !has_process_requirements("Forward-looking design without explicit verify step."), + "no keyword substring present" + ); + // "审阅" (look over) is intentionally NOT a keyword — only + // "验收" (formal acceptance) and "检查" (check) are, so this + // should be rejected. + assert!( + !has_process_requirements("请仔细审阅代码风格。"), + "审阅 does not contain any current keyword" + ); + assert!( + !has_process_requirements("Survey users about preferences."), + "no keyword substring present" + ); + } + + #[test] + fn empty_and_whitespace_objectives_return_false() { + assert!(!has_process_requirements("")); + assert!(!has_process_requirements(" \n\t ")); + } + + #[test] + fn keyword_match_is_case_insensitive() { + assert!(has_process_requirements("Final REVIEW before release.")); + assert!(has_process_requirements("Need a Verify Each phase step.")); + } +} From 0cca8854ced3bf887b3cc453ee264a38aa97b0de Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 19:31:33 +0800 Subject: [PATCH 14/16] =?UTF-8?q?feat(compression):=20=E2=9C=A8=20reserve?= =?UTF-8?q?=2020%=20context=20window=20for=20auto-compression=20trigger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend: replace fixed 16,384 token reserve with 20% of model context window (min floor 16,384). Small-window models keep the floor; GPT-4o class windows reserve ~25.6K, Claude-class ~40K, 1M-window ~200K. Frontend: add dashed threshold marker at 80% position in the thread header context pill so users can see when auto-compression will fire. 
--- src-tauri/src/core/context_compression.rs | 71 +++++++++++++++++-- .../ui/dashboard-workbench-logic.ts | 7 ++ .../ui/dashboard-workbench.test.ts | 14 ++++ .../ui/dashboard-workbench.tsx | 19 +++++ 4 files changed, 107 insertions(+), 4 deletions(-) diff --git a/src-tauri/src/core/context_compression.rs b/src-tauri/src/core/context_compression.rs index a0f7efcc..47c927eb 100644 --- a/src-tauri/src/core/context_compression.rs +++ b/src-tauri/src/core/context_compression.rs @@ -21,9 +21,16 @@ use tiycore::agent::AgentMessage; use tiycore::types::{ContentBlock, TextContent, UserMessage}; -/// Reserve this many tokens for the model's response + overhead. -/// Matches pi-mono `DEFAULT_COMPACTION_SETTINGS.reserveTokens`. -const RESERVE_TOKENS: u32 = 16_384; +/// Fraction of the model's context window that is reserved for the model's +/// response + provider/tool overhead, expressed in basis points (1/100th of +/// a percent). 2000 bps == 20%. +const RESERVE_BASIS_POINTS: u32 = 2_000; + +/// Minimum number of tokens to keep reserved even when 20% of the context +/// window is smaller than this floor. The previous hard-coded reserve of +/// `16_384` tokens is preserved as a safe lower bound for typical large +/// context windows while still allowing tiny windows to behave sanely. +const RESERVE_TOKENS_MIN: u32 = 16_384; /// Keep at least this many tokens of recent conversation untouched. 
/// With LLM-generated summaries providing rich context, we can keep a @@ -233,7 +240,7 @@ impl CompressionSettings { pub fn new(context_window: u32) -> Self { Self { context_window, - reserve_tokens: RESERVE_TOKENS, + reserve_tokens: reserve_tokens_for(context_window), keep_recent_tokens: KEEP_RECENT_TOKENS, } } @@ -244,6 +251,19 @@ impl CompressionSettings { } } +/// Reserve `RESERVE_BASIS_POINTS` (20%) of the model's context window for +/// the model's response + overhead, rounded up to a whole token, with +/// `RESERVE_TOKENS_MIN` as a floor so that small windows (where 20% would +/// fall below the floor) still keep a sane amount of headroom. +fn reserve_tokens_for(context_window: u32) -> u32 { + let percent_budget = ((context_window as u64) + .saturating_mul(RESERVE_BASIS_POINTS as u64) + .saturating_add(9_999)) + / 10_000; + let percent_budget = percent_budget.min(u32::MAX as u64) as u32; + percent_budget.max(RESERVE_TOKENS_MIN) +} + // --------------------------------------------------------------------------- // Public API: should_compress, find_cut_point, build_compressed_messages // --------------------------------------------------------------------------- @@ -911,6 +931,49 @@ mod tests { assert_eq!(calibration.apply_to_estimate(0), 0); } + #[test] + fn compression_settings_reserves_twenty_percent_of_context_window() { + // For typical large context windows the 20% reserve is well above + // the 16,384 token floor, so the budget is exactly 80% of the + // window. This is the primary behaviour change: instead of + // reserving a fixed 16,384 tokens regardless of model, we reserve + // 20% of the model's actual context window. 
+ let cases = [ + (128_000_u32, 25_600_u32, 102_400_u32), // GPT-4o class + (200_000_u32, 40_000_u32, 160_000_u32), // Claude-class + (1_000_000_u32, 200_000_u32, 800_000_u32), // 1M-window class + ]; + for (context_window, expected_reserve, expected_budget) in cases { + let settings = CompressionSettings::new(context_window); + assert_eq!( + settings.reserve_tokens, expected_reserve, + "20% reserve for {context_window}-token window", + ); + assert_eq!( + settings.budget(), + expected_budget, + "80% budget for {context_window}-token window", + ); + } + } + + #[test] + fn compression_settings_reserve_clamps_to_minimum_for_small_windows() { + // When 20% of the window would be smaller than the safety floor + // (16,384 tokens), the floor takes over so tiny windows still + // keep enough headroom for the model response. + let settings = CompressionSettings::new(32_000); + // 20% of 32,000 = 6,400 < 16,384 → floor wins. + assert_eq!(settings.reserve_tokens, 16_384); + assert_eq!(settings.budget(), 32_000 - 16_384); + + // Window at or below the floor: reserve equals the floor and + // saturating_sub protects `budget` from underflow. + let tiny = CompressionSettings::new(8_000); + assert_eq!(tiny.reserve_tokens, 16_384); + assert_eq!(tiny.budget(), 0); + } + #[test] fn should_compress_via_context_size_triggers_when_last_usage_exceeds_budget() { // The unified `context_size` (= input + output + cache_read + diff --git a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts index 30996970..06a610a2 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts @@ -172,6 +172,12 @@ export function buildThreadContextBadgeData(options: { const isExceeded = Boolean( contextWindow && contextWindow > 0 && contextSize > contextWindow, ); + // Compression trigger threshold. 
The backend reserves 20% of the model's + // context window (src-tauri/src/core/context_compression.rs), with a + // 16,384-token floor, and triggers auto-compression when the observed + // `context_size` exceeds the remaining budget. That is 80% for windows + // of ~82K tokens or more; smaller windows trigger earlier than this. + const compressionThresholdRatio = 0.8; return { contextWindow, @@ -195,6 +201,7 @@ export function buildThreadContextBadgeData(options: { usedLabel: formatCompactTokenCount(contextSize), totalLabel: contextWindow ? formatCompactTokenCount(contextWindow) : "N/A", usedPercent, + compressionThresholdRatio, }; } diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts index e09f9413..65a44208 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts @@ -159,4 +159,18 @@ describe("buildThreadContextBadgeData", () => { expect(badge.rawUsedPercent).toBe(0); expect(badge.totalLabel).toBe("N/A"); }); + + it("exposes the 80% compression threshold so the header can mark it", () => { + // The backend reserves 20% of the context window and triggers + // auto-compression when observed context_size exceeds the remaining + // 80% budget. The header pill mirrors the same ratio so the dashed + // marker sits at the runtime boundary (when the 16,384 floor is inactive). 
+ const badge = buildThreadContextBadgeData({ + fallbackContextWindow: "1000", + fallbackModelDisplayName: "Selected Model", + runtimeUsage: makeRuntimeUsage(), + }); + + expect(badge.compressionThresholdRatio).toBe(0.8); + }); }); diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.tsx b/src/modules/workbench-shell/ui/dashboard-workbench.tsx index 8173430e..686fddd3 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.tsx +++ b/src/modules/workbench-shell/ui/dashboard-workbench.tsx @@ -1053,6 +1053,25 @@ const drawerWidth = useStore(uiLayoutStore, (s) => s.drawerWidth); width: `${contextBadge.usageRatio * 100}%`, }} /> + {contextBadge.contextWindow && + contextBadge.contextWindow > 0 ? ( +