diff --git a/README.md b/README.md index d7d030bf..a99948d1 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,22 @@ Around that collaboration model, TiyCode brings together Agent Profiles, workspa - **AI-first coding collaboration.** TiyCode is designed around the idea that humans express intent through conversation while agents take the lead in execution. - **Agent Profiles.** Mix models from different providers, tune response style, language, and custom instructions, and switch profiles flexibly for different kinds of work. +- **Persistent goal management.** Define long-running objectives for agents to pursue across multiple turns. An independent Judge subagent evaluates completion against actual file changes, command outputs, and commit history — eliminating self-attestation bias. - **Custom Agents.** Create purpose-built sub-agents in Settings — each with its own name, system prompt, model tier, and allowed tools — then grant per-profile access and delegate work from the composer. - **Three-tier model architecture.** Each profile supports a Primary model for core reasoning, an Auxiliary model for helper tasks, and a Lightweight model for fast operations — with automatic fallback chains across tiers. - **Multi-provider support.** Connect to 13+ LLM providers out of the box — OpenAI, Anthropic, Google, Ollama, xAI, Groq, OpenRouter, DeepSeek, MiniMax, Kimi, and more — or add any OpenAI-compatible endpoint as a custom provider. - **Workspace-centered execution.** Threads stay grounded in the local workspace and connect naturally to code review, version control, repository inspection, Git worktrees, and terminal workflows. - **Task-aware execution.** Thread-scoped task boards, plan checkpoints, tool status events, and subagent progress make longer runs easier to follow and review. -- **Persistent goal management.** Set long-running objectives for agents to pursue across multiple turns, with automatic continuation, budget controls, and progress tracking. 
+- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. - **Rich composer inputs.** Prompt input supports text, file/photo attachments, screenshots, slash command structured argument interpolation (`--key=value`, positional args, `{{placeholder}}` templates), and large-paste handling. - **Steer & Queue.** While the agent is running, choose to steer the conversation mid-execution or queue a follow-up message for the next round — keeping you in control without interrupting the workflow. -- **Real-time execution streaming.** A rich thread stream event system delivers live updates — message deltas, tool calls, requested/active statuses, reasoning steps, subagent progress, and plan updates — all rendered through purpose-built AI Elements components. -- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. -- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. -- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. +- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. - **ACP Server support.** TiyCode can run as a headless ACP (Agent Client Protocol) server via `tiycode acp --stdio` or `tiycode acp --http `, letting external tools and IDE plugins drive the agent runtime through a standard JSON-RPC protocol without the desktop GUI. 
- **IM channel gateway.** Connect TiyCode to WeChat or WeCom so you can chat with the agent directly from your messaging app — scan a QR code to log in, send messages and attachments, and receive streaming responses without opening the desktop GUI. -- **Extensible by design.** Plugins, MCP servers, and Skills are treated as first-class building blocks through the `Extensions Center`. +- **Operator-friendly experience.** Slash commands with structured argument parsing, smart conversation titles, context compression controls, commit message generation, external terminal handoff including Ghostty, and compact workbench controls help the product feel fast and practical in day-to-day use. +- **Thread-level elapsed timer.** Track active execution time per thread, excluding pauses, with persistent tracking across sessions. - **Built-in runtime path.** The main execution flow is `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`. +- **Bilingual interface.** Full i18n coverage with English and Simplified Chinese, switchable at any time. 
## Tech Stack diff --git a/README_zh.md b/README_zh.md index c9bbdcde..dc615077 100644 --- a/README_zh.md +++ b/README_zh.md @@ -28,22 +28,22 @@ TiyCode 面向的是希望以 AI 时代的方式进行编码协作的用户。 - **AI First 的编码协作。** TiyCode 围绕"通过对话表达意图,Agent 全面执行"这一理念来设计产品形态。 - **Agent Profile。** 支持自由组合不同服务商的模型,并可配置回复风格、回复语言、自定义指令等设定,且能在不同 Profile 之间灵活切换。 +- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,由独立的 Judge 验收 Agent 基于实际文件变更、命令输出和提交历史进行完成判定——杜绝"自说自话"的信任缺陷。 - **Custom Agents。** 在设置中心创建专用子 Agent——每个拥有独立的名称、系统提示、模型层级和可用工具——按 Profile 授权后即可从 composer 委派任务。 - **三层模型架构。** 每个 Profile 支持配置 Primary 主力模型、Auxiliary 辅助模型和 Lightweight 轻量模型三个层级,层级之间具备自动回退链路。 - **多服务商接入。** 开箱支持 13+ 家 LLM 服务商 —— OpenAI、Anthropic、Google、Ollama、xAI、Groq、OpenRouter、DeepSeek、MiniMax、Kimi 等,也可将任何 OpenAI 兼容端点作为自定义 Provider 接入。 - **以工作区为中心的执行体验。** 对话线程扎根本地工作区,并与代码审阅、版本控制、仓库状态读取、Git worktree 和 Terminal 工作流自然衔接。 - **面向任务的执行可观测性。** Thread 级任务板、Plan checkpoint、工具状态事件和子 Agent 进度让长任务更容易跟踪和复查。 -- **持久化目标管理。** 为 Agent 设置跨轮次的长期目标,支持自动延续、预算控制和进度跟踪。 +- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 - **更丰富的输入能力。** Prompt 输入支持文本、文件 / 图片附件、截图、Slash Command 结构化参数插值(`--key=value`、位置参数、`{{placeholder}}` 模板变量)以及大段文本粘贴处理。 - **Steer 与 Queue。** Agent 运行中可选择「引导」即时插入消息调整方向,或「排队」将消息留待当前运行结束后再发起下一轮——无需中断工作流即可保持掌控。 -- **实时执行流式推送。** 丰富的 Thread Stream 事件体系支撑实时更新 —— 消息增量、工具调用、requested / active 状态、推理步骤、子 Agent 进度与计划更新。 -- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 -- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 -- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 +- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 `Extensions Center` 形成统一的扩展入口与产品模型。 - **ACP Server 支持。** TiyCode 可作为无头 ACP(Agent Client Protocol)服务器运行,通过 `tiycode acp --stdio` 或 `tiycode acp --http ` 启动,让外部工具和 IDE 插件通过标准 JSON-RPC 协议驱动 Agent 运行时,无需启动桌面 GUI。 - **IM 通道网关。** 将 TiyCode 接入微信或企业微信,扫码登录后即可在聊天应用中直接与 Agent 对话——发送消息和附件、接收流式回复,无需打开桌面 GUI。 -- **良好的通用扩展能力。** Plugins、MCP Servers 与 Skills 通过 
`Extensions Center` 形成统一的扩展入口与产品模型。 +- **更友好的日常体验。** 支持结构化参数解析的 Slash Command、智能会话标题、上下文压缩、Commit Message 生成、包含 Ghostty 在内的外部终端衔接以及紧凑工作台控件,让协作过程更顺手、更连贯。 +- **线程级别耗时计时器。** 跟踪每个线程的活跃执行时间,排除暂停时间,并支持跨会话持久化跟踪。 - **内置 Runtime。** 主执行链路 `Frontend -> Rust Core -> BuiltInAgentRuntime -> tiycore -> LLM`。 +- **双语界面。** 完整的 i18n 支持,覆盖英文和简体中文,随时可切换。 ## 技术栈 diff --git a/src-tauri/migrations/20260607000000_goal_judge_fields.sql b/src-tauri/migrations/20260607000000_goal_judge_fields.sql new file mode 100644 index 00000000..11dd6954 --- /dev/null +++ b/src-tauri/migrations/20260607000000_goal_judge_fields.sql @@ -0,0 +1,17 @@ +-- Goal Judge verification fields: persist the most recent independent Judge +-- verdict for a goal. Acceptance is expressed as status='complete' AND +-- judge_passed=1 (the main agent can no longer self-attest completion). +ALTER TABLE goals ADD COLUMN judge_passed INTEGER NOT NULL DEFAULT 0; -- bool +ALTER TABLE goals ADD COLUMN judge_completeness INTEGER; -- 0-100, nullable +ALTER TABLE goals ADD COLUMN judge_findings TEXT; -- JSON array, nullable +ALTER TABLE goals ADD COLUMN judge_summary TEXT; -- nullable +ALTER TABLE goals ADD COLUMN judge_evaluated_run_id TEXT; -- nullable + +-- Backfill goals already completed via the legacy goal_scored path so that an +-- upgrade does not treat them as un-verified (which would otherwise let goal +-- continuation re-open them). +UPDATE goals +SET judge_passed = 1, + judge_summary = COALESCE(judge_summary, evidence), + judge_completeness = COALESCE(judge_completeness, 100) +WHERE status = 'complete'; diff --git a/src-tauri/migrations/20260607000001_drop_goal_time_used.sql b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql new file mode 100644 index 00000000..06d0c76e --- /dev/null +++ b/src-tauri/migrations/20260607000001_drop_goal_time_used.sql @@ -0,0 +1,5 @@ +-- Drop goal-level time accounting. 
Time-tracking moved to thread_runs.elapsed_running_secs +-- (added by 20260604000000_run_elapsed_tracking.sql), which is summed across all of a thread's +-- runs (planning + implementation) and rendered by the workbench-shell timer. The goal-level +-- time_used_seconds column was write-only with no readers in budget enforcement, UI, or logging. +ALTER TABLE goals DROP COLUMN time_used_seconds; diff --git a/src-tauri/src/commands/agent.rs b/src-tauri/src/commands/agent.rs index f9e9f067..0b73853a 100644 --- a/src-tauri/src/commands/agent.rs +++ b/src-tauri/src/commands/agent.rs @@ -624,47 +624,6 @@ pub async fn goal_pause( match goal { Some(g) => { if g.status == crate::model::goal::GoalStatus::Active { - // Account elapsed time of any currently active run before pausing - if let Some(run_seconds) = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &state.pool, - &thread_id, - ) - .await - .unwrap_or(None) - { - let active_run_id = crate::persistence::repo::run_repo::find_latest_by_thread( - &state.pool, - &thread_id, - ) - .await - .ok() - .flatten() - .and_then(|run| { - matches!( - run.status.as_str(), - "running" | "waiting_approval" | "needs_reply" - ) - .then_some(run.id) - }); - let paused_seconds = active_run_id - .as_deref() - .map(|run_id| { - let mut guard = - state.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_pause: goal runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(run_id).max(0) - }) - .unwrap_or(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - mgr.account_usage(&g.id, 0, billable_seconds).await.ok(); - } - } mgr.pause(&g.id, crate::model::goal::PauseReason::UserRequested, None) .await?; } diff --git a/src-tauri/src/core/agent_run_event_handler.rs b/src-tauri/src/core/agent_run_event_handler.rs index 076c324d..6107563c 100644 --- a/src-tauri/src/core/agent_run_event_handler.rs +++ 
b/src-tauri/src/core/agent_run_event_handler.rs @@ -184,33 +184,6 @@ pub(crate) fn sidebar_status_for_runtime_event( } impl AgentRunManager { - fn start_goal_run_pause(&self, thread_id: &str, run_id: &str) { - if thread_id.is_empty() { - return; - } - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.start_run_pause(thread_id, run_id); - } - - fn finish_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.finish_run_pause(run_id); - } - - fn cleanup_goal_run_pause(&self, run_id: &str) { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.cleanup_run_pause(run_id); - } - pub(crate) async fn handle_runtime_channel_closed( self: &Arc<Self>, run_id: &str, @@ -410,26 +383,22 @@ impl AgentRunManager { } ThreadStreamEvent::ApprovalRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; } ThreadStreamEvent::ClarifyRequired { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::NeedsReply).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::NeedsReply) .await?; } ThreadStreamEvent::ApprovalResolved { .. 
} => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; } ThreadStreamEvent::ClarifyResolved { .. } => { - self.finish_goal_run_pause(run_id); run_repo::update_status(&self.pool, run_id, RunStatus::Running).await?; let thread_id = self.get_thread_id(run_id).await; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::Running).await?; @@ -458,7 +427,6 @@ impl AgentRunManager { } ThreadStreamEvent::RunCheckpointed { .. } => { let thread_id = self.get_thread_id(run_id).await; - self.start_goal_run_pause(&thread_id, run_id); run_repo::update_status(&self.pool, run_id, RunStatus::WaitingApproval).await?; thread_repo::update_status(&self.pool, &thread_id, &ThreadStatus::WaitingApproval) .await?; @@ -476,7 +444,6 @@ impl AgentRunManager { | ThreadStreamEvent::RunFailed { error, .. } => Some(error.as_str()), _ => None, }; - self.finish_goal_run_pause(run_id); self.finish_run(run_id, final_status, error_message).await?; let thread_id = self.get_thread_id(run_id).await; if let Some(frontend_tx) = self.frontend_tx_for_run(run_id).await { @@ -567,7 +534,6 @@ impl AgentRunManager { ); } } - self.cleanup_goal_run_pause(run_id); } Ok(()) diff --git a/src-tauri/src/core/agent_run_manager.rs b/src-tauri/src/core/agent_run_manager.rs index eefc0a1d..e6c8a6ad 100644 --- a/src-tauri/src/core/agent_run_manager.rs +++ b/src-tauri/src/core/agent_run_manager.rs @@ -24,7 +24,7 @@ use crate::core::sleep_manager::SleepManager; use crate::ipc::frontend_channels::ThreadStreamEvent; use crate::model::errors::{AppError, ErrorSource}; use crate::model::thread::{MessageAttachmentDto, MessageRecord, RunStatus}; -use crate::persistence::repo::{goal_repo, message_repo, run_repo, thread_repo, workspace_repo}; +use crate::persistence::repo::{message_repo, run_repo, thread_repo, workspace_repo}; 
pub(crate) use crate::core::agent_run_event_handler::build_orphaned_run_terminal_event; #[cfg(test)] @@ -433,44 +433,6 @@ impl AgentRunManager { let (profile_id, provider_id, model_id) = extract_run_model_refs(&model_plan_value); - // Account the planning run's billable time to the active goal so the - // frontend timer displays the correct accumulated time when the new - // implementation run starts (the frontend resets its local elapsed on - // every run_id change, so time_used_seconds must include the full - // planning-phase cost). - { - let planning_elapsed = run_repo::get_run_elapsed_seconds(&self.pool, &planning_run_id) - .await? - .unwrap_or(0); - let paused_seconds = { - let mut guard = self.goal_runtime_state.lock().unwrap_or_else(|poisoned| { - tracing::warn!("goal pause runtime mutex poisoned, recovering"); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&planning_run_id).max(0) - }; - let billable = (planning_elapsed - paused_seconds).max(0); - if billable > 0 { - if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - if let Err(error) = goal_repo::account_usage( - &self.pool, &goal.id, - 0, // tokens_delta: planning turns were already counted - billable, 0, // turns_delta - ) - .await - { - tracing::warn!( - planning_run_id = %planning_run_id, - goal_id = %goal.id, - billable_seconds = billable, - error = %error, - "failed to account planning run time to goal" - ); - } - } - } - } - let mut approval_metadata = approval_metadata; approval_metadata.state = IMPLEMENTATION_PLAN_APPROVED_STATE.to_string(); approval_metadata.approved_action = Some(action.clone()); @@ -512,20 +474,6 @@ impl AgentRunManager { ) .await?; - // Emit the updated goal state through the new run's event channel so - // the frontend sees the accumulated time_used_seconds (which now - // includes the planning-run time) before it starts the real-time timer - // for the new implementation run. 
- if let Ok(Some(goal)) = goal_repo::find_by_thread_id(&self.pool, thread_id).await { - let runs = self.active_runs.lock().await; - if let Some(run) = runs.get(&result.0) { - let _ = run.frontend_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: thread_id.to_string(), - goal: Some(crate::model::goal::GoalPayload::from(goal)), - }); - } - } - if let Some(seed_messages) = context_seed_messages.as_ref() { self.persist_messages(seed_messages).await?; } diff --git a/src-tauri/src/core/agent_session.rs b/src-tauri/src/core/agent_session.rs index 7a5eb557..2c7be27e 100644 --- a/src-tauri/src/core/agent_session.rs +++ b/src-tauri/src/core/agent_session.rs @@ -601,6 +601,34 @@ pub async fn build_session_spec( .await .map(|settings| settings.is_ready()) .unwrap_or(false); + + let mut runtime_tools = runtime_tools_with_custom_subagents( + runtime_tools_with_web_search( + runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), + &tool_profile_name, + web_search_enabled, + ), + custom_subagent_tools, + ); + + // Inject the main-agent-only `agent_judge` acceptance tool on demand: only + // when this thread has a goal that has not yet passed Judge acceptance + // (acceptance = status Complete AND judge_passed). It is appended after the + // custom/extension merge so that the built-in tool name always wins and + // cannot be shadowed by a custom or extension tool. 
+ if let Ok(Some(goal)) = + crate::persistence::repo::goal_repo::find_by_thread_id(pool, thread_id).await + { + let already_verified = + goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed; + if !already_verified { + let judge_tool = crate::core::subagent::RuntimeOrchestrationTool::Judge.as_agent_tool(); + if !runtime_tools.iter().any(|t| t.name == judge_tool.name) { + runtime_tools.push(judge_tool); + } + } + } + let initial_context_calibration = build_initial_context_token_calibration( latest_historical_run.as_ref(), &history_messages, @@ -615,14 +643,7 @@ pub async fn build_session_spec( workspace_path: workspace_path.to_string(), run_mode: run_mode.to_string(), tool_profile_name: tool_profile_name.clone(), - runtime_tools: runtime_tools_with_custom_subagents( - runtime_tools_with_web_search( - runtime_tools_for_profile_with_extensions(&tool_profile_name, extension_tools), - &tool_profile_name, - web_search_enabled, - ), - custom_subagent_tools, - ), + runtime_tools, system_prompt, history_messages, history_tool_calls, diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index e6a2b913..f1dbb980 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -16,10 +16,10 @@ use crate::core::plan_checkpoint::{ build_plan_message_metadata, plan_markdown, write_plan_file, }; use crate::core::subagent::{ - extract_review_report, render_parallel_summary, HelperRunRequest, HelperRunResult, - ParallelSubagentBatchStatus, ParallelSubagentRequest, ParallelSubagentSummary, - ParallelSubagentTask, ParallelSubagentTaskResult, ParallelSubagentTaskStatus, ReviewRequest, - RuntimeOrchestrationTool, SubagentProfile, + extract_judge_report, extract_review_report, render_parallel_summary, HelperRunRequest, + HelperRunResult, JudgeReport, ParallelSubagentBatchStatus, ParallelSubagentRequest, + ParallelSubagentSummary, ParallelSubagentTask, 
ParallelSubagentTaskResult, + ParallelSubagentTaskStatus, ReviewRequest, RuntimeOrchestrationTool, SubagentProfile, }; use crate::core::tool_gateway::{ ApprovalRequest, ToolExecutionOptions, ToolExecutionRequest, ToolGatewayResult, @@ -294,33 +294,6 @@ impl AgentSession { .await; } - // Goal tools — handle before the main tool gateway - if tool_name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME { - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); - let insert_result = tool_call_repo::insert( - &self.pool, - &tool_call_repo::ToolCallInsert { - id: tool_call_storage_id.clone(), - tool_call_id: tool_call_id.to_string(), - run_id: self.spec.run_id.clone(), - thread_id: self.spec.thread_id.clone(), - helper_id: None, - tool_name: tool_name.to_string(), - tool_input_json: tool_input.to_string(), - status: "requested".to_string(), - }, - ) - .await; - - if let Err(error) = insert_result { - return agent_error_result(format!("failed to persist tool call: {error}")); - } - - return self - .execute_goal_tool(tool_name, tool_call_id, &tool_call_storage_id, tool_input) - .await; - } - let tool_call_storage_id = uuid::Uuid::now_v7().to_string(); let insert_result = tool_call_repo::insert( &self.pool, @@ -351,6 +324,10 @@ impl AgentSession { ) .await } + RuntimeOrchestrationTool::Judge => { + self.execute_judge_tool(tool_call_id, &tool_call_storage_id, tool_input) + .await + } _ => { self.execute_helper_tool(tool, tool_call_id, &tool_call_storage_id, tool_input) .await @@ -886,6 +863,16 @@ impl AgentSession { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + if tool == RuntimeOrchestrationTool::Judge { + // agent_judge is a main-agent-only acceptance tool: it must not be + // reachable as a generic helper delegate or as an agent_parallel + // batch target. 
+ return Err( + "agent_judge can only be called directly by the main agent for the current goal" + .to_string(), + ); + } + let HelperToolTask { task, review_request, @@ -1614,202 +1601,238 @@ impl AgentSession { } } - // ── Goal tool handlers ── + // ── Goal acceptance Judge handler ── - async fn execute_goal_tool( + /// Run the main-agent-only `agent_judge` acceptance flow: build a Judge task + /// with the current goal injected, run the Judge helper, parse its structured + /// verdict, persist it, and (on pass) flip the goal to verified/complete. + async fn execute_judge_tool( &self, - tool_name: &str, - _tool_call_id: &str, + tool_call_id: &str, tool_call_storage_id: &str, tool_input: &serde_json::Value, ) -> AgentToolResult { - let pool = self.pool.clone(); - let thread_id = self.spec.thread_id.clone(); - - match tool_name { - name if name == crate::core::goal_manager::GOAL_SCORED_TOOL_NAME => { - let status = tool_input - .get("status") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let evidence = tool_input - .get("evidence") - .and_then(|v| v.as_str()) - .unwrap_or(""); - let pledge = tool_input - .get("pledge") - .and_then(|v| v.as_str()) - .unwrap_or(""); - - // Only support marking as complete - if status != "complete" { - let err_msg = "goal_scored only supports status='complete'. Use /goal pause|resume|clear from the UI for other lifecycle operations."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Parse the main agent's task / rationale. 
+ let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + Ok(request) => request, + Err(error) => { + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &error }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(error); + } + }; - // The pledge must match the required text exactly. - if pledge.trim() != crate::core::goal_manager::GOAL_SCORED_PLEDGE { - let err_msg = format!( - "goal_scored rejected: the 'pledge' parameter must be passed verbatim as: \"{}\"", - crate::core::goal_manager::GOAL_SCORED_PLEDGE - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } + // Backstop: re-query goal state. agent_judge is injected only when an + // un-verified goal exists, but a stale tool set or a direct call must be + // rejected here too. + let goal = match crate::persistence::repo::goal_repo::find_by_thread_id( + &self.pool, + &self.spec.thread_id, + ) + .await + { + Ok(Some(goal)) => goal, + Ok(None) => { + let err_msg = + "agent_judge cannot run: no goal exists for this thread. 
Create one with the /goal command first."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + Err(e) => { + let err_msg = format!("Failed to load goal: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; - if evidence.trim().is_empty() { - // Evidence is empty — reject the completion and challenge - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - let challenge = mgr.render_challenge_prompt( - crate::core::goal_manager::ChallengePromptVariant::NoEvidence, - ); - let result_text = - format!("Goal completion rejected: evidence is required. {challenge}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - return AgentToolResult::text(result_text); - } + if goal.status == crate::model::goal::GoalStatus::Complete && goal.judge_passed { + let err_msg = + "The goal has already passed acceptance. No further verification is needed."; + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } - let mgr = crate::core::goal_manager::GoalManager::new( - pool, - thread_id, - self.goal_runtime.clone(), - ); - match mgr.get_active().await { - Ok(Some(goal)) => { - if goal.status != crate::model::goal::GoalStatus::Active { - let err_msg = format!( - "Goal is not active (current status: {:?}). 
Cannot mark as complete.", - goal.status - ); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(err_msg); - } - let paused_seconds = { - let mut guard = self.goal_runtime.lock().unwrap_or_else(|poisoned| { - tracing::warn!( - "goal_scored: goal_runtime mutex poisoned, recovering" - ); - poisoned.into_inner() - }); - guard.take_run_paused_seconds(&self.spec.run_id).max(0) - }; - let active_run_seconds = - crate::persistence::repo::run_repo::get_active_run_elapsed_seconds( - &self.pool, - &self.spec.thread_id, - ) - .await - .unwrap_or(None) - .map(|seconds| (seconds - paused_seconds).max(0)); - - match mgr.mark_complete(&goal.id, evidence).await { - Ok(()) => { - if let Some(run_seconds) = active_run_seconds { - if run_seconds > 0 { - mgr.account_usage(&goal.id, 0, run_seconds).await.ok(); - } - } - - let updated = mgr.get_active().await.ok().flatten(); - if let Some(ref record) = updated { - let payload = - crate::core::goal_manager::GoalManager::to_payload(record); - let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { - thread_id: record.thread_id.clone(), - evidence: evidence.to_string(), - }); - let _ = - self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { - thread_id: record.thread_id.clone(), - goal: Some(payload), - }); - } - let result_text = - format!("Goal marked as complete. 
Evidence: {evidence}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "output": &result_text }).to_string(), - "completed", - ) - .await - .ok(); - AgentToolResult::text(result_text) - } - Err(e) => { - let err_msg = format!("Failed to complete goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } + // Build the Judge task: inject the goal objective + status + last verdict + // so the Judge does not rely on the main agent's self-report. + let mut prior_verdict = String::new(); + if goal.judge_evaluated_run_id.is_some() { + if let Some(summary) = goal.judge_summary.as_deref() { + if !summary.trim().is_empty() { + prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}")); + } + } + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + if !findings.is_empty() { + prior_verdict.push_str("\nPrevious Judge findings:"); + for finding in findings { + prior_verdict.push_str(&format!("\n- {finding}")); } } } - Ok(None) => { - let err_msg = "No active goal found. 
Create one first with /goal command."; - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } - Err(e) => { - let err_msg = format!("Failed to load goal: {e}"); - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &err_msg }).to_string(), - "failed", - ) - .await - .ok(); - agent_error_result(err_msg) - } } } - _ => agent_error_result(format!("Unknown goal tool: {tool_name}")), + } + + let judge_task = format!( + "You are verifying acceptance of the following goal for the current project.\n\n\ +Goal id: {goal_id}\n\ +Goal status: {status:?}\n\ +Goal objective:\n{objective}\n\ +{prior_verdict}\n\n\ +The main agent's note for this verification request:\n{task}\n\n\ +Independently inspect the project's current state and decide whether it satisfies the goal. \ +Return your structured JudgeReport verdict.", + goal_id = goal.id, + status = goal.status, + objective = goal.objective, + prior_verdict = prior_verdict, + task = request.task, + ); + + // Build a Judge delegate (depth 2, primary model) and run it. 
+ let tool = RuntimeOrchestrationTool::Judge; + let helper_profile = resolve_helper_profile(&tool); + let model_role = match resolve_helper_model_role( + &self.spec.model_plan, + &tool, + helper_profile.as_ref(), + ) { + Some(role) => role, + None => { + let err_msg = "Failed to resolve a model for agent_judge.".to_string(); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + let delegate = ResolvedHelperDelegate { + tool: tool.clone(), + agent_name: tool.tool_name(), + task: judge_task, + review_request: None, + helper_profile, + model_role, + }; + + let report: JudgeReport = match self.run_helper_for_delegate(&delegate, tool_call_id).await + { + Ok(summary) => extract_judge_report( + summary + .raw_summary + .as_deref() + .unwrap_or(summary.summary.as_str()), + ), + Err(error) => { + let err_msg = format!("agent_judge failed to run: {error}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + }; + + // Persist the verdict (atomically flips to complete + judge_passed on pass). + let findings_json = + serde_json::to_string(&report.findings).unwrap_or_else(|_| "[]".to_string()); + let recorded = crate::persistence::repo::goal_repo::record_judge_verdict( + &self.pool, + &goal.id, + &self.spec.run_id, + report.passed, + report.completeness_pct as i64, + &findings_json, + &report.summary, + ) + .await; + + if let Err(e) = recorded { + let err_msg = format!("Failed to persist Judge verdict: {e}"); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &err_msg }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(err_msg); + } + + // Emit goal events with the freshly updated record. 
+ if let Ok(Some(record)) = + crate::persistence::repo::goal_repo::find_by_thread_id(&self.pool, &self.spec.thread_id) + .await + { + let payload = crate::core::goal_manager::GoalManager::to_payload(&record); + if report.passed { + let _ = self.event_tx.send(ThreadStreamEvent::GoalCompleted { + thread_id: record.thread_id.clone(), + evidence: record.evidence.clone().unwrap_or_default(), + }); + } + let _ = self.event_tx.send(ThreadStreamEvent::GoalStateUpdated { + thread_id: record.thread_id.clone(), + goal: Some(payload), + }); + } + + let result_text = crate::core::subagent::judge_contract::render_parent_summary(&report); + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "output": &result_text, "passed": report.passed }).to_string(), + "completed", + ) + .await + .ok(); + + AgentToolResult { + content: vec![ContentBlock::Text(TextContent::new(result_text))], + details: Some(serde_json::json!({ + "passed": report.passed, + "completenessPct": report.completeness_pct, + "findings": report.findings, + "summary": report.summary, + })), } } } diff --git a/src-tauri/src/core/agent_session_tools.rs b/src-tauri/src/core/agent_session_tools.rs index abef8647..97966181 100644 --- a/src-tauri/src/core/agent_session_tools.rs +++ b/src-tauri/src/core/agent_session_tools.rs @@ -534,31 +534,6 @@ You may call this tool multiple times in a run to incrementally refine the plan. }), )); - // Goal tool — persistent cross-turn task completion - tools.push(AgentTool::new( - crate::core::goal_manager::GOAL_SCORED_TOOL_NAME, - "Goal Scored", - "Mark the current goal as fully achieved (score the goal). You MUST provide evidence — run tests, check file contents, or verify command output to prove the goal is truly achieved. Without evidence, the completion will be challenged. You MUST also pass the exact required pledge text. 
Do NOT call this tool unless you have actually verified the goal is complete with no remaining or follow-up work.", - serde_json::json!({ - "type": "object", - "properties": { - "status": { - "type": "string", - "enum": ["complete"], - "description": "Must be 'complete' to mark the goal as achieved." - }, - "evidence": { - "type": "string", - "description": "Concrete evidence that the goal is complete — test output, file change summary, command results, or verification steps. Required." - }, - "pledge": { - "type": "string", - "description": "You MUST pass this exact pledge text verbatim: \"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\"" - } - }, - "required": ["status", "evidence", "pledge"] - }), - )); // Render artifact tool (always available) — supports charts, HTML, and SVG tools.push(AgentTool::new( "render", @@ -670,6 +645,7 @@ pub(crate) fn resolve_helper_profile(tool: &RuntimeOrchestrationTool) -> Option< match tool { RuntimeOrchestrationTool::Explore => Some(SubagentProfile::Explore), RuntimeOrchestrationTool::Review => Some(SubagentProfile::Review), + RuntimeOrchestrationTool::Judge => Some(SubagentProfile::Judge), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } @@ -690,6 +666,8 @@ pub(crate) fn resolve_helper_model_role( .clone() .unwrap_or_else(|| model_plan.primary.clone()), ), + // Judge prioritizes acceptance quality over cost: always use primary. 
+ RuntimeOrchestrationTool::Judge => Some(model_plan.primary.clone()), RuntimeOrchestrationTool::Parallel | RuntimeOrchestrationTool::Custom(_) => None, } } diff --git a/src-tauri/src/core/app_state.rs b/src-tauri/src/core/app_state.rs index 00e2f166..ac3d81a7 100644 --- a/src-tauri/src/core/app_state.rs +++ b/src-tauri/src/core/app_state.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; -use chrono::{DateTime, Utc}; use sqlx::SqlitePool; use tauri::AppHandle; @@ -31,12 +30,6 @@ pub struct GoalRuntimeState { pub idle_turn_count: HashMap, /// Consecutive completion claim counter per thread. pub completion_claim_count: HashMap, - /// Pause start timestamp per run while it waits for user action. - pub run_pause_started_at: HashMap>, - /// Accumulated user-wait pause seconds per run. - pub run_paused_seconds: HashMap, - /// Thread ID for each run with pause accounting state. - pub run_pause_thread_ids: HashMap, } impl GoalRuntimeState { @@ -47,66 +40,6 @@ impl GoalRuntimeState { self.thread_tool_calls.remove(thread_id); self.idle_turn_count.remove(thread_id); self.completion_claim_count.remove(thread_id); - - let run_ids: Vec = self - .run_pause_thread_ids - .iter() - .filter_map(|(run_id, stored_thread_id)| { - (stored_thread_id == thread_id).then(|| run_id.clone()) - }) - .collect(); - for run_id in run_ids { - self.cleanup_run_pause(&run_id); - } - } - - /// Begin timing a run's user-action pause. Repeated starts are ignored so - /// nested or duplicate waiting events do not lose the original start time. 
- pub fn start_run_pause(&mut self, thread_id: &str, run_id: &str) { - self.run_pause_thread_ids - .entry(run_id.to_string()) - .or_insert_with(|| thread_id.to_string()); - self.start_run_pause_at(run_id, Utc::now()); - } - - fn start_run_pause_at(&mut self, run_id: &str, started_at: DateTime) { - self.run_pause_started_at - .entry(run_id.to_string()) - .or_insert(started_at); - } - - /// Finish the current pause interval for a run and accumulate whole seconds. - pub fn finish_run_pause(&mut self, run_id: &str) -> i64 { - self.finish_run_pause_at(run_id, Utc::now()) - } - - fn finish_run_pause_at(&mut self, run_id: &str, finished_at: DateTime) -> i64 { - let Some(started_at) = self.run_pause_started_at.remove(run_id) else { - return *self.run_paused_seconds.get(run_id).unwrap_or(&0); - }; - - let paused_seconds = (finished_at - started_at).num_seconds().max(0); - let total = self - .run_paused_seconds - .entry(run_id.to_string()) - .or_insert(0); - *total += paused_seconds; - *total - } - - /// Take and clear the accumulated pause seconds for a run. - pub fn take_run_paused_seconds(&mut self, run_id: &str) -> i64 { - self.finish_run_pause(run_id); - let seconds = self.run_paused_seconds.remove(run_id).unwrap_or(0); - self.run_pause_thread_ids.remove(run_id); - seconds - } - - /// Clear all pause accounting state for a run. 
- pub fn cleanup_run_pause(&mut self, run_id: &str) { - self.run_pause_started_at.remove(run_id); - self.run_paused_seconds.remove(run_id); - self.run_pause_thread_ids.remove(run_id); } } @@ -186,84 +119,3 @@ impl AppState { } } } - -#[cfg(test)] -mod tests { - use super::GoalRuntimeState; - use chrono::{Duration, TimeZone, Utc}; - - #[test] - fn run_pause_tracking_is_idempotent_accumulative_and_cleared_on_take() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state.start_run_pause_at("run-1", start + Duration::seconds(10)); - - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(5)), - 5, - ); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(20)), - 5, - ); - - state.start_run_pause_at("run-1", start + Duration::seconds(30)); - assert_eq!( - state.finish_run_pause_at("run-1", start + Duration::seconds(37)), - 12, - ); - - assert_eq!(state.take_run_paused_seconds("run-1"), 12); - assert_eq!(state.take_run_paused_seconds("run-1"), 0); - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - } - - #[test] - fn cleanup_thread_removes_run_pause_state_for_that_thread() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state - .run_pause_thread_ids - .insert("run-1".to_string(), "thread-1".to_string()); - state.start_run_pause_at("run-1", start); - state - .run_pause_thread_ids - .insert("run-2".to_string(), "thread-2".to_string()); - state.start_run_pause_at("run-2", start); - state.run_paused_seconds.insert("run-1".to_string(), 3); - state.run_paused_seconds.insert("run-2".to_string(), 5); - - 
state.cleanup_thread("thread-1"); - - assert!(!state.run_pause_started_at.contains_key("run-1")); - assert!(!state.run_paused_seconds.contains_key("run-1")); - assert!(!state.run_pause_thread_ids.contains_key("run-1")); - assert!(state.run_pause_started_at.contains_key("run-2")); - assert_eq!(state.run_paused_seconds.get("run-2"), Some(&5)); - assert_eq!( - state.run_pause_thread_ids.get("run-2").map(String::as_str), - Some("thread-2"), - ); - } - - #[test] - fn run_pause_tracking_clamps_negative_intervals() { - let mut state = GoalRuntimeState::default(); - let start = Utc.with_ymd_and_hms(2026, 5, 31, 12, 0, 0).unwrap(); - - state.start_run_pause_at("run-1", start); - - assert_eq!( - state.finish_run_pause_at("run-1", start - Duration::seconds(5)), - 0, - ); - } -} diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index fa6d1101..f3d3ff2b 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -19,12 +19,6 @@ pub struct GoalEvaluationOutcome { /// Default maximum turns for a goal before auto-pausing. const DEFAULT_MAX_TURNS: i64 = 50; -/// Tool name used to mark a goal as fully achieved ("score" the goal). -pub const GOAL_SCORED_TOOL_NAME: &str = "goal_scored"; - -/// Exact pledge text the agent must pass verbatim when calling `goal_scored`. -pub const GOAL_SCORED_PLEDGE: &str = "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output."; - /// Continuation prompt injected when the goal is still active. const CONTINUATION_PROMPT_TEMPLATE: &str = "\ [Goal continuation — turns {turns_used}/{max_turns}] @@ -33,26 +27,28 @@ const CONTINUATION_PROMPT_TEMPLATE: &str = "\ Continue working toward this objective. Take the next concrete step. 
-⚠️ When the goal is fully achieved, you MUST call: - goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") -Without this call, the system will keep injecting continuation prompts. +⚠️ Completion is now decided by independent verification. When you believe the +goal is achieved, you MUST call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +A Judge will evaluate whether the project satisfies the goal's consistency and +completeness. +- The goal is only marked verified when the Judge returns passed=true. +- If a previous Judge verification did not pass, read its findings, fix each one, + then call agent_judge again. +You cannot declare completion yourself; only a passing Judge verdict counts. If you are blocked and need user input, use the clarify tool."; -/// Challenge prompt when the model claimed completion but did not use the tool. +/// Challenge prompt when the model claimed completion but has not requested +/// Judge verification yet. const CHALLENGE_EVIDENCE_PROMPT: &str = "\ -Before claiming the goal is complete, please provide concrete evidence: - -1. What verification commands did you run? What was the output? -2. What files did you modify? What was the purpose of each change? +You appear to believe the goal is complete, but you have not requested independent +verification. You cannot self-declare completion. -Once you have evidence, call goal_scored(status=\"complete\", evidence=\"...\", pledge=\"...\") . -If the goal is not actually complete, ignore this prompt and continue working."; - -/// Challenge prompt when the model claimed completion but evidence was empty. -const MISSING_EVIDENCE_PROMPT: &str = "\ -You called goal_scored(complete) but did not provide evidence. 
-Please provide completion evidence and call goal_scored(status=\"complete\", evidence=\"\", pledge=\"\") again."; +When you are confident the goal is achieved, call: + agent_judge(task=\"explain why you believe the goal is achieved / what to verify\") +The goal is only marked verified when the Judge returns passed=true. If the goal +is not actually complete, ignore this prompt and continue working."; /// Guidance prompt when the agent appears stuck. const GUIDANCE_PROMPT: &str = "\ @@ -152,13 +148,17 @@ impl GoalManager { status: GoalStatus::Active, token_budget, tokens_used: 0, - time_used_seconds: 0, turns_used: 0, max_turns: DEFAULT_MAX_TURNS, pause_reason: None, pause_detail: None, evidence: None, last_evaluated_run_id: None, + judge_passed: false, + judge_completeness: None, + judge_findings: None, + judge_summary: None, + judge_evaluated_run_id: None, created_at: Utc::now(), updated_at: Utc::now(), }; @@ -212,29 +212,6 @@ impl GoalManager { Ok(()) } - /// Mark the goal as complete with evidence. - pub async fn mark_complete(&self, goal_id: &str, evidence: &str) -> Result<(), AppError> { - if evidence.trim().is_empty() { - return Err(AppError::validation( - ErrorSource::Settings, - "evidence is required to mark a goal as complete", - )); - } - let updated = goal_repo::update_status( - &self.pool, - goal_id, - GoalStatus::Complete, - None, - None, - Some(evidence), - ) - .await?; - if !updated { - return Err(AppError::not_found(ErrorSource::Settings, "goal")); - } - Ok(()) - } - /// Mark the goal as budget-limited. pub async fn mark_budget_limited(&self, goal_id: &str) -> Result<(), AppError> { let updated = goal_repo::update_status( @@ -273,14 +250,9 @@ impl GoalManager { goal_repo::delete_by_thread_id(&self.pool, &self.thread_id).await } - /// Account usage after a turn. Increments turn count, tokens, and time. 
- pub async fn account_usage( - &self, - goal_id: &str, - tokens: i64, - time_seconds: i64, - ) -> Result<(), AppError> { - goal_repo::account_usage(&self.pool, goal_id, tokens, time_seconds, 1).await + /// Account usage after a turn. Increments turn count and tokens. + pub async fn account_usage(&self, goal_id: &str, tokens: i64) -> Result<(), AppError> { + goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await } // ── Auto-resume ── @@ -353,7 +325,7 @@ impl GoalManager { .remove(&self.thread_id); return GoalVerdict::Paused { reason: PauseReason::IdleBlocked, - detail: Some("agent repeatedly claimed completion without providing evidence via goal_scored".into()), + detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()), }; } return GoalVerdict::ChallengeEvidence; @@ -412,11 +384,11 @@ impl GoalManager { detail: Some("agent published a plan, awaiting approval".into()), }); } - // goal_scored is handled by the tool execution pipeline - // (agent_session_execution) which validates pledge/evidence - // and marks the goal complete. Evaluation should not - // interfere — let it pass through to idle reset and budget - // checks. + // agent_judge is the main-agent-only acceptance request. It is + // handled by the tool execution pipeline (execute_judge_tool), + // which runs the Judge and records the verdict. Evaluation must + // not treat it as a blocking tool — like any tool call it shows + // the agent acted and should reset idle tendencies. _ => {} } } @@ -486,20 +458,44 @@ impl GoalManager { // ── Prompt generation ── - /// Generate the continuation prompt for the next turn. + /// Generate the continuation prompt for the next turn. When a prior Judge + /// verification did not pass, the most recent findings are appended so the + /// agent can fix them before re-requesting verification. 
pub fn render_continuation_prompt(&self, goal: &GoalRecord) -> String { - CONTINUATION_PROMPT_TEMPLATE + let mut prompt = CONTINUATION_PROMPT_TEMPLATE .replace("{objective}", &goal.objective) .replace("{turns_used}", &goal.turns_used.to_string()) - .replace("{max_turns}", &goal.max_turns.to_string()) + .replace("{max_turns}", &goal.max_turns.to_string()); + + if goal.judge_evaluated_run_id.is_some() && !goal.judge_passed { + if let Some(findings_json) = goal.judge_findings.as_deref() { + if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { + let findings: Vec<String> = findings + .into_iter() + .filter(|f| !f.trim().is_empty()) + .take(10) + .collect(); + if !findings.is_empty() { + prompt.push_str( + "\n\nMost recent Judge findings to address before re-verifying:", + ); + for finding in findings { + let trimmed = finding.trim(); + let truncated: String = trimmed.chars().take(500).collect(); + prompt.push_str(&format!("\n- {truncated}")); + } + } + } + } + } + + prompt } - /// Generate a challenge-evidence prompt when the model failed to provide evidence. - pub fn render_challenge_prompt(&self, variant: ChallengePromptVariant) -> String { - match variant { - ChallengePromptVariant::NoEvidence => MISSING_EVIDENCE_PROMPT.to_string(), - ChallengePromptVariant::NoTool => CHALLENGE_EVIDENCE_PROMPT.to_string(), - } + /// Generate a challenge prompt nudging the agent to request Judge + /// verification when it claims completion without calling `agent_judge`. + pub fn render_challenge_prompt(&self) -> String { + CHALLENGE_EVIDENCE_PROMPT.to_string() } /// Generate a guidance prompt when the agent appears stuck. @@ -517,7 +513,18 @@ impl GoalManager { None => return Ok(None), }; + // Acceptance is now decided exclusively by the Judge: a verified goal is + // `Complete && judge_passed`. Any non-Active goal stops continuation, + // preserving existing pause/budget semantics. 
The legacy combination + // `Complete && !judge_passed` should not occur after migration backfill; + // if it does, log it and still stop continuation rather than re-opening. if goal.status != GoalStatus::Active { + if goal.status == GoalStatus::Complete && !goal.judge_passed { + tracing::warn!( + goal_id = %goal.id, + "goal is Complete without judge_passed; treating as terminal and not re-opening" + ); + } return Ok(Some(GoalEvaluationOutcome { goal: Self::to_payload(&goal), verdict: "skipped".to_string(), @@ -594,23 +601,21 @@ impl GoalManager { GoalVerdict::BudgetLimited => { self.mark_budget_limited(&current.id).await?; } - GoalVerdict::Complete { .. } => {} } + // Bump goal turn counter for any run that did real work. We still consult + // run duration to filter out zero-work runs (e.g. an immediately-interrupted + // run shouldn't burn a turn against max_turns); active running time is + // tracked separately on thread_runs.elapsed_running_secs and is no longer + // billed against the goal here. 
if let Some(run_seconds) = crate::persistence::repo::run_repo::get_run_duration(&self.pool, run_id) .await .unwrap_or(None) { - let paused_seconds = self.lock_runtime().take_run_paused_seconds(run_id).max(0); - let billable_seconds = (run_seconds - paused_seconds).max(0); - if billable_seconds > 0 { - self.account_usage(&current.id, 0, billable_seconds) - .await - .ok(); + if run_seconds > 0 { + self.account_usage(&current.id, 0).await.ok(); } - } else { - self.lock_runtime().take_run_paused_seconds(run_id); } let updated = self.get_active().await?; @@ -626,11 +631,14 @@ impl GoalManager { ), GoalVerdict::ChallengeEvidence => ( "challenge_evidence", - Some(self.render_challenge_prompt(ChallengePromptVariant::NoTool)), + Some(format!( + "{}\n\n{}", + self.render_challenge_prompt(), + self.render_continuation_prompt(updated.as_ref().unwrap_or(&goal)) + )), ), GoalVerdict::Paused { reason: _, detail } => ("paused", detail.clone()), GoalVerdict::BudgetLimited => ("budget_limited", None), - GoalVerdict::Complete { .. } => ("complete", None), }; Ok(Some(GoalEvaluationOutcome { @@ -640,11 +648,3 @@ impl GoalManager { })) } } - -/// Variants for challenge prompts. -pub enum ChallengePromptVariant { - /// Model called goal_scored(complete) but evidence was empty. - NoEvidence, - /// Model claimed completion in text but didn't use the tool. 
- NoTool, -} diff --git a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs index 3e7c334a..ae7a2fb3 100644 --- a/src-tauri/src/core/prompt/sources/custom_subagent_body.rs +++ b/src-tauri/src/core/prompt/sources/custom_subagent_body.rs @@ -61,6 +61,25 @@ impl SectionSource for SubagentBodySource { }, })) } + Some(SubagentProfile::Judge) => { + let template = include_str!("../templates/subagent/judge.md"); + let (_tmpl, body) = + super::super::templates::parse_front_matter(template).map_err(|e| { + FatalError::new("template.parse", format!("subagent/judge.md: {e}")) + })?; + let vars = super::super::templates::TemplateVars::new(); + let rendered = super::super::templates::render_template_strict(&body, &[], &vars) + .map_err(|e| { + FatalError::new("template.render", format!("subagent/judge.md: {e}")) + })?; + Ok(SectionOutcome::Produced(SectionBody { + markdown: rendered, + meta: SectionMeta { + template_path: Some("templates/subagent/judge.md"), + ..Default::default() + }, + })) + } Some(SubagentProfile::Custom { system_prompt, .. 
}) => { if system_prompt.trim().is_empty() { return Ok(SectionOutcome::Skip); diff --git a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs index 1ad848d0..c055e5c3 100644 --- a/src-tauri/src/core/prompt/sources/subagent_output_contract.rs +++ b/src-tauri/src/core/prompt/sources/subagent_output_contract.rs @@ -16,6 +16,9 @@ const EXPLORE_TEMPLATE_EMBEDDED: &str = const REVIEW_TEMPLATE_REL_PATH: &str = "subagent/output_contract.review.md"; const REVIEW_TEMPLATE_EMBEDDED: &str = include_str!("../templates/subagent/output_contract.review.md"); +const JUDGE_TEMPLATE_REL_PATH: &str = "subagent/output_contract.judge.md"; +const JUDGE_TEMPLATE_EMBEDDED: &str = + include_str!("../templates/subagent/output_contract.judge.md"); const DECLARED_KEYS: &[&'static str] = &[]; /// Template-backed SectionSource for the SubagentOutputContract section. @@ -42,6 +45,7 @@ impl SectionSource for SubagentOutputContractSource { (EXPLORE_TEMPLATE_REL_PATH, EXPLORE_TEMPLATE_EMBEDDED) } Some(SubagentProfile::Review) => (REVIEW_TEMPLATE_REL_PATH, REVIEW_TEMPLATE_EMBEDDED), + Some(SubagentProfile::Judge) => (JUDGE_TEMPLATE_REL_PATH, JUDGE_TEMPLATE_EMBEDDED), Some(SubagentProfile::Custom { .. 
}) => { // Custom subagents get a generic output contract return Ok(SectionOutcome::Produced(SectionBody::markdown( diff --git a/src-tauri/src/core/prompt/surface.rs b/src-tauri/src/core/prompt/surface.rs index 009aef9b..554bb0b4 100644 --- a/src-tauri/src/core/prompt/surface.rs +++ b/src-tauri/src/core/prompt/surface.rs @@ -10,6 +10,8 @@ pub enum PromptSurface { SubagentExplore { inherited_run_mode: RunMode }, /// Built-in review subagent SubagentReview { inherited_run_mode: RunMode }, + /// Built-in goal acceptance Judge subagent + SubagentJudge { inherited_run_mode: RunMode }, /// User-defined custom subagent SubagentCustom { slug: String, @@ -46,9 +48,9 @@ pub enum SurfacePattern { AnyMainAgent, /// Matches a specific MainAgent run_mode MainAgent(RunMode), - /// Matches any subagent surface (explore, review, custom) + /// Matches any subagent surface (explore, review, judge, custom) AnySubagent, - /// Matches built-in explore + review subagents only + /// Matches built-in explore + review + judge subagents only BuiltinSubagent, /// Matches any custom subagent regardless of slug CustomSubagent, @@ -70,9 +72,11 @@ impl SurfacePattern { } (SurfacePattern::AnySubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::AnySubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentExplore { .. }) => true, (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentReview { .. }) => true, + (SurfacePattern::BuiltinSubagent, PromptSurface::SubagentJudge { .. }) => true, (SurfacePattern::CustomSubagent, PromptSurface::SubagentCustom { .. }) => true, (SurfacePattern::Compaction(k), PromptSurface::Compaction { kind }) => k == kind, (SurfacePattern::AnyCompaction, PromptSurface::Compaction { .. 
}) => true, diff --git a/src-tauri/src/core/prompt/surface_extensions.rs b/src-tauri/src/core/prompt/surface_extensions.rs index c8ebfb68..4d7f245b 100644 --- a/src-tauri/src/core/prompt/surface_extensions.rs +++ b/src-tauri/src/core/prompt/surface_extensions.rs @@ -27,6 +27,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { run_mode } => SurfacePattern::MainAgent(*run_mode), PromptSurface::SubagentExplore { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentReview { .. } => SurfacePattern::AnySubagent, + PromptSurface::SubagentJudge { .. } => SurfacePattern::AnySubagent, PromptSurface::SubagentCustom { .. } => SurfacePattern::CustomSubagent, PromptSurface::Compaction { kind } => SurfacePattern::Compaction(*kind), PromptSurface::Title => SurfacePattern::Title, @@ -43,6 +44,7 @@ impl SurfaceExtension for PromptSurface { PromptSurface::MainAgent { .. } | PromptSurface::SubagentExplore { .. } | PromptSurface::SubagentReview { .. } + | PromptSurface::SubagentJudge { .. } | PromptSurface::SubagentCustom { .. } ) } @@ -76,6 +78,9 @@ mod tests { PromptSurface::SubagentReview { inherited_run_mode: RunMode::Default, }, + PromptSurface::SubagentJudge { + inherited_run_mode: RunMode::Default, + }, PromptSurface::SubagentCustom { slug: "test".into(), inherited_run_mode: RunMode::Default, diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md index 0bf5ffa4..c36eb3dd 100644 --- a/src-tauri/src/core/prompt/templates/active_goal.tpl.md +++ b/src-tauri/src/core/prompt/templates/active_goal.tpl.md @@ -8,15 +8,15 @@ declared_keys: [max_turns, objective, turns_used] Objective: {{objective}} Turns used: {{turns_used}}/{{max_turns}} -**Completion requirements — ALL must be met before calling goal_scored(complete):** -1. Every subtask implied by the objective is done. No remaining work, no dangling follow-ups. -2. 
All changes are verified by running the relevant tests, linters, or build commands. -3. Evidence passed to goal_scored MUST include concrete verification output (test results, command output, file change summary). -Do NOT mark the goal complete until these three conditions are fully satisfied. +**Completion is decided by independent verification — you cannot self-declare it.** +1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups. +2. Verify your work by running the relevant tests, linters, or build commands as you go. +3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`. Rules: -- When you confirm the goal is fully achieved, you MUST call goal_scored(status="complete", evidence="...", pledge="...") to mark it as scored. This is the only way to mark the goal as achieved. -- The goal_scored tool requires a 'pledge' parameter. You MUST pass this exact text verbatim: "I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output." -- Do NOT claim completion without verifiable evidence -- If blocked and need user input, use clarify tool -- The system will automatically continue this goal across turns +- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness. +- The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself. +- If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again. +- Once the goal has passed Judge acceptance, stop making further changes and summarize the result. 
+- If blocked and you need user input, use the clarify tool. +- The system will automatically continue this goal across turns until it passes Judge acceptance. diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md new file mode 100644 index 00000000..58f8b873 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -0,0 +1,64 @@ +--- +section_id: SubagentJudge +version: 1 +declared_keys: [] +--- +You are the **Goal Acceptance Judge** — an independent verifier. The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. + +You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. Goal tasks are typically long-horizon with broad change surfaces, so your evaluation must scale: be thorough enough to catch real gaps, efficient enough to converge in one pass, and honest about what you actually verified. + +## Operating principle: size first, then verify + +Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. + +### Step 1 — Size the change (always do this first) +- Run `git_status` and `git_diff --stat` (or the project's equivalent) to enumerate changed files, additions/deletions, and the rough surface area. +- Cross-reference with the goal objective: identify which subsystems / layers / acceptance criteria each cluster of changes maps to. 
+- Form an explicit mental model before any deep reading: + - **Small** — ≤ ~5 files changed, single module/layer, narrow concern. One linear pass is enough. + - **Medium** — ~6–20 files, 2–3 subsystems or layers touched, multiple acceptance criteria. + - **Large** — > 20 files, cross-cutting changes, multiple independent topics (e.g. backend + frontend + tests + config + docs), or the goal lists many distinct subtasks. +- Use these as guidance, not hard rules: a 3-file change that touches a security boundary may still warrant Large-style scrutiny; a 40-file rename may collapse to Small. +- If the change scope is genuinely tiny relative to the goal (e.g. goal asks for a feature but the diff shows trivial edits), that itself is strong evidence of incompleteness — record it and probe further before concluding. + +### Step 2 — Pick a verification strategy that matches the size +- **Small change** — verify directly. Read the changed files yourself, confirm each goal requirement against the actual code, run the targeted tests/type-checks. Do not delegate; the coordination overhead is not worth it. +- **Medium change** — split logically. Use one or two `agent_explore` / `agent_review` calls when a coherent slice (e.g. "review the new module + its consumers", "explore how config plumbing was wired") is too large to inspect in line without losing context. Run diagnostic commands (typecheck, targeted tests, lint) yourself. +- **Large change** — fan out with `agent_parallel`. Break the goal's acceptance surface into 2–5 independent topics and dispatch them in parallel. Good split axes: + - **By layer** — backend / frontend / persistence / config. + - **By subsystem** — auth / billing / notifications. + - **By concern** — functional correctness / regression risk / tests & docs / migration & compatibility. + - **By goal subtask** — one helper per acceptance criterion when the goal is itemized. 
+ Keep each subtask independent (no shared write state), bounded in scope, and concretely scoped to file lists or topics inferred from the diff. After the parallel batch returns, **synthesize the results yourself** — reconcile conflicts, call out failures or skipped items, and form one coherent verdict. Do not just concatenate helper outputs. + +### Step 3 — Run the verification commands the project actually uses +- Adapt commands to this repository (infer from manifests, scripts, CI config, and workspace instructions). Do not assume a stack. +- Prefer the *narrowest* command that still covers the changed surface (e.g. test only the affected package) before falling back to repo-wide runs. For Large changes a repo-wide build/typecheck is usually still warranted. +- When `agent_review` is delegated, treat its verification output as authoritative — do not rerun the same commands unless its results were inconclusive. + +## Delegation guidelines +- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does the codebase still reference Z?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. +- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='diff'` when the helper should look at the workspace changes; provide an explicit changed-file list when you already have one. +- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Prefer this over sequential helper calls whenever the topics are genuinely independent. Never recurse parallel into parallel. +- Do **not** delegate when: + - The change is small enough to inspect inline. + - The subtasks are interdependent (later ones need earlier results). + - You only need one shell command — just run it. +- Always tell each delegate explicitly: the goal text, which slice they own, what evidence to return, and that they are read-only. 
+ +## Hard constraints (read-only acceptance) +- Your file tools are read-only. Do **not** modify, create, or delete any files. +- The `shell` tool is for **diagnostic and verification commands only** — tests, type-checks, linters, builds, and read-only inspection (`git status`, `git diff`, `git log`, `cat`, `ls`, etc.). You must **never** use shell to edit or delete files, install dependencies, change global or system state, or start interactive / long-running / daemon processes. +- Do not attempt to fix the goal yourself. If something is incomplete, report it as a finding so the main agent can fix it. +- Helpers you delegate to inherit the same read-only constraint; remind them in the task text when relevant. + +## Coverage honesty +- Track what you actually verified vs. what you sampled vs. what you skipped. A Large change you only spot-checked is **not** the same as a Large change you fully covered. +- When delegating, if any helper failed, returned inconclusive results, or could not run a command, treat that area as **not verified** — record it explicitly and let it influence the verdict. +- Never imply a check passed without trustworthy evidence. If your `summary` cannot point to specific files, commands, or behaviors you confirmed, you do not have a basis to pass. + +## Verdict rules +- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps **and** your verification covered the full change surface (directly or via successful delegates). When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. +- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. 
+- Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. +- Calibrate `completenessPct` to actual coverage and remaining gaps, not to effort spent. A change that does 80% of the goal correctly is 80, not 100, even if the implemented parts are flawless. diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md new file mode 100644 index 00000000..a695dd71 --- /dev/null +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -0,0 +1,21 @@ +--- +section_id: SubagentOutputContractJudge +version: 1 +declared_keys: [] +--- +Your output will be consumed by the parent agent and the goal acceptance pipeline, not the user. Follow any response language instructions inherited above for natural-language fields (`findings`, `summary`). + +Return exactly one JSON object with this contract and nothing else (no markdown fences, headings, or prose before or after it): + +{ + "passed": true, + "completenessPct": 100, + "findings": [], + "summary": "Concise but specific evidence for the verdict (verified requirements, commands run and their results)." +} + +Field rules: +- `passed` (boolean): true only when the project genuinely satisfies the goal. +- `completenessPct` (integer 0-100): your honest estimate of how complete the work is against the goal. +- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken point. REQUIRED and non-empty when `passed=false`. +- `summary` (string): rationale for the verdict. REQUIRED and non-empty when `passed=true` — it becomes the goal's completion evidence. If you cannot provide real evidence, set `passed=false`. 
diff --git a/src-tauri/src/core/subagent/judge_contract.rs b/src-tauri/src/core/subagent/judge_contract.rs new file mode 100644 index 00000000..2a673d93 --- /dev/null +++ b/src-tauri/src/core/subagent/judge_contract.rs @@ -0,0 +1,287 @@ +use serde::{Deserialize, Serialize}; + +/// Input for the `agent_judge` tool (provided by the main agent). +#[derive(Debug, Clone)] +pub struct JudgeRequest { + /// The main agent's explanation of why it believes the goal is achieved, + /// and/or points it wants the Judge to focus on. + pub task: String, +} + +impl JudgeRequest { + pub fn from_tool_input(tool_input: &serde_json::Value) -> Result<Self, String> { + let task = tool_input + .get("task") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + .trim() + .to_string(); + + if task.is_empty() { + return Err("missing agent_judge task".to_string()); + } + + Ok(Self { task }) + } +} + +/// Structured verdict produced by the Judge subagent. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct JudgeReport { + /// Whether the project currently satisfies the goal (acceptance passes). + pub passed: bool, + /// Completeness percentage 0-100. + pub completeness_pct: u8, + /// Specific unmet / non-conforming points. Required when `passed=false`. + #[serde(default)] + pub findings: Vec<String>, + /// Rationale for the verdict. Used as completion evidence when `passed=true`. + #[serde(default)] + pub summary: String, +} + +impl JudgeReport { + /// Build a failed report carrying a single finding (used as a safe fallback + /// when the Judge output cannot be parsed). + fn failed_with_finding(finding: String) -> Self { + Self { + passed: false, + completeness_pct: 0, + findings: vec![finding], + summary: String::new(), + } + } + + /// Normalize a parsed report so it can never represent an unverifiable + /// acceptance: + /// - `completeness_pct` is clamped to 0-100. + /// - `passed=true` with an empty `summary` is downgraded to `passed=false`. 
+ /// - `passed=false` with no findings gets a placeholder finding. + fn normalized(mut self) -> Self { + if self.completeness_pct > 100 { + self.completeness_pct = 100; + } + + if self.passed && self.summary.trim().is_empty() { + self.passed = false; + self.findings + .push("Judge reported passed=true but provided no summary/evidence; downgraded to not passed.".to_string()); + } + + if !self.passed && self.findings.is_empty() { + self.findings + .push("Judge did not provide actionable findings.".to_string()); + } + + self + } +} + +/// Parse the Judge's textual output into a `JudgeReport`. On any parse failure +/// the result is a *failed* report carrying the raw text as a finding, so a +/// malformed Judge response can never be mistaken for acceptance. +pub fn extract_judge_report(text: &str) -> JudgeReport { + let trimmed = text.trim(); + if trimmed.is_empty() { + return JudgeReport::failed_with_finding("Judge produced no output.".to_string()); + } + + if let Ok(report) = serde_json::from_str::<JudgeReport>(trimmed) { + return report.normalized(); + } + + let stripped = strip_code_fence(trimmed); + if let Ok(report) = serde_json::from_str::<JudgeReport>(stripped) { + return report.normalized(); + } + + if let Some(report) = extract_embedded_json(trimmed) { + return report.normalized(); + } + + JudgeReport::failed_with_finding(format!( + "Judge output could not be parsed as a JudgeReport. Raw output: {trimmed}" + )) +} + +/// Render a parent-facing summary of the verdict for the main agent. 
+pub fn render_parent_summary(report: &JudgeReport) -> String { + let mut lines = vec![format!( + "Judge verdict: {} (completeness {}%)", + if report.passed { + "PASSED" + } else { + "NOT PASSED" + }, + report.completeness_pct + )]; + + if !report.summary.trim().is_empty() { + lines.push(format!("Summary: {}", report.summary.trim())); + } + + if report.findings.is_empty() { + lines.push("Findings:\n- none".to_string()); + } else { + let rendered = report + .findings + .iter() + .map(|f| format!("- {}", f.trim())) + .collect::<Vec<_>>() + .join("\n"); + lines.push(format!("Findings:\n{rendered}")); + } + + if report.passed { + lines.push( + "✅ The goal has passed acceptance and is now marked complete. Stop making further changes and summarize the result.".to_string(), + ); + } else { + lines.push( + "❌ The goal has NOT passed acceptance. Fix the findings above, then call agent_judge again to re-verify.".to_string(), + ); + } + + lines.join("\n\n") +} + +fn strip_code_fence(text: &str) -> &str { + text.strip_prefix("```json") + .and_then(|value| value.strip_suffix("```")) + .map(str::trim) + .or_else(|| { + text.strip_prefix("```") + .and_then(|value| value.strip_suffix("```")) + .map(str::trim) + }) + .unwrap_or(text) +} + +/// Best-effort: pull the first balanced `{...}` JSON object out of mixed prose +/// and try to parse it as a `JudgeReport`. 
+fn extract_embedded_json(text: &str) -> Option<JudgeReport> { + let start = text.find('{')?; + let bytes = text.as_bytes(); + let mut depth = 0usize; + let mut in_string = false; + let mut escaped = false; + for (idx, &b) in bytes.iter().enumerate().skip(start) { + if in_string { + if escaped { + escaped = false; + } else if b == b'\\' { + escaped = true; + } else if b == b'"' { + in_string = false; + } + continue; + } + match b { + b'"' => in_string = true, + b'{' => depth += 1, + b'}' => { + depth -= 1; + if depth == 0 { + let candidate = &text[start..=idx]; + return serde_json::from_str::<JudgeReport>(candidate).ok(); + } + } + _ => {} + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn judge_request_requires_task() { + assert!(JudgeRequest::from_tool_input(&serde_json::json!({})).is_err()); + let req = JudgeRequest::from_tool_input(&serde_json::json!({ "task": " verify it " })) + .expect("parses"); + assert_eq!(req.task, "verify it"); + } + + #[test] + fn extract_parses_plain_json() { + let report = extract_judge_report( + r#"{"passed":true,"completenessPct":100,"findings":[],"summary":"All tests pass."}"#, + ); + assert!(report.passed); + assert_eq!(report.completeness_pct, 100); + assert_eq!(report.summary, "All tests pass."); + } + + #[test] + fn extract_parses_json_fence() { + let report = extract_judge_report( + "```json\n{\"passed\":false,\"completenessPct\":40,\"findings\":[\"missing tests\"],\"summary\":\"\"}\n```", + ); + assert!(!report.passed); + assert_eq!(report.completeness_pct, 40); + assert_eq!(report.findings, vec!["missing tests"]); + } + + #[test] + fn extract_parses_embedded_json() { + let report = extract_judge_report( + "Here is my verdict:\n{\"passed\":true,\"completenessPct\":90,\"findings\":[],\"summary\":\"Looks good\"}\nThanks!", + ); + assert!(report.passed); + assert_eq!(report.summary, "Looks good"); + } + + #[test] + fn malformed_output_is_not_passed() { + let report = extract_judge_report("I think it's done, looks fine to 
me."); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn empty_output_is_not_passed() { + let report = extract_judge_report(" "); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn passed_with_empty_summary_is_downgraded() { + let report = extract_judge_report( + r#"{"passed":true,"completenessPct":100,"findings":[],"summary":" "}"#, + ); + assert!(!report.passed); + assert!(!report.findings.is_empty()); + } + + #[test] + fn completeness_is_clamped() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":250,"findings":["x"],"summary":""}"#, + ); + assert_eq!(report.completeness_pct, 100); + } + + #[test] + fn failed_with_no_findings_gets_placeholder() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":10,"findings":[],"summary":"incomplete"}"#, + ); + assert!(!report.passed); + assert_eq!(report.findings.len(), 1); + } + + #[test] + fn render_summary_includes_verdict_and_findings() { + let report = extract_judge_report( + r#"{"passed":false,"completenessPct":30,"findings":["A","B"],"summary":"not yet"}"#, + ); + let summary = render_parent_summary(&report); + assert!(summary.contains("NOT PASSED")); + assert!(summary.contains("- A")); + assert!(summary.contains("agent_judge again")); + } +} diff --git a/src-tauri/src/core/subagent/mod.rs b/src-tauri/src/core/subagent/mod.rs index 22760953..5bbd87f4 100644 --- a/src-tauri/src/core/subagent/mod.rs +++ b/src-tauri/src/core/subagent/mod.rs @@ -1,8 +1,10 @@ +pub mod judge_contract; pub mod orchestrator; pub mod parallel_contract; pub mod review_contract; pub mod runtime_orchestration; +pub use judge_contract::{extract_judge_report, JudgeReport, JudgeRequest}; pub use orchestrator::{ HelperAgentOrchestrator, HelperRunRequest, HelperRunResult, SubagentActivityStatus, SubagentProgressSnapshot, diff --git a/src-tauri/src/core/subagent/orchestrator.rs b/src-tauri/src/core/subagent/orchestrator.rs 
index 3641e7dd..0ee2c210 100644 --- a/src-tauri/src/core/subagent/orchestrator.rs +++ b/src-tauri/src/core/subagent/orchestrator.rs @@ -1127,6 +1127,15 @@ impl HelperDelegationContext { RuntimeOrchestrationTool::Parallel => { return Err("agent_parallel cannot be used as an individual helper".to_string()); } + RuntimeOrchestrationTool::Judge => { + // Hard gate: agent_judge is a main-agent-only tool. A subagent + // (including Judge itself) must never recursively request goal + // acceptance, even if the tool name was parsed successfully. + return Err( + "agent_judge can only be called by the main agent for the current goal" + .to_string(), + ); + } RuntimeOrchestrationTool::Custom(slug) => { crate::core::agent_session_tools::resolve_custom_subagent_profile_from_pool( &self.orchestrator.pool, @@ -1472,6 +1481,9 @@ async fn build_helper_system_prompt( SubagentProfile::Review => PromptSurface::SubagentReview { inherited_run_mode: rm, }, + SubagentProfile::Judge => PromptSurface::SubagentJudge { + inherited_run_mode: rm, + }, SubagentProfile::Custom { slug, .. } => PromptSurface::SubagentCustom { slug: slug.clone(), inherited_run_mode: rm, @@ -1897,7 +1909,7 @@ mod tests { #[test] fn validate_delegation_allows_review_to_explore_at_depth_2() { - // Main(1) → review(2): review can delegate, explore.max=3 >= 2. + // Main(1) → review(2): review can delegate, explore.max=5 >= 2. validate_delegation_capability( &SubagentProfile::Review, &RuntimeOrchestrationTool::Explore, @@ -1909,15 +1921,17 @@ mod tests { #[test] fn validate_delegation_rejects_when_child_depth_exceeds_target_max() { - // child_depth 4 exceeds explore.max_delegation_depth (3). + // Custom target with max=4 cannot be reached at depth 5 (exceeds its config but + // still within GLOBAL_MAX_DELEGATION_DEPTH). 
+ let target = custom_profile(true, 4); let err = validate_delegation_capability( &SubagentProfile::Review, - &RuntimeOrchestrationTool::Explore, - &SubagentProfile::Explore, - 4, + &RuntimeOrchestrationTool::Custom("shallow".to_string()), + &target, + 5, ) - .expect_err("depth 4 must exceed explore max depth 3"); - assert!(err.contains("max delegation depth is 3")); + .expect_err("depth 5 must exceed custom max depth 4"); + assert!(err.contains("max delegation depth is 4")); } #[test] diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index 27150c30..c458e098 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -14,7 +14,7 @@ pub const GLOBAL_MAX_DELEGATION_DEPTH: u32 = 5; /// Built-in default for the maximum delegation depth a built-in subagent /// (explore / review) may be delegated to. -pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 3; +pub const BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH: u32 = 5; pub const TERM_STATUS_TOOL_DESCRIPTION: &str = "Inspect the status of the desktop app's embedded Terminal panel session for the current thread. Use this to check that panel's session state without mutating it. It does not inspect the agent runtime, CLI process, or host shell outside the panel."; @@ -34,6 +34,10 @@ pub enum RuntimeOrchestrationTool { Explore, Review, Parallel, + /// Goal acceptance Judge. Main-agent-only tool (`agent_judge`): it is parsed + /// here for unified dispatch but is never part of `builtin_all()` nor any + /// helper's delegation tool set. 
+ Judge, Custom(String), // slug of the custom subagent } @@ -41,6 +45,7 @@ pub enum RuntimeOrchestrationTool { pub enum SubagentProfile { Explore, Review, + Judge, Custom { slug: String, name: String, @@ -130,6 +135,7 @@ impl RuntimeOrchestrationTool { "agent_explore" => Some(Self::Explore), "agent_review" => Some(Self::Review), "agent_parallel" => Some(Self::Parallel), + "agent_judge" => Some(Self::Judge), _ => { // Match custom subagent pattern: "agent_{slug}" if let Some(slug) = tool_name.strip_prefix("agent_") { @@ -151,6 +157,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "agent_explore".to_string(), Self::Review => "agent_review".to_string(), Self::Parallel => "agent_parallel".to_string(), + Self::Judge => "agent_judge".to_string(), Self::Custom(slug) => format!("agent_{slug}"), } } @@ -160,6 +167,7 @@ impl RuntimeOrchestrationTool { Self::Explore => "Agent Explore".to_string(), Self::Review => "Agent Review".to_string(), Self::Parallel => "Agent Parallel".to_string(), + Self::Judge => "Agent Judge".to_string(), Self::Custom(slug) => format!("Agent {slug}"), } } @@ -175,6 +183,9 @@ impl RuntimeOrchestrationTool { Self::Parallel => { "Delegate 1-5 independent subtasks to subagents with bounded concurrency. Use this for parallel exploration or review work only when tasks are independent and low side-effect; results are aggregated for the parent agent." } + Self::Judge => { + "Request independent acceptance verification of the current goal. The Judge inspects the project's current state (read-only, with diagnostic shell for tests/type-check/lint) against the goal and returns a structured verdict. You cannot self-declare completion — only a passing Judge verdict marks the goal verified. Call this when you believe the goal is achieved, or to re-verify after fixing prior findings." + } Self::Custom(_) => { // Custom subagents have their description set externally via custom_subagent_as_tool "Custom subagent." 
@@ -188,6 +199,7 @@ impl RuntimeOrchestrationTool { match self { Self::Explore => Some(SubagentProfile::Explore), Self::Review => Some(SubagentProfile::Review), + Self::Judge => Some(SubagentProfile::Judge), Self::Parallel | Self::Custom(_) => None, } } @@ -339,6 +351,16 @@ impl RuntimeOrchestrationTool { }, "required": ["task"] }), + Self::Judge => serde_json::json!({ + "type": "object", + "properties": { + "task": { + "type": "string", + "description": "Explain why you believe the goal is achieved and call out anything the Judge should focus on (e.g. acceptance criteria, areas you are unsure about). If you are re-verifying after fixing earlier findings, summarize what you changed." + } + }, + "required": ["task"] + }), }; let name = self.tool_name(); @@ -353,6 +375,7 @@ impl SubagentProfile { match self { Self::Explore => "helper_explore".to_string(), Self::Review => "helper_review".to_string(), + Self::Judge => "helper_judge".to_string(), Self::Custom { slug, .. } => format!("helper_custom_{slug}"), } } @@ -364,6 +387,8 @@ impl SubagentProfile { match self { Self::Explore => false, Self::Review => true, + // Judge may delegate explore/review/parallel to gather evidence. + Self::Judge => true, Self::Custom { can_delegate, .. } => *can_delegate, } } @@ -374,6 +399,11 @@ impl SubagentProfile { pub fn max_delegation_depth(&self) -> u32 { match self { Self::Explore | Self::Review => BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH, + // Judge is delegated by the main agent (depth 1) and must be + // accepted at depth 2 (the main agent's child depth). It may itself + // delegate explore/review at depth 3, which remains within + // BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH and GLOBAL_MAX_DELEGATION_DEPTH. + Self::Judge => 2, Self::Custom { max_delegation_depth, .. 
@@ -435,6 +465,7 @@ impl SubagentProfile { match self { Self::Explore => include_str!("../prompt/templates/subagent/explore.md").to_string(), Self::Review => include_str!("../prompt/templates/subagent/review.md").to_string(), + Self::Judge => include_str!("../prompt/templates/subagent/judge.md").to_string(), Self::Custom { system_prompt, .. } => system_prompt.clone(), } } @@ -632,6 +663,74 @@ impl SubagentProfile { ]); } + if *self == Self::Judge { + // Judge keeps file tools read-only but is allowed a diagnostic-only + // shell plus read-only git/terminal inspection for verification. + tools.extend([ + AgentTool::new( + "git_status", + "Git Status", + "Inspect repository status in the current workspace without modifying anything.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to narrow the status query." } + } + }), + ), + AgentTool::new( + "git_diff", + "Git Diff", + "Read the current Git diff in the workspace, optionally scoped to a path or staged changes.", + serde_json::json!({ + "type": "object", + "properties": { + "path": { "type": "string", "description": "Optional relative path to inspect." }, + "staged": { "type": "boolean", "description": "Set true to inspect staged changes instead of working tree changes." }, + "contextLines": { + "type": "integer", + "minimum": 1, + "maximum": 20, + "description": "Optional number of unified diff context lines. Defaults to 3 and is capped for safety." + } + } + }), + ), + AgentTool::new( + "term_status", + "Terminal Status", + TERM_STATUS_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "term_output", + "Terminal Output", + TERM_OUTPUT_TOOL_DESCRIPTION, + serde_json::json!({ + "type": "object", + "properties": {} + }), + ), + AgentTool::new( + "shell", + "Run Command", + "Run a non-interactive shell command inside the current workspace. 
Judge may use this ONLY for diagnostic and verification commands such as tests, type-checks, linters, and read-only inspection. Never use it to modify files, delete data, install dependencies, start long-running or interactive processes, or change global state.", + serde_json::json!({ + "type": "object", + "properties": { + "command": { "type": "string" }, + "cwd": { "type": "string" }, + "timeout": { "type": "number" } + }, + "required": ["command"] + }), + ), + ]); + } + tools } @@ -902,6 +1001,54 @@ mod tests { ); } + #[test] + fn judge_tool_parses_but_is_not_in_builtin_catalog() { + assert_eq!( + RuntimeOrchestrationTool::parse("agent_judge"), + Some(RuntimeOrchestrationTool::Judge) + ); + // Judge is main-agent-only: it must NOT be part of the built-in + // delegation catalog that subagents can reach. + let catalog = runtime_orchestration_tools(); + assert!(!catalog.iter().any(|tool| tool.name == "agent_judge")); + } + + #[test] + fn judge_profile_is_read_only_with_diagnostic_shell() { + let tools = SubagentProfile::Judge.helper_tools(false); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + + assert!(tool_names.contains(&"read")); + assert!(tool_names.contains(&"list")); + assert!(tool_names.contains(&"find")); + assert!(tool_names.contains(&"search")); + assert!(tool_names.contains(&"shell")); + // Read-only: no file mutation or interactive terminal tools. 
+ assert!(!tool_names.contains(&"edit")); + assert!(!tool_names.contains(&"write")); + assert!(!tool_names.contains(&"term_write")); + assert!(!tool_names.contains(&"term_restart")); + assert!(!tool_names.contains(&"term_close")); + } + + #[test] + fn judge_can_delegate_at_depth_two() { + assert!(SubagentProfile::Judge.can_delegate()); + assert_eq!(SubagentProfile::Judge.max_delegation_depth(), 2); + assert_eq!(SubagentProfile::Judge.helper_kind(), "helper_judge"); + } + + #[test] + fn judge_is_never_a_delegation_target_for_helpers() { + // Even a Judge that can delegate only receives explore/review/parallel, + // never agent_judge. + let tools = SubagentProfile::Judge.delegation_tools_for_helper(3, &[]); + let tool_names: Vec<&str> = tools.iter().map(|tool| tool.name.as_str()).collect(); + assert!(!tool_names.contains(&"agent_judge")); + assert!(tool_names.contains(&"agent_explore")); + assert!(tool_names.contains(&"agent_review")); + } + #[test] fn agent_parallel_tool_schema_has_bounded_tasks() { let tool = RuntimeOrchestrationTool::Parallel.as_agent_tool(); @@ -1042,8 +1189,8 @@ mod tests { #[test] fn review_profile_omits_delegation_tools_beyond_builtin_depth() { - // child_depth 4 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (3). - let tools = SubagentProfile::Review.delegation_tools_for_helper(4, &[]); + // child_depth 6 exceeds BUILTIN_DEFAULT_MAX_DELEGATION_DEPTH (5). + let tools = SubagentProfile::Review.delegation_tools_for_helper(6, &[]); assert!(tools.is_empty()); } diff --git a/src-tauri/src/gateway/gateway_runner.rs b/src-tauri/src/gateway/gateway_runner.rs index 62889d44..42f4167b 100644 --- a/src-tauri/src/gateway/gateway_runner.rs +++ b/src-tauri/src/gateway/gateway_runner.rs @@ -920,7 +920,7 @@ async fn dispatch_command( .await?; // Build a kickoff prompt similar to the GUI /goal path let kickoff = format!( - "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. 
Work toward it.\nWhen the goal is fully achieved, you MUST call:\n```json\ngoal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")\n```\nDo NOT mark complete without verified evidence.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", + "## Persistent Goal Started\n\nYou are now working on the following goal:\n\n**{}**\n\nThis goal has been created and is now **active**. Work toward it.\nCompletion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:\n```json\nagent_judge(task=\"explain why you believe the goal is achieved / what to verify\")\n```\nAn independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. If a verification does not pass, fix the reported findings and call agent_judge again.\n\nIf you need user input before proceeding, use the clarify tool.\nThe goal will automatically pause and resume when the user responds.", objective, ); run_agent_prompt( diff --git a/src-tauri/src/ipc/frontend_channels.rs b/src-tauri/src/ipc/frontend_channels.rs index 9778222e..48990b7d 100644 --- a/src-tauri/src/ipc/frontend_channels.rs +++ b/src-tauri/src/ipc/frontend_channels.rs @@ -223,9 +223,10 @@ pub enum ThreadStreamEvent { error: Option, }, // ── Goal events ── - // GoalStateUpdated and GoalCompleted are emitted by execute_goal_tool - // (create_goal, goal_scored tools in AgentSession). 
GoalContinuation and - // GoalPaused are emitted by backend run-lifecycle goal orchestration after + // GoalStateUpdated and GoalCompleted are emitted by the agent_judge + // acceptance flow (execute_judge_tool in AgentSession) when the Judge + // records a verdict. GoalContinuation and GoalPaused are emitted by backend + // run-lifecycle goal orchestration after // terminal runs are evaluated. The frontend also consumes goal state via // goal_get_state / goal_evaluate command APIs. GoalStateUpdated { diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index cbef21a4..62f129d2 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -98,10 +98,9 @@ impl PauseReason { pub enum GoalVerdict { /// Goal is still active — inject continuation prompt Continue, - /// Model claimed completion but evidence is missing — inject challenge + /// Model claimed completion but has not yet requested Judge verification — + /// inject a challenge nudging it to call `agent_judge`. ChallengeEvidence, - /// Goal achieved with evidence - Complete { evidence: String }, /// Goal paused for a specific reason Paused { reason: PauseReason, @@ -120,13 +119,22 @@ pub struct GoalRecord { pub status: GoalStatus, pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, pub pause_reason: Option, pub pause_detail: Option, pub evidence: Option, pub last_evaluated_run_id: Option, + /// Whether the most recent Judge verdict passed acceptance. + pub judge_passed: bool, + /// Latest Judge completeness percentage (0-100), if evaluated. + pub judge_completeness: Option, + /// Latest Judge findings as a JSON array string, if evaluated. + pub judge_findings: Option, + /// Latest Judge summary / acceptance rationale, if evaluated. + pub judge_summary: Option, + /// Run id of the run during which the latest Judge verdict was recorded. 
+ pub judge_evaluated_run_id: Option, pub created_at: DateTime, pub updated_at: DateTime, } @@ -142,7 +150,6 @@ pub struct GoalDto { #[serde(skip_serializing_if = "Option::is_none")] pub token_budget: Option, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -153,6 +160,15 @@ pub struct GoalDto { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, pub created_at: String, pub updated_at: String, } @@ -166,13 +182,17 @@ impl From for GoalDto { status: r.status, token_budget: r.token_budget, tokens_used: r.tokens_used, - time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, pause_reason: r.pause_reason, pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, created_at: r.created_at.to_rfc3339(), updated_at: r.updated_at.to_rfc3339(), } @@ -195,7 +215,6 @@ pub struct GoalPayload { pub objective: String, pub status: GoalStatus, pub tokens_used: i64, - pub time_used_seconds: i64, pub turns_used: i64, pub max_turns: i64, #[serde(skip_serializing_if = "Option::is_none")] @@ -208,6 +227,15 @@ pub struct GoalPayload { pub evidence: Option, #[serde(skip_serializing_if = "Option::is_none")] pub last_evaluated_run_id: Option, + pub judge_passed: bool, + #[serde(skip_serializing_if = 
"Option::is_none")] + pub judge_completeness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_findings: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge_evaluated_run_id: Option, } impl From for GoalPayload { @@ -218,7 +246,6 @@ impl From for GoalPayload { objective: r.objective, status: r.status, tokens_used: r.tokens_used, - time_used_seconds: r.time_used_seconds, turns_used: r.turns_used, max_turns: r.max_turns, token_budget: r.token_budget, @@ -226,6 +253,11 @@ impl From for GoalPayload { pause_detail: r.pause_detail, evidence: r.evidence, last_evaluated_run_id: r.last_evaluated_run_id, + judge_passed: r.judge_passed, + judge_completeness: r.judge_completeness, + judge_findings: r.judge_findings, + judge_summary: r.judge_summary, + judge_evaluated_run_id: r.judge_evaluated_run_id, } } } diff --git a/src-tauri/src/model/subagent.rs b/src-tauri/src/model/subagent.rs index d7207ab8..5f6ae6c4 100644 --- a/src-tauri/src/model/subagent.rs +++ b/src-tauri/src/model/subagent.rs @@ -137,7 +137,7 @@ pub struct ProfileSubagentAccessRecord { // Reserved slugs that cannot be used for custom subagents // --------------------------------------------------------------------------- -pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review"]; +pub const RESERVED_SUBAGENT_SLUGS: &[&str] = &["explore", "review", "judge"]; /// Validate that a slug is well-formed and not reserved. 
pub fn validate_slug(slug: &str) -> Result<(), &'static str> { diff --git a/src-tauri/src/persistence/repo/goal_repo.rs b/src-tauri/src/persistence/repo/goal_repo.rs index 9b758c84..72a3c53a 100644 --- a/src-tauri/src/persistence/repo/goal_repo.rs +++ b/src-tauri/src/persistence/repo/goal_repo.rs @@ -5,8 +5,9 @@ use crate::model::errors::AppError; use crate::model::goal::{GoalRecord, GoalStatus, PauseReason}; const SELECT_COLUMNS: &str = "id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ - last_evaluated_run_id, created_at, updated_at"; + turns_used, max_turns, pause_reason, pause_detail, evidence, \ + last_evaluated_run_id, judge_passed, judge_completeness, judge_findings, judge_summary, \ + judge_evaluated_run_id, created_at, updated_at"; // ── Database row (raw sqlx types) ── @@ -18,13 +19,17 @@ struct GoalRow { status: String, token_budget: Option, tokens_used: i64, - time_used_seconds: i64, turns_used: i64, max_turns: i64, pause_reason: Option, pause_detail: Option, evidence: Option, last_evaluated_run_id: Option, + judge_passed: i64, + judge_completeness: Option, + judge_findings: Option, + judge_summary: Option, + judge_evaluated_run_id: Option, created_at: String, updated_at: String, } @@ -38,13 +43,17 @@ impl GoalRow { status: GoalStatus::from_str(&self.status), token_budget: self.token_budget, tokens_used: self.tokens_used, - time_used_seconds: self.time_used_seconds, turns_used: self.turns_used, max_turns: self.max_turns, pause_reason: self.pause_reason.map(|s| PauseReason::from_str(&s)), pause_detail: self.pause_detail, evidence: self.evidence, last_evaluated_run_id: self.last_evaluated_run_id, + judge_passed: self.judge_passed != 0, + judge_completeness: self.judge_completeness, + judge_findings: self.judge_findings, + judge_summary: self.judge_summary, + judge_evaluated_run_id: self.judge_evaluated_run_id, created_at: 
DateTime::parse_from_rfc3339(&self.created_at) .map(|dt| dt.with_timezone(&Utc)) .unwrap_or_else(|_| Utc::now()), @@ -80,12 +89,16 @@ pub async fn find_by_id(pool: &SqlitePool, id: &str) -> Result Result<(), AppError> { + // Note: the judge_* columns are intentionally omitted here and rely on the + // DDL defaults (judge_passed=0, others NULL) set by the goal_judge_fields + // migration. New goals always start un-verified, and the Judge verdict is + // written later via record_judge_verdict(). let now = Utc::now().to_rfc3339(); sqlx::query( "INSERT INTO goals (id, thread_id, objective, status, token_budget, tokens_used, \ - time_used_seconds, turns_used, max_turns, pause_reason, pause_detail, evidence, \ + turns_used, max_turns, pause_reason, pause_detail, evidence, \ last_evaluated_run_id, created_at, updated_at) \ - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ) .bind(&record.id) .bind(&record.thread_id) @@ -93,7 +106,6 @@ pub async fn insert(pool: &SqlitePool, record: &GoalRecord) -> Result<(), AppErr .bind(record.status.as_str()) .bind(record.token_budget) .bind(record.tokens_used) - .bind(record.time_used_seconds) .bind(record.turns_used) .bind(record.max_turns) .bind(record.pause_reason.as_ref().map(|r| r.as_str())) @@ -136,19 +148,16 @@ pub async fn account_usage( pool: &SqlitePool, id: &str, tokens_delta: i64, - time_delta_seconds: i64, turns_delta: i64, ) -> Result<(), AppError> { sqlx::query( "UPDATE goals SET \ tokens_used = tokens_used + ?, \ - time_used_seconds = time_used_seconds + ?, \ turns_used = turns_used + ?, \ updated_at = ? \ WHERE id = ?", ) .bind(tokens_delta) - .bind(time_delta_seconds) .bind(turns_delta) .bind(Utc::now().to_rfc3339()) .bind(id) @@ -196,3 +205,66 @@ pub async fn delete_by_thread_id(pool: &SqlitePool, thread_id: &str) -> Result 0) } + +/// Persist the most recent Judge verdict for a goal. Always updates the +/// `judge_*` columns. 
When `passed` is true, the same transaction also writes +/// `status='complete'` and `evidence=summary` so that acceptance +/// (`status=complete` AND `judge_passed=1`) can never be observed as a +/// half-applied state. When `passed` is false the goal's `status` is left +/// unchanged (typically still `active`). +#[allow(clippy::too_many_arguments)] +pub async fn record_judge_verdict( + pool: &SqlitePool, + id: &str, + run_id: &str, + passed: bool, + completeness: i64, + findings_json: &str, + summary: &str, +) -> Result { + let now = Utc::now().to_rfc3339(); + let mut tx = pool.begin().await?; + + let updated = sqlx::query( + "UPDATE goals SET \ + judge_passed = ?, \ + judge_completeness = ?, \ + judge_findings = ?, \ + judge_summary = ?, \ + judge_evaluated_run_id = ?, \ + updated_at = ? \ + WHERE id = ?", + ) + .bind(if passed { 1_i64 } else { 0_i64 }) + .bind(completeness) + .bind(findings_json) + .bind(summary) + .bind(run_id) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + + if updated.rows_affected() == 0 { + tx.rollback().await?; + return Ok(false); + } + + if passed { + sqlx::query( + "UPDATE goals SET \ + status = 'complete', \ + evidence = COALESCE(NULLIF(?, ''), evidence), \ + updated_at = ? 
\ + WHERE id = ?", + ) + .bind(summary) + .bind(&now) + .bind(id) + .execute(&mut *tx) + .await?; + } + + tx.commit().await?; + Ok(true) +} diff --git a/src-tauri/src/persistence/repo/run_repo.rs b/src-tauri/src/persistence/repo/run_repo.rs index 29265a43..ad783f65 100644 --- a/src-tauri/src/persistence/repo/run_repo.rs +++ b/src-tauri/src/persistence/repo/run_repo.rs @@ -1152,44 +1152,6 @@ mod tests { "expected running segment to be added, got {elapsed}" ); } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_returns_positive_for_running() { - let pool = setup_test_pool().await; - // Insert a running run with a past started_at so elapsed > 0 - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-active', 't1', 'default', 'running', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap() - .expect("should return elapsed seconds for running run"); - // With started_at in the past, elapsed should be > 0 - assert!(duration > 0, "expected positive elapsed, got {duration}"); - } - - #[tokio::test] - async fn get_active_run_elapsed_seconds_skips_terminal_runs() { - let pool = setup_test_pool().await; - // Insert a completed run (should be skipped) - sqlx::query( - "INSERT INTO thread_runs (id, thread_id, run_mode, status, started_at, input_tokens, output_tokens, total_tokens) - VALUES ('run-done', 't1', 'default', 'completed', '2026-04-22T09:00:00Z', 0, 0, 0)", - ) - .execute(&pool) - .await - .expect("seed run"); - - let duration = super::get_active_run_elapsed_seconds(&pool, "t1") - .await - .unwrap(); - assert!(duration.is_none(), "should skip completed runs"); - } } /// Get the duration in seconds of the last completed run for a thread. 
@@ -1231,46 +1193,6 @@ pub async fn get_run_duration(pool: &SqlitePool, run_id: &str) -> Result Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE id = ? - LIMIT 1", - ) - .bind(run_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - -/// Get the elapsed seconds of any currently active (non-terminal) run for a thread. -/// Returns None if no active run exists. -pub async fn get_active_run_elapsed_seconds( - pool: &SqlitePool, - thread_id: &str, -) -> Result, AppError> { - let duration = sqlx::query_scalar::<_, Option>( - "SELECT CAST(strftime('%s', 'now') - strftime('%s', started_at) AS INTEGER) - FROM thread_runs - WHERE thread_id = ? - AND status NOT IN ('completed','failed','denied','interrupted','cancelled','limit_reached') - ORDER BY started_at DESC - LIMIT 1", - ) - .bind(thread_id) - .fetch_optional(pool) - .await? - .flatten(); - Ok(duration) -} - /// Bulk-fetch the Unix-millisecond start timestamp of the currently active /// (non-terminal) run for each thread in `thread_ids`. Threads without an /// active run are simply absent from the returned map. 
Used by the sidebar diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 48b91647..157d7704 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -3,7 +3,7 @@ mod tests { use sqlx::sqlite::{SqliteConnectOptions, SqlitePool, SqlitePoolOptions}; use std::str::FromStr; use tiycode_lib::core::app_state::GoalRuntimeState; - use tiycode_lib::core::goal_manager::{ChallengePromptVariant, GoalManager}; + use tiycode_lib::core::goal_manager::GoalManager; use tiycode_lib::model::goal::{GoalStatus, GoalVerdict, PauseReason}; use tiycode_lib::persistence::repo::goal_repo; @@ -138,7 +138,6 @@ mod tests { let after_first = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_first.turns_used, goal.turns_used + 1); - assert_eq!(after_first.time_used_seconds, 42); assert_eq!(after_first.last_evaluated_run_id.as_deref(), Some("run-1")); let second = mgr @@ -150,10 +149,6 @@ mod tests { let after_second = mgr.get_active().await.unwrap().unwrap(); assert_eq!(after_second.turns_used, after_first.turns_used); - assert_eq!( - after_second.time_used_seconds, - after_first.time_used_seconds - ); } #[tokio::test] @@ -233,7 +228,7 @@ mod tests { let goal = mgr.create_goal("Test goal", None).await.unwrap(); // Set turns_used to at least max_turns via account_usage - goal_repo::account_usage(&pool, &goal.id, 0, 0, goal.max_turns) + goal_repo::account_usage(&pool, &goal.id, 0, goal.max_turns) .await .unwrap(); @@ -258,7 +253,7 @@ mod tests { let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // Model says "done" but doesn't call goal_scored + // Model says "done" but doesn't call agent_judge let verdict = mgr.evaluate_after_turn( "All done! 
The goal is complete and everything is finished.", &goal, @@ -320,24 +315,6 @@ mod tests { assert_eq!(paused.status, GoalStatus::Paused); } - #[tokio::test] - async fn mark_complete_with_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - mgr.mark_complete(&goal.id, "All tests pass, files created") - .await - .unwrap(); - - let completed = mgr.get_active().await.unwrap().unwrap(); - assert_eq!(completed.status, GoalStatus::Complete); - assert_eq!( - completed.evidence.as_deref(), - Some("All tests pass, files created") - ); - } - #[tokio::test] async fn mark_budget_limited() { let pool = setup_pool().await; @@ -371,42 +348,17 @@ mod tests { let prompt = mgr.render_continuation_prompt(&goal); assert!(prompt.contains("Build feature X")); - assert!(prompt.contains("goal_scored")); + assert!(prompt.contains("agent_judge")); assert!(prompt.contains("clarify")); } #[tokio::test] - async fn challenge_prompt_renders_variants() { + async fn challenge_prompt_guides_to_judge() { let mgr = GoalManager::new(setup_pool().await, "thread-1".into(), test_runtime()); - let no_evidence = mgr.render_challenge_prompt(ChallengePromptVariant::NoEvidence); - assert!(no_evidence.contains("did not provide evidence")); - - let no_tool = mgr.render_challenge_prompt(ChallengePromptVariant::NoTool); - assert!(no_tool.contains("provide concrete evidence")); - assert!(no_tool.contains("goal_scored")); - } - - // ── #1 / #8: Tests for goal_scored validation logic & test gap coverage ── - - #[tokio::test] - async fn mark_complete_rejects_empty_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, "").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); - } - 
- #[tokio::test] - async fn mark_complete_rejects_whitespace_only_evidence() { - let pool = setup_pool().await; - let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - let err = mgr.mark_complete(&goal.id, " ").await.unwrap_err(); - assert!(err.user_message.contains("evidence is required")); + let prompt = mgr.render_challenge_prompt(); + assert!(prompt.contains("agent_judge")); + assert!(prompt.contains("cannot self-declare")); } #[tokio::test] @@ -416,7 +368,7 @@ mod tests { let goal = mgr.create_goal("Test goal", Some(500)).await.unwrap(); // Accumulate tokens to reach the budget - goal_repo::account_usage(&pool, &goal.id, 500, 0, 0) + goal_repo::account_usage(&pool, &goal.id, 500, 0) .await .unwrap(); @@ -456,17 +408,135 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_goal_scored_not_blocking() { + async fn evaluate_after_turn_agent_judge_not_blocking() { let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); - // goal_scored should NOT trigger a pause in evaluation - mgr.record_tool_call("goal_scored"); - let verdict = mgr.evaluate_after_turn("Calling goal_scored", &goal); + // agent_judge should NOT trigger a pause in evaluation + mgr.record_tool_call("agent_judge"); + let verdict = mgr.evaluate_after_turn("Calling agent_judge", &goal); assert!(matches!(verdict, GoalVerdict::Continue)); } + // ── Judge verdict persistence (record_judge_verdict) ── + + #[tokio::test] + async fn record_judge_verdict_pass_marks_complete_and_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + true, + 100, + "[]", + "All requirements 
verified; tests pass.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + assert_eq!( + updated.evidence.as_deref(), + Some("All requirements verified; tests pass.") + ); + assert_eq!( + updated.judge_evaluated_run_id.as_deref(), + Some("run-judge-1") + ); + + // A verified goal stops continuation. + let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + assert!(outcome.continuation_prompt.is_none()); + } + + #[tokio::test] + async fn record_judge_verdict_fail_keeps_active_and_persists_findings() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Test goal", None).await.unwrap(); + + let findings = serde_json::to_string(&vec![ + "Missing unit tests for module X".to_string(), + "Build fails on Windows".to_string(), + ]) + .unwrap(); + let recorded = goal_repo::record_judge_verdict( + &pool, + &goal.id, + "run-judge-1", + false, + 60, + &findings, + "Not yet complete.", + ) + .await + .unwrap(); + assert!(recorded); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Active); + assert!(!updated.judge_passed); + assert!(updated.judge_findings.is_some()); + + // Continuation prompt should surface the latest findings. 
+ let prompt = mgr.render_continuation_prompt(&updated); + assert!(prompt.contains("Missing unit tests for module X")); + assert!(prompt.contains("agent_judge")); + } + + #[tokio::test] + async fn migration_backfills_legacy_complete_goal_as_verified() { + let pool = setup_pool().await; + let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); + let goal = mgr.create_goal("Legacy goal", None).await.unwrap(); + + // Simulate a legacy completed goal (no judge fields set yet). + sqlx::query( + "UPDATE goals SET status = 'complete', evidence = 'legacy evidence' WHERE id = ?", + ) + .bind(&goal.id) + .execute(&pool) + .await + .unwrap(); + // Apply the same backfill the migration performs. + sqlx::query( + "UPDATE goals SET judge_passed = 1, \ + judge_summary = COALESCE(judge_summary, evidence), \ + judge_completeness = COALESCE(judge_completeness, 100) \ + WHERE status = 'complete'", + ) + .execute(&pool) + .await + .unwrap(); + + let updated = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(updated.status, GoalStatus::Complete); + assert!(updated.judge_passed); + assert_eq!(updated.judge_completeness, Some(100)); + + // It must not be re-opened by continuation. 
+ let outcome = mgr + .evaluate_after_run("run-after", None) + .await + .unwrap() + .unwrap(); + assert_eq!(outcome.verdict, "skipped"); + } + #[tokio::test] async fn evaluate_after_turn_chinese_idle_phrase_pauses() { let pool = setup_pool().await; diff --git a/src/i18n/locales/en.ts b/src/i18n/locales/en.ts index 9dd01685..f3fd2940 100644 --- a/src/i18n/locales/en.ts +++ b/src/i18n/locales/en.ts @@ -1094,6 +1094,7 @@ const en: Record = { "goal.status.paused": "Paused", "goal.status.budgetLimited": "Budget Exhausted", "goal.status.complete": "Complete", + "goal.status.verified": "Verified", "goal.time.elapsed": "Running for {{time}}", "goal.time.hoursMinutes": "{{hours}}h {{minutes}}m", "goal.time.minutesSeconds": "{{minutes}}m {{seconds}}s", diff --git a/src/i18n/locales/zh-CN.ts b/src/i18n/locales/zh-CN.ts index 5b3272d2..3f5c5164 100644 --- a/src/i18n/locales/zh-CN.ts +++ b/src/i18n/locales/zh-CN.ts @@ -1133,6 +1133,7 @@ const zhCN = { "goal.status.paused": "已暂停", "goal.status.budgetLimited": "预算耗尽", "goal.status.complete": "已完成", + "goal.status.verified": "已验收通过", "goal.time.elapsed": "已持续运行{{time}}", "goal.time.hoursMinutes": "{{hours}}小时{{minutes}}分", "goal.time.minutesSeconds": "{{minutes}}分{{seconds}}秒", diff --git a/src/modules/workbench-shell/model/thread-store.ts b/src/modules/workbench-shell/model/thread-store.ts index 4c083f85..f87adfc6 100644 --- a/src/modules/workbench-shell/model/thread-store.ts +++ b/src/modules/workbench-shell/model/thread-store.ts @@ -93,7 +93,6 @@ export interface GoalStoreState { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null; @@ -101,6 +100,11 @@ export interface GoalStoreState { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: 
string | null; + judgeEvaluatedRunId?: string | null; } // --------------------------------------------------------------------------- diff --git a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx index 24976d9e..6c049117 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.test.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.test.tsx @@ -1,4 +1,5 @@ import { describe, expect, it } from "vitest"; +import { resolveGoalStatusKey } from "./goal-status-bar"; const source = await import("./goal-status-bar?raw").then((module) => module.default as string); @@ -21,3 +22,23 @@ describe("GoalStatusBar layout contract", () => { expect(source).not.toContain("goal.time.hoursMinutes"); }); }); + +describe("resolveGoalStatusKey", () => { + it("maps non-complete statuses to their own keys", () => { + expect(resolveGoalStatusKey("active", undefined)).toBe("goal.status.active"); + expect(resolveGoalStatusKey("paused", undefined)).toBe("goal.status.paused"); + expect(resolveGoalStatusKey("budget_limited", undefined)).toBe("goal.status.budgetLimited"); + }); + + it("shows the verified label only when a complete goal passed Judge acceptance", () => { + expect(resolveGoalStatusKey("complete", true)).toBe("goal.status.verified"); + }); + + it("falls back to the plain complete label when judge has not passed", () => { + expect(resolveGoalStatusKey("complete", false)).toBe("goal.status.complete"); + }); + + it("treats an undefined judgePassed as not verified", () => { + expect(resolveGoalStatusKey("complete", undefined)).toBe("goal.status.complete"); + }); +}); diff --git a/src/modules/workbench-shell/ui/goal-status-bar.tsx b/src/modules/workbench-shell/ui/goal-status-bar.tsx index 45c42922..86ace721 100644 --- a/src/modules/workbench-shell/ui/goal-status-bar.tsx +++ b/src/modules/workbench-shell/ui/goal-status-bar.tsx @@ -2,13 +2,36 @@ import { useCallback, useState } from "react"; import { goalGetState, 
goalPause, goalResume, goalClear } from "@/services/bridge/agent-commands"; -import { threadStore, useStore, shallowEqual } from "@/modules/workbench-shell/model/thread-store"; +import { threadStore, useStore, shallowEqual, type GoalStoreState } from "@/modules/workbench-shell/model/thread-store"; import { useT } from "@/i18n"; type Props = { threadId: string; }; +/** + * Resolve the i18n key for the goal status label. Extracted as a pure function + * so the `complete` → `verified` (judgePassed) branch can be unit-tested without + * mounting the component. + */ +export function resolveGoalStatusKey( + status: GoalStoreState["status"], + judgePassed: GoalStoreState["judgePassed"], +): + | "goal.status.active" + | "goal.status.paused" + | "goal.status.budgetLimited" + | "goal.status.verified" + | "goal.status.complete" { + switch (status) { + case "active": return "goal.status.active"; + case "paused": return "goal.status.paused"; + case "budget_limited": return "goal.status.budgetLimited"; + case "complete": return judgePassed ? "goal.status.verified" : "goal.status.complete"; + default: return "goal.status.active"; + } +} + export function GoalStatusBar({ threadId }: Props) { const t = useT(); const goal = useStore(threadStore, (s) => s.goalState[threadId] ?? null, shallowEqual); @@ -30,15 +53,7 @@ export function GoalStatusBar({ threadId }: Props) { if (!goal) return null; - const statusKey = (() => { - switch (goal.status) { - case "active": return "goal.status.active"; - case "paused": return "goal.status.paused"; - case "budget_limited": return "goal.status.budgetLimited"; - case "complete": return "goal.status.complete"; - default: return "goal.status.active"; - } - })(); + const statusKey = resolveGoalStatusKey(goal.status, goal.judgePassed); const statusColor = goal.status === "active" ? 
"bg-blue-500" diff --git a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx index 6fbfb3e3..db893329 100644 --- a/src/modules/workbench-shell/ui/runtime-thread-surface.tsx +++ b/src/modules/workbench-shell/ui/runtime-thread-surface.tsx @@ -1689,11 +1689,11 @@ export function RuntimeThreadSurface({ "**" + argText + "**", "", "This goal has been created and is now **active**. Work toward it.", - "When the goal is fully achieved, you MUST call:", + "Completion is decided by independent verification — you cannot self-declare it. When you believe the goal is fully achieved, you MUST request acceptance by calling:", "```json", - "goal_scored(status=\"complete\", evidence=\"test output, file changes, verification steps\", pledge=\"I hereby declare: I confirm that I have fully achieved this goal, and I have confirmed that there are no remaining pending tasks or follow-up items. I confirm that I have repeatedly reviewed the output of this work, and I take responsibility for the quality of this output.\")", + "agent_judge(task=\"explain why you believe the goal is achieved / what to verify\")", "```", - "Do NOT mark complete without verified evidence.", + "An independent Judge evaluates the project against the goal. The goal is only marked verified when the Judge returns passed=true. 
If a verification does not pass, fix the reported findings and call agent_judge again.", "", "If you need user input before proceeding, use the clarify tool.", "The goal will automatically pause and resume when the user responds.", diff --git a/src/services/bridge/agent-commands.test.ts b/src/services/bridge/agent-commands.test.ts index 4ffb735a..eb521ca7 100644 --- a/src/services/bridge/agent-commands.test.ts +++ b/src/services/bridge/agent-commands.test.ts @@ -366,7 +366,6 @@ function makeGoalPayload(overrides: Partial = {}): GoalPayload { objective: "Build a todo app", status: "active", tokensUsed: 0, - timeUsedSeconds: 0, turnsUsed: 0, maxTurns: 50, tokenBudget: null, @@ -608,6 +607,16 @@ describe("goalEvaluate", () => { expect(result).toBeNull(); }); + it("passes through the skipped verdict for already-accepted goals", async () => { + isTauriMock.mockReturnValue(true); + const result = makeGoalEvaluateResult({ verdict: "skipped", continuationPrompt: null }); + invokeMock.mockResolvedValueOnce(result); + + const outcome = await goalEvaluate("thread-1"); + expect(outcome!.verdict).toBe("skipped"); + expect(outcome!.continuationPrompt).toBeNull(); + }); + it("requires Tauri runtime", async () => { isTauriMock.mockReturnValue(false); diff --git a/src/services/bridge/agent-commands.ts b/src/services/bridge/agent-commands.ts index 6e9ed990..9e1c17ac 100644 --- a/src/services/bridge/agent-commands.ts +++ b/src/services/bridge/agent-commands.ts @@ -727,7 +727,6 @@ export type GoalPayload = { objective: string; status: "active" | "paused" | "budget_limited" | "complete"; tokensUsed: number; - timeUsedSeconds: number; turnsUsed: number; maxTurns: number; tokenBudget?: number | null; @@ -735,6 +734,11 @@ export type GoalPayload = { pauseDetail?: string | null; evidence?: string | null; lastEvaluatedRunId?: string | null; + judgePassed?: boolean; + judgeCompleteness?: number | null; + judgeFindings?: string | null; + judgeSummary?: string | null; + judgeEvaluatedRunId?: 
string | null; }; export async function goalGetState(threadId: string): Promise { @@ -768,7 +772,7 @@ export async function goalClear(threadId: string): Promise { export type GoalEvaluateResult = { goal: GoalPayload; - verdict: "continue" | "challenge_evidence" | "complete" | "paused" | "budget_limited"; + verdict: "continue" | "challenge_evidence" | "paused" | "budget_limited" | "skipped"; continuationPrompt?: string | null; };