diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs index 9b5d7e0a..fa98d07e 100644 --- a/src-tauri/src/core/agent_session_execution.rs +++ b/src-tauri/src/core/agent_session_execution.rs @@ -1612,21 +1612,23 @@ impl AgentSession { tool_call_storage_id: &str, tool_input: &serde_json::Value, ) -> AgentToolResult { - // Parse the main agent's task / rationale. - let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { - Ok(request) => request, - Err(error) => { - tool_call_repo::update_result( - &self.pool, - tool_call_storage_id, - &serde_json::json!({ "error": &error }).to_string(), - "failed", - ) - .await - .ok(); - return agent_error_result(error); - } - }; + // Validate the tool input shape. The main agent's optional note is + // not injected into the Judge prompt — the Judge evaluates the + // project state independently against the goal. Parsing is retained + // to reject malformed input (e.g. non-string `task` values) that + // would violate the tool JSON schema. A missing `task` is acceptable + // and falls back to a neutral default. + if let Err(error) = crate::core::subagent::JudgeRequest::from_tool_input(tool_input) { + tool_call_repo::update_result( + &self.pool, + tool_call_storage_id, + &serde_json::json!({ "error": &error }).to_string(), + "failed", + ) + .await + .ok(); + return agent_error_result(error); + } // Backstop: re-query goal state. agent_judge is injected only when an // un-verified goal exists, but a stale tool set or a direct call must be @@ -1679,41 +1681,68 @@ impl AgentSession { return agent_error_result(err_msg); } - // Build the Judge task: inject the goal objective + status + last verdict - // so the Judge does not rely on the main agent's self-report. 
- let mut prior_verdict = String::new(); - if goal.judge_evaluated_run_id.is_some() { + // Build the Judge task: inject the goal objective, the task board + // state, (when applicable) process compliance evidence, and (when + // this is a re-verification) the previous Judge verdict. The Judge + // receives no input from the main agent — it evaluates the project + // state independently against the goal. The previous verdict is + // included only as objective context so the Judge can confirm prior + // findings have actually been addressed, not as a starting point + // for a self-assessment. + + // Query task board state for cross-reference. + let task_board_summary = build_task_board_summary(&self.pool, &goal.thread_id).await; + + // Conditionally include process compliance layer for goals that + // require reviews or phase-by-phase verification. + let process_compliance = if has_process_requirements(&goal.objective) { + build_process_compliance_summary(&self.pool, &goal.thread_id).await + } else { + String::new() + }; + + // On re-verification, surface the prior Judge verdict as objective + // context so the Judge can confirm each prior finding has been + // genuinely resolved. This is read from the goal record (not from + // the main agent) and is empty on the first verification. 
+ let prior_verdict = if goal.judge_evaluated_run_id.is_some() { + let mut section = String::new(); if let Some(summary) = goal.judge_summary.as_deref() { if !summary.trim().is_empty() { - prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}")); + section.push_str(&format!("\nPrevious Judge summary: {summary}")); } } if let Some(findings_json) = goal.judge_findings.as_deref() { if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) { if !findings.is_empty() { - prior_verdict.push_str("\nPrevious Judge findings:"); + section.push_str("\nPrevious Judge findings:"); for finding in findings { - prior_verdict.push_str(&format!("\n- {finding}")); + section.push_str(&format!("\n- {finding}")); } } } } - } + section + } else { + String::new() + }; let judge_task = format!( "You are verifying acceptance of the following goal for the current project.\n\n\ -Goal id: {goal_id}\n\ -Goal status: {status:?}\n\ -Goal objective:\n{objective}\n\ -{prior_verdict}\n\n\ -The main agent's note for this verification request:\n{task}\n\n\ +Goal objective:\n{objective}\n\n\ +{task_board_summary}\ +{process_compliance}\ +{prior_verdict}\n\ Independently inspect the project's current state and decide whether it satisfies the goal. \ +You must verify ALL requirements in the goal, not just those that seem to have been worked on. \ +Cross-reference the task board state above with your file-system findings. \ +If this is a re-verification, confirm that every prior finding has been genuinely \ +resolved (do NOT accept claims of fix without verifying the actual change). \ Return your structured JudgeReport verdict.", - goal_id = goal.id, - status = goal.status, objective = goal.objective, + task_board_summary = task_board_summary, + process_compliance = process_compliance, prior_verdict = prior_verdict, - task = request.task, ); // Build a Judge delegate (depth 2, primary model) and run it. 
@@ -1837,6 +1866,156 @@ Return your structured JudgeReport verdict.", } } +/// Build a human-readable summary of the task board state for the Judge. +/// Returns a string describing each step and its stage, or a note that no +/// task board exists. +async fn build_task_board_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String { + use crate::persistence::repo::{task_board_repo, task_item_repo}; + + let boards = match task_board_repo::list_by_thread(pool, thread_id).await { + Ok(boards) => boards, + Err(_) => return "(No task board data available.)\n".to_string(), + }; + + if boards.is_empty() { + return "(No task board exists for this goal. Verify entirely from file system and goal text.)\n" + .to_string(); + } + + let mut summary = String::from("## Associated task board state\n\n"); + for board in &boards { + // Skip abandoned boards — they are not relevant to the current goal. + if board.status.as_str() == "abandoned" { + continue; + } + summary.push_str(&format!( + "**{}** (status: {}):\n", + board.title, + board.status.as_str() + )); + + let items = match task_item_repo::list_by_task_board(pool, &board.id).await { + Ok(items) => items, + Err(_) => { + summary.push_str(" (Could not load task items.)\n"); + continue; + } + }; + + if items.is_empty() { + summary.push_str(" (No task items.)\n"); + continue; + } + + for item in &items { + summary.push_str(&format!( + " - [{}] {}\n", + item.stage.as_str(), + item.description + )); + } + } + + summary.push_str( + "\n**Important**: Any step above that is not `completed` and maps to a goal \ + requirement is evidence of incomplete work. Report these as findings.\n", + ); + summary +} + +/// Check whether the goal objective contains process requirements (e.g., +/// "review each phase", "每阶段验收"). When true, the Judge prompt will +/// include a process compliance layer showing the thread's review call history. 
+fn has_process_requirements(objective: &str) -> bool { + let lower = objective.to_lowercase(); + let keywords = [ + "review", + "验收", + "检查", + "verify each", + "verify every", + "per phase", + "每个阶段", + "每一阶段", + "每轮", + "阶段完成", + ]; + keywords.iter().any(|kw| lower.contains(&kw.to_lowercase())) +} + +/// Build a process compliance summary from the thread's run_helper history. +/// Lists all review-related helper calls chronologically with their input +/// summaries and status. Only meaningful when the goal objective contains +/// process requirements (e.g., "each phase must have a review"). +async fn build_process_compliance_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String { + use crate::persistence::repo::run_helper_repo; + + let helpers = match run_helper_repo::list_by_thread_id(pool, thread_id).await { + Ok(h) => h, + Err(_) => return String::new(), + }; + + // Filter for review-related calls: agent_review, helper_review + let reviews: Vec<_> = helpers + .iter() + .filter(|h| h.helper_kind.contains("review")) + .collect(); + + if reviews.is_empty() { + return format!( + "## Process compliance\n\n\ + No review calls found in thread history. \ + If the goal requires reviews, this is evidence of non-compliance.\n\n" + ); + } + + let mut summary = String::from("## Process compliance\n\n"); + summary.push_str("The following review calls were recorded during this goal:\n\n"); + + for (i, review) in reviews.iter().enumerate() { + let status_label = match review.status.as_str() { + "completed" => "✓ completed", + "failed" => "✗ failed", + "interrupted" => "⚠ interrupted", + _ => &review.status, + }; + + let input_preview = review + .input_summary + .as_deref() + .map(|s| { + // Truncate to first 200 chars for readability (character-safe, + // avoids panicking on multi-byte UTF-8 sequences). 
+ if s.chars().count() > 200 { + format!("{}...", s.chars().take(200).collect::<String>()) + } else { + s.to_string() + } + }) + .unwrap_or_else(|| "(no task description)".to_string()); + + summary.push_str(&format!( + "{}. `{}` called at {} (status: {})\n Scope: {}\n", + i + 1, + review.helper_kind, + // Truncate to first 19 chars (RFC3339 timestamp prefix) using + // char-aware slicing to avoid panicking on multi-byte UTF-8 + // boundaries — mirrors the 200-char limit used on + // `input_preview` above. + review.started_at.chars().take(19).collect::<String>(), + status_label, + input_preview, + )); + } + + summary.push_str( + "\n**Guidance**: If the goal requires reviews at specific milestones \ + (e.g., \"after each phase\"), verify that the review calls above \ + cover all required milestones. Missing or failed reviews are findings.\n\n", + ); + summary +} + #[cfg(test)] mod tests { use super::{ @@ -2141,3 +2320,266 @@ mod tests { } } } + +#[cfg(test)] +mod has_process_requirements_tests { + use super::has_process_requirements; + + #[test] + fn detects_english_keywords() { + for objective in [ + "Each phase needs a code review before merge.", + "Verify every change against the spec.", + "Run a per phase smoke test.", + ] { + assert!( + has_process_requirements(objective), + "expected keyword match in: {objective}" + ); + } + } + + #[test] + fn detects_cjk_keywords() { + for objective in [ + "每个阶段都需要验收", + "每一阶段检查通过", + "需要你每轮 review", + "完成所有阶段完成的任务", + ] { + assert!( + has_process_requirements(objective), + "expected CJK keyword match in: {objective}" + ); + } + } + + #[test] + fn records_substring_match_semantics() { + // The implementation is a plain case-insensitive substring match + // over a fixed keyword list. These cases pin down the current + // behaviour, including the substring-match quirk where "review" + // hits inside "preview". They are not assertions about an ideal + // matcher — they are regression guards against accidental keyword + // list changes. 
If the keyword list is later tightened, update + // this test alongside it. + assert!( + has_process_requirements("Preview the rendered HTML before shipping."), + "current implementation matches 'review' inside 'preview' (substring match)" + ); + assert!( + !has_process_requirements("Forward-looking design without explicit verify step."), + "no keyword substring present" + ); + // "审阅" (look over) is intentionally NOT a keyword — only + // "验收" (formal acceptance) and "检查" (check) are, so this + // should be rejected. + assert!( + !has_process_requirements("请仔细审阅代码风格。"), + "审阅 does not contain any current keyword" + ); + assert!( + !has_process_requirements("Survey users about preferences."), + "no keyword substring present" + ); + } + + #[test] + fn empty_and_whitespace_objectives_return_false() { + assert!(!has_process_requirements("")); + assert!(!has_process_requirements(" \n\t ")); + } + + #[test] + fn keyword_match_is_case_insensitive() { + assert!(has_process_requirements("Final REVIEW before release.")); + assert!(has_process_requirements("Need a Verify Each phase step.")); + } +} + +#[cfg(test)] +mod judge_summary_tests { + //! Integration tests for the two Judge-prompt context builders: + //! + //! * [`super::build_task_board_summary`] — surfaces the active task board + //! (and its items) for the Judge to cross-check against the goal. + //! * [`super::build_process_compliance_summary`] — surfaces review + //! helper history so the Judge can verify the agent followed the + //! process-requirements contract. + //! + //! These functions are private and would otherwise only be exercised + //! through the Judge tool pipeline. The tests use raw SQL seeding on + //! a SQLite in-memory pool with migrations applied, matching the + //! pattern used by [`crate::goal_lifecycle`] tests. 
+ + use sqlx::sqlite::{SqliteConnectOptions, SqlitePool, SqlitePoolOptions}; + use std::str::FromStr; + + use super::{build_process_compliance_summary, build_task_board_summary}; + + async fn setup_pool() -> SqlitePool { + let options = SqliteConnectOptions::from_str("sqlite::memory:") + .unwrap() + .foreign_keys(true); + let pool = SqlitePoolOptions::new() + .max_connections(1) + .connect_with(options) + .await + .unwrap(); + crate::persistence::sqlite::run_migrations(&pool) + .await + .unwrap(); + + let now = chrono::Utc::now().to_rfc3339(); + sqlx::query( + "INSERT INTO workspaces (id, name, path, canonical_path, display_path, + is_default, is_git, auto_work_tree, status, created_at, updated_at) + VALUES ('ws-test', 'Test Workspace', '/tmp/test', '/tmp/test', '/tmp/test', + 0, 0, 0, 'ready', ?, ?)", + ) + .bind(&now) + .bind(&now) + .execute(&pool) + .await + .expect("failed to seed workspace"); + + sqlx::query( + "INSERT INTO threads (id, workspace_id, title, status, last_active_at, created_at, updated_at) + VALUES ('thread-1', 'ws-test', 'Test Thread', 'idle', ?, ?, ?)", + ) + .bind(&now) + .bind(&now) + .bind(&now) + .execute(&pool) + .await + .expect("failed to seed thread"); + + // run_helpers has a FK to thread_runs; seed one run to satisfy it. 
+ sqlx::query( + "INSERT INTO thread_runs (id, thread_id, run_mode, status, started_at, finished_at) + VALUES ('run-1', 'thread-1', 'default', 'completed', ?, ?)", + ) + .bind(&now) + .bind(&now) + .execute(&pool) + .await + .expect("failed to seed run"); + + pool + } + + #[tokio::test] + async fn task_board_summary_reports_when_no_board_exists() { + let pool = setup_pool().await; + + let summary = build_task_board_summary(&pool, "thread-1").await; + + assert!( + summary.contains("No task board exists"), + "expected absent-board message, got: {summary}" + ); + assert!( + summary.contains("file system and goal text"), + "absent-board summary should nudge Judge to file-system + goal verification, got: {summary}" + ); + } + + #[tokio::test] + async fn task_board_summary_renders_active_items_and_skips_abandoned() { + let pool = setup_pool().await; + + // Two active boards (one with items, one empty) + one abandoned. + sqlx::query( + "INSERT INTO task_boards (id, thread_id, title, status, created_at, updated_at) + VALUES ('board-1', 'thread-1', 'Implement feature', 'active', '2026-01-01T00:00:00.000Z', '2026-01-01T00:00:00.000Z'), + ('board-2', 'thread-1', 'Write docs', 'active', '2026-01-02T00:00:00.000Z', '2026-01-02T00:00:00.000Z'), + ('board-3', 'thread-1', 'Old attempt', 'abandoned', '2026-01-03T00:00:00.000Z', '2026-01-03T00:00:00.000Z')", + ) + .execute(&pool) + .await + .unwrap(); + + sqlx::query( + "INSERT INTO task_items (id, task_board_id, description, stage, sort_order, created_at, updated_at) + VALUES ('item-1', 'board-1', 'add API endpoint', 'completed', 0, '2026-01-01T00:00:00.000Z', '2026-01-01T00:00:00.000Z'), + ('item-2', 'board-1', 'add CLI wiring', 'in_progress', 1, '2026-01-01T00:00:00.000Z', '2026-01-01T00:00:00.000Z')", + ) + .execute(&pool) + .await + .unwrap(); + + let summary = build_task_board_summary(&pool, "thread-1").await; + + // Active boards appear with status. 
+ assert!(summary.contains("**Implement feature** (status: active)")); + assert!(summary.contains("**Write docs** (status: active)")); + // Abandoned boards are filtered out before rendering. + assert!( + !summary.contains("Old attempt"), + "abandoned board should be skipped, got: {summary}" + ); + // Items render in the requested `stage / description` shape. + assert!(summary.contains("- [completed] add API endpoint")); + assert!(summary.contains("- [in_progress] add CLI wiring")); + // Empty board gets a placeholder. + assert!(summary.contains("(No task items.)")); + // Footer reminds the Judge to report non-completed steps as findings. + assert!(summary.contains("Report these as findings")); + } + + #[tokio::test] + async fn process_compliance_summary_reports_when_no_reviews_exist() { + let pool = setup_pool().await; + + let summary = build_process_compliance_summary(&pool, "thread-1").await; + + assert!( + summary.contains("No review calls found"), + "expected absent-review message, got: {summary}" + ); + assert!(summary.contains("non-compliance")); + } + + #[tokio::test] + async fn process_compliance_summary_filters_to_review_helpers_only() { + let pool = setup_pool().await; + + // Three helpers: one review, one explore, one review with a long + // input_summary to exercise the 200-char truncation path. 
+ let long_input = "x".repeat(450); + sqlx::query( + "INSERT INTO run_helpers + (id, run_id, thread_id, helper_kind, status, input_summary, started_at, finished_at) + VALUES + ('rh-1', 'run-1', 'thread-1', 'agent_review', 'completed', 'review the diff', '2026-01-01T00:00:00.000Z', '2026-01-01T00:01:00.000Z'), + ('rh-2', 'run-1', 'thread-1', 'agent_explore', 'completed', 'scan code', '2026-01-01T00:02:00.000Z', '2026-01-01T00:03:00.000Z'), + ('rh-3', 'run-1', 'thread-1', 'helper_review', 'failed', ?1, '2026-01-01T00:04:00.000Z', '2026-01-01T00:05:00.000Z')", + ) + .bind(&long_input) + .execute(&pool) + .await + .unwrap(); + + let summary = build_process_compliance_summary(&pool, "thread-1").await; + + // Both review helpers are listed. + assert!(summary.contains("`agent_review`")); + assert!(summary.contains("`helper_review`")); + // Non-review helpers are filtered out. + assert!( + !summary.contains("`agent_explore`"), + "non-review helper should be filtered, got: {summary}" + ); + // Status symbols are mapped to a human label. + assert!(summary.contains("✓ completed")); + assert!(summary.contains("✗ failed")); + // Long input is truncated to 200 chars + ellipsis. We assert the + // exact trailing shape rather than digging for a specific line: + // the input was 450 'x' characters, so the rendered Scope line + // must end with 200 'x' chars followed by "...". + assert!( + summary.contains(&format!("{}...", "x".repeat(200))), + "long input should be truncated to 200 chars + '...'" + ); + } +} diff --git a/src-tauri/src/core/context_compression.rs b/src-tauri/src/core/context_compression.rs index a0f7efcc..47c927eb 100644 --- a/src-tauri/src/core/context_compression.rs +++ b/src-tauri/src/core/context_compression.rs @@ -21,9 +21,16 @@ use tiycore::agent::AgentMessage; use tiycore::types::{ContentBlock, TextContent, UserMessage}; -/// Reserve this many tokens for the model's response + overhead. -/// Matches pi-mono `DEFAULT_COMPACTION_SETTINGS.reserveTokens`. 
-const RESERVE_TOKENS: u32 = 16_384; +/// Fraction of the model's context window that is reserved for the model's +/// response + provider/tool overhead, expressed in basis points (1/100th of +/// a percent). 2000 bps == 20%. +const RESERVE_BASIS_POINTS: u32 = 2_000; + +/// Minimum number of tokens to keep reserved even when 20% of the context +/// window is smaller than this floor. The previous hard-coded reserve of +/// `16_384` tokens is preserved as a safe lower bound for typical large +/// context windows while still allowing tiny windows to behave sanely. +const RESERVE_TOKENS_MIN: u32 = 16_384; /// Keep at least this many tokens of recent conversation untouched. /// With LLM-generated summaries providing rich context, we can keep a @@ -233,7 +240,7 @@ impl CompressionSettings { pub fn new(context_window: u32) -> Self { Self { context_window, - reserve_tokens: RESERVE_TOKENS, + reserve_tokens: reserve_tokens_for(context_window), keep_recent_tokens: KEEP_RECENT_TOKENS, } } @@ -244,6 +251,19 @@ impl CompressionSettings { } } +/// Reserve `RESERVE_BASIS_POINTS` (20%) of the model's context window for +/// the model's response + overhead, with `RESERVE_TOKENS_MIN` as a floor +/// so that very small windows still keep a sane amount of headroom and +/// huge windows don't get a pathologically tiny reserve. 
+fn reserve_tokens_for(context_window: u32) -> u32 { + let percent_budget = ((context_window as u64) + .saturating_mul(RESERVE_BASIS_POINTS as u64) + .saturating_add(9_999)) + / 10_000; + let percent_budget = percent_budget.min(u32::MAX as u64) as u32; + percent_budget.max(RESERVE_TOKENS_MIN) +} + // --------------------------------------------------------------------------- // Public API: should_compress, find_cut_point, build_compressed_messages // --------------------------------------------------------------------------- @@ -911,6 +931,49 @@ mod tests { assert_eq!(calibration.apply_to_estimate(0), 0); } + #[test] + fn compression_settings_reserves_twenty_percent_of_context_window() { + // For typical large context windows the 20% reserve is well above + // the 16,384 token floor, so the budget is exactly 80% of the + // window. This is the primary behaviour change: instead of + // reserving a fixed 16,384 tokens regardless of model, we reserve + // 20% of the model's actual context window. + let cases = [ + (128_000_u32, 25_600_u32, 102_400_u32), // GPT-4o class + (200_000_u32, 40_000_u32, 160_000_u32), // Claude-class + (1_000_000_u32, 200_000_u32, 800_000_u32), // 1M-window class + ]; + for (context_window, expected_reserve, expected_budget) in cases { + let settings = CompressionSettings::new(context_window); + assert_eq!( + settings.reserve_tokens, expected_reserve, + "20% reserve for {context_window}-token window", + ); + assert_eq!( + settings.budget(), + expected_budget, + "80% budget for {context_window}-token window", + ); + } + } + + #[test] + fn compression_settings_reserve_clamps_to_minimum_for_small_windows() { + // When 20% of the window would be smaller than the safety floor + // (16,384 tokens), the floor takes over so tiny windows still + // keep enough headroom for the model response. + let settings = CompressionSettings::new(32_000); + // 20% of 32,000 = 6,400 < 16,384 → floor wins. 
+ assert_eq!(settings.reserve_tokens, 16_384); + assert_eq!(settings.budget(), 32_000 - 16_384); + + // Window at or below the floor: reserve equals the floor and + // saturating_sub protects `budget` from underflow. + let tiny = CompressionSettings::new(8_000); + assert_eq!(tiny.reserve_tokens, 16_384); + assert_eq!(tiny.budget(), 0); + } + #[test] fn should_compress_via_context_size_triggers_when_last_usage_exceeds_budget() { // The unified `context_size` (= input + output + cache_read + diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs index 3e7a346e..77194db2 100644 --- a/src-tauri/src/core/goal_manager.rs +++ b/src-tauri/src/core/goal_manager.rs @@ -255,34 +255,6 @@ impl GoalManager { goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await } - // ── Auto-resume ── - - /// Check if a paused goal should auto-resume when the user sends a new message. - /// Returns Some(()) if the goal was auto-resumed, None if it shouldn't. - pub async fn try_auto_resume(&self) -> Result { - let goal = match self.get_active().await? { - Some(g) => g, - None => return Ok(false), - }; - - if goal.status != GoalStatus::Paused { - return Ok(false); - } - - let should_resume = goal - .pause_reason - .as_ref() - .map(|r| r.auto_resume_on_user_message()) - .unwrap_or(false); - - if should_resume { - goal_repo::update_status(&self.pool, &goal.id, GoalStatus::Active, None, None, None) - .await?; - } - - Ok(should_resume) - } - // ── Evaluation ── /// Evaluate whether the goal should continue, pause, or complete after a turn. @@ -307,27 +279,12 @@ impl GoalManager { return verdict; } - // Completion claim without tool call + // Completion claim without tool call — keep nudging agent toward + // Judge verification; no DB status change. The counter is still + // cleared on tool activity / resume / clear, but reaching 3 no + // longer auto-pauses (status transitions are reserved for user + // commands and Judge verdicts). 
if self.detect_completion_claim(response) { - let should_pause = { - let mut guard = self.lock_runtime(); - let count = guard - .completion_claim_count - .entry(self.thread_id.clone()) - .or_default(); - *count += 1; - *count >= 3 - }; - if should_pause { - // Reset counter before pausing - self.lock_runtime() - .completion_claim_count - .remove(&self.thread_id); - return GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()), - }; - } return GoalVerdict::ChallengeEvidence; } @@ -342,13 +299,12 @@ impl GoalManager { // Reset idle counters since tools were called self.reset_idle_counters(); - // ── Layer 4: Budget checks ── - if let Some(budget) = goal.token_budget { - if goal.tokens_used >= budget { - return GoalVerdict::BudgetLimited; - } - } - + // ── Layer 4: Turn budget + token budget checks ── + // `turns_used >= max_turns` auto-pauses (explicitly approved path). + // `tokens_used >= token_budget` is reported via the `budget_limited` + // verdict string (no DB status change) so the run loop can stop + // continuation. Status transitions are reserved for explicit user + // commands and Judge verdicts. 
if goal.turns_used >= goal.max_turns { return GoalVerdict::Paused { reason: PauseReason::BudgetExhausted, @@ -359,6 +315,12 @@ impl GoalManager { }; } + if let Some(budget) = goal.token_budget { + if goal.tokens_used >= budget { + return GoalVerdict::BudgetLimited; + } + } + // ── Default: continue ── GoalVerdict::Continue } @@ -370,35 +332,23 @@ impl GoalManager { tool_calls: &[String], _response: &str, ) -> Option<GoalVerdict> { - for tool_name in tool_calls { - match tool_name.as_str() { - "clarify" => { - return Some(GoalVerdict::Paused { - reason: PauseReason::ClarifyPending, - detail: Some("agent requested clarification".into()), - }); - } - "update_plan" => { - return Some(GoalVerdict::Paused { - reason: PauseReason::PlanPending, - detail: Some("agent published a plan, awaiting approval".into()), - }); - } - // agent_judge is the main-agent-only acceptance request. It is - // handled by the tool execution pipeline (execute_judge_tool), - // which runs the Judge and records the verdict. Evaluation must - // not treat it as a blocking tool — like any tool call it shows - // the agent acted and should reset idle tendencies. - _ => {} - } - } + // Tool-based auto-pausing has been removed: status transitions are + // reserved for explicit user commands and Judge verdicts. `clarify` + // and `update_plan` no longer flip the goal to paused; they fall + // through to the continuation path. `agent_judge` is the main-agent + // acceptance request and is handled by the tool execution pipeline + // (execute_judge_tool), which runs the Judge and records the + // verdict — it is never a blocking tool here. + let _ = tool_calls; None } fn detect_idle_block(&self, response: &str) -> Option<GoalVerdict> { let idle_count = self.increment_idle_count(); - let trimmed = response.trim().to_lowercase(); - + // Heuristic question detection has been removed; status transitions + // are reserved for explicit user commands and Judge verdicts, so idle + // detection is purely a turn-count trigger. 
`response` is unused. + let _ = response; if idle_count >= MAX_IDLE_TURNS { return Some(GoalVerdict::Paused { reason: PauseReason::IdleBlocked, @@ -407,31 +357,6 @@ impl GoalManager { )), }); } - - // Lightweight heuristic: short question-like response + no tools - if idle_count >= 2 { - let blockers = [ - "should i", - "do you want", - "would you like", - "请确认", - "需要你决定", - "which approach", - "which option", - "can you confirm", - "let me know if", - "before i proceed", - "你的选择是", - "你确认吗", - "需要你同意", - ]; - if trimmed.len() < 500 && blockers.iter().any(|b| trimmed.contains(b)) { - return Some(GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - detail: Some("agent appears blocked, may need user input".into()), - }); - } - } None } @@ -613,7 +538,15 @@ impl GoalManager { .await?; } GoalVerdict::BudgetLimited => { - self.mark_budget_limited(&current.id).await?; + // Advisory: token budget exhausted — do NOT write to DB. + // The verdict string still propagates as "budget_limited" so + // the run loop can stop continuation. Goal status remains + // `active` and is only changed by explicit user commands + // (`/goal budget-limit`) or Judge verdicts. + tracing::info!( + goal_id = %current.id, + "token budget exhausted: emitting budget_limited verdict without DB status change" + ); } } diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md index c36eb3dd..d96a87f8 100644 --- a/src-tauri/src/core/prompt/templates/active_goal.tpl.md +++ b/src-tauri/src/core/prompt/templates/active_goal.tpl.md @@ -11,10 +11,10 @@ Turns used: {{turns_used}}/{{max_turns}} **Completion is decided by independent verification — you cannot self-declare it.** 1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups. 2. Verify your work by running the relevant tests, linters, or build commands as you go. -3. 
When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`. +3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge()`. Rules: -- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness. +- Call `agent_judge()` to request independent goal acceptance verification. An independent Judge will evaluate the project against the goal's completeness. You do not need to provide a self-assessment — the Judge evaluates the project state directly. - The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself. - If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again. - Once the goal has passed Judge acceptance, stop making further changes and summarize the result. diff --git a/src-tauri/src/core/prompt/templates/subagent/judge.md b/src-tauri/src/core/prompt/templates/subagent/judge.md index 58f8b873..32839804 100644 --- a/src-tauri/src/core/prompt/templates/subagent/judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/judge.md @@ -3,17 +3,20 @@ section_id: SubagentJudge version: 1 declared_keys: [] --- -You are the **Goal Acceptance Judge** — an independent verifier. The main agent has been working toward a goal and now believes it is achieved (or has fixed earlier findings and wants re-verification). Your job is to independently decide whether the project's **current state** truly satisfies the goal, focusing on **consistency** with what the goal asked for and **completeness** of the work. +You are the **Goal Acceptance Judge** — an independent auditor. Your task is to determine whether the project's **current state** satisfies a goal objective. 
You work **independently** — you receive no input from the main agent about what it did, changed, or believes is complete. Your assessment must be based solely on objective evidence: the goal objective, the project file system, the task board associated with this goal, and verification commands you run yourself. -You are an evaluator, not an implementer. You did not do the work, and you must not take the main agent's claims at face value — verify against the actual project state. Goal tasks are typically long-horizon with broad change surfaces, so your evaluation must scale: be thorough enough to catch real gaps, efficient enough to converge in one pass, and honest about what you actually verified. +You are an evaluator, not an implementer. Every evaluation is a **fresh, independent, full-scope assessment**. Do not inherit or defer to any prior judge's conclusions — each call starts from scratch. -## Operating principle: size first, then verify +## Core principle: size first, then verify the ENTIRE goal -Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. +Do not start verifying detail by detail before you understand the shape of the change. The right verification budget — and whether to fan out work to subagents — depends on how much actually changed and how it is distributed. And once sized, you must verify **ALL** requirements in the goal against the current project state. A goal requirement you didn't check is a gap in your verification, not a gap that doesn't exist. ### Step 1 — Size the change (always do this first) - Run `git_status` and `git_diff --stat` (or the project's equivalent) to enumerate changed files, additions/deletions, and the rough surface area. 
- Cross-reference with the goal objective: identify which subsystems / layers / acceptance criteria each cluster of changes maps to. +- Parse the goal objective into distinct, verifiable requirements. Every requirement must be checked — implicit ones count too (e.g., if the goal says "implement X with tests", both the implementation and the tests are required). +- Read any design documents or acceptance criteria referenced by the goal (e.g., `@docs/architecture.md`). Extract every acceptance item from them. +- Check the task board associated with this goal (provided in your task prompt). Task board steps that are not `completed` are **direct evidence of incomplete work** and must be reported. A pending step that maps to a goal requirement means that requirement is not satisfied. - Form an explicit mental model before any deep reading: - **Small** — ≤ ~5 files changed, single module/layer, narrow concern. One linear pass is enough. - **Medium** — ~6–20 files, 2–3 subsystems or layers touched, multiple acceptance criteria. @@ -31,19 +34,23 @@ Do not start verifying detail by detail before you understand the shape of the c - **By goal subtask** — one helper per acceptance criterion when the goal is itemized. Keep each subtask independent (no shared write state), bounded in scope, and concretely scoped to file lists or topics inferred from the diff. After the parallel batch returns, **synthesize the results yourself** — reconcile conflicts, call out failures or skipped items, and form one coherent verdict. Do not just concatenate helper outputs. -### Step 3 — Run the verification commands the project actually uses -- Adapt commands to this repository (infer from manifests, scripts, CI config, and workspace instructions). Do not assume a stack. -- Prefer the *narrowest* command that still covers the changed surface (e.g. test only the affected package) before falling back to repo-wide runs. For Large changes a repo-wide build/typecheck is usually still warranted. 
+### Step 3 — Verify against the actual project state +- Read the relevant source files yourself. Do not assume code exists just because a task board step claims to have created it. +- **Call-chain verification**: for every type, function, or module you find defined, verify it is **actually wired into the runtime path** — called, consumed, or registered. A struct defined but never instantiated, a semaphore created but never acquired, or a policy trait implemented but never invoked in the request handler is **not** evidence of completion. Report these as findings. +- Run the verification commands the project uses (infer from manifests, CI config, workspace instructions): type-checks, tests, linters, formatters. Adapt to the actual project stack. +- When a protocol, endpoint, or feature is declared in code, verify its **routing** — is it reachable by an actual HTTP handler or equivalent entry point? A codec registered via `inventory::submit!` but never consumed by `inventory::iter` is half-finished work. - When `agent_review` is delegated, treat its verification output as authoritative — do not rerun the same commands unless its results were inconclusive. +- If a previous Judge verdict was provided, confirm that every prior finding has been **genuinely** resolved (re-read the file, re-run the command). Do not accept claims of fix without verifying the actual change. + +### Step 4 — Cross-reference with the task board +- Compare the task board state against your file-system findings. If the board says a step is `completed` but the files don't back it up, that is a finding. If a step is `pending` that directly maps to a goal requirement, that requirement is not met — report it. +- If no task board exists for this goal, note it but do not fail on that basis alone — verify entirely from the file system and goal text. ## Delegation guidelines -- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does the codebase still reference Z?". 
Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. -- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='diff'` when the helper should look at the workspace changes; provide an explicit changed-file list when you already have one. -- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Prefer this over sequential helper calls whenever the topics are genuinely independent. Never recurse parallel into parallel. -- Do **not** delegate when: - - The change is small enough to inspect inline. - - The subtasks are interdependent (later ones need earlier results). - - You only need one shell command — just run it. +- `agent_explore` — single focused investigation: "where is X used?", "how is Y wired?", "does Z actually get called in the request path?". Use when one targeted read-only sweep beats inlining a dozen `read`/`search` calls. +- `agent_review` — bounded review of a slice of the implementation, including running its tests/type-check/lint. Pass `target='code'` or `target='diff'` as appropriate. +- `agent_parallel` — 2–5 independent read-only/review subtasks dispatched together. Use when the goal's requirements can be split into independent topics (by layer, subsystem, or acceptance criterion). Prefer this over sequential helper calls whenever topics are genuinely independent. +- Do **not** delegate when the goal is small enough to inspect inline, the subtasks are interdependent, or you only need one shell command. - Always tell each delegate explicitly: the goal text, which slice they own, what evidence to return, and that they are read-only. ## Hard constraints (read-only acceptance) @@ -53,12 +60,13 @@ Do not start verifying detail by detail before you understand the shape of the c - Helpers you delegate to inherit the same read-only constraint; remind them in the task text when relevant. 
## Coverage honesty -- Track what you actually verified vs. what you sampled vs. what you skipped. A Large change you only spot-checked is **not** the same as a Large change you fully covered. +- Track what you actually verified vs. what you sampled vs. what you skipped. A goal you only spot-checked is **not** the same as one you fully covered. A Large change you only spot-checked is **not** the same as a Large change you fully covered. - When delegating, if any helper failed, returned inconclusive results, or could not run a command, treat that area as **not verified** — record it explicitly and let it influence the verdict. - Never imply a check passed without trustworthy evidence. If your `summary` cannot point to specific files, commands, or behaviors you confirmed, you do not have a basis to pass. ## Verdict rules -- Pass (`passed=true`) only when the project genuinely satisfies the goal with no material gaps **and** your verification covered the full change surface (directly or via successful delegates). When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. -- If anything required by the goal is missing, inconsistent, untested, or broken, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. +- Pass (`passed=true`) only when **every** requirement in the goal is genuinely satisfied with no material gaps, **and** your verification covered the full requirement surface (directly or via successful delegates). When you pass, `summary` must clearly state the verified evidence — files inspected, commands run with their results, and which goal criteria each piece of evidence maps to. It becomes the goal's completion evidence. 
+- If anything required by the goal is missing, inconsistent, untested, broken, or **defined but not wired**, set `passed=false` and list each concrete gap in `findings` (file path + what is wrong + why it violates the goal). One concrete finding is more valuable than three vague ones. - Be honest and conservative: when in doubt, do not pass. A false "passed" is worse than an extra verification round. - Calibrate `completenessPct` to actual coverage and remaining gaps, not to effort spent. A change that does 80% of the goal correctly is 80, not 100, even if the implemented parts are flawless. +- You must never use "pre-existing" or "accepted by prior judge" as a reason to pass a finding. Each finding stands or falls on its own merit against the goal requirements. diff --git a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md index a695dd71..e144a2da 100644 --- a/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md +++ b/src-tauri/src/core/prompt/templates/subagent/output_contract.judge.md @@ -15,7 +15,7 @@ Return exactly one JSON object with this contract and nothing else (no markdown } Field rules: -- `passed` (boolean): true only when the project genuinely satisfies the goal. +- `passed` (boolean): true only when the project genuinely satisfies **every** goal requirement. - `completenessPct` (integer 0-100): your honest estimate of how complete the work is against the goal. -- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken point. REQUIRED and non-empty when `passed=false`. +- `findings` (array of strings): each concrete unmet / inconsistent / untested / broken / not-wired point. REQUIRED and non-empty when `passed=false`. Each finding must reference a concrete file path and/or a specific goal requirement it violates. 
Do not write vague descriptions — state exactly which file is affected, what is missing or wrong, and which goal requirement is violated. - `summary` (string): rationale for the verdict. REQUIRED and non-empty when `passed=true` — it becomes the goal's completion evidence. If you cannot provide real evidence, set `passed=false`. diff --git a/src-tauri/src/core/subagent/judge_contract.rs b/src-tauri/src/core/subagent/judge_contract.rs index 2a673d93..54318b0d 100644 --- a/src-tauri/src/core/subagent/judge_contract.rs +++ b/src-tauri/src/core/subagent/judge_contract.rs @@ -3,8 +3,11 @@ use serde::{Deserialize, Serialize}; /// Input for the `agent_judge` tool (provided by the main agent). #[derive(Debug, Clone)] pub struct JudgeRequest { - /// The main agent's explanation of why it believes the goal is achieved, - /// and/or points it wants the Judge to focus on. + /// An optional note from the main agent about this verification request. + /// The Judge evaluates the project state independently against the goal + /// and does not rely on this field as a self-assessment. Parsed for + /// input validation; if absent, the Judge still runs and the value is + /// discarded by `execute_judge_tool`. pub task: String, } @@ -17,8 +20,12 @@ impl JudgeRequest { .trim() .to_string(); + // Task is optional; an empty task string is valid. + // The Judge does not receive the main agent's self-assessment. if task.is_empty() { - return Err("missing agent_judge task".to_string()); + return Ok(Self { + task: "Goal acceptance verification".to_string(), + }); } Ok(Self { task }) @@ -198,8 +205,10 @@ mod tests { use super::*; #[test] - fn judge_request_requires_task() { - assert!(JudgeRequest::from_tool_input(&serde_json::json!({})).is_err()); + fn judge_request_empty_task_returns_default() { + // Empty task is now valid; returns a default task string.
+ let req = JudgeRequest::from_tool_input(&serde_json::json!({})).expect("empty task parses"); + assert_eq!(req.task, "Goal acceptance verification"); let req = JudgeRequest::from_tool_input(&serde_json::json!({ "task": " verify it " })) .expect("parses"); assert_eq!(req.task, "verify it"); diff --git a/src-tauri/src/core/subagent/runtime_orchestration.rs b/src-tauri/src/core/subagent/runtime_orchestration.rs index c458e098..326e748b 100644 --- a/src-tauri/src/core/subagent/runtime_orchestration.rs +++ b/src-tauri/src/core/subagent/runtime_orchestration.rs @@ -356,10 +356,9 @@ impl RuntimeOrchestrationTool { "properties": { "task": { "type": "string", - "description": "Explain why you believe the goal is achieved and call out anything the Judge should focus on (e.g. acceptance criteria, areas you are unsure about). If you are re-verifying after fixing earlier findings, summarize what you changed." + "description": "Optional note for the Judge. The Judge evaluates the project state independently against the goal and does not rely on your self-assessment." } - }, - "required": ["task"] + } }), }; diff --git a/src-tauri/src/model/goal.rs b/src-tauri/src/model/goal.rs index 62f129d2..ddc59393 100644 --- a/src-tauri/src/model/goal.rs +++ b/src-tauri/src/model/goal.rs @@ -80,17 +80,6 @@ impl PauseReason { } } } - - /// Whether the goal should auto-resume when the user sends a new message. - pub fn auto_resume_on_user_message(&self) -> bool { - matches!( - self, - PauseReason::ClarifyPending - | PauseReason::PlanPending - | PauseReason::IdleBlocked - | PauseReason::Interrupted - ) - } } /// Verdict from the post-turn evaluation. 
diff --git a/src-tauri/src/persistence/repo/run_helper_repo.rs b/src-tauri/src/persistence/repo/run_helper_repo.rs index 06eae05a..61e448d8 100644 --- a/src-tauri/src/persistence/repo/run_helper_repo.rs +++ b/src-tauri/src/persistence/repo/run_helper_repo.rs @@ -300,6 +300,27 @@ pub async fn list_by_run_ids( Ok(rows.into_iter().map(RunHelperRow::into_dto).collect()) } +/// List all run_helpers for a given thread. Used by the Judge's process +/// compliance layer to inspect review call history. +pub async fn list_by_thread_id( + pool: &SqlitePool, + thread_id: &str, +) -> Result, AppError> { + let rows = sqlx::query_as::<_, RunHelperRow>( + "SELECT id, run_id, thread_id, helper_kind, parent_tool_call_id, status, + input_summary, output_summary, error_summary, started_at, finished_at, + input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, total_tokens + FROM run_helpers + WHERE thread_id = ? + ORDER BY started_at ASC, id ASC", + ) + .bind(thread_id) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(RunHelperRow::into_dto).collect()) +} + #[cfg(test)] mod tests { use super::*; @@ -853,6 +874,41 @@ mod tests { assert!(kinds.contains(&"explore".into())); } + #[tokio::test] + async fn list_by_thread_id_returns_helpers_for_thread() { + let pool = setup_test_pool().await; + + // Insert helpers for the test thread + for (id, kind) in &[("h-1", "helper_review"), ("h-2", "helper_explore")] { + let helper = RunHelperInsert { + id: id.to_string(), + run_id: "run-1".into(), + thread_id: "t1".into(), + helper_kind: kind.to_string(), + parent_tool_call_id: None, + status: "completed".into(), + model_role: "auxiliary".into(), + provider_id: None, + model_id: None, + input_summary: Some(format!("{kind} task")), + }; + insert(&pool, &helper).await.unwrap(); + } + + let result = list_by_thread_id(&pool, "t1").await.unwrap(); + assert_eq!(result.len(), 2); + let kinds: Vec = result.iter().map(|h| h.helper_kind.clone()).collect(); + 
assert!(kinds.contains(&"helper_review".into())); + assert!(kinds.contains(&"helper_explore".into())); + } + + #[tokio::test] + async fn list_by_thread_id_returns_empty_for_unknown_thread() { + let pool = setup_test_pool().await; + let result = list_by_thread_id(&pool, "t-unknown").await.unwrap(); + assert!(result.is_empty()); + } + #[tokio::test] async fn list_by_run_ids_returns_empty_for_empty_input() { let pool = setup_test_pool().await; diff --git a/src-tauri/src/persistence/repo/run_repo.rs b/src-tauri/src/persistence/repo/run_repo.rs index 52c9782b..994d4e12 100644 --- a/src-tauri/src/persistence/repo/run_repo.rs +++ b/src-tauri/src/persistence/repo/run_repo.rs @@ -321,6 +321,11 @@ pub async fn interrupt_active_runs(pool: &SqlitePool) -> Result { // Excludes all non-progressing statuses — see RunStatus::non_progressing_sql_in_clause() "UPDATE thread_runs SET status = 'interrupted', + elapsed_running_secs = elapsed_running_secs + + CASE WHEN running_since IS NOT NULL + THEN CAST(strftime('%s', 'now') - strftime('%s', running_since) AS INTEGER) + ELSE 0 END, + running_since = NULL, error_message = COALESCE( error_message, 'The app closed or the run was terminated before completion. Restarted in interrupted state.' diff --git a/src-tauri/tests/goal_lifecycle.rs b/src-tauri/tests/goal_lifecycle.rs index 71d8ed90..30c67845 100644 --- a/src-tauri/tests/goal_lifecycle.rs +++ b/src-tauri/tests/goal_lifecycle.rs @@ -165,7 +165,11 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_clarify_triggers_pause() { + async fn evaluate_after_turn_clarify_no_longer_pauses() { + // Tool-based auto-pausing has been removed: `clarify` no longer + // returns a `Paused(ClarifyPending)` verdict. Status transitions are + // reserved for explicit user commands and Judge verdicts, so the + // evaluate path falls through to `Continue`. 
let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -174,17 +178,22 @@ mod tests { mgr.record_tool_call("clarify"); let verdict = mgr.evaluate_after_turn("What do you think?", &goal); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::ClarifyPending, - .. - } - )); + assert!( + matches!(verdict, GoalVerdict::Continue), + "clarify should no longer pause the goal; got {verdict:?}" + ); + + // DB status must remain active — no pause was written. + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] - async fn evaluate_after_turn_update_plan_triggers_pause() { + async fn evaluate_after_turn_update_plan_no_longer_pauses() { + // Tool-based auto-pausing has been removed: `update_plan` no longer + // returns a `Paused(PlanPending)` verdict. The plan tool's approval + // flow is handled outside the goal manager now. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -192,13 +201,14 @@ mod tests { mgr.record_tool_call("update_plan"); let verdict = mgr.evaluate_after_turn("Here is the plan", &goal); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::PlanPending, - .. - } - )); + assert!( + matches!(verdict, GoalVerdict::Continue), + "update_plan should no longer pause the goal; got {verdict:?}" + ); + + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] @@ -283,6 +293,9 @@ mod tests { #[tokio::test] async fn auto_resume_clarify_pending() { + // Auto-resume on user message has been removed. 
A paused goal — even + // one paused for a `ClarifyPending` reason — must stay paused until + // an explicit `resume()` is issued. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -291,15 +304,21 @@ mod tests { .await .unwrap(); - let resumed = mgr.try_auto_resume().await.unwrap(); - assert!(resumed, "ClarifyPending should auto-resume"); + // No auto-resume path exists; status stays paused. + let paused = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(paused.status, GoalStatus::Paused); + // Explicit resume still works. + mgr.resume(&goal.id).await.unwrap(); let active = mgr.get_active().await.unwrap().unwrap(); assert_eq!(active.status, GoalStatus::Active); } #[tokio::test] async fn auto_resume_skips_user_requested() { + // Auto-resume on user message has been removed. A `UserRequested` + // pause is therefore equivalent to every other pause from the + // auto-resume perspective: only explicit `resume()` will reopen it. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); let goal = mgr.create_goal("Test goal", None).await.unwrap(); @@ -308,9 +327,6 @@ mod tests { .await .unwrap(); - let resumed = mgr.try_auto_resume().await.unwrap(); - assert!(!resumed, "UserRequested should NOT auto-resume"); - let paused = mgr.get_active().await.unwrap().unwrap(); assert_eq!(paused.status, GoalStatus::Paused); } @@ -379,32 +395,33 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_completion_claim_thrice_pauses() { + async fn evaluate_after_turn_completion_claim_keeps_challenging() { + // Repeated self-claimed completion no longer auto-pauses. The + // challenge prompt keeps nudging the agent toward `agent_judge`; the + // DB status remains `active` until a Judge verdict lands. 
+ // + // The independent `MAX_IDLE_TURNS` path still pauses after three + // consecutive tool-less turns, so we exercise only two tool-less + // claim turns — that is enough to confirm the completion-claim + // branch returns `ChallengeEvidence` (and not a `Paused(IdleBlocked)` + // triggered by the former three-claim counter). let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); - - // First claim: challenge only - let v1 = mgr.evaluate_after_turn("All done!", &goal); - assert!(matches!(v1, GoalVerdict::ChallengeEvidence)); - - let fresh1 = mgr.get_active().await.unwrap().unwrap(); - - // Second claim: challenge only - let v2 = mgr.evaluate_after_turn("Everything is complete!", &fresh1); - assert!(matches!(v2, GoalVerdict::ChallengeEvidence)); + mgr.create_goal("Test goal", None).await.unwrap(); - let fresh2 = mgr.get_active().await.unwrap().unwrap(); + for claim in ["All done!", "Everything is complete!"] { + let fresh = mgr.get_active().await.unwrap().unwrap(); + let verdict = mgr.evaluate_after_turn(claim, &fresh); + assert!( + matches!(verdict, GoalVerdict::ChallengeEvidence), + "completion claim `{claim}` should keep producing ChallengeEvidence; got {verdict:?}" + ); + } - // Third claim: should pause (IdleBlocked) - let v3 = mgr.evaluate_after_turn("Finished everything!", &fresh2); - assert!(matches!( - v3, - GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - .. - } - )); + // No pause was ever written to the DB. + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] @@ -551,23 +568,31 @@ mod tests { } #[tokio::test] - async fn evaluate_after_turn_chinese_idle_phrase_pauses() { + async fn evaluate_after_turn_chinese_idle_phrase_no_longer_pauses() { + // Heuristic question-phrase detection has been removed. 
Short + // Chinese question-like responses must not flip the goal to paused; + // status transitions are reserved for explicit user commands and + // Judge verdicts. The independent `MAX_IDLE_TURNS` path still + // pauses after three consecutive tool-less turns, but the heuristic + // branch is gone. let pool = setup_pool().await; let mgr = GoalManager::new(pool.clone(), "thread-1".into(), test_runtime()); - let goal = mgr.create_goal("Test goal", None).await.unwrap(); + mgr.create_goal("Test goal", None).await.unwrap(); - // One idle turn first, then short Chinese question-like response - mgr.evaluate_after_turn("随便聊聊", &goal); + // Reset the idle counter so MAX_IDLE_TURNS does not fire on the + // single-tool-less turn we care about. + mgr.record_tool_call("read"); let fresh = mgr.get_active().await.unwrap().unwrap(); let verdict = mgr.evaluate_after_turn("请确认这个方案是否可以?", &fresh); - assert!(matches!( - verdict, - GoalVerdict::Paused { - reason: PauseReason::IdleBlocked, - .. - } - )); + assert!( + !matches!(verdict, GoalVerdict::Paused { .. }), + "heuristic Chinese idle phrase should no longer pause the goal; got {verdict:?}" + ); + + let active = mgr.get_active().await.unwrap().unwrap(); + assert_eq!(active.status, GoalStatus::Active); + assert!(active.pause_reason.is_none()); } #[tokio::test] diff --git a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts index 30996970..06a610a2 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench-logic.ts @@ -172,6 +172,12 @@ export function buildThreadContextBadgeData(options: { const isExceeded = Boolean( contextWindow && contextWindow > 0 && contextSize > contextWindow, ); + // Compression trigger threshold. 
The backend reserves 20% of the model's + // context window (src-tauri/src/core/context_compression.rs) and triggers + // auto-compression when the observed `context_size` exceeds the + // remaining 80% budget. Mirror that ratio here so the header pill can + // draw a hint marker at the same boundary the runtime uses. + const compressionThresholdRatio = 0.8; return { contextWindow, @@ -195,6 +201,7 @@ export function buildThreadContextBadgeData(options: { usedLabel: formatCompactTokenCount(contextSize), totalLabel: contextWindow ? formatCompactTokenCount(contextWindow) : "N/A", usedPercent, + compressionThresholdRatio, }; } diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts index e09f9413..65a44208 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.test.ts +++ b/src/modules/workbench-shell/ui/dashboard-workbench.test.ts @@ -159,4 +159,18 @@ describe("buildThreadContextBadgeData", () => { expect(badge.rawUsedPercent).toBe(0); expect(badge.totalLabel).toBe("N/A"); }); + + it("exposes the 80% compression threshold so the header can mark it", () => { + // The backend reserves 20% of the context window and triggers + // auto-compression when observed context_size exceeds the remaining + // 80% budget. The header pill mirrors the same ratio so the dashed + // marker is drawn at the exact boundary the runtime uses. 
+ const badge = buildThreadContextBadgeData({ + fallbackContextWindow: "1000", + fallbackModelDisplayName: "Selected Model", + runtimeUsage: makeRuntimeUsage(), + }); + + expect(badge.compressionThresholdRatio).toBe(0.8); + }); }); diff --git a/src/modules/workbench-shell/ui/dashboard-workbench.tsx b/src/modules/workbench-shell/ui/dashboard-workbench.tsx index 8173430e..686fddd3 100644 --- a/src/modules/workbench-shell/ui/dashboard-workbench.tsx +++ b/src/modules/workbench-shell/ui/dashboard-workbench.tsx @@ -1053,6 +1053,25 @@ const drawerWidth = useStore(uiLayoutStore, (s) => s.drawerWidth); width: `${contextBadge.usageRatio * 100}%`, }} /> + {contextBadge.contextWindow && + contextBadge.contextWindow > 0 ? ( +