tiylabs · jorben · Jun 17, 2026 · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/src-tauri/src/core/agent_session_execution.rs b/src-tauri/src/core/agent_session_execution.rs
diff --git a/src-tauri/src/core/context_compression.rs b/src-tauri/src/core/context_compression.rs
@@ -21,9 +21,16 @@
 
 use tiycore::agent::AgentMessage;
 use tiycore::types::{ContentBlock, TextContent, UserMessage};
-/// Reserve this many tokens for the model's response + overhead.
-/// Matches pi-mono `DEFAULT_COMPACTION_SETTINGS.reserveTokens`.
-const RESERVE_TOKENS: u32 = 16_384;
+/// Fraction of the model's context window that is reserved for the model's
+/// response + provider/tool overhead, expressed in basis points (1/100th of
+/// a percent). 2000 bps == 20%.
+const RESERVE_BASIS_POINTS: u32 = 2_000;
+
+/// Minimum number of tokens to keep reserved even when 20% of the context
+/// window is smaller than this floor. This is the previous hard-coded
+/// reserve of `16_384` tokens, kept as a lower bound; for windows at or
+/// below the floor the reserve exceeds the window and the budget is zero.
+const RESERVE_TOKENS_MIN: u32 = 16_384;
 
 /// Keep at least this many tokens of recent conversation untouched.
 /// With LLM-generated summaries providing rich context, we can keep a
@@ -233,7 +240,7 @@ impl CompressionSettings {
     pub fn new(context_window: u32) -> Self {
         Self {
             context_window,
-            reserve_tokens: RESERVE_TOKENS,
+            reserve_tokens: reserve_tokens_for(context_window),
             keep_recent_tokens: KEEP_RECENT_TOKENS,
         }
     }
@@ -244,6 +251,19 @@ impl CompressionSettings {
     }
 }
 
+/// Reserve `RESERVE_BASIS_POINTS` (20%) of the model's context window,
+/// rounded up, for the model's response + overhead. `RESERVE_TOKENS_MIN` is
+/// a floor, so the result may exceed `context_window` for small windows;
+/// above the floor the reserve scales proportionally with the window size.
+fn reserve_tokens_for(context_window: u32) -> u32 {
+    let percent_budget = ((context_window as u64)
+        .saturating_mul(RESERVE_BASIS_POINTS as u64)
+        .saturating_add(9_999))
+        / 10_000;
+    let percent_budget = percent_budget.min(u32::MAX as u64) as u32;
+    percent_budget.max(RESERVE_TOKENS_MIN)
+}
+
 // ---------------------------------------------------------------------------
 // Public API: should_compress, find_cut_point, build_compressed_messages
 // ---------------------------------------------------------------------------
@@ -911,6 +931,49 @@ mod tests {
         assert_eq!(calibration.apply_to_estimate(0), 0);
     }
 
+    #[test]
+    fn compression_settings_reserves_twenty_percent_of_context_window() {
+        // For typical large context windows the 20% reserve is well above
+        // the 16,384 token floor, so the budget is exactly 80% of the
+        // window. This is the primary behaviour change: instead of
+        // reserving a fixed 16,384 tokens regardless of model, we reserve
+        // 20% of the model's actual context window.
+        let cases = [
+            (128_000_u32, 25_600_u32, 102_400_u32),    // GPT-4o class
+            (200_000_u32, 40_000_u32, 160_000_u32),    // Claude-class
+            (1_000_000_u32, 200_000_u32, 800_000_u32), // 1M-window class
+        ];
+        for (context_window, expected_reserve, expected_budget) in cases {
+            let settings = CompressionSettings::new(context_window);
+            assert_eq!(
+                settings.reserve_tokens, expected_reserve,
+                "20% reserve for {context_window}-token window",
+            );
+            assert_eq!(
+                settings.budget(),
+                expected_budget,
+                "80% budget for {context_window}-token window",
+            );
+        }
+    }
+
+    #[test]
+    fn compression_settings_reserve_clamps_to_minimum_for_small_windows() {
+        // When 20% of the window would be smaller than the safety floor
+        // (16,384 tokens), the floor takes over so tiny windows still
+        // keep enough headroom for the model response.
+        let settings = CompressionSettings::new(32_000);
+        // 20% of 32,000 = 6,400 < 16,384 → floor wins.
+        assert_eq!(settings.reserve_tokens, 16_384);
+        assert_eq!(settings.budget(), 32_000 - 16_384);
+
+        // Window at or below the floor: reserve equals the floor and
+        // saturating_sub protects `budget` from underflow.
+        let tiny = CompressionSettings::new(8_000);
+        assert_eq!(tiny.reserve_tokens, 16_384);
+        assert_eq!(tiny.budget(), 0);
+    }
+
     #[test]
     fn should_compress_via_context_size_triggers_when_last_usage_exceeds_budget() {
         // The unified `context_size` (= input + output + cache_read +

diff --git a/src-tauri/src/core/goal_manager.rs b/src-tauri/src/core/goal_manager.rs
@@ -255,34 +255,6 @@ impl GoalManager {
         goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await
     }
 
-    // ── Auto-resume ──
-
-    /// Check if a paused goal should auto-resume when the user sends a new message.
-    /// Returns Some(()) if the goal was auto-resumed, None if it shouldn't.
-    pub async fn try_auto_resume(&self) -> Result<bool, AppError> {
-        let goal = match self.get_active().await? {
-            Some(g) => g,
-            None => return Ok(false),
-        };
-
-        if goal.status != GoalStatus::Paused {
-            return Ok(false);
-        }
-
-        let should_resume = goal
-            .pause_reason
-            .as_ref()
-            .map(|r| r.auto_resume_on_user_message())
-            .unwrap_or(false);
-
-        if should_resume {
-            goal_repo::update_status(&self.pool, &goal.id, GoalStatus::Active, None, None, None)
-                .await?;
-        }
-
-        Ok(should_resume)
-    }
-
     // ── Evaluation ──
 
     /// Evaluate whether the goal should continue, pause, or complete after a turn.
@@ -307,27 +279,12 @@ impl GoalManager {
                 return verdict;
             }
 
-            // Completion claim without tool call
+            // Completion claim without tool call — keep nudging agent toward
+            // Judge verification; no DB status change. The claim counter is no
+            // longer incremented here (it is still cleared on tool activity /
+            // resume / clear), so repeated claims no longer auto-pause; status
+            // transitions are reserved for user commands and Judge verdicts.
             if self.detect_completion_claim(response) {
-                let should_pause = {
-                    let mut guard = self.lock_runtime();
-                    let count = guard
-                        .completion_claim_count
-                        .entry(self.thread_id.clone())
-                        .or_default();
-                    *count += 1;
-                    *count >= 3
-                };
-                if should_pause {
-                    // Reset counter before pausing
-                    self.lock_runtime()
-                        .completion_claim_count
-                        .remove(&self.thread_id);
-                    return GoalVerdict::Paused {
-                        reason: PauseReason::IdleBlocked,
-                        detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()),
-                    };
-                }
                 return GoalVerdict::ChallengeEvidence;
             }
 
@@ -342,13 +299,12 @@ impl GoalManager {
         // Reset idle counters since tools were called
         self.reset_idle_counters();
 
-        // ── Layer 4: Budget checks ──
-        if let Some(budget) = goal.token_budget {
-            if goal.tokens_used >= budget {
-                return GoalVerdict::BudgetLimited;
-            }
-        }
-
+        // ── Layer 4: Turn budget + token budget checks ──
+        // `turns_used >= max_turns` auto-pauses (explicitly approved path).
+        // `tokens_used >= token_budget` is reported via the `budget_limited`
+        // verdict string (no DB status change) so the run loop can stop
+        // continuation. Status transitions are reserved for explicit user
+        // commands and Judge verdicts.
         if goal.turns_used >= goal.max_turns {
             return GoalVerdict::Paused {
                 reason: PauseReason::BudgetExhausted,
@@ -359,6 +315,12 @@ impl GoalManager {
             };
         }
 
+        if let Some(budget) = goal.token_budget {
+            if goal.tokens_used >= budget {
+                return GoalVerdict::BudgetLimited;
+            }
+        }
+
         // ── Default: continue ──
         GoalVerdict::Continue
     }
@@ -370,35 +332,23 @@ impl GoalManager {
         tool_calls: &[String],
         _response: &str,
     ) -> Option<GoalVerdict> {
-        for tool_name in tool_calls {
-            match tool_name.as_str() {
-                "clarify" => {
-                    return Some(GoalVerdict::Paused {
-                        reason: PauseReason::ClarifyPending,
-                        detail: Some("agent requested clarification".into()),
-                    });
-                }
-                "update_plan" => {
-                    return Some(GoalVerdict::Paused {
-                        reason: PauseReason::PlanPending,
-                        detail: Some("agent published a plan, awaiting approval".into()),
-                    });
-                }
-                // agent_judge is the main-agent-only acceptance request. It is
-                // handled by the tool execution pipeline (execute_judge_tool),
-                // which runs the Judge and records the verdict. Evaluation must
-                // not treat it as a blocking tool — like any tool call it shows
-                // the agent acted and should reset idle tendencies.
-                _ => {}
-            }
-        }
+        // Tool-based auto-pausing has been removed: status transitions are
+        // reserved for explicit user commands and Judge verdicts. `clarify`
+        // and `update_plan` no longer flip the goal to paused; they fall
+        // through to the continuation path. `agent_judge` is the main-agent
+        // acceptance request and is handled by the tool execution pipeline
+        // (execute_judge_tool), which runs the Judge and records the
+        // verdict — it is never a blocking tool here.
+        let _ = tool_calls;
         None
     }
 
     fn detect_idle_block(&self, response: &str) -> Option<GoalVerdict> {
         let idle_count = self.increment_idle_count();
-        let trimmed = response.trim().to_lowercase();
-
+        // Heuristic question detection has been removed; status transitions
+        // are reserved for explicit user commands and Judge verdicts, so idle
+        // detection is purely a turn-count trigger. `response` is unused.
+        let _ = response;
         if idle_count >= MAX_IDLE_TURNS {
             return Some(GoalVerdict::Paused {
                 reason: PauseReason::IdleBlocked,
@@ -407,31 +357,6 @@ impl GoalManager {
                 )),
             });
         }
-
-        // Lightweight heuristic: short question-like response + no tools
-        if idle_count >= 2 {
-            let blockers = [
-                "should i",
-                "do you want",
-                "would you like",
-                "请确认",
-                "需要你决定",
-                "which approach",
-                "which option",
-                "can you confirm",
-                "let me know if",
-                "before i proceed",
-                "你的选择是",
-                "你确认吗",
-                "需要你同意",
-            ];
-            if trimmed.len() < 500 && blockers.iter().any(|b| trimmed.contains(b)) {
-                return Some(GoalVerdict::Paused {
-                    reason: PauseReason::IdleBlocked,
-                    detail: Some("agent appears blocked, may need user input".into()),
-                });
-            }
-        }
         None
     }
 
@@ -613,7 +538,15 @@ impl GoalManager {
                     .await?;
             }
             GoalVerdict::BudgetLimited => {
-                self.mark_budget_limited(&current.id).await?;
+                // Advisory: token budget exhausted — do NOT write to DB.
+                // The verdict string still propagates as "budget_limited" so
+                // the run loop can stop continuation. Goal status remains
+                // `active` and is only changed by explicit user commands
+                // (`/goal budget-limit`) or Judge verdicts.
+                tracing::info!(
+                    goal_id = %current.id,
+                    "token budget exhausted: emitting budget_limited verdict without DB status change"
+                );
             }
         }
 

diff --git a/src-tauri/src/core/prompt/templates/active_goal.tpl.md b/src-tauri/src/core/prompt/templates/active_goal.tpl.md
@@ -11,10 +11,10 @@ Turns used: {{turns_used}}/{{max_turns}}
 **Completion is decided by independent verification — you cannot self-declare it.**
 1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups.
 2. Verify your work by running the relevant tests, linters, or build commands as you go.
-3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`.
+3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge()`.
 
 Rules:
-- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness.
+- Call `agent_judge()` to request independent goal acceptance verification. An independent Judge will evaluate the project against the goal's completeness. You do not need to provide a self-assessment — the Judge evaluates the project state directly.
 - The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself.
 - If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again.
 - Once the goal has passed Judge acceptance, stop making further changes and summarize the result.