Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
d19a9e8
feat(goal): ✨ replace self-attestation goal_scored with independent J…
jorben Jun 7, 2026
3b77dd1
refactor(goal): ♻️ remove mark_complete and complete verdict
jorben Jun 7, 2026
b204d9b
docs: 📝 update and reorder README feature list
jorben Jun 7, 2026
e284fbe
refactor(goal): ♻️ extract resolveGoalStatusKey for testability
jorben Jun 7, 2026
e8a58f2
refactor(subagent): 🔧 increase builtin default max delegation depth to 5
jorben Jun 7, 2026
c15e885
docs: 📝 remove obsolete design document
jorben Jun 7, 2026
d60daec
docs(judge): 📝 add size-first verification strategy and delegation gu…
jorben Jun 7, 2026
dc8fca0
refactor(goal): ♻️ remove goal-level time_used_seconds in favor of ru…
jorben Jun 7, 2026
4481759
feat(judge): ✨ redesign Judge evaluation for independence and complet…
jorben Jun 10, 2026
0e8b153
fix(subagent): 🐛 make task field optional and fix UTF-8 safe truncation
jorben Jun 10, 2026
f65683a
merge: resolve origin/master conflicts on judge redesign
jorben Jun 11, 2026
539005c
chore(deps): 🔧 align tiycore to 0.2.10-rc.2 and adopt Usage::context_…
jorben Jun 11, 2026
afd221e
refactor(goal): ♻️ centralize status transitions to explicit commands…
jorben Jun 11, 2026
f80d652
fix(agent): 🐛 fix timestamp slicing panic and add has_process_require…
jorben Jun 11, 2026
0cca885
feat(compression): ✨ reserve 20% context window for auto-compression …
jorben Jun 11, 2026
73c7cb5
fix(run): 🐛 record elapsed running time when interrupting active runs
jorben Jun 11, 2026
eb4b722
test: cover Judge summary builders and mapRunSummaryToContextUsage fa…
jorben Jun 12, 2026
de0b542
merge: resolve origin/master conflicts after review fixes
jorben Jun 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
504 changes: 473 additions & 31 deletions src-tauri/src/core/agent_session_execution.rs

Large diffs are not rendered by default.

71 changes: 67 additions & 4 deletions src-tauri/src/core/context_compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,16 @@

use tiycore::agent::AgentMessage;
use tiycore::types::{ContentBlock, TextContent, UserMessage};
/// Reserve this many tokens for the model's response + overhead.
/// Matches pi-mono `DEFAULT_COMPACTION_SETTINGS.reserveTokens`.
const RESERVE_TOKENS: u32 = 16_384;
/// Fraction of the model's context window that is reserved for the model's
/// response + provider/tool overhead, expressed in basis points (1/100th of
/// a percent). 2000 bps == 20%.
const RESERVE_BASIS_POINTS: u32 = 2_000;

/// Floor for the reserve, used whenever 20% of the context window would be
/// smaller than this value. It equals the previous hard-coded reserve of
/// `16_384` tokens, so models with small context windows keep at least the
/// headroom they had before the percentage-based reserve was introduced.
const RESERVE_TOKENS_MIN: u32 = 16_384;

/// Keep at least this many tokens of recent conversation untouched.
/// With LLM-generated summaries providing rich context, we can keep a
Expand Down Expand Up @@ -233,7 +240,7 @@ impl CompressionSettings {
pub fn new(context_window: u32) -> Self {
Self {
context_window,
reserve_tokens: RESERVE_TOKENS,
reserve_tokens: reserve_tokens_for(context_window),
keep_recent_tokens: KEEP_RECENT_TOKENS,
}
}
Expand All @@ -244,6 +251,19 @@ impl CompressionSettings {
}
}

/// Compute the token reserve for a model with the given `context_window`.
///
/// The reserve is `RESERVE_BASIS_POINTS / 10_000` of the window (20%),
/// rounded up to the next whole token, and is never smaller than
/// `RESERVE_TOKENS_MIN`. The floor keeps small windows from ending up with
/// too little headroom for the model's response + overhead, while larger
/// windows get a reserve that grows with their size.
fn reserve_tokens_for(context_window: u32) -> u32 {
    // Widen to u64 before multiplying so the product cannot wrap; adding
    // `10_000 - 1` ahead of the division turns it into a ceiling division.
    let scaled = u64::from(context_window).saturating_mul(u64::from(RESERVE_BASIS_POINTS));
    let rounded_up = scaled.saturating_add(9_999) / 10_000;

    // Narrow back to u32, pinning at `u32::MAX` rather than truncating.
    let proportional = if rounded_up > u64::from(u32::MAX) {
        u32::MAX
    } else {
        rounded_up as u32
    };

    // Apply the floor last so it always wins over a smaller percentage.
    std::cmp::max(proportional, RESERVE_TOKENS_MIN)
}

// ---------------------------------------------------------------------------
// Public API: should_compress, find_cut_point, build_compressed_messages
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -911,6 +931,49 @@ mod tests {
assert_eq!(calibration.apply_to_estimate(0), 0);
}

#[test]
fn compression_settings_reserves_twenty_percent_of_context_window() {
    // Every window in the table is large enough that its 20% share is
    // above the 16,384-token floor, so the reserve is the plain
    // percentage and the budget is the remaining 80%. This pins the
    // main behaviour change: the reserve scales with the model's context
    // window instead of being a fixed 16,384 tokens for every model.
    //
    // Columns: (context window, expected reserve, expected budget).
    const TABLE: [(u32, u32, u32); 3] = [
        (128_000, 25_600, 102_400),    // GPT-4o class
        (200_000, 40_000, 160_000),    // Claude-class
        (1_000_000, 200_000, 800_000), // 1M-window class
    ];

    for &(context_window, expected_reserve, expected_budget) in TABLE.iter() {
        let settings = CompressionSettings::new(context_window);

        let reserve = settings.reserve_tokens;
        assert_eq!(
            reserve, expected_reserve,
            "20% reserve for {context_window}-token window",
        );

        let budget = settings.budget();
        assert_eq!(
            budget, expected_budget,
            "80% budget for {context_window}-token window",
        );
    }
}

#[test]
fn compression_settings_reserve_clamps_to_minimum_for_small_windows() {
    // 32,000-token window: 20% is only 6,400 tokens, which is below the
    // 16,384-token floor, so the floor is what gets reserved and the
    // budget is whatever remains (32,000 - 16,384 = 15,616).
    let small = CompressionSettings::new(32_000);
    assert_eq!(small.reserve_tokens, 16_384);
    assert_eq!(small.budget(), 15_616);

    // 8,000-token window: the window itself is smaller than the floor.
    // The reserve is still the floor, and the budget is expected to
    // bottom out at zero rather than underflow.
    let below_floor = CompressionSettings::new(8_000);
    assert_eq!(below_floor.reserve_tokens, 16_384);
    assert_eq!(below_floor.budget(), 0);
}

#[test]
fn should_compress_via_context_size_triggers_when_last_usage_exceeds_budget() {
// The unified `context_size` (= input + output + cache_read +
Expand Down
143 changes: 38 additions & 105 deletions src-tauri/src/core/goal_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,34 +255,6 @@ impl GoalManager {
goal_repo::account_usage(&self.pool, goal_id, tokens, 1).await
}

// ── Auto-resume ──

/// Resume the active goal if it is paused for a reason that allows
/// auto-resume when the user sends a new message.
///
/// Returns `Ok(true)` when the goal's status was switched back to
/// `GoalStatus::Active`. Returns `Ok(false)` when there is no active goal,
/// the goal is not paused, or its pause reason is absent or does not opt
/// in to auto-resume. Errors from `get_active` and
/// `goal_repo::update_status` are propagated.
pub async fn try_auto_resume(&self) -> Result<bool, AppError> {
    // Only a goal that exists and is currently paused is a candidate.
    let paused_goal = match self.get_active().await? {
        Some(candidate) if candidate.status == GoalStatus::Paused => candidate,
        _ => return Ok(false),
    };

    // A missing pause reason is treated as "do not auto-resume".
    let wants_resume = paused_goal
        .pause_reason
        .as_ref()
        .map_or(false, |reason| reason.auto_resume_on_user_message());
    if !wants_resume {
        return Ok(false);
    }

    goal_repo::update_status(
        &self.pool,
        &paused_goal.id,
        GoalStatus::Active,
        None,
        None,
        None,
    )
    .await?;

    Ok(true)
}

// ── Evaluation ──

/// Evaluate whether the goal should continue, pause, or complete after a turn.
Expand All @@ -307,27 +279,12 @@ impl GoalManager {
return verdict;
}

// Completion claim without tool call
// Completion claim without tool call — keep nudging agent toward

This comment was marked as outdated.

// Judge verification; no DB status change. The counter is still
// cleared on tool activity / resume / clear, but reaching 3 no
// longer auto-pauses (status transitions are reserved for user
// commands and Judge verdicts).
if self.detect_completion_claim(response) {
let should_pause = {
let mut guard = self.lock_runtime();
let count = guard
.completion_claim_count
.entry(self.thread_id.clone())
.or_default();
*count += 1;
*count >= 3
};
if should_pause {
// Reset counter before pausing
self.lock_runtime()
.completion_claim_count
.remove(&self.thread_id);
return GoalVerdict::Paused {
reason: PauseReason::IdleBlocked,
detail: Some("agent repeatedly claimed completion without requesting Judge verification via agent_judge".into()),
};
}
return GoalVerdict::ChallengeEvidence;
}

Expand All @@ -342,13 +299,12 @@ impl GoalManager {
// Reset idle counters since tools were called
self.reset_idle_counters();

// ── Layer 4: Budget checks ──
if let Some(budget) = goal.token_budget {
if goal.tokens_used >= budget {
return GoalVerdict::BudgetLimited;
}
}

// ── Layer 4: Turn budget + token budget checks ──
// `turns_used >= max_turns` auto-pauses (explicitly approved path).
// `tokens_used >= token_budget` is reported via the `budget_limited`
// verdict string (no DB status change) so the run loop can stop
// continuation. Status transitions are reserved for explicit user
// commands and Judge verdicts.
if goal.turns_used >= goal.max_turns {
return GoalVerdict::Paused {
reason: PauseReason::BudgetExhausted,
Expand All @@ -359,6 +315,12 @@ impl GoalManager {
};
}

if let Some(budget) = goal.token_budget {
if goal.tokens_used >= budget {
return GoalVerdict::BudgetLimited;
}
}

// ── Default: continue ──
GoalVerdict::Continue
}
Expand All @@ -370,35 +332,23 @@ impl GoalManager {
tool_calls: &[String],
_response: &str,
) -> Option<GoalVerdict> {
for tool_name in tool_calls {
match tool_name.as_str() {
"clarify" => {
return Some(GoalVerdict::Paused {
reason: PauseReason::ClarifyPending,
detail: Some("agent requested clarification".into()),
});
}
"update_plan" => {
return Some(GoalVerdict::Paused {
reason: PauseReason::PlanPending,
detail: Some("agent published a plan, awaiting approval".into()),
});
}
// agent_judge is the main-agent-only acceptance request. It is
// handled by the tool execution pipeline (execute_judge_tool),
// which runs the Judge and records the verdict. Evaluation must
// not treat it as a blocking tool — like any tool call it shows
// the agent acted and should reset idle tendencies.
_ => {}
}
}
// Tool-based auto-pausing has been removed: status transitions are

This comment was marked as outdated.

// reserved for explicit user commands and Judge verdicts. `clarify`
// and `update_plan` no longer flip the goal to paused; they fall
// through to the continuation path. `agent_judge` is the main-agent
// acceptance request and is handled by the tool execution pipeline
// (execute_judge_tool), which runs the Judge and records the
// verdict — it is never a blocking tool here.
let _ = tool_calls;
None
}

fn detect_idle_block(&self, response: &str) -> Option<GoalVerdict> {
let idle_count = self.increment_idle_count();
let trimmed = response.trim().to_lowercase();

// Heuristic question detection has been removed; status transitions
// are reserved for explicit user commands and Judge verdicts, so idle
// detection is purely a turn-count trigger. `response` is unused.
let _ = response;
if idle_count >= MAX_IDLE_TURNS {
return Some(GoalVerdict::Paused {
reason: PauseReason::IdleBlocked,
Expand All @@ -407,31 +357,6 @@ impl GoalManager {
)),
});
}

// Lightweight heuristic: short question-like response + no tools
if idle_count >= 2 {
let blockers = [
"should i",
"do you want",
"would you like",
"请确认",
"需要你决定",
"which approach",
"which option",
"can you confirm",
"let me know if",
"before i proceed",
"你的选择是",
"你确认吗",
"需要你同意",
];
if trimmed.len() < 500 && blockers.iter().any(|b| trimmed.contains(b)) {
return Some(GoalVerdict::Paused {
reason: PauseReason::IdleBlocked,
detail: Some("agent appears blocked, may need user input".into()),
});
}
}
None
}

Expand Down Expand Up @@ -613,7 +538,15 @@ impl GoalManager {
.await?;
}
GoalVerdict::BudgetLimited => {
self.mark_budget_limited(&current.id).await?;
// Advisory: token budget exhausted — do NOT write to DB.
// The verdict string still propagates as "budget_limited" so
// the run loop can stop continuation. Goal status remains
// `active` and is only changed by explicit user commands
// (`/goal budget-limit`) or Judge verdicts.
tracing::info!(
goal_id = %current.id,
"token budget exhausted: emitting budget_limited verdict without DB status change"
);
}
}

Expand Down
4 changes: 2 additions & 2 deletions src-tauri/src/core/prompt/templates/active_goal.tpl.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ Turns used: {{turns_used}}/{{max_turns}}
**Completion is decided by independent verification — you cannot self-declare it.**
1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups.
2. Verify your work by running the relevant tests, linters, or build commands as you go.
3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`.
3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge()`.

Rules:
- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness.
- Call `agent_judge()` to request independent goal acceptance verification. An independent Judge will evaluate the project against the goal's completeness. You do not need to provide a self-assessment — the Judge evaluates the project state directly.
- The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself.
- If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again.
- Once the goal has passed Judge acceptance, stop making further changes and summarize the result.
Expand Down
Loading
Loading