From 8c01c5c6490155c3e77caec46d4b459b26d32fa1 Mon Sep 17 00:00:00 2001 From: Rik Schreurs Date: Fri, 1 May 2026 21:43:27 +0200 Subject: [PATCH 1/2] Detect stalled sessions + optional auto-resume after wake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the laptop sleeps, the in-flight HTTPS stream from `claude` dies and sessions sit in Working forever with no event to drive them back to Idle. The user's manual workaround was Stop → "continue". This adds: - A new StallDetectionService that scans every 30s for sessions whose stream has been silent past 90s (20s in the post-wake window). Wake events come from a cross-platform HeartbeatPowerService that watches for wall-clock (UTC) jumps between heartbeat ticks — no native bindings, AOT-clean. - An in-memory `IsStalled` flag on SessionVm, folded into the existing "Needs attention" sidebar filter. Stalled sessions show a banner with a Resume button that does the cancel-then-continue dance manually. - A single opt-in setting (off by default) — when on, eligible stalls auto-resume by sending "continue". Guards: only fires from Working (never RunningTool, to avoid re-running side-effecting tools), is skipped when the user has unsent composer text at the moment the stall is flagged, capped at 1 retry per stall, vetoes when a ResultEvent races in mid-cancel. - Migration 12 adds `is_auto_resume` on the messages table; the transcript template hides the synthetic user bubble and renders a thin "resumed after wake" separator instead, so the conversation reads as one continuous reply. ClaudeClient now force-kills its subprocess on cancellation so the auto-resume orchestration's WaitAsync(5s) returns promptly even when the network is dead. SessionManager.UpdateStatus gains a suppressNotification parameter so the cancel-half of an auto-resume doesn't fire a "turn complete" toast. 
--- src/Conclave.App/Claude/ClaudeClient.cs | 23 +- src/Conclave.App/Claude/ClaudeService.cs | 44 +++- src/Conclave.App/MainWindow.axaml.cs | 20 +- .../Platform/HeartbeatPowerService.cs | 58 +++++ .../Platform/IPlatformPowerService.cs | 11 + src/Conclave.App/Sessions/Database.cs | 16 +- src/Conclave.App/Sessions/MessageRow.cs | 3 + src/Conclave.App/Sessions/SessionManager.cs | 9 +- src/Conclave.App/Sessions/SettingsKeys.cs | 6 + .../Sessions/StallDetectionService.cs | 236 ++++++++++++++++++ src/Conclave.App/ViewModels/SessionVm.cs | 48 ++++ src/Conclave.App/ViewModels/ShellVm.cs | 41 ++- .../ViewModels/TranscriptMessageVm.cs | 24 +- src/Conclave.App/Views/Shell/MainPane.axaml | 49 +++- .../Views/Shell/MainPane.axaml.cs | 6 + .../Views/Shell/PreferencesModal.axaml | 13 + 16 files changed, 579 insertions(+), 28 deletions(-) create mode 100644 src/Conclave.App/Platform/HeartbeatPowerService.cs create mode 100644 src/Conclave.App/Platform/IPlatformPowerService.cs create mode 100644 src/Conclave.App/Sessions/StallDetectionService.cs diff --git a/src/Conclave.App/Claude/ClaudeClient.cs b/src/Conclave.App/Claude/ClaudeClient.cs index 2839b78..33540ea 100644 --- a/src/Conclave.App/Claude/ClaudeClient.cs +++ b/src/Conclave.App/Claude/ClaudeClient.cs @@ -112,14 +112,14 @@ public async IAsyncEnumerable StreamAsync( psi.ArgumentList.Add(claudeSessionId); } + // Spawn off the UI thread: Process.Start with three redirected pipes can block the + // calling thread long enough to swallow the layout pass for the just-appended user + // message, so the user's bubble doesn't paint until claude itself starts streaming. + using var proc = await Task.Run(() => Process.Start(psi), ct) + ?? 
throw new InvalidOperationException("failed to spawn claude"); + try { - // Spawn off the UI thread: Process.Start with three redirected pipes can block the - // calling thread long enough to swallow the layout pass for the just-appended user - // message, so the user's bubble doesn't paint until claude itself starts streaming. - using var proc = await Task.Run(() => Process.Start(psi), ct) - ?? throw new InvalidOperationException("failed to spawn claude"); - // Collect stderr concurrently so it doesn't fill the pipe and deadlock us. var stderrBuf = new StringBuilder(); proc.ErrorDataReceived += (_, e) => { if (e.Data != null) stderrBuf.AppendLine(e.Data); }; @@ -148,6 +148,17 @@ public async IAsyncEnumerable StreamAsync( } finally { + // If the caller cancelled (Stop button, auto-resume on a stalled session, app shutdown), + // proactively kill the subprocess + its tree. The pipe-readers respect ct and exit + // promptly, but `using var proc` only disposes our handles — the child claude process + // can keep running until its own pipes break, which on a stalled network can take a + // while. Killing here means StallDetectionService's WaitAsync(5s) reliably observes + // a clean unwind before sending "continue". + if (ct.IsCancellationRequested) + { + try { if (!proc.HasExited) proc.Kill(entireProcessTree: true); } + catch { /* already exited or no permission — best-effort */ } + } // Best-effort: claude has already read the file by the time it spawns its // own MCP client, so a delete here is safe. Swallow IO errors — leaving a // temp file behind is preferable to crashing the turn on cleanup. 
diff --git a/src/Conclave.App/Claude/ClaudeService.cs b/src/Conclave.App/Claude/ClaudeService.cs index b17596f..6c74f40 100644 --- a/src/Conclave.App/Claude/ClaudeService.cs +++ b/src/Conclave.App/Claude/ClaudeService.cs @@ -15,13 +15,23 @@ public sealed class ClaudeService public ClaudeService(SessionManager manager) => _manager = manager; - public async Task RunTurnAsync(SessionVm session, string prompt, CancellationToken ct = default) + public Task RunTurnAsync(SessionVm session, string prompt, CancellationToken ct = default) + => RunTurnAsync(session, prompt, isAutoResume: false, ct); + + public async Task RunTurnAsync(SessionVm session, string prompt, bool isAutoResume, CancellationToken ct = default) { // Per-session cancellation. Combined so either the caller's token or the Cancel // button on the SessionVm kills the turn. using var internalCts = new CancellationTokenSource(); using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, internalCts.Token); session.CancellationSource = internalCts; + // Reset the stall timestamp at turn start so a stalled prior turn doesn't immediately + // re-flag this fresh one before any events have had time to arrive. StallDetectionService + // also clears IsStalled when it sees a turn enter Working — but stamping here closes + // the race where the timer fires between RunTurnAsync's UpdateStatus(Working) and the + // first stream event. + session.LastStreamEventAt = DateTime.UtcNow; + session.IsStalled = false; // Per-turn permission router. 
Registered with the shared MCP server so claude // can call our permission_prompt tool over HTTP; cancelled in finally so a @@ -48,6 +58,7 @@ public async Task RunTurnAsync(SessionVm session, string prompt, CancellationTok Role = MessageRole.User, Time = Now(), Content = prompt, + IsAutoResume = isAutoResume, }; session.AppendTranscript(userMsg); _manager.PersistMessage(session, userMsg); @@ -106,9 +117,15 @@ public async Task RunTurnAsync(SessionVm session, string prompt, CancellationTok catch (OperationCanceledException) { // User cancelled — not an error. Flip status to Idle; do not write an - // "[error]" transcript entry. - Log(session, LogLevel.Wrn, "Turn cancelled by user"); - _manager.UpdateStatus(session, SessionStatus.Idle); + // "[error]" transcript entry. When StallDetectionService is the canceller + // (auto-resume case) it sets SuppressNextTurnCompleteNotification so the + // user doesn't see a "turn complete" toast for what's internally a restart. + var suppress = session.SuppressNextTurnCompleteNotification; + session.SuppressNextTurnCompleteNotification = false; + Log(session, LogLevel.Wrn, suppress + ? "Turn cancelled for auto-resume after stall" + : "Turn cancelled by user"); + _manager.UpdateStatus(session, SessionStatus.Idle, suppressNotification: suppress); } catch (Exception ex) { @@ -128,6 +145,12 @@ public async Task RunTurnAsync(SessionVm session, string prompt, CancellationTok finally { session.CancellationSource = null; + session.CurrentTurnTask = null; + // Defensive cleanup: if the turn ended via a real ResultEvent rather than the + // OCE catch (e.g. the network recovered mid-cancel), the OCE handler never ran + // and the auto-resume flag is still set. Clearing here means the next legit + // turn-complete won't get its notification suppressed by a stale flag. + session.SuppressNextTurnCompleteNotification = false; // Release any in-flight permission prompts so the MCP handler unwinds and // claude doesn't sit on an orphaned request. 
permHandler?.CancelAll(); @@ -176,6 +199,13 @@ private void Handle( Dictionary liveByMessageId, PermissionTurnHandler? permHandler) { + // StallDetectionService checks how long ago the last event was — stamp on every + // type so a long-running tool (no text deltas for minutes) doesn't false-positive. + // Clearing IsStalled here also lets a session that briefly stalled then recovered + // drop out of needs-attention without manual intervention. + session.LastStreamEventAt = DateTime.UtcNow; + if (session.IsStalled) session.IsStalled = false; + switch (ev) { case SystemInitEvent init: @@ -215,6 +245,12 @@ private void Handle( break; case ResultEvent res: + // Record the result-arrival time before flipping status so StallDetectionService + // can detect the "result landed mid-cancel" race and skip a piggyback auto-resume. + session.LastResultEventAt = DateTime.UtcNow; + // A clean turn-complete resets the auto-resume retry budget so the next stall + // gets the same one-shot it would have on a fresh session. + session.AutoResumeAttempts = 0; var finalStatus = res.IsError && !IsInterrupt(res) ? SessionStatus.Error : SessionStatus.Idle; diff --git a/src/Conclave.App/MainWindow.axaml.cs b/src/Conclave.App/MainWindow.axaml.cs index ade3e87..58274c5 100644 --- a/src/Conclave.App/MainWindow.axaml.cs +++ b/src/Conclave.App/MainWindow.axaml.cs @@ -2,6 +2,7 @@ using Avalonia.Controls; using Conclave.App.Claude; using Conclave.App.Design; +using Conclave.App.Platform; using Conclave.App.Sessions; using Conclave.App.ViewModels; using Conclave.App.Views.Shell; @@ -17,6 +18,7 @@ public partial class MainWindow : Window private SessionManager? _manager; private AutoCleanupService? _autoCleanup; + private StallDetectionService? _stallDetection; private PermissionMcpServer? _permissions; private ShellVm? 
_shell; // Written on the UI thread from Activated/Deactivated, read by NotificationService @@ -86,13 +88,28 @@ public MainWindow() capabilities.BeginProbe(); _shell = new ShellVm(tokens, _manager, capabilities); StartupLog.Mark("MainWindow ctor: ShellVm built"); - _shell.SendRequested += (session, prompt) => claudeService.RunTurnAsync(session, prompt); + // Capture the in-flight task on the SessionVm so StallDetectionService can await + // it cleanly when cancelling for auto-resume. The fire-and-forget shape means the + // returned Task would otherwise be lost. + _shell.SendRequested += (session, prompt) => + { + var task = claudeService.RunTurnAsync(session, prompt); + session.CurrentTurnTask = task; + return task; + }; _shell.PropertyChanged += OnShellPropertyChanged; _autoCleanup = new AutoCleanupService(_manager); _shell.AutoCleanup = _autoCleanup; _autoCleanup.Start(); + // Detect stalled claude turns (e.g. after waking from sleep) and — if the user + // has opted in — silently send "continue" to resume them. Cross-platform wake + // detection via clock-jump heartbeat avoids any OS-specific bindings. + _stallDetection = new StallDetectionService(_manager, claudeService, new HeartbeatPowerService()); + _shell.StallDetection = _stallDetection; + _stallDetection.Start(); + DataContext = _shell; Activated += (_, _) => _isWindowActive = true; @@ -105,6 +122,7 @@ public MainWindow() // the MCP listener so an in-flight permission HTTP response can still be // written back. Closing the listener first races the response onto a closed // socket and claude sees a connection error instead of a clean deny. 
+ _stallDetection?.Dispose(); _autoCleanup?.Dispose(); _manager?.Dispose(); _permissions?.Dispose(); diff --git a/src/Conclave.App/Platform/HeartbeatPowerService.cs b/src/Conclave.App/Platform/HeartbeatPowerService.cs new file mode 100644 index 0000000..487b472 --- /dev/null +++ b/src/Conclave.App/Platform/HeartbeatPowerService.cs @@ -0,0 +1,58 @@ +namespace Conclave.App.Platform; + +// Cross-platform wake detector. Runs a low-frequency heartbeat loop and fires DeviceWoke +// whenever a tick lands significantly later than scheduled — a strong signal that the OS +// suspended the process (laptop sleep, Windows hybrid sleep, hibernation). Works on every +// platform without OS-specific bindings, which keeps the AOT story simple and avoids the +// brittle ObjC-block marshalling that NSWorkspaceDidWakeNotification would otherwise need. +// +// Trade-off: there's a small detection latency equal to TickInterval (1.5s). For waking +// from sleep that's imperceptible. False positives can occur if the process is starved of +// CPU for several seconds (debugger pause, severe GC); StallDetectionService treats wake +// events as a hint to scan, not a definitive stall signal, so this is harmless. +public sealed class HeartbeatPowerService : IPlatformPowerService +{ + private static readonly TimeSpan TickInterval = TimeSpan.FromSeconds(1.5); + + // A tick that takes more than this long beyond TickInterval is treated as a wake. + // 8s gives generous headroom for "the GC paused us" without missing actual sleeps, + // which on macOS routinely register as 30s+ jumps even on short lid-closes. + private static readonly TimeSpan WakeThreshold = TimeSpan.FromSeconds(8); + + private readonly CancellationTokenSource _cts = new(); + private Task? _loop; + + public event Action? 
DeviceWoke; + + public void Start() + { + if (_loop is not null) return; + _loop = Task.Run(() => RunAsync(_cts.Token)); + } + + private async Task RunAsync(CancellationToken ct) + { + var expected = DateTime.UtcNow + TickInterval; + while (!ct.IsCancellationRequested) + { + try { await Task.Delay(TickInterval, ct); } + catch (TaskCanceledException) { return; } + + var now = DateTime.UtcNow; + var drift = now - expected; + if (drift > WakeThreshold) + { + try { DeviceWoke?.Invoke(); } catch { /* never let a subscriber kill the loop */ } + } + // Re-base off "now" rather than expected+TickInterval so a single late tick + // doesn't keep firing wake events for several intervals after a long sleep. + expected = now + TickInterval; + } + } + + public void Dispose() + { + try { _cts.Cancel(); } catch { } + _cts.Dispose(); + } +} diff --git a/src/Conclave.App/Platform/IPlatformPowerService.cs b/src/Conclave.App/Platform/IPlatformPowerService.cs new file mode 100644 index 0000000..ce42006 --- /dev/null +++ b/src/Conclave.App/Platform/IPlatformPowerService.cs @@ -0,0 +1,11 @@ +namespace Conclave.App.Platform; + +// Cross-platform OS-power events. v1 only fires DeviceWoke (resume from sleep) — that's +// the signal StallDetectionService cares about, since a sleeping laptop is the dominant +// cause of a stalled claude turn. Implementations may fire the event from a non-UI +// thread; subscribers must marshal to the UI thread before mutating VMs. +public interface IPlatformPowerService : IDisposable +{ + event Action? DeviceWoke; + void Start(); +} diff --git a/src/Conclave.App/Sessions/Database.cs b/src/Conclave.App/Sessions/Database.cs index ea28569..9cd2e2d 100644 --- a/src/Conclave.App/Sessions/Database.cs +++ b/src/Conclave.App/Sessions/Database.cs @@ -115,6 +115,12 @@ UPDATE session_worktrees ) WHERE repo_path = ''; """), + (12, """ + -- Synthetic "continue" prompts that StallDetectionService sends when auto-resuming + -- a stalled session. 
Hidden in the transcript UI so the conversation reads + -- naturally; the assistant's resumed reply still shows. + ALTER TABLE messages ADD COLUMN is_auto_resume INTEGER NOT NULL DEFAULT 0; + """), }; // Explicit column lists so ordinal mapping in Read*() stays stable. @@ -126,7 +132,7 @@ UPDATE session_worktrees "claude_session_id, plan_json, permission_mode, total_cost_usd, pr_merged_at, " + "pending_preamble"; private const string MessageColumns = - "id, session_id, role, content, tools_json, created_at, seq, claude_uuid"; + "id, session_id, role, content, tools_json, created_at, seq, claude_uuid, is_auto_resume"; private Database(SqliteConnection conn) => _conn = conn; @@ -426,6 +432,7 @@ public IReadOnlyList GetMessages(string sessionId) CreatedAt = r.GetInt64(5), Seq = r.GetInt32(6), ClaudeUuid = Str(r, 7), + IsAutoResume = r.GetInt32(8) != 0, }; public int NextSeq(string sessionId) @@ -439,12 +446,13 @@ public int NextSeq(string sessionId) } public void InsertMessage(MessageRow m) => Exec( - "INSERT INTO messages (id, session_id, role, content, tools_json, created_at, seq, claude_uuid) " + - "VALUES ($id, $sessionId, $role, $content, $toolsJson, $createdAt, $seq, $claudeUuid);", + "INSERT INTO messages (id, session_id, role, content, tools_json, created_at, seq, claude_uuid, is_auto_resume) " + + "VALUES ($id, $sessionId, $role, $content, $toolsJson, $createdAt, $seq, $claudeUuid, $isAutoResume);", ("$id", m.Id), ("$sessionId", m.SessionId), ("$role", m.Role), ("$content", m.Content), ("$toolsJson", (object?)m.ToolsJson), ("$createdAt", m.CreatedAt), ("$seq", m.Seq), - ("$claudeUuid", (object?)m.ClaudeUuid)); + ("$claudeUuid", (object?)m.ClaudeUuid), + ("$isAutoResume", m.IsAutoResume ? 1 : 0)); public void UpdateMessageClaudeUuid(string id, string? 
uuid) => Exec( "UPDATE messages SET claude_uuid = $u WHERE id = $id;", diff --git a/src/Conclave.App/Sessions/MessageRow.cs b/src/Conclave.App/Sessions/MessageRow.cs index 67197bf..a1218b5 100644 --- a/src/Conclave.App/Sessions/MessageRow.cs +++ b/src/Conclave.App/Sessions/MessageRow.cs @@ -16,4 +16,7 @@ public sealed record MessageRow // user messages and for messages persisted before this column was added. Captured to // enable future fork-at-message paths that target claude's own JSONL session storage. public string? ClaudeUuid { get; init; } + // True for synthetic "continue" prompts injected by StallDetectionService when + // auto-resuming a stalled session. Used to hide the user bubble in the transcript. + public bool IsAutoResume { get; init; } } diff --git a/src/Conclave.App/Sessions/SessionManager.cs b/src/Conclave.App/Sessions/SessionManager.cs index ec5d9d9..6512bec 100644 --- a/src/Conclave.App/Sessions/SessionManager.cs +++ b/src/Conclave.App/Sessions/SessionManager.cs @@ -357,6 +357,7 @@ public void LoadTranscriptIfNeeded(SessionVm s) .ToLocalTime().ToString("HH:mm"), Content = row.Content, ClaudeUuid = row.ClaudeUuid, + IsAutoResume = row.IsAutoResume, }; if (!string.IsNullOrEmpty(row.ToolsJson)) { @@ -380,6 +381,7 @@ public void PersistMessage(SessionVm session, TranscriptMessageVm msg) CreatedAt = Database.Now(), Seq = _db.NextSeq(session.Id), ClaudeUuid = msg.ClaudeUuid, + IsAutoResume = msg.IsAutoResume, }); } @@ -459,7 +461,7 @@ private IEnumerable DeserializeTools(string json) } } - public void UpdateStatus(SessionVm s, SessionStatus status) + public void UpdateStatus(SessionVm s, SessionStatus status, bool suppressNotification = false) { var previous = s.Status; _db.UpdateSessionStatus(s.Id, status.ToString()); @@ -472,7 +474,10 @@ public void UpdateStatus(SessionVm s, SessionStatus status) BumpToTop(s); // Only notify on a real busy→done transition. 
App-load resets and session // creation also pass through here at Idle but stay Idle — those mustn't fire. - if (previous is SessionStatus.Working or SessionStatus.RunningTool) + // suppressNotification is set by StallDetectionService on the cancel-half of + // an auto-resume so the user doesn't get a "turn complete" toast for what is + // really just an internal restart. + if (!suppressNotification && previous is SessionStatus.Working or SessionStatus.RunningTool) Notifications?.NotifyTurnComplete(s.Title, status == SessionStatus.Error); } } diff --git a/src/Conclave.App/Sessions/SettingsKeys.cs b/src/Conclave.App/Sessions/SettingsKeys.cs index 87a11b6..d222df8 100644 --- a/src/Conclave.App/Sessions/SettingsKeys.cs +++ b/src/Conclave.App/Sessions/SettingsKeys.cs @@ -7,6 +7,7 @@ public static class SettingsKeys public const string AutoCleanupDays = "auto_cleanup.days"; public const string NotificationsEnabled = "notifications.enabled"; public const string ClaudeVersion = "claude.version"; + public const string AutoResumeStalledSessions = "stall_detection.auto_resume"; public const int DefaultAutoCleanupDays = 7; @@ -24,4 +25,9 @@ public static int ReadAutoCleanupDays(Database db) // explicit "false" disables notifications. public static bool ReadNotificationsEnabled(Database db) => !string.Equals(db.GetSetting(NotificationsEnabled), "false", StringComparison.OrdinalIgnoreCase); + + // Default-off: stall detection is always on (sessions get flagged as needs-attention), + // but Conclave only auto-sends "continue" when the user explicitly opts in. 
+ public static bool ReadAutoResumeStalledSessions(Database db) => + string.Equals(db.GetSetting(AutoResumeStalledSessions), "true", StringComparison.OrdinalIgnoreCase); } diff --git a/src/Conclave.App/Sessions/StallDetectionService.cs b/src/Conclave.App/Sessions/StallDetectionService.cs new file mode 100644 index 0000000..4f3a2a0 --- /dev/null +++ b/src/Conclave.App/Sessions/StallDetectionService.cs @@ -0,0 +1,236 @@ +using Conclave.App.Claude; +using Conclave.App.Platform; +using Conclave.App.ViewModels; + +namespace Conclave.App.Sessions; + +// Detects sessions whose claude stream has gone silent past a threshold while still +// nominally Working/RunningTool — typically caused by the laptop sleeping and the in-flight +// HTTPS stream dying without any visible event on our side. Surfaces them as IsStalled=true +// (the sidebar "Needs attention" filter picks them up). +// +// When the user has opted in via Preferences, the service additionally cancels the dead +// turn and sends a synthetic "continue" prompt so the conversation resumes silently. +// +// Cadence: a periodic scan every TickInterval, plus an immediate scan whenever the power +// service signals a wake event. Wake events use a much shorter silence threshold because +// a wake is a high-confidence indicator that the network was severed. +// +// Threading: the loop runs on a background task; status reads + VM mutations marshal back +// to the UI thread via Dispatcher.UIThread.InvokeAsync, mirroring the AutoCleanupService +// pattern. +public sealed class StallDetectionService : IDisposable +{ + private static readonly TimeSpan TickInterval = TimeSpan.FromSeconds(30); + private static readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(15); + // Periodic-scan threshold: a tool can legitimately run >60s without text deltas, so we + // err on the conservative side. Network blips that resolve themselves in <90s won't + // false-positive the user. 
+ private static readonly TimeSpan SilenceThreshold = TimeSpan.FromSeconds(90); + // Post-wake threshold: the OS just resumed, so any silence is almost certainly a dead + // socket. Shorter so the user sees the "Resume" affordance (or auto-resume kicks in) + // within seconds of unlocking the screen. + private static readonly TimeSpan PostWakeSilenceThreshold = TimeSpan.FromSeconds(20); + // The window during which a freshly-arrived ResultEvent vetoes auto-resume — the network + // recovered just as we tried to cancel and a real completion landed. Without this veto + // we'd pile a synthetic "continue" on top of a successful turn. + private static readonly TimeSpan ResultRaceWindow = TimeSpan.FromSeconds(2); + // Cap on the await for an in-flight turn to fully unwind after we cancel it. Past this + // we assume the subprocess is wedged and bail out (the kill in ClaudeClient's finally + // is already a belt-and-braces fallback for the SIGTERM path). + private static readonly TimeSpan CancelSettleTimeout = TimeSpan.FromSeconds(5); + // Brief debounce for back-to-back wake notifications. macOS occasionally fires the wake + // event multiple times for a single resume, and our HeartbeatPowerService can fire once + // too if the next tick is also late. + private static readonly TimeSpan WakeDebounce = TimeSpan.FromSeconds(3); + + private readonly SessionManager _manager; + private readonly ClaudeService _claudeService; + private readonly IPlatformPowerService _power; + private readonly CancellationTokenSource _cts = new(); + private Task? _loop; + + private DateTime _lastWakeUtc; + private DateTime _lastWakeHandledUtc; + // Single-flight guard: at most one auto-resume orchestration in flight across all + // sessions. Wake-storms or post-wake fan-outs would otherwise serialise themselves + // through the Database (UI thread) anyway, but bounding to one keeps the flow obvious. 
+ private int _resumeInFlight; + + public StallDetectionService(SessionManager manager, ClaudeService claudeService, IPlatformPowerService power) + { + _manager = manager; + _claudeService = claudeService; + _power = power; + } + + public void Start() + { + if (_loop is not null) return; + _power.DeviceWoke += OnDeviceWoke; + _power.Start(); + _loop = Task.Run(() => RunLoopAsync(_cts.Token)); + } + + private void OnDeviceWoke() + { + var now = DateTime.UtcNow; + if (now - _lastWakeUtc < WakeDebounce) return; + _lastWakeUtc = now; + // Best-effort kick — the periodic loop will follow up if this misses. + _ = Task.Run(() => ScanAsync(postWake: true, _cts.Token)); + } + + private async Task RunLoopAsync(CancellationToken ct) + { + try { await Task.Delay(StartupDelay, ct); } catch (TaskCanceledException) { return; } + while (!ct.IsCancellationRequested) + { + try { await ScanAsync(postWake: false, ct); } catch { /* never let the loop die */ } + try { await Task.Delay(TickInterval, ct); } catch (TaskCanceledException) { return; } + } + } + + private async Task ScanAsync(bool postWake, CancellationToken ct) + { + if (postWake) _lastWakeHandledUtc = DateTime.UtcNow; + var threshold = postWake ? PostWakeSilenceThreshold : SilenceThreshold; + var now = DateTime.UtcNow; + + // Snapshot candidates inside one UI-thread hop. Taking SessionVm references out is + // safe — they're long-lived and we re-check status under the dispatcher before + // mutating. 
+ var candidates = await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync(() => + { + var list = new List(); + foreach (var p in _manager.Projects) + foreach (var s in p.Sessions) + { + if (s.IsStalled) continue; // already flagged + if (s.Status is not (SessionStatus.Working or SessionStatus.RunningTool)) continue; + if (s.LastStreamEventAt == default) continue; // turn just started + if (now - s.LastStreamEventAt < threshold) continue; + list.Add(s); + } + return list; + }); + + if (candidates.Count == 0 || ct.IsCancellationRequested) return; + + foreach (var session in candidates) + { + if (ct.IsCancellationRequested) return; + + // Re-check on the UI thread immediately before flipping; the periodic-scan + // window can race with a real event arriving (e.g. tool just finished) and we + // don't want to flag a session that's already moved past silence. + var shouldFlag = await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync(() => + { + if (session.IsStalled) return false; + if (session.Status is not (SessionStatus.Working or SessionStatus.RunningTool)) return false; + if (DateTime.UtcNow - session.LastStreamEventAt < threshold) return false; + session.IsStalled = true; + return true; + }); + + if (!shouldFlag) continue; + + // Auto-resume eligibility: opt-in setting, only Working (not mid-tool, where + // resume could re-execute side effects), no composer text being typed, and + // we haven't already retried this stall once. + var shouldAutoResume = await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync(() => + SettingsKeys.ReadAutoResumeStalledSessions(_manager.Db) + && session.Status == SessionStatus.Working + && string.IsNullOrEmpty(session.ComposerDraft) + && session.AutoResumeAttempts < 1 + && DateTime.UtcNow - session.LastResultEventAt > ResultRaceWindow); + + if (shouldAutoResume) + { + await ResumeAsync(session, ignoreRetryCap: false); + } + } + } + + // Public so ShellVm.ResumeStalledSession (the manual button) can call it. 
Manual clicks + // pass ignoreRetryCap=true since the user explicitly asked. + public async Task ResumeAsync(SessionVm session, bool ignoreRetryCap) + { + // Single-flight: avoid two parallel resumes piling up if a wake event and the timer + // tick race. A second caller returns immediately without waiting — there's no work + // for it to do once the session has resumed (IsStalled clears). + if (Interlocked.Exchange(ref _resumeInFlight, 1) == 1) return; + try + { + // Final eligibility re-check on the UI thread. State may have changed since + // ScanAsync took its snapshot (user may have clicked Stop, network may have + // recovered). Capture the in-flight task while we're here so we can await it. + var prep = await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync<(bool Proceed, Task? Task)>(() => + { + if (!session.IsStalled) return (false, null); + if (!ignoreRetryCap && session.AutoResumeAttempts >= 1) return (false, null); + if (session.Status != SessionStatus.Working) return (false, null); + session.AutoResumeAttempts++; + session.SuppressNextTurnCompleteNotification = true; + try { session.CancellationSource?.Cancel(); } + catch (ObjectDisposedException) { /* race: already disposed */ } + return (true, session.CurrentTurnTask); + }); + + if (!prep.Proceed) return; + + // Wait for the cancelled turn to fully unwind. ClaudeClient kills the + // subprocess in its finally when the token is cancelled, so this should + // complete promptly even if the network is dead. + if (prep.Task is { } t) + { + try { await t.WaitAsync(CancelSettleTimeout); } + catch (TimeoutException) + { + // Subprocess wedged — give up and leave the session as needs-attention. + // Future scans won't retry because AutoResumeAttempts is now 1. 
+ return; + } + catch (OperationCanceledException) { /* expected */ } + catch { /* swallow — the OCE-handler already flipped status to Idle */ } + } + + // Race guard: a ResultEvent may have landed in the stream between Cancel() and + // the subprocess actually exiting. ClaudeService clears IsStalled on any event, + // so if it's false here the network recovered and the turn legitimately + // completed — sending "continue" would tack a fresh turn onto a successful one. + var stillStalled = await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync(() => + { + if (!session.IsStalled) + { + // Real completion landed; the suppression flag is no longer needed. + session.SuppressNextTurnCompleteNotification = false; + return false; + } + return true; + }); + if (!stillStalled) return; + + // Send the synthetic prompt. RunTurnAsync re-stamps LastStreamEventAt and + // clears IsStalled at the top, so by the time it returns control we're cleanly + // back in Working with a fresh stream. + await Avalonia.Threading.Dispatcher.UIThread.InvokeAsync(() => + { + var task = _claudeService.RunTurnAsync(session, "continue", isAutoResume: true); + session.CurrentTurnTask = task; + }); + } + finally + { + Interlocked.Exchange(ref _resumeInFlight, 0); + } + } + + public void Dispose() + { + try { _power.DeviceWoke -= OnDeviceWoke; } catch { } + try { _cts.Cancel(); } catch { } + _cts.Dispose(); + _power.Dispose(); + } +} diff --git a/src/Conclave.App/ViewModels/SessionVm.cs b/src/Conclave.App/ViewModels/SessionVm.cs index 05749da..a983c61 100644 --- a/src/Conclave.App/ViewModels/SessionVm.cs +++ b/src/Conclave.App/ViewModels/SessionVm.cs @@ -82,6 +82,7 @@ public SessionStatus Status Notify(nameof(StatusPulses)); Notify(nameof(StatusColor)); Notify(nameof(IsBusy)); + Notify(nameof(IsBusyAndNotStalled)); Notify(nameof(CanSend)); } } @@ -93,6 +94,10 @@ public SessionStatus Status public string StatusLabel => Status.Label(); public bool StatusPulses => Status.Pulses(); public bool IsBusy => 
Status is SessionStatus.Working or SessionStatus.RunningTool; + // The "Claude is thinking..." throbber should hide while a session is stalled — the + // stalled banner takes its slot. Exposed as a single boolean so the XAML doesn't need + // a MultiBinding (compiled bindings on bool conjunctions are awkward in Avalonia). + public bool IsBusyAndNotStalled => IsBusy && !_isStalled; private bool _isActive; public bool IsActive @@ -121,6 +126,49 @@ public bool IsActive // Set by ClaudeService while a turn is in flight; ShellVm.Cancel() pulls the plug on it. public CancellationTokenSource? CancellationSource { get; set; } + // The Task returned by ClaudeService.RunTurnAsync for the in-flight turn. Auto-resume + // awaits this with a timeout after cancelling, so we know the previous turn has fully + // unwound before sending "continue". Null when no turn is in flight. + public Task? CurrentTurnTask { get; set; } + + // UTC timestamp of the most recent stream event from the claude CLI for the current turn. + // StallDetectionService uses this to detect a stalled subprocess (e.g. after sleep/wake) + // when Status is Working/RunningTool but no events have arrived for some time. In-memory + // only — not persisted; on app restart the BuildSessionVm path resets in-progress turns + // to Idle anyway. + public DateTime LastStreamEventAt { get; set; } + + // UTC timestamp of the most recent ResultEvent (turn-complete signal). Used by + // StallDetectionService to suppress auto-resume races: if a result lands within the + // small window between cancellation and re-prompt, the network was clearly recovering + // and we shouldn't pile a synthetic "continue" on top of a real completion. + public DateTime LastResultEventAt { get; set; } + + // Number of times StallDetectionService has tried to auto-resume the current stall. + // Capped at 1 — after a second consecutive stall the session stays in IsStalled=true + // (needs-attention) and the user has to act. 
Reset to 0 on every successful turn-complete. + public int AutoResumeAttempts { get; set; } + + // Set by StallDetectionService just before cancelling a stalled turn for auto-resume. + // ClaudeService's OCE-handler reads this so it can pass suppressNotification=true to + // SessionManager.UpdateStatus, preventing a spurious "turn complete" toast for what + // is internally a restart. Cleared by ClaudeService after the cancellation is observed. + public bool SuppressNextTurnCompleteNotification { get; set; } + + // True when StallDetectionService has flagged the session as not having received stream + // events for the configured threshold while Status is Working/RunningTool. Surfaced as + // "Needs attention" via the sidebar filter and a Resume button on the row. Cleared when + // the turn either resumes (events start arriving again) or gets cancelled. + private bool _isStalled; + public bool IsStalled + { + get => _isStalled; + set + { + if (Set(ref _isStalled, value)) Notify(nameof(IsBusyAndNotStalled)); + } + } + private string _permissionMode = PermissionModes.Default; public string PermissionMode { diff --git a/src/Conclave.App/ViewModels/ShellVm.cs b/src/Conclave.App/ViewModels/ShellVm.cs index 305ce12..733cdc8 100644 --- a/src/Conclave.App/ViewModels/ShellVm.cs +++ b/src/Conclave.App/ViewModels/ShellVm.cs @@ -114,6 +114,10 @@ public void OpenNewFusionProject() public AutoCleanupService? AutoCleanup { get; set; } + // Wired by MainWindow at startup. The Resume button on a stalled session row routes + // here so the same orchestration (cancel-then-continue) is used for manual + auto. + public StallDetectionService? 
StallDetection { get; set; } + private bool _isPreferencesOpen; public bool IsPreferencesOpen { @@ -191,6 +195,17 @@ public bool NotificationsEnabled } } + public bool AutoResumeStalledEnabled + { + get => SettingsKeys.ReadAutoResumeStalledSessions(Manager.Db); + set + { + if (value == AutoResumeStalledEnabled) return; + Manager.Db.SetSetting(SettingsKeys.AutoResumeStalledSessions, value ? "true" : "false"); + Notify(); + } + } + public async Task RunCleanupNowAsync() { if (AutoCleanup is null) return; @@ -292,6 +307,13 @@ public void CancelActiveTurn() catch (ObjectDisposedException) { /* race: already completed */ } } + // Manual "Resume" button on a stalled session. Same orchestration as auto-resume but + // ignores the per-session retry cap — the user explicitly asked. + public void ResumeStalledSession(SessionVm session) + { + if (StallDetection is { } svc) _ = svc.ResumeAsync(session, ignoreRetryCap: true); + } + public ShellVm(Tokens tokens, SessionManager manager, ClaudeCapabilities claude) { Tokens = tokens; @@ -322,8 +344,8 @@ public ShellVm(Tokens tokens, SessionManager manager, ClaudeCapabilities claude) private void OnSessionPropertyChanged(object? sender, System.ComponentModel.PropertyChangedEventArgs e) { - // Status changes can move a session in/out of the current filter. - if (e.PropertyName == nameof(SessionVm.Status)) + // Status or IsStalled changes can move a session in/out of the current filter. + if (e.PropertyName == nameof(SessionVm.Status) || e.PropertyName == nameof(SessionVm.IsStalled)) { ApplyFilter(); RecalcFilterCounts(); @@ -392,8 +414,11 @@ private static bool MatchesFilter(FilterVm? 
filter, SessionVm s) => filter?.Label switch { null or "All sessions" => true, - "Running" => s.Status is SessionStatus.Working or SessionStatus.RunningTool, - "Needs attention" => s.Status is SessionStatus.Waiting or SessionStatus.Error, + "Running" => (s.Status is SessionStatus.Working or SessionStatus.RunningTool) && !s.IsStalled, + // IsStalled is an in-memory flag that overlays Working/RunningTool when the + // claude stream has been silent past the configured threshold. Treated as + // needs-attention so the user has one obvious place to find it. + "Needs attention" => s.Status is SessionStatus.Waiting or SessionStatus.Error || s.IsStalled, "Idle" => s.Status is SessionStatus.Idle or SessionStatus.Completed, _ => true, }; @@ -425,6 +450,14 @@ private void RecalcFilterCounts() foreach (var s in p.Sessions) { total++; + if (s.IsStalled) + { + // Stalled sessions are still nominally Working/RunningTool but should + // count under attention, not running, so the badges reflect what the + // sidebar filter would show. + attention++; + continue; + } switch (s.Status) { case SessionStatus.Working: diff --git a/src/Conclave.App/ViewModels/TranscriptMessageVm.cs b/src/Conclave.App/ViewModels/TranscriptMessageVm.cs index 3e3639b..47e1bcb 100644 --- a/src/Conclave.App/ViewModels/TranscriptMessageVm.cs +++ b/src/Conclave.App/ViewModels/TranscriptMessageVm.cs @@ -14,6 +14,13 @@ public sealed class TranscriptMessageVm : Views.Observable // persisted before this column existed. Lets future fork-at-message paths target the // CLI's JSONL session storage; not used directly by path A's synthetic-context fork. public string? ClaudeUuid { get; set; } + + // True for synthetic "continue" prompts injected by StallDetectionService when + // auto-resuming a stalled session. The user bubble is hidden in the transcript view + // and replaced with a small "— resumed after wake —" separator. Always false on + // assistant messages. 
+ public bool IsAutoResume { get; init; } + public ObservableCollection Tools { get; } = new(); private string _time = ""; @@ -27,7 +34,14 @@ public string Time public string Content { get => _content; - set { if (Set(ref _content, value)) Notify(nameof(HasContent)); } + set + { + if (Set(ref _content, value)) + { + Notify(nameof(HasContent)); + Notify(nameof(ShouldShowContent)); + } + } } public bool HasContent => !string.IsNullOrEmpty(_content); @@ -44,6 +58,7 @@ public bool ShowHeader { Notify(nameof(TopSpacing)); Notify(nameof(ContentTopMargin)); + Notify(nameof(ShouldShowHeader)); } } } @@ -59,4 +74,11 @@ public bool ShowHeader public bool IsAssistant => Role == MessageRole.Assistant; public string LabelPrefix => Role == MessageRole.User ? "You" : "Claude"; public string HeaderText => $"{LabelPrefix} · {_time}"; + + // The transcript template only shows the header/body when the row is a regular message; + // auto-resume rows render as a thin "resumed after wake" separator and nothing else. + // Computed in the VM rather than via a XAML MultiBinding because compiled bindings on + // bool conjunctions are awkward to express cleanly across Avalonia versions. + public bool ShouldShowHeader => _showHeader && !IsAutoResume; + public bool ShouldShowContent => HasContent && !IsAutoResume; } diff --git a/src/Conclave.App/Views/Shell/MainPane.axaml b/src/Conclave.App/Views/Shell/MainPane.axaml index dd48178..2755174 100644 --- a/src/Conclave.App/Views/Shell/MainPane.axaml +++ b/src/Conclave.App/Views/Shell/MainPane.axaml @@ -125,10 +125,27 @@ IsEnabled="{Binding !$parent[UserControl].DataContext.ActiveSession.IsBusy}" /> - + + + + + + + + IsVisible="{Binding ShouldShowHeader}"> @@ -137,10 +154,11 @@ + messages don't accumulate phantom line-height. Hidden for + auto-resume rows so the separator alone marks the seam. --> + IsVisible="{Binding ShouldShowContent}" /> @@ -202,10 +220,11 @@ + covers both the pre-stream wait and tool-running periods. 
Hidden when the + session is stalled — the stalled banner takes its slot. --> + IsVisible="{Binding ActiveSession.IsBusyAndNotStalled, FallbackValue=False}">