From aed4bf62ae0ddd0a637f65f450178dd4d4dcb9cc Mon Sep 17 00:00:00 2001 From: Mark Baker Date: Tue, 17 Mar 2026 00:00:00 -0400 Subject: [PATCH] fix: stamp last_active before LLM call to prevent mid-iteration heartbeat timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slow local models (e.g. 27B quantised MLX models) can take 3–4+ minutes per iteration, well beyond the default 180s heartbeat timeout. Because last_active was only updated at the end of an iteration — never during it — the heartbeat monitor would flag the agent as unresponsive mid-call and initiate crash/recovery while the loop was still running correctly. Changes: - Add `touch()` to `AgentRegistry`: refreshes `last_active` with no other side-effects. - Add `touch_agent(&self, agent_id: &str)` to `KernelHandle` trait with a default no-op, so existing mock implementations require no changes. - Implement `touch_agent` on `OpenFangKernel`: parses the UUID and delegates to `registry.touch()`. - Call `kernel.touch_agent(agent_id)` at the top of each agent loop iteration, immediately before the `call_with_retry` LLM call. This resets the inactivity clock at the start of every iteration rather than only at completion. Note: the stamp is applied once per iteration, so a single LLM call that itself runs longer than the heartbeat timeout may still be flagged; such setups may additionally need a larger timeout.
Co-Authored-By: Claude Sonnet 4.6 --- crates/openfang-kernel/src/kernel.rs | 6 ++++++ crates/openfang-kernel/src/registry.rs | 8 ++++++++ crates/openfang-runtime/src/agent_loop.rs | 6 ++++++ crates/openfang-runtime/src/kernel_handle.rs | 6 ++++++ 4 files changed, 26 insertions(+) diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index 5e582d048..ca96e4602 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -5715,6 +5715,12 @@ impl KernelHandle for OpenFangKernel { .collect() } + fn touch_agent(&self, agent_id: &str) { + if let Ok(id) = agent_id.parse::<AgentId>() { + self.registry.touch(id); + } + } + fn kill_agent(&self, agent_id: &str) -> Result<(), String> { let id: AgentId = agent_id .parse() diff --git a/crates/openfang-kernel/src/registry.rs b/crates/openfang-kernel/src/registry.rs index b3c3a4962..0852a658e 100644 --- a/crates/openfang-kernel/src/registry.rs +++ b/crates/openfang-kernel/src/registry.rs @@ -235,6 +235,14 @@ impl AgentRegistry { Ok(()) } + /// Touch an agent — refresh last_active without changing any other state. + /// Used by the agent loop to prevent heartbeat false-positives during long LLM calls. + pub fn touch(&self, id: AgentId) { + if let Some(mut entry) = self.agents.get_mut(&id) { + entry.last_active = chrono::Utc::now(); + } + } + /// Update an agent's system prompt (hot-swap, takes effect on next message). pub fn update_system_prompt(&self, id: AgentId, new_prompt: String) -> OpenFangResult<()> { let mut entry = self diff --git a/crates/openfang-runtime/src/agent_loop.rs b/crates/openfang-runtime/src/agent_loop.rs index 2fd481d68..9830b6287 100644 --- a/crates/openfang-runtime/src/agent_loop.rs +++ b/crates/openfang-runtime/src/agent_loop.rs @@ -348,6 +348,12 @@ pub async fn run_agent_loop( cb(LoopPhase::Thinking); } + // Stamp last_active before the (potentially long) LLM call so the + // heartbeat monitor doesn't flag us as unresponsive mid-iteration. 
+ if let Some(k) = &kernel { + k.touch_agent(&agent_id_str); + } + // Call LLM with retry, error classification, and circuit breaker let provider_name = manifest.model.provider.as_str(); let mut response = call_with_retry(&*driver, request, Some(provider_name), None).await?; diff --git a/crates/openfang-runtime/src/kernel_handle.rs b/crates/openfang-runtime/src/kernel_handle.rs index 00f195599..e3e1b7633 100644 --- a/crates/openfang-runtime/src/kernel_handle.rs +++ b/crates/openfang-runtime/src/kernel_handle.rs @@ -238,6 +238,12 @@ pub trait KernelHandle: Send + Sync { Err("Channel file data send not available".to_string()) } + /// Refresh an agent's last_active timestamp without changing any other state. + /// Called by the agent loop before long LLM calls to prevent heartbeat false-positives. + fn touch_agent(&self, agent_id: &str) { + let _ = agent_id; + } + /// Spawn an agent with capability inheritance enforcement. /// `parent_caps` are the parent's granted capabilities. The kernel MUST verify /// that every capability in the child manifest is covered by `parent_caps`.