From 2736b2c26952d2dd9af6f56abb5de641c4631595 Mon Sep 17 00:00:00 2001 From: Mark Baker Date: Tue, 17 Mar 2026 00:00:00 -0400 Subject: [PATCH] fix: exempt non-autonomous agents from heartbeat inactivity timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reactive (non-autonomous) agents wait indefinitely for incoming messages and have no expected self-trigger schedule. Applying an inactivity timeout to them was incorrect — they would be flagged as unresponsive after the default 180s simply for being idle, causing unnecessary crash/recovery cycles. The fix makes timeout behaviour conditional on agent type: - Autonomous agents retain the `heartbeat_interval_secs × 2` inactivity check, which is meaningful because they are expected to fire periodically. - Non-autonomous agents are only flagged when their state is `Crashed`; idle time is irrelevant and no longer checked. Because the fallback to `config.default_timeout_secs` is removed, `check_agents` no longer reads its config argument, and the parameter is renamed to `_config`. Co-Authored-By: Claude Sonnet 4.6 --- crates/openfang-kernel/src/heartbeat.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/crates/openfang-kernel/src/heartbeat.rs b/crates/openfang-kernel/src/heartbeat.rs index f682157f9..d4b693b25 100644 --- a/crates/openfang-kernel/src/heartbeat.rs +++ b/crates/openfang-kernel/src/heartbeat.rs @@ -130,7 +130,7 @@ impl Default for RecoveryTracker { /// /// This is a pure function — it doesn't start a background task. /// The caller (kernel) can run this periodically or in a background task. 
-pub fn check_agents(registry: &AgentRegistry, config: &HeartbeatConfig) -> Vec { +pub fn check_agents(registry: &AgentRegistry, _config: &HeartbeatConfig) -> Vec { let now = Utc::now(); let mut statuses = Vec::new(); @@ -143,16 +143,19 @@ pub fn check_agents(registry: &AgentRegistry, config: &HeartbeatConfig) -> Vec = entry_ref .manifest .autonomous .as_ref() - .map(|a| a.heartbeat_interval_secs * UNRESPONSIVE_MULTIPLIER) - .unwrap_or(config.default_timeout_secs) as i64; + .map(|a| (a.heartbeat_interval_secs * UNRESPONSIVE_MULTIPLIER) as i64); - // Crashed agents are always considered unresponsive - let unresponsive = entry_ref.state == AgentState::Crashed || inactive_secs > timeout_secs; + let unresponsive = match timeout_secs { + Some(t) => entry_ref.state == AgentState::Crashed || inactive_secs > t, + None => entry_ref.state == AgentState::Crashed, + }; if unresponsive && entry_ref.state == AgentState::Running { warn!(