From 80495a7bd3d1eeddd80367820475577856eaad06 Mon Sep 17 00:00:00 2001 From: Ghost Scripter Date: Thu, 28 May 2026 01:31:10 +0530 Subject: [PATCH] fix(observability): classify 'operation timed out' as expected transport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The channel supervisor wraps a listener failure as `format!("Channel {} error: {e:#}; restarting", ch.name())` and routes the result through `report_error_or_expected`. When the discord gateway TCP/WebSocket socket hits `ETIMEDOUT`, the anyhow chain renders without a URL anchor (this is `std::io`-level, below reqwest) and previously fell straight through every classifier arm into `report_error` — one Sentry event per backoff cycle. `TRANSIENT_TRANSPORT_PHRASES` and `contains_transient_transport_phrase` already treat `"operation timed out"` as transient at other emit sites (`authed_json` transport branch, `is_transient_message_failure`), but `expected_error_kind` — the funnel `report_error_or_expected` uses — never consulted that list. Closing the gap in `is_network_unreachable_message` keeps the classifier's per-anchor structure intact and is symmetric with `"connection refused"` / `"connection reset"` (no errno suffix pinned — `(os error 60)` BSD/macOS, `(os error 110)` Linux, `(os error 10060)` Windows `WSAETIMEDOUT`, and bare prose differ only after the anchor phrase). Caveat for review: the prose in front of the errno is supplied by the OS, and glibc words `ETIMEDOUT` as `Connection timed out`, so Linux and Windows coverage should be confirmed against real events before relying on this anchor there. 
Targets Sentry OPENHUMAN-TAURI-EM (issue 608): 128 events between 2026-05-19 and 2026-05-27, all from `logger=openhuman_core::openhuman::channels::runtime::supervision`, canonical body: Channel discord error: IO error: Operation timed out (os error 60); restarting Tests pin the macOS / Linux / Windows wire shapes (so a future platform-specific change cannot silently re-open the leak), the provider-agnostic supervisor wrapper (`Channel slack error: ...`, `Channel telegram error: ...`), and a counter-example (`"timeout"` mentioned as a config-knob name, no `"operation timed out"` anchor) to confirm the matcher stays specific. --- src/core/observability.rs | 71 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/core/observability.rs b/src/core/observability.rs index f1a58c543..aed76a2b4 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -407,6 +407,22 @@ fn is_network_unreachable_message(lower: &str) -> bool { || lower.contains("nodename nor servname") || lower.contains("connection refused") || lower.contains("connection reset") + // OPENHUMAN-TAURI-EM (128 events): the channel supervisor wraps + // `discord_listen()`'s anyhow chain as `format!("Channel {} error: + // {e:#}; restarting", ...)`, which lands as + // `"Channel discord error: IO error: Operation timed out (os error + // 60); restarting"`. The discord gateway TCP/WebSocket connection + // timing out is transient network state, not a code bug — the + // supervisor already retries with exponential backoff. Same shape + // surfaces on every channel (slack/telegram/...) once the + // underlying socket hits ETIMEDOUT, so we match on the platform- + // agnostic phrase, symmetric with `"connection reset"` / + // `"connection refused"` above. 
Errno digits are not pinned: + // `(os error 60)` (BSD/macOS), `(os error 110)` (Linux), `(os error 10060)` + // (Windows `WSAETIMEDOUT`) and the bare prose `"operation timed out"` + // (hyper / tungstenite / std::io) only differ after the anchor phrase. + // NOTE(review): glibc words ETIMEDOUT "Connection timed out" — confirm Linux/Windows coverage. + || lower.contains("operation timed out") || lower.contains("network is unreachable") || lower.contains("no route to host") || lower.contains("tls handshake") @@ -1671,6 +1687,61 @@ mod tests { ); } + #[test] + fn channel_supervisor_operation_timed_out_classifies_as_expected() { + // OPENHUMAN-TAURI-EM (128 events): `channels::runtime::supervision` + // wraps a channel listener failure as + // `format!("Channel {} error: {e:#}; restarting", ch.name())` and + // routes the message through `report_error_or_expected`. When the + // discord gateway TCP/WebSocket connection hits ETIMEDOUT, the + // anyhow chain renders without a URL anchor (this is `std::io`-level, + // not reqwest) and previously fell straight through every classifier + // arm into `report_error` — one Sentry event per restart cycle. + // + // Pin the exact macOS wire shape from the issue, plus the Linux and + // Windows errno renderings so a future platform-specific change does + // not silently re-open the leak. The bare `"operation timed out"` + // anchor matches all three since the errno digits live downstream + // of the canonical phrase. + for raw in [ + // macOS (os error 60 = ETIMEDOUT on BSD) + "Channel discord error: IO error: Operation timed out (os error 60); restarting", + // Linux (os error 110 = ETIMEDOUT) + "Channel discord error: IO error: Operation timed out (os error 110); restarting", + // Windows (os error 10060 = WSAETIMEDOUT) + "Channel discord error: IO error: Operation timed out (os error 10060); restarting", + // Same shape on other channels — supervisor wrapper is provider-agnostic. 
+ "Channel slack error: IO error: Operation timed out (os error 60); restarting", + "Channel telegram error: IO error: Operation timed out (os error 110); restarting", + // Bare prose form (no errno suffix) from hyper / tungstenite layers + // that render `std::io::Error` without `raw_os_error()`. + "Channel discord error: WebSocket connect: IO error: Operation timed out; restarting", + ] { + assert_eq!( + expected_error_kind(raw), + Some(ExpectedErrorKind::NetworkUnreachable), + "channel supervisor timeout shape must classify as expected (got {:?} for {raw:?})", + expected_error_kind(raw) + ); + } + } + + #[test] + fn operation_timed_out_negative_cases_still_report() { + // Counter-case: a configuration/validation message that mentions + // "timeout" as a knob name (not transport state) and has no other + // classifier anchor must still reach Sentry. The substring chosen + // for the new matcher is `"operation timed out"`, not `"timeout"`, + // precisely so unrelated mentions of the word do not collide. + assert_eq!( + expected_error_kind("config rejected: timeout must be a positive integer"), + None, + "config validation noise (no 'operation timed out' anchor) must still reach Sentry" + ); + // Bare empty string — no anchors at all. + assert_eq!(expected_error_kind(""), None); + } + #[test] fn channels_dispatch_re_emit_of_provider_502_classifies_as_transient() { // OPENHUMAN-TAURI-4F (~157 events) / -1C (~87 events) / -8F