From 74f36de5c5cd192c4387453063d2cc6071b24349 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Wed, 20 May 2026 13:50:53 +0530
Subject: [PATCH 1/2] fix(observability): demote 11 custom-provider
 config-rejection shapes (Wave 4 Lane O)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend `is_provider_config_rejection_message` PHRASES with 8 new
substrings covering wire shapes that the Wave 1-3 phrases miss:

- `not available in your region`            — R1 (region block)
- `modelnotallowed`                          — R4 (Doubao/ChatGLM allowlist)
- `invalid_authentication_error`             — YC (user key rejected by upstream)
- `requires more credits`                    — S5 (OpenRouter 402 out-of-credits)
- `invalid model name passed in model=`      — Y0 (litellm proxy pre-routing reject)
- `no active credentials for provider`       — JN + KB (upstream API key gap)
- `litellm.badrequesterror`                  — JK (github_copilot OAuth gap)
- `not_found_error`                          — J2 + J5 + J4 (litellm envelope `type`)

Each is a deterministic user-state error (wrong model, wrong region, bad
key, out of credits, missing OAuth scope) — the reliable-provider stack
already falls back to OpenHuman's hosted tier, so the UX is intact; only
the Sentry spam was leaking. Closes ~250 events across 11 issue IDs.

Pinned tests against the literal Sentry event bodies from each ID so a
future provider rename doesn't silently un-classify them.

Closes OPENHUMAN-TAURI-R1
Closes OPENHUMAN-TAURI-R4
Closes OPENHUMAN-TAURI-YC
Closes OPENHUMAN-TAURI-S5
Closes OPENHUMAN-TAURI-Y0
Closes OPENHUMAN-TAURI-JN
Closes OPENHUMAN-TAURI-KB
Closes OPENHUMAN-TAURI-JK
Closes OPENHUMAN-TAURI-J2
Closes OPENHUMAN-TAURI-J5
Closes OPENHUMAN-TAURI-J4
---
 .../inference/provider/config_rejection.rs    | 133 +++++++++++++++++-
 1 file changed, 127 insertions(+), 6 deletions(-)

diff --git a/src/openhuman/inference/provider/config_rejection.rs b/src/openhuman/inference/provider/config_rejection.rs
index 542ef8723b..c64eaefa4c 100644
--- a/src/openhuman/inference/provider/config_rejection.rs
+++ b/src/openhuman/inference/provider/config_rejection.rs
@@ -15,15 +15,32 @@
 //!   Moonshot Kimi K2)
 //! - `"The model \`gpt-5.5\` does not exist or you do not have access to
 //!   it."` / `"model_not_found"` (stale model pin)
+//! - `"This model is not available in your region."` (R1 — region-blocked
+//!   model on a custom cloud provider)
+//! - `"ModelNotAllowed"` (R4 — Doubao/ChatGLM model-allowlist enforcement)
+//! - `"invalid_authentication_error"` (YC — user pasted a malformed /
+//!   revoked API key into the provider config)
+//! - `"This request requires more credits"` (S5 — OpenRouter `402` when
+//!   the user's account is out of credits)
+//! - `"Invalid model name passed in model="` (Y0 — litellm-style proxy
+//!   rejecting a model id pre-routing)
+//! - `"No active credentials for provider:"` (JN / KB — user hasn't
+//!   plugged in their API key for the selected provider yet)
+//! - `"litellm.BadRequestError"` (JK — litellm github_copilot proxy 400
+//!   from a user OAuth/scope gap)
+//! - `"not_found_error"` (J2 / J5 / J4 — litellm-compatible envelope
+//!   `type` field carrying "model 'X' not found")
 //!
 //! These are **deterministic user-configuration state**, not bugs the
 //! maintainers can act on: the user pointed OpenHuman at a custom
-//! provider with a model / temperature that provider does not accept. The
-//! remediation is "fix the model or routing in Settings", which the UI
-//! surfaces. Yet every agent turn produces a fresh Sentry event
-//! (OPENHUMAN-TAURI-WJ / -QW / -HB / -NH — 88 + 146 + 39 events). This is
-//! the same class as budget-exhaustion ([`super::billing_error`]) and
-//! must be demoted from Sentry to an info log the same way.
+//! provider with a model / temperature / region / credential that
+//! provider does not accept. The remediation is "fix the model, key, or
+//! routing in Settings", which the UI surfaces. Yet every agent turn
+//! produces a fresh Sentry event (OPENHUMAN-TAURI-WJ / -QW / -HB / -NH /
+//! -R1 / -R4 / -YC / -S5 / -Y0 / -JN / -KB / -JK / -J2 / -J5 / -J4 —
+//! ~250 additional events on top of the Wave 1-3 IDs). This is the
+//! same class as budget-exhaustion ([`super::billing_error`]) and must
+//! be demoted from Sentry to an info log the same way.
 //!
 //! ## Provider-aware polarity (important)
 //!
@@ -72,6 +89,53 @@ pub fn is_provider_config_rejection_message(body: &str) -> bool {
         // Our own actionable error once a proper tier→model resolution
         // is in place (keeps this classifier stable across that fix).
         "is an abstract tier",
+        // OPENHUMAN-TAURI-R1 — custom_openai upstream 403 with body
+        // `{"error":{"message":"This model is not available in your region.","code":403}}`.
+        // User picked a model the provider blocks for their account's
+        // region. Sentry has no remediation; user must switch model.
+        "not available in your region",
+        // OPENHUMAN-TAURI-R4 — Doubao / ChatGLM-style model allowlist
+        // enforcement. Body: `{"reason":"ModelNotAllowed",...}`. Match
+        // lowercased — the provider sends the camelCase token as a
+        // sentinel `reason` value.
+        "modelnotallowed",
+        // OPENHUMAN-TAURI-YC — user-supplied custom_openai API key was
+        // rejected by upstream with the OpenAI-compatible
+        // `{"error":{"type":"invalid_authentication_error",...}}`
+        // envelope. Anchored on the type token (stable across providers
+        // that emit this OpenAI-compatible body).
+        "invalid_authentication_error",
+        // OPENHUMAN-TAURI-S5 — OpenRouter 402 when the user is out of
+        // credits. Body always carries "requires more credits, or fewer
+        // max_tokens"; pin to the unique-enough credits phrase. (The
+        // separate `billing_error` classifier handles our own
+        // OpenHuman-backend balance gate; this catches the third-party
+        // OpenRouter shape that re-emits via `agent.run_single`.)
+        "requires more credits",
+        // OPENHUMAN-TAURI-Y0 — litellm-style proxy rejected the model
+        // id pre-routing with `Invalid model name passed in model=…`.
+        // Anchored on the `passed in model=` suffix so a stray "invalid
+        // model name" log line elsewhere does not classify.
+        "invalid model name passed in model=",
+        // OPENHUMAN-TAURI-JN / -KB — custom provider proxy that fronts
+        // multiple upstream APIs surfaces a "you haven't configured the
+        // upstream provider yet" 401/404 as `{"error":{"message":"No
+        // active credentials for provider: openai",...}}`. The
+        // remediation is "add the upstream API key in Settings".
+        "no active credentials for provider",
+        // OPENHUMAN-TAURI-JK — litellm github_copilot proxy 400 driven
+        // by the user's missing / expired Copilot OAuth scope. The body
+        // always starts with the `litellm.BadRequestError:` envelope.
+        // Anchor to that prefix-shaped substring so we don't catch
+        // unrelated 400s that merely mention litellm in passing.
+        "litellm.badrequesterror",
+        // OPENHUMAN-TAURI-J2 / -J5 / -J4 — litellm-compatible
+        // envelope with `"type":"not_found_error"` carrying "model 'X'
+        // not found". Distinct from the existing `model_not_found`
+        // phrase: that's the `code` field used by OpenAI-native bodies;
+        // this is the `type` field used by litellm/Anthropic-style
+        // envelopes for the same class of user-state error.
+        "not_found_error",
     ];
 
     let lower = body.to_ascii_lowercase();
@@ -102,6 +166,63 @@ mod tests {
         }
     }
 
+    #[test]
+    fn detects_wave4_sentry_bodies() {
+        // Real wire bodies pulled from the OPENHUMAN-TAURI-* Sentry
+        // events the Wave 4 phrases drop.
+        for (sentry_id, body) in [
+            (
+                "R1",
+                r#"custom_openai API error (403 Forbidden): {"error":{"message":"This model is not available in your region.","code":403}}"#,
+            ),
+            (
+                "R4",
+                r#"custom_openai API error (403 Forbidden): {"code":403,"reason":"ModelNotAllowed","message":"模型不允许访问","metadata":{"request_id":"2026051706431574423265420620337"}}"#,
+            ),
+            (
+                "YC",
+                r#"custom_openai API error (401 Unauthorized): {"error":{"message":"Invalid Authentication","type":"invalid_authentication_error"}}"#,
+            ),
+            (
+                "S5",
+                r#"custom_openai API error (402 Payment Required): {"error":{"message":"This request requires more credits, or fewer max_tokens. You requested up to 65536 tokens, but can only afford 597.","type":"insufficient_credits"}}"#,
+            ),
+            (
+                "Y0",
+                r#"custom_openai API error (400 Bad Request): {"error":{"message":"{'error': '/chat/completions: Invalid model name passed in model=reasoning-v1. Call `/v1/models` to view available models for your key.'}","type":"None"}}"#,
+            ),
+            (
+                "JN",
+                r#"custom_openai Responses API error: {"error":{"message":"No active credentials for provider: openai","type":"invalid_request_error","code":"model_not_found"}}"#,
+            ),
+            (
+                "KB",
+                r#"OpenHuman API error (404 Not Found): {"error":{"message":"No active credentials for provider: openai","type":"invalid_request_error","code":"model_not_found"}}"#,
+            ),
+            (
+                "JK",
+                r#"custom_openai API error (400 Bad Request): {"error":{"message":"litellm.BadRequestError: Github_copilotException - Bad Request. Received Model Group=github_copilot/claude-haiku-4.5\nAvailable Model Group Fallbacks=None","type":null}}"#,
+            ),
+            (
+                "J2",
+                r#"custom_openai Responses API error: {"error":{"message":"model 'llama3.3' not found","type":"not_found_error","param":null,"code":null}}"#,
+            ),
+            (
+                "J5",
+                r#"custom_openai API error (404 Not Found): {"error":{"message":"model 'llama3.3' not found","type":"not_found_error","param":null,"code":null}}"#,
+            ),
+            (
+                "J4",
+                r#"custom_openai streaming API error (404 Not Found): {"error":{"message":"model 'llama3.3' not found","type":"not_found_error","param":null,"code":null}}"#,
+            ),
+        ] {
+            assert!(
+                is_provider_config_rejection_message(body),
+                "OPENHUMAN-TAURI-{sentry_id} body must classify as provider config-rejection: {body:?}"
+            );
+        }
+    }
+
     #[test]
     fn detection_is_case_insensitive() {
         assert!(is_provider_config_rejection_message(

From e16414ac6a73659c24e2994819345695602e70c6 Mon Sep 17 00:00:00 2001
From: oxoxDev <nikhil@tinyhumans.ai>
Date: Wed, 20 May 2026 13:51:05 +0530
Subject: [PATCH 2/2] fix(observability): classify socket transport wire-shape
 variants (Wave 4 Lane N)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend `is_network_unreachable_message` with three substring arms for
wire shapes the existing `dns error` / status-bearing matchers miss:

- `failed to lookup address`   — libc `getaddrinfo()` rendering when
                                  tungstenite wraps the resolver fail as
                                  an `IO error` without the `dns error`
                                  prefix (OPENHUMAN-TAURI-44 ~50 events).
- `nodename nor servname`      — companion phrase from the macOS/BSD libc
                                  resolver — same OPENHUMAN-TAURI-44
                                  wire shape, second anchor.
- `http error: 200 ok`         — tungstenite's `WsError::Http(200)`
                                  rendering when a captive portal /
                                  corporate proxy intercepts the WS
                                  upgrade handshake and returns a plain
                                  HTML 200 page (OPENHUMAN-TAURI-4P
                                  ~66 events). Tungstenite-only — reqwest
                                  renders HTTP 200 as `HTTP status server
                                  error (200)` so there is no collision
                                  with the regular HTTP path.

A precedence test (`http_200_classifier_does_not_silence_unrelated_log_lines`)
pins the substring against benign `HTTP/1.1 200 OK` / `status: 200 OK`
prose so a future broadening does not silence success traces.

Sentry has no remediation path for any of these — the user must change
their network (firewall / proxy / DNS). Closes ~116 additional events.

Closes OPENHUMAN-TAURI-44
Closes OPENHUMAN-TAURI-4P
---
 src/core/observability.rs | 64 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/core/observability.rs b/src/core/observability.rs
index 2b285dcd2c..cd74790af7 100644
--- a/src/core/observability.rs
+++ b/src/core/observability.rs
@@ -296,15 +296,34 @@ fn is_loopback_unavailable(lower: &str) -> bool {
 /// through [`is_loopback_unavailable`] *before* this matcher so the
 /// boot-window race against the embedded core keeps its own bucket — see
 /// the precedence comment in [`expected_error_kind`].
+///
+/// Three additional substrings cover wire-shape variants observed in
+/// Wave 4 that the original `"dns error"` / status-code matchers miss:
+///
+/// - `"failed to lookup address"` / `"nodename nor servname"` —
+///   `getaddrinfo()` failure renderings on macOS / BSD libc and POSIX
+///   resolvers (`OPENHUMAN-TAURI-44` ~50 events,
+///   `[socket] Connection failed: WebSocket connect: IO error: failed to
+///   lookup address information: nodename nor servname provided, or not
+///   known`).
+/// - `"http error: 200 ok"` — tungstenite's `WsError::Http(200)` render
+///   when a corporate proxy / captive portal intercepts the WebSocket
+///   handshake and returns a plain HTML 200 page (`OPENHUMAN-TAURI-4P`
+///   ~66 events). Tungstenite-only — reqwest renders HTTP 200 as
+///   `"HTTP status server error (200)"`, so this can't collide with the
+///   regular HTTP call path.
 fn is_network_unreachable_message(lower: &str) -> bool {
     lower.contains("error sending request for url")
         || lower.contains("dns error")
+        || lower.contains("failed to lookup address")
+        || lower.contains("nodename nor servname")
         || lower.contains("connection refused")
         || lower.contains("connection reset")
         || lower.contains("network is unreachable")
         || lower.contains("no route to host")
         || lower.contains("tls handshake")
         || lower.contains("certificate verify failed")
+        || lower.contains("http error: 200 ok")
 }
 
 /// Detect transient upstream HTTP failures that have bubbled up out of the
@@ -1243,6 +1262,51 @@ mod tests {
         );
     }
 
+    #[test]
+    fn classifies_wave4_socket_transport_wire_shapes() {
+        // OPENHUMAN-TAURI-44 (~50 events): libc `getaddrinfo()` rendering
+        // without the `dns error` token, wrapped by the socket emit site.
+        // The Wave 4 matcher arms catch the literal resolver phrases that
+        // the original `dns error` substring would miss when reqwest's
+        // wrapper isn't in the chain (e.g. tungstenite IO errors).
+        assert_eq!(
+            expected_error_kind(
+                "[socket] Connection failed (sustained outage after 5 attempts): \
+                 WebSocket connect: IO error: failed to lookup address information: \
+                 nodename nor servname provided, or not known"
+            ),
+            Some(ExpectedErrorKind::NetworkUnreachable)
+        );
+
+        // OPENHUMAN-TAURI-4P (~66 events): tungstenite renders a captive
+        // portal / corporate proxy that intercepts the WS handshake as
+        // `WsError::Http(200)` → `"HTTP error: 200 OK"`. Classify as
+        // network-unreachable since no amount of app-side retry can pierce
+        // an intercepting proxy.
+        assert_eq!(
+            expected_error_kind(
+                "[socket] Connection failed (sustained outage after 5 attempts): \
+                 WebSocket connect: HTTP error: 200 OK"
+            ),
+            Some(ExpectedErrorKind::NetworkUnreachable)
+        );
+    }
+
+    #[test]
+    fn http_200_classifier_does_not_silence_unrelated_log_lines() {
+        // The captive-portal arm anchors on `"http error: 200 ok"` (the
+        // exact tungstenite `WsError::Http(200)` Display rendering).
+        // Adjacent non-WebSocket log lines that mention `"HTTP/1.1 200 OK"`
+        // or `"status: 200 OK"` MUST NOT classify — those are normal-flow
+        // success traces, not failure events. Pin this precedence so a
+        // future refactor doesn't broaden the substring.
+        assert_eq!(expected_error_kind("HTTP/1.1 200 OK"), None);
+        assert_eq!(
+            expected_error_kind("upstream returned status: 200 OK after retry"),
+            None
+        );
+    }
+
     #[test]
     fn classifies_transient_upstream_http_errors() {
         // OPENHUMAN-TAURI-5Z: the canonical shape emitted by