From 34ed335b9e26219b8e66ce5bde8d8975151aaac4 Mon Sep 17 00:00:00 2001 From: "cyrus@tinyhumans.ai" Date: Fri, 29 May 2026 22:06:09 +0530 Subject: [PATCH] revert: remove all Sentry error suppression from PRs #2813 #2850 #2899 #2906 #2915 #2923 #2924 #2928 #2930 #2934 #2937 #2939 These PRs suppressed/demoted errors instead of fixing root causes. Errors will now properly fire Sentry events again so they can be tracked and fixed with proper root-cause solutions. --- src/core/observability.rs | 848 +----------------- src/main.rs | 52 -- src/openhuman/inference/ops.rs | 42 +- src/openhuman/inference/ops_tests.rs | 70 -- .../inference/provider/config_rejection.rs | 226 +---- src/openhuman/inference/provider/ops.rs | 176 +--- src/openhuman/memory_store/factories.rs | 35 +- src/openhuman/subconscious/store.rs | 14 +- 8 files changed, 55 insertions(+), 1408 deletions(-) diff --git a/src/core/observability.rs b/src/core/observability.rs index 3bcb564e02..9d46e77203 100644 --- a/src/core/observability.rs +++ b/src/core/observability.rs @@ -181,26 +181,6 @@ pub enum ExpectedErrorKind { /// deliberately not matched because that's an `rm -rf` invariant /// violation, not user input. FilesystemUserPathInvalid, - /// A memory-store write (document upsert or KV set) was rejected because - /// the namespace or key contained what the PII guard classified as a - /// personal identifier (national ID, phone number, formatted credential, - /// etc.). The guard fires *before* the write reaches SQLite so no data - /// is persisted, and the LLM or caller that triggered the write already - /// receives the error string. Sentry has no remediation path — the fix - /// is either a less aggressive namespace/key choice from the caller or a - /// PII-guard allowlist update — and the volume is high from a single user - /// (TAURI-RUST-54T: 915 events, escalating), indicating that the guard - /// is flagging false positives on valid channel names or usernames used - /// as namespace/key identifiers. Demote to `warn` so the breadcrumb - /// survives for local diagnosis but Sentry sees no error event. - /// - /// Canonical wire shapes (from `memory_store/unified/documents.rs` and - /// `memory_store/kv.rs`): - /// - /// - `"document namespace/key cannot contain personal identifiers"` - /// - `"kv key cannot contain personal identifiers"` - /// - `"kv namespace/key cannot contain personal identifiers"` - MemoryStorePiiRejection, /// The provider/model completed a turn with a completely empty body /// (`text_chars=0 thinking_chars=0 tool_calls=0`), so the agent harness /// bailed with the user-facing `"The model returned an empty response. @@ -253,17 +233,7 @@ pub fn expected_error_kind(message: &str) -> Option { if lower.contains("local ai is disabled") { return Some(ExpectedErrorKind::LocalAiDisabled); } - // `_api_key is not configured` catches backend-reported environment variable - // phrases like `VOYAGE_API_KEY is not configured` and - // `COHERE_API_KEY is not configured` returned by the embeddings backend - // when the relevant env var is absent (TAURI-RUST-2H5, ~5 K events). - // The `_api_key` anchor (lower-cased suffix of an env-var name) keeps - // generic "X is not configured" prose from being silenced — only - // ALL_CAPS_API_KEY-style names match. - if lower.contains("api key not set") - || lower.contains("missing api key") - || lower.contains("_api_key is not configured") - { + if lower.contains("api key not set") || lower.contains("missing api key") { return Some(ExpectedErrorKind::ApiKeyMissing); } // Check `ChannelSupervisorRestart` BEFORE `is_loopback_unavailable` and @@ -370,9 +340,6 @@ pub fn expected_error_kind(message: &str) -> Option { if is_config_load_timed_out_message(&lower) { return Some(ExpectedErrorKind::ConfigLoadTimedOut); } - if is_memory_store_pii_rejection(&lower) { - return Some(ExpectedErrorKind::MemoryStorePiiRejection); - } // Empty-provider-response re-report from the web-channel layer. Runs // last so an earlier, more specific matcher always wins. See the // variant doc-comment and [`is_empty_provider_response_message`] for @@ -389,86 +356,9 @@ pub fn expected_error_kind(message: &str) -> Option { if is_filesystem_user_path_invalid_message(&lower) { return Some(ExpectedErrorKind::FilesystemUserPathInvalid); } - // Upstream rate-limit responses — provider throttles the account (429) or - // wraps the 429 inside an HTTP 500 (`"429 rate limit exceeded"` in the - // body). In both cases the reliable-provider layer already retries with - // backoff, and the embeddings path has a proactive token-bucket limiter - // (`embeddings::rate_limit`). The upstream quota is an account-capacity - // signal, not a code bug — Sentry has no remediation path and the - // per-attempt events generate pure noise (OPENHUMAN-TAURI-S: ~6 984 - // events from HTTP 500 wrapping a "429 rate limit exceeded" body; - // OPENHUMAN-TAURI-6Y: ~19 849 events from direct 429s; OPENHUMAN-TAURI-2E: - // ~1 482 events carrying a `"rate_limit_error"` type in the JSON body; - // OPENHUMAN-TAURI-RQ: ~741 events from the embeddings path). - // - // Checked LAST inside `expected_error_kind` — transient HTTP status matches - // (`is_transient_upstream_http_message`) are already caught by the earlier - // arm, so this arm only adds coverage for the 500-wrapping-429 body shape - // and provider JSON envelopes that name the error type explicitly. - if is_upstream_rate_limit_message(&lower) { - return Some(ExpectedErrorKind::TransientUpstreamHttp); - } None } -/// Detect upstream rate-limit error bodies that bubble up from any provider -/// or embedding API call site. -/// -/// Covers three observed wire shapes: -/// -/// 1. **OpenAI / Anthropic JSON body** — `"rate_limit_error"` is the `"type"` -/// field in the structured error object: -/// `{"error":{"message":"Rate limit exceeded.","type":"rate_limit_error"}}` -/// (OPENHUMAN-TAURI-2E / -RQ). -/// -/// 2. **OpenHuman backend wrapping upstream** — `"Upstream rate limit exceeded -/// for model 'summarization-v1'. Please retry shortly."` embedded in a 500 -/// response body (OPENHUMAN-TAURI-6Y / -7H). -/// -/// 3. **Plain phrase** — `"429 rate limit exceeded, please try again later"` / -/// `"rate limit exceeded"` from any other upstream (OPENHUMAN-TAURI-S). -/// -/// The match is against the full lowercased error string (including any -/// caller wrapping prefix), so it survives `agent.run_single` / `rpc.invoke_method` -/// re-reports as well as the original call-site emit. -/// -/// **Polarity contract**: this predicate is *inclusive* — it returns `true` -/// only for messages that are unambiguously rate-limit throttle signals. It -/// must NOT match unrelated errors that incidentally mention "limit" or "rate" -/// (e.g. action-budget `"Rate limit exceeded: action budget exhausted"` -/// from `security::policy` — distinguished by the `"action budget"` anchor). -pub fn is_upstream_rate_limit_message(lower: &str) -> bool { - // `"rate_limit_error"` is the structured error type from OpenAI / Anthropic - // compatible APIs. Tight anchor — colons and underscores don't appear in - // ordinary log text. - if lower.contains("rate_limit_error") { - return true; - } - // `"upstream rate limit exceeded"` is the OpenHuman backend's own phrase - // when it wraps an upstream provider 429 as an HTTP 500. - if lower.contains("upstream rate limit exceeded") { - return true; - } - // `"429 rate limit exceeded"` is the numeric-prefix form emitted by some - // backends (e.g. OPENHUMAN-TAURI-S: `"error":"429 rate limit exceeded"`). - // Anchored on the `"429 rate limit"` substring so a plain `"rate limit - // exceeded"` mention (which could appear in the `security::policy` action- - // budget message) is NOT matched here — the next arm handles clean phrase - // matches only when scoped by a provider API error prefix. - if lower.contains("429 rate limit") { - return true; - } - // `"rate limit exceeded"` on its own is matched ONLY when it appears inside - // a canonical provider API error envelope (`"api error ("` prefix from - // `ops::api_error` / `embeddings::openai`). This keeps the security::policy - // `"Rate limit exceeded: action budget exhausted"` message from being - // silently swallowed — that phrase does not carry an API error prefix. - if lower.contains("api error (") && lower.contains("rate limit exceeded") { - return true; - } - false -} - /// Detect filesystem-out-of-space errors that bubble up from any syscall /// (`open`, `write`, `mkdir`, `rename`). Three platform-stable renderings: /// @@ -583,18 +473,6 @@ fn is_memory_store_breaker_open(lower: &str) -> bool { /// `"session JWT required"` — local pre-flight guards that fire when the /// stored profile is empty (`#1465`-ish onboarding spam) or has been /// cleared by a previous 401 cycle. Both shapes are OpenHuman-specific. -/// - `"backend rejected session token on GET /payments/stripe/currentPlan"` and -/// all analogous `"{METHOD} {path}"` variants — the `BackendApiError::Unauthorized` -/// typed error surfaced by `api::rest::BackendOAuthClient::authed_json` when any -/// OpenHuman REST endpoint returns HTTP 401. The `get_authed_value` wrapper in -/// `billing::ops` stringifies this via `.to_string()`, producing the -/// `"backend rejected session token on …"` prefix. This is uniquely scoped to -/// the `BackendApiError::Unauthorized` variant (the phrase does not appear in -/// any third-party provider error path) so it is safe to classify as session -/// expiry without the conjunctive-anchor guard pattern needed for `"Invalid -/// token"`. Targets TAURI-RUST-E (~1 437 events from -/// `openhuman.billing_get_current_plan` polling on every background billing -/// refresh cycle after the user's JWT lapses). /// /// At the JSON-RPC dispatch boundary the same strict match controls /// `DomainEvent::SessionExpired` publication, so downstream/provider 401s stay @@ -607,14 +485,6 @@ pub fn is_session_expired_message(msg: &str) -> bool { || msg.contains("SESSION_EXPIRED") || (msg.contains("OpenHuman API error (401") && msg.contains("\"error\":\"Invalid token\"")) || (msg.contains("Embedding API error (401") && msg.contains("\"error\":\"Invalid token\"")) - // TAURI-RUST-E — billing endpoint 401s via `BackendApiError::Unauthorized` - // stringified by `billing::ops::get_authed_value(..).map_err(|e| e.to_string())`. - // The display form is `"backend rejected session token on {METHOD} {path}"`; - // the phrase is uniquely scoped to `BackendApiError::Unauthorized` so no - // conjunctive guard is needed. Covers all billing RPC methods - // (billing_get_current_plan, billing_get_balance, etc.) and any other - // `authed_json` caller that stringifies via `.to_string()`. - || lower.contains("backend rejected session token") // OPENHUMAN-TAURI-4P0 — OpenHuman backend's "Invalid token" 401 // envelope. Both anchors must be present: the OpenHuman-scoped // `"OpenHuman API error (401"` prefix (so a third-party provider's @@ -1118,28 +988,8 @@ fn is_provider_user_state_message(lower: &str) -> bool { // // Drops Sentry TAURI-RUST-X9 (~15.7 k events / ~22 h, single user, // release openhuman@0.54.0+c25fc8e5fd3e). - // - // TAURI-RUST-322 (#2929): same direct-mode path but the Composio v3 - // `/connected_accounts` API returns HTTP 403 instead of 401. This - // happens when the BYO API key exists and is syntactically valid but - // does not carry the `connected_accounts:read` permission (e.g. a - // scoped or legacy key). Wire shape: - // - // `[composio-direct] list_connections failed: Composio v3 - // connected_accounts failed: HTTP 403` - // - // 403 from Composio v3 is a user-state condition (key permissions), - // not a bug in openhuman_core. Sentry has no remediation path — the - // user must regenerate their key with the correct scopes on - // app.composio.dev. The polling layer retries every 5 s and the UI - // already surfaces the error; flooding Sentry with 1,000+ events per - // user adds no signal. - // - // Drops Sentry TAURI-RUST-322 (1,021 events, multi-release). if lower.contains("[composio-direct]") - && (lower.contains("http 401") - || lower.contains("http 403") - || lower.contains("invalid api key")) + && (lower.contains("http 401") || lower.contains("invalid api key")) { return true; } @@ -1264,31 +1114,6 @@ fn is_filesystem_user_path_invalid_message(lower: &str) -> bool { || lower.contains("hosted path is not a directory:") } -/// Detect memory-store writes rejected because the namespace or key contained -/// a personal identifier detected by the PII guard. -/// -/// The three canonical wire shapes are emitted by -/// `memory_store/unified/documents.rs` and `memory_store/kv.rs`: -/// -/// - `"document namespace/key cannot contain personal identifiers"` — -/// `upsert_document` / `upsert_document_metadata_only` -/// - `"kv key cannot contain personal identifiers"` — `kv_set_global` -/// - `"kv namespace/key cannot contain personal identifiers"` — `kv_set_namespace` -/// -/// These are expected user-content conditions: the PII guard classifies a -/// channel name, username, or LLM-generated key as a personal identifier and -/// rejects the write. The LLM or caller already receives the error message; -/// Sentry has no remediation path. Drops TAURI-RUST-54T (~915 events, -/// escalating — all from a single user hitting false positives on valid -/// namespace/key identifiers). -/// -/// Anchor on `"cannot contain personal identifiers"` — the exact string -/// shared by all three sites — so typos or future rewordings that drop the -/// anchor still reach Sentry until explicitly classified. -fn is_memory_store_pii_rejection(lower: &str) -> bool { - lower.contains("cannot contain personal identifiers") -} - /// Detect the agent harness's empty-provider-response bail. /// /// Anchored on the literal user-facing string emitted at @@ -1618,20 +1443,6 @@ fn report_expected_message(kind: ExpectedErrorKind, message: &str, domain: &str, "[observability] {domain}.{operation} skipped expected filesystem path validation error" ); } - ExpectedErrorKind::MemoryStorePiiRejection => { - // PII guard rejected a memory-store write because the namespace or - // key was classified as containing a personal identifier. The guard - // already logs a `[memory:safety]` warn at the write site; this - // match arm keeps the diagnostic breadcrumb at warn level (not - // error) so local log files retain the context without spawning a - // Sentry error event. TAURI-RUST-54T (~915 events from one user). - tracing::warn!( - domain = domain, - operation = operation, - kind = "memory_store_pii_rejection", - "[observability] {domain}.{operation} skipped expected memory-store PII rejection" - ); - } ExpectedErrorKind::EmptyProviderResponse => { // Model/user-config condition — the provider returned a // completely empty body and the agent harness bailed with the @@ -2020,83 +1831,25 @@ pub fn is_transient_message_failure(msg: &str) -> bool { /// Returns true when a Sentry event is a budget-exhausted 400 that should be /// dropped from `before_send`. /// -/// **Two-tier match — either tier fires a drop:** -/// -/// 1. **Tag-gated path** (primary suppression, added in PR #1633): -/// - tag `failure == "non_2xx"` -/// - tag `status == "400"` -/// - event message or any exception value contains one of the tight -/// budget-exhaustion phrases from -/// [`crate::openhuman::inference::provider::is_budget_exhausted_message`] -/// -/// 2. **Text-only path** (defense-in-depth, covers OPENHUMAN-CORE-N / -/// TAURI-RUST-1P — GitHub issue #2935): -/// - event message or any exception value contains the exact phrase -/// `"Insufficient budget"` (the literal wire body the OpenHuman API -/// returns in `{"success":false,"error":"Insufficient budget"}`), -/// regardless of which tags are set. -/// -/// The text-only tier is intentionally tighter than tier 1 — it only -/// matches the exact phrase the backend uses, never the looser -/// `"add credits"` / `"budget exceeded"` phrases that might appear in -/// unrelated product copy. This prevents future call sites that call -/// `report_error` without setting `failure` / `status` tags (or that -/// invoke `sentry::capture_message` directly) from leaking budget -/// exhaustion events to Sentry. -/// -/// Note: `domain` is intentionally not gated here so a future re-emitter -/// under a different domain tag still gets filtered. -pub fn is_budget_event(event: &sentry::protocol::Event<'_>) -> bool { - // Tier 1 — tag-gated primary path. - let tags = &event.tags; - if tags.get("failure").map(String::as_str) == Some("non_2xx") - && tags.get("status").map(String::as_str) == Some("400") - && event_contains_budget_exhausted_message(event) - { - return true; - } - // Tier 2 — text-only defense-in-depth: drop any event whose message or - // exception value contains the exact wire phrase the OpenHuman backend - // emits for budget exhaustion, regardless of tags. - event_contains_budget_insufficient_phrase(event) -} - -/// Defense-in-depth `before_send` filter for Sentry event CORE-RUST-EK -/// (~827 events): every call to the cloud embedding API (OpenAI -/// `text-embedding-3-large` or Voyage) that returns HTTP 401 fires a Sentry -/// error event via `report_error_or_expected` in -/// `src/openhuman/embeddings/openai.rs`. -/// -/// 401 on the embedding call path means the configured API key is stale or -/// invalid. This is the same class of condition as the VOYAGE_API_KEY-missing -/// error (PR #2915) and the billing-expired 401 (PR #2924): the user's LLM -/// session was interrupted by an auth failure that the core will retry on the -/// next turn, but the Sentry volume-per-key ratio yields zero actionable signal. -/// /// Match criteria (all required): -/// - tag `domain == "embeddings"` — pins the filter to the embeddings call -/// path and avoids silencing unrelated 401s from provider chat, billing, or -/// the backend RPC layer -/// - tag `failure == "non_2xx"` — the marker set by `embeddings::openai::embed` -/// at the non-2xx path -/// - tag `status == "401"` — narrows to auth-rejection failures (429 / 500 -/// are handled by the existing rate-limit filter) +/// - tag `failure == "non_2xx"` +/// - tag `status == "400"` +/// - the event message or any exception value contains one of the tight +/// budget-exhaustion phrases /// -/// The primary suppression for the OpenHuman-backend "Invalid token" shape -/// already lives in `expected_error_kind` / `is_session_expired_message`. This -/// filter is defense-in-depth: it catches any third-party provider 401 that -/// doesn't carry the OpenHuman-backend body (e.g. OpenAI's -/// `{"error":{"code":"invalid_api_key",...}}`), ensuring CORE-RUST-EK stays -/// off Sentry regardless of which embedding provider is configured. -pub fn is_embeddings_api_key_401_event(event: &sentry::protocol::Event<'_>) -> bool { +/// Note: `domain` is intentionally not gated here as defense-in-depth over +/// the emit-site classifier — any non_2xx/400 event that carries the +/// budget-exhausted phrasing is dropped regardless of which domain produced +/// it, so a future re-emitter under a different tag still gets filtered. +pub fn is_budget_event(event: &sentry::protocol::Event<'_>) -> bool { let tags = &event.tags; - if tags.get("domain").map(String::as_str) != Some("embeddings") { + if tags.get("failure").map(String::as_str) != Some("non_2xx") { return false; } - if tags.get("failure").map(String::as_str) != Some("non_2xx") { + if tags.get("status").map(String::as_str) != Some("400") { return false; } - tags.get("status").map(String::as_str) == Some("401") + event_contains_budget_exhausted_message(event) } /// 404 on PATCH/DELETE to a channel-message path is an expected backend state @@ -2158,30 +1911,6 @@ fn event_contains_budget_exhausted_message(event: &sentry::protocol::Event<'_>) }) } -/// Tier-2 (text-only) budget check for [`is_budget_event`]. -/// -/// Matches the exact literal phrase `"Insufficient budget"` (case-insensitive) -/// in the event message or any exception value. This is the wire body the -/// OpenHuman backend returns: `{"success":false,"error":"Insufficient budget"}`. -/// -/// Deliberately narrower than [`event_contains_budget_exhausted_message`]: -/// only the precise backend phrase is matched, never the looser -/// `"add credits"` / `"budget exceeded"` / `"insufficient balance"` phrases -/// which might appear in unrelated product copy that would otherwise produce -/// false positives. -fn event_contains_budget_insufficient_phrase(event: &sentry::protocol::Event<'_>) -> bool { - const PHRASE: &str = "insufficient budget"; - let has_phrase = |s: &str| s.to_ascii_lowercase().contains(PHRASE); - if event.message.as_deref().is_some_and(has_phrase) { - return true; - } - event - .exception - .values - .iter() - .any(|exc| exc.value.as_deref().is_some_and(has_phrase)) -} - #[cfg(test)] mod tests { use super::*; @@ -2271,40 +2000,12 @@ mod tests { ); } - #[test] - fn classifies_backend_env_api_key_not_configured() { - // TAURI-RUST-2H5 (~5 K events): backend embedding endpoint returns a - // 400 with `{"success":false,"error":"VOYAGE_API_KEY is not configured"}` - // whenever the backend env var is absent. This is a known server-side - // config state, not an app error — silence it the same way we silence - // other `ApiKeyMissing` variants. - for raw in [ - r#"Embedding API error (400 Bad Request): {"success":false,"error":"VOYAGE_API_KEY is not configured"}"#, - r#"Embedding API error 400 Bad Request: {"success":false,"error":"VOYAGE_API_KEY is not configured"}"#, - // Future-proof: same shape for any other backend-managed embedder. - r#"Embedding API error (400 Bad Request): {"success":false,"error":"COHERE_API_KEY is not configured"}"#, - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::ApiKeyMissing), - "should classify backend env api-key missing: {raw}" - ); - } - } - #[test] fn does_not_classify_unrelated_is_not_configured_messages() { - // The `_api_key` anchor must keep prose that merely says "is not - // configured" from being silenced — only env-var-style key names - // should match. assert_eq!( expected_error_kind("workspace path is not configured for this user"), None ); - assert_eq!( - expected_error_kind("embedding model is not configured"), - None - ); assert_eq!( expected_error_kind("provider 'voyage' is not configured in settings"), None @@ -2678,36 +2379,6 @@ mod tests { } } - #[test] - fn classifies_memory_store_pii_rejection_errors() { - // TAURI-RUST-54T: ~915 events from one user where the PII guard - // rejected memory-store writes on namespace/key values that look like - // personal identifiers. All three canonical wire shapes — from - // `documents.rs` (upsert_document / upsert_document_metadata_only) - // and `kv.rs` (kv_set_global / kv_set_namespace) — must classify as - // expected so they stop reaching Sentry. - for raw in [ - "document namespace/key cannot contain personal identifiers", - "kv key cannot contain personal identifiers", - "kv namespace/key cannot contain personal identifiers", - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::MemoryStorePiiRejection), - "should classify as memory-store PII rejection: {raw}" - ); - } - - // Wrapped by the RPC dispatch layer — substring match must survive the - // `rpc.invoke_method failed: ` prefix that `jsonrpc.rs` prepends. - assert_eq!( - expected_error_kind( - "rpc.invoke_method failed: document namespace/key cannot contain personal identifiers" - ), - Some(ExpectedErrorKind::MemoryStorePiiRejection) - ); - } - #[test] fn classifies_memory_store_breaker_open() { // TAURI-RUST-52X (~455 events on self-hosted Sentry): the chunk-store @@ -2833,24 +2504,6 @@ mod tests { } } - #[test] - fn does_not_classify_unrelated_messages_as_memory_pii_rejection() { - // A generic "personal identifiers" mention without the "cannot contain" - // anchor must not be silenced. - assert_eq!( - expected_error_kind("processing personal identifiers"), - None, - "must not match a bare 'personal identifiers' mention" - ); - // The secret-rejection variant uses different wording and must not be - // swallowed by the PII classifier. - assert_eq!( - expected_error_kind("document namespace/key cannot contain secrets"), - None, - "secret rejection must remain unclassified" - ); - } - #[test] fn does_not_classify_unrelated_breaker_messages() { // Generic "circuit breaker open" without the `[memory_tree]` anchor @@ -2868,132 +2521,6 @@ mod tests { ); } - // ── Upstream rate-limit suppression (OPENHUMAN-TAURI-S / -6Y / -2E / -RQ) ─ - - /// Canonical Anthropic / OpenAI body with a structured `"rate_limit_error"` - /// type — OPENHUMAN-TAURI-2E (~1 482 events) and -RQ (~741 events). - #[test] - fn classifies_rate_limit_error_type_as_transient() { - for raw in [ - // Direct 429 from the embeddings path (OPENHUMAN-TAURI-RQ): - r#"Embedding API error (429 Too Many Requests): {"error":{"message":"Rate limit exceeded. Please retry after a brief wait.","type":"rate_limit_error"}}"#, - // Via llm_provider.api_error (OPENHUMAN-TAURI-2E): - r#"[observability] llm_provider.api_error failed: OpenHuman API error (429 Too Many Requests): {"error":{"message":"Rate limit exceeded. Please retry after a brief wait.","type":"rate_limit_error"}}"#, - // Re-reported by agent.run_single: - r#"run_chat_task failed client_id=abc thread_id=t1 request_id=r1 error=OpenHuman API error (429 Too Many Requests): {"error":{"message":"Rate limit exceeded.","type":"rate_limit_error"}}"#, - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::TransientUpstreamHttp), - "should classify rate_limit_error body as transient: {raw}" - ); - } - } - - /// OpenHuman backend wrapping an upstream 429 as HTTP 500 with a - /// `"upstream rate limit exceeded"` body — OPENHUMAN-TAURI-6Y (~19 849 - /// events). - #[test] - fn classifies_upstream_rate_limit_in_500_body_as_transient() { - for raw in [ - r#"OpenHuman API error (500 Internal Server Error): {"success":false,"error":"Upstream rate limit exceeded for model 'summarization-v1'. Please retry shortly."}"#, - r#"[observability] llm_provider.api_error failed: OpenHuman API error (500 Internal Server Error): {"success":false,"error":"Upstream rate limit exceeded for model 'summarization-v1'. Please retry shortly.","details":{"provider":"gmi","upstreamModel":"deepseek-ai/DeepSeek-V3-0324"}}"#, - // Re-wrapped by rpc.invoke_method: - r#"rpc.invoke_method failed: LLM summarisation failed: OpenHuman API error (500 Internal Server Error): {"success":false,"error":"Upstream rate limit exceeded for model 'summarization-v1'."}"#, - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::TransientUpstreamHttp), - "should classify upstream-rate-limit-in-500 as transient: {raw}" - ); - } - } - - /// Backend returning HTTP 500 with a numeric `"429 rate limit exceeded"` - /// body — OPENHUMAN-TAURI-S (~6 984 events). - #[test] - fn classifies_429_rate_limit_in_500_body_as_transient() { - for raw in [ - r#"OpenHuman API error (500 Internal Server Error): {"success":false,"error":"429 rate limit exceeded, please try again later"}"#, - r#"[observability] llm_provider.api_error failed: OpenHuman API error (500 Internal Server Error): {"success":false,"error":"429 rate limit exceeded, please try again later"}"#, - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::TransientUpstreamHttp), - "should classify 429-in-500-body as transient: {raw}" - ); - } - } - - /// The security::policy `"Rate limit exceeded: action budget exhausted"` - /// must NOT be silenced — it's a user-facing hard stop, not a transient - /// upstream quota hit. - #[test] - fn does_not_classify_security_policy_rate_limit_as_transient() { - let msg = "Rate limit exceeded: action budget exhausted (0 actions/hour). \ - Increase the limit in Settings -> Advanced -> Agent autonomy"; - assert_eq!( - expected_error_kind(msg), - None, - "security policy action-budget error must reach Sentry: {msg}" - ); - // Wrapped by rpc.invoke_method — the prefix must not accidentally - // trigger the `api error (` anchor. - assert_eq!( - expected_error_kind(&format!("rpc.invoke_method failed: {msg}")), - None, - "wrapped security policy action-budget error must reach Sentry" - ); - } - - /// Standalone `"rate limit exceeded"` without the `"api error ("` anchor - /// must NOT be silenced — keeps loose phrases from accidentally demoting - /// unrelated errors. - #[test] - fn does_not_classify_bare_rate_limit_exceeded_as_transient() { - assert_eq!( - expected_error_kind("rate limit exceeded"), - None, - "bare 'rate limit exceeded' without API error anchor must reach Sentry" - ); - } - - /// `is_upstream_rate_limit_message` predicate unit tests — verifies the - /// polarity contract independently of `expected_error_kind`. - #[test] - fn upstream_rate_limit_predicate_matches_expected_shapes() { - for lower in [ - r#"{"error":{"message":"rate limit exceeded.","type":"rate_limit_error"}}"#, - "upstream rate limit exceeded for model 'summarization-v1'", - "429 rate limit exceeded, please try again later", - r#"openai api error (429 too many requests): {"error":{"message":"rate limit exceeded.","type":"rate_limit_error"}}"#, - ] { - assert!( - is_upstream_rate_limit_message(lower), - "should match: {lower}" - ); - } - } - - #[test] - fn upstream_rate_limit_predicate_does_not_match_unrelated() { - for lower in [ - // security::policy budget message — must not be swallowed - "rate limit exceeded: action budget exhausted (0 actions/hour)", - // bare phrase without anchor - "rate limit exceeded", - // unrelated 500 body - r#"{"success":false,"error":"internal server error"}"#, - // budget exhausted — different concept - "budget exhausted, add credits to continue", - ] { - assert!( - !is_upstream_rate_limit_message(lower), - "should not match: {lower}" - ); - } - } - #[test] fn does_not_classify_unrelated_messages_as_capability_unavailable() { // The classifier anchors on the exact "for this RAM tier" substring. @@ -4006,84 +3533,6 @@ mod tests { ); } - // ── TAURI-RUST-322 (#2929): composio-direct 403 (key missing perms) ─ - - #[test] - fn classifies_composio_direct_403_as_provider_user_state() { - // Canonical Sentry TAURI-RUST-322 wire shape — the verbatim - // title body from the issue (1,021 events, multi-release). The - // Composio v3 `/connected_accounts` endpoint returns HTTP 403 - // when the BYO API key exists but lacks `connected_accounts:read` - // permission. This is a user-state condition; Sentry has no - // remediation path. - let msg = "[composio-direct] list_connections failed: \ - Composio v3 connected_accounts failed: HTTP 403"; - assert_eq!( - expected_error_kind(msg), - Some(ExpectedErrorKind::ProviderUserState), - "composio-direct HTTP 403 must demote to ProviderUserState (TAURI-RUST-322)" - ); - } - - #[test] - fn classifies_composio_direct_403_for_other_ops() { - // The `[composio-direct]` + `HTTP 403` arm must cover every op - // that can hit a 403 from the Composio v3 tenant (list_tools - // prefetch, authorize, etc.) — not just list_connections. - let shapes = [ - // list_tools prefetch of connections hits the 403 wall - "[composio-direct] list_tools: prefetch connections failed: \ - Composio v3 connected_accounts failed: HTTP 403", - // list_connections itself (the primary source of the leak) - "[composio-direct] list_connections (direct) failed: \ - Composio v3 connected_accounts failed: HTTP 403", - // any future direct-mode op that hits a 403 - "[composio-direct] composio_list_connections (direct) failed: \ - Composio v3 connected_accounts failed: HTTP 403", - ]; - for msg in shapes { - assert_eq!( - expected_error_kind(msg), - Some(ExpectedErrorKind::ProviderUserState), - "every [composio-direct] op with HTTP 403 must demote to ProviderUserState: {msg}" - ); - } - } - - #[test] - fn does_not_classify_unrelated_http_403_as_composio_direct_user_state() { - // Discrimination test: a 403 that does NOT carry the - // `[composio-direct]` prefix must NOT match this arm. Backend-mode - // composio 403s and unrelated 403s must remain visible in Sentry. - let backend_403 = "[composio] list_connections failed: \ - Backend returned 403 Forbidden for GET \ - https://api.tinyhumans.ai/agent-integrations/composio/connections"; - // The backend-mode shape passes through `is_backend_user_error_message` - // (4xx matcher), not this arm. Verify it does NOT match this arm. - assert!( - !lower_contains_composio_direct_auth_wall(backend_403), - "backend-mode 403 must NOT match the composio-direct arm" - ); - - let unrelated_403 = "GitHub API error: HTTP 403: rate limit exceeded"; - assert_ne!( - expected_error_kind(unrelated_403), - Some(ExpectedErrorKind::ProviderUserState), - "unrelated 403 (no [composio-direct] anchor) must NOT match the composio-direct arm" - ); - } - - // Helper used only in the discrimination test above — mirrors the - // exact condition in `is_provider_user_state_message` without - // requiring access to the private function. - fn lower_contains_composio_direct_auth_wall(msg: &str) -> bool { - let lower = msg.to_ascii_lowercase(); - lower.contains("[composio-direct]") - && (lower.contains("http 401") - || lower.contains("http 403") - || lower.contains("invalid api key")) - } - // ── TAURI-RUST-34H: backend-wrapped Cloudflare anti-bot interstitial ─ #[test] @@ -4571,83 +4020,6 @@ mod tests { assert_eq!(expected_error_kind("session_expired lowercase"), None); } - /// TAURI-RUST-E (~1 437 events): billing poll fires `report_error_or_expected` - /// on every refresh cycle once the user's JWT lapses because the - /// `BackendApiError::Unauthorized` typed error was stringified to - /// `"backend rejected session token on GET /payments/stripe/currentPlan"` by - /// `billing::ops::get_authed_value(..).map_err(|e| e.to_string())` before the - /// phrase was added to `is_session_expired_message`. - /// - /// The phrase `"backend rejected session token"` is uniquely produced by - /// `BackendApiError::Unauthorized`'s `Display` impl in `api::rest` — no - /// third-party provider path emits it — so no conjunctive guard is needed. - #[test] - fn classifies_billing_401_as_session_expired() { - // Exact wire shape from `billing_get_current_plan` — the most common - // event in TAURI-RUST-E. - assert_eq!( - expected_error_kind( - "backend rejected session token on GET /payments/stripe/currentPlan" - ), - Some(ExpectedErrorKind::SessionExpired), - "TAURI-RUST-E: billing_get_current_plan 401 must classify as SessionExpired" - ); - - // Other billing methods share the same `BackendApiError::Unauthorized` - // display shape — pin them so a wording change in `rest.rs` would catch - // every billing call site. - for path in [ - "/payments/credits/balance", - "/payments/credits/transactions?limit=20&offset=0", - "/payments/credits/auto-recharge", - "/payments/credits/auto-recharge/cards", - "/payments/stripe/purchasePlan", - "/payments/stripe/portal", - "/coupons/me", - ] { - let msg = format!("backend rejected session token on GET {path}"); - assert_eq!( - expected_error_kind(&msg), - Some(ExpectedErrorKind::SessionExpired), - "billing 401 must classify as SessionExpired: {msg}" - ); - } - - // POST / PATCH / DELETE variants are also produced by `authed_json`. - for raw in [ - "backend rejected session token on POST /payments/credits/top-up", - "backend rejected session token on PATCH /payments/credits/auto-recharge", - "backend rejected session token on DELETE /payments/credits/auto-recharge/cards/pm_123", - ] { - assert_eq!( - expected_error_kind(raw), - Some(ExpectedErrorKind::SessionExpired), - "TAURI-RUST-E variant must classify as SessionExpired: {raw}" - ); - } - } - - /// `"backend rejected session token"` is scoped to `BackendApiError::Unauthorized` - /// in `api::rest`. Ensure unrelated messages containing the individual - /// words don't accidentally match. - #[test] - fn does_not_classify_unrelated_rejected_messages_as_session_expired() { - // Third-party provider errors that mention a token being rejected but - // do not contain the exact OpenHuman `BackendApiError::Unauthorized` - // display phrase. - for raw in [ - "Discord API error: token rejected by upstream", - "Stripe webhook signature rejected — bad secret", - "API token rejected: please regenerate", - ] { - assert_eq!( - expected_error_kind(raw), - None, - "unrelated token-rejected message must NOT suppress Sentry: {raw}" - ); - } - } - #[test] fn report_error_does_not_panic_with_many_tags() { let err = anyhow::anyhow!("multi-tag"); @@ -5110,11 +4482,7 @@ mod tests { } #[test] - fn budget_filter_requires_non_2xx_failure_and_400_status_for_loose_phrases() { - // Tier 1 requires both tags for the loose budget phrases ("budget exceeded", - // "add credits", …) that might coincidentally appear in unrelated product - // copy. Tier 2 (text-only) only fires for the exact backend phrase - // "insufficient budget" — so the loose phrases still need both tags. + fn budget_filter_requires_non_2xx_failure_and_400_status() { let message = "Budget exceeded — add credits to continue"; for tags in [ vec![("failure", "transport"), ("status", "400")], @@ -5126,192 +4494,6 @@ mod tests { } } - /// Tier-2 defense-in-depth: drop the exact "Insufficient budget" phrase - /// the OpenHuman backend returns regardless of which tags are set. - /// Regression guard for OPENHUMAN-CORE-N / TAURI-RUST-1P (GitHub #2935). - #[test] - fn budget_filter_drops_insufficient_budget_without_tags() { - // The exact JSON wire body from the OpenHuman backend. - let message = r#"OpenHuman API error (400 Bad Request): {"success":false,"error":"Insufficient budget"}"#; - - // No tags at all. - let event = event_with_message(message); - assert!( - is_budget_event(&event), - "tier-2 must drop 'Insufficient budget' even without failure/status tags" - ); - - // Wrong status tag only. - let event = event_with_tags_and_message(&[("status", "400")], message); - assert!( - is_budget_event(&event), - "tier-2 must drop 'Insufficient budget' when only status tag is set" - ); - - // Wrong failure tag only. - let event = event_with_tags_and_message(&[("failure", "transport")], message); - assert!( - is_budget_event(&event), - "tier-2 must drop 'Insufficient budget' when failure tag doesn't match" - ); - } - - /// Regression guard: tier-2 must also catch the exception/tracing path - /// (`sentry-tracing` with `attach_stacktrace=true` may populate the - /// exception list rather than `event.message`). - #[test] - fn budget_filter_drops_insufficient_budget_exception_without_tags() { - // Exact wire body from OpenHuman backend wrapped in an exception value. - let event = event_with_exception_value( - r#"OpenHuman API error (400 Bad Request): {"success":false,"error":"INSUFFICIENT BUDGET"}"#, - ); - assert!( - is_budget_event(&event), - "tier-2 must drop exception-path 'Insufficient budget' case-insensitively" - ); - - // Mixed case variant. - let event = event_with_exception_value( - r#"OpenHuman API error (400 Bad Request): {"success":false,"error":"Insufficient Budget"}"#, - ); - assert!( - is_budget_event(&event), - "tier-2 must drop exception-path 'Insufficient Budget' case-insensitively" - ); - - // Wrong failure/status tags should not prevent tier-2 from matching. - let mut event = event_with_exception_value( - r#"OpenHuman API error (400 Bad Request): {"success":false,"error":"Insufficient budget"}"#, - ); - event - .tags - .insert("failure".to_string(), "transport".to_string()); - assert!( - is_budget_event(&event), - "tier-2 must drop exception-path even when failure tag does not match" - ); - - // Unrelated exception values must not match. - let event = event_with_exception_value("retry budget exhausted after 3 attempts"); - assert!( - !is_budget_event(&event), - "tier-2 must not match unrelated exception value" - ); - } - - /// Tier-2 only matches the tight phrase — unrelated 400 errors with "budget" - /// in an unrelated context should not be silently dropped. - #[test] - fn budget_filter_tier2_does_not_match_unrelated_messages() { - for msg in [ - "retry budget exhausted after 3 attempts", - "bad request: missing field", - "budget_id=42 not found", - "", - ] { - let event = event_with_message(msg); - assert!( - !is_budget_event(&event), - "tier-2 must not match unrelated message: {msg:?}" - ); - } - } - - /// CORE-RUST-EK (~827 events): every 401 from the embeddings call path - /// (`domain=embeddings`, `failure=non_2xx`, `status=401`) must be filtered - /// before it reaches Sentry. Covers both the OpenHuman-backend "Invalid - /// token" shape (already handled by the primary `is_session_expired_message` - /// classifier) and third-party provider body shapes (OpenAI - /// `invalid_api_key`, plain `Unauthorized`) that fall through the string - /// classifier. - #[test] - fn embeddings_401_filter_drops_domain_embeddings_status_401() { - // Canonical CORE-RUST-EK wire shape: OpenAI `text-embedding-3-large` - // key is stale. `report_error_or_expected` sets - // domain=embeddings / operation=openai_embed / failure=non_2xx / - // status=401 / model=text-embedding-3-large. - let event = event_with_tags_and_message( - &[ - ("domain", "embeddings"), - ("operation", "openai_embed"), - ("failure", "non_2xx"), - ("status", "401"), - ("model", "text-embedding-3-large"), - ], - r#"Embedding API error (401 Unauthorized): {"error":{"message":"Incorrect API key provided. You can find your API key at https://platform.openai.com/account/api-keys.","type":"invalid_request_error","param":null,"code":"invalid_api_key"}}"#, - ); - assert!( - is_embeddings_api_key_401_event(&event), - "CORE-RUST-EK: domain=embeddings status=401 must be filtered" - ); - } - - /// Any other embedding status (e.g. 429 rate-limit, 500 server error) - /// must not be filtered by the embeddings-401 guard — those have their - /// own handlers (rate-limit filter, general error reporting). - #[test] - fn embeddings_401_filter_passes_other_statuses() { - for status in ["429", "500", "400"] { - let event = event_with_tags_and_message( - &[ - ("domain", "embeddings"), - ("failure", "non_2xx"), - ("status", status), - ], - "Embedding API error", - ); - assert!( - !is_embeddings_api_key_401_event(&event), - "domain=embeddings status={status} must NOT be filtered by the 401 guard" - ); - } - } - - /// Non-embeddings domains must not be filtered even if status=401 — the - /// guard is scoped specifically to `domain=embeddings` so that provider-chat - /// and backend-API 401s remain subject to their own classifiers. - #[test] - fn embeddings_401_filter_passes_non_embeddings_domains() { - for domain in ["llm_provider", "backend_api", "rpc", "composio"] { - let event = event_with_tags_and_message( - &[ - ("domain", domain), - ("failure", "non_2xx"), - ("status", "401"), - ], - "API error (401 Unauthorized): some body", - ); - assert!( - !is_embeddings_api_key_401_event(&event), - "domain={domain} status=401 must NOT be swallowed by the embeddings-401 guard" - ); - } - } - - /// The guard requires both `failure=non_2xx` and `status=401` to be - /// present; missing either tag must cause the filter to pass the event - /// through. - #[test] - fn embeddings_401_filter_requires_failure_and_status_tags() { - let no_failure_tag = event_with_tags_and_message( - &[("domain", "embeddings"), ("status", "401")], - "Embedding API error (401 Unauthorized): unauthorized", - ); - assert!( - !is_embeddings_api_key_401_event(&no_failure_tag), - "missing failure tag must not trigger the guard" - ); - - let no_status_tag = event_with_tags_and_message( - &[("domain", "embeddings"), ("failure", "non_2xx")], - "Embedding API error (401 Unauthorized): unauthorized", - ); - assert!( - !is_embeddings_api_key_401_event(&no_status_tag), - "missing status tag must not trigger the guard" - ); - } - #[test] fn report_error_or_expected_does_not_panic() { report_error_or_expected( diff --git a/src/main.rs b/src/main.rs index 10fd69eca9..352f710a8a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -66,21 +66,6 @@ fn main() { if openhuman_core::core::observability::is_budget_event(&event) { return None; } - // CORE-RUST-EK (~827 events): drop all HTTP 401 responses from the - // embeddings call path (domain=embeddings, failure=non_2xx, - // status=401). The primary suppression for the OpenHuman-backend - // "Invalid token" shape lives in `expected_error_kind` / - // `is_session_expired_message`. This is defense-in-depth that also - // catches third-party provider 401s (e.g. OpenAI `invalid_api_key` - // body) that don't carry the OpenHuman envelope and therefore fall - // through the string-based classifier to Sentry. - if openhuman_core::core::observability::is_embeddings_api_key_401_event(&event) { - log::debug!( - "[sentry-embeddings-401-filter] dropping embeddings api-key 401 event_id={:?}", - event.event_id - ); - return None; - } // Defense-in-depth: drop max-tool-iterations cap events that // slipped past the call-site filters in // `agent::harness::session::runtime::run_single`, @@ -98,43 +83,6 @@ fn main() { { return None; } - // Defense-in-depth: upstream rate-limit events that slipped past - // the call-site suppressors in `ops::api_error` (primary guard) - // and `report_error_or_expected` (secondary guard via - // `expected_error_kind`). Catches the three major shapes: - // · `rate_limit_error` type in the JSON body (OPENHUMAN-TAURI-2E, - // OPENHUMAN-TAURI-RQ — ~2 223 events combined) - // · `"upstream rate limit exceeded"` in a 500 body (TAURI-6Y — - // ~19 849 events) - // · `"429 rate limit exceeded"` in a 500 body (TAURI-S — ~6 984 - // events) - // The primary per-attempt suppression lives in - // `openhuman::inference::provider::ops::api_error` (skips - // `report_error` entirely for rate-limit bodies) and in - // `embeddings::openai::embed` (uses `report_error_or_expected` with - // the canonical `"Embedding API error ({status}): …"` format so - // `is_transient_upstream_http_message` catches it). This filter is - // the last line of defense for any future call site that adds a new - // report path without routing through one of those two guards. - { - let direct = event.message.as_deref(); - let from_logentry = event.logentry.as_ref().map(|l| l.message.as_str()); - let from_exception = event.exception.last().and_then(|e| e.value.as_deref()); - let is_rate_limited = [direct, from_logentry, from_exception] - .into_iter() - .flatten() - .map(str::to_ascii_lowercase) - .any(|lower| { - openhuman_core::core::observability::is_upstream_rate_limit_message(&lower) - }); - if is_rate_limited { - log::debug!( - "[sentry-rate-limit-filter] dropping upstream rate-limit event_id={:?}", - event.event_id - ); - return None; - } - } // Defense-in-depth: 404 on PATCH/DELETE to a channel-message path // is an expected state (provider-side delete or backend GC). Primary // suppression lives in `authed_json`; this catches any future call diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs index bb882b9cd9..6c7ab72056 100644 --- a/src/openhuman/inference/ops.rs +++ b/src/openhuman/inference/ops.rs @@ -29,31 +29,6 @@ fn is_unknown_provider_user_config(err: &str) -> bool { err.contains("no cloud provider with id or slug") } -/// Returns `true` when the error from a provider chat attempt is a known, -/// expected user-state or provider-state condition that already has its own -/// Sentry report (or is deterministically expected and has no remediation -/// path): -/// -/// - **401 Unauthorized** — API key revoked / wrong key. Already reported by -/// the provider layer's `api_error` path. An ops-level duplicate adds noise -/// with no additional context. -/// - **429 Too Many Requests / rate-limit** — Quota exhaustion. Already -/// covered by the `is_upstream_rate_limit_message` classifier in -/// `expected_error_kind`; the reliable-provider layer retries with -/// backoff before propagating. -/// - **Model not found** — User selected a model that doesn't exist for -/// their key. The provider layer already classifies this as a config -/// rejection (TAURI-RUST-68, ~1309 events). -/// -/// The matcher is intentionally broad so the ops-level wrapper stays out -/// of the Sentry funnel for all provider-state failures — the underlying -/// call site (`compatible.rs` / `report_error_or_expected`) is already -/// responsible for the authoritative report. Unclassified failures (5xx, -/// unexpected payloads, network errors) are NOT matched and still escalate. -fn is_expected_chat_failure(err: &str) -> bool { - crate::core::observability::expected_error_kind(err).is_some() -} - #[derive(Debug, Clone, serde::Serialize)] pub struct InferenceTestProviderModelResult { pub reply: String, @@ -204,22 +179,7 @@ pub async fn inference_test_provider_model( output_len = outcome.value.reply.len(), "{LOG_PREFIX} test_provider_model:ok" ), - Err(err) => { - if is_expected_chat_failure(err) { - // Provider-state / user-config failure (401, 429, model not - // found, API key missing, etc.). The underlying provider - // layer already emitted its own Sentry event or classified - // this as expected. An ops-level duplicate adds noise. - // Targets TAURI-RUST-68 (~1,309 events). - warn!( - provider, - error = %err, - "{LOG_PREFIX} test_provider_model:expected-error (no Sentry)" - ); - } else { - error!(error = %err, "{LOG_PREFIX} test_provider_model:error"); - } - } + Err(err) => error!(error = %err, "{LOG_PREFIX} test_provider_model:error"), } result } diff --git a/src/openhuman/inference/ops_tests.rs b/src/openhuman/inference/ops_tests.rs index 90ca5440fe..27083c8e4e 100644 --- a/src/openhuman/inference/ops_tests.rs +++ b/src/openhuman/inference/ops_tests.rs @@ -317,73 +317,3 @@ fn is_unknown_provider_user_config_rejects_other_list_models_failures() { } } -// ── is_expected_chat_failure (TAURI-RUST-68) ───────────────────────────── -// -// `inference_test_provider_model` calls `simple_chat` which can fail with -// known provider-state or user-config conditions (401, 429, model not -// found). Before this fix every failure escalated to `error!`, which -// sentry-tracing shipped to Sentry as `"[inference::ops] -// test_provider_model:error"` — 1,309 events — while the same underlying -// errors already had their own report or were classified as expected by -// `expected_error_kind`. The gate demotes them to `warn!` so they stay in -// local logs but don't generate duplicate Sentry noise. -// -// Anchored on the shared `expected_error_kind` classifier so both the unit -// tests here and the production gate stay in sync with the central -// suppression logic in `core::observability`. - -#[test] -fn is_expected_chat_failure_matches_api_key_missing() { - // Provider layer emits this phrase when no API key is configured. - assert!(is_expected_chat_failure("api key not set for openai")); - assert!(is_expected_chat_failure( - "missing api key: openai_api_key is not configured" - )); -} - -#[test] -fn is_expected_chat_failure_matches_rate_limit() { - // 429-style rate-limit phrases emitted by the provider / OpenHuman backend. - assert!(is_expected_chat_failure( - "openai API error (429 Too Many Requests): You exceeded your current quota" - )); - assert!(is_expected_chat_failure( - "openai API error (500): 429 rate limit exceeded" - )); -} - -#[test] -fn is_expected_chat_failure_matches_provider_config_rejection() { - // OpenAI-style model-not-found code in error body. - assert!(is_expected_chat_failure( - r#"custom_openai API error (404 Not Found): {"error":{"message":"The model does not exist or you do not have access","code":"model_not_found"}}"# - )); - // Temperature-unsupported model (e.g. o1/o3/o4 reasoning models). - assert!(is_expected_chat_failure( - "custom_openai API error (400 Bad Request): invalid temperature: only 1 is allowed" - )); - // litellm-style not_found_error envelope. - assert!(is_expected_chat_failure( - r#"custom_openai API error (404 Not Found): {"error":{"message":"model 'gpt-99' not found","type":"not_found_error"}}"# - )); -} - -#[test] -fn is_expected_chat_failure_does_not_match_real_errors() { - // Real errors that must still reach Sentry must NOT be demoted. - for raw in [ - // Genuine 500 server error — actionable, must escalate - "openai API error (500 Internal Server Error): Something went wrong", - // Unexpected JSON from provider — potential provider bug - "openai API returned an unexpected chat-completions payload: missing field", - // Local I/O error — real infrastructure problem - "failed to open config file: permission denied", - // Completely empty string — fallthrough - "", - ] { - assert!( - !is_expected_chat_failure(raw), - "must NOT demote real error: {raw:?}" - ); - } -} diff --git a/src/openhuman/inference/provider/config_rejection.rs b/src/openhuman/inference/provider/config_rejection.rs index 5bec74693c..1d3b918084 100644 --- a/src/openhuman/inference/provider/config_rejection.rs +++ b/src/openhuman/inference/provider/config_rejection.rs @@ -32,15 +32,6 @@ //! from a user OAuth/scope gap) //! - `"not_found_error"` (J2 / J5 / J4 — litellm-compatible envelope //! `type` field carrying "model 'X' not found") -//! - `"does not support tools"` / `"function calling is not supported"` / -//! `"unknown parameter: tools"` / `"unrecognized field \`tools\`"` / -//! `"unsupported parameter: tools"` (TAURI-RUST-4K7 — Ollama models such -//! as `gemma3:1b-it-qat` and `huihui_ai/deepseek-r1-abliterated:8b` -//! reject tool-enabled requests with HTTP 400. The compatible provider -//! already retries without tools, so the initial 400 is not a -//! bug — it's expected discovery of the model's capability boundary. -//! Sentry noise suppressed here; the retry path in `compatible.rs` runs -//! unchanged.) //! //! These are **deterministic user-configuration state**, not bugs the //! maintainers can act on: the user pointed OpenHuman at a custom @@ -171,54 +162,33 @@ pub fn is_provider_config_rejection_message(body: &str) -> bool { // this is the `type` field used by litellm/Anthropic-style // envelopes for the same class of user-state error. "not_found_error", - // TAURI-RUST-4NM — nvidia-nim (and some other providers) return - // `{"error":{"message":"model field is required","type":"invalid_request_error","code":"missing_required_field"}}` - // when the `model` key is absent or empty in the request body. - // This is a user-configuration error (provider string has no model - // component, e.g. `nvidia-nim:` with empty model), not a product - // regression. Demote from Sentry; the factory now validates this - // up-front so in practice this phrase should no longer appear. - "model field is required", - // TAURI-RUST-4XK — Ollama 403 when the requested model requires a - // paid Ollama subscription. Body carries the upgrade URL. User must - // switch to a free model or upgrade their Ollama account. - "requires a subscription", - // TAURI-RUST-2G / TAURI-RUST-2F — DeepSeek / compatible providers - // that use extended thinking reject tool-call turns when the - // `reasoning_content` block from a prior assistant turn is not - // threaded back. This is user-config state (model requires the - // caller to replay the thinking block; the frontend replay logic in - // `turn.rs` handles it for subsequent turns, so the first-turn 400 - // is expected capability-discovery, not a regression). - "in the thinking mode must be passed back", - // TAURI-RUST-35 / TAURI-RUST-4K7 / TAURI-RUST-4Z0 — Ollama models - // (e.g. gemma3, phi3, deepseek-r1) that do not support function - // calling return HTTP 400 with this phrase. The compatible provider - // retries without tools on 400, so the initial rejection is expected - // capability-discovery. Sentry noise suppressed here. - "does not support tools", - // TAURI-RUST-4K7-d — alternative phrasing used by some Ollama model - // versions for the same tool-unsupported condition. - "function calling is not supported", - // TAURI-RUST-4K7-e — litellm / OpenAI-compatible proxies reject the - // `tools` key in the request body when the backing model does not - // support tool use (e.g. local Ollama via LiteLLM gateway). - "unknown parameter: tools", - // TAURI-RUST-4K7-f — Ollama native API surface rejects the field - // outright when the model has no function-calling capability. - "unrecognized field `tools`", - // TAURI-RUST-4K7-g — another litellm / proxy variant of the same - // tool-unsupported condition. - "unsupported parameter: tools", // TAURI-RUST-4NM — nvidia-nim (and compatible providers) return // `{"error":{"message":"model field is required","code":"missing_required_field"}}` // when the request body contains an empty `"model":""` field. "model field is required", // TAURI-RUST-2G (~2684 events) / TAURI-RUST-2F (~950 events) — - // thinking-mode model rejects a follow-up turn that doesn't echo - // the prior assistant's `reasoning_content` field. + // thinking-mode model (DeepSeek-R1 / Moonshot K2-thinking on + // `provider=cloud` custom_openai) rejects a follow-up turn that + // doesn't echo the prior assistant's `reasoning_content` field. + // Body shape (backtick-quoted JSON literal in the upstream body): + // `{"error":{"message":"The `reasoning_content` in the thinking + // mode must be passed back to the API.",...}}`. The + // provider-contract gap is on our side, but until the thinking- + // mode round-tripping ships in the inference layer, every affected + // turn fires a fresh Sentry event — and the UI already surfaces + // the actionable error to the user. Anchor on the unique + // `thinking mode must be passed back` substring so the match + // doesn't depend on the upstream's backtick-quoting around + // `reasoning_content` (some provider versions ship without them). "thinking mode must be passed back", // TAURI-RUST-4XK (~649 events) — Ollama Cloud subscription gate. + // Body: `{"error":"this model requires a subscription, upgrade for + // access: https://ollama.com/upgrade (ref: )"}` on a 403 + // Forbidden from `compatible::OpenAiCompatibleProvider` with + // `name = "ollama"`. User-state: the model picked in Settings is + // a paid-tier Ollama Cloud model the user's account doesn't + // cover. The UI surfaces an actionable upgrade link in the + // remediation message itself. "requires a subscription, upgrade for access", // TAURI-RUST-35 family — user picked a model that doesn't // implement tool calling, agent harness sent a tool spec @@ -402,103 +372,6 @@ mod tests { } } - /// TAURI-RUST-35 family — model picked by the user doesn't implement - /// tool calling. The agent harness tries to send a tool spec and the - /// upstream (Ollama / cloud Ollama relay / hosted OpenAI-compatible) - /// rejects with `{"error":{"message":" does not support tools", - /// "type":"invalid_request_error",...}}`. Pure user-config — the user - /// has to pick a tool-capable model (or run a non-agent flow). No - /// remediation path through Sentry, and the long tail is large: each - /// distinct model id + provider prefix combo creates a new Sentry - /// fingerprint, so the same root cause is currently split across at - /// least 10 unresolved Sentry issues (458 events total as of - /// 2026-05-28): - /// - /// | shortId | events | provider prefix | - /// |---|---|---| - /// | TAURI-RUST-35 | 307 | cloud | - /// | TAURI-RUST-DF | 83 | cloud | - /// | TAURI-RUST-123 | 25 | cloud | - /// | TAURI-RUST-4K7 | 19 | ollama | - /// | TAURI-RUST-4FS | 10 | cloud | - /// | TAURI-RUST-4F6 | 5 | cloud | - /// | TAURI-RUST-2YA | 4 | cloud | - /// | TAURI-RUST-4KR | 3 | ollama | - /// | TAURI-RUST-4KH | 1 | cloud | - /// | TAURI-RUST-4KY | 1 | ollama | - /// - /// Anchored on the exact `"does not support tools"` substring (the - /// message body's stable token — the model id varies per user). The - /// `streaming API error` / `API error` wrappers and the - /// `cloud` / `ollama` / `custom_openai` provider prefixes all share - /// this body, so a single phrase covers every variant. - #[test] - fn detects_does_not_support_tools_family() { - for (sentry_id, body) in [ - // TAURI-RUST-35 — verbatim from latest issue 168 event - // (model=`gemma3:1b-it-qat`, provider=cloud). - ( - "35", - r#"cloud streaming API error (400 Bad Request): {"error":{"message":"registry.ollama.ai/library/gemma3:1b-it-qat does not support tools","type":"invalid_request_error","param":null,"code":null}}"#, - ), - // TAURI-RUST-4K7 — ollama prefix, different upstream wrapper. - ( - "4K7", - r#"ollama streaming API error (400 Bad Request): {"error":{"message":"some-local-model does not support tools","type":"invalid_request_error","param":null,"code":null}}"#, - ), - // Non-streaming sibling — `API error` (no `streaming` token) - // for hosted providers that aren't using the streaming endpoint. - ( - "non-streaming", - r#"cloud API error (400 Bad Request): {"error":{"message":"registry.ollama.ai/library/qwen2.5:0.5b does not support tools","type":"invalid_request_error"}}"#, - ), - // Bare body (no wrapper) — what `expected_error_kind` would - // see if the body got extracted from the envelope upstream. - ( - "bare", - r#"{"error":{"message":"phi3.5:mini does not support tools","type":"invalid_request_error"}}"#, - ), - // TAURI-RUST-4Z0 — verbatim from issue 5664 (model=`deepseek-r1:8b`, - // provider=ollama). The envelope carries `"type":"api_error"` - // rather than `"invalid_request_error"` — pin it so the matcher - // can never be narrowed to require a specific `type` token; the - // `"does not support tools"` body substring is the only anchor. - ( - "4Z0", - r#"ollama streaming API error (400 Bad Request): {"error":{"message":"registry.ollama.ai/library/deepseek-r1:8b does not support tools","type":"api_error","param":null,"code":null}}"#, - ), - ] { - assert!( - is_provider_config_rejection_message(body), - "TAURI-RUST-{sentry_id} body must classify as provider config-rejection: {body:?}" - ); - } - } - - /// Polarity guard for the does-not-support-tools arm. The phrase is - /// scoped enough that no real bug-class body should accidentally - /// match — but pin a few near-miss shapes so a future loosening of - /// the matcher can't silently re-classify them. - #[test] - fn does_not_classify_unrelated_tools_phrases_as_config_rejection() { - for body in [ - // Tool-call dispatch failure (real bug) — must reach Sentry. - "tool execution failed: shell returned exit 1", - // Generic "tools" mention without the does-not-support phrase. - "agent ran with 0 tools available", - // Reversed phrasing — provider says they DO support tools but - // the call shape is wrong. Still actionable for triage. - "supports tools but received malformed tool_calls array", - // Empty body. - "", - ] { - assert!( - !is_provider_config_rejection_message(body), - "{body:?} must NOT classify as a provider config-rejection" - ); - } - } - #[test] fn detection_is_case_insensitive() { assert!(is_provider_config_rejection_message( @@ -629,63 +502,4 @@ mod tests { } } - /// TAURI-RUST-4K7 — Ollama models that don't support tool calling - /// (e.g. `gemma3:1b-it-qat`, `huihui_ai/deepseek-r1-abliterated:8b`) - /// return HTTP 400 with one of several tool-rejection phrases. - /// The compatible provider retries without tools, so the 400 is expected - /// capability-discovery rather than a product bug. These phrases must be - /// classified as config-rejections so Sentry is not flooded on every turn. - #[test] - fn detects_ollama_tool_unsupported_bodies() { - for (sentry_id, body) in [ - ( - "4K7-a", - r#"{"error":"gemma3:1b-it-qat does not support tools"}"#, - ), - ( - "4K7-b", - r#"{"error":"huihui_ai/deepseek-r1-abliterated:8b does not support tools"}"#, - ), - ( - "4K7-c", - r#"ollama streaming API error (400 Bad Request): {"error":"phi3:mini does not support tools"}"#, - ), - ( - "4K7-d", - r#"{"error":"function calling is not supported by this model"}"#, - ), - ( - "4K7-e", - r#"{"error":{"message":"unknown parameter: tools","type":"invalid_request_error"}}"#, - ), - ( - "4K7-f", - r#"{"error":"unrecognized field `tools` in request body"}"#, - ), - ( - "4K7-g", - r#"{"error":{"message":"unsupported parameter: tools","type":"invalid_request_error"}}"#, - ), - ] { - assert!( - is_provider_config_rejection_message(body), - "TAURI-RUST-{sentry_id} body must classify as provider config-rejection (tool-unsupported): {body:?}" - ); - } - } - - #[test] - fn detects_ollama_tool_unsupported_bodies_case_insensitive() { - // Ollama error messages should match regardless of casing. - for body in [ - "Model 'gemma3:1b-it-qat' DOES NOT SUPPORT TOOLS", - "Function Calling Is Not Supported By This Model", - "Unknown Parameter: Tools", - ] { - assert!( - is_provider_config_rejection_message(body), - "{body:?} must classify as config-rejection regardless of case" - ); - } - } } diff --git a/src/openhuman/inference/provider/ops.rs b/src/openhuman/inference/provider/ops.rs index 6de963de2a..ae07f21913 100644 --- a/src/openhuman/inference/provider/ops.rs +++ b/src/openhuman/inference/provider/ops.rs @@ -116,22 +116,6 @@ async fn list_configured_models_from_config( let status = response.status(); if !status.is_success() { - // A 404 from the /models endpoint means the provider does not support model - // listing — this is expected for many OpenAI-compatible providers (e.g. DeepSeek, - // Moonshot, Kimi, custom proxies). Return an empty model list so the caller can - // proceed normally instead of surfacing a spurious error / Sentry event. - // (Sentry issue TAURI-RUST-1Z — 819 events from this path alone.) - if status == reqwest::StatusCode::NOT_FOUND { - log::debug!( - "[providers][list_models] slug={} returned 404 — provider does not support /models listing; returning empty list", - entry.slug - ); - return Ok(crate::rpc::RpcOutcome::new( - serde_json::json!({ "models": serde_json::Value::Array(vec![]), "unsupported": true }), - vec!["provider does not support model listing (404)".to_string()], - )); - } - let body = response.text().await.unwrap_or_default(); let sanitized = sanitize_api_error(&body); let truncated = crate::openhuman::util::truncate_with_ellipsis(&sanitized, 300); @@ -899,38 +883,16 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E } else if is_context_window_exceeded { log_context_window_exceeded("api_error", provider, None, status); } else if should_report_provider_http_failure(status) { - // Defense-in-depth: some backends (e.g. OpenHuman) wrap an upstream - // provider 429 as an HTTP 500 with a rate-limit phrase in the body - // (`"429 rate limit exceeded"`, `"upstream rate limit exceeded"`). - // `should_report_provider_http_failure(500)` would otherwise let this - // through to Sentry — suppress it here before the report fires so the - // noise stays off Sentry (OPENHUMAN-TAURI-S: ~6 984 events). - // The `expected_error_kind` classifier in `report_error_or_expected` - // catches the same shape at re-report sites (agent / web_channel). - let lower_body = body.to_ascii_lowercase(); - let is_rate_limit_body = - crate::core::observability::is_upstream_rate_limit_message(&lower_body); - if is_rate_limit_body { - tracing::warn!( - domain = "llm_provider", - operation = "api_error", - provider = provider, - status = status_str.as_str(), - "[llm_provider] api_error: skipping Sentry report — rate-limit body in \ - non-429 response ({status})" - ); - } else { - crate::core::observability::report_error( - message.as_str(), - "llm_provider", - "api_error", - &[ - ("provider", provider), - ("status", status_str.as_str()), - ("failure", "non_2xx"), - ], - ); - } + crate::core::observability::report_error( + message.as_str(), + "llm_provider", + "api_error", + &[ + ("provider", provider), + ("status", status_str.as_str()), + ("failure", "non_2xx"), + ], + ); } anyhow::anyhow!(message) } @@ -1514,68 +1476,6 @@ mod tests { ); } - /// Spawn a minimal axum server that always returns 404 for the /models endpoint. - /// Used to verify that providers without model listing return an empty list, - /// not an error (Sentry issue TAURI-RUST-1Z). - async fn spawn_models_404_server() -> String { - let listener = tokio::net::TcpListener::bind("127.0.0.1:0") - .await - .expect("bind"); - let addr = listener.local_addr().expect("local_addr"); - let app = axum::Router::new().route( - "/models", - axum::routing::get(|| async { - (axum::http::StatusCode::NOT_FOUND, "Not Found").into_response() - }), - ); - tokio::spawn(async move { - axum::serve(listener, app).await.expect("serve"); - }); - format!("http://{addr}") - } - - #[tokio::test] - async fn models_404_returns_empty_list_not_error() { - // Providers that return 404 on /models (e.g. DeepSeek, Kimi, custom proxies) - // must yield an empty model list, not an Err. Returning an Err was firing a - // Sentry error for every `inference_list_models` call (TAURI-RUST-1Z, 819 events). - let tmp = tempfile::tempdir().expect("tempdir"); - let endpoint = spawn_models_404_server().await; - - let mut config = Config { - config_path: tmp.path().join("config.toml"), - workspace_dir: tmp.path().join("workspace"), - ..Config::default() - }; - config.secrets.encrypt = false; - config.cloud_providers.push(CloudProviderCreds { - id: "p_custom_test".to_string(), - slug: "custom-no-models".to_string(), - label: "Custom (no /models)".to_string(), - endpoint, - auth_style: AuthStyle::Bearer, - legacy_type: None, - default_model: None, - }); - - let outcome = list_configured_models_from_config("custom-no-models", &config) - .await - .expect("404 from /models must succeed with an empty list"); - - let models = outcome.value["models"] - .as_array() - .expect("response must have a `models` array"); - assert!( - models.is_empty(), - "expected empty model list for a 404 /models endpoint, got: {models:?}" - ); - assert_eq!( - outcome.value["unsupported"], - serde_json::Value::Bool(true), - "unsupported flag must be set to true for 404 providers" - ); - } - #[test] fn factory_backend() { assert!(create_backend_inference_provider( @@ -1680,62 +1580,6 @@ mod tests { } } - // Tests for the rate-limit body suppression guard added to `api_error`. - // Exercises `is_upstream_rate_limit_message` with the exact body shapes that - // produced OPENHUMAN-TAURI-S (~6 984 events from HTTP 500 wrapping a - // "429 rate limit exceeded" body) and OPENHUMAN-TAURI-6Y / -2E. - mod rate_limit_body_suppression { - use crate::core::observability::is_upstream_rate_limit_message; - - /// HTTP 500 with a `"429 rate limit exceeded"` body must be detected - /// as a rate-limit signal so the guard in `api_error` can skip the - /// Sentry report (OPENHUMAN-TAURI-S). - #[test] - fn http_500_with_429_body_phrase_is_rate_limited() { - let body = - r#"{"success":false,"error":"429 rate limit exceeded, please try again later"}"# - .to_ascii_lowercase(); - assert!( - is_upstream_rate_limit_message(&body), - "500-body with '429 rate limit exceeded' must be detected as rate-limited" - ); - } - - /// HTTP 500 with an `"upstream rate limit exceeded"` body - /// (OPENHUMAN-TAURI-6Y shape). - #[test] - fn http_500_with_upstream_rate_limit_body_is_rate_limited() { - let body = r#"{"success":false,"error":"Upstream rate limit exceeded for model 'summarization-v1'. Please retry shortly.","details":{"provider":"gmi"}}"# - .to_ascii_lowercase(); - assert!( - is_upstream_rate_limit_message(&body), - "500-body with 'upstream rate limit exceeded' must be detected" - ); - } - - /// OpenAI / Anthropic `"rate_limit_error"` type body. - #[test] - fn body_with_rate_limit_error_type_is_rate_limited() { - let body = r#"{"error":{"message":"Rate limit exceeded. Please retry after a brief wait.","type":"rate_limit_error"}}"# - .to_ascii_lowercase(); - assert!( - is_upstream_rate_limit_message(&body), - "body with 'rate_limit_error' type must be detected" - ); - } - - /// Unrelated 500 body must NOT be detected as rate-limited. - #[test] - fn http_500_unrelated_body_is_not_rate_limited() { - let body = r#"{"success":false,"error":"internal server error: database unavailable"}"# - .to_ascii_lowercase(); - assert!( - !is_upstream_rate_limit_message(&body), - "unrelated 500 body must not be detected as rate-limited" - ); - } - } - mod provider_access_policy_suppression { use super::*; diff --git a/src/openhuman/memory_store/factories.rs b/src/openhuman/memory_store/factories.rs index 5b745df97d..8e5f056199 100644 --- a/src/openhuman/memory_store/factories.rs +++ b/src/openhuman/memory_store/factories.rs @@ -51,13 +51,13 @@ fn report_ollama_health_gate_once(base_url: &str, model: &str) -> bool { let sentry_message = format!( "ollama embeddings opted-in but daemon unreachable at {base_url}; falling back to cloud embeddings for this session" ); - // Route through `report_error_or_expected` so the `expected_error_kind` - // classifier runs. The wire shape `"ollama embeddings opted-in but daemon - // unreachable at …"` matches `is_ollama_user_config_rejection` → routes to - // `ExpectedErrorKind::ProviderUserState` → demoted to a warn breadcrumb, - // NOT a Sentry error event. Using `report_error_message` directly would - // bypass the classifier and fire `sentry::capture_message(…, Level::Error)` - // unconditionally — the root cause of TAURI-RUST-B (472 events). + // Route through `report_error_or_expected` so the GX arm of + // `is_ollama_user_config_rejection` in `expected_error_kind` demotes + // the message to an info breadcrumb (user-state: ollama daemon not + // running). Direct `report_error_message` here bypassed the classifier + // and produced TAURI-RUST-B (~409 events). The `&str` input avoids + // the `format!("{:#}")` round-trip that `report_error` would do on an + // anyhow chain — the wire shape stays bit-identical. crate::core::observability::report_error_or_expected( sentry_message.as_str(), "memory", @@ -766,25 +766,4 @@ mod tests { "different URL also suppressed — gate is process-scoped, not per-URL" ); } - - /// TAURI-RUST-B (issue #2921): the exact wire shape produced by - /// `report_ollama_health_gate_once` must classify as - /// `ExpectedErrorKind::ProviderUserState` so `report_error_or_expected` - /// routes it to a warn breadcrumb rather than a Sentry error event. - /// - /// Previously the gate called `report_error_message` directly, bypassing - /// the classifier and firing `sentry::capture_message(…, Level::Error)` - /// unconditionally for every process restart where Ollama was opted-in but - /// not running (472 events at time of fix). The fix routes through - /// `report_error_or_expected` which checks `expected_error_kind` first. - #[test] - fn tauri_rust_b_wire_shape_classifies_as_expected() { - // Canonical format produced by `report_ollama_health_gate_once`. - let msg = "ollama embeddings opted-in but daemon unreachable at http://localhost:11434; falling back to cloud embeddings for this session"; - assert_eq!( - crate::core::observability::expected_error_kind(msg), - Some(crate::core::observability::ExpectedErrorKind::ProviderUserState), - "TAURI-RUST-B — daemon-unreachable health-gate message must demote to ProviderUserState, not fire as Sentry error" - ); - } } diff --git a/src/openhuman/subconscious/store.rs b/src/openhuman/subconscious/store.rs index 0e2a850ae1..23b3df9c5b 100644 --- a/src/openhuman/subconscious/store.rs +++ b/src/openhuman/subconscious/store.rs @@ -2,17 +2,6 @@ //! //! Follows the cron module's `with_connection` pattern: opens the database, //! runs DDL on every connection, and provides pure functions. -//! -//! ## Init-failure noise suppression (TAURI-RUST-A) -//! -//! `with_connection` runs the schema DDL on every call. Transient -//! `SQLITE_BUSY` / `SQLITE_LOCKED` errors (e.g. concurrent in-process RPC -//! calls, antivirus hold, network-drive WAL rejection) are handled by a -//! per-connection busy timeout (5 s) plus an application-level retry loop -//! (3 retries, 100 / 300 / 900 ms backoff). Only `DatabaseBusy` / -//! `DatabaseLocked` errors are retried — schema or corruption errors fail -//! through immediately so Sentry captures a real root cause rather than a -//! transient noise event. use anyhow::{Context, Result}; use rusqlite::{Connection, OptionalExtension}; @@ -72,8 +61,9 @@ pub fn with_connection( let db_path = workspace_dir.join("subconscious").join("subconscious.db"); if let Some(parent) = db_path.parent() { std::fs::create_dir_all(parent) - .with_context(|| format!("create subconscious dir: {}", parent.display()))?; + .with_context(|| format!("failed to create subconscious dir: {}", parent.display()))?; } + let conn = open_and_initialize_with_retry(&db_path)?; f(&conn) }