Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions src/core/observability.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,24 @@ pub enum ExpectedErrorKind {
/// ~56 events/hour, all from `openhuman.agent_chat` via
/// `local_ai.ops.agent_chat`).
PromptInjectionBlocked,
/// The request exceeded the model's context window — the
/// conversation/prompt is too long for the configured model. A
/// deterministic user-state / usage condition; the remediation is
/// "start a new chat, trim the conversation, or pick a larger-context
/// model", which the UI surfaces. Sentry has no signal to act on.
///
/// The provider HTTP layer (`providers::ops::api_error`) suppresses its
/// own per-attempt event for this condition, and
/// `providers::reliable` marks it non-retryable. This arm catches the
/// **re-report** when the same error is raised again by
/// `agent.run_single` / `web_channel.run_chat_task` under a different
/// `domain` tag (same two-emit-site shape as the empty-response and
/// session-expired fixes). Delegates to the single-source matcher
/// [`crate::openhuman::inference::provider::is_context_window_exceeded_message`]
/// so the retry classifier, the api_error cascade, and this arm can't
/// drift. Drops Sentry TAURI-RUST-501
/// (`Context size has been exceeded`, custom-provider 500).
ContextWindowExceeded,
/// The memory-store chunk DB's per-path circuit breaker is currently open
/// because too many consecutive SQLite init attempts failed. This is the
/// breaker doing its job — it opened *after* the underlying transient
Expand Down Expand Up @@ -235,6 +253,14 @@ pub fn expected_error_kind(message: &str) -> Option<ExpectedErrorKind> {
if is_prompt_injection_blocked_message(&lower) {
return Some(ExpectedErrorKind::PromptInjectionBlocked);
}
// Context-window-exceeded re-report from a higher layer (agent /
// web_channel). The provider api_error cascade suppresses its own
// emit; this catches the re-raise. Delegates to the single-source
// provider matcher so the phrasing can't drift. Placed after the
// matchers above so a more specific match there always wins (it is not
// the final check: the memory-store breaker matcher still follows).
if crate::openhuman::inference::provider::is_context_window_exceeded_message(message) {
return Some(ExpectedErrorKind::ContextWindowExceeded);
}
if is_memory_store_breaker_open(&lower) {
return Some(ExpectedErrorKind::MemoryStoreBreakerOpen);
}
Expand Down Expand Up @@ -1037,6 +1063,21 @@ fn report_expected_message(kind: ExpectedErrorKind, message: &str, domain: &str,
"[observability] {domain}.{operation} skipped expected prompt-injection-blocked error"
);
}
ExpectedErrorKind::ContextWindowExceeded => {
// Request too long for the model's context window. The provider
// api_error cascade already demotes its own emit; this is the
// higher-layer re-report. Deterministic user-state — the UI
// shows the retry message and the user trims / starts a new
// chat. Demote to `warn!` (breadcrumb only) — same tier as the
// other usage-state conditions.
tracing::warn!(
domain = domain,
operation = operation,
kind = "context_window_exceeded",
error = %message,
"[observability] {domain}.{operation} skipped expected context-window-exceeded error: {message}"
);
}
ExpectedErrorKind::DiskFull => {
// Host filesystem out of space. The user must free space on
// their machine — Sentry can't help. Demote at `warn!` so a
Expand Down Expand Up @@ -1723,6 +1764,57 @@ mod tests {
);
}

// ── ContextWindowExceeded (TAURI-RUST-501) ─────────────────────────────

#[test]
fn classifies_context_window_exceeded_rereport() {
    let expected = Some(ExpectedErrorKind::ContextWindowExceeded);

    // TAURI-RUST-501: a custom provider reports the overflow as a 500 with
    // the "Context size has been exceeded." phrasing. The classifier sees
    // the full re-raised message (provider prefix + JSON body), so the
    // match has to work on that whole string.
    let tauri_rust_501 = "custom API error (500 Internal Server Error): \
        {\"error\":{\"code\":500,\"message\":\"Context size has been exceeded.\",\"type\":\"server_error\"}}";
    assert_eq!(expected_error_kind(tauri_rust_501), expected);

    // Phrasings that were already recognized as context overflow must land
    // in the same bucket, since the classifier delegates to the shared
    // provider matcher.
    let established_phrasings = [
        "OpenAI API error (400): This model's maximum context length is 8192 tokens",
        "request exceeds the context window of this model",
        "context length exceeded",
        "prompt is too long",
    ];
    for raw in established_phrasings {
        assert_eq!(
            expected_error_kind(raw),
            expected,
            "should classify as context-window-exceeded: {raw}"
        );
    }
}

#[test]
fn does_not_classify_unrelated_messages_as_context_window_exceeded() {
    // Negative cases: each message shares a word with the overflow
    // phrasings ("exceeded", "context", "window") without describing a
    // context overflow, so the classifier must return `None` for all of
    // them.
    let unrelated_messages = [
        "rate limit exceeded, retry after 30s",
        "failed to open context menu window",
        "tool call exceeded the allowed budget",
    ];
    for raw in unrelated_messages {
        let classified = expected_error_kind(raw);
        assert_eq!(
            classified,
            None,
            "must NOT classify as context-window-exceeded: {raw}"
        );
    }
}

#[test]
fn classifies_memory_store_breaker_open() {
// TAURI-RUST-52X (~455 events on self-hosted Sentry): the chunk-store
Expand Down
187 changes: 187 additions & 0 deletions src/openhuman/inference/provider/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,101 @@ pub(super) fn log_provider_config_rejection(
);
}

/// Returns `true` when a provider error body says the request overflowed the
/// model's context window (the conversation/prompt is too long for the
/// configured model).
///
/// This is a user-state / usage condition rather than a product bug: the
/// remedy is to start a new chat, trim the conversation, or pick a
/// larger-context model, so there is nothing for Sentry to act on.
///
/// # Callers
///
/// This function is the single source of truth for the overflow phrasing and
/// is shared by:
/// - the non-retryable classifier in `reliable` (re-sending the same
///   oversized request cannot succeed),
/// - the Sentry-suppression cascade in [`api_error`], and
/// - the `ContextWindowExceeded` classifier in `core::observability`, which
///   handles the re-report raised by higher layers.
///
/// # Matching rules
///
/// The comparison is ASCII case-insensitive and looks only at the text, never
/// at an HTTP status. Providers disagree on the status for this condition:
/// most send a `400`, while some custom / self-hosted gateways send a `500`
/// (Sentry TAURI-RUST-501, `"Context size has been exceeded."`).
///
/// Because a false positive would both hide an error from Sentry and mark a
/// retryable error as permanent, phrases are split into two tiers:
///
/// - The `CONTEXT_HINTS` phrases ("context window", "context length",
///   "prompt is too long", ...) only describe request-size overflow, so any
///   one of them is sufficient.
/// - The `TOKEN_HINTS` phrases ("too many tokens", "token limit exceeded")
///   also appear in tokens-per-minute *rate* limit errors, which are
///   transient and must stay retryable. They count only when the body
///   contains none of the `RATE_LIMIT_MARKERS`.
pub fn is_context_window_exceeded_message(body: &str) -> bool {
    // Tier 1: unambiguous request-size / context phrases.
    const CONTEXT_HINTS: &[&str] = &[
        "exceeds the context window",
        "context window of this model",
        "maximum context length",
        "context length exceeded",
        "context size has been exceeded",
        "prompt is too long",
        "input is too long",
    ];
    // Tier 2: token-count phrases, ambiguous with tokens-per-minute limits.
    const TOKEN_HINTS: &[&str] = &["too many tokens", "token limit exceeded"];
    // Any of these alongside a token-count phrase means "rate limit", not
    // "request too large".
    const RATE_LIMIT_MARKERS: &[&str] = &[
        "per minute",
        "per min",
        "rate limit",
        "rate_limit",
        "tpm",
        "requests per",
        "retry after",
        "try again in",
    ];

    let haystack = body.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| haystack.contains(needle));

    // A context phrase decides the outcome on its own, even if a rate-limit
    // marker happens to be present as well.
    if mentions_any(CONTEXT_HINTS) {
        return true;
    }

    // Otherwise a token-count phrase counts only without a rate-limit marker,
    // so a transient TPM 429 is neither silenced nor treated as permanent.
    mentions_any(TOKEN_HINTS) && !mentions_any(RATE_LIMIT_MARKERS)
}

/// Emits the `warn!` breadcrumb for a provider response that was classified
/// as context-window exceeded.
///
/// This function only logs. The event is tagged `domain = "llm_provider"`,
/// `failure = "non_2xx"` and `kind = "context_window_exceeded"`, together with
/// the caller-supplied `operation`, `provider`, `model` and numeric `status`.
///
/// * `operation` - name of the calling operation, also used in the message.
/// * `provider` - provider identifier recorded on the event.
/// * `model` - model name if known; `None` is recorded as an empty string.
/// * `status` - HTTP status of the failed response.
pub(super) fn log_context_window_exceeded(
    operation: &str,
    provider: &str,
    model: Option<&str>,
    status: reqwest::StatusCode,
) {
    // The field always exists on the event; a missing model becomes "".
    let model_tag = model.unwrap_or_default();
    // The structured field carries the bare numeric code, while the message
    // text below still renders `status` through its `Display` impl.
    let status_code = status.as_u16();

    tracing::warn!(
        domain = "llm_provider",
        operation = operation,
        provider = provider,
        model = model_tag,
        status = status_code,
        failure = "non_2xx",
        kind = "context_window_exceeded",
        "[llm_provider] {operation} context-window exceeded ({status}) — \
         request too long for the model, not reporting to Sentry"
    );
}

/// Build a sanitized provider error from a failed HTTP response.
///
/// Reports the failure to Sentry with `provider` and `status` tags so
Expand Down Expand Up @@ -647,6 +742,10 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E
is_custom_openai_upstream_bad_request_http_400(provider, status, &body);
let is_provider_access_policy_denied = is_provider_access_policy_denied_http_403(status, &body);
let is_provider_config_rejection = is_provider_config_rejection_http(status, provider, &body);
// Context-overflow is status-agnostic: match the body directly (some
// custom gateways mis-report it as 500 — TAURI-RUST-501 — so a status
// gate would let those through to `should_report_provider_http_failure`).
let is_context_window_exceeded = is_context_window_exceeded_message(&body);

if is_auth_failure && is_backend {
tracing::warn!(
Expand Down Expand Up @@ -675,6 +774,8 @@ pub async fn api_error(provider: &str, response: reqwest::Response) -> anyhow::E
log_provider_access_policy_denied_http_403("api_error", provider, None, status);
} else if is_provider_config_rejection {
log_provider_config_rejection("api_error", provider, None, status);
} else if is_context_window_exceeded {
log_context_window_exceeded("api_error", provider, None, status);
} else if should_report_provider_http_failure(status) {
crate::core::observability::report_error(
message.as_str(),
Expand Down Expand Up @@ -1528,6 +1629,92 @@ mod tests {
}
}

/// Tests for the shared context-overflow matcher and its logging helper.
mod context_window_exceeded_suppression {
    use super::*;

    #[test]
    fn classifies_tauri_rust_501_custom_provider_500_body() {
        // TAURI-RUST-501: the wire body a custom provider sends with a 500.
        // The matcher ignores the status, so this mis-reported overflow is
        // still recognized (and `api_error` routes it to
        // `log_context_window_exceeded` rather than `report_error`).
        let wire_body = "{\"error\":{\"code\":500,\"message\":\"Context size has been exceeded.\",\"type\":\"server_error\"}}";
        assert!(is_context_window_exceeded_message(wire_body));
    }

    #[test]
    fn classifies_established_context_overflow_phrasings() {
        // Every phrasing the `reliable` non-retryable classifier knew before
        // it started delegating must still match through the shared matcher.
        let established = [
            "This model's maximum context length is 8192 tokens",
            "request exceeds the context window of this model",
            "context length exceeded",
            "too many tokens in the prompt",
            "token limit exceeded",
            "prompt is too long for the selected model",
            "input is too long",
        ];
        for body in established {
            let matched = is_context_window_exceeded_message(body);
            assert!(matched, "should match context-overflow body: {body}");
        }
    }

    #[test]
    fn does_not_match_unrelated_bodies() {
        let unrelated = [
            "rate limit exceeded, retry after 30s",
            "Invalid request: model not found",
            "Insufficient budget",
            "tool call exceeded the allowed budget",
        ];
        for body in unrelated {
            let matched = is_context_window_exceeded_message(body);
            assert!(!matched, "must NOT match unrelated body: {body}");
        }
    }

    #[test]
    fn token_rate_limits_are_not_context_overflow() {
        // Token-count wording also shows up in tokens-per-minute RATE limit
        // errors. Those are transient 429s: they have to stay retryable and
        // keep reaching Sentry, so the rate-limit marker in the body must
        // stop them from matching (CodeRabbit review of #2820).
        let tpm_rate_limits = [
            "Rate limit reached: too many tokens per minute (TPM) for this org",
            "rate_limit_exceeded: token limit exceeded, retry after 12s",
            "You have hit too many tokens per min; try again in 30s",
        ];
        for body in tpm_rate_limits {
            let matched = is_context_window_exceeded_message(body);
            assert!(
                !matched,
                "TPM rate-limit must NOT match as context overflow: {body}"
            );
        }

        // The same token-count wording without any rate marker is a real
        // overflow and still matches.
        let plain_overflow = "Request rejected: too many tokens in the input for this model";
        assert!(is_context_window_exceeded_message(plain_overflow));
    }

    #[test]
    fn log_helper_runs_without_panicking() {
        // Smoke test for the demotion path `api_error` takes; unit tests
        // run without a tracing subscriber installed.
        let status = reqwest::StatusCode::INTERNAL_SERVER_ERROR;
        log_context_window_exceeded("api_error", "custom_openai", None, status);
    }
}

#[test]
fn test_sanitize_api_error_utf8() {
let input = "🦀".repeat(MAX_API_ERROR_CHARS + 10);
Expand Down
18 changes: 5 additions & 13 deletions src/openhuman/inference/provider/reliable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,11 @@ fn is_stream_error_non_retryable(err: &StreamError) -> bool {
}

/// Returns `true` when the rendered message of `err` describes a
/// context-window overflow.
///
/// The error is formatted with `to_string()` and handed to the shared
/// matcher; no phrase list is kept here.
fn is_context_window_exceeded(err: &anyhow::Error) -> bool {
    // Single source of truth for the context-overflow phrasing lives in
    // `ops::is_context_window_exceeded_message` so the non-retryable
    // classifier here, the `api_error` Sentry-suppression cascade, and the
    // `core::observability` `ContextWindowExceeded` arm can't drift apart.
    super::is_context_window_exceeded_message(&err.to_string())
}

/// Detect provider-side temporary capacity/outage errors. Covers:
Expand Down
Loading