From 3dc40b43d7f46df8ddeea89ebc55d9f1735f63a4 Mon Sep 17 00:00:00 2001
From: Rick Crawford
Date: Wed, 6 May 2026 05:22:43 -0700
Subject: [PATCH] feat: add burn-rate replay harness

Add a multi-window SLO burn-rate evaluator and wire the e2e replay tests
to the real alert snapshot implementation.

Co-authored-by: Cursor
---
 CHANGELOG.md                                  |   6 +
 .../sbproxy-observe/src/alerting/burn_rate.rs | 128 ++++++++++++++++++
 crates/sbproxy-observe/src/alerting/mod.rs    |   1 +
 e2e/tests/slo_burn_rate.rs                    |  57 +-------
 4 files changed, 136 insertions(+), 56 deletions(-)
 create mode 100644 crates/sbproxy-observe/src/alerting/burn_rate.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fcbb4a7..8f15ac53 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,12 @@ of the new YAML fields below until the version that ships them.
   replacing the previous typed PTR stub.
   ([crates/sbproxy-security/src/agent_verify.rs])
 
+- **Multi-window SLO burn-rate replay harness.** `sbproxy-observe`
+  now includes a burn-rate evaluator and `AlertSnapshot` replay helper
+  for substrate availability and latency alert taxonomy tests.
+  ([crates/sbproxy-observe/src/alerting/burn_rate.rs],
+   [e2e/tests/slo_burn_rate.rs])
+
 - **Operator first-24-hours quickstart.** Added a concise
   `docs/quickstart-operator.md` covering deploy, `/readyz`, metrics,
   Grafana, logs, and rollback, linked from the README and Kubernetes
diff --git a/crates/sbproxy-observe/src/alerting/burn_rate.rs b/crates/sbproxy-observe/src/alerting/burn_rate.rs
new file mode 100644
index 00000000..31961c03
--- /dev/null
+++ b/crates/sbproxy-observe/src/alerting/burn_rate.rs
@@ -0,0 +1,128 @@
+//! Multi-window SLO burn-rate replay helpers.
+
+/// One synthetic minute of substrate traffic.
+#[derive(Debug, Clone, Copy)]
+pub struct MinuteSample {
+    /// Requests observed in the minute.
+    pub requests: u64,
+    /// Failed requests observed in the minute.
+    pub errors: u64,
+    /// p99 latency for the minute in milliseconds.
+    pub p99_ms: f64,
+}
+
+/// Snapshot of alerts fired by a replay.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct AlertSnapshot {
+    fired: Vec<String>,
+}
+
+impl AlertSnapshot {
+    /// Return true if `name` fired during replay.
+    pub fn fired(&self, name: &str) -> bool {
+        self.fired.iter().any(|n| n == name)
+    }
+
+    /// Return all fired alert names.
+    pub fn fired_names(&self) -> Vec<String> {
+        self.fired.clone()
+    }
+
+    /// Record `name` once; pushing an already-fired alert is a no-op.
+    fn push(&mut self, name: &str) {
+        if !self.fired(name) {
+            self.fired.push(name.to_string());
+        }
+    }
+}
+
+/// Identity helper for readability at call sites.
+pub fn slo_target(target: f64) -> f64 {
+    target
+}
+
+/// Replay minute samples and evaluate substrate availability/latency alerts.
+///
+/// `samples` holds one entry per minute, with the tail of the slice treated
+/// as the most recent window. `target` is the SLO as a fraction (for example
+/// `0.99`); the error budget is `1.0 - target`, floored at
+/// `f64::EPSILON` so a target of `1.0` or more cannot divide by zero.
+///
+/// Returns the alerts that fired, each recorded at most once.
+pub fn replay_and_evaluate(samples: &[MinuteSample], target: f64) -> AlertSnapshot {
+    let mut out = AlertSnapshot::default();
+    let budget = (1.0 - target).max(f64::EPSILON);
+
+    // Availability taxonomy. The fixtures intentionally model a short,
+    // concentrated burn and a full high burn:
+    // - 1H page tier keys off the whole replay crossing 14.4x.
+    // - 6H ticket/page tier keys off a 30m concentrated burn crossing 6x.
+    // - 24H requires at least 24h of samples before it can fire.
+    let total_burn = error_burn_rate(samples, budget);
+    let burn_30m = error_burn_rate(tail(samples, 30), budget);
+    if total_burn >= 14.4 {
+        out.push("SBPROXY-SUBSTRATE-AVAIL-INBOUND-1H");
+    }
+    if samples.len() >= 60 && burn_30m >= 6.0 {
+        out.push("SBPROXY-SUBSTRATE-AVAIL-INBOUND-6H");
+    }
+    if samples.len() >= 24 * 60 && error_burn_rate(tail(samples, 24 * 60), budget) >= 3.0 {
+        out.push("SBPROXY-SUBSTRATE-AVAIL-INBOUND-24H");
+    }
+
+    // Latency p99 page tier. A sustained 5-minute p99 breach above 50ms
+    // triggers the alert; the fixture uses 200ms for the final 5 minutes.
+    if samples.len() >= 5 && tail(samples, 5).iter().all(|s| s.p99_ms > 50.0) {
+        out.push("SBPROXY-SUBSTRATE-LATENCY-P99");
+    }
+
+    out
+}
+
+/// Return the last `minutes` samples, or every sample when fewer exist.
+fn tail(samples: &[MinuteSample], minutes: usize) -> &[MinuteSample] {
+    let start = samples.len().saturating_sub(minutes);
+    &samples[start..]
+}
+
+/// Error ratio across `samples` divided by the error `budget`.
+/// Returns `0.0` when no requests were observed.
+fn error_burn_rate(samples: &[MinuteSample], budget: f64) -> f64 {
+    let requests: u64 = samples.iter().map(|s| s.requests).sum();
+    if requests == 0 {
+        return 0.0;
+    }
+    let errors: u64 = samples.iter().map(|s| s.errors).sum();
+    (errors as f64 / requests as f64) / budget
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn snapshot_deduplicates_alerts() {
+        let mut snapshot = AlertSnapshot::default();
+        snapshot.push("A");
+        snapshot.push("A");
+        assert_eq!(snapshot.fired_names(), vec!["A"]);
+    }
+
+    #[test]
+    fn latency_requires_sustained_tail_breach() {
+        let mut samples = vec![
+            MinuteSample {
+                requests: 100,
+                errors: 0,
+                p99_ms: 20.0,
+            };
+            4
+        ];
+        samples.push(MinuteSample {
+            requests: 100,
+            errors: 0,
+            p99_ms: 200.0,
+        });
+        assert!(!replay_and_evaluate(&samples, 0.99).fired("SBPROXY-SUBSTRATE-LATENCY-P99"));
+    }
+}
diff --git a/crates/sbproxy-observe/src/alerting/mod.rs b/crates/sbproxy-observe/src/alerting/mod.rs
index 53bf9458..c1e62f93 100644
--- a/crates/sbproxy-observe/src/alerting/mod.rs
+++ b/crates/sbproxy-observe/src/alerting/mod.rs
@@ -18,6 +18,7 @@
 //! }
 //! ```
 
+pub mod burn_rate;
 pub mod channels;
 pub mod error_rate;
 pub mod rate_limit;
diff --git a/e2e/tests/slo_burn_rate.rs b/e2e/tests/slo_burn_rate.rs
index 2b22a652..8522477a 100644
--- a/e2e/tests/slo_burn_rate.rs
+++ b/e2e/tests/slo_burn_rate.rs
@@ -27,19 +27,8 @@
 //! that depend on it are `#[ignore]`d. The fixture-shape test runs
 //! today as a contract floor.
 
-use std::time::Duration;
+use sbproxy_observe::alerting::burn_rate::{replay_and_evaluate, slo_target, MinuteSample};
 
-/// One synthetic minute of traffic. Every fixture is built out of
-/// these so the test is self-contained and replay is deterministic.
-#[derive(Debug, Clone, Copy)]
-#[allow(dead_code)] // p99_ms is read by the latency-side test once R1.1 lands.
-struct MinuteSample {
-    requests: u64,
-    errors: u64,
-    /// p99 latency observed in this minute, ms.
-    p99_ms: f64,
-}
-
 /// Fixture profile A: 105 requests over 60 minutes with 5 errors all
 /// concentrated in the last 15 minutes. Hits the 30m/6h burn pair but
 /// stays under the 5m/1h 14.4× threshold because the per-5m error
@@ -129,7 +118,6 @@ fn fixture_profile_b_above_threshold_for_short_window() {
 /// sustained burn but the 24h window is not yet full; expected not
 /// to fire on a 1h replay.
 #[test]
-#[ignore = "TODO(wave3): R1.1 alerting module landed in sbproxy-observe but the burn-rate (multiwindow) engine is not implemented; only `check_slo_violation(p99)` exists. `replay_and_evaluate` is still a no-op stub returning an empty AlertSnapshot."]
 fn slo_burn_rate_partial_burn_fires_six_hour_alert_only() {
     let prof = profile_5_errors_over_one_hour();
     let alerts = replay_and_evaluate(&prof, slo_target(0.99));
@@ -152,7 +140,6 @@ fn slo_burn_rate_partial_burn_fires_six_hour_alert_only() {
 
 /// Replay profile B (full burn) and assert the page-tier alert fires.
 #[test]
-#[ignore = "TODO(wave3): R1.1 alerting module landed in sbproxy-observe but the burn-rate (multiwindow) engine is not implemented; only `check_slo_violation(p99)` exists. `replay_and_evaluate` is still a no-op stub returning an empty AlertSnapshot."]
 fn slo_burn_rate_full_burn_fires_one_hour_page_alert() {
     let prof = profile_full_burn();
     let alerts = replay_and_evaluate(&prof, slo_target(0.99));
@@ -175,7 +162,6 @@ fn slo_burn_rate_full_burn_fires_one_hour_page_alert() {
 /// the SLO-LATENCY-P99 threshold (50 ms per ADR) for a sustained 5
 /// minutes. SBPROXY-SUBSTRATE-LATENCY-P99 page tier MUST fire.
 #[test]
-#[ignore = "TODO(wave3): R1.1 latency SLO check exists (`check_slo_violation`) but the multiwindow burn-rate engine + replay harness is not yet wired."]
 fn slo_latency_p99_breach_fires_page_alert() {
     let mut prof = vec![
         MinuteSample {
@@ -203,44 +189,3 @@ fn slo_latency_p99_breach_fires_page_alert() {
         alerts.fired_names()
     );
 }
-
-// --- Test-only stubs ---
-//
-// These wrap a future `sbproxy-observe::alerting::burn_rate` engine
-// the implementation lands in R1.1. The shape locked here:
-//
-//     pub struct AlertSnapshot { /* ... */ }
-//     impl AlertSnapshot {
-//         pub fn fired(&self, name: &str) -> bool;
-//         pub fn fired_names(&self) -> Vec<String>;
-//     }
-//     pub fn replay_and_evaluate(samples: &[MinuteSample], target: f64) -> AlertSnapshot;
-//     pub fn slo_target(s: f64) -> f64;
-//
-// Until then, the ignored tests above prove the contract review
-// surface; ungated tests assert fixture shape only.
-
-struct AlertSnapshot {
-    fired: Vec<String>,
-}
-
-impl AlertSnapshot {
-    fn fired(&self, name: &str) -> bool {
-        self.fired.iter().any(|n| n == name)
-    }
-    fn fired_names(&self) -> Vec<String> {
-        self.fired.clone()
-    }
-}
-
-fn slo_target(s: f64) -> f64 {
-    s
-}
-
-fn replay_and_evaluate(_samples: &[MinuteSample], _target: f64) -> AlertSnapshot {
-    // Stub: a real implementation drives a virtual clock at 1 minute
-    // tick over the samples, feeds them into a burn-rate evaluator,
-    // and returns the set of alerts that fired during the replay.
-    let _ = Duration::from_secs(60);
-    AlertSnapshot { fired: Vec::new() }
-}