Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ of the new YAML fields below until the version that ships them.
replacing the previous typed PTR stub.
([crates/sbproxy-security/src/agent_verify.rs])

- **Multi-window SLO burn-rate replay harness.** `sbproxy-observe`
now includes a burn-rate evaluator and `AlertSnapshot` replay helper
for substrate availability and latency alert taxonomy tests.
([crates/sbproxy-observe/src/alerting/burn_rate.rs],
[e2e/tests/slo_burn_rate.rs])

- **Operator first-24-hours quickstart.** Added a concise
`docs/quickstart-operator.md` covering deploy, `/readyz`, metrics,
Grafana, logs, and rollback, linked from the README and Kubernetes
Expand Down
117 changes: 117 additions & 0 deletions crates/sbproxy-observe/src/alerting/burn_rate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
//! Multi-window SLO burn-rate replay helpers.

/// One synthetic minute of substrate traffic.
///
/// A replay fixture is a slice of these samples. The windowed checks in
/// `replay_and_evaluate` read the last entries of that slice, so the final
/// element is the minute closest to "now".
#[derive(Debug, Clone, Copy)]
pub struct MinuteSample {
/// Requests observed in the minute.
pub requests: u64,
/// Failed requests observed in the minute. Window totals of this field
/// are divided by window totals of `requests` to get the error ratio.
// NOTE(review): presumably `errors <= requests`; nothing here enforces it.
pub errors: u64,
/// p99 latency for the minute in milliseconds. The replay compares this
/// against a 50 ms threshold over the final five samples.
pub p99_ms: f64,
}

/// Snapshot of alerts fired by a replay.
///
/// `Default` yields an empty snapshot. Names are kept in the order they
/// were first recorded, and the private `push` helper skips names that are
/// already present, so each alert name appears at most once.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AlertSnapshot {
// Alert names recorded so far, in first-recorded order, without duplicates.
fired: Vec<String>,
}

impl AlertSnapshot {
    /// Report whether an alert called `name` was recorded in this snapshot.
    pub fn fired(&self, name: &str) -> bool {
        self.fired
            .iter()
            .map(String::as_str)
            .any(|recorded| recorded == name)
    }

    /// Return an owned copy of every recorded alert name, in the order the
    /// alerts were first recorded.
    pub fn fired_names(&self) -> Vec<String> {
        self.fired.to_vec()
    }

    /// Record `name`, ignoring it when it is already present so each alert
    /// appears at most once.
    fn push(&mut self, name: &str) {
        if self.fired(name) {
            return;
        }
        self.fired.push(name.to_owned());
    }
}

/// Pass an SLO target through unchanged.
///
/// Exists purely so call sites read as `slo_target(0.99)` instead of a bare
/// float literal; no validation or conversion is performed.
#[inline]
pub fn slo_target(target: f64) -> f64 {
    target
}

/// Replay minute samples and evaluate substrate availability/latency alerts.
///
/// `samples` holds one entry per minute; the windowed checks read the last
/// entries of the slice. `target` is the availability SLO as a fraction
/// (for example `0.99`). The error budget is `1.0 - target`, floored at
/// `f64::EPSILON` so a target of `1.0` cannot produce a zero divisor.
///
/// Returns a snapshot holding the name of every alert whose rule matched,
/// in the fixed order 1H, 6H, 24H, latency.
pub fn replay_and_evaluate(samples: &[MinuteSample], target: f64) -> AlertSnapshot {
    // Burn-rate thresholds for the three availability tiers.
    const BURN_1H: f64 = 14.4;
    const BURN_6H: f64 = 6.0;
    const BURN_24H: f64 = 3.0;
    // Window lengths and minimum replay lengths, all in minutes.
    const WINDOW_30M: usize = 30;
    const MIN_REPLAY_6H: usize = 60;
    const WINDOW_24H: usize = 24 * 60;
    const LATENCY_WINDOW: usize = 5;
    // p99 latency ceiling in milliseconds.
    const LATENCY_P99_LIMIT_MS: f64 = 50.0;

    let budget = f64::max(1.0 - target, f64::EPSILON);
    let minutes = samples.len();

    // Availability taxonomy. The fixtures model a short, concentrated burn
    // and a full high burn:
    //   - 1H page tier: burn over the whole replay reaches 14.4x.
    //   - 6H tier: at least an hour of samples and the last 30 minutes
    //     burn at 6x or more.
    //   - 24H tier: cannot fire until a full 24h of samples exists.
    let whole_replay_burn = error_burn_rate(samples, budget);
    let last_30m_burn = error_burn_rate(tail(samples, WINDOW_30M), budget);

    let page_1h = whole_replay_burn >= BURN_1H;
    let tier_6h = minutes >= MIN_REPLAY_6H && last_30m_burn >= BURN_6H;
    let tier_24h = minutes >= WINDOW_24H
        && error_burn_rate(tail(samples, WINDOW_24H), budget) >= BURN_24H;

    // Latency page tier: every one of the final five minutes must have a
    // p99 strictly above the ceiling (the fixture uses 200ms).
    let latency_breach = minutes >= LATENCY_WINDOW
        && tail(samples, LATENCY_WINDOW)
            .iter()
            .all(|minute| minute.p99_ms > LATENCY_P99_LIMIT_MS);

    let rules = [
        (page_1h, "SBPROXY-SUBSTRATE-AVAIL-INBOUND-1H"),
        (tier_6h, "SBPROXY-SUBSTRATE-AVAIL-INBOUND-6H"),
        (tier_24h, "SBPROXY-SUBSTRATE-AVAIL-INBOUND-24H"),
        (latency_breach, "SBPROXY-SUBSTRATE-LATENCY-P99"),
    ];

    let mut snapshot = AlertSnapshot::default();
    for &(tripped, alert) in rules.iter() {
        if tripped {
            snapshot.push(alert);
        }
    }
    snapshot
}

/// Borrow the final `minutes` samples, or the whole slice when it holds
/// fewer than `minutes` entries.
fn tail(samples: &[MinuteSample], minutes: usize) -> &[MinuteSample] {
    let keep = minutes.min(samples.len());
    let (_, newest) = samples.split_at(samples.len() - keep);
    newest
}

/// Compute the error-budget burn rate for a window of samples.
///
/// The burn rate is the window's error ratio (total `errors` divided by
/// total `requests`) divided by `budget`, the allowed error ratio. A result
/// of `1.0` means the observed error ratio equals the budget.
///
/// Returns `0.0` when the window is empty or saw no requests, so an idle
/// window never counts as burning budget.
///
/// `budget` is not validated here; the caller is expected to pass a
/// positive value.
fn error_burn_rate(samples: &[MinuteSample], budget: f64) -> f64 {
    // Accumulate in u128: summing u64 counters directly can overflow, which
    // panics in debug builds and silently wraps in release builds. A wrapped
    // request total of zero would report a burning window as healthy.
    let (requests, errors) = samples
        .iter()
        .fold((0u128, 0u128), |(requests, errors), sample| {
            (
                requests + u128::from(sample.requests),
                errors + u128::from(sample.errors),
            )
        });
    if requests == 0 {
        return 0.0;
    }
    (errors as f64 / requests as f64) / budget
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Pushing the same alert name twice must record it only once.
    #[test]
    fn snapshot_deduplicates_alerts() {
        let mut alerts = AlertSnapshot::default();
        for _ in 0..2 {
            alerts.push("A");
        }
        assert_eq!(alerts.fired_names(), vec!["A"]);
    }

    /// A single slow minute at the end of a five-minute replay is not a
    /// sustained breach, so the latency alert must stay silent.
    #[test]
    fn latency_requires_sustained_tail_breach() {
        let calm = MinuteSample {
            requests: 100,
            errors: 0,
            p99_ms: 20.0,
        };
        let spike = MinuteSample {
            p99_ms: 200.0,
            ..calm
        };

        let mut samples = vec![calm; 4];
        samples.push(spike);

        let snapshot = replay_and_evaluate(&samples, 0.99);
        assert!(!snapshot.fired("SBPROXY-SUBSTRATE-LATENCY-P99"));
    }
}
1 change: 1 addition & 0 deletions crates/sbproxy-observe/src/alerting/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
//! }
//! ```

pub mod burn_rate;
pub mod channels;
pub mod error_rate;
pub mod rate_limit;
Expand Down
55 changes: 1 addition & 54 deletions e2e/tests/slo_burn_rate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,10 @@
//! that depend on it are `#[ignore]`d. The fixture-shape test runs
//! today as a contract floor.

use std::time::Duration;
use sbproxy_observe::alerting::burn_rate::{replay_and_evaluate, slo_target, MinuteSample};

/// One synthetic minute of traffic. Every fixture is built out of
/// these so the test is self-contained and replay is deterministic.
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)] // p99_ms is read by the latency-side test once R1.1 lands.
struct MinuteSample {
requests: u64,
errors: u64,
/// p99 latency observed in this minute, ms.
p99_ms: f64,
}

/// Fixture profile A: 105 requests over 60 minutes with 5 errors all
/// concentrated in the last 15 minutes. Hits the 30m/6h burn pair but
/// stays under the 5m/1h 14.4× threshold because the per-5m error
Expand Down Expand Up @@ -129,7 +120,6 @@ fn fixture_profile_b_above_threshold_for_short_window() {
/// sustained burn but the 24h window is not yet full; expected not
/// to fire on a 1h replay.
#[test]
#[ignore = "TODO(wave3): R1.1 alerting module landed in sbproxy-observe but the burn-rate (multiwindow) engine is not implemented; only `check_slo_violation(p99)` exists. `replay_and_evaluate` is still a no-op stub returning an empty AlertSnapshot."]
fn slo_burn_rate_partial_burn_fires_six_hour_alert_only() {
let prof = profile_5_errors_over_one_hour();
let alerts = replay_and_evaluate(&prof, slo_target(0.99));
Expand All @@ -152,7 +142,6 @@ fn slo_burn_rate_partial_burn_fires_six_hour_alert_only() {

/// Replay profile B (full burn) and assert the page-tier alert fires.
#[test]
#[ignore = "TODO(wave3): R1.1 alerting module landed in sbproxy-observe but the burn-rate (multiwindow) engine is not implemented; only `check_slo_violation(p99)` exists. `replay_and_evaluate` is still a no-op stub returning an empty AlertSnapshot."]
fn slo_burn_rate_full_burn_fires_one_hour_page_alert() {
let prof = profile_full_burn();
let alerts = replay_and_evaluate(&prof, slo_target(0.99));
Expand All @@ -175,7 +164,6 @@ fn slo_burn_rate_full_burn_fires_one_hour_page_alert() {
/// the SLO-LATENCY-P99 threshold (50 ms per ADR) for a sustained 5
/// minutes. SBPROXY-SUBSTRATE-LATENCY-P99 page tier MUST fire.
#[test]
#[ignore = "TODO(wave3): R1.1 latency SLO check exists (`check_slo_violation`) but the multiwindow burn-rate engine + replay harness is not yet wired."]
fn slo_latency_p99_breach_fires_page_alert() {
let mut prof = vec![
MinuteSample {
Expand Down Expand Up @@ -203,44 +191,3 @@ fn slo_latency_p99_breach_fires_page_alert() {
alerts.fired_names()
);
}

// --- Test-only stubs ---
//
// These wrap a future `sbproxy-observe::alerting::burn_rate` engine;
// the implementation lands in R1.1. The shape locked here:
//
// pub struct AlertSnapshot { /* ... */ }
// impl AlertSnapshot {
// pub fn fired(&self, name: &str) -> bool;
// pub fn fired_names(&self) -> Vec<String>;
// }
// pub fn replay_and_evaluate(samples: &[MinuteSample], target: f64) -> AlertSnapshot;
// pub fn slo_target(s: f64) -> f64;
//
// Until then, the ignored tests above prove the contract review
// surface; ungated tests assert fixture shape only.

struct AlertSnapshot {
fired: Vec<String>,
}

impl AlertSnapshot {
fn fired(&self, name: &str) -> bool {
self.fired.iter().any(|n| n == name)
}
fn fired_names(&self) -> Vec<String> {
self.fired.clone()
}
}

fn slo_target(s: f64) -> f64 {
s
}

fn replay_and_evaluate(_samples: &[MinuteSample], _target: f64) -> AlertSnapshot {
// Stub: a real implementation drives a virtual clock at 1 minute
// tick over the samples, feeds them into a burn-rate evaluator,
// and returns the set of alerts that fired during the replay.
let _ = Duration::from_secs(60);
AlertSnapshot { fired: Vec::new() }
}
Loading