diff --git a/CHANGELOG.md b/CHANGELOG.md index 65db66bf..15154c9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,13 @@ of the new YAML fields below until the version that ships them. docs. ([docs/quickstart-operator.md]) +- **Hostname cardinality override for metrics.** `proxy.metrics.cardinality.hostname_cap` + can lower the `hostname` label budget independently from the default + per-label cap, enabling deterministic overflow tests and tighter + multi-tenant Prometheus budgets. + ([crates/sbproxy-config/src/types.rs], + [crates/sbproxy-observe/src/cardinality.rs]) + - **`release-fast` build profile for CI images.** Docker-based CI and local kind smoke-test builds can now use `CARGO_PROFILE=release-fast` to skip fat LTO and use more codegen units, cutting link memory/time diff --git a/crates/sbproxy-config/src/types.rs b/crates/sbproxy-config/src/types.rs index 5e7b1624..b349be68 100644 --- a/crates/sbproxy-config/src/types.rs +++ b/crates/sbproxy-config/src/types.rs @@ -849,12 +849,16 @@ pub struct MetricsConfig { /// collapsed to `__other__`. Defaults to 1 000. #[serde(default = "default_max_cardinality")] pub max_cardinality_per_label: usize, + /// Per-label cardinality overrides. + #[serde(default)] + pub cardinality: MetricsCardinalityConfig, } impl Default for MetricsConfig { fn default() -> Self { Self { max_cardinality_per_label: default_max_cardinality(), + cardinality: MetricsCardinalityConfig::default(), } } } @@ -863,6 +867,13 @@ fn default_max_cardinality() -> usize { 1000 } +/// Per-label metrics cardinality overrides. +#[derive(Debug, Clone, Default, Deserialize, Serialize)] +pub struct MetricsCardinalityConfig { + /// Optional override for the `hostname` label cap. + pub hostname_cap: Option<usize>, +} + // --- Access Log Config --- /// Structured-JSON access-log emission, off by default. 
diff --git a/crates/sbproxy-core/src/server.rs b/crates/sbproxy-core/src/server.rs index 705a6c12..5e9d55f3 100644 --- a/crates/sbproxy-core/src/server.rs +++ b/crates/sbproxy-core/src/server.rs @@ -11146,6 +11146,15 @@ pub fn run(config_path: &str) -> anyhow::Result<()> { let server_config = compiled.server.clone(); let hostnames: Vec<String> = compiled.host_map.keys().map(|k| k.to_string()).collect(); + if let Some(metrics_cfg) = server_config.metrics.as_ref() { + let _ = sbproxy_observe::metrics::init_cardinality_limiter( + sbproxy_observe::CardinalityConfig { + max_per_label: metrics_cfg.max_cardinality_per_label, + hostname_cap: metrics_cfg.cardinality.hostname_cap, + }, + ); + } + // Initialise the AI provider catalog from the embedded YAML, with // an optional override path from `proxy.ai_providers_file`: use // the override file when readable, fall back to the embedded diff --git a/crates/sbproxy-observe/src/cardinality.rs b/crates/sbproxy-observe/src/cardinality.rs index efd1cb60..34b3b775 100644 --- a/crates/sbproxy-observe/src/cardinality.rs +++ b/crates/sbproxy-observe/src/cardinality.rs @@ -20,15 +20,30 @@ pub fn log_demotion(label_name: &str, value: &str) { } /// Configuration for cardinality limiting. +#[derive(Debug, Clone)] pub struct CardinalityConfig { /// Max unique values per label name. Default: 1000. pub max_per_label: usize, + /// Optional hostname-label override. Useful for tests and + /// deployments that need a tighter route-cardinality budget. + pub hostname_cap: Option<usize>, } impl Default for CardinalityConfig { fn default() -> Self { Self { max_per_label: 1000, + hostname_cap: None, + } + } +} + +impl CardinalityConfig { + /// Return the effective cap for `label_name`. 
+ pub fn cap_for_label(&self, label_name: &str) -> usize { + match (label_name, self.hostname_cap) { + ("hostname", Some(cap)) => cap, + _ => budget_for_label(label_name), } } } @@ -111,7 +126,7 @@ impl CardinalityLimiter { /// callers of [`sanitize`](Self::sanitize) keep the workspace /// default. pub fn sanitize_budget(&self, label_name: &str, value: &str) -> String { - let cap = budget_for_label(label_name); + let cap = self.config.cap_for_label(label_name); self.sanitize_with_cap(label_name, value, cap) } @@ -164,7 +179,10 @@ mod tests { use std::sync::Arc; fn limiter_with_max(max: usize) -> CardinalityLimiter { - CardinalityLimiter::new(CardinalityConfig { max_per_label: max }) + CardinalityLimiter::new(CardinalityConfig { + max_per_label: max, + hostname_cap: None, + }) } // --- log_demotion --- @@ -299,12 +317,25 @@ mod tests { assert_eq!(budget_for_label("totally-novel-label"), 1000); } + #[test] + fn hostname_cap_override_wins_over_adr_budget() { + let config = CardinalityConfig { + max_per_label: 1_000_000, + hostname_cap: Some(2), + }; + let lim = CardinalityLimiter::new(config); + assert_eq!(lim.sanitize_budget("hostname", "a.example"), "a.example"); + assert_eq!(lim.sanitize_budget("hostname", "b.example"), "b.example"); + assert_eq!(lim.sanitize_budget("hostname", "c.example"), OTHER_LABEL); + } + #[test] fn sanitize_budget_demotes_at_per_label_cap_for_agent_class() { // agent_class budget is 8 per ADR. Insert 8 distinct values, // then verify the 9th overflows to __other__. let lim = CardinalityLimiter::new(CardinalityConfig { max_per_label: 1_000_000, + hostname_cap: None, }); for i in 0..8 { let v = format!("class-{i}"); @@ -321,6 +352,7 @@ mod tests { fn sanitize_budget_payment_rail_caps_at_six() { let lim = CardinalityLimiter::new(CardinalityConfig { max_per_label: 1_000_000, + hostname_cap: None, }); for v in &[ "none", @@ -345,6 +377,7 @@ mod tests { // fallback path doesn't accidentally cap at zero. 
let lim = CardinalityLimiter::new(CardinalityConfig { max_per_label: 1_000_000, + hostname_cap: None, }); assert_eq!(lim.sanitize_budget("oddball", "value-1"), "value-1"); assert_eq!(lim.sanitize_budget("oddball", "value-2"), "value-2"); @@ -359,6 +392,7 @@ mod tests { // string or "__other__", and the accepted count must not exceed 500. let lim = Arc::new(CardinalityLimiter::new(CardinalityConfig { max_per_label: 500, + hostname_cap: None, })); let mut handles = Vec::new(); diff --git a/crates/sbproxy-observe/src/metrics.rs b/crates/sbproxy-observe/src/metrics.rs index ebb2123e..649fe855 100644 --- a/crates/sbproxy-observe/src/metrics.rs +++ b/crates/sbproxy-observe/src/metrics.rs @@ -557,7 +557,8 @@ pub fn record_request_with_labels( bytes_out: u64, agent: AgentLabels<'_>, ) { - let origin_san = sanitize_label("origin", origin); + let hostname_san = sanitize_label_budget("sbproxy_requests_total", "hostname", origin); + let origin_san = sanitize_label_budget("sbproxy_origin_requests_total", "origin", origin); let status_str = status.to_string(); // --- Wave 1 / G1.6: per-agent labels on sbproxy_requests_total --- @@ -582,11 +583,12 @@ pub fn record_request_with_labels( let m = metrics(); // sbproxy_requests_total now carries the full Wave 1 label set. - // Sanitised hostname is reused via origin_san (cardinality cap - // 200 per ADR; same numeric cap, different label name). + // Sanitise with the metric's public label name (`hostname`) so + // `metrics.cardinality.hostname_cap` can lower this budget without + // affecting the per-origin views below. m.requests_total .with_label_values(&[ - &origin_san, + &hostname_san, method, &status_str, &agent_id, @@ -1143,7 +1145,10 @@ mod tests { #[test] fn test_cardinality_limiter_overflow_to_other() { // Use a fresh limiter with a tiny cap to test overflow. 
- let lim = CardinalityLimiter::new(CardinalityConfig { max_per_label: 3 }); + let lim = CardinalityLimiter::new(CardinalityConfig { + max_per_label: 3, + hostname_cap: None, + }); let a = lim.sanitize("origin", "a.com"); let b = lim.sanitize("origin", "b.com"); @@ -1169,6 +1174,7 @@ mod tests { // via a dedicated limiter (we can't reset the global one safely in tests). let lim = CardinalityLimiter::new(CardinalityConfig { max_per_label: 1000, + hostname_cap: None, }); for i in 0..1000 { lim.sanitize("origin", &format!("overflow-origin-{i}.example.com")); diff --git a/docs/configuration.md b/docs/configuration.md index f97564d4..7afb9ec4 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -153,6 +153,8 @@ proxy: metrics: max_cardinality_per_label: 1000 + cardinality: + hostname_cap: 200 admin: enabled: false @@ -216,7 +218,8 @@ socket level. | Field | Type | Default | Description | |-------|------|---------|-------------| -| `max_cardinality_per_label` | int | 1000 | Cap on unique label values per metric. New values are collapsed to `__other__`. | +| `max_cardinality_per_label` | int | 1000 | Default cap on unique label values per metric. New values are collapsed to `__other__`. | +| `cardinality.hostname_cap` | int | unset | Optional override for the `hostname` label budget; when unset, the label keeps its built-in ADR budget. Useful for high-tenant-count deployments and deterministic overflow tests. | ### access_log diff --git a/docs/features.md b/docs/features.md index 89f38d23..4ca8ffdc 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1424,6 +1424,8 @@ proxy: port: 9090 metrics: max_cardinality_per_label: 1000 + cardinality: + hostname_cap: 200 ``` ```bash diff --git a/docs/manual.md b/docs/manual.md index e96f6179..2ab9fbe2 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -420,7 +420,7 @@ The proxy serves `/metrics` on its main HTTP port (`http_bind_port`, default `80 GET http://localhost:8080/metrics ``` -Label cardinality is capped by `metrics.max_cardinality_per_label` (default `1000`). 
Values past the cap collapse into the literal `__other__`. +Label cardinality is capped by `metrics.max_cardinality_per_label` (default `1000`). The `hostname` label uses its ADR budget by default and can be overridden with `metrics.cardinality.hostname_cap`. Values past the effective cap collapse into the literal `__other__`. #### Hostname-scoped metrics diff --git a/e2e/tests/metrics_per_agent.rs b/e2e/tests/metrics_per_agent.rs index a4304611..6575cdc1 100644 --- a/e2e/tests/metrics_per_agent.rs +++ b/e2e/tests/metrics_per_agent.rs @@ -25,6 +25,9 @@ use sbproxy_e2e::ProxyHarness; const FIXTURE: &str = r#" proxy: http_bind_port: 0 # overridden by the harness + metrics: + cardinality: + hostname_cap: 100 origins: "blog.localhost": action: @@ -132,7 +135,6 @@ fn cardinality_cap_keeps_series_count_bounded() { // --- Test 3: overflow sentinel + demotion counter --- #[test] -#[ignore = "TODO(wave3): hostname cardinality cap is above 250 in default config; overflow sentinel + demotion counter wired but not triggered by this fixture. Needs either a higher-volume fixture or a config knob to lower the cap for tests."] fn cardinality_overflow_emits_sentinel_and_increments_demotion_counter() { let harness = ProxyHarness::start_with_yaml(FIXTURE).expect("start proxy");