diff --git a/admin/api/metrics_test.go b/admin/api/metrics_test.go index 43f598d..c816a28 100644 --- a/admin/api/metrics_test.go +++ b/admin/api/metrics_test.go @@ -39,7 +39,7 @@ func makeSnapshot(t time.Time, totalReq, errReq, active, panics float64) metrics func TestMetricsHandler_Fleet_ReturnsAggregated(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) ctx := context.Background() inst, err := st.CreateInstance(ctx, "proxy-1", "10.0.0.1:9090") @@ -78,7 +78,7 @@ func TestMetricsHandler_Fleet_ReturnsAggregated(t *testing.T) { func TestMetricsHandler_Instance_ReturnsMetrics(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) ctx := context.Background() inst, err := st.CreateInstance(ctx, "proxy-1", "10.0.0.1:9090") @@ -117,7 +117,7 @@ func TestMetricsHandler_Instance_ReturnsMetrics(t *testing.T) { func TestMetricsHandler_Instance_NoData_Returns404(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) h := NewMetricsHandler(st, c) mux := http.NewServeMux() @@ -135,7 +135,7 @@ func TestMetricsHandler_Instance_NoData_Returns404(t *testing.T) { func TestMetricsHandler_Instance_InvalidID_Returns400(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) h := NewMetricsHandler(st, c) mux := http.NewServeMux() @@ -153,7 +153,7 @@ func TestMetricsHandler_Instance_InvalidID_Returns400(t *testing.T) { func TestMetricsHandler_Fleet_EmptyFleet_ReturnsEmptyInstances(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) h := NewMetricsHandler(st, c) mux := http.NewServeMux() diff --git a/admin/cmd/chaperone-admin/main.go b/admin/cmd/chaperone-admin/main.go index 40d2691..d96fbc7 100644 --- a/admin/cmd/chaperone-admin/main.go +++ b/admin/cmd/chaperone-admin/main.go @@ -93,7 +93,10 @@ func runServer(args []string) error { } defer st.Close() - collector := metrics.NewCollector(metrics.DefaultCapacity) + collector := metrics.NewCollector( + metrics.CapacityFor(cfg.Scraper.RetentionWindow.Unwrap(), cfg.Scraper.Interval.Unwrap()), + cfg.Scraper.RetentionWindow.Unwrap(), + ) srv, err := admin.NewServer(cfg, st, collector) if err != nil { diff --git a/admin/config/config.go b/admin/config/config.go index 82ef646..d5ecc31 100644 --- a/admin/config/config.go +++ b/admin/config/config.go @@ -45,8 +45,9 @@ type DatabaseConfig struct { // ScraperConfig configures the proxy metrics scraper. type ScraperConfig struct { - Interval Duration `yaml:"interval"` - Timeout Duration `yaml:"timeout"` + Interval Duration `yaml:"interval"` + Timeout Duration `yaml:"timeout"` + RetentionWindow Duration `yaml:"retention_window"` } // SessionConfig configures session management. diff --git a/admin/config/loader.go b/admin/config/loader.go index 341c132..e288b19 100644 --- a/admin/config/loader.go +++ b/admin/config/loader.go @@ -75,6 +75,9 @@ func applyDefaults(cfg *Config) { if cfg.Scraper.Timeout == 0 { cfg.Scraper.Timeout = Duration(5 * time.Second) } + if cfg.Scraper.RetentionWindow == 0 { + cfg.Scraper.RetentionWindow = Duration(1 * time.Hour) + } if cfg.Session.MaxAge == 0 { cfg.Session.MaxAge = Duration(24 * time.Hour) } @@ -107,6 +110,7 @@ func applyEnvOverrides(cfg *Config) error { parseDuration(&cfg.Scraper.Interval, "SCRAPER_INTERVAL", &errs) parseDuration(&cfg.Scraper.Timeout, "SCRAPER_TIMEOUT", &errs) + parseDuration(&cfg.Scraper.RetentionWindow, "SCRAPER_RETENTION_WINDOW", &errs) parseDuration(&cfg.Session.MaxAge, "SESSION_MAX_AGE", &errs) parseDuration(&cfg.Session.IdleTimeout, "SESSION_IDLE_TIMEOUT", &errs) diff --git a/admin/config/loader_test.go b/admin/config/loader_test.go index 320597e..b134756 100644 --- a/admin/config/loader_test.go +++ b/admin/config/loader_test.go @@ -45,6 +45,9 @@ func TestLoad_NoFile_AppliesDefaults(t *testing.T) { if cfg.Scraper.Timeout.Unwrap() != 5*time.Second { t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 5*time.Second) } + if cfg.Scraper.RetentionWindow.Unwrap() != 1*time.Hour { + t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 1*time.Hour) + } if cfg.Session.MaxAge.Unwrap() != 24*time.Hour { t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 24*time.Hour) } @@ -75,6 +78,7 @@ database: scraper: interval: "30s" timeout: "10s" + retention_window: "2h" session: max_age: "12h" idle_timeout: "1h" @@ -107,6 +111,9 @@ log: if cfg.Scraper.Timeout.Unwrap() != 10*time.Second { t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 10*time.Second) } + if cfg.Scraper.RetentionWindow.Unwrap() != 2*time.Hour { + t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 2*time.Hour) + } if cfg.Session.MaxAge.Unwrap() != 12*time.Hour { t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 12*time.Hour) } @@ -171,6 +178,7 @@ func TestLoad_EnvOverrides_AllFields(t *testing.T) { t.Setenv("CHAPERONE_ADMIN_DATABASE_PATH", "/tmp/test.db") t.Setenv("CHAPERONE_ADMIN_SCRAPER_INTERVAL", "20s") t.Setenv("CHAPERONE_ADMIN_SCRAPER_TIMEOUT", "8s") + t.Setenv("CHAPERONE_ADMIN_SCRAPER_RETENTION_WINDOW", "30m") t.Setenv("CHAPERONE_ADMIN_SESSION_MAX_AGE", "48h") t.Setenv("CHAPERONE_ADMIN_SESSION_IDLE_TIMEOUT", "4h") t.Setenv("CHAPERONE_ADMIN_AUDIT_RETENTION_DAYS", "60") @@ -199,6 +207,9 @@ func TestLoad_EnvOverrides_AllFields(t *testing.T) { if cfg.Scraper.Timeout.Unwrap() != 8*time.Second { t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 8*time.Second) } + if cfg.Scraper.RetentionWindow.Unwrap() != 30*time.Minute { + t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 30*time.Minute) + } if cfg.Session.MaxAge.Unwrap() != 48*time.Hour { t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 48*time.Hour) } @@ -269,6 +280,9 @@ func TestApplyDefaults_ZeroConfig_SetsAllDefaults(t *testing.T) { if cfg.Scraper.Timeout.Unwrap() != 5*time.Second { t.Errorf("Scraper.Timeout = %v, want 5s", cfg.Scraper.Timeout.Unwrap()) } + if cfg.Scraper.RetentionWindow.Unwrap() != 1*time.Hour { + t.Errorf("Scraper.RetentionWindow = %v, want 1h", cfg.Scraper.RetentionWindow.Unwrap()) + } if cfg.Session.MaxAge.Unwrap() != 24*time.Hour { t.Errorf("Session.MaxAge = %v, want 24h", cfg.Session.MaxAge.Unwrap()) } diff --git a/admin/config/validate.go b/admin/config/validate.go index 6eadfea..43e6342 100644 --- a/admin/config/validate.go +++ b/admin/config/validate.go @@ -33,6 +33,9 @@ func (c *Config) Validate() error { if c.Scraper.Timeout.Unwrap() >= c.Scraper.Interval.Unwrap() { errs = append(errs, errors.New("scraper.timeout must be less than scraper.interval")) } + if c.Scraper.RetentionWindow.Unwrap() < c.Scraper.Interval.Unwrap() { + errs = append(errs, errors.New("scraper.retention_window must be at least one scraper.interval")) + } if c.Session.MaxAge.Unwrap() < 1*time.Minute { errs = append(errs, errors.New("session.max_age must be at least 1m")) diff --git a/admin/config/validate_test.go b/admin/config/validate_test.go index 85ead1b..537d856 100644 --- a/admin/config/validate_test.go +++ b/admin/config/validate_test.go @@ -16,8 +16,9 @@ func validConfig() *Config { Server: ServerConfig{Addr: DefaultAddr}, Database: DatabaseConfig{Path: "./test.db"}, Scraper: ScraperConfig{ - Interval: Duration(10 * time.Second), - Timeout: Duration(5 * time.Second), + Interval: Duration(10 * time.Second), + Timeout: Duration(5 * time.Second), + RetentionWindow: Duration(1 * time.Hour), }, Session: SessionConfig{ MaxAge: Duration(24 * time.Hour), @@ -124,6 +125,43 @@ func TestValidate_TimeoutGteInterval_ReturnsError(t *testing.T) { } } +func TestValidate_RetentionWindowLessThanInterval_ReturnsError(t *testing.T) { + t.Parallel() + + // Arrange — retention window must be at least one interval long + cfg := validConfig() + cfg.Scraper.Interval = Duration(10 * time.Second) + cfg.Scraper.RetentionWindow = Duration(5 * time.Second) + + // Act + err := cfg.Validate() + + // Assert + if err == nil { + t.Fatal("expected error, got nil") + } + if !strings.Contains(err.Error(), "retention_window") { + t.Errorf("error = %q, want to contain %q", err.Error(), "retention_window") + } +} + +func TestValidate_RetentionWindowEqualsInterval_NoError(t *testing.T) { + t.Parallel() + + // Arrange — equal is allowed (degenerate but valid: capacity = 1) + cfg := validConfig() + cfg.Scraper.Interval = Duration(10 * time.Second) + cfg.Scraper.RetentionWindow = Duration(10 * time.Second) + + // Act + err := cfg.Validate() + + // Assert + if err != nil { + t.Errorf("unexpected error: %v", err) + } +} + func TestValidate_NegativeRetention_ReturnsError(t *testing.T) { t.Parallel() diff --git a/admin/metrics/collector.go b/admin/metrics/collector.go index 2e042cf..ee003c2 100644 --- a/admin/metrics/collector.go +++ b/admin/metrics/collector.go @@ -12,16 +12,20 @@ import ( // Collector manages per-instance metric ring buffers and computes derived // metrics (rates, percentiles) on demand. type Collector struct { - mu sync.RWMutex - buffers map[int64]*Ring - capacity int + mu sync.RWMutex + buffers map[int64]*Ring + capacity int + trendWindow time.Duration } -// NewCollector creates a Collector with the given ring buffer capacity. -func NewCollector(capacity int) *Collector { +// NewCollector creates a Collector with the given ring buffer capacity and +// trend lookback window. Trends compare current rates against rates from +// approximately trendWindow ago; a zero trendWindow disables trends. +func NewCollector(capacity int, trendWindow time.Duration) *Collector { return &Collector{ - buffers: make(map[int64]*Ring), - capacity: capacity, + buffers: make(map[int64]*Ring), + capacity: capacity, + trendWindow: trendWindow, } } @@ -253,20 +257,22 @@ func (*Collector) fillVendorMetrics(im *InstanceMetrics, prev, curr Snapshot) { }) } -// historicalPair returns the two snapshots forming a rate pair from ~1 hour -// ago in the ring buffer. If the buffer doesn't span at least 50 minutes, -// ok is false. -func historicalPair(buf *Ring) (prev, curr Snapshot, ok bool) { - if buf.Len() < 4 { +// historicalPair returns the two snapshots forming a rate pair from +// approximately c.trendWindow ago in the ring buffer. The buffer must span +// at least 5/6 of the trend window before a pair is returned, which keeps +// the comparison meaningful while the ring is still filling. If the trend +// window is zero or the buffer is too short, ok is false. +func (c *Collector) historicalPair(buf *Ring) (prev, curr Snapshot, ok bool) { + if c.trendWindow == 0 || buf.Len() < 4 { return Snapshot{}, Snapshot{}, false } newest := buf.At(buf.Len() - 1) oldest := buf.At(0) - if newest.Time.Sub(oldest.Time) < 50*time.Minute { + if newest.Time.Sub(oldest.Time) < (c.trendWindow*5)/6 { return Snapshot{}, Snapshot{}, false } - target := newest.Time.Add(-1 * time.Hour) + target := newest.Time.Add(-c.trendWindow) idx := findNearest(buf, target) start := idx if start > 0 { @@ -279,9 +285,9 @@ func historicalPair(buf *Ring) (prev, curr Snapshot, ok bool) { } // fillTrends computes trend values by comparing the current rate to the rate -// from approximately 1 hour ago. -func (*Collector) fillTrends(im *InstanceMetrics, buf *Ring) { - prev, curr, ok := historicalPair(buf) +// from approximately c.trendWindow ago. +func (c *Collector) fillTrends(im *InstanceMetrics, buf *Ring) { + prev, curr, ok := c.historicalPair(buf) if !ok { return } @@ -302,9 +308,10 @@ type historicalTrend struct { errDelta float64 } -// trendSnapshot returns historical RPS and request/error deltas from ~1h ago. -func (*Collector) trendSnapshot(buf *Ring) (historicalTrend, bool) { - prev, curr, ok := historicalPair(buf) +// trendSnapshot returns historical RPS and request/error deltas from +// approximately c.trendWindow ago. +func (c *Collector) trendSnapshot(buf *Ring) (historicalTrend, bool) { + prev, curr, ok := c.historicalPair(buf) if !ok { return historicalTrend{}, false } diff --git a/admin/metrics/collector_test.go b/admin/metrics/collector_test.go index 8110d44..11dddc0 100644 --- a/admin/metrics/collector_test.go +++ b/admin/metrics/collector_test.go @@ -51,7 +51,7 @@ func makeSnapshot(t time.Time, totalReq, errReq, active, panics float64) Snapsho func TestCollector_RecordScrape_ParsesAndStores(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) err := c.RecordScrape(1, []byte(sampleMetrics), time.Now()) if err != nil { @@ -72,7 +72,7 @@ func TestCollector_RecordScrape_ParsesAndStores(t *testing.T) { func TestCollector_RecordScrape_MalformedData_ReturnsError(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) err := c.RecordScrape(1, []byte("# TYPE foo gauge\n# TYPE foo counter\nfoo 1\n"), time.Now()) if err == nil { @@ -82,7 +82,7 @@ func TestCollector_RecordScrape_MalformedData_ReturnsError(t *testing.T) { func TestCollector_Remove(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) c.Record(1, Snapshot{Time: time.Now()}) c.Remove(1) @@ -97,7 +97,7 @@ func TestCollector_Remove(t *testing.T) { func TestCollector_Prune(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) c.Record(1, Snapshot{Time: time.Now()}) c.Record(2, Snapshot{Time: time.Now()}) c.Record(3, Snapshot{Time: time.Now()}) @@ -115,7 +115,7 @@ func TestCollector_Prune(t *testing.T) { func TestCollector_GetInstanceMetrics_NoData_ReturnsNil(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) if got := c.GetInstanceMetrics(99); got != nil { t.Error("expected nil for unknown instance") } @@ -123,7 +123,7 @@ func TestCollector_GetInstanceMetrics_NoData_ReturnsNil(t *testing.T) { func TestCollector_GetInstanceMetrics_SingleSnapshot_NoRates(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) c.Record(1, makeSnapshot(time.Now(), 1000, 50, 10, 2)) im := c.GetInstanceMetrics(1) @@ -144,7 +144,7 @@ func TestCollector_GetInstanceMetrics_SingleSnapshot_NoRates(t *testing.T) { func TestCollector_GetInstanceMetrics_TwoSnapshots_ComputesRates(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) t1 := t0.Add(10 * time.Second) @@ -189,7 +189,7 @@ func TestCollector_GetInstanceMetrics_TwoSnapshots_ComputesRates(t *testing.T) { func TestCollector_GetInstanceMetrics_SeriesGenerated(t *testing.T) { t.Parallel() - c := NewCollector(100) + c := NewCollector(100, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) for i := 0; i < 5; i++ { @@ -225,7 +225,7 @@ func TestCollector_GetInstanceMetrics_SeriesGenerated(t *testing.T) { func TestCollector_GetInstanceMetrics_TrendWithEnoughData(t *testing.T) { t.Parallel() - c := NewCollector(DefaultCapacity) + c := NewCollector(DefaultCapacity, time.Hour) t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC) // Fill 1h of data at 10s intervals @@ -253,7 +253,7 @@ func TestCollector_GetInstanceMetrics_TrendWithEnoughData(t *testing.T) { func TestCollector_GetInstanceMetrics_NoTrendWithInsufficientData(t *testing.T) { t.Parallel() - c := NewCollector(100) + c := NewCollector(100, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) for i := 0; i < 5; i++ { @@ -277,7 +277,7 @@ func TestCollector_GetInstanceMetrics_NoTrendWithInsufficientData(t *testing.T) func TestCollector_GetFleetMetrics_AggregatesAcrossInstances(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) t1 := t0.Add(10 * time.Second) @@ -306,7 +306,7 @@ func TestCollector_GetFleetMetrics_AggregatesAcrossInstances(t *testing.T) { func TestCollector_GetFleetMetrics_SkipsMissingInstances(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) t1 := t0.Add(10 * time.Second) @@ -344,7 +344,7 @@ func TestAddHistograms_MismatchedBoundaries_FallsBack(t *testing.T) { func TestGetFleetMetrics_CounterReset_DoesNotCorruptErrorRate(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) t1 := t0.Add(10 * time.Second) @@ -370,7 +370,7 @@ func TestGetFleetMetrics_CounterReset_DoesNotCorruptErrorRate(t *testing.T) { func TestGetFleetMetrics_ErrorRateTrend_Populated(t *testing.T) { t.Parallel() - c := NewCollector(DefaultCapacity) + c := NewCollector(DefaultCapacity, time.Hour) t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC) for i := 0; i <= 360; i++ { @@ -391,7 +391,7 @@ func TestGetFleetMetrics_ErrorRateTrend_Populated(t *testing.T) { func TestCollector_GetInstanceSummary_NoData_ReturnsNil(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) if got := c.GetInstanceSummary(1); got != nil { t.Error("expected nil for unknown instance") } @@ -399,7 +399,7 @@ func TestCollector_GetInstanceSummary_NoData_ReturnsNil(t *testing.T) { func TestCollector_GetInstanceSummary_ComputesKPIs(t *testing.T) { t.Parallel() - c := NewCollector(10) + c := NewCollector(10, time.Hour) t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC) t1 := t0.Add(10 * time.Second) @@ -417,3 +417,68 @@ func TestCollector_GetInstanceSummary_ComputesKPIs(t *testing.T) { t.Errorf("ActiveConnections = %v, want 12", s.ActiveConnections) } } + +func TestCollector_TrendWindow_LongerThanOneHour_LooksBackFullWindow(t *testing.T) { + t.Parallel() + // With trendWindow=2h, the historical comparison must look back ~2h, not + // the previously hardcoded 1h. Data has two regimes: + // pairs[0..360): 100 req/s + // pairs[360..1080): 200 req/s + // At the latest snapshot RPS ~200. RPS 1h ago is also ~200 (regime B), so + // the old hardcoded lookback would yield trend ~0. RPS 2h ago is ~100 + // (regime A), so a correct 2h lookback yields trend ~100. + c := NewCollector(1080, 2*time.Hour) + t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC) + + var totalReq, totalErr float64 + const boundary = 360 + for i := 0; i < 1080; i++ { + ts := t0.Add(time.Duration(i) * 10 * time.Second) + c.Record(1, makeSnapshot(ts, totalReq, totalErr, 10, 0)) + if i < boundary { + totalReq += 1000 + totalErr += 50 + } else { + totalReq += 2000 + totalErr += 100 + } + } + + im := c.GetInstanceMetrics(1) + if im == nil { + t.Fatal("expected non-nil InstanceMetrics") + } + if im.RPSTrend == nil { + t.Fatal("expected RPSTrend to be set") + } + if math.Abs(*im.RPSTrend-100) > 5 { + t.Errorf("RPSTrend = %v, want ~100 (2h lookback should compare against regime A)", *im.RPSTrend) + } +} + +func TestCollector_TrendWindow_ShortWindow_RendersOnceBufferSpansIt(t *testing.T) { + t.Parallel() + // With trendWindow=30min, trends must render once the buffer spans + // 5/6 of that (25min). Previously the hardcoded 50min threshold would + // have suppressed the trend entirely for any retention window under 1h. + c := NewCollector(180, 30*time.Minute) + t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC) + + for i := 0; i < 151; i++ { + c.Record(1, makeSnapshot( + t0.Add(time.Duration(i)*10*time.Second), + float64(i*100), + float64(i*5), + 10, + 0, + )) + } + + im := c.GetInstanceMetrics(1) + if im == nil { + t.Fatal("expected non-nil InstanceMetrics") + } + if im.RPSTrend == nil { + t.Error("expected RPSTrend to be set with 25min of data when trendWindow=30min") + } +} diff --git a/admin/metrics/metrics.go b/admin/metrics/metrics.go index 243a68d..fc6059b 100644 --- a/admin/metrics/metrics.go +++ b/admin/metrics/metrics.go @@ -7,10 +7,30 @@ package metrics import "time" -// DefaultCapacity is the number of scrape snapshots retained per instance. -// At 10s intervals this gives ~1 hour of history. +// DefaultCapacity is a generous ring size used by tests. Production code +// computes capacity from the configured retention window via CapacityFor. const DefaultCapacity = 360 +// CapacityFor returns the ring buffer capacity needed to retain `window` +// of history when scraping every `interval`. +// +// Snapshots span (capacity-1)*interval since rates and series are computed +// from adjacent pairs, so we round up and add one to guarantee the buffer +// covers at least `window`. Always returns at least 2 — the minimum needed +// for a single rate pair — so degenerate inputs never produce an unusable +// collector. +func CapacityFor(window, interval time.Duration) int { + const minCapacity = 2 + if interval <= 0 || window <= 0 { + return minCapacity + } + n := int((window+interval-1)/interval) + 1 + if n < minCapacity { + return minCapacity + } + return n +} + // Prometheus metric names emitted by the Chaperone proxy. const ( metricRequestsTotal = "chaperone_requests_total" diff --git a/admin/metrics/metrics_test.go b/admin/metrics/metrics_test.go new file mode 100644 index 0000000..6829304 --- /dev/null +++ b/admin/metrics/metrics_test.go @@ -0,0 +1,70 @@ +// Copyright 2026 CloudBlue LLC +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "testing" + "time" +) + +func TestCapacityFor_SpansAtLeastTheRequestedWindow(t *testing.T) { + t.Parallel() + + // N snapshots at fixed interval I span (N-1)*I of wall-clock time, since + // rates and series are computed from adjacent pairs. Capacity must be + // large enough that (capacity-1)*interval >= window. + tests := []struct { + name string + window time.Duration + interval time.Duration + }{ + {"1h at 10s", 1 * time.Hour, 10 * time.Second}, + {"1h at 3s", 1 * time.Hour, 3 * time.Second}, + {"30m at 10s", 30 * time.Minute, 10 * time.Second}, + {"1h at 7s (non-divisible)", 1 * time.Hour, 7 * time.Second}, + {"window equals interval", 10 * time.Second, 10 * time.Second}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := CapacityFor(tt.window, tt.interval) + span := time.Duration(got-1) * tt.interval + if span < tt.window { + t.Errorf("CapacityFor(%v, %v) = %d snapshots spans %v, want >= %v", + tt.window, tt.interval, got, span, tt.window) + } + }) + } +} + +func TestCapacityFor_AlwaysAllowsRatePair(t *testing.T) { + t.Parallel() + + // Rate and series computation requires at least two adjacent snapshots. + // CapacityFor must never return less than 2, including for degenerate + // inputs that config validation would normally reject. + tests := []struct { + name string + window time.Duration + interval time.Duration + }{ + {"window equals interval", 10 * time.Second, 10 * time.Second}, + {"window smaller than interval", 1 * time.Second, 10 * time.Second}, + {"zero interval", 1 * time.Hour, 0}, + {"zero window", 0, 10 * time.Second}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := CapacityFor(tt.window, tt.interval) + if got < 2 { + t.Errorf("CapacityFor(%v, %v) = %d, want >= 2", tt.window, tt.interval, got) + } + }) + } +} diff --git a/admin/poller/poller_test.go b/admin/poller/poller_test.go index 84ba627..e553299 100644 --- a/admin/poller/poller_test.go +++ b/admin/poller/poller_test.go @@ -298,7 +298,7 @@ func TestPoller_RunStopsOnContextCancel(t *testing.T) { func TestPoller_MetricsScraping_RecordsToCollector(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) proxy := fakeProxyWithMetrics(t) addr := strings.TrimPrefix(proxy.URL, "http://") @@ -327,7 +327,7 @@ func TestPoller_MetricsScraping_RecordsToCollector(t *testing.T) { func TestPoller_MetricsScraping_SkippedOnHealthFailure(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) ctx := context.Background() inst, err := st.CreateInstance(ctx, "test-proxy", "127.0.0.1:1") @@ -347,7 +347,7 @@ func TestPoller_MetricsScraping_SkippedOnHealthFailure(t *testing.T) { func TestPoller_DeletedInstance_PrunesCollector(t *testing.T) { t.Parallel() st := openTestStore(t) - c := metrics.NewCollector(10) + c := metrics.NewCollector(10, time.Hour) proxy := fakeProxyWithMetrics(t) addr := strings.TrimPrefix(proxy.URL, "http://") diff --git a/docs/guides/admin-portal.md b/docs/guides/admin-portal.md index 31778ad..88548f2 100644 --- a/docs/guides/admin-portal.md +++ b/docs/guides/admin-portal.md @@ -52,6 +52,7 @@ database: scraper: interval: "10s" timeout: "5s" + retention_window: "1h" session: max_age: "24h" @@ -80,6 +81,7 @@ Every config key can be overridden via environment variables using the `CHAPERON | `database.path` | `CHAPERONE_ADMIN_DATABASE_PATH` | | `scraper.interval` | `CHAPERONE_ADMIN_SCRAPER_INTERVAL` | | `scraper.timeout` | `CHAPERONE_ADMIN_SCRAPER_TIMEOUT` | +| `scraper.retention_window` | `CHAPERONE_ADMIN_SCRAPER_RETENTION_WINDOW` | | `session.max_age` | `CHAPERONE_ADMIN_SESSION_MAX_AGE` | | `session.idle_timeout` | `CHAPERONE_ADMIN_SESSION_IDLE_TIMEOUT` | | `audit.retention_days` | `CHAPERONE_ADMIN_AUDIT_RETENTION_DAYS` | @@ -172,4 +174,4 @@ All portal actions (instance add/edit/remove, login, logout, password changes) a - To view per-instance metrics, open the dashboard. It displays RPS, latency percentiles (p50, p95, p99), error rate, active connections, and panic count for each proxy, computed from each proxy's `/metrics` endpoint polled every 10 seconds. - To interpret health badges, read them as: **unknown** (before first poll), **healthy** (last poll succeeded), or **unreachable** (3 consecutive failures). A single successful poll restores an unreachable instance to healthy. - To wait through the post-restart placeholder, give the portal at least two scrape cycles (~20 seconds) after a restart — charts show "Collecting data points..." until two snapshots exist to compute rates from. -- To plan around history retention, note that metrics are kept in memory only. The portal retains 360 scrape snapshots per instance (`DefaultCapacity` in `admin/metrics/metrics.go`), which at 10s intervals is exactly 1 hour of history. A restart clears all metrics. +- To plan around history retention, note that metrics are kept in memory only. Each instance retains enough scrape snapshots to cover `scraper.retention_window` (default 1h) at the configured `scraper.interval` (default 10s) — capacity is computed at startup. A restart clears all metrics.