From b974ba656822b3797a50a49e8cfc171af783b876 Mon Sep 17 00:00:00 2001
From: Arnau Giralt
Date: Thu, 7 May 2026 18:52:22 +0200
Subject: [PATCH 1/3] feat(admin): decouple metric retention window from
 scrape interval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-instance ring buffer was sized at a fixed 360 samples, so
lowering scraper.interval silently shrank the visible history (3s
interval gave ~18 min instead of the 1h target).

Add scraper.retention_window (default 1h, env
CHAPERONE_ADMIN_SCRAPER_RETENTION_WINDOW) and compute ring capacity at
startup so it actually spans the requested window — ceil(window/interval) + 1,
since N snapshots span (N-1)*interval. Floor at 2 so degenerate inputs
always leave a usable rate pair. Validation rejects retention_window <
interval so the mismatch surfaces at load.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 admin/cmd/chaperone-admin/main.go |  5 ++-
 admin/config/config.go            |  5 ++-
 admin/config/loader.go            |  4 ++
 admin/config/loader_test.go       | 14 +++++++
 admin/config/validate.go          |  3 ++
 admin/config/validate_test.go     | 42 ++++++++++++++++++-
 admin/metrics/metrics.go          | 24 ++++++++++-
 admin/metrics/metrics_test.go     | 70 +++++++++++++++++++++++++++++++
 8 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 admin/metrics/metrics_test.go

diff --git a/admin/cmd/chaperone-admin/main.go b/admin/cmd/chaperone-admin/main.go
index 40d2691..75f6af6 100644
--- a/admin/cmd/chaperone-admin/main.go
+++ b/admin/cmd/chaperone-admin/main.go
@@ -93,7 +93,10 @@ func runServer(args []string) error {
 	}
 	defer st.Close()
 
-	collector := metrics.NewCollector(metrics.DefaultCapacity)
+	collector := metrics.NewCollector(metrics.CapacityFor(
+		cfg.Scraper.RetentionWindow.Unwrap(),
+		cfg.Scraper.Interval.Unwrap(),
+	))
 
 	srv, err := admin.NewServer(cfg, st, collector)
 	if err != nil {
diff --git a/admin/config/config.go b/admin/config/config.go
index 82ef646..d5ecc31 100644
--- a/admin/config/config.go
+++ b/admin/config/config.go
@@ -45,8 +45,9 @@ type DatabaseConfig struct {
 
 // ScraperConfig configures the proxy metrics scraper.
 type ScraperConfig struct {
-	Interval Duration `yaml:"interval"`
-	Timeout  Duration `yaml:"timeout"`
+	Interval        Duration `yaml:"interval"`
+	Timeout         Duration `yaml:"timeout"`
+	RetentionWindow Duration `yaml:"retention_window"`
 }
 
 // SessionConfig configures session management.
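For concreteness, the sizing rule from the commit message above can be checked standalone. A minimal Go sketch follows; capacityFor mirrors the CapacityFor helper this patch adds to admin/metrics/metrics.go (lowercased here only to keep the sketch self-contained), and the expected values are hand-derived from ceil(window/interval) + 1:

package main

import (
	"fmt"
	"time"
)

// capacityFor mirrors metrics.CapacityFor from this patch:
// ceil(window/interval) + 1, floored at 2 so a rate pair always fits.
func capacityFor(window, interval time.Duration) int {
	const minCapacity = 2
	if interval <= 0 || window <= 0 {
		return minCapacity
	}
	n := int((window+interval-1)/interval) + 1
	if n < minCapacity {
		return minCapacity
	}
	return n
}

func main() {
	fmt.Println(capacityFor(time.Hour, 10*time.Second)) // 361: spans exactly the 1h target
	fmt.Println(capacityFor(time.Hour, 3*time.Second))  // 1201: the old fixed 360 covered only ~18 min
	fmt.Println(capacityFor(time.Hour, 7*time.Second))  // 516: non-divisible windows round up, spanning 3605s
	fmt.Println(capacityFor(0, 10*time.Second))         // 2: degenerate input still leaves one rate pair
}

The +1 is the fencepost from the commit message: N snapshots bound only (N-1)*interval of history, so a perfectly divisible window would otherwise fall one sample short.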
diff --git a/admin/config/loader.go b/admin/config/loader.go
index 341c132..e288b19 100644
--- a/admin/config/loader.go
+++ b/admin/config/loader.go
@@ -75,6 +75,9 @@ func applyDefaults(cfg *Config) {
 	if cfg.Scraper.Timeout == 0 {
 		cfg.Scraper.Timeout = Duration(5 * time.Second)
 	}
+	if cfg.Scraper.RetentionWindow == 0 {
+		cfg.Scraper.RetentionWindow = Duration(1 * time.Hour)
+	}
 	if cfg.Session.MaxAge == 0 {
 		cfg.Session.MaxAge = Duration(24 * time.Hour)
 	}
@@ -107,6 +110,7 @@ func applyEnvOverrides(cfg *Config) error {
 
 	parseDuration(&cfg.Scraper.Interval, "SCRAPER_INTERVAL", &errs)
 	parseDuration(&cfg.Scraper.Timeout, "SCRAPER_TIMEOUT", &errs)
+	parseDuration(&cfg.Scraper.RetentionWindow, "SCRAPER_RETENTION_WINDOW", &errs)
 	parseDuration(&cfg.Session.MaxAge, "SESSION_MAX_AGE", &errs)
 	parseDuration(&cfg.Session.IdleTimeout, "SESSION_IDLE_TIMEOUT", &errs)
 
diff --git a/admin/config/loader_test.go b/admin/config/loader_test.go
index 320597e..b134756 100644
--- a/admin/config/loader_test.go
+++ b/admin/config/loader_test.go
@@ -45,6 +45,9 @@ func TestLoad_NoFile_AppliesDefaults(t *testing.T) {
 	if cfg.Scraper.Timeout.Unwrap() != 5*time.Second {
 		t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 5*time.Second)
 	}
+	if cfg.Scraper.RetentionWindow.Unwrap() != 1*time.Hour {
+		t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 1*time.Hour)
+	}
 	if cfg.Session.MaxAge.Unwrap() != 24*time.Hour {
 		t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 24*time.Hour)
 	}
@@ -75,6 +78,7 @@ database:
 scraper:
   interval: "30s"
   timeout: "10s"
+  retention_window: "2h"
 session:
   max_age: "12h"
   idle_timeout: "1h"
@@ -107,6 +111,9 @@ log:
 	if cfg.Scraper.Timeout.Unwrap() != 10*time.Second {
 		t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 10*time.Second)
 	}
+	if cfg.Scraper.RetentionWindow.Unwrap() != 2*time.Hour {
+		t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 2*time.Hour)
+	}
 	if cfg.Session.MaxAge.Unwrap() != 12*time.Hour {
 		t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 12*time.Hour)
 	}
@@ -171,6 +178,7 @@ func TestLoad_EnvOverrides_AllFields(t *testing.T) {
 	t.Setenv("CHAPERONE_ADMIN_DATABASE_PATH", "/tmp/test.db")
 	t.Setenv("CHAPERONE_ADMIN_SCRAPER_INTERVAL", "20s")
 	t.Setenv("CHAPERONE_ADMIN_SCRAPER_TIMEOUT", "8s")
+	t.Setenv("CHAPERONE_ADMIN_SCRAPER_RETENTION_WINDOW", "30m")
 	t.Setenv("CHAPERONE_ADMIN_SESSION_MAX_AGE", "48h")
 	t.Setenv("CHAPERONE_ADMIN_SESSION_IDLE_TIMEOUT", "4h")
 	t.Setenv("CHAPERONE_ADMIN_AUDIT_RETENTION_DAYS", "60")
@@ -199,6 +207,9 @@ func TestLoad_EnvOverrides_AllFields(t *testing.T) {
 	if cfg.Scraper.Timeout.Unwrap() != 8*time.Second {
 		t.Errorf("Scraper.Timeout = %v, want %v", cfg.Scraper.Timeout.Unwrap(), 8*time.Second)
 	}
+	if cfg.Scraper.RetentionWindow.Unwrap() != 30*time.Minute {
+		t.Errorf("Scraper.RetentionWindow = %v, want %v", cfg.Scraper.RetentionWindow.Unwrap(), 30*time.Minute)
+	}
 	if cfg.Session.MaxAge.Unwrap() != 48*time.Hour {
 		t.Errorf("Session.MaxAge = %v, want %v", cfg.Session.MaxAge.Unwrap(), 48*time.Hour)
 	}
@@ -269,6 +280,9 @@ func TestApplyDefaults_ZeroConfig_SetsAllDefaults(t *testing.T) {
 	if cfg.Scraper.Timeout.Unwrap() != 5*time.Second {
 		t.Errorf("Scraper.Timeout = %v, want 5s", cfg.Scraper.Timeout.Unwrap())
 	}
+	if cfg.Scraper.RetentionWindow.Unwrap() != 1*time.Hour {
+		t.Errorf("Scraper.RetentionWindow = %v, want 1h", cfg.Scraper.RetentionWindow.Unwrap())
+	}
 	if cfg.Session.MaxAge.Unwrap() != 24*time.Hour {
t.Errorf("Session.MaxAge = %v, want 24h", cfg.Session.MaxAge.Unwrap()) } diff --git a/admin/config/validate.go b/admin/config/validate.go index 6eadfea..43e6342 100644 --- a/admin/config/validate.go +++ b/admin/config/validate.go @@ -33,6 +33,9 @@ func (c *Config) Validate() error { if c.Scraper.Timeout.Unwrap() >= c.Scraper.Interval.Unwrap() { errs = append(errs, errors.New("scraper.timeout must be less than scraper.interval")) } + if c.Scraper.RetentionWindow.Unwrap() < c.Scraper.Interval.Unwrap() { + errs = append(errs, errors.New("scraper.retention_window must be at least one scraper.interval")) + } if c.Session.MaxAge.Unwrap() < 1*time.Minute { errs = append(errs, errors.New("session.max_age must be at least 1m")) diff --git a/admin/config/validate_test.go b/admin/config/validate_test.go index 85ead1b..537d856 100644 --- a/admin/config/validate_test.go +++ b/admin/config/validate_test.go @@ -16,8 +16,9 @@ func validConfig() *Config { Server: ServerConfig{Addr: DefaultAddr}, Database: DatabaseConfig{Path: "./test.db"}, Scraper: ScraperConfig{ - Interval: Duration(10 * time.Second), - Timeout: Duration(5 * time.Second), + Interval: Duration(10 * time.Second), + Timeout: Duration(5 * time.Second), + RetentionWindow: Duration(1 * time.Hour), }, Session: SessionConfig{ MaxAge: Duration(24 * time.Hour), @@ -124,6 +125,43 @@ func TestValidate_TimeoutGteInterval_ReturnsError(t *testing.T) { } } +func TestValidate_RetentionWindowLessThanInterval_ReturnsError(t *testing.T) { + t.Parallel() + + // Arrange — retention window must be at least one interval long + cfg := validConfig() + cfg.Scraper.Interval = Duration(10 * time.Second) + cfg.Scraper.RetentionWindow = Duration(5 * time.Second) + + // Act + err := cfg.Validate() + + // Assert + if err == nil { + t.Fatal("expected error, got nil") + } + if !strings.Contains(err.Error(), "retention_window") { + t.Errorf("error = %q, want to contain %q", err.Error(), "retention_window") + } +} + +func TestValidate_RetentionWindowEqualsInterval_NoError(t *testing.T) { + t.Parallel() + + // Arrange — equal is allowed (degenerate but valid: capacity = 1) + cfg := validConfig() + cfg.Scraper.Interval = Duration(10 * time.Second) + cfg.Scraper.RetentionWindow = Duration(10 * time.Second) + + // Act + err := cfg.Validate() + + // Assert + if err != nil { + t.Errorf("unexpected error: %v", err) + } +} + func TestValidate_NegativeRetention_ReturnsError(t *testing.T) { t.Parallel() diff --git a/admin/metrics/metrics.go b/admin/metrics/metrics.go index 243a68d..fc6059b 100644 --- a/admin/metrics/metrics.go +++ b/admin/metrics/metrics.go @@ -7,10 +7,30 @@ package metrics import "time" -// DefaultCapacity is the number of scrape snapshots retained per instance. -// At 10s intervals this gives ~1 hour of history. +// DefaultCapacity is a generous ring size used by tests. Production code +// computes capacity from the configured retention window via CapacityFor. const DefaultCapacity = 360 +// CapacityFor returns the ring buffer capacity needed to retain `window` +// of history when scraping every `interval`. +// +// Snapshots span (capacity-1)*interval since rates and series are computed +// from adjacent pairs, so we round up and add one to guarantee the buffer +// covers at least `window`. Always returns at least 2 — the minimum needed +// for a single rate pair — so degenerate inputs never produce an unusable +// collector. 
+func CapacityFor(window, interval time.Duration) int {
+	const minCapacity = 2
+	if interval <= 0 || window <= 0 {
+		return minCapacity
+	}
+	n := int((window+interval-1)/interval) + 1
+	if n < minCapacity {
+		return minCapacity
+	}
+	return n
+}
+
 // Prometheus metric names emitted by the Chaperone proxy.
 const (
 	metricRequestsTotal = "chaperone_requests_total"
diff --git a/admin/metrics/metrics_test.go b/admin/metrics/metrics_test.go
new file mode 100644
index 0000000..6829304
--- /dev/null
+++ b/admin/metrics/metrics_test.go
@@ -0,0 +1,70 @@
+// Copyright 2026 CloudBlue LLC
+// SPDX-License-Identifier: Apache-2.0
+
+package metrics
+
+import (
+	"testing"
+	"time"
+)
+
+func TestCapacityFor_SpansAtLeastTheRequestedWindow(t *testing.T) {
+	t.Parallel()
+
+	// N snapshots at fixed interval I span (N-1)*I of wall-clock time, since
+	// rates and series are computed from adjacent pairs. Capacity must be
+	// large enough that (capacity-1)*interval >= window.
+	tests := []struct {
+		name     string
+		window   time.Duration
+		interval time.Duration
+	}{
+		{"1h at 10s", 1 * time.Hour, 10 * time.Second},
+		{"1h at 3s", 1 * time.Hour, 3 * time.Second},
+		{"30m at 10s", 30 * time.Minute, 10 * time.Second},
+		{"1h at 7s (non-divisible)", 1 * time.Hour, 7 * time.Second},
+		{"window equals interval", 10 * time.Second, 10 * time.Second},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := CapacityFor(tt.window, tt.interval)
+			span := time.Duration(got-1) * tt.interval
+			if span < tt.window {
+				t.Errorf("CapacityFor(%v, %v) = %d snapshots spans %v, want >= %v",
+					tt.window, tt.interval, got, span, tt.window)
+			}
+		})
+	}
+}
+
+func TestCapacityFor_AlwaysAllowsRatePair(t *testing.T) {
+	t.Parallel()
+
+	// Rate and series computation requires at least two adjacent snapshots.
+	// CapacityFor must never return less than 2, including for degenerate
+	// inputs that config validation would normally reject.
+	tests := []struct {
+		name     string
+		window   time.Duration
+		interval time.Duration
+	}{
+		{"window equals interval", 10 * time.Second, 10 * time.Second},
+		{"window smaller than interval", 1 * time.Second, 10 * time.Second},
+		{"zero interval", 1 * time.Hour, 0},
+		{"zero window", 0, 10 * time.Second},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := CapacityFor(tt.window, tt.interval)
+			if got < 2 {
+				t.Errorf("CapacityFor(%v, %v) = %d, want >= 2", tt.window, tt.interval, got)
+			}
+		})
+	}
+}

From 1813c9199cf8a6192b0a901e483578afec46fba4 Mon Sep 17 00:00:00 2001
From: Arnau Giralt
Date: Fri, 8 May 2026 17:35:33 +0200
Subject: [PATCH 2/3] docs(admin): sync admin portal guide with
 retention_window flag

Add retention_window to the YAML example and env var table, and rewrite
the history-retention bullet to reflect that capacity is now computed
from scraper.retention_window and scraper.interval at startup.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 docs/guides/admin-portal.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/guides/admin-portal.md b/docs/guides/admin-portal.md
index 31778ad..88548f2 100644
--- a/docs/guides/admin-portal.md
+++ b/docs/guides/admin-portal.md
@@ -52,6 +52,7 @@ database:
 scraper:
   interval: "10s"
   timeout: "5s"
+  retention_window: "1h"
 
 session:
   max_age: "24h"
@@ -80,6 +81,7 @@ Every config key can be overridden via environment variables using the `CHAPERON
 | `database.path` | `CHAPERONE_ADMIN_DATABASE_PATH` |
 | `scraper.interval` | `CHAPERONE_ADMIN_SCRAPER_INTERVAL` |
 | `scraper.timeout` | `CHAPERONE_ADMIN_SCRAPER_TIMEOUT` |
+| `scraper.retention_window` | `CHAPERONE_ADMIN_SCRAPER_RETENTION_WINDOW` |
 | `session.max_age` | `CHAPERONE_ADMIN_SESSION_MAX_AGE` |
 | `session.idle_timeout` | `CHAPERONE_ADMIN_SESSION_IDLE_TIMEOUT` |
 | `audit.retention_days` | `CHAPERONE_ADMIN_AUDIT_RETENTION_DAYS` |
@@ -172,4 +174,4 @@ All portal actions (instance add/edit/remove, login, logout, password changes) a
 - To view per-instance metrics, open the dashboard. It displays RPS, latency percentiles (p50, p95, p99), error rate, active connections, and panic count for each proxy, computed from each proxy's `/metrics` endpoint polled every 10 seconds.
 - To interpret health badges, read them as: **unknown** (before first poll), **healthy** (last poll succeeded), or **unreachable** (3 consecutive failures). A single successful poll restores an unreachable instance to healthy.
 - To wait through the post-restart placeholder, give the portal at least two scrape cycles (~20 seconds) after a restart — charts show "Collecting data points..." until two snapshots exist to compute rates from.
-- To plan around history retention, note that metrics are kept in memory only. The portal retains 360 scrape snapshots per instance (`DefaultCapacity` in `admin/metrics/metrics.go`), which at 10s intervals is exactly 1 hour of history. A restart clears all metrics.
+- To plan around history retention, note that metrics are kept in memory only. Each instance retains enough scrape snapshots to cover `scraper.retention_window` (default 1h) at the configured `scraper.interval` (default 10s) — capacity is computed at startup. A restart clears all metrics.

From 9bee208d1dc240b83b7f050a48c9736311464a58 Mon Sep 17 00:00:00 2001
From: Arnau Giralt
Date: Fri, 8 May 2026 17:47:31 +0200
Subject: [PATCH 3/3] feat(admin): couple trend window to retention window

historicalPair previously hardcoded a 50-minute readiness threshold and
a 1-hour lookback. With retention_window now configurable, this caused
trends to silently never render under 1h and to ignore the configured
window above 1h. Plumb the trend window through Collector and derive
both the threshold and the lookback from it.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 admin/api/metrics_test.go         | 10 ++--
 admin/cmd/chaperone-admin/main.go |  6 +-
 admin/metrics/collector.go        | 47 ++++++++-------
 admin/metrics/collector_test.go   | 97 ++++++++++++++++++++++++++-----
 admin/poller/poller_test.go       |  6 +-
 5 files changed, 119 insertions(+), 47 deletions(-)

diff --git a/admin/api/metrics_test.go b/admin/api/metrics_test.go
index 43f598d..c816a28 100644
--- a/admin/api/metrics_test.go
+++ b/admin/api/metrics_test.go
@@ -39,7 +39,7 @@ func makeSnapshot(t time.Time, totalReq, errReq, active, panics float64) metrics
 func TestMetricsHandler_Fleet_ReturnsAggregated(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	ctx := context.Background()
 	inst, err := st.CreateInstance(ctx, "proxy-1", "10.0.0.1:9090")
@@ -78,7 +78,7 @@ func TestMetricsHandler_Instance_ReturnsMetrics(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	ctx := context.Background()
 	inst, err := st.CreateInstance(ctx, "proxy-1", "10.0.0.1:9090")
@@ -117,7 +117,7 @@ func TestMetricsHandler_Instance_NoData_Returns404(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	h := NewMetricsHandler(st, c)
 	mux := http.NewServeMux()
@@ -135,7 +135,7 @@ func TestMetricsHandler_Instance_InvalidID_Returns400(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	h := NewMetricsHandler(st, c)
 	mux := http.NewServeMux()
@@ -153,7 +153,7 @@ func TestMetricsHandler_Fleet_EmptyFleet_ReturnsEmptyInstances(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	h := NewMetricsHandler(st, c)
 	mux := http.NewServeMux()
diff --git a/admin/cmd/chaperone-admin/main.go b/admin/cmd/chaperone-admin/main.go
index 75f6af6..d96fbc7 100644
--- a/admin/cmd/chaperone-admin/main.go
+++ b/admin/cmd/chaperone-admin/main.go
@@ -93,10 +93,10 @@ func runServer(args []string) error {
 	}
 	defer st.Close()
 
-	collector := metrics.NewCollector(metrics.CapacityFor(
+	collector := metrics.NewCollector(
+		metrics.CapacityFor(cfg.Scraper.RetentionWindow.Unwrap(), cfg.Scraper.Interval.Unwrap()),
 		cfg.Scraper.RetentionWindow.Unwrap(),
-		cfg.Scraper.Interval.Unwrap(),
-	))
+	)
 
 	srv, err := admin.NewServer(cfg, st, collector)
 	if err != nil {
diff --git a/admin/metrics/collector.go b/admin/metrics/collector.go
index 2e042cf..ee003c2 100644
--- a/admin/metrics/collector.go
+++ b/admin/metrics/collector.go
@@ -12,16 +12,20 @@ import (
 // Collector manages per-instance metric ring buffers and computes derived
 // metrics (rates, percentiles) on demand.
 type Collector struct {
-	mu       sync.RWMutex
-	buffers  map[int64]*Ring
-	capacity int
+	mu          sync.RWMutex
+	buffers     map[int64]*Ring
+	capacity    int
+	trendWindow time.Duration
 }
 
-// NewCollector creates a Collector with the given ring buffer capacity.
-func NewCollector(capacity int) *Collector {
+// NewCollector creates a Collector with the given ring buffer capacity and
+// trend lookback window. Trends compare current rates against rates from
+// approximately trendWindow ago; a zero trendWindow disables trends.
+func NewCollector(capacity int, trendWindow time.Duration) *Collector {
 	return &Collector{
-		buffers:  make(map[int64]*Ring),
-		capacity: capacity,
+		buffers:     make(map[int64]*Ring),
+		capacity:    capacity,
+		trendWindow: trendWindow,
 	}
 }
 
@@ -253,20 +257,22 @@ func (*Collector) fillVendorMetrics(im *InstanceMetrics, prev, curr Snapshot) {
 	})
 }
 
-// historicalPair returns the two snapshots forming a rate pair from ~1 hour
-// ago in the ring buffer. If the buffer doesn't span at least 50 minutes,
-// ok is false.
-func historicalPair(buf *Ring) (prev, curr Snapshot, ok bool) {
-	if buf.Len() < 4 {
+// historicalPair returns the two snapshots forming a rate pair from
+// approximately c.trendWindow ago in the ring buffer. The buffer must span
+// at least 5/6 of the trend window before a pair is returned, which keeps
+// the comparison meaningful while the ring is still filling. If the trend
+// window is zero or the buffer is too short, ok is false.
+func (c *Collector) historicalPair(buf *Ring) (prev, curr Snapshot, ok bool) {
+	if c.trendWindow == 0 || buf.Len() < 4 {
 		return Snapshot{}, Snapshot{}, false
 	}
 
 	newest := buf.At(buf.Len() - 1)
 	oldest := buf.At(0)
-	if newest.Time.Sub(oldest.Time) < 50*time.Minute {
+	if newest.Time.Sub(oldest.Time) < (c.trendWindow*5)/6 {
 		return Snapshot{}, Snapshot{}, false
 	}
-	target := newest.Time.Add(-1 * time.Hour)
+	target := newest.Time.Add(-c.trendWindow)
 	idx := findNearest(buf, target)
 	start := idx
 	if start > 0 {
@@ -279,9 +285,9 @@
 }
 
 // fillTrends computes trend values by comparing the current rate to the rate
-// from approximately 1 hour ago.
-func (*Collector) fillTrends(im *InstanceMetrics, buf *Ring) {
-	prev, curr, ok := historicalPair(buf)
+// from approximately c.trendWindow ago.
+func (c *Collector) fillTrends(im *InstanceMetrics, buf *Ring) {
+	prev, curr, ok := c.historicalPair(buf)
 	if !ok {
 		return
 	}
@@ -302,9 +308,10 @@ type historicalTrend struct {
 	errDelta float64
 }
 
-// trendSnapshot returns historical RPS and request/error deltas from ~1h ago.
-func (*Collector) trendSnapshot(buf *Ring) (historicalTrend, bool) {
-	prev, curr, ok := historicalPair(buf)
+// trendSnapshot returns historical RPS and request/error deltas from
+// approximately c.trendWindow ago.
+func (c *Collector) trendSnapshot(buf *Ring) (historicalTrend, bool) {
+	prev, curr, ok := c.historicalPair(buf)
 	if !ok {
 		return historicalTrend{}, false
 	}
diff --git a/admin/metrics/collector_test.go b/admin/metrics/collector_test.go
index 8110d44..11dddc0 100644
--- a/admin/metrics/collector_test.go
+++ b/admin/metrics/collector_test.go
@@ -51,7 +51,7 @@ func makeSnapshot(t time.Time, totalReq, errReq, active, panics float64) Snapsho
 
 func TestCollector_RecordScrape_ParsesAndStores(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	err := c.RecordScrape(1, []byte(sampleMetrics), time.Now())
 	if err != nil {
@@ -72,7 +72,7 @@ func TestCollector_RecordScrape_MalformedData_ReturnsError(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	err := c.RecordScrape(1, []byte("# TYPE foo gauge\n# TYPE foo counter\nfoo 1\n"), time.Now())
 	if err == nil {
@@ -82,7 +82,7 @@ func TestCollector_Remove(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	c.Record(1, Snapshot{Time: time.Now()})
 	c.Remove(1)
@@ -97,7 +97,7 @@ func TestCollector_Prune(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 	c.Record(1, Snapshot{Time: time.Now()})
 	c.Record(2, Snapshot{Time: time.Now()})
 	c.Record(3, Snapshot{Time: time.Now()})
@@ -115,7 +115,7 @@ func TestCollector_GetInstanceMetrics_NoData_ReturnsNil(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 	if got := c.GetInstanceMetrics(99); got != nil {
 		t.Error("expected nil for unknown instance")
 	}
@@ -123,7 +123,7 @@ func TestCollector_GetInstanceMetrics_SingleSnapshot_NoRates(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	c.Record(1, makeSnapshot(time.Now(), 1000, 50, 10, 2))
 	im := c.GetInstanceMetrics(1)
@@ -144,7 +144,7 @@ func TestCollector_GetInstanceMetrics_TwoSnapshots_ComputesRates(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	t1 := t0.Add(10 * time.Second)
@@ -189,7 +189,7 @@ func TestCollector_GetInstanceMetrics_SeriesGenerated(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(100)
+	c := NewCollector(100, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	for i := 0; i < 5; i++ {
@@ -225,7 +225,7 @@ func TestCollector_GetInstanceMetrics_TrendWithEnoughData(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(DefaultCapacity)
+	c := NewCollector(DefaultCapacity, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC)
 	// Fill 1h of data at 10s intervals
@@ -253,7 +253,7 @@ func TestCollector_GetInstanceMetrics_NoTrendWithInsufficientData(t *testing.T)
 	t.Parallel()
-	c := NewCollector(100)
+	c := NewCollector(100, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	for i := 0; i < 5; i++ {
@@ -277,7 +277,7 @@ func TestCollector_GetFleetMetrics_AggregatesAcrossInstances(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	t1 := t0.Add(10 * time.Second)
@@ -306,7 +306,7 @@ func TestCollector_GetFleetMetrics_SkipsMissingInstances(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	t1 := t0.Add(10 * time.Second)
@@ -344,7 +344,7 @@ func TestAddHistograms_MismatchedBoundaries_FallsBack(t *testing.T) {
 
 func TestGetFleetMetrics_CounterReset_DoesNotCorruptErrorRate(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	t1 := t0.Add(10 * time.Second)
@@ -370,7 +370,7 @@ func TestGetFleetMetrics_ErrorRateTrend_Populated(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(DefaultCapacity)
+	c := NewCollector(DefaultCapacity, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC)
 	for i := 0; i <= 360; i++ {
@@ -391,7 +391,7 @@ func TestCollector_GetInstanceSummary_NoData_ReturnsNil(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 	if got := c.GetInstanceSummary(1); got != nil {
 		t.Error("expected nil for unknown instance")
 	}
@@ -399,7 +399,7 @@ func TestCollector_GetInstanceSummary_ComputesKPIs(t *testing.T) {
 	t.Parallel()
-	c := NewCollector(10)
+	c := NewCollector(10, time.Hour)
 
 	t0 := time.Date(2026, 3, 7, 12, 0, 0, 0, time.UTC)
 	t1 := t0.Add(10 * time.Second)
@@ -417,3 +417,68 @@ func TestCollector_GetInstanceSummary_ComputesKPIs(t *testing.T) {
 		t.Errorf("ActiveConnections = %v, want 12", s.ActiveConnections)
 	}
 }
+
+func TestCollector_TrendWindow_LongerThanOneHour_LooksBackFullWindow(t *testing.T) {
+	t.Parallel()
+	// With trendWindow=2h, the historical comparison must look back ~2h, not
+	// the previously hardcoded 1h. Data has two regimes:
+	//   pairs[0..360):    100 req/s
+	//   pairs[360..1080): 200 req/s
+	// At the latest snapshot RPS ~200. RPS 1h ago is also ~200 (regime B), so
+	// the old hardcoded lookback would yield trend ~0. RPS 2h ago is ~100
+	// (regime A), so a correct 2h lookback yields trend ~100.
+	c := NewCollector(1080, 2*time.Hour)
+	t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC)
+
+	var totalReq, totalErr float64
+	const boundary = 360
+	for i := 0; i < 1080; i++ {
+		ts := t0.Add(time.Duration(i) * 10 * time.Second)
+		c.Record(1, makeSnapshot(ts, totalReq, totalErr, 10, 0))
+		if i < boundary {
+			totalReq += 1000
+			totalErr += 50
+		} else {
+			totalReq += 2000
+			totalErr += 100
+		}
+	}
+
+	im := c.GetInstanceMetrics(1)
+	if im == nil {
+		t.Fatal("expected non-nil InstanceMetrics")
+	}
+	if im.RPSTrend == nil {
+		t.Fatal("expected RPSTrend to be set")
+	}
+	if math.Abs(*im.RPSTrend-100) > 5 {
+		t.Errorf("RPSTrend = %v, want ~100 (2h lookback should compare against regime A)", *im.RPSTrend)
+	}
+}
+
+func TestCollector_TrendWindow_ShortWindow_RendersOnceBufferSpansIt(t *testing.T) {
+	t.Parallel()
+	// With trendWindow=30min, trends must render once the buffer spans
+	// 5/6 of that (25min). Previously the hardcoded 50min threshold would
+	// have suppressed the trend entirely for any retention window under 1h.
+	c := NewCollector(180, 30*time.Minute)
+	t0 := time.Date(2026, 3, 7, 11, 0, 0, 0, time.UTC)
+
+	for i := 0; i < 151; i++ {
+		c.Record(1, makeSnapshot(
+			t0.Add(time.Duration(i)*10*time.Second),
+			float64(i*100),
+			float64(i*5),
+			10,
+			0,
+		))
+	}
+
+	im := c.GetInstanceMetrics(1)
+	if im == nil {
+		t.Fatal("expected non-nil InstanceMetrics")
+	}
+	if im.RPSTrend == nil {
+		t.Error("expected RPSTrend to be set with 25min of data when trendWindow=30min")
+	}
+}
diff --git a/admin/poller/poller_test.go b/admin/poller/poller_test.go
index 84ba627..e553299 100644
--- a/admin/poller/poller_test.go
+++ b/admin/poller/poller_test.go
@@ -298,7 +298,7 @@ func TestPoller_RunStopsOnContextCancel(t *testing.T) {
 func TestPoller_MetricsScraping_RecordsToCollector(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	proxy := fakeProxyWithMetrics(t)
 	addr := strings.TrimPrefix(proxy.URL, "http://")
@@ -327,7 +327,7 @@ func TestPoller_MetricsScraping_SkippedOnHealthFailure(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	ctx := context.Background()
 	inst, err := st.CreateInstance(ctx, "test-proxy", "127.0.0.1:1")
@@ -347,7 +347,7 @@ func TestPoller_DeletedInstance_PrunesCollector(t *testing.T) {
 	t.Parallel()
 	st := openTestStore(t)
-	c := metrics.NewCollector(10)
+	c := metrics.NewCollector(10, time.Hour)
 
 	proxy := fakeProxyWithMetrics(t)
 	addr := strings.TrimPrefix(proxy.URL, "http://")
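To see the readiness rule from PATCH 3/3 in isolation, here is a minimal standalone Go sketch. trendReady is a hypothetical helper named only for this illustration; the 5/6 factor and the zero-window guard mirror the patched historicalPair:

package main

import (
	"fmt"
	"time"
)

// trendReady reports whether a ring spanning `span` of history is ready to
// render trends for the given lookback window, per the patched historicalPair:
// a zero window disables trends, and the buffer must cover 5/6 of the window.
func trendReady(span, trendWindow time.Duration) bool {
	if trendWindow == 0 {
		return false
	}
	return span >= (trendWindow*5)/6
}

func main() {
	// 30m window: ready once the ring spans 25m, the case the second new test exercises.
	fmt.Println(trendReady(25*time.Minute, 30*time.Minute)) // true
	// 1h window: the threshold works out to 50m, matching the previously hardcoded value.
	fmt.Println(trendReady(49*time.Minute, time.Hour)) // false
	fmt.Println(trendReady(50*time.Minute, time.Hour)) // true
	// Zero window: trends stay disabled regardless of accumulated history.
	fmt.Println(trendReady(2*time.Hour, 0)) // false
}

Deriving both the readiness threshold and the lookback from the same configured window is what keeps the two failure modes in the commit message (never rendering under 1h, ignoring windows above 1h) from reappearing.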