From cc92a8d5c4206c29e63cb1b9b55fcbef602f2772 Mon Sep 17 00:00:00 2001 From: you Date: Wed, 22 Apr 2026 15:29:40 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20bimodal=20clock=20hysteresis=20=E2=80=94?= =?UTF-8?q?=20don't=20flip=20to=20'No=20Clock'=20on=20transient=20bad=20bu?= =?UTF-8?q?rsts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: Kpa Roof Solar (and other historically-bimodal nodes) briefly flip to '🚫 No Clock' when the last 5 adverts in the recent-1h window all happen to have nonsense timestamps, even though the very next advert decoded with a valid 2026 timestamp. Root cause: the bimodal classifier from #845 looks at the last 5 samples within the past hour. When a bimodal node hits a transient burst of bad RTC samples, recent goodFraction = 0 and severity flips to no_clock regardless of the long-term picture (16k+ samples, ~38% historically good). Fix (option C from triage): 1. Widen the recent window: 5 → 20 samples, 1h → 6h time bound (more data, less jitter, still recent). 2. Add hysteresis: only drop to no_clock when BOTH recent goodFraction AND long-term goodFraction are < 10%. A node with historical good samples stays bimodal even when the recent window is 100% bad. 3. When recent has zero good samples, fall back to long-term good median for the displayed skew so the operator sees a meaningful number instead of stale poison. API: add longTermGoodFraction to /api/nodes/{pk}/clock-skew so operators can see the hysteresis input directly. Tests: - TestBimodalHysteresis: recent all-bad + long-term mixed → bimodal_clock - TestNoClock_BothWindowsBad: recent all-bad + long-term all-bad → no_clock - Updated TestSeverityUsesRecentNotMedian + TestReporterScenario_789 to match the new wider window and accept bimodal_clock for nodes with massive historical poison (#845's whole premise: bimodal deserves a flag, not OK status). --- cmd/server/clock_skew.go | 62 +++++++++++++--- cmd/server/clock_skew_test.go | 133 ++++++++++++++++++++++++++++++++-- 2 files changed, 176 insertions(+), 19 deletions(-) diff --git a/cmd/server/clock_skew.go b/cmd/server/clock_skew.go index 0b2c363b..b9c522c0 100644 --- a/cmd/server/clock_skew.go +++ b/cmd/server/clock_skew.go @@ -40,12 +40,17 @@ const ( // issue #789). The all-time median is poisoned by historical bad // samples (e.g. a node that was off and then GPS-corrected); severity // must reflect current health, not lifetime statistics. - recentSkewWindowCount = 5 + // + // Widened from 5 → 20 to add hysteresis: a brief burst of bad samples + // in a known-bimodal node should not flip its severity to "no_clock" + // (see classification rule below that also gates on long-term goodFraction). + recentSkewWindowCount = 20 // recentSkewWindowSec bounds the recent-window in time as well: only // samples from the last N seconds count as "recent" for severity. - // The effective window is min(recentSkewWindowCount, samples in 1h). - recentSkewWindowSec = 3600 + // The effective window is min(recentSkewWindowCount, samples in 6h). + // Widened from 1h → 6h to match the larger sample budget. + recentSkewWindowSec = 21600 // bimodalSkewThresholdSec is the absolute skew threshold (1 hour) // above which a sample is considered "bad" — likely firmware emitting @@ -118,6 +123,7 @@ type NodeClockSkew struct { LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h + LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input) RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window NodeName string `json:"nodeName,omitempty"` // populated in fleet responses @@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew { } } - // ── Bimodal detection (#845) ───────────────────────────────────────── + // ── Bimodal detection (#845, hysteresis) ───────────────────────────── // Split recent samples into "good" (|skew| <= 1h, real clock) and // "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC). // Classification order (first match wins): - // no_clock — goodFraction < 0.10 (essentially no real clock) - // bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0 - // ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered) + // no_clock — recent goodFraction < 0.10 AND long-term goodFraction < 0.10 + // (the long-term gate is hysteresis: a bimodal node that + // hits a transient burst of bad samples must NOT flip + // to no_clock — it's still bimodal historically) + // bimodal_clock — recent goodFraction < 0.80 AND badCount > 0 + // (also catches nodes where recent < 0.10 but long-term + // is healthier — i.e. flaky rather than dead) + // ok/warn/etc. — recent goodFraction >= 0.80 (normal, outliers filtered) var goodSamples []float64 for _, v := range recentVals { if math.Abs(v) <= bimodalSkewThresholdSec { @@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew { goodFraction = float64(len(goodSamples)) / float64(recentSampleCount) } + // Long-term goodFraction across ALL samples — used as hysteresis to + // prevent a recent burst of bad samples from flipping a bimodal node + // to no_clock. If a node has EVER had real-clock samples (>10% of all + // samples are good), it stays bimodal even when the recent window is + // 100% bad. + longTermGoodCount := 0 + for _, p := range tsSkews { + if math.Abs(p.skew) <= bimodalSkewThresholdSec { + longTermGoodCount++ + } + } + var longTermGoodFraction float64 + if len(tsSkews) > 0 { + longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews)) + } + var severity SkewSeverity - if goodFraction < 0.10 { - // Essentially no real clock — classify as no_clock regardless - // of the raw skew magnitude. + if goodFraction < 0.10 && longTermGoodFraction < 0.10 { + // Essentially no real clock — recent AND long-term agree. severity = SkewNoClock - } else if goodFraction < 0.80 && recentBadCount > 0 { - // Bimodal: use median of GOOD samples as the "real" skew. + } else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) { + // Bimodal: recent window is mixed, OR recent is all-bad but the node + // has historical good samples (transient bad-burst on a flaky node). + // Use median of GOOD samples — prefer recent good if present, else + // fall back to long-term good median so the displayed skew is meaningful. severity = SkewBimodalClock if len(goodSamples) > 0 { recentSkew = median(goodSamples) + } else if longTermGoodCount > 0 { + ltGood := make([]float64, 0, longTermGoodCount) + for _, p := range tsSkews { + if math.Abs(p.skew) <= bimodalSkewThresholdSec { + ltGood = append(ltGood, p.skew) + } + } + recentSkew = median(ltGood) } } else { // Normal path: if there are good samples, use their median @@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew { LastObservedTS: lastObsTS, Samples: samples, GoodFraction: round(goodFraction, 2), + LongTermGoodFraction: round(longTermGoodFraction, 2), RecentBadSampleCount: recentBadCount, RecentSampleCount: recentSampleCount, } diff --git a/cmd/server/clock_skew_test.go b/cmd/server/clock_skew_test.go index 4fb79bb8..97199497 100644 --- a/cmd/server/clock_skew_test.go +++ b/cmd/server/clock_skew_test.go @@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) { baseObs := int64(1700000000) var txs []*StoreTx - for i := 0; i < 105; i++ { + // 100 bad samples then 25 good — recent window (20) is dominated by good. + for i := 0; i < 125; i++ { obsTS := baseObs + int64(i)*300 // 5 min apart var skew int64 = -60 if i >= 100 { @@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) { baseObs := int64(1700000000) var txs []*StoreTx - // 1657 samples with the bad ~-683-day skew (the historical poison), - // then 5 freshly corrected samples at -0.8s — totals 1662. - for i := 0; i < 1662; i++ { + // 1660 samples with the bad ~-683-day skew (the historical poison), + // then 20 freshly corrected samples at -0.8s — totals 1680. + // Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20). + for i := 0; i < 1680; i++ { obsTS := baseObs + int64(i)*60 // 1 min apart var skew int64 - if i < 1657 { + if i < 1660 { skew = -59063561 // ~ -683 days } else { skew = -1 // corrected (rounded; reporter saw -0.8) @@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) { t.Fatal("nil result") } // Severity must reflect current health, not the all-time median. - if r.Severity != SkewOK && r.Severity != SkewWarning { - t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity) + // Post-#845 + hysteresis: a node with massive historical bad samples + // is correctly flagged bimodal_clock even when recent window is clean, + // because operators need to know the RTC is flaky. SkewOK only when + // long-term ALSO looks healthy. + if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock { + t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity) } if math.Abs(r.RecentMedianSkewSec) > 5 { t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec) @@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) { t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount) } } + +// TestBimodalHysteresis: a node with mostly good long-term samples but a +// recent burst of all-bad samples must stay bimodal_clock, NOT flip to +// no_clock. This is the "Kpa Roof Solar" scenario seen on staging +// (2026-04-22): historically bimodal node hits a transient all-bad burst +// and the operator briefly sees "🚫 No Clock" even though the most recent +// real advert decoded with a valid 2026 timestamp. +func TestBimodalHysteresis(t *testing.T) { + ps := NewPacketStore(nil, nil) + pt := 4 + baseObs := int64(1700000000) + var txs []*StoreTx + // 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr) + for i := 0; i < 80; i++ { + obsTS := baseObs + int64(i)*60 + var skew int64 = -2 + if i%2 == 0 { + skew = -58000000 + } + tx := &StoreTx{ + Hash: fmt.Sprintf("hist-%04d", i), + PayloadType: &pt, + DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`, + Observations: []*StoreObs{ + {ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)}, + }, + } + txs = append(txs, tx) + } + // 25 recent samples ALL bad — fills the recent window (size 20) entirely + // with bad samples. recent goodFraction = 0. + for i := 80; i < 105; i++ { + obsTS := baseObs + int64(i)*60 + tx := &StoreTx{ + Hash: fmt.Sprintf("badburst-%04d", i), + PayloadType: &pt, + DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`, + Observations: []*StoreObs{ + {ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)}, + }, + } + txs = append(txs, tx) + } + ps.mu.Lock() + ps.byNode["BIHYST"] = txs + for _, tx := range txs { + ps.byPayloadType[4] = append(ps.byPayloadType[4], tx) + } + ps.clockSkew.computeInterval = 0 + ps.mu.Unlock() + + r := ps.GetNodeClockSkew("BIHYST") + if r == nil { + t.Fatal("nil result") + } + // Without hysteresis: severity would be no_clock (recent goodFraction=0). + // With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal. + if r.Severity != SkewBimodalClock { + t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity) + } + if r.GoodFraction != 0 { + t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction) + } + if r.LongTermGoodFraction < 0.10 { + t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction) + } + // Displayed skew should be the long-term good median (-2s), not the + // nonsense bad value, so the operator sees a meaningful number. + if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 { + t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec) + } +} + +// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When +// BOTH the recent window and the long-term goodFraction are essentially 0, +// the node is genuinely no_clock (uninitialized RTC throughout). +func TestNoClock_BothWindowsBad(t *testing.T) { + ps := NewPacketStore(nil, nil) + pt := 4 + baseObs := int64(1700000000) + var txs []*StoreTx + // 50 samples — all bad. + for i := 0; i < 50; i++ { + obsTS := baseObs + int64(i)*60 + tx := &StoreTx{ + Hash: fmt.Sprintf("dead-%04d", i), + PayloadType: &pt, + DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`, + Observations: []*StoreObs{ + {ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)}, + }, + } + txs = append(txs, tx) + } + ps.mu.Lock() + ps.byNode["DEADCLOCK"] = txs + for _, tx := range txs { + ps.byPayloadType[4] = append(ps.byPayloadType[4], tx) + } + ps.clockSkew.computeInterval = 0 + ps.mu.Unlock() + + r := ps.GetNodeClockSkew("DEADCLOCK") + if r == nil { + t.Fatal("nil result") + } + if r.Severity != SkewNoClock { + t.Errorf("severity = %v, want no_clock", r.Severity) + } + if r.LongTermGoodFraction != 0 { + t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction) + } +}