Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 50 additions & 12 deletions cmd/server/clock_skew.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,17 @@ const (
// issue #789). The all-time median is poisoned by historical bad
// samples (e.g. a node that was off and then GPS-corrected); severity
// must reflect current health, not lifetime statistics.
recentSkewWindowCount = 5
//
// Widened from 5 → 20 to add hysteresis: a brief burst of bad samples
// in a known-bimodal node should not flip its severity to "no_clock"
// (see classification rule below that also gates on long-term goodFraction).
recentSkewWindowCount = 20

// recentSkewWindowSec bounds the recent-window in time as well: only
// samples from the last N seconds count as "recent" for severity.
// The effective window is min(recentSkewWindowCount, samples in 1h).
recentSkewWindowSec = 3600
// The effective window is min(recentSkewWindowCount, samples in 6h).
// Widened from 1h → 6h to match the larger sample budget.
recentSkewWindowSec = 21600

// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
// above which a sample is considered "bad" — likely firmware emitting
Expand Down Expand Up @@ -118,6 +123,7 @@ type NodeClockSkew struct {
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input)
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
Expand Down Expand Up @@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
}
}

// ── Bimodal detection (#845) ─────────────────────────────────────────
// ── Bimodal detection (#845, hysteresis) ─────────────────────────────
// Split recent samples into "good" (|skew| <= 1h, real clock) and
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
// Classification order (first match wins):
// no_clock — goodFraction < 0.10 (essentially no real clock)
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
// no_clock — recent goodFraction < 0.10 AND long-term goodFraction < 0.10
// (the long-term gate is hysteresis: a bimodal node that
// hits a transient burst of bad samples must NOT flip
// to no_clock — it's still bimodal historically)
// bimodal_clock — recent goodFraction < 0.80 AND badCount > 0
// (also catches nodes where recent < 0.10 but long-term
// is healthier — i.e. flaky rather than dead)
// ok/warn/etc. — recent goodFraction >= 0.80 (normal, outliers filtered)
var goodSamples []float64
for _, v := range recentVals {
if math.Abs(v) <= bimodalSkewThresholdSec {
Expand All @@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
}

// Long-term goodFraction across ALL samples — used as hysteresis to
// prevent a recent burst of bad samples from flipping a bimodal node
// to no_clock. If at least 10% of a node's all-time samples are good
// (i.e. it has demonstrably had a real clock at some point), it is
// classified as bimodal rather than no_clock even when the recent
// window is 100% bad.
longTermGoodCount := 0
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
longTermGoodCount++
}
}
var longTermGoodFraction float64
if len(tsSkews) > 0 {
longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews))
}

var severity SkewSeverity
if goodFraction < 0.10 {
// Essentially no real clock — classify as no_clock regardless
// of the raw skew magnitude.
if goodFraction < 0.10 && longTermGoodFraction < 0.10 {
// Essentially no real clock — recent AND long-term agree.
severity = SkewNoClock
} else if goodFraction < 0.80 && recentBadCount > 0 {
// Bimodal: use median of GOOD samples as the "real" skew.
} else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) {
// Bimodal: recent window is mixed, OR recent is all-bad but the node
// has historical good samples (transient bad-burst on a flaky node).
// Use median of GOOD samples — prefer recent good if present, else
// fall back to long-term good median so the displayed skew is meaningful.
severity = SkewBimodalClock
if len(goodSamples) > 0 {
recentSkew = median(goodSamples)
} else if longTermGoodCount > 0 {
ltGood := make([]float64, 0, longTermGoodCount)
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
ltGood = append(ltGood, p.skew)
}
}
recentSkew = median(ltGood)
}
} else {
// Normal path: if there are good samples, use their median
Expand Down Expand Up @@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
LastObservedTS: lastObsTS,
Samples: samples,
GoodFraction: round(goodFraction, 2),
LongTermGoodFraction: round(longTermGoodFraction, 2),
RecentBadSampleCount: recentBadCount,
RecentSampleCount: recentSampleCount,
}
Expand Down
133 changes: 126 additions & 7 deletions cmd/server/clock_skew_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) {

baseObs := int64(1700000000)
var txs []*StoreTx
for i := 0; i < 105; i++ {
// 100 bad samples then 25 good — recent window (20) is dominated by good.
for i := 0; i < 125; i++ {
obsTS := baseObs + int64(i)*300 // 5 min apart
var skew int64 = -60
if i >= 100 {
Expand Down Expand Up @@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) {

baseObs := int64(1700000000)
var txs []*StoreTx
// 1657 samples with the bad ~-683-day skew (the historical poison),
// then 5 freshly corrected samples at -0.8s — totals 1662.
for i := 0; i < 1662; i++ {
// 1660 samples with the bad ~-683-day skew (the historical poison),
// then 20 freshly corrected samples at -0.8s — totals 1680.
// Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20).
for i := 0; i < 1680; i++ {
obsTS := baseObs + int64(i)*60 // 1 min apart
var skew int64
if i < 1657 {
if i < 1660 {
skew = -59063561 // ~ -683 days
} else {
skew = -1 // corrected (rounded; reporter saw -0.8)
Expand Down Expand Up @@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) {
t.Fatal("nil result")
}
// Severity must reflect current health, not the all-time median.
if r.Severity != SkewOK && r.Severity != SkewWarning {
t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity)
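// Post-#845 + hysteresis: bimodal_clock is also tolerated here, on the
// grounds that a node with massive historical bad samples has a flaky
// RTC that operators may want to know about.
// NOTE(review): the classification rule only yields bimodal_clock when
// recent goodFraction < 0.80, so a fully clean recent window should take
// the ok/warn path regardless of long-term history — confirm whether
// this relaxation is actually needed.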
if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity)
}
if math.Abs(r.RecentMedianSkewSec) > 5 {
t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec)
Expand Down Expand Up @@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) {
t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount)
}
}

// TestBimodalHysteresis: a node with mostly good long-term samples but a
// recent burst of all-bad samples must stay bimodal_clock, NOT flip to
// no_clock. This is the "Kpa Roof Solar" scenario seen on staging
// (2026-04-22): historically bimodal node hits a transient all-bad burst
// and the operator briefly sees "🚫 No Clock" even though the most recent
// real advert decoded with a valid 2026 timestamp.
func TestBimodalHysteresis(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr)
for i := 0; i < 80; i++ {
obsTS := baseObs + int64(i)*60
var skew int64 = -2
if i%2 == 0 {
skew = -58000000
}
tx := &StoreTx{
Hash: fmt.Sprintf("hist-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
// 25 recent samples ALL bad — fills the recent window (size 20) entirely
// with bad samples. recent goodFraction = 0.
for i := 80; i < 105; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("badburst-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["BIHYST"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()

r := ps.GetNodeClockSkew("BIHYST")
if r == nil {
t.Fatal("nil result")
}
// Without hysteresis: severity would be no_clock (recent goodFraction=0).
// With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal.
if r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity)
}
if r.GoodFraction != 0 {
t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction)
}
if r.LongTermGoodFraction < 0.10 {
t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction)
}
// Displayed skew should be the long-term good median (-2s), not the
// nonsense bad value, so the operator sees a meaningful number.
if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 {
t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec)
}
}

// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When
// BOTH the recent window and the long-term goodFraction are essentially 0,
// the node is genuinely no_clock (uninitialized RTC throughout).
func TestNoClock_BothWindowsBad(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 50 samples — all bad.
for i := 0; i < 50; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("dead-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["DEADCLOCK"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()

r := ps.GetNodeClockSkew("DEADCLOCK")
if r == nil {
t.Fatal("nil result")
}
if r.Severity != SkewNoClock {
t.Errorf("severity = %v, want no_clock", r.Severity)
}
if r.LongTermGoodFraction != 0 {
t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction)
}
}
Loading