From f484c8ed98a5d4450568bcea6e41985b6d8d65d3 Mon Sep 17 00:00:00 2001 From: vishal <1117327+vishalchangrani@users.noreply.github.com> Date: Wed, 20 May 2026 14:29:47 -0400 Subject: [PATCH 1/2] extend cluster topic metrics cleanup to cover all per-topic metric families Co-Authored-By: Claude Sonnet 4.6 --- module/metrics/gossipsub_score.go | 10 ++++++++++ module/metrics/network.go | 23 ++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/module/metrics/gossipsub_score.go b/module/metrics/gossipsub_score.go index 15e3628469c..4b3242108f0 100644 --- a/module/metrics/gossipsub_score.go +++ b/module/metrics/gossipsub_score.go @@ -161,3 +161,13 @@ func (g *GossipSubScoreMetrics) OnInvalidMessageDeliveredUpdated(topic channels. func (g *GossipSubScoreMetrics) SetWarningStateCount(u uint) { g.warningStateGauge.Set(float64(u)) } + +// OnClusterTopicMetricsCleanup removes all per-topic scoring metric label values associated with +// the given cluster topic. Call this when the local node leaves a cluster topic to prevent +// unbounded metric cardinality growth across epoch transitions. +func (g *GossipSubScoreMetrics) OnClusterTopicMetricsCleanup(topic string) { + g.timeInMesh.DeleteLabelValues(topic) + g.meshMessageDelivery.DeleteLabelValues(topic) + g.firstMessageDelivery.DeleteLabelValues(topic) + g.invalidMessageDelivery.DeleteLabelValues(topic) +} diff --git a/module/metrics/network.go b/module/metrics/network.go index 98ec52e04e5..65a02229854 100644 --- a/module/metrics/network.go +++ b/module/metrics/network.go @@ -277,19 +277,32 @@ func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, mes // OnClusterTopicMetricsCleanup removes all metric label values associated with the given cluster topic. // This prevents unbounded metric cardinality growth during epoch transitions when collection nodes // join new clusters and leave old ones. Only call this for cluster topics (sync-cluster-*, consensus-cluster-*). -// This method overrides the embedded LocalGossipSubRouterMetrics.OnClusterTopicMetricsCleanup to also -// clean up inbound/outbound message size metrics and iHave message ID metrics. func (nc *NetworkCollector) OnClusterTopicMetricsCleanup(topic string) { - // Clean up LocalGossipSubRouterMetrics (localMeshSize, peerGraftTopicCount, peerPruneTopicCount) + // LocalGossipSubRouterMetrics: localMeshSize, peerGraftTopicCount, peerPruneTopicCount nc.LocalGossipSubRouterMetrics.OnClusterTopicMetricsCleanup(topic) - // Clean up GossipSubRpcValidationInspectorMetrics (receivedIHaveMsgIDsHistogram) + // GossipSubRpcValidationInspectorMetrics: receivedIHaveMsgIDsHistogram nc.GossipSubRpcValidationInspectorMetrics.OnClusterTopicMetricsCleanup(topic) - // Clean up inbound/outbound message size metrics using partial match on topic + // GossipSubScoreMetrics: timeInMesh, meshMessageDelivery, firstMessageDelivery, invalidMessageDelivery + nc.GossipSubScoreMetrics.OnClusterTopicMetricsCleanup(topic) + + // inbound/outbound message size and duplicate drop counters (multi-label: channel + protocol + message) nc.inboundMessageSize.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) nc.outboundMessageSize.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) nc.duplicateMessagesDropped.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + + // message processing gauges and inbound process time counter (single label: channel) + nc.numMessagesProcessing.DeleteLabelValues(topic) + nc.numDirectMessagesSending.DeleteLabelValues(topic) + nc.inboundProcessTime.DeleteLabelValues(topic) + + // security metrics (multi-label: role + message + channel + reason) + nc.unAuthorizedMessagesCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + nc.rateLimitedUnicastMessagesCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + + // ALSP misbehavior counter (multi-label: channel + misbehavior) + nc.AlspMetrics.reportedMisbehaviorCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) } func (nc *NetworkCollector) MessageAdded(priority int) { From 1acbdcf2f5593bfeff9091f0e847597024c73791 Mon Sep 17 00:00:00 2001 From: vishal <1117327+vishalchangrani@users.noreply.github.com> Date: Wed, 20 May 2026 15:09:24 -0400 Subject: [PATCH 2/2] add public OnClusterTopicMetricsCleanup to AlspMetrics instead of accessing private field Co-Authored-By: Claude Sonnet 4.6 --- module/metrics/alsp.go | 6 ++++++ module/metrics/network.go | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/module/metrics/alsp.go b/module/metrics/alsp.go index 3d5dc2bc510..888ca4dd18d 100644 --- a/module/metrics/alsp.go +++ b/module/metrics/alsp.go @@ -35,6 +35,12 @@ func NewAlspMetrics() *AlspMetrics { return alsp } +// OnClusterTopicMetricsCleanup removes all misbehavior counter label values associated with the given +// cluster topic to prevent unbounded metric cardinality growth during epoch transitions. +func (a *AlspMetrics) OnClusterTopicMetricsCleanup(topic string) { + a.reportedMisbehaviorCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) +} + // OnMisbehaviorReported is called when a misbehavior is reported by the application layer to ALSP. // An engine detecting a spamming-related misbehavior reports it to the ALSP module. It increases // the counter vector of reported misbehavior. diff --git a/module/metrics/network.go b/module/metrics/network.go index 65a02229854..195a03219e4 100644 --- a/module/metrics/network.go +++ b/module/metrics/network.go @@ -302,7 +302,7 @@ func (nc *NetworkCollector) OnClusterTopicMetricsCleanup(topic string) { nc.rateLimitedUnicastMessagesCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) // ALSP misbehavior counter (multi-label: channel + misbehavior) - nc.AlspMetrics.reportedMisbehaviorCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + nc.AlspMetrics.OnClusterTopicMetricsCleanup(topic) } func (nc *NetworkCollector) MessageAdded(priority int) {