diff --git a/module/metrics/alsp.go b/module/metrics/alsp.go index 3d5dc2bc510..888ca4dd18d 100644 --- a/module/metrics/alsp.go +++ b/module/metrics/alsp.go @@ -35,6 +35,12 @@ func NewAlspMetrics() *AlspMetrics { return alsp } +// OnClusterTopicMetricsCleanup removes every reportedMisbehaviorCount series whose LabelChannel label equals the given +// cluster topic, whatever its other label values, to prevent unbounded metric cardinality growth during epoch transitions. +func (a *AlspMetrics) OnClusterTopicMetricsCleanup(topic string) { + a.reportedMisbehaviorCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) +} + // OnMisbehaviorReported is called when a misbehavior is reported by the application layer to ALSP. // An engine detecting a spamming-related misbehavior reports it to the ALSP module. It increases // the counter vector of reported misbehavior. diff --git a/module/metrics/gossipsub_score.go b/module/metrics/gossipsub_score.go index 15e3628469c..4b3242108f0 100644 --- a/module/metrics/gossipsub_score.go +++ b/module/metrics/gossipsub_score.go @@ -161,3 +161,13 @@ func (g *GossipSubScoreMetrics) OnInvalidMessageDeliveredUpdated(topic channels. func (g *GossipSubScoreMetrics) SetWarningStateCount(u uint) { g.warningStateGauge.Set(float64(u)) } + +// OnClusterTopicMetricsCleanup removes the series recorded under the given cluster topic from the per-topic scoring +// vectors (timeInMesh, meshMessageDelivery, firstMessageDelivery, invalidMessageDelivery). Call this when the local node leaves a cluster topic to prevent +// unbounded metric cardinality growth across epoch transitions. 
+func (g *GossipSubScoreMetrics) OnClusterTopicMetricsCleanup(topic string) { + g.timeInMesh.DeleteLabelValues(topic) // the topic is the only label value passed to each of these four deletes + g.meshMessageDelivery.DeleteLabelValues(topic) + g.firstMessageDelivery.DeleteLabelValues(topic) + g.invalidMessageDelivery.DeleteLabelValues(topic) +} diff --git a/module/metrics/network.go b/module/metrics/network.go index 98ec52e04e5..195a03219e4 100644 --- a/module/metrics/network.go +++ b/module/metrics/network.go @@ -277,19 +277,32 @@ func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, mes // OnClusterTopicMetricsCleanup removes all metric label values associated with the given cluster topic. // This prevents unbounded metric cardinality growth during epoch transitions when collection nodes // join new clusters and leave old ones. Only call this for cluster topics (sync-cluster-*, consensus-cluster-*). -// This method overrides the embedded LocalGossipSubRouterMetrics.OnClusterTopicMetricsCleanup to also -// clean up inbound/outbound message size metrics and iHave message ID metrics. 
func (nc *NetworkCollector) OnClusterTopicMetricsCleanup(topic string) { - // Clean up LocalGossipSubRouterMetrics (localMeshSize, peerGraftTopicCount, peerPruneTopicCount) + // LocalGossipSubRouterMetrics: localMeshSize, peerGraftTopicCount, peerPruneTopicCount nc.LocalGossipSubRouterMetrics.OnClusterTopicMetricsCleanup(topic) - // Clean up GossipSubRpcValidationInspectorMetrics (receivedIHaveMsgIDsHistogram) + // GossipSubRpcValidationInspectorMetrics: receivedIHaveMsgIDsHistogram nc.GossipSubRpcValidationInspectorMetrics.OnClusterTopicMetricsCleanup(topic) - // Clean up inbound/outbound message size metrics using partial match on topic + // GossipSubScoreMetrics: timeInMesh, meshMessageDelivery, firstMessageDelivery, invalidMessageDelivery + nc.GossipSubScoreMetrics.OnClusterTopicMetricsCleanup(topic) + + // inbound/outbound message size and duplicate drop counters (multi-label: channel + protocol + message) nc.inboundMessageSize.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) nc.outboundMessageSize.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) nc.duplicateMessagesDropped.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + + // message processing gauges and inbound process time counter (single label: channel). NOTE(review): an update for this topic arriving after cleanup would presumably recreate the series - confirm callers stop using the topic first + nc.numMessagesProcessing.DeleteLabelValues(topic) + nc.numDirectMessagesSending.DeleteLabelValues(topic) + nc.inboundProcessTime.DeleteLabelValues(topic) + + // security metrics (multi-label: role + message + channel + reason); matching on LabelChannel alone removes every series for the topic regardless of the other labels + nc.unAuthorizedMessagesCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + nc.rateLimitedUnicastMessagesCount.DeletePartialMatch(prometheus.Labels{LabelChannel: topic}) + + // ALSP misbehavior counter (multi-label: channel + misbehavior) + nc.AlspMetrics.OnClusterTopicMetricsCleanup(topic) } func (nc *NetworkCollector) MessageAdded(priority int) {