Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion backend/alerts/backend-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ spec:
) > 0.5
for: 10m
labels:
severity: critical
severity: warning
annotations:
description: 'Backend controller workqueue {{ $labels.name }} has a retry ratio of > 50% sustained over 10 minutes, indicating most queue activity is failed retries rather than fresh work.'
runbook_url: 'TBD'
Expand All @@ -65,3 +65,15 @@ spec:
description: 'Backend controller workqueue {{ $labels.name }} has had a depth > 10 for more than 5 minutes, indicating work is accumulating faster than it can be processed.'
runbook_url: 'TBD'
summary: 'Backend controller workqueue {{ $labels.name }} depth is high'
- alert: BackendControllerPanic
expr: |
sum by (controller, cluster) (
increase(panic_total{namespace="aro-hcp"}[5m])
) > 0
for: 1m
labels:
severity: warning
annotations:
description: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller {{ $labels.controller }} is panicking'
27 changes: 26 additions & 1 deletion backend/alerts/backend-prometheusRule_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ tests:
exp_alerts:
- exp_labels:
name: OperationClusterCreate
severity: critical
severity: warning
exp_annotations:
description: 'Backend controller workqueue OperationClusterCreate has a retry ratio of > 50% sustained over 10 minutes, indicating most queue activity is failed retries rather than fresh work.'
runbook_url: 'TBD'
Expand Down Expand Up @@ -84,3 +84,28 @@ tests:
- eval_time: 6m
alertname: BackendControllerQueueDepthHigh
exp_alerts: []
# Test: BackendControllerPanic fires when panics occur
- interval: 1m
input_series:
- series: 'panic_total{controller="DoNothingExample", namespace="aro-hcp"}'
values: "0 1 2 3 4 5 6 7 8 9 10"
alert_rule_test:
- eval_time: 6m
alertname: BackendControllerPanic
exp_alerts:
- exp_labels:
controller: DoNothingExample
severity: warning
exp_annotations:
description: 'Backend controller DoNothingExample has panicked 5 time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller DoNothingExample is panicking'
# Test: BackendControllerPanic does not fire when no panics
- interval: 1m
input_series:
- series: 'panic_total{controller="DoNothingExample", namespace="aro-hcp"}'
values: "0+0x10"
alert_rule_test:
- eval_time: 6m
alertname: BackendControllerPanic
exp_alerts: []
Original file line number Diff line number Diff line change
Expand Up @@ -1363,7 +1363,7 @@ resource msftPrometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleG
title: 'Resources rejected by Prometheus operator'
}
expression: 'min_over_time(prometheus_operator_managed_resources{job="prometheus-operator",namespace="prometheus",state="rejected"}[5m]) > 0'
for: 'PT5M'
for: 'PT20M'
severity: 3
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ resource prometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleGroup
title: 'Resources rejected by Prometheus operator'
}
expression: 'min_over_time(prometheus_operator_managed_resources{job="prometheus-operator",namespace="prometheus",state="rejected"}[5m]) > 0'
for: 'PT5M'
for: 'PT20M'
severity: 3
}
]
Expand All @@ -348,46 +348,6 @@ resource prometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleGroup
}
}

resource mise 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'mise'
location: location
properties: {
interval: 'PT1M'
rules: [
{
actions: [
for g in actionGroups: {
actionGroupId: g
actionProperties: {
'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
'IcM.CorrelationId': '#$.annotations.correlationId#'
}
}
]
alert: 'MiseEnvoyScrapeDown'
enabled: true
labels: {
severity: 'info'
}
annotations: {
correlationId: 'MiseEnvoyScrapeDown/{{ $labels.cluster }}'
description: 'Prometheus scrape for envoy-stats job in namespace mise is failing or missing.'
info: 'Prometheus scrape for envoy-stats job in namespace mise is failing or missing.'
runbook_url: 'TBD'
summary: 'Envoy scrape target down for namespace=mise'
title: 'Envoy scrape target down for namespace=mise'
}
expression: 'group by (cluster) (up{job="kube-state-metrics", cluster=~".*-svc(-[0-9]+)?$"}) unless on(cluster) group by (cluster) (up{endpoint="http-envoy-prom", container="istio-proxy", namespace="mise"} == 1)'
for: 'PT5M'
severity: 4
}
]
scopes: [
azureMonitoring
]
}
}

resource frontend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'frontend'
location: location
Expand Down Expand Up @@ -741,7 +701,7 @@ resource backend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
alert: 'BackendControllerRetryHotLoop'
enabled: true
labels: {
severity: 'critical'
severity: 'warning'
}
annotations: {
correlationId: 'BackendControllerRetryHotLoop/{{ $labels.cluster }}/{{ $labels.name }}'
Expand Down Expand Up @@ -782,6 +742,33 @@ resource backend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
for: 'PT5M'
severity: 3
}
{
actions: [
for g in actionGroups: {
actionGroupId: g
actionProperties: {
'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
'IcM.CorrelationId': '#$.annotations.correlationId#'
}
}
]
alert: 'BackendControllerPanic'
enabled: true
labels: {
severity: 'warning'
}
annotations: {
correlationId: 'BackendControllerPanic/{{ $labels.cluster }}/{{ $labels.controller }}'
description: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
info: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller {{ $labels.controller }} is panicking'
title: 'Backend controller {{ $labels.controller }} is panicking'
}
expression: 'sum by (controller, cluster) ( increase(panic_total{namespace="aro-hcp"}[5m]) ) > 0'
for: 'PT1M'
severity: 3
}
]
scopes: [
azureMonitoring
Expand Down Expand Up @@ -856,6 +843,127 @@ resource adminApi 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
}
}

// Azure Monitor mirror of the Maestro alert group: the same four alerts as
// the in-cluster PrometheusRule, wired into the IcM action groups.
resource maestro 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
  name: 'maestro'
  location: location
  properties: {
    interval: 'PT1M'
    rules: [
      // Excess registered gRPC source clients — suspected connection leak.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroGRPCSourceClientExcessConnections'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroGRPCSourceClientExcessConnections/{{ $labels.cluster }}'
          description: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
          info: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
          runbook_url: 'TBD'
          summary: 'Maestro has too many gRPC source client connections'
          title: 'Maestro has too many gRPC source client connections'
        }
        expression: 'sum(grpc_server_registered_source_clients{namespace="maestro"}) > 100'
        for: 'PT10M'
        severity: 3
      }
      // REST API 5xx ratio above 5% — resource CRUD operations failing.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroRESTAPIErrorRate'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroRESTAPIErrorRate/{{ $labels.cluster }}'
          description: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro REST API error rate is high'
          title: 'Maestro REST API error rate is high'
        }
        expression: 'sum(rate(rest_api_inbound_request_count{namespace="maestro", code=~"5.."}[5m])) / sum(rate(rest_api_inbound_request_count{namespace="maestro"}[5m])) > 0.05'
        for: 'PT5M'
        severity: 3
      }
      // gRPC non-OK ratio above 5% — spec/status distribution breaking.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroGRPCServerErrorRate'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroGRPCServerErrorRate/{{ $labels.cluster }}'
          description: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro gRPC server error rate is high'
          title: 'Maestro gRPC server error rate is high'
        }
        expression: 'sum(rate(grpc_server_processed_total{namespace="maestro", code!="OK"}[5m])) / sum(rate(grpc_server_processed_total{namespace="maestro"}[5m])) > 0.05'
        for: 'PT5M'
        severity: 3
      }
      // Spec-controller reconcile error ratio above 10% — resources not
      // reaching management clusters.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroSpecControllerReconcileErrors'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroSpecControllerReconcileErrors/{{ $labels.cluster }}'
          description: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro spec controller reconcile error rate is high'
          title: 'Maestro spec controller reconcile error rate is high'
        }
        expression: 'sum(rate(spec_controller_event_reconcile_total{namespace="maestro", status="error"}[5m])) / sum(rate(spec_controller_event_reconcile_total{namespace="maestro"}[5m])) > 0.1'
        for: 'PT10M'
        severity: 3
      }
    ]
    scopes: [
      azureMonitoring
    ]
  }
}

resource arobitRules 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'arobit-rules'
location: location
Expand Down
79 changes: 79 additions & 0 deletions maestro/alerts/maestro-prometheusRule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
# PrometheusRule deploying the "maestro" alert group: availability and
# error-rate alerts for the Maestro resource-distribution service.
# All four alerts page at severity "warning".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kube-prometheus
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: maestro-monitoring-rules
  namespace: monitoring
spec:
  groups:
    - name: maestro
      rules:
        # Only clusters-service and backend are expected to connect as gRPC
        # source clients, so a sustained count above 100 points at a
        # connection leak or clients that fail to unregister.
        - alert: MaestroGRPCSourceClientExcessConnections
          expr: |
            sum(grpc_server_registered_source_clients{namespace="maestro"}) > 100
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
            runbook_url: 'TBD'
            summary: 'Maestro has too many gRPC source client connections'
        # Clusters-service and backend drive resource CRUD through this REST
        # API; a sustained 5xx ratio above 5% means those operations are
        # failing.
        - alert: MaestroRESTAPIErrorRate
          expr: |
            sum(rate(rest_api_inbound_request_count{namespace="maestro", code=~"5.."}[5m]))
            /
            sum(rate(rest_api_inbound_request_count{namespace="maestro"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro REST API error rate is high'
        # The gRPC server carries publish (spec distribution) and subscribe
        # (status callback) traffic; a non-OK ratio above 5% breaks resource
        # distribution to management clusters.
        - alert: MaestroGRPCServerErrorRate
          expr: |
            sum(rate(grpc_server_processed_total{namespace="maestro", code!="OK"}[5m]))
            /
            sum(rate(grpc_server_processed_total{namespace="maestro"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro gRPC server error rate is high'
        # The spec controller pushes resource specs to management-cluster work
        # agents; an error ratio above 10% means resources are not reaching
        # their target clusters.
        - alert: MaestroSpecControllerReconcileErrors
          expr: |
            sum(rate(spec_controller_event_reconcile_total{namespace="maestro", status="error"}[5m]))
            /
            sum(rate(spec_controller_event_reconcile_total{namespace="maestro"}[5m]))
            > 0.1
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro spec controller reconcile error rate is high'
Loading
Loading