Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion backend/alerts/backend-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ spec:
) > 0.5
for: 10m
labels:
severity: critical
severity: warning
annotations:
description: 'Backend controller workqueue {{ $labels.name }} has a retry ratio of > 50% sustained over 10 minutes, indicating most queue activity is failed retries rather than fresh work.'
runbook_url: 'TBD'
Expand All @@ -65,3 +65,15 @@ spec:
description: 'Backend controller workqueue {{ $labels.name }} has had a depth > 10 for more than 5 minutes, indicating work is accumulating faster than it can be processed.'
runbook_url: 'TBD'
summary: 'Backend controller workqueue {{ $labels.name }} depth is high'
- alert: BackendControllerPanic
expr: |
sum by (controller, cluster) (
increase(panic_total{namespace="aro-hcp"}[5m])
) > 0
for: 1m
labels:
severity: warning
annotations:
description: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller {{ $labels.controller }} is panicking'
27 changes: 26 additions & 1 deletion backend/alerts/backend-prometheusRule_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ tests:
exp_alerts:
- exp_labels:
name: OperationClusterCreate
severity: critical
severity: warning
exp_annotations:
description: 'Backend controller workqueue OperationClusterCreate has a retry ratio of > 50% sustained over 10 minutes, indicating most queue activity is failed retries rather than fresh work.'
runbook_url: 'TBD'
Expand Down Expand Up @@ -84,3 +84,28 @@ tests:
- eval_time: 6m
alertname: BackendControllerQueueDepthHigh
exp_alerts: []
# Test: BackendControllerPanic fires when panics occur
- interval: 1m
input_series:
- series: 'panic_total{controller="DoNothingExample", namespace="aro-hcp"}'
values: "0 1 2 3 4 5 6 7 8 9 10"
alert_rule_test:
- eval_time: 6m
alertname: BackendControllerPanic
exp_alerts:
- exp_labels:
controller: DoNothingExample
severity: warning
exp_annotations:
description: 'Backend controller DoNothingExample has panicked 5 time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller DoNothingExample is panicking'
# Test: BackendControllerPanic does not fire when no panics
- interval: 1m
input_series:
- series: 'panic_total{controller="DoNothingExample", namespace="aro-hcp"}'
values: "0+0x10"
alert_rule_test:
- eval_time: 6m
alertname: BackendControllerPanic
exp_alerts: []
Original file line number Diff line number Diff line change
Expand Up @@ -1363,7 +1363,7 @@ resource msftPrometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleG
title: 'Resources rejected by Prometheus operator'
}
expression: 'min_over_time(prometheus_operator_managed_resources{job="prometheus-operator",namespace="prometheus",state="rejected"}[5m]) > 0'
for: 'PT5M'
for: 'PT20M'
severity: 3
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ resource prometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleGroup
title: 'Resources rejected by Prometheus operator'
}
expression: 'min_over_time(prometheus_operator_managed_resources{job="prometheus-operator",namespace="prometheus",state="rejected"}[5m]) > 0'
for: 'PT5M'
for: 'PT20M'
severity: 3
}
]
Expand All @@ -348,46 +348,6 @@ resource prometheusOperatorRules 'Microsoft.AlertsManagement/prometheusRuleGroup
}
}

resource mise 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'mise'
location: location
properties: {
interval: 'PT1M'
rules: [
{
actions: [
for g in actionGroups: {
actionGroupId: g
actionProperties: {
'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
'IcM.CorrelationId': '#$.annotations.correlationId#'
}
}
]
alert: 'MiseEnvoyScrapeDown'
enabled: true
labels: {
severity: 'info'
}
annotations: {
correlationId: 'MiseEnvoyScrapeDown/{{ $labels.cluster }}'
description: 'Prometheus scrape for envoy-stats job in namespace mise is failing or missing.'
info: 'Prometheus scrape for envoy-stats job in namespace mise is failing or missing.'
runbook_url: 'TBD'
summary: 'Envoy scrape target down for namespace=mise'
title: 'Envoy scrape target down for namespace=mise'
}
expression: 'group by (cluster) (up{job="kube-state-metrics", cluster=~".*-svc(-[0-9]+)?$"}) unless on(cluster) group by (cluster) (up{endpoint="http-envoy-prom", container="istio-proxy", namespace="mise"} == 1)'
for: 'PT5M'
severity: 4
}
]
scopes: [
azureMonitoring
]
}
}

resource frontend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'frontend'
location: location
Expand Down Expand Up @@ -741,7 +701,7 @@ resource backend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
alert: 'BackendControllerRetryHotLoop'
enabled: true
labels: {
severity: 'critical'
severity: 'warning'
}
annotations: {
correlationId: 'BackendControllerRetryHotLoop/{{ $labels.cluster }}/{{ $labels.name }}'
Expand Down Expand Up @@ -782,6 +742,33 @@ resource backend 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
for: 'PT5M'
severity: 3
}
{
actions: [
for g in actionGroups: {
actionGroupId: g
actionProperties: {
'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
'IcM.CorrelationId': '#$.annotations.correlationId#'
}
}
]
alert: 'BackendControllerPanic'
enabled: true
labels: {
severity: 'warning'
}
annotations: {
correlationId: 'BackendControllerPanic/{{ $labels.cluster }}/{{ $labels.controller }}'
description: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
info: 'Backend controller {{ $labels.controller }} has panicked {{ printf "%.0f" $value }} time(s) in the last 5 minutes.'
runbook_url: 'TBD'
summary: 'Backend controller {{ $labels.controller }} is panicking'
title: 'Backend controller {{ $labels.controller }} is panicking'
}
expression: 'sum by (controller, cluster) ( increase(panic_total{namespace="aro-hcp"}[5m]) ) > 0'
for: 'PT1M'
severity: 3
}
]
scopes: [
azureMonitoring
Expand Down Expand Up @@ -856,6 +843,127 @@ resource adminApi 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' =
}
}

// Azure Monitor mirror of the Maestro alert group: the same four alerts as
// the in-cluster PrometheusRule, wired into the IcM action groups.
resource maestro 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
  name: 'maestro'
  location: location
  properties: {
    interval: 'PT1M'
    rules: [
      // Excess registered gRPC source clients — suspected connection leak.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroGRPCSourceClientExcessConnections'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroGRPCSourceClientExcessConnections/{{ $labels.cluster }}'
          description: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
          info: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
          runbook_url: 'TBD'
          summary: 'Maestro has too many gRPC source client connections'
          title: 'Maestro has too many gRPC source client connections'
        }
        expression: 'sum(grpc_server_registered_source_clients{namespace="maestro"}) > 100'
        for: 'PT10M'
        severity: 3
      }
      // REST API 5xx ratio above 5% — resource CRUD operations failing.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroRESTAPIErrorRate'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroRESTAPIErrorRate/{{ $labels.cluster }}'
          description: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro REST API error rate is high'
          title: 'Maestro REST API error rate is high'
        }
        expression: 'sum(rate(rest_api_inbound_request_count{namespace="maestro", code=~"5.."}[5m])) / sum(rate(rest_api_inbound_request_count{namespace="maestro"}[5m])) > 0.05'
        for: 'PT5M'
        severity: 3
      }
      // gRPC non-OK ratio above 5% — spec/status distribution breaking.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroGRPCServerErrorRate'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroGRPCServerErrorRate/{{ $labels.cluster }}'
          description: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro gRPC server error rate is high'
          title: 'Maestro gRPC server error rate is high'
        }
        expression: 'sum(rate(grpc_server_processed_total{namespace="maestro", code!="OK"}[5m])) / sum(rate(grpc_server_processed_total{namespace="maestro"}[5m])) > 0.05'
        for: 'PT5M'
        severity: 3
      }
      // Spec-controller reconcile error ratio above 10% — resources not
      // reaching management clusters.
      {
        actions: [
          for g in actionGroups: {
            actionGroupId: g
            actionProperties: {
              'IcM.Title': '#$.labels.cluster#: #$.annotations.title#'
              'IcM.CorrelationId': '#$.annotations.correlationId#'
            }
          }
        ]
        alert: 'MaestroSpecControllerReconcileErrors'
        enabled: true
        labels: {
          severity: 'warning'
        }
        annotations: {
          correlationId: 'MaestroSpecControllerReconcileErrors/{{ $labels.cluster }}'
          description: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
          info: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
          runbook_url: 'TBD'
          summary: 'Maestro spec controller reconcile error rate is high'
          title: 'Maestro spec controller reconcile error rate is high'
        }
        expression: 'sum(rate(spec_controller_event_reconcile_total{namespace="maestro", status="error"}[5m])) / sum(rate(spec_controller_event_reconcile_total{namespace="maestro"}[5m])) > 0.1'
        for: 'PT10M'
        severity: 3
      }
    ]
    scopes: [
      azureMonitoring
    ]
  }
}

resource arobitRules 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
name: 'arobit-rules'
location: location
Expand Down
79 changes: 79 additions & 0 deletions maestro/alerts/maestro-prometheusRule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
# PrometheusRule deploying the "maestro" alert group: availability and
# error-rate alerts for the Maestro resource-distribution service.
# All four alerts page at severity "warning".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kube-prometheus
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: maestro-monitoring-rules
  namespace: monitoring
spec:
  groups:
    - name: maestro
      rules:
        # Only clusters-service and backend are expected to connect as gRPC
        # source clients, so a sustained count above 100 points at a
        # connection leak or clients that fail to unregister.
        - alert: MaestroGRPCSourceClientExcessConnections
          expr: |
            sum(grpc_server_registered_source_clients{namespace="maestro"}) > 100
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'Maestro gRPC server has {{ $value }} registered source clients, which is unusually high. Only clusters-service and backend are expected as source clients. This may indicate a connection leak or clients failing to unregister.'
            runbook_url: 'TBD'
            summary: 'Maestro has too many gRPC source client connections'
        # Clusters-service and backend drive resource CRUD through this REST
        # API; a sustained 5xx ratio above 5% means those operations are
        # failing.
        - alert: MaestroRESTAPIErrorRate
          expr: |
            sum(rate(rest_api_inbound_request_count{namespace="maestro", code=~"5.."}[5m]))
            /
            sum(rate(rest_api_inbound_request_count{namespace="maestro"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'Maestro REST API 5xx error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro REST API error rate is high'
        # The gRPC server carries publish (spec distribution) and subscribe
        # (status callback) traffic; a non-OK ratio above 5% breaks resource
        # distribution to management clusters.
        - alert: MaestroGRPCServerErrorRate
          expr: |
            sum(rate(grpc_server_processed_total{namespace="maestro", code!="OK"}[5m]))
            /
            sum(rate(grpc_server_processed_total{namespace="maestro"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'Maestro gRPC server error rate is above 5% for the last 5 minutes. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro gRPC server error rate is high'
        # The spec controller pushes resource specs to management-cluster work
        # agents; an error ratio above 10% means resources are not reaching
        # their target clusters.
        - alert: MaestroSpecControllerReconcileErrors
          expr: |
            sum(rate(spec_controller_event_reconcile_total{namespace="maestro", status="error"}[5m]))
            /
            sum(rate(spec_controller_event_reconcile_total{namespace="maestro"}[5m]))
            > 0.1
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'Maestro spec controller reconcile error rate is above 10% for the last 10 minutes. Resources may not be reaching management clusters. Current value: {{ $value | humanizePercentage }}.'
            runbook_url: 'TBD'
            summary: 'Maestro spec controller reconcile error rate is high'
Loading
Loading