From 5acfc79238ed8823b3846fb59b9e578a7eeefd87 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 1 Apr 2026 00:14:24 +0200 Subject: [PATCH 1/2] dcgm-exporter: auto-enable ServiceMonitor when ServiceMonitor CRD is present Signed-off-by: Vincent Gimenes --- controllers/object_controls.go | 6 ++-- controllers/object_controls_test.go | 42 ++++++++++++++++++++++++++-- deployments/gpu-operator/values.yaml | 2 +- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1f8806fcc..f6c525f9c 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -4983,6 +4983,7 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) { if n.stateNames[state] == "state-dcgm-exporter" { serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor + // Check if ServiceMonitor is disabled and cleanup resource if exists if serviceMonitor == nil || !serviceMonitor.IsEnabled() { if !serviceMonitorCRDExists { @@ -4996,9 +4997,10 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + // If Prometheus CRD is missing, skip gracefully if !serviceMonitorCRDExists { - logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and necessary CRDs for gathering GPU metrics!") - return gpuv1.NotReady, nil + logger.V(1).Info("ServiceMonitor CRD not found, skipping DCGM Exporter ServiceMonitor creation") + return gpuv1.Ready, nil } // Apply custom edits for DCGM Exporter diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index a25522d16..2967e9212 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -1325,7 +1325,7 @@ func TestServiceMonitor(t *testing.T) { expectedServiceMonitor: nil, }, { - description: "dcgm-exporter SM enabled, CRD missing -> NotReady", + description: "dcgm-exporter SM explicitly enabled, CRD missing -> Ready (skip 
gracefully)", stateName: "state-dcgm-exporter", k8sObjects: nil, clusterPolicySpec: gpuv1.ClusterPolicySpec{ @@ -1334,7 +1334,45 @@ func TestServiceMonitor(t *testing.T) { ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{Enabled: ptr.To(true)}, }, }, - expectedState: gpuv1.NotReady, + expectedState: gpuv1.Ready, + expectedServiceMonitor: nil, + }, + { + description: "dcgm-exporter SM nil config, CRD missing -> Ready (cleanup path; CRD absent)", + stateName: "state-dcgm-exporter", + k8sObjects: nil, + clusterPolicySpec: gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Enabled: ptr.To(true), + }, + }, + expectedState: gpuv1.Ready, + expectedServiceMonitor: nil, + }, + { + description: "dcgm-exporter SM provided but Enabled nil, CRD missing -> Ready (skip cleanup)", + stateName: "state-dcgm-exporter", + k8sObjects: nil, + clusterPolicySpec: gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Enabled: ptr.To(true), + ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{}, + }, + }, + expectedState: gpuv1.Ready, + expectedServiceMonitor: nil, + }, + { + description: "dcgm-exporter SM provided but Enabled nil, CRD present -> Disabled (default false)", + stateName: "state-dcgm-exporter", + k8sObjects: []client.Object{serviceMonitorCRD}, + clusterPolicySpec: gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Enabled: ptr.To(true), + ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{}, + }, + }, + expectedState: gpuv1.Disabled, expectedServiceMonitor: nil, }, { diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 9dba0c80c..9379a1e32 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -312,7 +312,7 @@ dcgmExporter: service: internalTrafficPolicy: Cluster serviceMonitor: - enabled: false + enabled: true interval: 15s scrapeTimeout: 10s honorLabels: false From b1e1798caa65ec9fed012634ffa247b0c4a247f1 Mon Sep 17 00:00:00 2001 
From: Vincent Gimenes Date: Mon, 4 May 2026 19:50:19 +0200 Subject: [PATCH 2/2] dcgm-exporter: remove unnecessary one-line change in ServiceMonitor Signed-off-by: Vincent Gimenes --- controllers/object_controls.go | 1 - 1 file changed, 1 deletion(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index f6c525f9c..ac15d3425 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -4983,7 +4983,6 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) { if n.stateNames[state] == "state-dcgm-exporter" { serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor - // Check if ServiceMonitor is disabled and cleanup resource if exists if serviceMonitor == nil || !serviceMonitor.IsEnabled() { if !serviceMonitorCRDExists {