Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -4996,9 +4996,10 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// If the ServiceMonitor CRD is missing (Prometheus CRDs are not installed), skip gracefully
if !serviceMonitorCRDExists {
logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and necessary CRDs for gathering GPU metrics!")
return gpuv1.NotReady, nil
logger.V(1).Info("ServiceMonitor CRD not found, skipping DCGM Exporter ServiceMonitor creation")
return gpuv1.Ready, nil
}

// Apply custom edits for DCGM Exporter
Expand Down
42 changes: 40 additions & 2 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1325,7 +1325,7 @@ func TestServiceMonitor(t *testing.T) {
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM enabled, CRD missing -> NotReady",
description: "dcgm-exporter SM explicitly enabled, CRD missing -> Ready (skip gracefully)",
stateName: "state-dcgm-exporter",
k8sObjects: nil,
clusterPolicySpec: gpuv1.ClusterPolicySpec{
Expand All @@ -1334,7 +1334,45 @@ func TestServiceMonitor(t *testing.T) {
ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{Enabled: ptr.To(true)},
},
},
expectedState: gpuv1.NotReady,
expectedState: gpuv1.Ready,
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM nil config, CRD missing -> Ready (cleanup path; CRD absent)",
stateName: "state-dcgm-exporter",
k8sObjects: nil,
clusterPolicySpec: gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Enabled: ptr.To(true),
},
},
expectedState: gpuv1.Ready,
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM provided but Enabled nil, CRD missing -> Ready (skip cleanup)",
stateName: "state-dcgm-exporter",
k8sObjects: nil,
clusterPolicySpec: gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Enabled: ptr.To(true),
ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{},
},
},
expectedState: gpuv1.Ready,
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM provided but Enabled nil, CRD present -> Disabled (default false)",
stateName: "state-dcgm-exporter",
k8sObjects: []client.Object{serviceMonitorCRD},
clusterPolicySpec: gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Enabled: ptr.To(true),
ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{},
},
},
expectedState: gpuv1.Disabled,
expectedServiceMonitor: nil,
},
{
Expand Down
2 changes: 1 addition & 1 deletion deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ dcgmExporter:
service:
internalTrafficPolicy: Cluster
serviceMonitor:
enabled: false
enabled: true
interval: 15s
scrapeTimeout: 10s
honorLabels: false
Expand Down
Loading