Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,30 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`

// Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
// (Requires cluster-level read access to pods.)
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable pod-label enrichment"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
EnablePodLabels *bool `json:"enablePodLabels,omitempty"`

// Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
// (Requires cluster-level read access to pods.)
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable pod-UID enrichment"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
EnablePodUID *bool `json:"enablePodUID,omitempty"`

// Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
// Empty means all pod labels are included.
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod label allowlist regex"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
PodLabelAllowlistRegex []string `json:"podLabelAllowlistRegex,omitempty"`
}

// DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter
Expand Down Expand Up @@ -2238,6 +2262,30 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
return e.HPCJobMapping.Directory
}

// IsPodLabelsEnabled reports whether pod-label enrichment has been turned on
// for DCGM Exporter. An unset (nil) EnablePodLabels is treated as disabled.
func (e *DCGMExporterSpec) IsPodLabelsEnabled() bool {
	// Only an explicit user-provided value can enable the feature.
	if flag := e.EnablePodLabels; flag != nil {
		return *flag
	}
	return false
}

// IsPodUIDEnabled reports whether pod-UID enrichment has been turned on for
// DCGM Exporter. When EnablePodUID is left unset the feature is considered off.
func (e *DCGMExporterSpec) IsPodUIDEnabled() bool {
	// The nil check short-circuits, so the pointer is only dereferenced when set.
	return e.EnablePodUID != nil && *e.EnablePodUID
}

// IsKubernetesPodMetadataEnabled reports whether at least one form of
// Kubernetes pod metadata enrichment (pod labels or pod UID) is enabled
// for DCGM Exporter.
func (e *DCGMExporterSpec) IsKubernetesPodMetadataEnabled() bool {
	if e.IsPodLabelsEnabled() {
		return true
	}
	return e.IsPodUIDEnabled()
}

// IsEnabled returns true if gpu-feature-discovery is enabled (default) through gpu-operator
func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
if g.Enabled == nil {
Expand Down
15 changes: 15 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions assets/state-dcgm-exporter/0210_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ClusterRole granting read-only access (get, list, watch) to pods in the core
# API group.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dcgm-exporter-read-pods
labels:
app: nvidia-dcgm-exporter
# TODO: Add resourceSlices permissions when GPU Operator exposes DRA exporter support.
rules:
# The empty string selects the core ("") API group, which is where pods live.
- apiGroups:
- ""
resources:
- pods
# Read-only verbs only; no create/update/delete is granted.
verbs:
- get
- list
- watch
14 changes: 14 additions & 0 deletions assets/state-dcgm-exporter/0310_clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ClusterRoleBinding that grants the nvidia-dcgm-exporter-read-pods ClusterRole
# to the nvidia-dcgm-exporter ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dcgm-exporter-read-pods
labels:
app: nvidia-dcgm-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nvidia-dcgm-exporter-read-pods
subjects:
- kind: ServiceAccount
name: nvidia-dcgm-exporter
# Placeholder value; it is expected to be replaced before the object is applied.
namespace: "FILLED BY THE OPERATOR"
1 change: 1 addition & 0 deletions assets/state-dcgm-exporter/0800_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
effect: NoSchedule
priorityClassName: system-node-critical
serviceAccountName: nvidia-dcgm-exporter
automountServiceAccountToken: false
initContainers:
- name: toolkit-validation
image: "FILLED BY THE OPERATOR"
Expand Down
17 changes: 17 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,16 @@ spec:
metrics to be collected by NVIDIA DCGM Exporter
type: string
type: object
enablePodLabels:
description: |-
Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enablePodUID:
description: |-
Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enabled:
description: Enabled indicates if deployment of NVIDIA DCGM Exporter
through operator is enabled
Expand Down Expand Up @@ -628,6 +638,13 @@ spec:
items:
type: string
type: array
podLabelAllowlistRegex:
description: |-
Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
Empty means all pod labels are included.
items:
type: string
type: array
repository:
description: NVIDIA DCGM Exporter image repository
type: string
Expand Down
17 changes: 17 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,16 @@ spec:
metrics to be collected by NVIDIA DCGM Exporter
type: string
type: object
enablePodLabels:
description: |-
Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enablePodUID:
description: |-
Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enabled:
description: Enabled indicates if deployment of NVIDIA DCGM Exporter
through operator is enabled
Expand Down Expand Up @@ -628,6 +638,13 @@ spec:
items:
type: string
type: array
podLabelAllowlistRegex:
description: |-
Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
Empty means all pod labels are included.
items:
type: string
type: array
repository:
description: NVIDIA DCGM Exporter image repository
type: string
Expand Down
45 changes: 45 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,19 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// For state-dcgm-exporter, cluster-scoped pod-read RBAC is only needed
// when pod-metadata enrichment is opted in.
if n.stateNames[n.idx] == "state-dcgm-exporter" &&
obj.Name == "nvidia-dcgm-exporter-read-pods" &&
!n.singleton.Spec.DCGMExporter.IsKubernetesPodMetadataEnabled() {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Couldn't delete")
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
Expand Down Expand Up @@ -507,6 +520,18 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// Mirror the ClusterRole gate.
if n.stateNames[n.idx] == "state-dcgm-exporter" &&
obj.Name == "nvidia-dcgm-exporter-read-pods" &&
!n.singleton.Spec.DCGMExporter.IsKubernetesPodMetadataEnabled() {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Couldn't delete")
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

for idx := range obj.Subjects {
obj.Subjects[idx].Namespace = n.operatorNamespace
}
Expand Down Expand Up @@ -1828,6 +1853,26 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
}

// Inject pod-metadata enrichment env vars; RBAC is provisioned via the
// 0210/0310 assets and the SA token is mounted below.
if config.DCGMExporter.IsPodLabelsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", "true")
}
if config.DCGMExporter.IsPodUIDEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID", "true")
}
if len(config.DCGMExporter.PodLabelAllowlistRegex) > 0 {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]),
"DCGM_EXPORTER_KUBERNETES_POD_LABEL_ALLOWLIST_REGEX",
strings.Join(config.DCGMExporter.PodLabelAllowlistRegex, ","))
}

// Override the base asset's automountServiceAccountToken=false when
// enrichment is on so the pod informer has client-go credentials.
if config.DCGMExporter.IsKubernetesPodMetadataEnabled() {
obj.Spec.Template.Spec.AutomountServiceAccountToken = ptr.To(true)
}

// mount configmap for custom metrics if provided by user
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
Expand Down
53 changes: 44 additions & 9 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,10 @@ func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy {
case "standalone-dcgm":
dcgmEnabled := true
cp.Spec.DCGM.Enabled = &dcgmEnabled
case "pod-labels-enabled":
cp.Spec.DCGMExporter.EnablePodLabels = ptr.To(true)
case "pod-uid-enabled":
cp.Spec.DCGMExporter.EnablePodUID = ptr.To(true)
default:
return nil
}
Expand All @@ -1166,6 +1170,7 @@ func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
"numDaemonsets": 1,
"dcgmExporterImage": "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04",
"imagePullSecret": "ngc-secret",
"clusterRoleExists": false,
}

switch testCase {
Expand All @@ -1175,6 +1180,16 @@ func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
output["env"] = map[string]string{
"DCGM_REMOTE_HOSTENGINE_INFO": "nvidia-dcgm:5555",
}
case "pod-labels-enabled":
output["env"] = map[string]string{
"DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS": "true",
}
output["clusterRoleExists"] = true
case "pod-uid-enabled":
output["env"] = map[string]string{
"DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID": "true",
}
output["clusterRoleExists"] = true
default:
return nil
}
Expand All @@ -1200,6 +1215,16 @@ func TestDCGMExporter(t *testing.T) {
getDCGMExporterTestInput("standalone-dcgm"),
getDCGMExporterTestOutput("standalone-dcgm"),
},
{
"PodLabelsEnabled",
getDCGMExporterTestInput("pod-labels-enabled"),
getDCGMExporterTestOutput("pod-labels-enabled"),
},
{
"PodUIDEnabled",
getDCGMExporterTestInput("pod-uid-enabled"),
getDCGMExporterTestOutput("pod-uid-enabled"),
},
}

for _, tc := range testCases {
Expand All @@ -1220,15 +1245,25 @@ func TestDCGMExporter(t *testing.T) {
}
}
for key, value := range tc.output["env"].(map[string]string) {
envFound := false
for _, envVar := range ds.Spec.Template.Spec.Containers[0].Env {
if envVar.Name == key && envVar.Value == value {
envFound = true
}
}
if !envFound {
t.Fatalf("Expected env is not set for daemonset nvidia-dcgm-exporter %s->%s", key, value)
}
require.Equal(t, value, getContainerEnv(&ds.Spec.Template.Spec.Containers[0], key), "unexpected value for env var %s on daemonset nvidia-dcgm-exporter", key)
}

// Verify cluster-scoped RBAC and SA token mount match the feature toggle.
clusterRoleKey := client.ObjectKey{Name: "nvidia-dcgm-exporter-read-pods"}
clusterRole := &rbacv1.ClusterRole{}
clusterRoleGetErr := clusterPolicyController.client.Get(context.Background(), clusterRoleKey, clusterRole)
clusterRoleBinding := &rbacv1.ClusterRoleBinding{}
clusterRoleBindingGetErr := clusterPolicyController.client.Get(context.Background(), clusterRoleKey, clusterRoleBinding)
if tc.output["clusterRoleExists"].(bool) {
require.NoError(t, clusterRoleGetErr, "ClusterRole should exist when pod metadata enrichment is enabled")
require.NoError(t, clusterRoleBindingGetErr, "ClusterRoleBinding should exist when pod metadata enrichment is enabled")
require.NotNil(t, ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be set when pod metadata enrichment is enabled")
require.True(t, *ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be true when pod metadata enrichment is enabled")
} else {
require.True(t, apierrors.IsNotFound(clusterRoleGetErr), "ClusterRole should not exist when pod metadata enrichment is disabled (got err=%v)", clusterRoleGetErr)
require.True(t, apierrors.IsNotFound(clusterRoleBindingGetErr), "ClusterRoleBinding should not exist when pod metadata enrichment is disabled (got err=%v)", clusterRoleBindingGetErr)
require.NotNil(t, ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be explicitly set false when pod metadata enrichment is disabled")
require.False(t, *ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be false when pod metadata enrichment is disabled")
}

require.Equal(t, tc.output["dcgmExporterImage"], dcgmExporterImage, "Unexpected configuration for dcgm-exporter image")
Expand Down
42 changes: 42 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,11 @@ func (d Daemonset) WithHostPID(enabled bool) Daemonset {
return d
}

// WithAutomountServiceAccountToken returns the Daemonset with its pod
// template's AutomountServiceAccountToken pointing at the given value.
func (d Daemonset) WithAutomountServiceAccountToken(enabled bool) Daemonset {
	// Take the address of a local copy so the stored pointer is independent
	// of the caller's argument.
	automount := enabled
	d.Spec.Template.Spec.AutomountServiceAccountToken = &automount
	return d
}

func (d Daemonset) WithVolume(volume corev1.Volume) Daemonset {
d.Spec.Template.Spec.Volumes = append(d.Spec.Template.Spec.Volumes, volume)
return d
Expand Down Expand Up @@ -1636,6 +1641,43 @@ func TestTransformDCGMExporter(t *testing.T) {
}).WithRuntimeClassName("nvidia").
WithHostPathVolume("pod-gpu-resources", "/custom-kubelet/pod-resources", nil),
},
{
description: "transform dcgm exporter with pod metadata enrichment fully configured",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
EnablePodLabels: newBoolPtr(true),
EnablePodUID: newBoolPtr(true),
PodLabelAllowlistRegex: []string{"^app$", `^kueue\.x-k8s\.io/.*$`},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
{Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", Value: "true"},
{Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID", Value: "true"},
{Name: "DCGM_EXPORTER_KUBERNETES_POD_LABEL_ALLOWLIST_REGEX", Value: `^app$,^kueue\.x-k8s\.io/.*$`},
},
}).WithContainer(corev1.Container{Name: "dummy"}).
WithPullSecret("pull-secret").
WithRuntimeClassName("nvidia").
WithAutomountServiceAccountToken(true),
},
}

for _, tc := range testCases {
Expand Down
Loading
Loading