From fa80ee627ebe90d9664538441bf37d4631174af4 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 4 Jun 2025 09:46:46 -0700 Subject: [PATCH 01/10] Integrate NVIDIA DRA Driver for GPUs as an operand Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 118 +++++++ api/nvidia/v1/zz_generated.deepcopy.go | 123 +++++++ .../0100_service_account.yaml | 5 + assets/state-dra-driver/0200_clusterrole.yaml | 69 ++++ assets/state-dra-driver/0210_role.yaml | 19 + .../0300_clusterrolebinding.yaml | 12 + assets/state-dra-driver/0310_rolebinding.yaml | 13 + ...400_deviceclass-compute-domain-daemon.yaml | 8 + ...eclass-compute-domain-default-channel.yaml | 8 + .../0420_deviceclass-gpu.yaml | 8 + .../0430_deviceclass-mig.yaml | 8 + assets/state-dra-driver/0500_deployment.yaml | 43 +++ assets/state-dra-driver/0600_configmap.yaml | 40 +++ assets/state-dra-driver/0700_daemonset.yaml | 171 +++++++++ ...rator-certified.clusterserviceversion.yaml | 103 ++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ .../resource.nvidia.com_computedomains.yaml | 104 ++++++ .../crd/bases/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ controllers/clusterpolicy_controller.go | 12 + controllers/object_controls.go | 229 +++++++++++- controllers/resource_manager.go | 11 + controllers/state_manager.go | 20 +- controllers/transforms_test.go | 330 ++++++++++++++++++ .../crds/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ .../resource.nvidia.com_computedomains.yaml | 104 ++++++ .../gpu-operator/templates/cleanup_crd.yaml | 1 + .../gpu-operator/templates/clusterpolicy.yaml | 44 +++ .../gpu-operator/templates/clusterrole.yaml | 79 +++++ deployments/gpu-operator/templates/role.yaml | 1 + .../gpu-operator/templates/upgrade_crd.yaml | 1 + deployments/gpu-operator/values.yaml | 26 ++ 31 files changed, 2385 insertions(+), 18 deletions(-) create mode 100644 assets/state-dra-driver/0100_service_account.yaml create mode 100644 
assets/state-dra-driver/0200_clusterrole.yaml create mode 100644 assets/state-dra-driver/0210_role.yaml create mode 100644 assets/state-dra-driver/0300_clusterrolebinding.yaml create mode 100644 assets/state-dra-driver/0310_rolebinding.yaml create mode 100644 assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml create mode 100644 assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml create mode 100644 assets/state-dra-driver/0420_deviceclass-gpu.yaml create mode 100644 assets/state-dra-driver/0430_deviceclass-mig.yaml create mode 100644 assets/state-dra-driver/0500_deployment.yaml create mode 100644 assets/state-dra-driver/0600_configmap.yaml create mode 100644 assets/state-dra-driver/0700_daemonset.yaml create mode 100644 bundle/manifests/resource.nvidia.com_computedomains.yaml create mode 100644 deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index d924901e1..bd72cdb2b 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -53,6 +53,8 @@ type ClusterPolicySpec struct { Toolkit ToolkitSpec `json:"toolkit"` // DevicePlugin component spec DevicePlugin DevicePluginSpec `json:"devicePlugin"` + // DRADriver component spec + DRADriver DRADriverSpec `json:"draDriver"` // DCGMExporter spec DCGMExporter DCGMExporterSpec `json:"dcgmExporter"` // DCGM component spec @@ -985,6 +987,104 @@ type SandboxDevicePluginSpec struct { HostNetwork *bool `json:"hostNetwork,omitempty"` } +// DRADriverSpec defines the properties for the NVIDIA DRA Driver deployment +type DRADriverSpec struct { + // NVIDIA DRA Driver image repository + // +kubebuilder:validation:Optional + Repository string `json:"repository,omitempty"` + + // NVIDIA DRA Driver image name + // +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+ + Image string `json:"image,omitempty"` + + // NVIDIA DRA Driver image tag + // 
+kubebuilder:validation:Optional + Version string `json:"version,omitempty"` + + // Image pull policy + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy" + ImagePullPolicy string `json:"imagePullPolicy,omitempty"` + + // Image pull secrets + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" + ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` + + // GPUs defines configuration for GPUs in the NVIDIA DRA Driver + GPUs DRADriverGPUs `json:"gpus,omitempty"` + + // ComputeDomains defines configuration for ComputeDomains in the NVIDIA DRA Driver + ComputeDomains DRADriverComputeDomains `json:"computeDomains,omitempty"` +} + +// DRADriverGPUs defines configuration for GPUs in the NVIDIA DRA Driver +type DRADriverGPUs struct { + // Enabled indicates if GPUs are enabled in the NVIDIA DRA Driver + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GPUs in the NVIDIA DRA Driver" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // KubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin + KubeletPlugin DRADriverKubeletPlugin `json:"kubeletPlugin,omitempty"` +} + +// DRADriverComputeDomains defines configuration for ComputeDomains in the 
NVIDIA DRA Driver +type DRADriverComputeDomains struct { + // Enabled indicates if ComputeDomains are enabled in the NVIDIA DRA Driver + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable ComputeDomains in the NVIDIA DRA Driver" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // Controller defines configuration for the NVIDIA DRA Driver controller + Controller DRADriverController `json:"controller,omitempty"` + + // KubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin + KubeletPlugin DRADriverKubeletPlugin `json:"kubeletPlugin,omitempty"` +} + +// DRADriverController defines configuration for the NVIDIA DRA Driver controller +type DRADriverController struct { + // Optional: List of environment variables + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Env []EnvVar `json:"env,omitempty"` + + // Optional: Define resources requests and limits + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" + Resources *ResourceRequirements `json:"resources,omitempty"` + + // Optional: Set tolerations + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // 
+operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Tolerations" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:io.kubernetes:Tolerations" + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// DRADriverKubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin +type DRADriverKubeletPlugin struct { + // Optional: List of environment variables + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Env []EnvVar `json:"env,omitempty"` + + // Optional: Define resources requests and limits + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" + Resources *ResourceRequirements `json:"resources,omitempty"` +} + // DCGMExporterSpec defines the properties for NVIDIA DCGM Exporter deployment type DCGMExporterSpec struct { // Enabled indicates if deployment of NVIDIA DCGM Exporter through operator is enabled @@ -2079,6 +2179,9 @@ func ImagePath(spec interface{}) (string, error) { case *SandboxDevicePluginSpec: config := spec.(*SandboxDevicePluginSpec) return imagePath(config.Repository, config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE") + case *DRADriverSpec: + config := spec.(*DRADriverSpec) + return imagePath(config.Repository, config.Image, config.Version, "DRA_DRIVER_IMAGE") case *DCGMExporterSpec: config := 
spec.(*DCGMExporterSpec) return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE") @@ -2194,6 +2297,21 @@ func (p *DevicePluginSpec) IsEnabled() bool { return *p.Enabled } +// IsEnabled returns true if the DRA Driver is enabled through gpu-operator +func (d *DRADriverSpec) IsEnabled() bool { + return d.IsGPUsEnabled() || d.IsComputeDomainsEnabled() +} + +// IsGPUsEnabled returns true if the GPUs resource is enabled in the DRA Driver +func (d *DRADriverSpec) IsGPUsEnabled() bool { + return d.GPUs.Enabled != nil && *d.GPUs.Enabled +} + +// IsComputeDomainsEnabled returns true if the ComputeDomains resource is enabled in the DRA Driver +func (d *DRADriverSpec) IsComputeDomainsEnabled() bool { + return d.ComputeDomains.Enabled != nil && *d.ComputeDomains.Enabled +} + // IsEnabled returns true if dcgm-exporter is enabled(default) through gpu-operator func (e *DCGMExporterSpec) IsEnabled() bool { if e.Enabled == nil { diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index e04a3570e..e2f162719 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -191,6 +191,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.Driver.DeepCopyInto(&out.Driver) in.Toolkit.DeepCopyInto(&out.Toolkit) in.DevicePlugin.DeepCopyInto(&out.DevicePlugin) + in.DRADriver.DeepCopyInto(&out.DRADriver) in.DCGMExporter.DeepCopyInto(&out.DCGMExporter) in.DCGM.DeepCopyInto(&out.DCGM) in.NodeStatusExporter.DeepCopyInto(&out.NodeStatusExporter) @@ -486,6 +487,128 @@ func (in *DCGMSpec) DeepCopy() *DCGMSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DRADriverComputeDomains) DeepCopyInto(out *DRADriverComputeDomains) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + in.Controller.DeepCopyInto(&out.Controller) + in.KubeletPlugin.DeepCopyInto(&out.KubeletPlugin) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverComputeDomains. +func (in *DRADriverComputeDomains) DeepCopy() *DRADriverComputeDomains { + if in == nil { + return nil + } + out := new(DRADriverComputeDomains) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverController) DeepCopyInto(out *DRADriverController) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]corev1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverController. +func (in *DRADriverController) DeepCopy() *DRADriverController { + if in == nil { + return nil + } + out := new(DRADriverController) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverGPUs) DeepCopyInto(out *DRADriverGPUs) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + in.KubeletPlugin.DeepCopyInto(&out.KubeletPlugin) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverGPUs. 
+func (in *DRADriverGPUs) DeepCopy() *DRADriverGPUs { + if in == nil { + return nil + } + out := new(DRADriverGPUs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverKubeletPlugin) DeepCopyInto(out *DRADriverKubeletPlugin) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverKubeletPlugin. +func (in *DRADriverKubeletPlugin) DeepCopy() *DRADriverKubeletPlugin { + if in == nil { + return nil + } + out := new(DRADriverKubeletPlugin) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverSpec) DeepCopyInto(out *DRADriverSpec) { + *out = *in + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.GPUs.DeepCopyInto(&out.GPUs) + in.ComputeDomains.DeepCopyInto(&out.ComputeDomains) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverSpec. +func (in *DRADriverSpec) DeepCopy() *DRADriverSpec { + if in == nil { + return nil + } + out := new(DRADriverSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DaemonsetsSpec) DeepCopyInto(out *DaemonsetsSpec) { *out = *in diff --git a/assets/state-dra-driver/0100_service_account.yaml b/assets/state-dra-driver/0100_service_account.yaml new file mode 100644 index 000000000..76d6d61af --- /dev/null +++ b/assets/state-dra-driver/0100_service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0200_clusterrole.yaml b/assets/state-dra-driver/0200_clusterrole.yaml new file mode 100644 index 000000000..e2052e9a6 --- /dev/null +++ b/assets/state-dra-driver/0200_clusterrole.yaml @@ -0,0 +1,69 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - deviceclasses + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0210_role.yaml b/assets/state-dra-driver/0210_role.yaml new file mode 100644 index 000000000..62e336e3d --- /dev/null +++ b/assets/state-dra-driver/0210_role.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - apps + resources: + 
- daemonsets + - deployments + verbs: + - get + - list + - watch + - create + - update + - patch + - delete diff --git a/assets/state-dra-driver/0300_clusterrolebinding.yaml b/assets/state-dra-driver/0300_clusterrolebinding.yaml new file mode 100644 index 000000000..ea4f6a5e4 --- /dev/null +++ b/assets/state-dra-driver/0300_clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvidia-dra-driver +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvidia-dra-driver +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0310_rolebinding.yaml b/assets/state-dra-driver/0310_rolebinding.yaml new file mode 100644 index 000000000..bf893a63c --- /dev/null +++ b/assets/state-dra-driver/0310_rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nvidia-dra-driver +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml b/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml new file mode 100644 index 000000000..e8d6ac997 --- /dev/null +++ b/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: compute-domain-daemon.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'" diff --git a/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml b/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml new file mode 100644 index 
000000000..737404ccb --- /dev/null +++ b/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: compute-domain-default-channel.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0" diff --git a/assets/state-dra-driver/0420_deviceclass-gpu.yaml b/assets/state-dra-driver/0420_deviceclass-gpu.yaml new file mode 100644 index 000000000..7c65e3762 --- /dev/null +++ b/assets/state-dra-driver/0420_deviceclass-gpu.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: gpu.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" diff --git a/assets/state-dra-driver/0430_deviceclass-mig.yaml b/assets/state-dra-driver/0430_deviceclass-mig.yaml new file mode 100644 index 000000000..0188ca08f --- /dev/null +++ b/assets/state-dra-driver/0430_deviceclass-mig.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: mig.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'" diff --git a/assets/state-dra-driver/0500_deployment.yaml b/assets/state-dra-driver/0500_deployment.yaml new file mode 100644 index 000000000..1e5bbd0f0 --- /dev/null +++ b/assets/state-dra-driver/0500_deployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nvidia-dra-driver-controller + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-controller +spec: + replicas: 1 + selector: + matchLabels: + app: nvidia-dra-driver-controller + template: + metadata: + labels: + app: nvidia-dra-driver-controller + spec: + priorityClassName: 
system-node-critical + serviceAccountName: nvidia-dra-driver + containers: + - name: compute-domains + image: "FILLED BY THE OPERATOR" + command: ["compute-domain-controller", "-v", "6"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml new file mode 100644 index 000000000..6ffda9d28 --- /dev/null +++ b/assets/state-dra-driver/0600_configmap.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-dra-driver-kubelet-plugin-entrypoint + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-kubelet-plugin +data: + entrypoint.sh: |- + #!/bin/sh + + if [ "$#" -ne 1 ]; then + echo "Usage: $0 COMMAND" + exit 1 + fi + + entrypoint=$1 + + until [ -f /run/nvidia/validations/driver-ready ] + do + echo "waiting for the driver validations to be ready..." + sleep 5 + done + + set -o allexport + cat /run/nvidia/validations/driver-ready + . /run/nvidia/validations/driver-ready + + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. 
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params + mount --bind root/gpu-params /proc/driver/nvidia/params + fi + + echo "Starting the NVIDIA DRA Driver Kubelet Plugin" + exec $entrypoint diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml new file mode 100644 index 000000000..a829281a6 --- /dev/null +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -0,0 +1,171 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-dra-driver-kubelet-plugin + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-kubelet-plugin +spec: + selector: + matchLabels: + app: nvidia-dra-driver-kubelet-plugin + template: + metadata: + labels: + app: nvidia-dra-driver-kubelet-plugin + spec: + nodeSelector: + nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: "true" + priorityClassName: system-node-critical + serviceAccountName: nvidia-dra-driver + initContainers: + - name: driver-validation + image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + command: ['sh', '-c'] + args: ["nvidia-validator"] + env: + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: driver + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: driver-install-dir + mountPath: /run/nvidia/driver + mountPropagation: HostToContainer + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: host-dev-char + mountPath: /host-dev-char + containers: + - name: compute-domains + securityContext: + privileged: true + image: "FILLED BY THE OPERATOR" + command: ["/bin/sh", "-c"] + args: + - /bin/entrypoint.sh 
"compute-domain-kubelet-plugin -v 6" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + readOnly: true + mountPath: /bin/entrypoint.sh + subPath: entrypoint.sh + - name: plugins-registry + mountPath: /var/lib/kubelet/plugins_registry + - name: plugins + mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: driver-install-dir + mountPath: /driver-root + readOnly: true + mountPropagation: HostToContainer + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: gpus + securityContext: + privileged: true + image: "FILLED BY THE OPERATOR" + command: ["/bin/sh", "-c"] + args: + - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + readOnly: true + mountPath: /bin/entrypoint.sh + subPath: entrypoint.sh + - name: plugins-registry + mountPath: /var/lib/kubelet/plugins_registry + - name: plugins + mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: driver-install-dir + mountPath: /driver-root + readOnly: true + mountPropagation: 
HostToContainer + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + volumes: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + configMap: + name: nvidia-dra-driver-kubelet-plugin-entrypoint + defaultMode: 448 + - name: plugins-registry + hostPath: + path: /var/lib/kubelet/plugins_registry + - name: plugins + hostPath: + path: /var/lib/kubelet/plugins + - name: cdi + hostPath: + path: /var/run/cdi + - name: run-nvidia-validations + hostPath: + path: /run/nvidia/validations + type: DirectoryOrCreate + - name: driver-install-dir + hostPath: + path: /run/nvidia/driver + type: DirectoryOrCreate + - name: host-root + hostPath: + path: / + - name: host-dev-char + hostPath: + path: /dev/char diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 32a25668f..22eeca601 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -98,6 +98,25 @@ metadata: "maxUnavailable": "1" } }, + "draDriver": { + "gpus": { + "enabled": false, + "kubeletPlugin": {} + }, + "computeDomains": { + "enabled": false, + "controller": { + "tolerations": [ + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Exists", + "effect": "NoSchedule" + } + ] + }, + "kubeletPlugin": {} + } + }, "devicePlugin": { "enabled": true, "config": { @@ -239,6 +258,8 @@ spec: image: nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2@sha256:24892b0ee0ca924d3c644648e9f0e0fa80d238e2fb681b21913f32fd0af9cde7 - name: gdrcopy-image image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc + - name: dra-driver-image + image: nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9 customresourcedefinitions: owned: - name: nvidiadrivers.nvidia.com @@ -748,6 +769,7 @@ 
spec: - apps resources: - daemonsets + - deployments verbs: - get - list @@ -775,6 +797,84 @@ spec: - update - patch - delete + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.nvidia.com + resources: + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - deviceclasses + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update permissions: - serviceAccountName: gpu-operator rules: @@ -803,6 +903,7 @@ spec: - apps resources: - daemonsets + - deployments verbs: - create - get @@ -953,6 +1054,8 @@ spec: value: "nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2@sha256:24892b0ee0ca924d3c644648e9f0e0fa80d238e2fb681b21913f32fd0af9cde7" - name: "GDRCOPY_IMAGE" value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc" + - name: "DRA_DRIVER_IMAGE" + value: "nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9" terminationGracePeriodSeconds: 10 serviceAccountName: gpu-operator strategy: deployment diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- 
a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + type: object + enabled: + description: Enabled indicates if ComputeDomains are enabled + in the NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/bundle/manifests/resource.nvidia.com_computedomains.yaml b/bundle/manifests/resource.nvidia.com_computedomains.yaml new file mode 100644 index 000000000..307b21ff7 --- /dev/null +++ b/bundle/manifests/resource.nvidia.com_computedomains.yaml @@ -0,0 +1,104 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomains.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomain + listKind: ComputeDomainList + plural: computedomains + singular: computedomain + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. + properties: + name: + type: string + required: + - name + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: ComputeDomainStatus provides the status for a ComputeDomain. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. 
+ properties: + cliqueID: + type: string + ipAddress: + type: string + name: + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. 
By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + enabled: + description: Enabled indicates if ComputeDomains are enabled + in the NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445..b92023270 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -121,6 +121,18 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } + if instance.Spec.DevicePlugin.IsEnabled() && instance.Spec.DRADriver.IsGPUsEnabled() { + err = fmt.Errorf("the device-plugin and dra driver for GPUs cannot both be enabled") + condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()) + if condErr != nil { + r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error()) + } + if clusterPolicyCtrl.operatorMetrics != nil { + clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) + } + return ctrl.Result{}, err + } + if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil { r.Log.Error(err, "unable to initialize ClusterPolicy controller") if condErr := r.conditionUpdater.SetConditionsError(ctx, 
instance, conditions.ReconcileFailed, err.Error()); condErr != nil { diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1f8806fcc..84987bd3f 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -36,6 +36,7 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" nodev1beta1 "k8s.io/api/node/v1beta1" + resourceapi "k8s.io/api/resource/v1beta1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -151,6 +152,8 @@ const ( NvidiaCtrRuntimeCDIPrefixesEnvName = "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES" // CDIEnabledEnvName is the name of the envvar used to enable CDI in the operands CDIEnabledEnvName = "CDI_ENABLED" + // NvidiaCTKPathEnvName is the name of the envvar specifying the path to the 'nvidia-ctk' binary + NvidiaCTKPathEnvName = "NVIDIA_CTK_PATH" // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH" // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration @@ -703,19 +706,20 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error "nvidia-vgpu-device-manager": TransformVGPUDeviceManager, "nvidia-vfio-manager": TransformVFIOManager, "nvidia-container-toolkit-daemonset": TransformToolkit, + "nvidia-dra-driver-kubelet-plugin": TransformDRADriverKubeletPlugin, "nvidia-device-plugin-daemonset": TransformDevicePlugin, "nvidia-device-plugin-mps-control-daemon": TransformMPSControlDaemon, "nvidia-sandbox-device-plugin-daemonset": TransformSandboxDevicePlugin, "nvidia-kata-sandbox-device-plugin-daemonset": TransformKataDevicePlugin, - "nvidia-dcgm": TransformDCGM, - "nvidia-dcgm-exporter": TransformDCGMExporter, - "nvidia-node-status-exporter": TransformNodeStatusExporter, - 
"gpu-feature-discovery": TransformGPUDiscoveryPlugin, - "nvidia-mig-manager": TransformMIGManager, - "nvidia-operator-validator": TransformValidator, - "nvidia-sandbox-validator": TransformSandboxValidator, - "nvidia-kata-manager": TransformKataManager, - "nvidia-cc-manager": TransformCCManager, + "nvidia-dcgm": TransformDCGM, + "nvidia-dcgm-exporter": TransformDCGMExporter, + "nvidia-node-status-exporter": TransformNodeStatusExporter, + "gpu-feature-discovery": TransformGPUDiscoveryPlugin, + "nvidia-mig-manager": TransformMIGManager, + "nvidia-operator-validator": TransformValidator, + "nvidia-sandbox-validator": TransformSandboxValidator, + "nvidia-kata-manager": TransformKataManager, + "nvidia-cc-manager": TransformCCManager, } t, ok := transformations[obj.Name] @@ -1736,6 +1740,74 @@ func TransformKataDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return nil } +// TransformDRADriverKubeletPlugin transforms nvidia-dra-driver-kubelet-plugin daemonset with required config as per ClusterPolicy +func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + err := transformValidationInitContainer(obj, config) + if err != nil { + return err + } + + if len(config.DRADriver.ImagePullSecrets) > 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.DRADriver.ImagePullSecrets) + } + + image, err := gpuv1.ImagePath(&config.DRADriver) + if err != nil { + return err + } + + var containers []corev1.Container + for i, container := range obj.Spec.Template.Spec.Containers { + // Skip the container if the resource type is not enabled. + // As a result, the container will be removed from the spec. 
+ if (container.Name == "gpus" && !config.DRADriver.IsGPUsEnabled()) || + (container.Name == "compute-domains" && !config.DRADriver.IsComputeDomainsEnabled()) { + continue + } + + obj.Spec.Template.Spec.Containers[i].Image = image + obj.Spec.Template.Spec.Containers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy) + + if config.Toolkit.IsEnabled() { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), NvidiaCTKPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-ctk")) + } + + // update the "gpus" container + if container.Name == "gpus" { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), "IMAGE_NAME", image) + if len(config.DRADriver.GPUs.KubeletPlugin.Env) > 0 { + for _, env := range config.DRADriver.GPUs.KubeletPlugin.Env { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value) + } + } + + if config.DRADriver.GPUs.KubeletPlugin.Resources != nil { + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.GPUs.KubeletPlugin.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.GPUs.KubeletPlugin.Resources.Limits + } + } + + // update the "compute-domains" container + if container.Name == "compute-domains" { + if len(config.DRADriver.ComputeDomains.KubeletPlugin.Env) > 0 { + for _, env := range config.DRADriver.ComputeDomains.KubeletPlugin.Env { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value) + } + } + + if config.DRADriver.ComputeDomains.KubeletPlugin.Resources != nil { + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Limits + } + } + + containers = append(containers, obj.Spec.Template.Spec.Containers[i]) + } + obj.Spec.Template.Spec.Containers = containers + + return nil +} + // TransformDCGMExporter 
transforms dcgm exporter daemonset with required config as per ClusterPolicy func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update validation container @@ -4087,17 +4159,76 @@ func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.D return hash, nil } +// TransformDRADriverController transforms nvidia-dra-driver-controller deployment with required config as per ClusterPolicy +func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { + var computeDomainsCtr *corev1.Container + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == "compute-domains" { + computeDomainsCtr = &obj.Spec.Template.Spec.Containers[i] + break + } + } + + if computeDomainsCtr == nil { + return fmt.Errorf("failed to find 'compute-domains' container") + } + + config := spec.DRADriver + image, err := gpuv1.ImagePath(&config) + if err != nil { + return err + } + + computeDomainsCtr.Image = image + setContainerEnv(computeDomainsCtr, "IMAGE_NAME", image) + + computeDomainsCtr.ImagePullPolicy = gpuv1.ImagePullPolicy(config.ImagePullPolicy) + + if len(config.ImagePullSecrets) > 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.ImagePullSecrets) + } + + if len(config.ComputeDomains.Controller.Tolerations) > 0 { + obj.Spec.Template.Spec.Tolerations = append(obj.Spec.Template.Spec.Tolerations, config.ComputeDomains.Controller.Tolerations...) 
+ } + + if len(config.ComputeDomains.Controller.Env) > 0 { + for _, env := range config.ComputeDomains.Controller.Env { + setContainerEnv(computeDomainsCtr, env.Name, env.Value) + } + } + + if config.ComputeDomains.Controller.Resources != nil { + computeDomainsCtr.Resources.Requests = config.ComputeDomains.Controller.Resources.Requests + computeDomainsCtr.Resources.Limits = config.ComputeDomains.Controller.Resources.Limits + } + + return nil +} + +func transformDeployment(obj *appsv1.Deployment, n ClusterPolicyController) error { + logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace) + switch obj.Name { + case "nvidia-dra-driver-controller": + return TransformDRADriverController(obj, &n.singleton.Spec) + default: + logger.Info("No transformation for object") + return nil + } +} + // Deployment creates Deployment resource func Deployment(n ClusterPolicyController) (gpuv1.State, error) { ctx := n.ctx state := n.idx + stateName := n.stateNames[state] obj := n.resources[state].Deployment.DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace) // Check if state is disabled and cleanup resource if exists - if !n.isStateEnabled(n.stateNames[n.idx]) { + if !n.isStateEnabled(stateName) || (obj.Name == "nvidia-dra-driver-controller" && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) { err := n.client.Delete(ctx, obj) if err != nil && !apierrors.IsNotFound(err) { logger.Info("Couldn't delete", "Error", err) @@ -4106,6 +4237,11 @@ func Deployment(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + if err := transformDeployment(obj, n); err != nil { + logger.Info("Failed to transform Deployment", "Error", err) + return gpuv1.NotReady, err + } + if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { return gpuv1.NotReady, err } @@ -5362,3 +5498,76 @@ func clearRuntimeClasses(n ClusterPolicyController, 
runtimeClasses []nodev1.Runt } return nil } + +func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) (gpuv1.State, error) { + ctx := n.ctx + state := n.idx + obj := spec.DeepCopy() + + logger := n.logger.WithValues("DeviceClass", obj.Name) + + // Check if state is disabled and cleanup resource if exists + if !n.isStateEnabled(n.stateNames[state]) || + (strings.Contains(obj.Name, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || + (obj.Name == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || + (obj.Name == "mig.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) { + err := n.client.Delete(ctx, obj) + if err != nil && !apierrors.IsNotFound(err) { + logger.Info("Couldn't delete", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Disabled, nil + } + + if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { + return gpuv1.NotReady, err + } + + found := &resourceapi.DeviceClass{} + err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found) + if err != nil && apierrors.IsNotFound(err) { + logger.Info("Not found, creating...") + err = n.client.Create(ctx, obj) + if err != nil { + logger.Info("Couldn't create", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Ready, nil + } else if err != nil { + return gpuv1.NotReady, err + } + + logger.Info("Found Resource, updating...") + obj.ResourceVersion = found.ResourceVersion + + err = n.client.Update(ctx, obj) + if err != nil { + logger.Info("Couldn't update", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Ready, nil +} + +// DeviceClasses creates DeviceClass objects +func DeviceClasses(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + + for _, obj := range n.resources[state].DeviceClasses { + obj := obj + stat, err := createDeviceClass(n, obj) + if err != nil { + return stat, err + } + + switch stat { + case 
gpuv1.Ready: + continue + case gpuv1.Disabled: + continue + default: + status = gpuv1.NotReady + } + } + return status, nil +} diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2789bfe3d..2582143ab 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -28,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" rbacv1 "k8s.io/api/rbac/v1" + resourceapi "k8s.io/api/resource/v1beta1" schedv1 "k8s.io/api/scheduling/v1beta1" secv1 "github.com/openshift/api/security/v1" @@ -61,6 +62,7 @@ type Resources struct { SecurityContextConstraints secv1.SecurityContextConstraints RuntimeClasses []nodev1.RuntimeClass PrometheusRule promv1.PrometheusRule + DeviceClasses []resourceapi.DeviceClass } func filePathWalkDir(n *ClusterPolicyController, root string) ([]string, error) { @@ -180,6 +182,15 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c _, _, err := s.Decode(m, nil, &res.PrometheusRule) panicIfError(err) ctrl = append(ctrl, PrometheusRule) + case "DeviceClass": + deviceClass := resourceapi.DeviceClass{} + _, _, err := s.Decode(m, nil, &deviceClass) + panicIfError(err) + res.DeviceClasses = append(res.DeviceClasses, deviceClass) + // only add the ctrl function when the first DeviceClass is added + if len(res.DeviceClasses) == 1 { + ctrl = append(ctrl, DeviceClasses) + } default: n.logger.Info("Unknown Resource", "Manifest", m, "Kind", kind) } diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..b9b19acee 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -87,14 +87,15 @@ var ( var gpuStateLabels = map[string]map[string]string{ gpuWorkloadConfigContainer: { - "nvidia.com/gpu.deploy.driver": "true", - "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", - "nvidia.com/gpu.deploy.container-toolkit": "true", - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm": "true", 
- "nvidia.com/gpu.deploy.dcgm-exporter": "true", - "nvidia.com/gpu.deploy.node-status-exporter": "true", - "nvidia.com/gpu.deploy.operator-validator": "true", + "nvidia.com/gpu.deploy.driver": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dra-driver-kubelet-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", }, gpuWorkloadConfigVMPassthrough: { "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", @@ -892,6 +893,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-container-toolkit") addState(n, "/opt/gpu-operator/state-operator-validation") addState(n, "/opt/gpu-operator/state-device-plugin") + addState(n, "/opt/gpu-operator/state-dra-driver") addState(n, "/opt/gpu-operator/state-mps-control-daemon") addState(n, "/opt/gpu-operator/state-dcgm") addState(n, "/opt/gpu-operator/state-dcgm-exporter") @@ -1141,6 +1143,8 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { return true case "state-operator-metrics": return true + case "state-dra-driver": + return clusterPolicySpec.DRADriver.IsEnabled() default: n.logger.Error(nil, "invalid state passed", "stateName", stateName) return false diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 64af9f93d..407f32754 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -193,6 +193,36 @@ func (d Daemonset) WithVolume(volume corev1.Volume) Daemonset { return d } +// _Deployment is a Deployment wrapper used for testing +type _Deployment struct { + *appsv1.Deployment +} + +func NewDeployment() _Deployment { + deployment := &appsv1.Deployment{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "test-deployment", + Namespace: "test-ns", + }, + Spec: appsv1.DeploymentSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{}, + }, + }, + } + return _Deployment{deployment} +} + +func (d _Deployment) WithContainer(container corev1.Container) _Deployment { + d.Spec.Template.Spec.Containers = append(d.Spec.Template.Spec.Containers, container) + return d +} + +func (d _Deployment) WithTolerations(tolerations []corev1.Toleration) _Deployment { + d.Spec.Template.Spec.Tolerations = tolerations + return d +} + // Pod is a Pod wrapper used for testing type Pod struct { *corev1.Pod @@ -4635,3 +4665,303 @@ func TestHashDriverInstallConfigZeroFieldInvariant(t *testing.T) { assert.NotEqual(t, originalDigest, changedDigest, "a non-zero new field should change the digest") } + +func TestTransformDRADriverKubeletPlugin(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + errorExpected bool + }{ + { + description: "empty dra driver spec", + ds: NewDaemonset(), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{}, + }, + expectedDs: NewDaemonset(), + errorExpected: true, + }, + { + description: "full dra driver spec, gpus and compute domains enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + KubeletPlugin: gpuv1.DRADriverKubeletPlugin{ + Env: []gpuv1.EnvVar{{Name: "foo", Value: "bar"}}, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }, + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + KubeletPlugin: gpuv1.DRADriverKubeletPlugin{ + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }, + }, + }, + }, + expectedDs: NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }). + WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }), + }, + { + description: "gpus enabled, compute domains disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). + WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + }, + }), + }, + { + description: "gpus disabled, compute domains enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). + WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(false), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + }, + }, + }, + expectedDs: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + }, + }), + }, + { + description: "gpus disabled, compute domains disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(false), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDRADriverKubeletPlugin(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{runtime: gpuv1.Containerd, logger: ctrl.Log.WithName("test")}) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + +func TestTransformDRADriverController(t *testing.T) { + testCases := []struct { + description string + deployment _Deployment + cpSpec *gpuv1.ClusterPolicySpec + expectedDeployment _Deployment + errorExpected bool + }{ + { + description: "empty dra driver spec", + deployment: NewDeployment(), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{}, + }, + expectedDeployment: NewDeployment(), + errorExpected: true, + }, + { + description: "full dra driver spec", + deployment: NewDeployment(). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + Controller: gpuv1.DRADriverController{ + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + Tolerations: []corev1.Toleration{ + { + Key: "foo", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + }, + expectedDeployment: NewDeployment(). + WithTolerations([]corev1.Toleration{ + { + Key: "foo", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }). 
+ WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDRADriverController(tc.deployment.Deployment, tc.cpSpec) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tc.expectedDeployment, tc.deployment) + }) + } +} diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. 
+                        Valid operators are Exists and Equal.
+                        Defaults to Equal.
+                        Exists is equivalent to wildcard for value, so that a pod can
+                        tolerate all taints of a particular category.
+                      type: string
+                    tolerationSeconds:
+                      description: |-
+                        TolerationSeconds represents the period of time the toleration (which must be
+                        of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
+                        it is not set, which means tolerate the taint forever (do not evict). Zero and
+                        negative values will be treated as 0 (evict immediately) by the system.
+                      format: int64
+                      type: integer
+                    value:
+                      description: |-
+                        Value is the taint value the toleration matches to.
+                        If the operator is Exists, the value should be empty, otherwise just a regular string.
+                      type: string
+                  type: object
+                type: array
+            type: object
+            enabled:
+              description: Enabled indicates if ComputeDomains are enabled
+                in the NVIDIA DRA Driver
+              type: boolean
+            kubeletPlugin:
+              description: KubeletPlugin defines configuration for the NVIDIA
+                DRA Driver kubelet plugin
+              properties:
+                env:
+                  description: 'Optional: List of environment variables'
+                  items:
+                    description: EnvVar represents an environment variable
+                      present in a Container.
+                    properties:
+                      name:
+                        description: Name of the environment variable.
+                        type: string
+                      value:
+                        description: Value of the environment variable.
+                        type: string
+                    required:
+                    - name
+                    type: object
+                  type: array
+                resources:
+                  description: 'Optional: Define resources requests and
+                    limits'
+                  properties:
+                    limits:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Limits describes the maximum amount of compute resources allowed.
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml new file mode 100644 index 000000000..307b21ff7 --- /dev/null +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml @@ -0,0 +1,104 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomains.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomain + listKind: ComputeDomainList + plural: computedomains + singular: 
computedomain + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. + properties: + name: + type: string + required: + - name + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: ComputeDomainStatus provides the status for a ComputeDomain. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. 
+ properties: + cliqueID: + type: string + ipAddress: + type: string + name: + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployments/gpu-operator/templates/cleanup_crd.yaml b/deployments/gpu-operator/templates/cleanup_crd.yaml index 0d426f952..347563498 100644 --- a/deployments/gpu-operator/templates/cleanup_crd.yaml +++ b/deployments/gpu-operator/templates/cleanup_crd.yaml @@ -40,6 +40,7 @@ spec: - delete - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index d96a1f03f..c4442e646 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -518,6 +518,50 @@ spec: {{- if .Values.devicePlugin.hostNetwork }} hostNetwork: {{ .Values.devicePlugin.hostNetwork }} {{- end }} + draDriver: + {{- if .Values.draDriver.repository }} + repository: {{ .Values.draDriver.repository }} + {{- end }} + {{- if .Values.draDriver.image }} + image: {{ .Values.draDriver.image }} + {{- end }} + {{- if .Values.draDriver.version }} + version: {{ .Values.draDriver.version | quote }} + {{- end }} + {{- if .Values.draDriver.imagePullPolicy }} + imagePullPolicy: {{ .Values.draDriver.imagePullPolicy }} + {{- end }} + {{- if .Values.draDriver.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.draDriver.imagePullSecrets | nindent 6 }} + {{- end }} + gpus: + 
enabled: {{ .Values.draDriver.gpus.enabled }} + kubeletPlugin: + {{- if .Values.draDriver.gpus.kubeletPlugin.env }} + env: {{ toYaml .Values.draDriver.gpus.kubeletPlugin.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.gpus.kubeletPlugin.resources }} + resources: {{ toYaml .Values.draDriver.gpus.kubeletPlugin.resources | nindent 8 }} + {{- end }} + computeDomains: + enabled: {{ .Values.draDriver.computeDomains.enabled }} + controller: + {{- if .Values.draDriver.computeDomains.controller.env }} + env: {{ toYaml .Values.draDriver.computeDomains.controller.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.controller.resources }} + resources: {{ toYaml .Values.draDriver.computeDomains.controller.resources | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.controller.tolerations }} + tolerations: {{ toYaml .Values.draDriver.computeDomains.controller.tolerations | nindent 8 }} + {{- end }} + kubeletPlugin: + {{- if .Values.draDriver.computeDomains.kubeletPlugin.env }} + env: {{ toYaml .Values.draDriver.computeDomains.kubeletPlugin.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.kubeletPlugin.resources }} + resources: {{ toYaml .Values.draDriver.computeDomains.kubeletPlugin.resources | nindent 8 }} + {{- end }} dcgm: enabled: {{ .Values.dcgm.enabled }} {{- if .Values.dcgm.repository }} diff --git a/deployments/gpu-operator/templates/clusterrole.yaml b/deployments/gpu-operator/templates/clusterrole.yaml index 2af291e22..3bc02222a 100644 --- a/deployments/gpu-operator/templates/clusterrole.yaml +++ b/deployments/gpu-operator/templates/clusterrole.yaml @@ -97,6 +97,7 @@ rules: - apps resources: - daemonsets + - deployments verbs: - get - list @@ -153,3 +154,81 @@ rules: {{- if .Values.operator.cleanupCRD }} - delete {{- end }} +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - 
resource.nvidia.com + resources: + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - deviceclasses + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update diff --git a/deployments/gpu-operator/templates/role.yaml b/deployments/gpu-operator/templates/role.yaml index dc4674c57..2837ce435 100644 --- a/deployments/gpu-operator/templates/role.yaml +++ b/deployments/gpu-operator/templates/role.yaml @@ -32,6 +32,7 @@ rules: - apps resources: - daemonsets + - deployments verbs: - create - get diff --git a/deployments/gpu-operator/templates/upgrade_crd.yaml b/deployments/gpu-operator/templates/upgrade_crd.yaml index e887b3a81..ab66ee7d2 100644 --- a/deployments/gpu-operator/templates/upgrade_crd.yaml +++ b/deployments/gpu-operator/templates/upgrade_crd.yaml @@ -89,6 +89,7 @@ spec: - apply - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 9dba0c80c..06483c601 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -280,6 +280,32 @@ devicePlugin: root: 
"/run/nvidia/mps" hostNetwork: false +draDriver: + repository: nvcr.io/nvidia + image: k8s-dra-driver-gpu + version: v25.3.0-rc.4 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + + gpus: + enabled: false + kubeletPlugin: + env: [] + resources: {} + + computeDomains: + enabled: false + controller: + env: [] + resources: {} + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + kubeletPlugin: + env: [] + resources: {} + # standalone dcgm hostengine dcgm: # disabled by default to use embedded nv-hostengine by exporter From 4167fae6751362bcf9a3babc049c68dfbee81e7f Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 18 Jul 2025 15:45:55 -0700 Subject: [PATCH 02/10] Handle clusters were DRA is not supported or enabled Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_controller.go | 13 +------ controllers/clusterpolicy_validator.go | 43 ++++++++++++++++++++ controllers/state_manager.go | 52 ++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 controllers/clusterpolicy_validator.go diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index b92023270..6db0ecebe 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -121,20 +121,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } - if instance.Spec.DevicePlugin.IsEnabled() && instance.Spec.DRADriver.IsGPUsEnabled() { - err = fmt.Errorf("the device-plugin and dra driver for GPUs cannot both be enabled") - condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()) - if condErr != nil { - r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error()) - } - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) - } - return 
ctrl.Result{}, err - } - if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil { r.Log.Error(err, "unable to initialize ClusterPolicy controller") + updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady) if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { r.Log.Error(condErr, "failed to set condition") } diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go new file mode 100644 index 000000000..2f630a9e4 --- /dev/null +++ b/controllers/clusterpolicy_validator.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controllers + +import ( + "fmt" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" +) + +func (n *ClusterPolicyController) validateClusterPolicy() error { + err := validateDRA(n.singleton, n.draSupported) + if err != nil { + return fmt.Errorf("failed to validate DRA: %w", err) + } + return nil +} + +func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { + if !draSupported && clusterpolicy.Spec.DRADriver.IsEnabled() { + return fmt.Errorf("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster") + } + + if clusterpolicy.Spec.DevicePlugin.IsEnabled() && clusterpolicy.Spec.DRADriver.IsGPUsEnabled() { + return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") + } + + return nil +} diff --git a/controllers/state_manager.go b/controllers/state_manager.go index b9b19acee..781e9ac72 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -162,6 +162,7 @@ type ClusterPolicyController struct { currentKernelVersion string k8sVersion string + draSupported bool openshift string ocpDriverToolkit OpenShiftDriverToolkit @@ -226,6 +227,38 @@ func KubernetesVersion() (string, error) { return info.GitVersion, nil } +// IsDRASupported checks if Dynamic Resource Allocation is enabled in the Kubernetes cluster +// by checking if the 'DeviceClass' resource is a valid Kind. 
+func IsDRASupported(logger logr.Logger) (bool, error) { + cfg := config.GetConfigOrDie() + discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) + if err != nil { + return false, fmt.Errorf("error building discovery client: %w", err) + } + + apiResourceLists, err := discoveryClient.ServerPreferredResources() + if err != nil { + return false, fmt.Errorf("error getting API resources from discovery client: %w", err) + } + + var matches []string + kind := "DeviceClass" + for _, resourceList := range apiResourceLists { + for _, resource := range resourceList.APIResources { + if resource.Kind == kind { + matches = append(matches, resourceList.GroupVersion) + } + } + } + + draSupported := len(matches) > 0 + if draSupported { + logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(matches, ", "))) + } + + return len(matches) > 0, nil +} + // GetClusterWideProxy returns cluster wide proxy object setup in OCP func GetClusterWideProxy(ctx context.Context) (*apiconfigv1.Proxy, error) { cfg := config.GetConfigOrDie() @@ -884,6 +917,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } + draSupported, err := IsDRASupported(n.logger) + if err != nil { + return fmt.Errorf("failed to detect if DRA is supported: %w", err) + } + n.draSupported = draSupported + n.operatorMetrics = initOperatorMetrics() n.logger.Info("Operator metrics initialized.") @@ -893,14 +932,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-container-toolkit") addState(n, "/opt/gpu-operator/state-operator-validation") addState(n, "/opt/gpu-operator/state-device-plugin") - addState(n, "/opt/gpu-operator/state-dra-driver") addState(n, "/opt/gpu-operator/state-mps-control-daemon") addState(n, "/opt/gpu-operator/state-dcgm") addState(n, "/opt/gpu-operator/state-dcgm-exporter") addState(n, 
"/opt/gpu-operator/gpu-feature-discovery") addState(n, "/opt/gpu-operator/state-mig-manager") addState(n, "/opt/gpu-operator/state-node-status-exporter") - // add sandbox workload states addState(n, "/opt/gpu-operator/state-vgpu-manager") addState(n, "/opt/gpu-operator/state-vgpu-device-manager") addState(n, "/opt/gpu-operator/state-sandbox-validation") @@ -909,6 +946,17 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-kata-device-plugin") addState(n, "/opt/gpu-operator/state-kata-manager") addState(n, "/opt/gpu-operator/state-cc-manager") + + if n.draSupported { + addState(n, "/opt/gpu-operator/state-dra-driver") + } + } + + // TODO: combine this validation logic with the call to + // ValidateClusterPolicySpec() up above + err := n.validateClusterPolicy() + if err != nil { + return fmt.Errorf("ClusterPolicy validation failed: %w", err) } if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() { From 6bf74971398be4b9813cf477fae82006557e1ea6 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Sat, 19 Jul 2025 15:19:29 -0700 Subject: [PATCH 03/10] Add service account for compute-domain-daemon Signed-off-by: Christopher Desiniotis --- ...compute_domain_daemon-service_account.yaml | 5 ++ ...220_compute_domain_daemon-clusterrole.yaml | 17 +++++ ...pute_domain_daemon-clusterrolebinding.yaml | 13 ++++ controllers/object_controls.go | 66 ++++++++++++++++--- controllers/resource_manager.go | 33 +++++++--- 5 files changed, 116 insertions(+), 18 deletions(-) create mode 100644 assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml create mode 100644 assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml create mode 100644 assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml diff --git a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml new file mode 100644 
index 000000000..e4bfe6255 --- /dev/null +++ b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: compute-domain-daemon-service-account + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml new file mode 100644 index 000000000..4b157fa4a --- /dev/null +++ b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-domain-daemon-clusterrole + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - update + - patch diff --git a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml new file mode 100644 index 000000000..5ba739004 --- /dev/null +++ b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-clusterrole-binding + namespace: "FILLED BY THE OPERATOR" +subjects: + - kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: "FILLED BY THE OPERATOR" +roleRef: + kind: ClusterRole + name: compute-domain-daemon-clusterrole + apiGroup: rbac.authorization.k8s.io diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 84987bd3f..57f2dd6f9 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -323,11 +323,11 @@ var SubscriptionPathMap = map[string](MountPathToVolumeSource){ type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error) -// ServiceAccount creates ServiceAccount 
resource -func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) { +// createServiceAccount creates a ServiceAccount resource +func createServiceAccount(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ServiceAccount.DeepCopy() + obj := n.resources[state].ServiceAccounts[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ServiceAccount", obj.Name, "Namespace", obj.Namespace) @@ -358,6 +358,22 @@ func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// ServiceAccounts creates one or more ServiceAccount resources +func ServiceAccounts(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ServiceAccounts { + stat, err := createServiceAccount(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // Role creates Role resource func Role(n ClusterPolicyController) (gpuv1.State, error) { ctx := n.ctx @@ -450,11 +466,11 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// ClusterRole creates ClusterRole resource -func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) { +// createClusterRole creates a ClusterRole resource +func createClusterRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ClusterRole.DeepCopy() + obj := n.resources[state].ClusterRoles[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ClusterRole", obj.Name, "Namespace", obj.Namespace) @@ -491,11 +507,27 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// ClusterRoleBinding creates ClusterRoleBinding resource -func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) { +// ClusterRoles 
creates one or more ClusterRole resources +func ClusterRoles(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ClusterRoles { + stat, err := createClusterRole(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + +// createClusterRoleBinding creates a ClusterRoleBinding resource +func createClusterRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ClusterRoleBinding.DeepCopy() + obj := n.resources[state].ClusterRoleBindings[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ClusterRoleBinding", obj.Name, "Namespace", obj.Namespace) @@ -536,6 +568,22 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// ClusterRoleBindings creates one or more ClusterRoleBinding resources +func ClusterRoleBindings(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ClusterRoleBindings { + stat, err := createClusterRoleBinding(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createConfigMap creates a ConfigMap resource func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, error) { ctx := n.ctx diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2582143ab..e8acbe4e8 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -46,11 +46,11 @@ type assetsFromFile []byte // Resources indicates resources managed by GPU operator type Resources struct { - ServiceAccount corev1.ServiceAccount + ServiceAccounts []corev1.ServiceAccount Role rbacv1.Role RoleBinding rbacv1.RoleBinding - ClusterRole rbacv1.ClusterRole - 
ClusterRoleBinding rbacv1.ClusterRoleBinding + ClusterRoles []rbacv1.ClusterRole + ClusterRoleBindings []rbacv1.ClusterRoleBinding ConfigMaps []corev1.ConfigMap DaemonSet appsv1.DaemonSet Deployment appsv1.Deployment @@ -121,9 +121,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c switch kind { case "ServiceAccount": - _, _, err := s.Decode(m, nil, &res.ServiceAccount) + serviceAccount := corev1.ServiceAccount{} + _, _, err := s.Decode(m, nil, &serviceAccount) panicIfError(err) - ctrl = append(ctrl, ServiceAccount) + res.ServiceAccounts = append(res.ServiceAccounts, serviceAccount) + // only add the ctrl function when the first ServiceAccount is added for this component + if len(res.ServiceAccounts) == 1 { + ctrl = append(ctrl, ServiceAccounts) + } case "Role": _, _, err := s.Decode(m, nil, &res.Role) panicIfError(err) @@ -133,13 +138,23 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, RoleBinding) case "ClusterRole": - _, _, err := s.Decode(m, nil, &res.ClusterRole) + clusterRole := rbacv1.ClusterRole{} + _, _, err := s.Decode(m, nil, &clusterRole) panicIfError(err) - ctrl = append(ctrl, ClusterRole) + res.ClusterRoles = append(res.ClusterRoles, clusterRole) + // only add the ctrl function when the first ClusterRole is added for this component + if len(res.ClusterRoles) == 1 { + ctrl = append(ctrl, ClusterRoles) + } case "ClusterRoleBinding": - _, _, err := s.Decode(m, nil, &res.ClusterRoleBinding) + clusterRoleBinding := rbacv1.ClusterRoleBinding{} + _, _, err := s.Decode(m, nil, &clusterRoleBinding) panicIfError(err) - ctrl = append(ctrl, ClusterRoleBinding) + res.ClusterRoleBindings = append(res.ClusterRoleBindings, clusterRoleBinding) + // only add the ctrl function when the first ClusterRoleBinding is added for this component + if len(res.ClusterRoleBindings) == 1 { + ctrl = append(ctrl, ClusterRoleBindings) + } case "ConfigMap": cm := 
corev1.ConfigMap{} _, _, err := s.Decode(m, nil, &cm) From fe9f0829d716d70b1fa01b1fe808065bfb7857e2 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 28 Jul 2025 16:52:47 -0700 Subject: [PATCH 04/10] Allow DRA kubelet-plugin to run privileged on OpenShift Signed-off-by: Christopher Desiniotis --- .../0330_rolebinding.openshift.yaml | 13 +++++++++ controllers/object_controls.go | 28 +++++++++++++------ controllers/resource_manager.go | 11 ++++++-- 3 files changed, 40 insertions(+), 12 deletions(-) create mode 100644 assets/state-dra-driver/0330_rolebinding.openshift.yaml diff --git a/assets/state-dra-driver/0330_rolebinding.openshift.yaml b/assets/state-dra-driver/0330_rolebinding.openshift.yaml new file mode 100644 index 000000000..bb49c649a --- /dev/null +++ b/assets/state-dra-driver/0330_rolebinding.openshift.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver-openshift-privileged-role-binding + namespace: "FILLED BY THE OPERATOR" +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +roleRef: + kind: ClusterRole + name: system:openshift:scc:privileged + apiGroup: rbac.authorization.k8s.io diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 57f2dd6f9..b4f9622b7 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -415,11 +415,11 @@ func Role(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// RoleBinding creates RoleBinding resource -func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { +// createRoleBinding creates a RoleBinding resource +func createRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].RoleBinding.DeepCopy() + obj := n.resources[state].RoleBindings[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("RoleBinding", 
obj.Name, "Namespace", obj.Namespace) @@ -435,12 +435,6 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { } for idx := range obj.Subjects { - // we don't want to update ALL the Subjects[].Namespace, eg we need to keep 'openshift-monitoring' - // for allowing PrometheusOperator to scrape our metrics resources: - // see in assets/state-dcgm-exporter, 0500_prom_rolebinding_openshift.yaml vs 0300_rolebinding.yaml - if obj.Subjects[idx].Namespace != "FILLED BY THE OPERATOR" { - continue - } obj.Subjects[idx].Namespace = n.operatorNamespace } @@ -466,6 +460,22 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// RoleBindings creates one or more RoleBinding resources +func RoleBindings(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].RoleBindings { + stat, err := createRoleBinding(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createClusterRole creates a ClusterRole resource func createClusterRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index e8acbe4e8..c48b12b1b 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -48,7 +48,7 @@ type assetsFromFile []byte type Resources struct { ServiceAccounts []corev1.ServiceAccount Role rbacv1.Role - RoleBinding rbacv1.RoleBinding + RoleBindings []rbacv1.RoleBinding ClusterRoles []rbacv1.ClusterRole ClusterRoleBindings []rbacv1.ClusterRoleBinding ConfigMaps []corev1.ConfigMap @@ -134,9 +134,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, Role) case "RoleBinding": - _, _, err := s.Decode(m, nil, &res.RoleBinding) + roleBinding := rbacv1.RoleBinding{} + _, _, err := s.Decode(m, nil, 
{{- /*
Fail fast on unsupported DRA configurations before any resources render.
The fail messages are prefixed with a newline so they stand apart from
Helm's own error framing.
*/}}
{{- $gpusDRAEnabled := eq .Values.draDriver.gpus.enabled true }}
{{- $computeDomainsDRAEnabled := eq .Values.draDriver.computeDomains.enabled true }}
{{- $draEnabled := or $gpusDRAEnabled $computeDomainsDRAEnabled }}
{{- /* DRA is usable only when the cluster serves a DeviceClass API version we understand. */}}
{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }}

{{- if and (eq .Values.devicePlugin.enabled true) $gpusDRAEnabled }}
{{- fail "\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" }}
{{- end }}

{{- if and $draEnabled (not $clusterSupportsDRA) }}
{{- fail "\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" }}
{{- end }}
Subject: [PATCH 06/10] Conditionally set the apiVersion for all device classes we install Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 28 ++++++++------- controllers/resource_manager.go | 6 ++-- controllers/state_manager.go | 60 ++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b4f9622b7..29ffb9bb3 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -36,10 +36,10 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" nodev1beta1 "k8s.io/api/node/v1beta1" - resourceapi "k8s.io/api/resource/v1beta1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -5557,19 +5557,24 @@ func clearRuntimeClasses(n ClusterPolicyController, runtimeClasses []nodev1.Runt return nil } -func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) (gpuv1.State, error) { +func createDeviceClass(n ClusterPolicyController, spec unstructured.Unstructured) (gpuv1.State, error) { ctx := n.ctx state := n.idx obj := spec.DeepCopy() + deviceClassName := obj.GetName() - logger := n.logger.WithValues("DeviceClass", obj.Name) + logger := n.logger.WithValues("DeviceClass", deviceClassName) + + gvr := n.resourceGVR + apiVersion := gvr.Group + "/" + gvr.Version + obj.SetAPIVersion(apiVersion) // Check if state is disabled and cleanup resource if exists if !n.isStateEnabled(n.stateNames[state]) || - (strings.Contains(obj.Name, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || - (obj.Name == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || - (obj.Name == "mig.nvidia.com" && 
!n.singleton.Spec.DRADriver.IsGPUsEnabled()) { - err := n.client.Delete(ctx, obj) + (strings.Contains(deviceClassName, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || + (deviceClassName == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || + (deviceClassName == "mig.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) { + err := n.dynamicClient.Resource(gvr).Delete(ctx, deviceClassName, metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { logger.Info("Couldn't delete", "Error", err) return gpuv1.NotReady, err @@ -5581,11 +5586,10 @@ func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) return gpuv1.NotReady, err } - found := &resourceapi.DeviceClass{} - err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found) + found, err := n.dynamicClient.Resource(gvr).Get(ctx, deviceClassName, metav1.GetOptions{}) if err != nil && apierrors.IsNotFound(err) { logger.Info("Not found, creating...") - err = n.client.Create(ctx, obj) + _, err := n.dynamicClient.Resource(gvr).Create(ctx, obj, metav1.CreateOptions{}) if err != nil { logger.Info("Couldn't create", "Error", err) return gpuv1.NotReady, err @@ -5596,9 +5600,9 @@ func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) } logger.Info("Found Resource, updating...") - obj.ResourceVersion = found.ResourceVersion + obj.SetResourceVersion(found.GetResourceVersion()) - err = n.client.Update(ctx, obj) + _, err = n.dynamicClient.Resource(gvr).Update(ctx, obj, metav1.UpdateOptions{}) if err != nil { logger.Info("Couldn't update", "Error", err) return gpuv1.NotReady, err diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index c48b12b1b..2ae7497ba 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -28,8 +28,8 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" rbacv1 "k8s.io/api/rbac/v1" - resourceapi 
"k8s.io/api/resource/v1beta1" schedv1 "k8s.io/api/scheduling/v1beta1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" secv1 "github.com/openshift/api/security/v1" @@ -62,7 +62,7 @@ type Resources struct { SecurityContextConstraints secv1.SecurityContextConstraints RuntimeClasses []nodev1.RuntimeClass PrometheusRule promv1.PrometheusRule - DeviceClasses []resourceapi.DeviceClass + DeviceClasses []unstructured.Unstructured } func filePathWalkDir(n *ClusterPolicyController, root string) ([]string, error) { @@ -203,7 +203,7 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, PrometheusRule) case "DeviceClass": - deviceClass := resourceapi.DeviceClass{} + deviceClass := unstructured.Unstructured{} _, _, err := s.Decode(m, nil, &deviceClass) panicIfError(err) res.DeviceClasses = append(res.DeviceClasses, deviceClass) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 781e9ac72..7d40c0676 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "path/filepath" + "slices" "strconv" "strings" @@ -31,7 +32,9 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/discovery" + "k8s.io/client-go/dynamic" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" @@ -145,7 +148,8 @@ type OpenShiftDriverToolkit struct { // ClusterPolicyController represents clusterpolicy controller spec for GPU operator type ClusterPolicyController struct { - client client.Client + client client.Client + dynamicClient dynamic.Interface ctx context.Context singleton *gpuv1.ClusterPolicy @@ -163,6 +167,7 @@ type ClusterPolicyController struct { k8sVersion string draSupported bool + resourceGVR schema.GroupVersionResource openshift string ocpDriverToolkit 
OpenShiftDriverToolkit @@ -229,34 +234,48 @@ func KubernetesVersion() (string, error) { // IsDRASupported checks if Dynamic Resource Allocation is enabled in the Kubernetes cluster // by checking if the 'DeviceClass' resource is a valid Kind. -func IsDRASupported(logger logr.Logger) (bool, error) { +func IsDRASupported(logger logr.Logger) (bool, schema.GroupVersionResource, error) { + var resourceGVR schema.GroupVersionResource + cfg := config.GetConfigOrDie() discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) if err != nil { - return false, fmt.Errorf("error building discovery client: %w", err) + return false, resourceGVR, fmt.Errorf("error building discovery client: %w", err) } apiResourceLists, err := discoveryClient.ServerPreferredResources() if err != nil { - return false, fmt.Errorf("error getting API resources from discovery client: %w", err) + return false, resourceGVR, fmt.Errorf("error getting API resources from discovery client: %w", err) } - var matches []string + var resourceAPIGroupVersions []string kind := "DeviceClass" for _, resourceList := range apiResourceLists { for _, resource := range resourceList.APIResources { if resource.Kind == kind { - matches = append(matches, resourceList.GroupVersion) + resourceAPIGroupVersions = append(resourceAPIGroupVersions, resourceList.GroupVersion) } } } - draSupported := len(matches) > 0 - if draSupported { - logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(matches, ", "))) + if len(resourceAPIGroupVersions) == 0 { + return false, resourceGVR, nil + } + + logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(resourceAPIGroupVersions, ", "))) + + switch { + case slices.Contains(resourceAPIGroupVersions, "resource.k8s.io/v1"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1", Resource: "deviceclasses"} + case slices.Contains(resourceAPIGroupVersions, 
"resource.k8s.io/v1beta2"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1beta2", Resource: "deviceclasses"} + case slices.Contains(resourceAPIGroupVersions, "resource.k8s.io/v1beta1"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1beta1", Resource: "deviceclasses"} + default: + return false, resourceGVR, fmt.Errorf("failed to determine the GVR to use for the DeviceClass resource") } - return len(matches) > 0, nil + return true, resourceGVR, nil } // GetClusterWideProxy returns cluster wide proxy object setup in OCP @@ -885,6 +904,16 @@ func (n *ClusterPolicyController) getRuntime() error { return nil } +func newDynamicClient() (dynamic.Interface, error) { + cfg := config.GetConfigOrDie() + dynamicClient, err := dynamic.NewForConfig(cfg) + if err != nil { + return nil, err + } + + return dynamicClient, nil +} + func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterPolicyReconciler, clusterPolicy *gpuv1.ClusterPolicy) error { n.singleton = clusterPolicy n.ctx = ctx @@ -893,6 +922,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.client = reconciler.Client n.scheme = reconciler.Scheme + dynamicClient, err := newDynamicClient() + if err != nil { + return fmt.Errorf("failed to get dynamic k8s client: %w", err) + } + n.dynamicClient = dynamicClient + if len(n.controls) == 0 { clusterPolicyCtrl.operatorNamespace = reconciler.Namespace @@ -917,11 +952,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } - draSupported, err := IsDRASupported(n.logger) + draSupported, resourceGVR, err := IsDRASupported(n.logger) if err != nil { return fmt.Errorf("failed to detect if DRA is supported: %w", err) } n.draSupported = draSupported + n.resourceGVR = resourceGVR n.operatorMetrics = initOperatorMetrics() n.logger.Info("Operator metrics initialized.") @@ 
-954,7 +990,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP // TODO: combine this validation logic with the call to // ValidateClusterPolicySpec() up above - err := n.validateClusterPolicy() + err = n.validateClusterPolicy() if err != nil { return fmt.Errorf("ClusterPolicy validation failed: %w", err) } From 885599f70a658756f2cb030876a387a58113ac36 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 18 Sep 2025 11:33:48 -0700 Subject: [PATCH 07/10] Bump to latest dra driver image on HEAD of main Signed-off-by: Christopher Desiniotis --- assets/state-dra-driver/0600_configmap.yaml | 2 +- assets/state-dra-driver/0700_daemonset.yaml | 10 ++++++++-- deployments/gpu-operator/values.yaml | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml index 6ffda9d28..495d5ca2c 100644 --- a/assets/state-dra-driver/0600_configmap.yaml +++ b/assets/state-dra-driver/0600_configmap.yaml @@ -7,7 +7,7 @@ metadata: app: nvidia-dra-driver-kubelet-plugin data: entrypoint.sh: |- - #!/bin/sh + #!/bin/bash if [ "$#" -ne 1 ]; then echo "Usage: $0 COMMAND" diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml index a829281a6..440d2d8e2 100644 --- a/assets/state-dra-driver/0700_daemonset.yaml +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -55,7 +55,10 @@ spec: securityContext: privileged: true image: "FILLED BY THE OPERATOR" - command: ["/bin/sh", "-c"] + # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of + # the NVIDIA distroless base image, which does not have bash, a statically compiled + # bash is added to the final image at /bin/bash. 
+ command: ["/bin/bash", "-c"] args: - /bin/entrypoint.sh "compute-domain-kubelet-plugin -v 6" env: @@ -100,7 +103,10 @@ spec: securityContext: privileged: true image: "FILLED BY THE OPERATOR" - command: ["/bin/sh", "-c"] + # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of + # the NVIDIA distroless base image, which does not have bash, a statically compiled + # bash is added to the final image at /bin/bash. + command: ["/bin/bash", "-c"] args: - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" env: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 06483c601..27fb30e63 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -281,9 +281,9 @@ devicePlugin: hostNetwork: false draDriver: - repository: nvcr.io/nvidia + repository: ghcr.io/nvidia image: k8s-dra-driver-gpu - version: v25.3.0-rc.4 + version: v25.8.0-dev-124734f2 imagePullPolicy: IfNotPresent imagePullSecrets: [] From 066fce0feeae37484de82f5f9a653f2a23ff7596 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 15 Apr 2026 18:05:58 -0700 Subject: [PATCH 08/10] Prevent DRA driver and sandboxWorkloads from both being enabled Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_validator.go | 4 ++++ deployments/gpu-operator/templates/validation.yaml | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go index 2f630a9e4..b8caeb0d0 100644 --- a/controllers/clusterpolicy_validator.go +++ b/controllers/clusterpolicy_validator.go @@ -39,5 +39,9 @@ func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } + if clusterpolicy.Spec.SandboxWorkloads.IsEnabled() && clusterpolicy.Spec.DRADriver.IsEnabled() { + return fmt.Errorf("sandboxWorkloads and the NVIDIA DRA 
driver for GPUs cannot both be enabled in ClusterPolicy") + } + return nil } diff --git a/deployments/gpu-operator/templates/validation.yaml b/deployments/gpu-operator/templates/validation.yaml index 263b29155..7355409f8 100644 --- a/deployments/gpu-operator/templates/validation.yaml +++ b/deployments/gpu-operator/templates/validation.yaml @@ -12,3 +12,9 @@ {{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} {{- fail $error }} {{- end}} + +{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} +{{- fail $error }} +{{- end}} From a269c98e7d66d28a3a0a0fed3c87d96425b9f471 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 15 Apr 2026 18:29:44 -0700 Subject: [PATCH 09/10] chore: consolidate clusterpolicy validation code Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_validator.go | 27 +++- controllers/clusterpolicy_validator_test.go | 117 ++++++++++++++++++ controllers/state_manager.go | 19 --- controllers/state_manager_test.go | 54 -------- .../gpu-operator/templates/validation.yaml | 20 --- .../gpu-operator/templates/validations.yaml | 21 ++++ 6 files changed, 159 insertions(+), 99 deletions(-) create mode 100644 controllers/clusterpolicy_validator_test.go delete mode 100644 deployments/gpu-operator/templates/validation.yaml diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go index b8caeb0d0..931441067 100644 --- a/controllers/clusterpolicy_validator.go +++ b/controllers/clusterpolicy_validator.go @@ -23,23 +23,38 @@ import ( ) func (n *ClusterPolicyController) validateClusterPolicy() error { - err := validateDRA(n.singleton, n.draSupported) - if err != nil { + if err := validateDRA(&n.singleton.Spec, n.draSupported); err != nil { return fmt.Errorf("failed to validate 
DRA: %w", err) } + + if err := validateNRIPlugin(&n.singleton.Spec); err != nil { + return fmt.Errorf("failed to validate the NRI Plugin: %w", err) + } + return nil +} + +func validateNRIPlugin(spec *gpuv1.ClusterPolicySpec) error { + if !spec.CDI.IsEnabled() && spec.CDI.IsNRIPluginEnabled() { + return fmt.Errorf("the NRI Plugin cannot be enabled when CDI is disabled") + } + + if spec.CDI.IsNRIPluginEnabled() && !spec.Toolkit.IsEnabled() { + return fmt.Errorf("the NRI Plugin cannot be enabled when the Container Toolkit is disabled") + } + return nil } -func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { - if !draSupported && clusterpolicy.Spec.DRADriver.IsEnabled() { +func validateDRA(spec *gpuv1.ClusterPolicySpec, draSupported bool) error { + if !draSupported && spec.DRADriver.IsEnabled() { return fmt.Errorf("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster") } - if clusterpolicy.Spec.DevicePlugin.IsEnabled() && clusterpolicy.Spec.DRADriver.IsGPUsEnabled() { + if spec.DevicePlugin.IsEnabled() && spec.DRADriver.IsGPUsEnabled() { return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } - if clusterpolicy.Spec.SandboxWorkloads.IsEnabled() && clusterpolicy.Spec.DRADriver.IsEnabled() { + if spec.SandboxWorkloads.IsEnabled() && spec.DRADriver.IsEnabled() { return fmt.Errorf("sandboxWorkloads and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } diff --git a/controllers/clusterpolicy_validator_test.go b/controllers/clusterpolicy_validator_test.go new file mode 100644 index 000000000..537d8c52c --- /dev/null +++ b/controllers/clusterpolicy_validator_test.go @@ -0,0 +1,117 @@ +/* + * Copyright (c) NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/require" + "k8s.io/utils/ptr" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" +) + +func TestValidateDRA(t *testing.T) { + tests := []struct { + description string + spec *gpuv1.ClusterPolicySpec + draSupported bool + err error + }{ + { + description: "dra not supported, dra driver not enabled", + spec: &gpuv1.ClusterPolicySpec{}, + }, + { + description: "dra not supported, dra driver enabled", + spec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + GPUs: gpuv1.DRADriverGPUs{ + Enabled: ptr.To(true), + }, + }, + }, + err: errors.New("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster"), + }, + } + + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + err := validateDRA(tc.spec, tc.draSupported) + if tc.err == nil { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Equal(t, tc.err.Error(), err.Error()) + } + }) + } +} + +func TestValidateNRIPlugin(t *testing.T) { + tests := []struct { + description string + spec *gpuv1.ClusterPolicySpec + err error + }{ + { + description: "valid CDI object in spec", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(true), + NRIPluginEnabled: ptr.To(true), + }, + }, + }, + { + description: "invalid CDI object 
in spec", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(false), + NRIPluginEnabled: ptr.To(true), + }, + }, + err: errors.New("the NRI Plugin cannot be enabled when CDI is disabled"), + }, + { + description: "invalid CDI and Toolkit config combination", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(true), + NRIPluginEnabled: ptr.To(true), + }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: ptr.To(false), + }, + }, + err: errors.New("the NRI Plugin cannot be enabled when the Container Toolkit is disabled"), + }, + } + + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + err := validateNRIPlugin(tc.spec) + if tc.err == nil { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Equal(t, tc.err.Error(), err.Error()) + } + }) + } +} diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 7d40c0676..a7cda36f7 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -947,11 +947,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.k8sVersion = k8sVersion n.logger.Info("Kubernetes version detected", "version", k8sVersion) - err = validateClusterPolicySpec(&clusterPolicy.Spec) - if err != nil { - return fmt.Errorf("error validating clusterpolicy: %w", err) - } - draSupported, resourceGVR, err := IsDRASupported(n.logger) if err != nil { return fmt.Errorf("failed to detect if DRA is supported: %w", err) @@ -988,8 +983,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP } } - // TODO: combine this validation logic with the call to - // ValidateClusterPolicySpec() up above err = n.validateClusterPolicy() if err != nil { return fmt.Errorf("ClusterPolicy validation failed: %w", err) @@ -1234,15 +1227,3 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { return false } } - -func validateClusterPolicySpec(spec *gpuv1.ClusterPolicySpec) error { 
- if !spec.CDI.IsEnabled() && spec.CDI.IsNRIPluginEnabled() { - return fmt.Errorf("the NRI Plugin cannot be enabled when CDI is disabled") - } - - if spec.CDI.IsNRIPluginEnabled() && !spec.Toolkit.IsEnabled() { - return fmt.Errorf("the NRI Plugin cannot be enabled when the Container Toolkit is disabled") - } - - return nil -} diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index 6585de196..f8f0f8e3a 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -18,7 +18,6 @@ package controllers import ( "context" - "errors" "testing" "github.com/stretchr/testify/require" @@ -293,59 +292,6 @@ func TestHasMIGCapableGPU(t *testing.T) { } } -func TestValidateClusterPolicySpec(t *testing.T) { - tests := []struct { - description string - spec *gpuv1.ClusterPolicySpec - err error - }{ - { - description: "valid CDI object in spec", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(true), - NRIPluginEnabled: ptr.To(true), - }, - }, - }, - { - description: "invalid CDI object in spec", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(false), - NRIPluginEnabled: ptr.To(true), - }, - }, - err: errors.New("the NRI Plugin cannot be enabled when CDI is disabled"), - }, - { - description: "invalid CDI and Toolkit config combination", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(true), - NRIPluginEnabled: ptr.To(true), - }, - Toolkit: gpuv1.ToolkitSpec{ - Enabled: ptr.To(false), - }, - }, - err: errors.New("the NRI Plugin cannot be enabled when the Container Toolkit is disabled"), - }, - } - - for _, tc := range tests { - t.Run(tc.description, func(t *testing.T) { - err := validateClusterPolicySpec(tc.spec) - if tc.err == nil { - require.NoError(t, err) - } else { - require.Error(t, err) - require.Equal(t, tc.err.Error(), err.Error()) - } - }) - } -} - func TestGetEffectiveStateLabels(t *testing.T) { // 
getEffectiveStateLabels returns labels for workload config and sandbox mode. // For container and vm-vgpu, mode has no effect. For vm-passthrough, mode selects diff --git a/deployments/gpu-operator/templates/validation.yaml b/deployments/gpu-operator/templates/validation.yaml deleted file mode 100644 index 7355409f8..000000000 --- a/deployments/gpu-operator/templates/validation.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- $draEnabled := or (eq .Values.draDriver.gpus.enabled true) (eq .Values.draDriver.computeDomains.enabled true) }} -{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }} - -{{- if and (eq .Values.devicePlugin.enabled true) (eq .Values.draDriver.gpus.enabled true) }} -{{- $error := "" }} -{{- $error = printf "%s\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" $error }} -{{- fail $error }} -{{- end}} - -{{- if and ($draEnabled) (not $clusterSupportsDRA) }} -{{- $error := "" }} -{{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} -{{- fail $error }} -{{- end}} - -{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} -{{- $error := "" }} -{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} -{{- fail $error }} -{{- end}} diff --git a/deployments/gpu-operator/templates/validations.yaml b/deployments/gpu-operator/templates/validations.yaml index 9eaf283f4..8d52f3896 100644 --- a/deployments/gpu-operator/templates/validations.yaml +++ b/deployments/gpu-operator/templates/validations.yaml @@ -5,3 +5,24 @@ {{- if and (eq .Values.cdi.nriPluginEnabled true) (eq .Values.toolkit.enabled false) }} {{ fail "the NRI Plugin cannot be enabled when the Container Toolkit is disabled" }} {{- end }} + +{{- $draEnabled := or (eq .Values.draDriver.gpus.enabled true) (eq 
.Values.draDriver.computeDomains.enabled true) }} +{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }} + +{{- if and (eq .Values.devicePlugin.enabled true) (eq .Values.draDriver.gpus.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" $error }} +{{- fail $error }} +{{- end }} + +{{- if and ($draEnabled) (not $clusterSupportsDRA) }} +{{- $error := "" }} +{{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} +{{- fail $error }} +{{- end }} + +{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} +{{- fail $error }} +{{- end }} From 48c20796ae5364efe891d6b5e46719026e21d0e1 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Fri, 1 May 2026 13:36:58 -0400 Subject: [PATCH 10/10] Update DRA driver assets for refreshed NVIDIA DRA driver Signed-off-by: Karthik Vetrivel --- .../0100_service_account.yaml | 10 +- ...compute_domain_daemon-service_account.yaml | 2 +- .../0120_kubeletplugin-service_account.yaml | 11 + assets/state-dra-driver/0200_clusterrole.yaml | 117 +++--- assets/state-dra-driver/0210_role.yaml | 38 +- ...220_compute_domain_daemon-clusterrole.yaml | 45 +- .../0230_kubeletplugin-clusterrole.yaml | 49 +++ .../0300_clusterrolebinding.yaml | 14 +- assets/state-dra-driver/0310_rolebinding.yaml | 16 +- ...pute_domain_daemon-clusterrolebinding.yaml | 11 +- .../0330_rolebinding.openshift.yaml | 11 +- ...0340_kubeletplugin-clusterrolebinding.yaml | 12 + .../0350_kubeletplugin-role.yaml | 12 + .../0360_kubeletplugin-rolebinding.yaml | 13 + 
...n_daemon-clusterrolebinding.openshift.yaml | 12 + assets/state-dra-driver/0500_deployment.yaml | 101 +++-- assets/state-dra-driver/0600_configmap.yaml | 40 -- assets/state-dra-driver/0700_daemonset.yaml | 390 +++++++++++------- ...rator-certified.clusterserviceversion.yaml | 4 +- ...ource.nvidia.com_computedomaincliques.yaml | 84 ++++ .../resource.nvidia.com_computedomains.yaml | 224 ++++++---- controllers/object_controls.go | 65 ++- controllers/resource_manager.go | 11 +- controllers/transforms_test.go | 48 ++- ...ource.nvidia.com_computedomaincliques.yaml | 84 ++++ .../resource.nvidia.com_computedomains.yaml | 224 ++++++---- .../gpu-operator/templates/cleanup_crd.yaml | 1 + .../gpu-operator/templates/upgrade_crd.yaml | 1 + deployments/gpu-operator/values.yaml | 6 +- docker/Dockerfile | 2 + 30 files changed, 1134 insertions(+), 524 deletions(-) create mode 100644 assets/state-dra-driver/0120_kubeletplugin-service_account.yaml create mode 100644 assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml create mode 100644 assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml create mode 100644 assets/state-dra-driver/0350_kubeletplugin-role.yaml create mode 100644 assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml create mode 100644 assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml delete mode 100644 assets/state-dra-driver/0600_configmap.yaml create mode 100644 bundle/manifests/resource.nvidia.com_computedomaincliques.yaml create mode 100644 deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml diff --git a/assets/state-dra-driver/0100_service_account.yaml b/assets/state-dra-driver/0100_service_account.yaml index 76d6d61af..b76133aba 100644 --- a/assets/state-dra-driver/0100_service_account.yaml +++ b/assets/state-dra-driver/0100_service_account.yaml @@ -1,5 +1,11 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: 
nvidia-dra-driver-controller + namespace: gpu-operator + labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver diff --git a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml index e4bfe6255..970100254 100644 --- a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml +++ b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: compute-domain-daemon-service-account - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator diff --git a/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml b/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml new file mode 100644 index 000000000..6b8e044fb --- /dev/null +++ b/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator + labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver diff --git a/assets/state-dra-driver/0200_clusterrole.yaml b/assets/state-dra-driver/0200_clusterrole.yaml index e2052e9a6..dcfcfc8ab 100644 --- a/assets/state-dra-driver/0200_clusterrole.yaml +++ b/assets/state-dra-driver/0200_clusterrole.yaml @@ -1,69 +1,56 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-clusterrole-controller rules: - - apiGroups: - - resource.nvidia.com - resources: - - computedomains - - computedomains/status - verbs: - - get - 
- list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - resourceclaims - - resourceclaimtemplates - verbs: - - get - - list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - deviceclasses - - resourceslices - verbs: - - get - - list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - resourceclaims/status - verbs: - - update - - apiGroups: - - "" - resources: - - nodes - verbs: - - get - - list - - watch - - update - - patch - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - update +- apiGroups: + - resource.nvidia.com + resources: + - computedomains/status + verbs: + - update +- apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - create + - update +- apiGroups: + - '' + resources: + - nodes + verbs: + - get + - list + - watch + - update +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0210_role.yaml b/assets/state-dra-driver/0210_role.yaml index 62e336e3d..37bcc7834 100644 --- a/assets/state-dra-driver/0210_role.yaml +++ b/assets/state-dra-driver/0210_role.yaml @@ -1,19 +1,27 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-controller + namespace: gpu-operator rules: - - apiGroups: - - apps - resources: - - daemonsets - - deployments - verbs: - - get - - list - - watch - - create - - update - - patch - - delete +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - create + - update + - 
patch + - delete +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get + - list + - watch + - update diff --git a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml index 4b157fa4a..7f113506d 100644 --- a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml +++ b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml @@ -1,17 +1,36 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: compute-domain-daemon-clusterrole - namespace: "FILLED BY THE OPERATOR" + name: compute-domain-daemon-role rules: - - apiGroups: - - resource.nvidia.com - resources: - - computedomains - - computedomains/status - verbs: - - get - - list - - watch - - update - - patch +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get + - list + - watch + - create + - update + - patch +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch + - patch diff --git a/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml b/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml new file mode 100644 index 000000000..11fc91586 --- /dev/null +++ b/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml @@ -0,0 +1,49 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-dra-driver-clusterrole-kubeletplugin +rules: +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch +- apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - delete +- apiGroups: + - '' + 
resources: + - nodes + verbs: + - get + - list + - watch + - update +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0300_clusterrolebinding.yaml b/assets/state-dra-driver/0300_clusterrolebinding.yaml index ea4f6a5e4..fe0fe0113 100644 --- a/assets/state-dra-driver/0300_clusterrolebinding.yaml +++ b/assets/state-dra-driver/0300_clusterrolebinding.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: nvidia-dra-driver + name: nvidia-dra-driver-clusterrole-binding-controller-gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-controller + namespace: gpu-operator roleRef: - apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: nvidia-dra-driver -subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-clusterrole-controller + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0310_rolebinding.yaml b/assets/state-dra-driver/0310_rolebinding.yaml index bf893a63c..fe41c07bf 100644 --- a/assets/state-dra-driver/0310_rolebinding.yaml +++ b/assets/state-dra-driver/0310_rolebinding.yaml @@ -1,13 +1,13 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-binding-controller + namespace: gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-controller + namespace: gpu-operator roleRef: - apiGroup: rbac.authorization.k8s.io kind: Role - name: nvidia-dra-driver -subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-controller + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml index 
5ba739004..b60eca681 100644 --- a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml +++ b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml @@ -1,13 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: compute-domain-daemon-clusterrole-binding - namespace: "FILLED BY THE OPERATOR" + name: compute-domain-daemon-role-binding subjects: - - kind: ServiceAccount - name: compute-domain-daemon-service-account - namespace: "FILLED BY THE OPERATOR" +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: gpu-operator roleRef: kind: ClusterRole - name: compute-domain-daemon-clusterrole + name: compute-domain-daemon-role apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0330_rolebinding.openshift.yaml b/assets/state-dra-driver/0330_rolebinding.openshift.yaml index bb49c649a..ce49c30c6 100644 --- a/assets/state-dra-driver/0330_rolebinding.openshift.yaml +++ b/assets/state-dra-driver/0330_rolebinding.openshift.yaml @@ -1,12 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding +kind: ClusterRoleBinding metadata: - name: nvidia-dra-driver-openshift-privileged-role-binding - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-openshift-privileged-role-binding-kubeletplugin subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator roleRef: kind: ClusterRole name: system:openshift:scc:privileged diff --git a/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml b/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml new file mode 100644 index 000000000..f68f16763 --- /dev/null +++ b/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 
nvidia-dra-driver-clusterrole-binding-kubeletplugin +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator +roleRef: + kind: ClusterRole + name: nvidia-dra-driver-clusterrole-kubeletplugin + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0350_kubeletplugin-role.yaml b/assets/state-dra-driver/0350_kubeletplugin-role.yaml new file mode 100644 index 000000000..b1356333e --- /dev/null +++ b/assets/state-dra-driver/0350_kubeletplugin-role.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nvidia-dra-driver-role-kubeletplugin + namespace: gpu-operator +rules: +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get diff --git a/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml b/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml new file mode 100644 index 000000000..67a83a409 --- /dev/null +++ b/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver-role-binding-kubeletplugin + namespace: gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator +roleRef: + kind: Role + name: nvidia-dra-driver-role-kubeletplugin + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml b/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml new file mode 100644 index 000000000..3a0665b69 --- /dev/null +++ b/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-openshift-anyuid-role-binding +subjects: +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: 
gpu-operator +roleRef: + kind: ClusterRole + name: system:openshift:scc:anyuid + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0500_deployment.yaml b/assets/state-dra-driver/0500_deployment.yaml index 1e5bbd0f0..ca221c498 100644 --- a/assets/state-dra-driver/0500_deployment.yaml +++ b/assets/state-dra-driver/0500_deployment.yaml @@ -2,42 +2,97 @@ apiVersion: apps/v1 kind: Deployment metadata: name: nvidia-dra-driver-controller - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver app: nvidia-dra-driver-controller spec: replicas: 1 selector: matchLabels: - app: nvidia-dra-driver-controller + nvidia-dra-driver-component: controller template: metadata: labels: + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver + nvidia-dra-driver-component: controller app: nvidia-dra-driver-controller spec: priorityClassName: system-node-critical - serviceAccountName: nvidia-dra-driver + serviceAccountName: nvidia-dra-driver-controller + securityContext: {} containers: - - name: compute-domains - image: "FILLED BY THE OPERATOR" - command: ["compute-domain-controller", "-v", "6"] - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + - name: compute-domain + securityContext: {} + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - compute-domain-controller + - "-v" + - "$(LOG_VERBOSITY)" + resources: {} + env: + - name: HTTP_ENDPOINT + value: ":8080" + - name: METRICS_PATH + value: "/metrics" + - name: PPROF_PATH + value: '' + - name: LOG_VERBOSITY + value: '4' + - name: LOG_VERBOSITY_CD_DAEMON + value: '4' + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: 
metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: FILLED BY THE OPERATOR + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: LEADER_ELECTION_ENABLED + value: 'false' + - name: LEADER_ELECTION_LEASE_LOCK_NAME + value: nvidia-dra-driver-controller + - name: LEADER_ELECTION_LEASE_LOCK_NAMESPACE + value: gpu-operator + - name: LEADER_ELECTION_LEASE_DURATION + value: 15s + - name: LEADER_ELECTION_RENEW_DEADLINE + value: 10s + - name: LEADER_ELECTION_RETRY_PERIOD + value: 2s affinity: nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: "node-role.kubernetes.io/control-plane" - operator: "Exists" + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + nvidia-dra-driver-component: controller + topologyKey: kubernetes.io/hostname + weight: 100 tolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - key: CriticalAddonsOnly + operator: Exists diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml deleted file mode 100644 index 495d5ca2c..000000000 --- a/assets/state-dra-driver/0600_configmap.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: nvidia-dra-driver-kubelet-plugin-entrypoint - namespace: "FILLED BY THE OPERATOR" - labels: - app: nvidia-dra-driver-kubelet-plugin -data: - entrypoint.sh: |- - #!/bin/bash - - if [ "$#" -ne 1 ]; then - echo "Usage: $0 COMMAND" - exit 1 - fi - - entrypoint=$1 - - until [ -f 
/run/nvidia/validations/driver-ready ] - do - echo "waiting for the driver validations to be ready..." - sleep 5 - done - - set -o allexport - cat /run/nvidia/validations/driver-ready - . /run/nvidia/validations/driver-ready - - # Conditionally mask the params file to prevent this container from - # recreating any missing GPU device nodes. This is necessary, for - # example, when running under nvkind to limit the set GPUs governed - # by the plugin even though it has cgroup access to all of them. - if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then - cp /proc/driver/nvidia/params root/gpu-params - sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params - mount --bind root/gpu-params /proc/driver/nvidia/params - fi - - echo "Starting the NVIDIA DRA Driver Kubelet Plugin" - exec $entrypoint diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml index 440d2d8e2..5cc064255 100644 --- a/assets/state-dra-driver/0700_daemonset.yaml +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -2,176 +2,252 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: nvidia-dra-driver-kubelet-plugin - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver app: nvidia-dra-driver-kubelet-plugin spec: selector: matchLabels: - app: nvidia-dra-driver-kubelet-plugin + nvidia-dra-driver-component: kubelet-plugin + updateStrategy: + rollingUpdate: + maxUnavailable: 100% + type: RollingUpdate template: metadata: labels: + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver + nvidia-dra-driver-component: kubelet-plugin app: nvidia-dra-driver-kubelet-plugin spec: - nodeSelector: - nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: "true" priorityClassName: 
system-node-critical - serviceAccountName: nvidia-dra-driver + serviceAccountName: nvidia-dra-driver-kubeletplugin + securityContext: {} initContainers: - - name: driver-validation - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - command: ['sh', '-c'] - args: ["nvidia-validator"] - env: - - name: WITH_WAIT - value: "true" - - name: COMPONENT - value: driver - - name: OPERATOR_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: driver-install-dir - mountPath: /run/nvidia/driver - mountPropagation: HostToContainer - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - - name: host-dev-char - mountPath: /host-dev-char + - name: init-container + image: FILLED BY THE OPERATOR + securityContext: + privileged: true + command: + - bash + - "/usr/bin/kubelet-plugin-prestart.sh" + env: + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + volumeMounts: + - name: driver-root-parent + mountPath: "/driver-root-parent" + mountPropagation: HostToContainer containers: - - name: compute-domains - securityContext: - privileged: true - image: "FILLED BY THE OPERATOR" - # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of - # the NVIDIA distroless base image, which does not have bash, a statically compiled - # bash is added to the final image at /bin/bash. 
- command: ["/bin/bash", "-c"] - args: - - /bin/entrypoint.sh "compute-domain-kubelet-plugin -v 6" - env: - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: CDI_ROOT - value: /var/run/cdi - - name: NVIDIA_MIG_CONFIG_DEVICES - value: all - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - volumeMounts: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - readOnly: true - mountPath: /bin/entrypoint.sh - subPath: entrypoint.sh - - name: plugins-registry - mountPath: /var/lib/kubelet/plugins_registry - - name: plugins - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - - name: cdi - mountPath: /var/run/cdi - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: driver-install-dir - mountPath: /driver-root - readOnly: true - mountPropagation: HostToContainer - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - - name: gpus - securityContext: - privileged: true - image: "FILLED BY THE OPERATOR" - # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of - # the NVIDIA distroless base image, which does not have bash, a statically compiled - # bash is added to the final image at /bin/bash. 
- command: ["/bin/bash", "-c"] - args: - - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" - env: - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: CDI_ROOT - value: /var/run/cdi - - name: NVIDIA_MIG_CONFIG_DEVICES - value: all - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - volumeMounts: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - readOnly: true - mountPath: /bin/entrypoint.sh - subPath: entrypoint.sh - - name: plugins-registry - mountPath: /var/lib/kubelet/plugins_registry - - name: plugins - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - - name: cdi - mountPath: /var/run/cdi - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: driver-install-dir - mountPath: /driver-root - readOnly: true - mountPropagation: HostToContainer - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - volumes: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - configMap: - name: nvidia-dra-driver-kubelet-plugin-entrypoint - defaultMode: 448 + - name: compute-domains + securityContext: + privileged: true + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - bash + - "-c" + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. 
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params /root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' /root/gpu-params + mount --bind /root/gpu-params /proc/driver/nvidia/params + fi + compute-domain-kubelet-plugin -v $(LOG_VERBOSITY) + resources: {} + startupProbe: + grpc: + port: 51515 + service: liveness + failureThreshold: 600 + periodSeconds: 1 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: 51515 + service: liveness + failureThreshold: 3 + periodSeconds: 10 + timeoutSeconds: 10 + env: + - name: HTTP_ENDPOINT + value: ":8081" + - name: METRICS_PATH + value: "/metrics" + - name: LOG_VERBOSITY + value: '4' + - name: MASK_NVIDIA_DRIVER_PARAMS + value: '' + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: "/var/run/cdi" + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + - name: HEALTHCHECK_PORT + value: '51515' + volumeMounts: - name: plugins-registry - hostPath: - path: /var/lib/kubelet/plugins_registry + mountPath: "/var/lib/kubelet/plugins_registry" - name: plugins - hostPath: - path: /var/lib/kubelet/plugins + mountPath: "/var/lib/kubelet/plugins" + mountPropagation: Bidirectional - name: cdi - hostPath: - path: /var/run/cdi - - name: run-nvidia-validations - hostPath: - path: /run/nvidia/validations - type: DirectoryOrCreate - - name: driver-install-dir - hostPath: - path: /run/nvidia/driver - type: DirectoryOrCreate - - name: host-root - hostPath: - path: / - - name: host-dev-char - hostPath: - path: /dev/char + mountPath: "/var/run/cdi" + - name: driver-root + mountPath: "/driver-root" + readOnly: true 
+ mountPropagation: HostToContainer + - name: gpus + securityContext: + privileged: true + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - bash + - "-c" + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. + if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params /root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' /root/gpu-params + mount --bind /root/gpu-params /proc/driver/nvidia/params + fi + gpu-kubelet-plugin -v $(LOG_VERBOSITY) + resources: {} + startupProbe: + grpc: + port: 51516 + service: liveness + failureThreshold: 600 + periodSeconds: 1 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: 51516 + service: liveness + failureThreshold: 3 + periodSeconds: 30 + timeoutSeconds: 10 + env: + - name: HTTP_ENDPOINT + value: ":8080" + - name: METRICS_PATH + value: "/metrics" + - name: LOG_VERBOSITY + value: '4' + - name: MASK_NVIDIA_DRIVER_PARAMS + value: '' + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: "/var/run/cdi" + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: FILLED BY THE OPERATOR + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + - name: HEALTHCHECK_PORT + value: '51516' + volumeMounts: + - name: plugins-registry + mountPath: "/var/lib/kubelet/plugins_registry" + - name: plugins + mountPath: "/var/lib/kubelet/plugins" + mountPropagation: Bidirectional + - name: cdi 
+ mountPath: "/var/run/cdi" + - name: driver-root + mountPath: "/driver-root" + mountPropagation: HostToContainer + volumes: + - name: plugins-registry + hostPath: + path: "/var/lib/kubelet/plugins_registry" + - name: plugins + hostPath: + path: "/var/lib/kubelet/plugins" + - name: cdi + hostPath: + path: "/var/run/cdi" + - name: driver-root-parent + hostPath: + path: "/run/nvidia" + type: DirectoryOrCreate + - name: driver-root + hostPath: + path: "/run/nvidia/driver" + type: DirectoryOrCreate + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - 'true' + - matchExpressions: + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - NVIDIA + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - 'true' + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: 'true' diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 22eeca601..ffb600fb1 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -259,7 +259,7 @@ spec: - name: gdrcopy-image image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc - name: dra-driver-image - image: nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9 + image: us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia/dra-driver-nvidia-gpu:v26.4.0-dev-bef400ef customresourcedefinitions: owned: - name: nvidiadrivers.nvidia.com @@ -1055,7 +1055,7 @@ spec: - name: "GDRCOPY_IMAGE" value: 
"nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc" - name: "DRA_DRIVER_IMAGE" - value: "nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9" + value: "us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia/dra-driver-nvidia-gpu:v26.4.0-dev-bef400ef" terminationGracePeriodSeconds: 10 serviceAccountName: gpu-operator strategy: deployment diff --git a/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml b/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml new file mode 100644 index 000000000..b3eddef71 --- /dev/null +++ b/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomaincliques.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomainClique + listKind: ComputeDomainCliqueList + plural: computedomaincliques + singular: computedomainclique + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: |- + ComputeDomainClique holds information about a specific clique within a ComputeDomain. + It is created in the driver namespace and named as ".". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + daemons: + items: + description: ComputeDomainDaemonInfo provides information about each + daemon in a ComputeDomainClique. 
+ properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + nodeName: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. + enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - nodeName + type: object + type: array + x-kubernetes-list-map-keys: + - nodeName + x-kubernetes-list-type: map + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + type: object + served: true + storage: true diff --git a/bundle/manifests/resource.nvidia.com_computedomains.yaml b/bundle/manifests/resource.nvidia.com_computedomains.yaml index 307b21ff7..5a28ae17c 100644 --- a/bundle/manifests/resource.nvidia.com_computedomains.yaml +++ b/bundle/manifests/resource.nvidia.com_computedomains.yaml @@ -14,91 +14,149 @@ spec: singular: computedomain scope: Namespaced versions: - - name: v1beta1 - schema: - openAPIV3Schema: - description: ComputeDomain prepares a set of nodes to run a multi-node workload - in. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ComputeDomainSpec provides the spec for a ComputeDomain. - properties: - channel: - description: ComputeDomainChannelSpec provides the spec for a channel - used to run a workload inside a ComputeDomain. - properties: - resourceClaimTemplate: - description: ComputeDomainResourceClaimTemplate provides the details - of the ResourceClaimTemplate to generate. 
- properties: - name: - type: string - required: - - name - type: object - required: - - resourceClaimTemplate - type: object - numNodes: - type: integer - required: - - channel - - numNodes - type: object - x-kubernetes-validations: - - message: A computeDomain.spec is immutable - rule: self == oldSelf - status: - description: ComputeDomainStatus provides the status for a ComputeDomain. - properties: - nodes: - items: - description: ComputeDomainNode provides information about each node - added to a ComputeDomain. + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + allocationMode: + default: Single + description: |- + Allows for requesting all IMEX channels (the maximum per IMEX domain) or + precisely one. + enum: + - All + - Single + type: string + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. 
properties: - cliqueID: - type: string - ipAddress: - type: string name: type: string required: - - cliqueID - - ipAddress - - name - type: object - type: array - x-kubernetes-list-map-keys: - name - x-kubernetes-list-type: map - status: - default: NotReady - enum: - - Ready - - NotReady - type: string - required: - - status - type: object - type: object - served: true - storage: true - subresources: - status: {} + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`). In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for its peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. 
+ type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload however should not rely on inspecting this field at any point + during its lifecycle. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. + properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + name: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. 
+ enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 29ffb9bb3..5c3c4c38e 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -374,11 +374,11 @@ func ServiceAccounts(n ClusterPolicyController) (gpuv1.State, error) { return status, nil } -// Role creates Role resource -func Role(n ClusterPolicyController) (gpuv1.State, error) { +// createRole creates a Role resource +func createRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].Role.DeepCopy() + obj := n.resources[state].Roles[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("Role", obj.Name, "Namespace", obj.Namespace) @@ -415,6 +415,22 @@ func Role(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// Role creates one or more Role resources +func Role(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].Roles { + stat, err := createRole(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createRoleBinding creates a RoleBinding resource func createRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx @@ -1814,6 +1830,16 @@ func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.Cluste return err } + for i := range obj.Spec.Template.Spec.InitContainers { + if obj.Spec.Template.Spec.InitContainers[i].Name != "init-container" { + continue + } + 
obj.Spec.Template.Spec.InitContainers[i].Image = image + obj.Spec.Template.Spec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy) + } + + transformDRADriverRoot(obj, config) + var containers []corev1.Container for i, container := range obj.Spec.Template.Spec.Containers { // Skip the container if the resource type is not enabled. @@ -1866,6 +1892,35 @@ func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.Cluste return nil } +func transformDRADriverRoot(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) { + driverRoot := config.HostPaths.DriverInstallDir + if driverRoot == "" || driverRoot == DefaultDriverInstallDir { + return + } + + driverRootParent := "/" + if driverRoot != "/" { + driverRootParent = filepath.Dir(strings.TrimRight(driverRoot, "/")) + } + + for i := range obj.Spec.Template.Spec.InitContainers { + setContainerEnv(&obj.Spec.Template.Spec.InitContainers[i], "NVIDIA_DRIVER_ROOT", driverRoot) + } + + for i := range obj.Spec.Template.Spec.Containers { + setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "NVIDIA_DRIVER_ROOT", driverRoot) + } + + for i := range obj.Spec.Template.Spec.Volumes { + switch obj.Spec.Template.Spec.Volumes[i].Name { + case "driver-root": + obj.Spec.Template.Spec.Volumes[i].HostPath.Path = driverRoot + case "driver-root-parent": + obj.Spec.Template.Spec.Volumes[i].HostPath.Path = driverRootParent + } + } +} + // TransformDCGMExporter transforms dcgm exporter daemonset with required config as per ClusterPolicy func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update validation container @@ -4221,14 +4276,14 @@ func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.D func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { var computeDomainsCtr *corev1.Container for i, ctr := range obj.Spec.Template.Spec.Containers { - if 
ctr.Name == "compute-domains" { + if ctr.Name == "compute-domain" { computeDomainsCtr = &obj.Spec.Template.Spec.Containers[i] break } } if computeDomainsCtr == nil { - return fmt.Errorf("failed to find 'compute-domains' container") + return fmt.Errorf("failed to find 'compute-domain' container") } config := spec.DRADriver diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2ae7497ba..409f7ab6c 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -47,7 +47,7 @@ type assetsFromFile []byte // Resources indicates resources managed by GPU operator type Resources struct { ServiceAccounts []corev1.ServiceAccount - Role rbacv1.Role + Roles []rbacv1.Role RoleBindings []rbacv1.RoleBinding ClusterRoles []rbacv1.ClusterRole ClusterRoleBindings []rbacv1.ClusterRoleBinding @@ -130,9 +130,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c ctrl = append(ctrl, ServiceAccounts) } case "Role": - _, _, err := s.Decode(m, nil, &res.Role) + role := rbacv1.Role{} + _, _, err := s.Decode(m, nil, &role) panicIfError(err) - ctrl = append(ctrl, Role) + res.Roles = append(res.Roles, role) + // only add the ctrl function when the first Role is added for this component + if len(res.Roles) == 1 { + ctrl = append(ctrl, Role) + } case "RoleBinding": roleBinding := rbacv1.RoleBinding{} _, _, err := s.Decode(m, nil, &roleBinding) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 407f32754..0b700bcbf 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -4833,6 +4833,50 @@ func TestTransformDRADriverKubeletPlugin(t *testing.T) { }, }), }, + { + description: "custom driver root updates init container and volumes", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "init-container"}). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithHostPathVolume("driver-root", DefaultDriverInstallDir, ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("driver-root-parent", "/run/nvidia", ptr.To(corev1.HostPathDirectoryOrCreate)), + cpSpec: &gpuv1.ClusterPolicySpec{ + HostPaths: gpuv1.HostPathsSpec{DriverInstallDir: "/opt/nvidia/driver"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(). + WithInitContainer(corev1.Container{ + Name: "init-container", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "NVIDIA_DRIVER_ROOT", Value: "/opt/nvidia/driver"}, + }, + }). + WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "NVIDIA_DRIVER_ROOT", Value: "/opt/nvidia/driver"}, + {Name: NvidiaCTKPathEnvName, Value: "toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + }, + }). + WithHostPathVolume("driver-root", "/opt/nvidia/driver", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("driver-root-parent", "/opt/nvidia", ptr.To(corev1.HostPathDirectoryOrCreate)), + }, { description: "gpus disabled, compute domains disabled", ds: NewDaemonset(). @@ -4889,7 +4933,7 @@ func TestTransformDRADriverController(t *testing.T) { { description: "full dra driver spec", deployment: NewDeployment(). 
- WithContainer(corev1.Container{Name: "compute-domains"}), + WithContainer(corev1.Container{Name: "compute-domain"}), cpSpec: &gpuv1.ClusterPolicySpec{ DRADriver: gpuv1.DRADriverSpec{ Repository: "nvcr.io/nvidia", @@ -4932,7 +4976,7 @@ func TestTransformDRADriverController(t *testing.T) { }, }). WithContainer(corev1.Container{ - Name: "compute-domains", + Name: "compute-domain", Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml new file mode 100644 index 000000000..b3eddef71 --- /dev/null +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomaincliques.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomainClique + listKind: ComputeDomainCliqueList + plural: computedomaincliques + singular: computedomainclique + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: |- + ComputeDomainClique holds information about a specific clique within a ComputeDomain. + It is created in the driver namespace and named as ".". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + daemons: + items: + description: ComputeDomainDaemonInfo provides information about each + daemon in a ComputeDomainClique. 
+ properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + nodeName: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. + enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - nodeName + type: object + type: array + x-kubernetes-list-map-keys: + - nodeName + x-kubernetes-list-type: map + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + type: object + served: true + storage: true diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml index 307b21ff7..5a28ae17c 100644 --- a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml @@ -14,91 +14,149 @@ spec: singular: computedomain scope: Namespaced versions: - - name: v1beta1 - schema: - openAPIV3Schema: - description: ComputeDomain prepares a set of nodes to run a multi-node workload - in. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ComputeDomainSpec provides the spec for a ComputeDomain. - properties: - channel: - description: ComputeDomainChannelSpec provides the spec for a channel - used to run a workload inside a ComputeDomain. - properties: - resourceClaimTemplate: - description: ComputeDomainResourceClaimTemplate provides the details - of the ResourceClaimTemplate to generate. 
- properties: - name: - type: string - required: - - name - type: object - required: - - resourceClaimTemplate - type: object - numNodes: - type: integer - required: - - channel - - numNodes - type: object - x-kubernetes-validations: - - message: A computeDomain.spec is immutable - rule: self == oldSelf - status: - description: ComputeDomainStatus provides the status for a ComputeDomain. - properties: - nodes: - items: - description: ComputeDomainNode provides information about each node - added to a ComputeDomain. + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + allocationMode: + default: Single + description: |- + Allows for requesting all IMEX channels (the maximum per IMEX domain) or + precisely one. + enum: + - All + - Single + type: string + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. 
properties: - cliqueID: - type: string - ipAddress: - type: string name: type: string required: - - cliqueID - - ipAddress - - name - type: object - type: array - x-kubernetes-list-map-keys: - name - x-kubernetes-list-type: map - status: - default: NotReady - enum: - - Ready - - NotReady - type: string - required: - - status - type: object - type: object - served: true - storage: true - subresources: - status: {} + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`). In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for its peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. 
+ type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload however should not rely on inspecting this field at any point + during its lifecycle. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. + properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + name: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. 
+ enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployments/gpu-operator/templates/cleanup_crd.yaml b/deployments/gpu-operator/templates/cleanup_crd.yaml index 347563498..f5c93d3f3 100644 --- a/deployments/gpu-operator/templates/cleanup_crd.yaml +++ b/deployments/gpu-operator/templates/cleanup_crd.yaml @@ -41,6 +41,7 @@ spec: - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/templates/upgrade_crd.yaml b/deployments/gpu-operator/templates/upgrade_crd.yaml index ab66ee7d2..8d96eec86 100644 --- a/deployments/gpu-operator/templates/upgrade_crd.yaml +++ b/deployments/gpu-operator/templates/upgrade_crd.yaml @@ -90,6 +90,7 @@ spec: - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 27fb30e63..4c71b33c9 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -281,9 +281,9 @@ devicePlugin: hostNetwork: false draDriver: - repository: ghcr.io/nvidia - 
image: k8s-dra-driver-gpu - version: v25.8.0-dev-124734f2 + repository: us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia + image: dra-driver-nvidia-gpu + version: v26.4.0-dev-bef400ef imagePullPolicy: IfNotPresent imagePullSecrets: [] diff --git a/docker/Dockerfile b/docker/Dockerfile index 121eed9ec..67a4ace72 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -105,6 +105,8 @@ COPY hack/must-gather.sh /usr/bin/gather # Add CRD resource into the image for helm upgrades COPY deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml /opt/gpu-operator/nvidia.com_clusterpolicies.yaml COPY deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml +COPY deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml /opt/gpu-operator/resource.nvidia.com_computedomains.yaml +COPY deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml /opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml COPY deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml /opt/gpu-operator/nfd-api-crds.yaml USER 65532:65532