From fa80ee627ebe90d9664538441bf37d4631174af4 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 4 Jun 2025 09:46:46 -0700 Subject: [PATCH 01/10] Integrate NVIDIA DRA Driver for GPUs as an operand Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 118 +++++++ api/nvidia/v1/zz_generated.deepcopy.go | 123 +++++++ .../0100_service_account.yaml | 5 + assets/state-dra-driver/0200_clusterrole.yaml | 69 ++++ assets/state-dra-driver/0210_role.yaml | 19 + .../0300_clusterrolebinding.yaml | 12 + assets/state-dra-driver/0310_rolebinding.yaml | 13 + ...400_deviceclass-compute-domain-daemon.yaml | 8 + ...eclass-compute-domain-default-channel.yaml | 8 + .../0420_deviceclass-gpu.yaml | 8 + .../0430_deviceclass-mig.yaml | 8 + assets/state-dra-driver/0500_deployment.yaml | 43 +++ assets/state-dra-driver/0600_configmap.yaml | 40 +++ assets/state-dra-driver/0700_daemonset.yaml | 171 +++++++++ ...rator-certified.clusterserviceversion.yaml | 103 ++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ .../resource.nvidia.com_computedomains.yaml | 104 ++++++ .../crd/bases/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ controllers/clusterpolicy_controller.go | 12 + controllers/object_controls.go | 229 +++++++++++- controllers/resource_manager.go | 11 + controllers/state_manager.go | 20 +- controllers/transforms_test.go | 330 ++++++++++++++++++ .../crds/nvidia.com_clusterpolicies.yaml | 231 ++++++++++++ .../resource.nvidia.com_computedomains.yaml | 104 ++++++ .../gpu-operator/templates/cleanup_crd.yaml | 1 + .../gpu-operator/templates/clusterpolicy.yaml | 44 +++ .../gpu-operator/templates/clusterrole.yaml | 79 +++++ deployments/gpu-operator/templates/role.yaml | 1 + .../gpu-operator/templates/upgrade_crd.yaml | 1 + deployments/gpu-operator/values.yaml | 26 ++ 31 files changed, 2385 insertions(+), 18 deletions(-) create mode 100644 assets/state-dra-driver/0100_service_account.yaml create mode 100644 
assets/state-dra-driver/0200_clusterrole.yaml create mode 100644 assets/state-dra-driver/0210_role.yaml create mode 100644 assets/state-dra-driver/0300_clusterrolebinding.yaml create mode 100644 assets/state-dra-driver/0310_rolebinding.yaml create mode 100644 assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml create mode 100644 assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml create mode 100644 assets/state-dra-driver/0420_deviceclass-gpu.yaml create mode 100644 assets/state-dra-driver/0430_deviceclass-mig.yaml create mode 100644 assets/state-dra-driver/0500_deployment.yaml create mode 100644 assets/state-dra-driver/0600_configmap.yaml create mode 100644 assets/state-dra-driver/0700_daemonset.yaml create mode 100644 bundle/manifests/resource.nvidia.com_computedomains.yaml create mode 100644 deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index d924901e1..bd72cdb2b 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -53,6 +53,8 @@ type ClusterPolicySpec struct { Toolkit ToolkitSpec `json:"toolkit"` // DevicePlugin component spec DevicePlugin DevicePluginSpec `json:"devicePlugin"` + // DRADriver component spec + DRADriver DRADriverSpec `json:"draDriver"` // DCGMExporter spec DCGMExporter DCGMExporterSpec `json:"dcgmExporter"` // DCGM component spec @@ -985,6 +987,104 @@ type SandboxDevicePluginSpec struct { HostNetwork *bool `json:"hostNetwork,omitempty"` } +// DRADriverSpec defines the properties for the NVIDIA DRA Driver deployment +type DRADriverSpec struct { + // NVIDIA DRA Driver image repository + // +kubebuilder:validation:Optional + Repository string `json:"repository,omitempty"` + + // NVIDIA DRA Driver image name + // +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+ + Image string `json:"image,omitempty"` + + // NVIDIA DRA Driver image tag + // 
+kubebuilder:validation:Optional + Version string `json:"version,omitempty"` + + // Image pull policy + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy" + ImagePullPolicy string `json:"imagePullPolicy,omitempty"` + + // Image pull secrets + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" + ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` + + // GPUs defines configuration for GPUs in the NVIDIA DRA Driver + GPUs DRADriverGPUs `json:"gpus,omitempty"` + + // ComputeDomains defines configuration for ComputeDomains in the NVIDIA DRA Driver + ComputeDomains DRADriverComputeDomains `json:"computeDomains,omitempty"` +} + +// DRADriverGPUs defines configuration for GPUs in the NVIDIA DRA Driver +type DRADriverGPUs struct { + // Enabled indicates if GPUs are enabled in the NVIDIA DRA Driver + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GPUs in the NVIDIA DRA Driver" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // KubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin + KubeletPlugin DRADriverKubeletPlugin `json:"kubeletPlugin,omitempty"` +} + +// DRADriverComputeDomains defines configuration for ComputeDomains in the 
NVIDIA DRA Driver +type DRADriverComputeDomains struct { + // Enabled indicates if ComputeDomains are enabled in the NVIDIA DRA Driver + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable ComputeDomains in the NVIDIA DRA Driver" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // Controller defines configuration for the NVIDIA DRA Driver controller + Controller DRADriverController `json:"controller,omitempty"` + + // KubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin + KubeletPlugin DRADriverKubeletPlugin `json:"kubeletPlugin,omitempty"` +} + +// DRADriverController defines configuration for the NVIDIA DRA Driver controller +type DRADriverController struct { + // Optional: List of environment variables + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Env []EnvVar `json:"env,omitempty"` + + // Optional: Define resources requests and limits + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" + Resources *ResourceRequirements `json:"resources,omitempty"` + + // Optional: Set tolerations + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // 
+operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Tolerations" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:io.kubernetes:Tolerations" + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// DRADriverKubeletPlugin defines configuration for the NVIDIA DRA Driver kubelet plugin +type DRADriverKubeletPlugin struct { + // Optional: List of environment variables + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Env []EnvVar `json:"env,omitempty"` + + // Optional: Define resources requests and limits + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" + Resources *ResourceRequirements `json:"resources,omitempty"` +} + // DCGMExporterSpec defines the properties for NVIDIA DCGM Exporter deployment type DCGMExporterSpec struct { // Enabled indicates if deployment of NVIDIA DCGM Exporter through operator is enabled @@ -2079,6 +2179,9 @@ func ImagePath(spec interface{}) (string, error) { case *SandboxDevicePluginSpec: config := spec.(*SandboxDevicePluginSpec) return imagePath(config.Repository, config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE") + case *DRADriverSpec: + config := spec.(*DRADriverSpec) + return imagePath(config.Repository, config.Image, config.Version, "DRA_DRIVER_IMAGE") case *DCGMExporterSpec: config := 
spec.(*DCGMExporterSpec) return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE") @@ -2194,6 +2297,21 @@ func (p *DevicePluginSpec) IsEnabled() bool { return *p.Enabled } +// IsEnabled returns true if the DRA Driver is enabled through gpu-operator +func (d *DRADriverSpec) IsEnabled() bool { + return d.IsGPUsEnabled() || d.IsComputeDomainsEnabled() +} + +// IsGPUsEnabled returns true if the GPUs resource is enabled in the DRA Driver +func (d *DRADriverSpec) IsGPUsEnabled() bool { + return d.GPUs.Enabled != nil && *d.GPUs.Enabled +} + +// IsComputeDomainsEnabled returns true if the ComputeDomains resource is enabled in the DRA Driver +func (d *DRADriverSpec) IsComputeDomainsEnabled() bool { + return d.ComputeDomains.Enabled != nil && *d.ComputeDomains.Enabled +} + // IsEnabled returns true if dcgm-exporter is enabled(default) through gpu-operator func (e *DCGMExporterSpec) IsEnabled() bool { if e.Enabled == nil { diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index e04a3570e..e2f162719 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -191,6 +191,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.Driver.DeepCopyInto(&out.Driver) in.Toolkit.DeepCopyInto(&out.Toolkit) in.DevicePlugin.DeepCopyInto(&out.DevicePlugin) + in.DRADriver.DeepCopyInto(&out.DRADriver) in.DCGMExporter.DeepCopyInto(&out.DCGMExporter) in.DCGM.DeepCopyInto(&out.DCGM) in.NodeStatusExporter.DeepCopyInto(&out.NodeStatusExporter) @@ -486,6 +487,128 @@ func (in *DCGMSpec) DeepCopy() *DCGMSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DRADriverComputeDomains) DeepCopyInto(out *DRADriverComputeDomains) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + in.Controller.DeepCopyInto(&out.Controller) + in.KubeletPlugin.DeepCopyInto(&out.KubeletPlugin) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverComputeDomains. +func (in *DRADriverComputeDomains) DeepCopy() *DRADriverComputeDomains { + if in == nil { + return nil + } + out := new(DRADriverComputeDomains) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverController) DeepCopyInto(out *DRADriverController) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]corev1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverController. +func (in *DRADriverController) DeepCopy() *DRADriverController { + if in == nil { + return nil + } + out := new(DRADriverController) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverGPUs) DeepCopyInto(out *DRADriverGPUs) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + in.KubeletPlugin.DeepCopyInto(&out.KubeletPlugin) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverGPUs. 
+func (in *DRADriverGPUs) DeepCopy() *DRADriverGPUs { + if in == nil { + return nil + } + out := new(DRADriverGPUs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverKubeletPlugin) DeepCopyInto(out *DRADriverKubeletPlugin) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverKubeletPlugin. +func (in *DRADriverKubeletPlugin) DeepCopy() *DRADriverKubeletPlugin { + if in == nil { + return nil + } + out := new(DRADriverKubeletPlugin) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DRADriverSpec) DeepCopyInto(out *DRADriverSpec) { + *out = *in + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.GPUs.DeepCopyInto(&out.GPUs) + in.ComputeDomains.DeepCopyInto(&out.ComputeDomains) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverSpec. +func (in *DRADriverSpec) DeepCopy() *DRADriverSpec { + if in == nil { + return nil + } + out := new(DRADriverSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DaemonsetsSpec) DeepCopyInto(out *DaemonsetsSpec) { *out = *in diff --git a/assets/state-dra-driver/0100_service_account.yaml b/assets/state-dra-driver/0100_service_account.yaml new file mode 100644 index 000000000..76d6d61af --- /dev/null +++ b/assets/state-dra-driver/0100_service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0200_clusterrole.yaml b/assets/state-dra-driver/0200_clusterrole.yaml new file mode 100644 index 000000000..e2052e9a6 --- /dev/null +++ b/assets/state-dra-driver/0200_clusterrole.yaml @@ -0,0 +1,69 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - deviceclasses + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0210_role.yaml b/assets/state-dra-driver/0210_role.yaml new file mode 100644 index 000000000..62e336e3d --- /dev/null +++ b/assets/state-dra-driver/0210_role.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - apps + resources: + 
- daemonsets + - deployments + verbs: + - get + - list + - watch + - create + - update + - patch + - delete diff --git a/assets/state-dra-driver/0300_clusterrolebinding.yaml b/assets/state-dra-driver/0300_clusterrolebinding.yaml new file mode 100644 index 000000000..ea4f6a5e4 --- /dev/null +++ b/assets/state-dra-driver/0300_clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvidia-dra-driver +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvidia-dra-driver +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0310_rolebinding.yaml b/assets/state-dra-driver/0310_rolebinding.yaml new file mode 100644 index 000000000..bf893a63c --- /dev/null +++ b/assets/state-dra-driver/0310_rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nvidia-dra-driver +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml b/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml new file mode 100644 index 000000000..e8d6ac997 --- /dev/null +++ b/assets/state-dra-driver/0400_deviceclass-compute-domain-daemon.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: compute-domain-daemon.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'" diff --git a/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml b/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml new file mode 100644 index 
000000000..737404ccb --- /dev/null +++ b/assets/state-dra-driver/0410_deviceclass-compute-domain-default-channel.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: compute-domain-default-channel.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0" diff --git a/assets/state-dra-driver/0420_deviceclass-gpu.yaml b/assets/state-dra-driver/0420_deviceclass-gpu.yaml new file mode 100644 index 000000000..7c65e3762 --- /dev/null +++ b/assets/state-dra-driver/0420_deviceclass-gpu.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: gpu.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" diff --git a/assets/state-dra-driver/0430_deviceclass-mig.yaml b/assets/state-dra-driver/0430_deviceclass-mig.yaml new file mode 100644 index 000000000..0188ca08f --- /dev/null +++ b/assets/state-dra-driver/0430_deviceclass-mig.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1beta1 +kind: DeviceClass +metadata: + name: mig.nvidia.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'" diff --git a/assets/state-dra-driver/0500_deployment.yaml b/assets/state-dra-driver/0500_deployment.yaml new file mode 100644 index 000000000..1e5bbd0f0 --- /dev/null +++ b/assets/state-dra-driver/0500_deployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nvidia-dra-driver-controller + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-controller +spec: + replicas: 1 + selector: + matchLabels: + app: nvidia-dra-driver-controller + template: + metadata: + labels: + app: nvidia-dra-driver-controller + spec: + priorityClassName: 
system-node-critical + serviceAccountName: nvidia-dra-driver + containers: + - name: compute-domains + image: "FILLED BY THE OPERATOR" + command: ["compute-domain-controller", "-v", "6"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml new file mode 100644 index 000000000..6ffda9d28 --- /dev/null +++ b/assets/state-dra-driver/0600_configmap.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-dra-driver-kubelet-plugin-entrypoint + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-kubelet-plugin +data: + entrypoint.sh: |- + #!/bin/sh + + if [ "$#" -ne 1 ]; then + echo "Usage: $0 COMMAND" + exit 1 + fi + + entrypoint=$1 + + until [ -f /run/nvidia/validations/driver-ready ] + do + echo "waiting for the driver validations to be ready..." + sleep 5 + done + + set -o allexport + cat /run/nvidia/validations/driver-ready + . /run/nvidia/validations/driver-ready + + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. 
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params + mount --bind root/gpu-params /proc/driver/nvidia/params + fi + + echo "Starting the NVIDIA DRA Driver Kubelet Plugin" + exec $entrypoint diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml new file mode 100644 index 000000000..a829281a6 --- /dev/null +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -0,0 +1,171 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-dra-driver-kubelet-plugin + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-dra-driver-kubelet-plugin +spec: + selector: + matchLabels: + app: nvidia-dra-driver-kubelet-plugin + template: + metadata: + labels: + app: nvidia-dra-driver-kubelet-plugin + spec: + nodeSelector: + nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: "true" + priorityClassName: system-node-critical + serviceAccountName: nvidia-dra-driver + initContainers: + - name: driver-validation + image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + command: ['sh', '-c'] + args: ["nvidia-validator"] + env: + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: driver + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: driver-install-dir + mountPath: /run/nvidia/driver + mountPropagation: HostToContainer + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: host-dev-char + mountPath: /host-dev-char + containers: + - name: compute-domains + securityContext: + privileged: true + image: "FILLED BY THE OPERATOR" + command: ["/bin/sh", "-c"] + args: + - /bin/entrypoint.sh 
"compute-domain-kubelet-plugin -v 6" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + readOnly: true + mountPath: /bin/entrypoint.sh + subPath: entrypoint.sh + - name: plugins-registry + mountPath: /var/lib/kubelet/plugins_registry + - name: plugins + mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: driver-install-dir + mountPath: /driver-root + readOnly: true + mountPropagation: HostToContainer + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: gpus + securityContext: + privileged: true + image: "FILLED BY THE OPERATOR" + command: ["/bin/sh", "-c"] + args: + - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: /var/run/cdi + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + readOnly: true + mountPath: /bin/entrypoint.sh + subPath: entrypoint.sh + - name: plugins-registry + mountPath: /var/lib/kubelet/plugins_registry + - name: plugins + mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + - name: cdi + mountPath: /var/run/cdi + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: driver-install-dir + mountPath: /driver-root + readOnly: true + mountPropagation: 
HostToContainer + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + volumes: + - name: nvidia-dra-driver-kubelet-plugin-entrypoint + configMap: + name: nvidia-dra-driver-kubelet-plugin-entrypoint + defaultMode: 448 + - name: plugins-registry + hostPath: + path: /var/lib/kubelet/plugins_registry + - name: plugins + hostPath: + path: /var/lib/kubelet/plugins + - name: cdi + hostPath: + path: /var/run/cdi + - name: run-nvidia-validations + hostPath: + path: /run/nvidia/validations + type: DirectoryOrCreate + - name: driver-install-dir + hostPath: + path: /run/nvidia/driver + type: DirectoryOrCreate + - name: host-root + hostPath: + path: / + - name: host-dev-char + hostPath: + path: /dev/char diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 32a25668f..22eeca601 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -98,6 +98,25 @@ metadata: "maxUnavailable": "1" } }, + "draDriver": { + "gpus": { + "enabled": false, + "kubeletPlugin": {} + }, + "computeDomains": { + "enabled": false, + "controller": { + "tolerations": [ + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Exists", + "effect": "NoSchedule" + } + ] + }, + "kubeletPlugin": {} + } + }, "devicePlugin": { "enabled": true, "config": { @@ -239,6 +258,8 @@ spec: image: nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2@sha256:24892b0ee0ca924d3c644648e9f0e0fa80d238e2fb681b21913f32fd0af9cde7 - name: gdrcopy-image image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc + - name: dra-driver-image + image: nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9 customresourcedefinitions: owned: - name: nvidiadrivers.nvidia.com @@ -748,6 +769,7 @@ 
spec: - apps resources: - daemonsets + - deployments verbs: - get - list @@ -775,6 +797,84 @@ spec: - update - patch - delete + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.nvidia.com + resources: + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - deviceclasses + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update permissions: - serviceAccountName: gpu-operator rules: @@ -803,6 +903,7 @@ spec: - apps resources: - daemonsets + - deployments verbs: - create - get @@ -953,6 +1054,8 @@ spec: value: "nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2@sha256:24892b0ee0ca924d3c644648e9f0e0fa80d238e2fb681b21913f32fd0af9cde7" - name: "GDRCOPY_IMAGE" value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc" + - name: "DRA_DRIVER_IMAGE" + value: "nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9" terminationGracePeriodSeconds: 10 serviceAccountName: gpu-operator strategy: deployment diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- 
a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + type: object + enabled: + description: Enabled indicates if ComputeDomains are enabled + in the NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/bundle/manifests/resource.nvidia.com_computedomains.yaml b/bundle/manifests/resource.nvidia.com_computedomains.yaml new file mode 100644 index 000000000..307b21ff7 --- /dev/null +++ b/bundle/manifests/resource.nvidia.com_computedomains.yaml @@ -0,0 +1,104 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomains.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomain + listKind: ComputeDomainList + plural: computedomains + singular: computedomain + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. + properties: + name: + type: string + required: + - name + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: ComputeDomainStatus provides the status for a ComputeDomain. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. 
+ properties: + cliqueID: + type: string + ipAddress: + type: string + name: + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. 
By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + enabled: + description: Enabled indicates if ComputeDomains are enabled + in the NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445..b92023270 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -121,6 +121,18 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } + if instance.Spec.DevicePlugin.IsEnabled() && instance.Spec.DRADriver.IsGPUsEnabled() { + err = fmt.Errorf("the device-plugin and dra driver for GPUs cannot both be enabled") + condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()) + if condErr != nil { + r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error()) + } + if clusterPolicyCtrl.operatorMetrics != nil { + clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) + } + return ctrl.Result{}, err + } + if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil { r.Log.Error(err, "unable to initialize ClusterPolicy controller") if condErr := r.conditionUpdater.SetConditionsError(ctx, 
instance, conditions.ReconcileFailed, err.Error()); condErr != nil { diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1f8806fcc..84987bd3f 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -36,6 +36,7 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" nodev1beta1 "k8s.io/api/node/v1beta1" + resourceapi "k8s.io/api/resource/v1beta1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -151,6 +152,8 @@ const ( NvidiaCtrRuntimeCDIPrefixesEnvName = "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES" // CDIEnabledEnvName is the name of the envvar used to enable CDI in the operands CDIEnabledEnvName = "CDI_ENABLED" + // NvidiaCTKPathEnvName is the name of the envvar specifying the path to the 'nvidia-ctk' binary + NvidiaCTKPathEnvName = "NVIDIA_CTK_PATH" // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH" // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration @@ -703,19 +706,20 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error "nvidia-vgpu-device-manager": TransformVGPUDeviceManager, "nvidia-vfio-manager": TransformVFIOManager, "nvidia-container-toolkit-daemonset": TransformToolkit, + "nvidia-dra-driver-kubelet-plugin": TransformDRADriverKubeletPlugin, "nvidia-device-plugin-daemonset": TransformDevicePlugin, "nvidia-device-plugin-mps-control-daemon": TransformMPSControlDaemon, "nvidia-sandbox-device-plugin-daemonset": TransformSandboxDevicePlugin, "nvidia-kata-sandbox-device-plugin-daemonset": TransformKataDevicePlugin, - "nvidia-dcgm": TransformDCGM, - "nvidia-dcgm-exporter": TransformDCGMExporter, - "nvidia-node-status-exporter": TransformNodeStatusExporter, - 
"gpu-feature-discovery": TransformGPUDiscoveryPlugin, - "nvidia-mig-manager": TransformMIGManager, - "nvidia-operator-validator": TransformValidator, - "nvidia-sandbox-validator": TransformSandboxValidator, - "nvidia-kata-manager": TransformKataManager, - "nvidia-cc-manager": TransformCCManager, + "nvidia-dcgm": TransformDCGM, + "nvidia-dcgm-exporter": TransformDCGMExporter, + "nvidia-node-status-exporter": TransformNodeStatusExporter, + "gpu-feature-discovery": TransformGPUDiscoveryPlugin, + "nvidia-mig-manager": TransformMIGManager, + "nvidia-operator-validator": TransformValidator, + "nvidia-sandbox-validator": TransformSandboxValidator, + "nvidia-kata-manager": TransformKataManager, + "nvidia-cc-manager": TransformCCManager, } t, ok := transformations[obj.Name] @@ -1736,6 +1740,74 @@ func TransformKataDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return nil } +// TransformDRADriverKubeletPlugin transforms nvidia-dra-driver-kubelet-plugin daemonset with required config as per ClusterPolicy +func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + err := transformValidationInitContainer(obj, config) + if err != nil { + return err + } + + if len(config.DRADriver.ImagePullSecrets) > 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.DRADriver.ImagePullSecrets) + } + + image, err := gpuv1.ImagePath(&config.DRADriver) + if err != nil { + return err + } + + var containers []corev1.Container + for i, container := range obj.Spec.Template.Spec.Containers { + // Skip the container if the resource type is not enabled. + // As a result, the container will be removed from the spec. 
+ if (container.Name == "gpus" && !config.DRADriver.IsGPUsEnabled()) || + (container.Name == "compute-domains" && !config.DRADriver.IsComputeDomainsEnabled()) { + continue + } + + obj.Spec.Template.Spec.Containers[i].Image = image + obj.Spec.Template.Spec.Containers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy) + + if config.Toolkit.IsEnabled() { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), NvidiaCTKPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-ctk")) + } + + // update the "gpus" container + if container.Name == "gpus" { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), "IMAGE_NAME", image) + if len(config.DRADriver.GPUs.KubeletPlugin.Env) > 0 { + for _, env := range config.DRADriver.GPUs.KubeletPlugin.Env { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value) + } + } + + if config.DRADriver.GPUs.KubeletPlugin.Resources != nil { + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.GPUs.KubeletPlugin.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.GPUs.KubeletPlugin.Resources.Limits + } + } + + // update the "compute-domains" container + if container.Name == "compute-domains" { + if len(config.DRADriver.ComputeDomains.KubeletPlugin.Env) > 0 { + for _, env := range config.DRADriver.ComputeDomains.KubeletPlugin.Env { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[i]), env.Name, env.Value) + } + } + + if config.DRADriver.ComputeDomains.KubeletPlugin.Resources != nil { + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.ComputeDomains.KubeletPlugin.Resources.Limits + } + } + + containers = append(containers, obj.Spec.Template.Spec.Containers[i]) + } + obj.Spec.Template.Spec.Containers = containers + + return nil +} + // TransformDCGMExporter 
transforms dcgm exporter daemonset with required config as per ClusterPolicy func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update validation container @@ -4087,17 +4159,76 @@ func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.D return hash, nil } +// TransformDRADriverController transforms nvidia-dra-driver-controller deployment with required config as per ClusterPolicy +func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { + var computeDomainsCtr *corev1.Container + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == "compute-domains" { + computeDomainsCtr = &obj.Spec.Template.Spec.Containers[i] + break + } + } + + if computeDomainsCtr == nil { + return fmt.Errorf("failed to find 'compute-domains' container") + } + + config := spec.DRADriver + image, err := gpuv1.ImagePath(&config) + if err != nil { + return err + } + + computeDomainsCtr.Image = image + setContainerEnv(computeDomainsCtr, "IMAGE_NAME", image) + + computeDomainsCtr.ImagePullPolicy = gpuv1.ImagePullPolicy(config.ImagePullPolicy) + + if len(config.ImagePullSecrets) > 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.ImagePullSecrets) + } + + if len(config.ComputeDomains.Controller.Tolerations) > 0 { + obj.Spec.Template.Spec.Tolerations = append(obj.Spec.Template.Spec.Tolerations, config.ComputeDomains.Controller.Tolerations...) 
+ } + + if len(config.ComputeDomains.Controller.Env) > 0 { + for _, env := range config.ComputeDomains.Controller.Env { + setContainerEnv(computeDomainsCtr, env.Name, env.Value) + } + } + + if config.ComputeDomains.Controller.Resources != nil { + computeDomainsCtr.Resources.Requests = config.ComputeDomains.Controller.Resources.Requests + computeDomainsCtr.Resources.Limits = config.ComputeDomains.Controller.Resources.Limits + } + + return nil +} + +func transformDeployment(obj *appsv1.Deployment, n ClusterPolicyController) error { + logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace) + switch obj.Name { + case "nvidia-dra-driver-controller": + return TransformDRADriverController(obj, &n.singleton.Spec) + default: + logger.Info("No transformation for object") + return nil + } +} + // Deployment creates Deployment resource func Deployment(n ClusterPolicyController) (gpuv1.State, error) { ctx := n.ctx state := n.idx + stateName := n.stateNames[state] obj := n.resources[state].Deployment.DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace) // Check if state is disabled and cleanup resource if exists - if !n.isStateEnabled(n.stateNames[n.idx]) { + if !n.isStateEnabled(stateName) || (obj.Name == "nvidia-dra-driver-controller" && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) { err := n.client.Delete(ctx, obj) if err != nil && !apierrors.IsNotFound(err) { logger.Info("Couldn't delete", "Error", err) @@ -4106,6 +4237,11 @@ func Deployment(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + if err := transformDeployment(obj, n); err != nil { + logger.Info("Failed to transform Deployment", "Error", err) + return gpuv1.NotReady, err + } + if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { return gpuv1.NotReady, err } @@ -5362,3 +5498,76 @@ func clearRuntimeClasses(n ClusterPolicyController, 
runtimeClasses []nodev1.Runt } return nil } + +func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) (gpuv1.State, error) { + ctx := n.ctx + state := n.idx + obj := spec.DeepCopy() + + logger := n.logger.WithValues("DeviceClass", obj.Name) + + // Check if state is disabled and cleanup resource if exists + if !n.isStateEnabled(n.stateNames[state]) || + (strings.Contains(obj.Name, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || + (obj.Name == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || + (obj.Name == "mig.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) { + err := n.client.Delete(ctx, obj) + if err != nil && !apierrors.IsNotFound(err) { + logger.Info("Couldn't delete", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Disabled, nil + } + + if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { + return gpuv1.NotReady, err + } + + found := &resourceapi.DeviceClass{} + err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found) + if err != nil && apierrors.IsNotFound(err) { + logger.Info("Not found, creating...") + err = n.client.Create(ctx, obj) + if err != nil { + logger.Info("Couldn't create", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Ready, nil + } else if err != nil { + return gpuv1.NotReady, err + } + + logger.Info("Found Resource, updating...") + obj.ResourceVersion = found.ResourceVersion + + err = n.client.Update(ctx, obj) + if err != nil { + logger.Info("Couldn't update", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Ready, nil +} + +// DeviceClasses creates DeviceClass objects +func DeviceClasses(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + + for _, obj := range n.resources[state].DeviceClasses { + obj := obj + stat, err := createDeviceClass(n, obj) + if err != nil { + return stat, err + } + + switch stat { + case 
gpuv1.Ready: + continue + case gpuv1.Disabled: + continue + default: + status = gpuv1.NotReady + } + } + return status, nil +} diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2789bfe3d..2582143ab 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -28,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" rbacv1 "k8s.io/api/rbac/v1" + resourceapi "k8s.io/api/resource/v1beta1" schedv1 "k8s.io/api/scheduling/v1beta1" secv1 "github.com/openshift/api/security/v1" @@ -61,6 +62,7 @@ type Resources struct { SecurityContextConstraints secv1.SecurityContextConstraints RuntimeClasses []nodev1.RuntimeClass PrometheusRule promv1.PrometheusRule + DeviceClasses []resourceapi.DeviceClass } func filePathWalkDir(n *ClusterPolicyController, root string) ([]string, error) { @@ -180,6 +182,15 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c _, _, err := s.Decode(m, nil, &res.PrometheusRule) panicIfError(err) ctrl = append(ctrl, PrometheusRule) + case "DeviceClass": + deviceClass := resourceapi.DeviceClass{} + _, _, err := s.Decode(m, nil, &deviceClass) + panicIfError(err) + res.DeviceClasses = append(res.DeviceClasses, deviceClass) + // only add the ctrl function when the first DeviceClass is added + if len(res.DeviceClasses) == 1 { + ctrl = append(ctrl, DeviceClasses) + } default: n.logger.Info("Unknown Resource", "Manifest", m, "Kind", kind) } diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..b9b19acee 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -87,14 +87,15 @@ var ( var gpuStateLabels = map[string]map[string]string{ gpuWorkloadConfigContainer: { - "nvidia.com/gpu.deploy.driver": "true", - "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", - "nvidia.com/gpu.deploy.container-toolkit": "true", - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm": "true", 
- "nvidia.com/gpu.deploy.dcgm-exporter": "true", - "nvidia.com/gpu.deploy.node-status-exporter": "true", - "nvidia.com/gpu.deploy.operator-validator": "true", + "nvidia.com/gpu.deploy.driver": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dra-driver-kubelet-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", }, gpuWorkloadConfigVMPassthrough: { "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", @@ -892,6 +893,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-container-toolkit") addState(n, "/opt/gpu-operator/state-operator-validation") addState(n, "/opt/gpu-operator/state-device-plugin") + addState(n, "/opt/gpu-operator/state-dra-driver") addState(n, "/opt/gpu-operator/state-mps-control-daemon") addState(n, "/opt/gpu-operator/state-dcgm") addState(n, "/opt/gpu-operator/state-dcgm-exporter") @@ -1141,6 +1143,8 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { return true case "state-operator-metrics": return true + case "state-dra-driver": + return clusterPolicySpec.DRADriver.IsEnabled() default: n.logger.Error(nil, "invalid state passed", "stateName", stateName) return false diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 64af9f93d..407f32754 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -193,6 +193,36 @@ func (d Daemonset) WithVolume(volume corev1.Volume) Daemonset { return d } +// _Deployment is a Deployment wrapper used for testing +type _Deployment struct { + *appsv1.Deployment +} + +func NewDeployment() _Deployment { + deployment := &appsv1.Deployment{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "test-deployment", + Namespace: "test-ns", + }, + Spec: appsv1.DeploymentSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{}, + }, + }, + } + return _Deployment{deployment} +} + +func (d _Deployment) WithContainer(container corev1.Container) _Deployment { + d.Spec.Template.Spec.Containers = append(d.Spec.Template.Spec.Containers, container) + return d +} + +func (d _Deployment) WithTolerations(tolerations []corev1.Toleration) _Deployment { + d.Spec.Template.Spec.Tolerations = tolerations + return d +} + // Pod is a Pod wrapper used for testing type Pod struct { *corev1.Pod @@ -4635,3 +4665,303 @@ func TestHashDriverInstallConfigZeroFieldInvariant(t *testing.T) { assert.NotEqual(t, originalDigest, changedDigest, "a non-zero new field should change the digest") } + +func TestTransformDRADriverKubeletPlugin(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + errorExpected bool + }{ + { + description: "empty dra driver spec", + ds: NewDaemonset(), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{}, + }, + expectedDs: NewDaemonset(), + errorExpected: true, + }, + { + description: "full dra driver spec, gpus and compute domains enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + KubeletPlugin: gpuv1.DRADriverKubeletPlugin{ + Env: []gpuv1.EnvVar{{Name: "foo", Value: "bar"}}, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }, + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + KubeletPlugin: gpuv1.DRADriverKubeletPlugin{ + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }, + }, + }, + }, + expectedDs: NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }). + WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }), + }, + { + description: "gpus enabled, compute domains disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). + WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + }, + }), + }, + { + description: "gpus disabled, compute domains enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). + WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{InstallDir: "/usr/local/nvidia"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(false), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + }, + }, + }, + expectedDs: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: NvidiaCTKPathEnvName, Value: "/usr/local/nvidia/toolkit/nvidia-ctk"}, + }, + }), + }, + { + description: "gpus disabled, compute domains disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(false), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDRADriverKubeletPlugin(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{runtime: gpuv1.Containerd, logger: ctrl.Log.WithName("test")}) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + +func TestTransformDRADriverController(t *testing.T) { + testCases := []struct { + description string + deployment _Deployment + cpSpec *gpuv1.ClusterPolicySpec + expectedDeployment _Deployment + errorExpected bool + }{ + { + description: "empty dra driver spec", + deployment: NewDeployment(), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{}, + }, + expectedDeployment: NewDeployment(), + errorExpected: true, + }, + { + description: "full dra driver spec", + deployment: NewDeployment(). 
+ WithContainer(corev1.Container{Name: "compute-domains"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(true), + Controller: gpuv1.DRADriverController{ + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + Resources: &gpuv1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + Tolerations: []corev1.Toleration{ + { + Key: "foo", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + }, + expectedDeployment: NewDeployment(). + WithTolerations([]corev1.Toleration{ + { + Key: "foo", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }). 
+ WithContainer(corev1.Container{ + Name: "compute-domains", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + {Name: "foo", Value: "bar"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + }, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDRADriverController(tc.deployment.Deployment, tc.cpSpec) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tc.expectedDeployment, tc.deployment) + }) + } +} diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 868fe9379..add8948ee 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -897,6 +897,236 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object + draDriver: + description: DRADriver component spec + properties: + computeDomains: + description: ComputeDomains defines configuration for ComputeDomains + in the NVIDIA DRA Driver + properties: + controller: + description: Controller defines configuration for the NVIDIA + DRA Driver controller + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. 
+                        Valid operators are Exists and Equal.
+                        Defaults to Equal.
+                        Exists is equivalent to wildcard for value, so that a pod can
+                        tolerate all taints of a particular category.
+                      type: string
+                    tolerationSeconds:
+                      description: |-
+                        TolerationSeconds represents the period of time the toleration (which must be
+                        of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
+                        it is not set, which means tolerate the taint forever (do not evict). Zero and
+                        negative values will be treated as 0 (evict immediately) by the system.
+                      format: int64
+                      type: integer
+                    value:
+                      description: |-
+                        Value is the taint value the toleration matches to.
+                        If the operator is Exists, the value should be empty, otherwise just a regular string.
+                      type: string
+                  type: object
+                type: array
+            type: object
+            enabled:
+              description: Enabled indicates if ComputeDomains are enabled
+                in the NVIDIA DRA Driver
+              type: boolean
+            kubeletPlugin:
+              description: KubeletPlugin defines configuration for the NVIDIA
+                DRA Driver kubelet plugin
+              properties:
+                env:
+                  description: 'Optional: List of environment variables'
+                  items:
+                    description: EnvVar represents an environment variable
+                      present in a Container.
+                    properties:
+                      name:
+                        description: Name of the environment variable.
+                        type: string
+                      value:
+                        description: Value of the environment variable.
+                        type: string
+                    required:
+                    - name
+                    type: object
+                  type: array
+                resources:
+                  description: 'Optional: Define resources requests and
+                    limits'
+                  properties:
+                    limits:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      description: |-
+                        Limits describes the maximum amount of compute resources allowed.
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + gpus: + description: GPUs defines configuration for GPUs in the NVIDIA + DRA Driver + properties: + enabled: + description: Enabled indicates if GPUs are enabled in the + NVIDIA DRA Driver + type: boolean + kubeletPlugin: + description: KubeletPlugin defines configuration for the NVIDIA + DRA Driver kubelet plugin + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable + present in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + resources: + description: 'Optional: Define resources requests and + limits' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + type: object + image: + description: NVIDIA DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DRA Driver image repository + type: string + version: + description: NVIDIA DRA Driver image tag + type: string + type: object driver: description: Driver component spec properties: @@ -2883,6 +3113,7 @@ spec: - dcgm - dcgmExporter - devicePlugin + - draDriver - driver - gfd - nodeStatusExporter diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml new file mode 100644 index 000000000..307b21ff7 --- /dev/null +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml @@ -0,0 +1,104 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomains.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomain + listKind: ComputeDomainList + plural: computedomains + singular: 
computedomain + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. + properties: + name: + type: string + required: + - name + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: ComputeDomainStatus provides the status for a ComputeDomain. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. 
+ properties: + cliqueID: + type: string + ipAddress: + type: string + name: + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployments/gpu-operator/templates/cleanup_crd.yaml b/deployments/gpu-operator/templates/cleanup_crd.yaml index 0d426f952..347563498 100644 --- a/deployments/gpu-operator/templates/cleanup_crd.yaml +++ b/deployments/gpu-operator/templates/cleanup_crd.yaml @@ -40,6 +40,7 @@ spec: - delete - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index d96a1f03f..c4442e646 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -518,6 +518,50 @@ spec: {{- if .Values.devicePlugin.hostNetwork }} hostNetwork: {{ .Values.devicePlugin.hostNetwork }} {{- end }} + draDriver: + {{- if .Values.draDriver.repository }} + repository: {{ .Values.draDriver.repository }} + {{- end }} + {{- if .Values.draDriver.image }} + image: {{ .Values.draDriver.image }} + {{- end }} + {{- if .Values.draDriver.version }} + version: {{ .Values.draDriver.version | quote }} + {{- end }} + {{- if .Values.draDriver.imagePullPolicy }} + imagePullPolicy: {{ .Values.draDriver.imagePullPolicy }} + {{- end }} + {{- if .Values.draDriver.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.draDriver.imagePullSecrets | nindent 6 }} + {{- end }} + gpus: + 
enabled: {{ .Values.draDriver.gpus.enabled }} + kubeletPlugin: + {{- if .Values.draDriver.gpus.kubeletPlugin.env }} + env: {{ toYaml .Values.draDriver.gpus.kubeletPlugin.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.gpus.kubeletPlugin.resources }} + resources: {{ toYaml .Values.draDriver.gpus.kubeletPlugin.resources | nindent 8 }} + {{- end }} + computeDomains: + enabled: {{ .Values.draDriver.computeDomains.enabled }} + controller: + {{- if .Values.draDriver.computeDomains.controller.env }} + env: {{ toYaml .Values.draDriver.computeDomains.controller.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.controller.resources }} + resources: {{ toYaml .Values.draDriver.computeDomains.controller.resources | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.controller.tolerations }} + tolerations: {{ toYaml .Values.draDriver.computeDomains.controller.tolerations | nindent 8 }} + {{- end }} + kubeletPlugin: + {{- if .Values.draDriver.computeDomains.kubeletPlugin.env }} + env: {{ toYaml .Values.draDriver.computeDomains.kubeletPlugin.env | nindent 8 }} + {{- end }} + {{- if .Values.draDriver.computeDomains.kubeletPlugin.resources }} + resources: {{ toYaml .Values.draDriver.computeDomains.kubeletPlugin.resources | nindent 8 }} + {{- end }} dcgm: enabled: {{ .Values.dcgm.enabled }} {{- if .Values.dcgm.repository }} diff --git a/deployments/gpu-operator/templates/clusterrole.yaml b/deployments/gpu-operator/templates/clusterrole.yaml index 2af291e22..3bc02222a 100644 --- a/deployments/gpu-operator/templates/clusterrole.yaml +++ b/deployments/gpu-operator/templates/clusterrole.yaml @@ -97,6 +97,7 @@ rules: - apps resources: - daemonsets + - deployments verbs: - get - list @@ -153,3 +154,81 @@ rules: {{- if .Values.operator.cleanupCRD }} - delete {{- end }} +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - 
resource.nvidia.com + resources: + - computedomains/status + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - deviceclasses + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims/status + verbs: + - update diff --git a/deployments/gpu-operator/templates/role.yaml b/deployments/gpu-operator/templates/role.yaml index dc4674c57..2837ce435 100644 --- a/deployments/gpu-operator/templates/role.yaml +++ b/deployments/gpu-operator/templates/role.yaml @@ -32,6 +32,7 @@ rules: - apps resources: - daemonsets + - deployments verbs: - create - get diff --git a/deployments/gpu-operator/templates/upgrade_crd.yaml b/deployments/gpu-operator/templates/upgrade_crd.yaml index e887b3a81..ab66ee7d2 100644 --- a/deployments/gpu-operator/templates/upgrade_crd.yaml +++ b/deployments/gpu-operator/templates/upgrade_crd.yaml @@ -89,6 +89,7 @@ spec: - apply - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 9dba0c80c..06483c601 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -280,6 +280,32 @@ devicePlugin: root: 
"/run/nvidia/mps" hostNetwork: false +draDriver: + repository: nvcr.io/nvidia + image: k8s-dra-driver-gpu + version: v25.3.0-rc.4 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + + gpus: + enabled: false + kubeletPlugin: + env: [] + resources: {} + + computeDomains: + enabled: false + controller: + env: [] + resources: {} + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + kubeletPlugin: + env: [] + resources: {} + # standalone dcgm hostengine dcgm: # disabled by default to use embedded nv-hostengine by exporter From 4167fae6751362bcf9a3babc049c68dfbee81e7f Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 18 Jul 2025 15:45:55 -0700 Subject: [PATCH 02/10] Handle clusters were DRA is not supported or enabled Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_controller.go | 13 +------ controllers/clusterpolicy_validator.go | 43 ++++++++++++++++++++ controllers/state_manager.go | 52 ++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 controllers/clusterpolicy_validator.go diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index b92023270..6db0ecebe 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -121,20 +121,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } - if instance.Spec.DevicePlugin.IsEnabled() && instance.Spec.DRADriver.IsGPUsEnabled() { - err = fmt.Errorf("the device-plugin and dra driver for GPUs cannot both be enabled") - condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()) - if condErr != nil { - r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error()) - } - if clusterPolicyCtrl.operatorMetrics != nil { - clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) - } - return 
ctrl.Result{}, err - } - if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil { r.Log.Error(err, "unable to initialize ClusterPolicy controller") + updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady) if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { r.Log.Error(condErr, "failed to set condition") } diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go new file mode 100644 index 000000000..2f630a9e4 --- /dev/null +++ b/controllers/clusterpolicy_validator.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controllers + +import ( + "fmt" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" +) + +func (n *ClusterPolicyController) validateClusterPolicy() error { + err := validateDRA(n.singleton, n.draSupported) + if err != nil { + return fmt.Errorf("failed to validate DRA: %w", err) + } + return nil +} + +func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { + if !draSupported && clusterpolicy.Spec.DRADriver.IsEnabled() { + return fmt.Errorf("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster") + } + + if clusterpolicy.Spec.DevicePlugin.IsEnabled() && clusterpolicy.Spec.DRADriver.IsGPUsEnabled() { + return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") + } + + return nil +} diff --git a/controllers/state_manager.go b/controllers/state_manager.go index b9b19acee..781e9ac72 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -162,6 +162,7 @@ type ClusterPolicyController struct { currentKernelVersion string k8sVersion string + draSupported bool openshift string ocpDriverToolkit OpenShiftDriverToolkit @@ -226,6 +227,38 @@ func KubernetesVersion() (string, error) { return info.GitVersion, nil } +// IsDRASupported checks if Dynamic Resource Allocation is enabled in the Kubernetes cluster +// by checking if the 'DeviceClass' resource is a valid Kind. 
+func IsDRASupported(logger logr.Logger) (bool, error) { + cfg := config.GetConfigOrDie() + discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) + if err != nil { + return false, fmt.Errorf("error building discovery client: %w", err) + } + + apiResourceLists, err := discoveryClient.ServerPreferredResources() + if err != nil { + return false, fmt.Errorf("error getting API resources from discovery client: %w", err) + } + + var matches []string + kind := "DeviceClass" + for _, resourceList := range apiResourceLists { + for _, resource := range resourceList.APIResources { + if resource.Kind == kind { + matches = append(matches, resourceList.GroupVersion) + } + } + } + + draSupported := len(matches) > 0 + if draSupported { + logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(matches, ", "))) + } + + return len(matches) > 0, nil +} + // GetClusterWideProxy returns cluster wide proxy object setup in OCP func GetClusterWideProxy(ctx context.Context) (*apiconfigv1.Proxy, error) { cfg := config.GetConfigOrDie() @@ -884,6 +917,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } + draSupported, err := IsDRASupported(n.logger) + if err != nil { + return fmt.Errorf("failed to detect if DRA is supported: %w", err) + } + n.draSupported = draSupported + n.operatorMetrics = initOperatorMetrics() n.logger.Info("Operator metrics initialized.") @@ -893,14 +932,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-container-toolkit") addState(n, "/opt/gpu-operator/state-operator-validation") addState(n, "/opt/gpu-operator/state-device-plugin") - addState(n, "/opt/gpu-operator/state-dra-driver") addState(n, "/opt/gpu-operator/state-mps-control-daemon") addState(n, "/opt/gpu-operator/state-dcgm") addState(n, "/opt/gpu-operator/state-dcgm-exporter") addState(n, 
"/opt/gpu-operator/gpu-feature-discovery") addState(n, "/opt/gpu-operator/state-mig-manager") addState(n, "/opt/gpu-operator/state-node-status-exporter") - // add sandbox workload states addState(n, "/opt/gpu-operator/state-vgpu-manager") addState(n, "/opt/gpu-operator/state-vgpu-device-manager") addState(n, "/opt/gpu-operator/state-sandbox-validation") @@ -909,6 +946,17 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-kata-device-plugin") addState(n, "/opt/gpu-operator/state-kata-manager") addState(n, "/opt/gpu-operator/state-cc-manager") + + if n.draSupported { + addState(n, "/opt/gpu-operator/state-dra-driver") + } + } + + // TODO: combine this validation logic with the call to + // ValidateClusterPolicySpec() up above + err := n.validateClusterPolicy() + if err != nil { + return fmt.Errorf("ClusterPolicy validation failed: %w", err) } if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() { From 6bf74971398be4b9813cf477fae82006557e1ea6 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Sat, 19 Jul 2025 15:19:29 -0700 Subject: [PATCH 03/10] Add service account for compute-domain-daemon Signed-off-by: Christopher Desiniotis --- ...compute_domain_daemon-service_account.yaml | 5 ++ ...220_compute_domain_daemon-clusterrole.yaml | 17 +++++ ...pute_domain_daemon-clusterrolebinding.yaml | 13 ++++ controllers/object_controls.go | 66 ++++++++++++++++--- controllers/resource_manager.go | 33 +++++++--- 5 files changed, 116 insertions(+), 18 deletions(-) create mode 100644 assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml create mode 100644 assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml create mode 100644 assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml diff --git a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml new file mode 100644 
index 000000000..e4bfe6255 --- /dev/null +++ b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: compute-domain-daemon-service-account + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml new file mode 100644 index 000000000..4b157fa4a --- /dev/null +++ b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-domain-daemon-clusterrole + namespace: "FILLED BY THE OPERATOR" +rules: + - apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - update + - patch diff --git a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml new file mode 100644 index 000000000..5ba739004 --- /dev/null +++ b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-clusterrole-binding + namespace: "FILLED BY THE OPERATOR" +subjects: + - kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: "FILLED BY THE OPERATOR" +roleRef: + kind: ClusterRole + name: compute-domain-daemon-clusterrole + apiGroup: rbac.authorization.k8s.io diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 84987bd3f..57f2dd6f9 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -323,11 +323,11 @@ var SubscriptionPathMap = map[string](MountPathToVolumeSource){ type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error) -// ServiceAccount creates ServiceAccount 
resource -func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) { +// createServiceAccount creates a ServiceAccount resource +func createServiceAccount(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ServiceAccount.DeepCopy() + obj := n.resources[state].ServiceAccounts[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ServiceAccount", obj.Name, "Namespace", obj.Namespace) @@ -358,6 +358,22 @@ func ServiceAccount(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// ServiceAccounts creates one or more ServiceAccount resources +func ServiceAccounts(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ServiceAccounts { + stat, err := createServiceAccount(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // Role creates Role resource func Role(n ClusterPolicyController) (gpuv1.State, error) { ctx := n.ctx @@ -450,11 +466,11 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// ClusterRole creates ClusterRole resource -func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) { +// createClusterRole creates a ClusterRole resource +func createClusterRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ClusterRole.DeepCopy() + obj := n.resources[state].ClusterRoles[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ClusterRole", obj.Name, "Namespace", obj.Namespace) @@ -491,11 +507,27 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// ClusterRoleBinding creates ClusterRoleBinding resource -func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) { +// ClusterRoles 
creates one or more ClusterRole resources +func ClusterRoles(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ClusterRoles { + stat, err := createClusterRole(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + +// createClusterRoleBinding creates a ClusterRoleBinding resource +func createClusterRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].ClusterRoleBinding.DeepCopy() + obj := n.resources[state].ClusterRoleBindings[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("ClusterRoleBinding", obj.Name, "Namespace", obj.Namespace) @@ -536,6 +568,22 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// ClusterRoleBindings creates one or more ClusterRoleBinding resources +func ClusterRoleBindings(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].ClusterRoleBindings { + stat, err := createClusterRoleBinding(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createConfigMap creates a ConfigMap resource func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, error) { ctx := n.ctx diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2582143ab..e8acbe4e8 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -46,11 +46,11 @@ type assetsFromFile []byte // Resources indicates resources managed by GPU operator type Resources struct { - ServiceAccount corev1.ServiceAccount + ServiceAccounts []corev1.ServiceAccount Role rbacv1.Role RoleBinding rbacv1.RoleBinding - ClusterRole rbacv1.ClusterRole - 
ClusterRoleBinding rbacv1.ClusterRoleBinding + ClusterRoles []rbacv1.ClusterRole + ClusterRoleBindings []rbacv1.ClusterRoleBinding ConfigMaps []corev1.ConfigMap DaemonSet appsv1.DaemonSet Deployment appsv1.Deployment @@ -121,9 +121,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c switch kind { case "ServiceAccount": - _, _, err := s.Decode(m, nil, &res.ServiceAccount) + serviceAccount := corev1.ServiceAccount{} + _, _, err := s.Decode(m, nil, &serviceAccount) panicIfError(err) - ctrl = append(ctrl, ServiceAccount) + res.ServiceAccounts = append(res.ServiceAccounts, serviceAccount) + // only add the ctrl function when the first ServiceAccount is added for this component + if len(res.ServiceAccounts) == 1 { + ctrl = append(ctrl, ServiceAccounts) + } case "Role": _, _, err := s.Decode(m, nil, &res.Role) panicIfError(err) @@ -133,13 +138,23 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, RoleBinding) case "ClusterRole": - _, _, err := s.Decode(m, nil, &res.ClusterRole) + clusterRole := rbacv1.ClusterRole{} + _, _, err := s.Decode(m, nil, &clusterRole) panicIfError(err) - ctrl = append(ctrl, ClusterRole) + res.ClusterRoles = append(res.ClusterRoles, clusterRole) + // only add the ctrl function when the first ClusterRole is added for this component + if len(res.ClusterRoles) == 1 { + ctrl = append(ctrl, ClusterRoles) + } case "ClusterRoleBinding": - _, _, err := s.Decode(m, nil, &res.ClusterRoleBinding) + clusterRoleBinding := rbacv1.ClusterRoleBinding{} + _, _, err := s.Decode(m, nil, &clusterRoleBinding) panicIfError(err) - ctrl = append(ctrl, ClusterRoleBinding) + res.ClusterRoleBindings = append(res.ClusterRoleBindings, clusterRoleBinding) + // only add the ctrl function when the first ClusterRoleBinding is added for this component + if len(res.ClusterRoleBindings) == 1 { + ctrl = append(ctrl, ClusterRoleBindings) + } case "ConfigMap": cm := 
corev1.ConfigMap{} _, _, err := s.Decode(m, nil, &cm) From fe9f0829d716d70b1fa01b1fe808065bfb7857e2 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 28 Jul 2025 16:52:47 -0700 Subject: [PATCH 04/10] Allow DRA kubelet-plugin to run privileged on OpenShift Signed-off-by: Christopher Desiniotis --- .../0330_rolebinding.openshift.yaml | 13 +++++++++ controllers/object_controls.go | 28 +++++++++++++------ controllers/resource_manager.go | 11 ++++++-- 3 files changed, 40 insertions(+), 12 deletions(-) create mode 100644 assets/state-dra-driver/0330_rolebinding.openshift.yaml diff --git a/assets/state-dra-driver/0330_rolebinding.openshift.yaml b/assets/state-dra-driver/0330_rolebinding.openshift.yaml new file mode 100644 index 000000000..bb49c649a --- /dev/null +++ b/assets/state-dra-driver/0330_rolebinding.openshift.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver-openshift-privileged-role-binding + namespace: "FILLED BY THE OPERATOR" +subjects: + - kind: ServiceAccount + name: nvidia-dra-driver + namespace: "FILLED BY THE OPERATOR" +roleRef: + kind: ClusterRole + name: system:openshift:scc:privileged + apiGroup: rbac.authorization.k8s.io diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 57f2dd6f9..b4f9622b7 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -415,11 +415,11 @@ func Role(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } -// RoleBinding creates RoleBinding resource -func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { +// createRoleBinding creates a RoleBinding resource +func createRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].RoleBinding.DeepCopy() + obj := n.resources[state].RoleBindings[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("RoleBinding", 
obj.Name, "Namespace", obj.Namespace) @@ -435,12 +435,6 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { } for idx := range obj.Subjects { - // we don't want to update ALL the Subjects[].Namespace, eg we need to keep 'openshift-monitoring' - // for allowing PrometheusOperator to scrape our metrics resources: - // see in assets/state-dcgm-exporter, 0500_prom_rolebinding_openshift.yaml vs 0300_rolebinding.yaml - if obj.Subjects[idx].Namespace != "FILLED BY THE OPERATOR" { - continue - } obj.Subjects[idx].Namespace = n.operatorNamespace } @@ -466,6 +460,22 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// RoleBindings creates one or more RoleBinding resources +func RoleBindings(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].RoleBindings { + stat, err := createRoleBinding(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createClusterRole creates a ClusterRole resource func createClusterRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index e8acbe4e8..c48b12b1b 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -48,7 +48,7 @@ type assetsFromFile []byte type Resources struct { ServiceAccounts []corev1.ServiceAccount Role rbacv1.Role - RoleBinding rbacv1.RoleBinding + RoleBindings []rbacv1.RoleBinding ClusterRoles []rbacv1.ClusterRole ClusterRoleBindings []rbacv1.ClusterRoleBinding ConfigMaps []corev1.ConfigMap @@ -134,9 +134,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, Role) case "RoleBinding": - _, _, err := s.Decode(m, nil, &res.RoleBinding) + roleBinding := rbacv1.RoleBinding{} + _, _, err := s.Decode(m, nil, 
{{- /*
Fail fast on unsupported DRA configurations before any resources render.
The fail messages are prefixed with a newline so they stand apart from
Helm's own error framing.
*/}}
{{- $gpusDRAEnabled := eq .Values.draDriver.gpus.enabled true }}
{{- $computeDomainsDRAEnabled := eq .Values.draDriver.computeDomains.enabled true }}
{{- $draEnabled := or $gpusDRAEnabled $computeDomainsDRAEnabled }}
{{- /* DRA is usable only when the cluster serves a DeviceClass API version we understand. */}}
{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }}

{{- if and (eq .Values.devicePlugin.enabled true) $gpusDRAEnabled }}
{{- fail "\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" }}
{{- end }}

{{- if and $draEnabled (not $clusterSupportsDRA) }}
{{- fail "\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" }}
{{- end }}
Subject: [PATCH 06/10] Conditionally set the apiVersion for all device classes we install Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 28 ++++++++------- controllers/resource_manager.go | 6 ++-- controllers/state_manager.go | 60 ++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b4f9622b7..29ffb9bb3 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -36,10 +36,10 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" nodev1beta1 "k8s.io/api/node/v1beta1" - resourceapi "k8s.io/api/resource/v1beta1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -5557,19 +5557,24 @@ func clearRuntimeClasses(n ClusterPolicyController, runtimeClasses []nodev1.Runt return nil } -func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) (gpuv1.State, error) { +func createDeviceClass(n ClusterPolicyController, spec unstructured.Unstructured) (gpuv1.State, error) { ctx := n.ctx state := n.idx obj := spec.DeepCopy() + deviceClassName := obj.GetName() - logger := n.logger.WithValues("DeviceClass", obj.Name) + logger := n.logger.WithValues("DeviceClass", deviceClassName) + + gvr := n.resourceGVR + apiVersion := gvr.Group + "/" + gvr.Version + obj.SetAPIVersion(apiVersion) // Check if state is disabled and cleanup resource if exists if !n.isStateEnabled(n.stateNames[state]) || - (strings.Contains(obj.Name, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || - (obj.Name == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || - (obj.Name == "mig.nvidia.com" && 
!n.singleton.Spec.DRADriver.IsGPUsEnabled()) { - err := n.client.Delete(ctx, obj) + (strings.Contains(deviceClassName, "compute-domain") && !n.singleton.Spec.DRADriver.IsComputeDomainsEnabled()) || + (deviceClassName == "gpu.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) || + (deviceClassName == "mig.nvidia.com" && !n.singleton.Spec.DRADriver.IsGPUsEnabled()) { + err := n.dynamicClient.Resource(gvr).Delete(ctx, deviceClassName, metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { logger.Info("Couldn't delete", "Error", err) return gpuv1.NotReady, err @@ -5581,11 +5586,10 @@ func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) return gpuv1.NotReady, err } - found := &resourceapi.DeviceClass{} - err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found) + found, err := n.dynamicClient.Resource(gvr).Get(ctx, deviceClassName, metav1.GetOptions{}) if err != nil && apierrors.IsNotFound(err) { logger.Info("Not found, creating...") - err = n.client.Create(ctx, obj) + _, err := n.dynamicClient.Resource(gvr).Create(ctx, obj, metav1.CreateOptions{}) if err != nil { logger.Info("Couldn't create", "Error", err) return gpuv1.NotReady, err @@ -5596,9 +5600,9 @@ func createDeviceClass(n ClusterPolicyController, spec resourceapi.DeviceClass) } logger.Info("Found Resource, updating...") - obj.ResourceVersion = found.ResourceVersion + obj.SetResourceVersion(found.GetResourceVersion()) - err = n.client.Update(ctx, obj) + _, err = n.dynamicClient.Resource(gvr).Update(ctx, obj, metav1.UpdateOptions{}) if err != nil { logger.Info("Couldn't update", "Error", err) return gpuv1.NotReady, err diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index c48b12b1b..2ae7497ba 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -28,8 +28,8 @@ import ( corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" rbacv1 "k8s.io/api/rbac/v1" - resourceapi 
"k8s.io/api/resource/v1beta1" schedv1 "k8s.io/api/scheduling/v1beta1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" secv1 "github.com/openshift/api/security/v1" @@ -62,7 +62,7 @@ type Resources struct { SecurityContextConstraints secv1.SecurityContextConstraints RuntimeClasses []nodev1.RuntimeClass PrometheusRule promv1.PrometheusRule - DeviceClasses []resourceapi.DeviceClass + DeviceClasses []unstructured.Unstructured } func filePathWalkDir(n *ClusterPolicyController, root string) ([]string, error) { @@ -203,7 +203,7 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c panicIfError(err) ctrl = append(ctrl, PrometheusRule) case "DeviceClass": - deviceClass := resourceapi.DeviceClass{} + deviceClass := unstructured.Unstructured{} _, _, err := s.Decode(m, nil, &deviceClass) panicIfError(err) res.DeviceClasses = append(res.DeviceClasses, deviceClass) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 781e9ac72..7d40c0676 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "path/filepath" + "slices" "strconv" "strings" @@ -31,7 +32,9 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/discovery" + "k8s.io/client-go/dynamic" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" @@ -145,7 +148,8 @@ type OpenShiftDriverToolkit struct { // ClusterPolicyController represents clusterpolicy controller spec for GPU operator type ClusterPolicyController struct { - client client.Client + client client.Client + dynamicClient dynamic.Interface ctx context.Context singleton *gpuv1.ClusterPolicy @@ -163,6 +167,7 @@ type ClusterPolicyController struct { k8sVersion string draSupported bool + resourceGVR schema.GroupVersionResource openshift string ocpDriverToolkit 
OpenShiftDriverToolkit @@ -229,34 +234,48 @@ func KubernetesVersion() (string, error) { // IsDRASupported checks if Dynamic Resource Allocation is enabled in the Kubernetes cluster // by checking if the 'DeviceClass' resource is a valid Kind. -func IsDRASupported(logger logr.Logger) (bool, error) { +func IsDRASupported(logger logr.Logger) (bool, schema.GroupVersionResource, error) { + var resourceGVR schema.GroupVersionResource + cfg := config.GetConfigOrDie() discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) if err != nil { - return false, fmt.Errorf("error building discovery client: %w", err) + return false, resourceGVR, fmt.Errorf("error building discovery client: %w", err) } apiResourceLists, err := discoveryClient.ServerPreferredResources() if err != nil { - return false, fmt.Errorf("error getting API resources from discovery client: %w", err) + return false, resourceGVR, fmt.Errorf("error getting API resources from discovery client: %w", err) } - var matches []string + var resourceAPIGroupVersions []string kind := "DeviceClass" for _, resourceList := range apiResourceLists { for _, resource := range resourceList.APIResources { if resource.Kind == kind { - matches = append(matches, resourceList.GroupVersion) + resourceAPIGroupVersions = append(resourceAPIGroupVersions, resourceList.GroupVersion) } } } - draSupported := len(matches) > 0 - if draSupported { - logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(matches, ", "))) + if len(resourceAPIGroupVersions) == 0 { + return false, resourceGVR, nil + } + + logger.Info(fmt.Sprintf("Kind %q exists in the following group/versions: %s", kind, strings.Join(resourceAPIGroupVersions, ", "))) + + switch { + case slices.Contains(resourceAPIGroupVersions, "resource.k8s.io/v1"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1", Resource: "deviceclasses"} + case slices.Contains(resourceAPIGroupVersions, 
"resource.k8s.io/v1beta2"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1beta2", Resource: "deviceclasses"} + case slices.Contains(resourceAPIGroupVersions, "resource.k8s.io/v1beta1"): + resourceGVR = schema.GroupVersionResource{Group: "resource.k8s.io", Version: "v1beta1", Resource: "deviceclasses"} + default: + return false, resourceGVR, fmt.Errorf("failed to determine the GVR to use for the DeviceClass resource") } - return len(matches) > 0, nil + return true, resourceGVR, nil } // GetClusterWideProxy returns cluster wide proxy object setup in OCP @@ -885,6 +904,16 @@ func (n *ClusterPolicyController) getRuntime() error { return nil } +func newDynamicClient() (dynamic.Interface, error) { + cfg := config.GetConfigOrDie() + dynamicClient, err := dynamic.NewForConfig(cfg) + if err != nil { + return nil, err + } + + return dynamicClient, nil +} + func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterPolicyReconciler, clusterPolicy *gpuv1.ClusterPolicy) error { n.singleton = clusterPolicy n.ctx = ctx @@ -893,6 +922,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.client = reconciler.Client n.scheme = reconciler.Scheme + dynamicClient, err := newDynamicClient() + if err != nil { + return fmt.Errorf("failed to get dynamic k8s client: %w", err) + } + n.dynamicClient = dynamicClient + if len(n.controls) == 0 { clusterPolicyCtrl.operatorNamespace = reconciler.Namespace @@ -917,11 +952,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP return fmt.Errorf("error validating clusterpolicy: %w", err) } - draSupported, err := IsDRASupported(n.logger) + draSupported, resourceGVR, err := IsDRASupported(n.logger) if err != nil { return fmt.Errorf("failed to detect if DRA is supported: %w", err) } n.draSupported = draSupported + n.resourceGVR = resourceGVR n.operatorMetrics = initOperatorMetrics() n.logger.Info("Operator metrics initialized.") @@ 
-954,7 +990,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP // TODO: combine this validation logic with the call to // ValidateClusterPolicySpec() up above - err := n.validateClusterPolicy() + err = n.validateClusterPolicy() if err != nil { return fmt.Errorf("ClusterPolicy validation failed: %w", err) } From 885599f70a658756f2cb030876a387a58113ac36 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 18 Sep 2025 11:33:48 -0700 Subject: [PATCH 07/10] Bump to latest dra driver image on HEAD of main Signed-off-by: Christopher Desiniotis --- assets/state-dra-driver/0600_configmap.yaml | 2 +- assets/state-dra-driver/0700_daemonset.yaml | 10 ++++++++-- deployments/gpu-operator/values.yaml | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml index 6ffda9d28..495d5ca2c 100644 --- a/assets/state-dra-driver/0600_configmap.yaml +++ b/assets/state-dra-driver/0600_configmap.yaml @@ -7,7 +7,7 @@ metadata: app: nvidia-dra-driver-kubelet-plugin data: entrypoint.sh: |- - #!/bin/sh + #!/bin/bash if [ "$#" -ne 1 ]; then echo "Usage: $0 COMMAND" diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml index a829281a6..440d2d8e2 100644 --- a/assets/state-dra-driver/0700_daemonset.yaml +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -55,7 +55,10 @@ spec: securityContext: privileged: true image: "FILLED BY THE OPERATOR" - command: ["/bin/sh", "-c"] + # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of + # the NVIDIA distroless base image, which does not have bash, a statically compiled + # bash is added to the final image at /bin/bash. 
+ command: ["/bin/bash", "-c"] args: - /bin/entrypoint.sh "compute-domain-kubelet-plugin -v 6" env: @@ -100,7 +103,10 @@ spec: securityContext: privileged: true image: "FILLED BY THE OPERATOR" - command: ["/bin/sh", "-c"] + # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of + # the NVIDIA distroless base image, which does not have bash, a statically compiled + # bash is added to the final image at /bin/bash. + command: ["/bin/bash", "-c"] args: - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" env: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 06483c601..27fb30e63 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -281,9 +281,9 @@ devicePlugin: hostNetwork: false draDriver: - repository: nvcr.io/nvidia + repository: ghcr.io/nvidia image: k8s-dra-driver-gpu - version: v25.3.0-rc.4 + version: v25.8.0-dev-124734f2 imagePullPolicy: IfNotPresent imagePullSecrets: [] From 066fce0feeae37484de82f5f9a653f2a23ff7596 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 15 Apr 2026 18:05:58 -0700 Subject: [PATCH 08/10] Prevent DRA driver and sandboxWorkloads from both being enabled Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_validator.go | 4 ++++ deployments/gpu-operator/templates/validation.yaml | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go index 2f630a9e4..b8caeb0d0 100644 --- a/controllers/clusterpolicy_validator.go +++ b/controllers/clusterpolicy_validator.go @@ -39,5 +39,9 @@ func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } + if clusterpolicy.Spec.SandboxWorkloads.IsEnabled() && clusterpolicy.Spec.DRADriver.IsEnabled() { + return fmt.Errorf("sandboxWorkloads and the NVIDIA DRA 
driver for GPUs cannot both be enabled in ClusterPolicy") + } + return nil } diff --git a/deployments/gpu-operator/templates/validation.yaml b/deployments/gpu-operator/templates/validation.yaml index 263b29155..7355409f8 100644 --- a/deployments/gpu-operator/templates/validation.yaml +++ b/deployments/gpu-operator/templates/validation.yaml @@ -12,3 +12,9 @@ {{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} {{- fail $error }} {{- end}} + +{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} +{{- fail $error }} +{{- end}} From a269c98e7d66d28a3a0a0fed3c87d96425b9f471 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 15 Apr 2026 18:29:44 -0700 Subject: [PATCH 09/10] chore: consolidate clusterpolicy validation code Signed-off-by: Christopher Desiniotis --- controllers/clusterpolicy_validator.go | 27 +++- controllers/clusterpolicy_validator_test.go | 117 ++++++++++++++++++ controllers/state_manager.go | 19 --- controllers/state_manager_test.go | 54 -------- .../gpu-operator/templates/validation.yaml | 20 --- .../gpu-operator/templates/validations.yaml | 21 ++++ 6 files changed, 159 insertions(+), 99 deletions(-) create mode 100644 controllers/clusterpolicy_validator_test.go delete mode 100644 deployments/gpu-operator/templates/validation.yaml diff --git a/controllers/clusterpolicy_validator.go b/controllers/clusterpolicy_validator.go index b8caeb0d0..931441067 100644 --- a/controllers/clusterpolicy_validator.go +++ b/controllers/clusterpolicy_validator.go @@ -23,23 +23,38 @@ import ( ) func (n *ClusterPolicyController) validateClusterPolicy() error { - err := validateDRA(n.singleton, n.draSupported) - if err != nil { + if err := validateDRA(&n.singleton.Spec, n.draSupported); err != nil { return fmt.Errorf("failed to validate 
DRA: %w", err) } + + if err := validateNRIPlugin(&n.singleton.Spec); err != nil { + return fmt.Errorf("failed to validate the NRI Plugin: %w", err) + } + return nil +} + +func validateNRIPlugin(spec *gpuv1.ClusterPolicySpec) error { + if !spec.CDI.IsEnabled() && spec.CDI.IsNRIPluginEnabled() { + return fmt.Errorf("the NRI Plugin cannot be enabled when CDI is disabled") + } + + if spec.CDI.IsNRIPluginEnabled() && !spec.Toolkit.IsEnabled() { + return fmt.Errorf("the NRI Plugin cannot be enabled when the Container Toolkit is disabled") + } + return nil } -func validateDRA(clusterpolicy *gpuv1.ClusterPolicy, draSupported bool) error { - if !draSupported && clusterpolicy.Spec.DRADriver.IsEnabled() { +func validateDRA(spec *gpuv1.ClusterPolicySpec, draSupported bool) error { + if !draSupported && spec.DRADriver.IsEnabled() { return fmt.Errorf("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster") } - if clusterpolicy.Spec.DevicePlugin.IsEnabled() && clusterpolicy.Spec.DRADriver.IsGPUsEnabled() { + if spec.DevicePlugin.IsEnabled() && spec.DRADriver.IsGPUsEnabled() { return fmt.Errorf("the NVIDIA device plugin and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } - if clusterpolicy.Spec.SandboxWorkloads.IsEnabled() && clusterpolicy.Spec.DRADriver.IsEnabled() { + if spec.SandboxWorkloads.IsEnabled() && spec.DRADriver.IsEnabled() { return fmt.Errorf("sandboxWorkloads and the NVIDIA DRA driver for GPUs cannot both be enabled in ClusterPolicy") } diff --git a/controllers/clusterpolicy_validator_test.go b/controllers/clusterpolicy_validator_test.go new file mode 100644 index 000000000..537d8c52c --- /dev/null +++ b/controllers/clusterpolicy_validator_test.go @@ -0,0 +1,117 @@ +/* + * Copyright (c) NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/require" + "k8s.io/utils/ptr" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" +) + +func TestValidateDRA(t *testing.T) { + tests := []struct { + description string + spec *gpuv1.ClusterPolicySpec + draSupported bool + err error + }{ + { + description: "dra not supported, dra driver not enabled", + spec: &gpuv1.ClusterPolicySpec{}, + }, + { + description: "dra not supported, dra driver enabled", + spec: &gpuv1.ClusterPolicySpec{ + DRADriver: gpuv1.DRADriverSpec{ + GPUs: gpuv1.DRADriverGPUs{ + Enabled: ptr.To(true), + }, + }, + }, + err: errors.New("the NVIDIA DRA driver for GPUs is enabled in ClusterPolicy but Dynamic Resource Allocation is not enabled in the Kubernetes cluster"), + }, + } + + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + err := validateDRA(tc.spec, tc.draSupported) + if tc.err == nil { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Equal(t, tc.err.Error(), err.Error()) + } + }) + } +} + +func TestValidateNRIPlugin(t *testing.T) { + tests := []struct { + description string + spec *gpuv1.ClusterPolicySpec + err error + }{ + { + description: "valid CDI object in spec", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(true), + NRIPluginEnabled: ptr.To(true), + }, + }, + }, + { + description: "invalid CDI object 
in spec", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(false), + NRIPluginEnabled: ptr.To(true), + }, + }, + err: errors.New("the NRI Plugin cannot be enabled when CDI is disabled"), + }, + { + description: "invalid CDI and Toolkit config combination", + spec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: ptr.To(true), + NRIPluginEnabled: ptr.To(true), + }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: ptr.To(false), + }, + }, + err: errors.New("the NRI Plugin cannot be enabled when the Container Toolkit is disabled"), + }, + } + + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + err := validateNRIPlugin(tc.spec) + if tc.err == nil { + require.NoError(t, err) + } else { + require.Error(t, err) + require.Equal(t, tc.err.Error(), err.Error()) + } + }) + } +} diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 7d40c0676..a7cda36f7 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -947,11 +947,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.k8sVersion = k8sVersion n.logger.Info("Kubernetes version detected", "version", k8sVersion) - err = validateClusterPolicySpec(&clusterPolicy.Spec) - if err != nil { - return fmt.Errorf("error validating clusterpolicy: %w", err) - } - draSupported, resourceGVR, err := IsDRASupported(n.logger) if err != nil { return fmt.Errorf("failed to detect if DRA is supported: %w", err) @@ -988,8 +983,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP } } - // TODO: combine this validation logic with the call to - // ValidateClusterPolicySpec() up above err = n.validateClusterPolicy() if err != nil { return fmt.Errorf("ClusterPolicy validation failed: %w", err) @@ -1234,15 +1227,3 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { return false } } - -func validateClusterPolicySpec(spec *gpuv1.ClusterPolicySpec) error { 
- if !spec.CDI.IsEnabled() && spec.CDI.IsNRIPluginEnabled() { - return fmt.Errorf("the NRI Plugin cannot be enabled when CDI is disabled") - } - - if spec.CDI.IsNRIPluginEnabled() && !spec.Toolkit.IsEnabled() { - return fmt.Errorf("the NRI Plugin cannot be enabled when the Container Toolkit is disabled") - } - - return nil -} diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index 6585de196..f8f0f8e3a 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -18,7 +18,6 @@ package controllers import ( "context" - "errors" "testing" "github.com/stretchr/testify/require" @@ -293,59 +292,6 @@ func TestHasMIGCapableGPU(t *testing.T) { } } -func TestValidateClusterPolicySpec(t *testing.T) { - tests := []struct { - description string - spec *gpuv1.ClusterPolicySpec - err error - }{ - { - description: "valid CDI object in spec", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(true), - NRIPluginEnabled: ptr.To(true), - }, - }, - }, - { - description: "invalid CDI object in spec", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(false), - NRIPluginEnabled: ptr.To(true), - }, - }, - err: errors.New("the NRI Plugin cannot be enabled when CDI is disabled"), - }, - { - description: "invalid CDI and Toolkit config combination", - spec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: ptr.To(true), - NRIPluginEnabled: ptr.To(true), - }, - Toolkit: gpuv1.ToolkitSpec{ - Enabled: ptr.To(false), - }, - }, - err: errors.New("the NRI Plugin cannot be enabled when the Container Toolkit is disabled"), - }, - } - - for _, tc := range tests { - t.Run(tc.description, func(t *testing.T) { - err := validateClusterPolicySpec(tc.spec) - if tc.err == nil { - require.NoError(t, err) - } else { - require.Error(t, err) - require.Equal(t, tc.err.Error(), err.Error()) - } - }) - } -} - func TestGetEffectiveStateLabels(t *testing.T) { // 
getEffectiveStateLabels returns labels for workload config and sandbox mode. // For container and vm-vgpu, mode has no effect. For vm-passthrough, mode selects diff --git a/deployments/gpu-operator/templates/validation.yaml b/deployments/gpu-operator/templates/validation.yaml deleted file mode 100644 index 7355409f8..000000000 --- a/deployments/gpu-operator/templates/validation.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- $draEnabled := or (eq .Values.draDriver.gpus.enabled true) (eq .Values.draDriver.computeDomains.enabled true) }} -{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }} - -{{- if and (eq .Values.devicePlugin.enabled true) (eq .Values.draDriver.gpus.enabled true) }} -{{- $error := "" }} -{{- $error = printf "%s\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" $error }} -{{- fail $error }} -{{- end}} - -{{- if and ($draEnabled) (not $clusterSupportsDRA) }} -{{- $error := "" }} -{{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} -{{- fail $error }} -{{- end}} - -{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} -{{- $error := "" }} -{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} -{{- fail $error }} -{{- end}} diff --git a/deployments/gpu-operator/templates/validations.yaml b/deployments/gpu-operator/templates/validations.yaml index 9eaf283f4..8d52f3896 100644 --- a/deployments/gpu-operator/templates/validations.yaml +++ b/deployments/gpu-operator/templates/validations.yaml @@ -5,3 +5,24 @@ {{- if and (eq .Values.cdi.nriPluginEnabled true) (eq .Values.toolkit.enabled false) }} {{ fail "the NRI Plugin cannot be enabled when the Container Toolkit is disabled" }} {{- end }} + +{{- $draEnabled := or (eq .Values.draDriver.gpus.enabled true) (eq 
.Values.draDriver.computeDomains.enabled true) }} +{{- $clusterSupportsDRA := or (.Capabilities.APIVersions.Has "resource.k8s.io/v1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta1/DeviceClass") (.Capabilities.APIVersions.Has "resource.k8s.io/v1beta2/DeviceClass") }} + +{{- if and (eq .Values.devicePlugin.enabled true) (eq .Values.draDriver.gpus.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA device plugin and the NVIDIA DRA Driver for GPUs cannot both be enabled" $error }} +{{- fail $error }} +{{- end }} + +{{- if and ($draEnabled) (not $clusterSupportsDRA) }} +{{- $error := "" }} +{{- $error = printf "%s\nCannot enable the NVIDIA DRA Driver for GPUs on a Kubernetes cluster that does not support DRA" $error }} +{{- fail $error }} +{{- end }} + +{{- if and ($draEnabled) (eq .Values.sandboxWorkloads.enabled true) }} +{{- $error := "" }} +{{- $error = printf "%s\nThe NVIDIA DRA Driver for GPUs and 'sandboxWorkloads' cannot both be enabled" $error }} +{{- fail $error }} +{{- end }} From 48c20796ae5364efe891d6b5e46719026e21d0e1 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Fri, 1 May 2026 13:36:58 -0400 Subject: [PATCH 10/10] Update DRA driver assets for refreshed NVIDIA DRA driver Signed-off-by: Karthik Vetrivel --- .../0100_service_account.yaml | 10 +- ...compute_domain_daemon-service_account.yaml | 2 +- .../0120_kubeletplugin-service_account.yaml | 11 + assets/state-dra-driver/0200_clusterrole.yaml | 117 +++--- assets/state-dra-driver/0210_role.yaml | 38 +- ...220_compute_domain_daemon-clusterrole.yaml | 45 +- .../0230_kubeletplugin-clusterrole.yaml | 49 +++ .../0300_clusterrolebinding.yaml | 14 +- assets/state-dra-driver/0310_rolebinding.yaml | 16 +- ...pute_domain_daemon-clusterrolebinding.yaml | 11 +- .../0330_rolebinding.openshift.yaml | 11 +- ...0340_kubeletplugin-clusterrolebinding.yaml | 12 + .../0350_kubeletplugin-role.yaml | 12 + .../0360_kubeletplugin-rolebinding.yaml | 13 + 
...n_daemon-clusterrolebinding.openshift.yaml | 12 + assets/state-dra-driver/0500_deployment.yaml | 101 +++-- assets/state-dra-driver/0600_configmap.yaml | 40 -- assets/state-dra-driver/0700_daemonset.yaml | 390 +++++++++++------- ...rator-certified.clusterserviceversion.yaml | 4 +- ...ource.nvidia.com_computedomaincliques.yaml | 84 ++++ .../resource.nvidia.com_computedomains.yaml | 224 ++++++---- controllers/object_controls.go | 65 ++- controllers/resource_manager.go | 11 +- controllers/transforms_test.go | 48 ++- ...ource.nvidia.com_computedomaincliques.yaml | 84 ++++ .../resource.nvidia.com_computedomains.yaml | 224 ++++++---- .../gpu-operator/templates/cleanup_crd.yaml | 1 + .../gpu-operator/templates/upgrade_crd.yaml | 1 + deployments/gpu-operator/values.yaml | 6 +- docker/Dockerfile | 2 + 30 files changed, 1134 insertions(+), 524 deletions(-) create mode 100644 assets/state-dra-driver/0120_kubeletplugin-service_account.yaml create mode 100644 assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml create mode 100644 assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml create mode 100644 assets/state-dra-driver/0350_kubeletplugin-role.yaml create mode 100644 assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml create mode 100644 assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml delete mode 100644 assets/state-dra-driver/0600_configmap.yaml create mode 100644 bundle/manifests/resource.nvidia.com_computedomaincliques.yaml create mode 100644 deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml diff --git a/assets/state-dra-driver/0100_service_account.yaml b/assets/state-dra-driver/0100_service_account.yaml index 76d6d61af..b76133aba 100644 --- a/assets/state-dra-driver/0100_service_account.yaml +++ b/assets/state-dra-driver/0100_service_account.yaml @@ -1,5 +1,11 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: 
nvidia-dra-driver-controller + namespace: gpu-operator + labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver diff --git a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml index e4bfe6255..970100254 100644 --- a/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml +++ b/assets/state-dra-driver/0110_compute_domain_daemon-service_account.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: compute-domain-daemon-service-account - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator diff --git a/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml b/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml new file mode 100644 index 000000000..6b8e044fb --- /dev/null +++ b/assets/state-dra-driver/0120_kubeletplugin-service_account.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator + labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver diff --git a/assets/state-dra-driver/0200_clusterrole.yaml b/assets/state-dra-driver/0200_clusterrole.yaml index e2052e9a6..dcfcfc8ab 100644 --- a/assets/state-dra-driver/0200_clusterrole.yaml +++ b/assets/state-dra-driver/0200_clusterrole.yaml @@ -1,69 +1,56 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-clusterrole-controller rules: - - apiGroups: - - resource.nvidia.com - resources: - - computedomains - - computedomains/status - verbs: - - get - 
- list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - resourceclaims - - resourceclaimtemplates - verbs: - - get - - list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - deviceclasses - - resourceslices - verbs: - - get - - list - - watch - - create - - update - - patch - - delete - - apiGroups: - - resource.k8s.io - resources: - - resourceclaims/status - verbs: - - update - - apiGroups: - - "" - resources: - - nodes - verbs: - - get - - list - - watch - - update - - patch - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch + - update +- apiGroups: + - resource.nvidia.com + resources: + - computedomains/status + verbs: + - update +- apiGroups: + - resource.k8s.io + resources: + - resourceclaimtemplates + verbs: + - get + - list + - watch + - create + - update + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - create + - update +- apiGroups: + - '' + resources: + - nodes + verbs: + - get + - list + - watch + - update +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0210_role.yaml b/assets/state-dra-driver/0210_role.yaml index 62e336e3d..37bcc7834 100644 --- a/assets/state-dra-driver/0210_role.yaml +++ b/assets/state-dra-driver/0210_role.yaml @@ -1,19 +1,27 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-controller + namespace: gpu-operator rules: - - apiGroups: - - apps - resources: - - daemonsets - - deployments - verbs: - - get - - list - - watch - - create - - update - - patch - - delete +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - create + - update + - 
patch + - delete +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get + - list + - watch + - update diff --git a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml index 4b157fa4a..7f113506d 100644 --- a/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml +++ b/assets/state-dra-driver/0220_compute_domain_daemon-clusterrole.yaml @@ -1,17 +1,36 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: compute-domain-daemon-clusterrole - namespace: "FILLED BY THE OPERATOR" + name: compute-domain-daemon-role rules: - - apiGroups: - - resource.nvidia.com - resources: - - computedomains - - computedomains/status - verbs: - - get - - list - - watch - - update - - patch +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + - computedomains/status + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get + - list + - watch + - create + - update + - patch +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch + - patch diff --git a/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml b/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml new file mode 100644 index 000000000..11fc91586 --- /dev/null +++ b/assets/state-dra-driver/0230_kubeletplugin-clusterrole.yaml @@ -0,0 +1,49 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-dra-driver-clusterrole-kubeletplugin +rules: +- apiGroups: + - resource.nvidia.com + resources: + - computedomains + verbs: + - get + - list + - watch +- apiGroups: + - resource.k8s.io + resources: + - resourceclaims + verbs: + - get + - list + - watch +- apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - delete +- apiGroups: + - '' + 
resources: + - nodes + verbs: + - get + - list + - watch + - update +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dra-driver/0300_clusterrolebinding.yaml b/assets/state-dra-driver/0300_clusterrolebinding.yaml index ea4f6a5e4..fe0fe0113 100644 --- a/assets/state-dra-driver/0300_clusterrolebinding.yaml +++ b/assets/state-dra-driver/0300_clusterrolebinding.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: nvidia-dra-driver + name: nvidia-dra-driver-clusterrole-binding-controller-gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-controller + namespace: gpu-operator roleRef: - apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: nvidia-dra-driver -subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-clusterrole-controller + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0310_rolebinding.yaml b/assets/state-dra-driver/0310_rolebinding.yaml index bf893a63c..fe41c07bf 100644 --- a/assets/state-dra-driver/0310_rolebinding.yaml +++ b/assets/state-dra-driver/0310_rolebinding.yaml @@ -1,13 +1,13 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-binding-controller + namespace: gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-controller + namespace: gpu-operator roleRef: - apiGroup: rbac.authorization.k8s.io kind: Role - name: nvidia-dra-driver -subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-role-controller + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml index 
5ba739004..b60eca681 100644 --- a/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml +++ b/assets/state-dra-driver/0320_compute_domain_daemon-clusterrolebinding.yaml @@ -1,13 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: compute-domain-daemon-clusterrole-binding - namespace: "FILLED BY THE OPERATOR" + name: compute-domain-daemon-role-binding subjects: - - kind: ServiceAccount - name: compute-domain-daemon-service-account - namespace: "FILLED BY THE OPERATOR" +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: gpu-operator roleRef: kind: ClusterRole - name: compute-domain-daemon-clusterrole + name: compute-domain-daemon-role apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0330_rolebinding.openshift.yaml b/assets/state-dra-driver/0330_rolebinding.openshift.yaml index bb49c649a..ce49c30c6 100644 --- a/assets/state-dra-driver/0330_rolebinding.openshift.yaml +++ b/assets/state-dra-driver/0330_rolebinding.openshift.yaml @@ -1,12 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding +kind: ClusterRoleBinding metadata: - name: nvidia-dra-driver-openshift-privileged-role-binding - namespace: "FILLED BY THE OPERATOR" + name: nvidia-dra-driver-openshift-privileged-role-binding-kubeletplugin subjects: - - kind: ServiceAccount - name: nvidia-dra-driver - namespace: "FILLED BY THE OPERATOR" +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator roleRef: kind: ClusterRole name: system:openshift:scc:privileged diff --git a/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml b/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml new file mode 100644 index 000000000..f68f16763 --- /dev/null +++ b/assets/state-dra-driver/0340_kubeletplugin-clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 
nvidia-dra-driver-clusterrole-binding-kubeletplugin +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator +roleRef: + kind: ClusterRole + name: nvidia-dra-driver-clusterrole-kubeletplugin + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0350_kubeletplugin-role.yaml b/assets/state-dra-driver/0350_kubeletplugin-role.yaml new file mode 100644 index 000000000..b1356333e --- /dev/null +++ b/assets/state-dra-driver/0350_kubeletplugin-role.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nvidia-dra-driver-role-kubeletplugin + namespace: gpu-operator +rules: +- apiGroups: + - resource.nvidia.com + resources: + - computedomaincliques + verbs: + - get diff --git a/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml b/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml new file mode 100644 index 000000000..67a83a409 --- /dev/null +++ b/assets/state-dra-driver/0360_kubeletplugin-rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-dra-driver-role-binding-kubeletplugin + namespace: gpu-operator +subjects: +- kind: ServiceAccount + name: nvidia-dra-driver-kubeletplugin + namespace: gpu-operator +roleRef: + kind: Role + name: nvidia-dra-driver-role-kubeletplugin + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml b/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml new file mode 100644 index 000000000..3a0665b69 --- /dev/null +++ b/assets/state-dra-driver/0370_compute_domain_daemon-clusterrolebinding.openshift.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-domain-daemon-openshift-anyuid-role-binding +subjects: +- kind: ServiceAccount + name: compute-domain-daemon-service-account + namespace: 
gpu-operator +roleRef: + kind: ClusterRole + name: system:openshift:scc:anyuid + apiGroup: rbac.authorization.k8s.io diff --git a/assets/state-dra-driver/0500_deployment.yaml b/assets/state-dra-driver/0500_deployment.yaml index 1e5bbd0f0..ca221c498 100644 --- a/assets/state-dra-driver/0500_deployment.yaml +++ b/assets/state-dra-driver/0500_deployment.yaml @@ -2,42 +2,97 @@ apiVersion: apps/v1 kind: Deployment metadata: name: nvidia-dra-driver-controller - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver app: nvidia-dra-driver-controller spec: replicas: 1 selector: matchLabels: - app: nvidia-dra-driver-controller + nvidia-dra-driver-component: controller template: metadata: labels: + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver + nvidia-dra-driver-component: controller app: nvidia-dra-driver-controller spec: priorityClassName: system-node-critical - serviceAccountName: nvidia-dra-driver + serviceAccountName: nvidia-dra-driver-controller + securityContext: {} containers: - - name: compute-domains - image: "FILLED BY THE OPERATOR" - command: ["compute-domain-controller", "-v", "6"] - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + - name: compute-domain + securityContext: {} + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - compute-domain-controller + - "-v" + - "$(LOG_VERBOSITY)" + resources: {} + env: + - name: HTTP_ENDPOINT + value: ":8080" + - name: METRICS_PATH + value: "/metrics" + - name: PPROF_PATH + value: '' + - name: LOG_VERBOSITY + value: '4' + - name: LOG_VERBOSITY_CD_DAEMON + value: '4' + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: 
metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: FILLED BY THE OPERATOR + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: LEADER_ELECTION_ENABLED + value: 'false' + - name: LEADER_ELECTION_LEASE_LOCK_NAME + value: nvidia-dra-driver-controller + - name: LEADER_ELECTION_LEASE_LOCK_NAMESPACE + value: gpu-operator + - name: LEADER_ELECTION_LEASE_DURATION + value: 15s + - name: LEADER_ELECTION_RENEW_DEADLINE + value: 10s + - name: LEADER_ELECTION_RETRY_PERIOD + value: 2s affinity: nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: "node-role.kubernetes.io/control-plane" - operator: "Exists" + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + nvidia-dra-driver-component: controller + topologyKey: kubernetes.io/hostname + weight: 100 tolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - key: CriticalAddonsOnly + operator: Exists diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-dra-driver/0600_configmap.yaml deleted file mode 100644 index 495d5ca2c..000000000 --- a/assets/state-dra-driver/0600_configmap.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: nvidia-dra-driver-kubelet-plugin-entrypoint - namespace: "FILLED BY THE OPERATOR" - labels: - app: nvidia-dra-driver-kubelet-plugin -data: - entrypoint.sh: |- - #!/bin/bash - - if [ "$#" -ne 1 ]; then - echo "Usage: $0 COMMAND" - exit 1 - fi - - entrypoint=$1 - - until [ -f 
/run/nvidia/validations/driver-ready ] - do - echo "waiting for the driver validations to be ready..." - sleep 5 - done - - set -o allexport - cat /run/nvidia/validations/driver-ready - . /run/nvidia/validations/driver-ready - - # Conditionally mask the params file to prevent this container from - # recreating any missing GPU device nodes. This is necessary, for - # example, when running under nvkind to limit the set GPUs governed - # by the plugin even though it has cgroup access to all of them. - if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then - cp /proc/driver/nvidia/params root/gpu-params - sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params - mount --bind root/gpu-params /proc/driver/nvidia/params - fi - - echo "Starting the NVIDIA DRA Driver Kubelet Plugin" - exec $entrypoint diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-dra-driver/0700_daemonset.yaml index 440d2d8e2..5cc064255 100644 --- a/assets/state-dra-driver/0700_daemonset.yaml +++ b/assets/state-dra-driver/0700_daemonset.yaml @@ -2,176 +2,252 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: nvidia-dra-driver-kubelet-plugin - namespace: "FILLED BY THE OPERATOR" + namespace: gpu-operator labels: + helm.sh/chart: nvidia-dra-driver-26.4.0-dev + app.kubernetes.io/version: 26.4.0-dev + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver app: nvidia-dra-driver-kubelet-plugin spec: selector: matchLabels: - app: nvidia-dra-driver-kubelet-plugin + nvidia-dra-driver-component: kubelet-plugin + updateStrategy: + rollingUpdate: + maxUnavailable: 100% + type: RollingUpdate template: metadata: labels: + app.kubernetes.io/name: nvidia-dra-driver + app.kubernetes.io/instance: nvidia-dra-driver + nvidia-dra-driver-component: kubelet-plugin app: nvidia-dra-driver-kubelet-plugin spec: - nodeSelector: - nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: "true" priorityClassName: 
system-node-critical - serviceAccountName: nvidia-dra-driver + serviceAccountName: nvidia-dra-driver-kubeletplugin + securityContext: {} initContainers: - - name: driver-validation - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - command: ['sh', '-c'] - args: ["nvidia-validator"] - env: - - name: WITH_WAIT - value: "true" - - name: COMPONENT - value: driver - - name: OPERATOR_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: driver-install-dir - mountPath: /run/nvidia/driver - mountPropagation: HostToContainer - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - - name: host-dev-char - mountPath: /host-dev-char + - name: init-container + image: FILLED BY THE OPERATOR + securityContext: + privileged: true + command: + - bash + - "/usr/bin/kubelet-plugin-prestart.sh" + env: + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + volumeMounts: + - name: driver-root-parent + mountPath: "/driver-root-parent" + mountPropagation: HostToContainer containers: - - name: compute-domains - securityContext: - privileged: true - image: "FILLED BY THE OPERATOR" - # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of - # the NVIDIA distroless base image, which does not have bash, a statically compiled - # bash is added to the final image at /bin/bash. 
- command: ["/bin/bash", "-c"] - args: - - /bin/entrypoint.sh "compute-domain-kubelet-plugin -v 6" - env: - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: CDI_ROOT - value: /var/run/cdi - - name: NVIDIA_MIG_CONFIG_DEVICES - value: all - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - volumeMounts: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - readOnly: true - mountPath: /bin/entrypoint.sh - subPath: entrypoint.sh - - name: plugins-registry - mountPath: /var/lib/kubelet/plugins_registry - - name: plugins - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - - name: cdi - mountPath: /var/run/cdi - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: driver-install-dir - mountPath: /driver-root - readOnly: true - mountPropagation: HostToContainer - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - - name: gpus - securityContext: - privileged: true - image: "FILLED BY THE OPERATOR" - # (cdesiniotis) note that while the k8s-dra-driver-gpu image is built on top of - # the NVIDIA distroless base image, which does not have bash, a statically compiled - # bash is added to the final image at /bin/bash. 
- command: ["/bin/bash", "-c"] - args: - - /bin/entrypoint.sh "gpu-kubelet-plugin -v 6" - env: - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: CDI_ROOT - value: /var/run/cdi - - name: NVIDIA_MIG_CONFIG_DEVICES - value: all - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - volumeMounts: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - readOnly: true - mountPath: /bin/entrypoint.sh - subPath: entrypoint.sh - - name: plugins-registry - mountPath: /var/lib/kubelet/plugins_registry - - name: plugins - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - - name: cdi - mountPath: /var/run/cdi - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: Bidirectional - - name: driver-install-dir - mountPath: /driver-root - readOnly: true - mountPropagation: HostToContainer - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer - volumes: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint - configMap: - name: nvidia-dra-driver-kubelet-plugin-entrypoint - defaultMode: 448 + - name: compute-domains + securityContext: + privileged: true + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - bash + - "-c" + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. 
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params /root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' /root/gpu-params + mount --bind /root/gpu-params /proc/driver/nvidia/params + fi + compute-domain-kubelet-plugin -v $(LOG_VERBOSITY) + resources: {} + startupProbe: + grpc: + port: 51515 + service: liveness + failureThreshold: 600 + periodSeconds: 1 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: 51515 + service: liveness + failureThreshold: 3 + periodSeconds: 10 + timeoutSeconds: 10 + env: + - name: HTTP_ENDPOINT + value: ":8081" + - name: METRICS_PATH + value: "/metrics" + - name: LOG_VERBOSITY + value: '4' + - name: MASK_NVIDIA_DRIVER_PARAMS + value: '' + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: "/var/run/cdi" + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + - name: HEALTHCHECK_PORT + value: '51515' + volumeMounts: - name: plugins-registry - hostPath: - path: /var/lib/kubelet/plugins_registry + mountPath: "/var/lib/kubelet/plugins_registry" - name: plugins - hostPath: - path: /var/lib/kubelet/plugins + mountPath: "/var/lib/kubelet/plugins" + mountPropagation: Bidirectional - name: cdi - hostPath: - path: /var/run/cdi - - name: run-nvidia-validations - hostPath: - path: /run/nvidia/validations - type: DirectoryOrCreate - - name: driver-install-dir - hostPath: - path: /run/nvidia/driver - type: DirectoryOrCreate - - name: host-root - hostPath: - path: / - - name: host-dev-char - hostPath: - path: /dev/char + mountPath: "/var/run/cdi" + - name: driver-root + mountPath: "/driver-root" + readOnly: true 
+ mountPropagation: HostToContainer + - name: gpus + securityContext: + privileged: true + image: FILLED BY THE OPERATOR + imagePullPolicy: IfNotPresent + command: + - bash + - "-c" + args: + - |- + # Conditionally mask the params file to prevent this container from + # recreating any missing GPU device nodes. This is necessary, for + # example, when running under nvkind to limit the set GPUs governed + # by the plugin even though it has cgroup access to all of them. + if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then + cp /proc/driver/nvidia/params /root/gpu-params + sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' /root/gpu-params + mount --bind /root/gpu-params /proc/driver/nvidia/params + fi + gpu-kubelet-plugin -v $(LOG_VERBOSITY) + resources: {} + startupProbe: + grpc: + port: 51516 + service: liveness + failureThreshold: 600 + periodSeconds: 1 + timeoutSeconds: 10 + livenessProbe: + grpc: + port: 51516 + service: liveness + failureThreshold: 3 + periodSeconds: 30 + timeoutSeconds: 10 + env: + - name: HTTP_ENDPOINT + value: ":8080" + - name: METRICS_PATH + value: "/metrics" + - name: LOG_VERBOSITY + value: '4' + - name: MASK_NVIDIA_DRIVER_PARAMS + value: '' + - name: NVIDIA_DRIVER_ROOT + value: "/run/nvidia/driver" + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: CDI_ROOT + value: "/var/run/cdi" + - name: NVIDIA_MIG_CONFIG_DEVICES + value: all + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: IMAGE_NAME + value: FILLED BY THE OPERATOR + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" + - name: HEALTHCHECK_PORT + value: '51516' + volumeMounts: + - name: plugins-registry + mountPath: "/var/lib/kubelet/plugins_registry" + - name: plugins + mountPath: "/var/lib/kubelet/plugins" + mountPropagation: Bidirectional + - name: cdi 
+ mountPath: "/var/run/cdi" + - name: driver-root + mountPath: "/driver-root" + mountPropagation: HostToContainer + volumes: + - name: plugins-registry + hostPath: + path: "/var/lib/kubelet/plugins_registry" + - name: plugins + hostPath: + path: "/var/lib/kubelet/plugins" + - name: cdi + hostPath: + path: "/var/run/cdi" + - name: driver-root-parent + hostPath: + path: "/run/nvidia" + type: DirectoryOrCreate + - name: driver-root + hostPath: + path: "/run/nvidia/driver" + type: DirectoryOrCreate + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - 'true' + - matchExpressions: + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - NVIDIA + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - 'true' + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + nvidia.com/gpu.deploy.dra-driver-kubelet-plugin: 'true' diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 22eeca601..ffb600fb1 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -259,7 +259,7 @@ spec: - name: gdrcopy-image image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc - name: dra-driver-image - image: nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9 + image: us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia/dra-driver-nvidia-gpu:v26.4.0-dev-bef400ef customresourcedefinitions: owned: - name: nvidiadrivers.nvidia.com @@ -1055,7 +1055,7 @@ spec: - name: "GDRCOPY_IMAGE" value: 
"nvcr.io/nvidia/cloud-native/gdrdrv@sha256:0460630559b0b932c8861237b62e69c2895dace42d37ad3cb02c87e5d751fafc" - name: "DRA_DRIVER_IMAGE" - value: "nvcr.io/nvidia/k8s-dra-driver-gpu@sha256:5dd583277c1f2825cb637c3c07d8208c6278b1e6ccb4231f0ac011dbf651d5a9" + value: "us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia/dra-driver-nvidia-gpu:v26.4.0-dev-bef400ef" terminationGracePeriodSeconds: 10 serviceAccountName: gpu-operator strategy: deployment diff --git a/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml b/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml new file mode 100644 index 000000000..b3eddef71 --- /dev/null +++ b/bundle/manifests/resource.nvidia.com_computedomaincliques.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomaincliques.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomainClique + listKind: ComputeDomainCliqueList + plural: computedomaincliques + singular: computedomainclique + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: |- + ComputeDomainClique holds information about a specific clique within a ComputeDomain. + It is created in the driver namespace and named as ".". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + daemons: + items: + description: ComputeDomainDaemonInfo provides information about each + daemon in a ComputeDomainClique. 
+ properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + nodeName: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. + enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - nodeName + type: object + type: array + x-kubernetes-list-map-keys: + - nodeName + x-kubernetes-list-type: map + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + type: object + served: true + storage: true diff --git a/bundle/manifests/resource.nvidia.com_computedomains.yaml b/bundle/manifests/resource.nvidia.com_computedomains.yaml index 307b21ff7..5a28ae17c 100644 --- a/bundle/manifests/resource.nvidia.com_computedomains.yaml +++ b/bundle/manifests/resource.nvidia.com_computedomains.yaml @@ -14,91 +14,149 @@ spec: singular: computedomain scope: Namespaced versions: - - name: v1beta1 - schema: - openAPIV3Schema: - description: ComputeDomain prepares a set of nodes to run a multi-node workload - in. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ComputeDomainSpec provides the spec for a ComputeDomain. - properties: - channel: - description: ComputeDomainChannelSpec provides the spec for a channel - used to run a workload inside a ComputeDomain. - properties: - resourceClaimTemplate: - description: ComputeDomainResourceClaimTemplate provides the details - of the ResourceClaimTemplate to generate. 
- properties: - name: - type: string - required: - - name - type: object - required: - - resourceClaimTemplate - type: object - numNodes: - type: integer - required: - - channel - - numNodes - type: object - x-kubernetes-validations: - - message: A computeDomain.spec is immutable - rule: self == oldSelf - status: - description: ComputeDomainStatus provides the status for a ComputeDomain. - properties: - nodes: - items: - description: ComputeDomainNode provides information about each node - added to a ComputeDomain. + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + allocationMode: + default: Single + description: |- + Allows for requesting all IMEX channels (the maximum per IMEX domain) or + precisely one. + enum: + - All + - Single + type: string + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. 
properties: - cliqueID: - type: string - ipAddress: - type: string name: type: string required: - - cliqueID - - ipAddress - - name - type: object - type: array - x-kubernetes-list-map-keys: - name - x-kubernetes-list-type: map - status: - default: NotReady - enum: - - Ready - - NotReady - type: string - required: - - status - type: object - type: object - served: true - storage: true - subresources: - status: {} + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`). In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for its peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. 
+ type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload however should not rely on inspecting this field at any point + during its lifecycle. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. + properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + name: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. 
+ enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 29ffb9bb3..5c3c4c38e 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -374,11 +374,11 @@ func ServiceAccounts(n ClusterPolicyController) (gpuv1.State, error) { return status, nil } -// Role creates Role resource -func Role(n ClusterPolicyController) (gpuv1.State, error) { +// createRole creates a Role resource +func createRole(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx state := n.idx - obj := n.resources[state].Role.DeepCopy() + obj := n.resources[state].Roles[idx].DeepCopy() obj.Namespace = n.operatorNamespace logger := n.logger.WithValues("Role", obj.Name, "Namespace", obj.Namespace) @@ -415,6 +415,22 @@ func Role(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Ready, nil } +// Role creates one or more Role resources +func Role(n ClusterPolicyController) (gpuv1.State, error) { + status := gpuv1.Ready + state := n.idx + for i := range n.resources[state].Roles { + stat, err := createRole(n, i) + if err != nil { + return stat, err + } + if stat == gpuv1.NotReady { + status = gpuv1.NotReady + } + } + return status, nil +} + // createRoleBinding creates a RoleBinding resource func createRoleBinding(n ClusterPolicyController, idx int) (gpuv1.State, error) { ctx := n.ctx @@ -1814,6 +1830,16 @@ func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.Cluste return err } + for i := range obj.Spec.Template.Spec.InitContainers { + if obj.Spec.Template.Spec.InitContainers[i].Name != "init-container" { + continue + } + 
obj.Spec.Template.Spec.InitContainers[i].Image = image + obj.Spec.Template.Spec.InitContainers[i].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy) + } + + transformDRADriverRoot(obj, config) + var containers []corev1.Container for i, container := range obj.Spec.Template.Spec.Containers { // Skip the container if the resource type is not enabled. @@ -1866,6 +1892,35 @@ func TransformDRADriverKubeletPlugin(obj *appsv1.DaemonSet, config *gpuv1.Cluste return nil } +func transformDRADriverRoot(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec) { + driverRoot := config.HostPaths.DriverInstallDir + if driverRoot == "" || driverRoot == DefaultDriverInstallDir { + return + } + + driverRootParent := "/" + if driverRoot != "/" { + driverRootParent = filepath.Dir(strings.TrimRight(driverRoot, "/")) + } + + for i := range obj.Spec.Template.Spec.InitContainers { + setContainerEnv(&obj.Spec.Template.Spec.InitContainers[i], "NVIDIA_DRIVER_ROOT", driverRoot) + } + + for i := range obj.Spec.Template.Spec.Containers { + setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "NVIDIA_DRIVER_ROOT", driverRoot) + } + + for i := range obj.Spec.Template.Spec.Volumes { + switch obj.Spec.Template.Spec.Volumes[i].Name { + case "driver-root": + obj.Spec.Template.Spec.Volumes[i].HostPath.Path = driverRoot + case "driver-root-parent": + obj.Spec.Template.Spec.Volumes[i].HostPath.Path = driverRootParent + } + } +} + // TransformDCGMExporter transforms dcgm exporter daemonset with required config as per ClusterPolicy func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update validation container @@ -4221,14 +4276,14 @@ func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.D func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { var computeDomainsCtr *corev1.Container for i, ctr := range obj.Spec.Template.Spec.Containers { - if 
ctr.Name == "compute-domains" { + if ctr.Name == "compute-domain" { computeDomainsCtr = &obj.Spec.Template.Spec.Containers[i] break } } if computeDomainsCtr == nil { - return fmt.Errorf("failed to find 'compute-domains' container") + return fmt.Errorf("failed to find 'compute-domain' container") } config := spec.DRADriver diff --git a/controllers/resource_manager.go b/controllers/resource_manager.go index 2ae7497ba..409f7ab6c 100644 --- a/controllers/resource_manager.go +++ b/controllers/resource_manager.go @@ -47,7 +47,7 @@ type assetsFromFile []byte // Resources indicates resources managed by GPU operator type Resources struct { ServiceAccounts []corev1.ServiceAccount - Role rbacv1.Role + Roles []rbacv1.Role RoleBindings []rbacv1.RoleBinding ClusterRoles []rbacv1.ClusterRole ClusterRoleBindings []rbacv1.ClusterRoleBinding @@ -130,9 +130,14 @@ func addResourcesControls(n *ClusterPolicyController, path string) (Resources, c ctrl = append(ctrl, ServiceAccounts) } case "Role": - _, _, err := s.Decode(m, nil, &res.Role) + role := rbacv1.Role{} + _, _, err := s.Decode(m, nil, &role) panicIfError(err) - ctrl = append(ctrl, Role) + res.Roles = append(res.Roles, role) + // only add the ctrl function when the first Role is added for this component + if len(res.Roles) == 1 { + ctrl = append(ctrl, Role) + } case "RoleBinding": roleBinding := rbacv1.RoleBinding{} _, _, err := s.Decode(m, nil, &roleBinding) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 407f32754..0b700bcbf 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -4833,6 +4833,50 @@ func TestTransformDRADriverKubeletPlugin(t *testing.T) { }, }), }, + { + description: "custom driver root updates init container and volumes", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "init-container"}). + WithContainer(corev1.Container{Name: "gpus"}). 
+ WithHostPathVolume("driver-root", DefaultDriverInstallDir, ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("driver-root-parent", "/run/nvidia", ptr.To(corev1.HostPathDirectoryOrCreate)), + cpSpec: &gpuv1.ClusterPolicySpec{ + HostPaths: gpuv1.HostPathsSpec{DriverInstallDir: "/opt/nvidia/driver"}, + DRADriver: gpuv1.DRADriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "k8s-dra-driver-gpu", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + GPUs: gpuv1.DRADriverGPUs{ + Enabled: newBoolPtr(true), + }, + ComputeDomains: gpuv1.DRADriverComputeDomains{ + Enabled: newBoolPtr(false), + }, + }, + }, + expectedDs: NewDaemonset(). + WithInitContainer(corev1.Container{ + Name: "init-container", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "NVIDIA_DRIVER_ROOT", Value: "/opt/nvidia/driver"}, + }, + }). + WithContainer(corev1.Container{ + Name: "gpus", + Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "NVIDIA_DRIVER_ROOT", Value: "/opt/nvidia/driver"}, + {Name: NvidiaCTKPathEnvName, Value: "toolkit/nvidia-ctk"}, + {Name: "IMAGE_NAME", Value: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0"}, + }, + }). + WithHostPathVolume("driver-root", "/opt/nvidia/driver", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("driver-root-parent", "/opt/nvidia", ptr.To(corev1.HostPathDirectoryOrCreate)), + }, { description: "gpus disabled, compute domains disabled", ds: NewDaemonset(). @@ -4889,7 +4933,7 @@ func TestTransformDRADriverController(t *testing.T) { { description: "full dra driver spec", deployment: NewDeployment(). 
- WithContainer(corev1.Container{Name: "compute-domains"}), + WithContainer(corev1.Container{Name: "compute-domain"}), cpSpec: &gpuv1.ClusterPolicySpec{ DRADriver: gpuv1.DRADriverSpec{ Repository: "nvcr.io/nvidia", @@ -4932,7 +4976,7 @@ func TestTransformDRADriverController(t *testing.T) { }, }). WithContainer(corev1.Container{ - Name: "compute-domains", + Name: "compute-domain", Image: "nvcr.io/nvidia/k8s-dra-driver-gpu:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml new file mode 100644 index 000000000..b3eddef71 --- /dev/null +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.1 + name: computedomaincliques.resource.nvidia.com +spec: + group: resource.nvidia.com + names: + kind: ComputeDomainClique + listKind: ComputeDomainCliqueList + plural: computedomaincliques + singular: computedomainclique + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: |- + ComputeDomainClique holds information about a specific clique within a ComputeDomain. + It is created in the driver namespace and named as ".". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + daemons: + items: + description: ComputeDomainDaemonInfo provides information about each + daemon in a ComputeDomainClique. 
+ properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + nodeName: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. + enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - nodeName + type: object + type: array + x-kubernetes-list-map-keys: + - nodeName + x-kubernetes-list-type: map + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + type: object + served: true + storage: true diff --git a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml index 307b21ff7..5a28ae17c 100644 --- a/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml @@ -14,91 +14,149 @@ spec: singular: computedomain scope: Namespaced versions: - - name: v1beta1 - schema: - openAPIV3Schema: - description: ComputeDomain prepares a set of nodes to run a multi-node workload - in. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ComputeDomainSpec provides the spec for a ComputeDomain. - properties: - channel: - description: ComputeDomainChannelSpec provides the spec for a channel - used to run a workload inside a ComputeDomain. - properties: - resourceClaimTemplate: - description: ComputeDomainResourceClaimTemplate provides the details - of the ResourceClaimTemplate to generate. 
- properties: - name: - type: string - required: - - name - type: object - required: - - resourceClaimTemplate - type: object - numNodes: - type: integer - required: - - channel - - numNodes - type: object - x-kubernetes-validations: - - message: A computeDomain.spec is immutable - rule: self == oldSelf - status: - description: ComputeDomainStatus provides the status for a ComputeDomain. - properties: - nodes: - items: - description: ComputeDomainNode provides information about each node - added to a ComputeDomain. + - name: v1beta1 + schema: + openAPIV3Schema: + description: ComputeDomain prepares a set of nodes to run a multi-node workload + in. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ComputeDomainSpec provides the spec for a ComputeDomain. + properties: + channel: + description: ComputeDomainChannelSpec provides the spec for a channel + used to run a workload inside a ComputeDomain. + properties: + allocationMode: + default: Single + description: |- + Allows for requesting all IMEX channels (the maximum per IMEX domain) or + precisely one. + enum: + - All + - Single + type: string + resourceClaimTemplate: + description: ComputeDomainResourceClaimTemplate provides the details + of the ResourceClaimTemplate to generate. 
properties: - cliqueID: - type: string - ipAddress: - type: string name: type: string required: - - cliqueID - - ipAddress - - name - type: object - type: array - x-kubernetes-list-map-keys: - name - x-kubernetes-list-type: map - status: - default: NotReady - enum: - - Ready - - NotReady - type: string - required: - - status - type: object - type: object - served: true - storage: true - subresources: - status: {} + type: object + required: + - resourceClaimTemplate + type: object + numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`). In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for its peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. 
+ type: integer + required: + - channel + - numNodes + type: object + x-kubernetes-validations: + - message: A computeDomain.spec is immutable + rule: self == oldSelf + status: + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload however should not rely on inspecting this field at any point + during its lifecycle. + properties: + nodes: + items: + description: ComputeDomainNode provides information about each node + added to a ComputeDomain. + properties: + cliqueID: + type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name within a given NVLink partition + (i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will + always be unique. This field is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer + ipAddress: + type: string + name: + type: string + status: + default: NotReady + description: |- + The Status field tracks the readiness of the IMEX daemon running on + this node. It gets switched to Ready whenever the IMEX daemon is + ready to broker GPU memory exchanges and switches to NotReady when + it is not. It is marked as optional in order to support downgrades + and avoid an API bump. 
+ enum: + - Ready + - NotReady + type: string + required: + - cliqueID + - ipAddress + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + status: + default: NotReady + enum: + - Ready + - NotReady + type: string + required: + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployments/gpu-operator/templates/cleanup_crd.yaml b/deployments/gpu-operator/templates/cleanup_crd.yaml index 347563498..f5c93d3f3 100644 --- a/deployments/gpu-operator/templates/cleanup_crd.yaml +++ b/deployments/gpu-operator/templates/cleanup_crd.yaml @@ -41,6 +41,7 @@ spec: - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/templates/upgrade_crd.yaml b/deployments/gpu-operator/templates/upgrade_crd.yaml index ab66ee7d2..8d96eec86 100644 --- a/deployments/gpu-operator/templates/upgrade_crd.yaml +++ b/deployments/gpu-operator/templates/upgrade_crd.yaml @@ -90,6 +90,7 @@ spec: - --filepath=/opt/gpu-operator/nvidia.com_clusterpolicies.yaml - --filepath=/opt/gpu-operator/nvidia.com_nvidiadrivers.yaml - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomains.yaml + - --filepath=/opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml {{- if .Values.nfd.enabled }} - --filepath=/opt/gpu-operator/nfd-api-crds.yaml {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 27fb30e63..4c71b33c9 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -281,9 +281,9 @@ devicePlugin: hostNetwork: false draDriver: - repository: ghcr.io/nvidia - 
image: k8s-dra-driver-gpu - version: v25.8.0-dev-124734f2 + repository: us-central1-docker.pkg.dev/k8s-staging-images/dra-driver-nvidia + image: dra-driver-nvidia-gpu + version: v26.4.0-dev-bef400ef imagePullPolicy: IfNotPresent imagePullSecrets: [] diff --git a/docker/Dockerfile b/docker/Dockerfile index 121eed9ec..67a4ace72 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -105,6 +105,8 @@ COPY hack/must-gather.sh /usr/bin/gather # Add CRD resource into the image for helm upgrades COPY deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml /opt/gpu-operator/nvidia.com_clusterpolicies.yaml COPY deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml +COPY deployments/gpu-operator/crds/resource.nvidia.com_computedomains.yaml /opt/gpu-operator/resource.nvidia.com_computedomains.yaml +COPY deployments/gpu-operator/crds/resource.nvidia.com_computedomaincliques.yaml /opt/gpu-operator/resource.nvidia.com_computedomaincliques.yaml COPY deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml /opt/gpu-operator/nfd-api-crds.yaml USER 65532:65532