From a9ccf2a08d74f0b2f390b289fb30ceef2b41588e Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 27 Apr 2026 18:31:16 -0700 Subject: [PATCH] [state-driver] add new toleration to handle driver upgrades This change is added in anticipation of a new enhancement in k8s-driver-manager, where the driver-manager will add a taint to any node on which a GPU driver upgrade is taking place. This taint helps the driver-manager evict third-party GPU client pods so that driver module unloads succeed during an upgrade cycle. Signed-off-by: Tariq Ibrahim --- assets/state-driver/0500_daemonset.yaml | 3 +++ internal/state/testdata/golden/driver-additional-configs.yaml | 3 +++ internal/state/testdata/golden/driver-full-spec.yaml | 3 +++ internal/state/testdata/golden/driver-gdrcopy-openshift.yaml | 3 +++ internal/state/testdata/golden/driver-gdrcopy.yaml | 3 +++ internal/state/testdata/golden/driver-gds.yaml | 3 +++ internal/state/testdata/golden/driver-hostnetwork.yaml | 3 +++ internal/state/testdata/golden/driver-minimal.yaml | 3 +++ .../state/testdata/golden/driver-openshift-drivertoolkit.yaml | 3 +++ internal/state/testdata/golden/driver-precompiled.yaml | 3 +++ internal/state/testdata/golden/driver-rdma-hostmofed.yaml | 3 +++ internal/state/testdata/golden/driver-rdma.yaml | 3 +++ internal/state/testdata/golden/driver-secret-env.yaml | 3 +++ .../testdata/golden/driver-vgpu-host-manager-openshift.yaml | 3 +++ internal/state/testdata/golden/driver-vgpu-host-manager.yaml | 3 +++ .../state/testdata/golden/driver-vgpu-licensing-secret.yaml | 3 +++ internal/state/testdata/golden/driver-vgpu-licensing.yaml | 3 +++ manifests/state-driver/0500_daemonset.yaml | 3 +++ 18 files changed, 54 insertions(+) diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml index 853cf6fc9..22cc08d8d 100644 --- a/assets/state-driver/0500_daemonset.yaml +++ b/assets/state-driver/0500_daemonset.yaml @@ -30,6 +30,9 @@ spec: - key: nvidia.com/gpu operator: 
Exists effect: NoSchedule + - key: nvidia.com/gpu-driver-update + operator: Exists + effect: NoSchedule affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml index 774daf599..8ad7a6d2d 100644 --- a/internal/state/testdata/golden/driver-additional-configs.yaml +++ b/internal/state/testdata/golden/driver-additional-configs.yaml @@ -313,6 +313,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 24f859bb6..0cb48579f 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -332,6 +332,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists - effect: NoSchedule key: foo operator: Equal diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml index 391e22841..840378944 100644 --- a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml @@ -487,6 +487,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml index 77f21927a..c8bf48528 100644 --- a/internal/state/testdata/golden/driver-gdrcopy.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy.yaml @@ -367,6 +367,9 @@ spec: - effect: NoSchedule key: 
nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml index 109c49709..2ea523968 100644 --- a/internal/state/testdata/golden/driver-gds.yaml +++ b/internal/state/testdata/golden/driver-gds.yaml @@ -367,6 +367,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-hostnetwork.yaml b/internal/state/testdata/golden/driver-hostnetwork.yaml index e08f74186..4541f375e 100644 --- a/internal/state/testdata/golden/driver-hostnetwork.yaml +++ b/internal/state/testdata/golden/driver-hostnetwork.yaml @@ -306,6 +306,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml index d08ba1c2b..6a29837d3 100644 --- a/internal/state/testdata/golden/driver-minimal.yaml +++ b/internal/state/testdata/golden/driver-minimal.yaml @@ -304,6 +304,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml index ad978ad56..2dafd85bc 100644 --- a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml +++ b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml @@ -423,6 +423,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: 
nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml index 8441a3438..ee718f1c9 100644 --- a/internal/state/testdata/golden/driver-precompiled.yaml +++ b/internal/state/testdata/golden/driver-precompiled.yaml @@ -307,6 +307,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml index c2f055b4a..206175762 100644 --- a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml +++ b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml @@ -387,6 +387,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml index 7f6f1127a..5766b6bf7 100644 --- a/internal/state/testdata/golden/driver-rdma.yaml +++ b/internal/state/testdata/golden/driver-rdma.yaml @@ -381,6 +381,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-secret-env.yaml b/internal/state/testdata/golden/driver-secret-env.yaml index 6db2ceb59..99ad4878c 100644 --- a/internal/state/testdata/golden/driver-secret-env.yaml +++ b/internal/state/testdata/golden/driver-secret-env.yaml @@ -399,6 +399,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff 
--git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml index 376d0910e..10cda37cd 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml @@ -386,6 +386,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml index 110a71c56..9b0744028 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml @@ -292,6 +292,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml index 9c63fa61c..047082398 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml @@ -310,6 +310,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml index b04ed567f..279f696c2 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing.yaml @@ -310,6 +310,9 @@ spec: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + - effect: NoSchedule + key: 
nvidia.com/gpu-driver-update + operator: Exists volumes: - hostPath: path: /run/nvidia diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 5b9c6c62f..d9381161c 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -87,6 +87,9 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule + - key: nvidia.com/gpu-driver-update + operator: Exists + effect: NoSchedule {{- if .Driver.Spec.Tolerations }} {{- .Driver.Spec.Tolerations | yaml | nindent 8 }} {{- end }}