From d6e96fd30cbdb123a21b1975b10f1c470eac4d48 Mon Sep 17 00:00:00 2001 From: Arjun Date: Mon, 2 Feb 2026 08:20:18 +0000 Subject: [PATCH] Fix flaking failures in the GPU Operator CI e2e tests Signed-off-by: Arjun --- tests/scripts/update-nvidiadriver.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/scripts/update-nvidiadriver.sh b/tests/scripts/update-nvidiadriver.sh index c60db6560..e95909025 100755 --- a/tests/scripts/update-nvidiadriver.sh +++ b/tests/scripts/update-nvidiadriver.sh @@ -50,7 +50,14 @@ test_custom_labels_override() { exit 1 fi - # The labels override triggers a rollout of all gpu-operator operands, so we wait for the driver upgrade to transition to "upgrade-done" state. + # Wait for the operator to update the pod template with new labels + sleep 10 + + # Delete driver pod to force recreation with updated labels. Labels are applied to the pod template but existing pods are not automatically restarted. + echo "Deleting driver pod to trigger recreation with updated labels..." + kubectl delete pod -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" + + # Wait for the driver upgrade to transition to "upgrade-done" state wait_for_driver_upgrade_done check_nvidia_driver_pods_ready