From 845b32db86b3647b8f252d6db60ecab7823c1fed Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Fri, 30 Jan 2026 12:12:50 -0800 Subject: [PATCH] add support for rhel10 driver containers Signed-off-by: Tariq Ibrahim --- controllers/object_controls.go | 11 +++++--- controllers/object_controls_test.go | 42 ++++++++++++++++++++++++++--- internal/state/driver.go | 12 ++++----- internal/state/driver_test.go | 15 +++++++++++ internal/state/nodepool.go | 28 ++++++++++++++----- 5 files changed, 90 insertions(+), 18 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index f26a0c063..1d53b48c4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -710,10 +710,15 @@ func kernelFullVersion(n ClusterPolicyController) (string, string, string) { if !ok { return kFVersion, "", "" } + osMajorVersion := strings.Split(osVersion, ".")[0] + osMajorNumber, err := strconv.Atoi(osMajorVersion) + if err != nil { + return kFVersion, "", "" + } - if osName == "rocky" { - // If the OS is RockyLinux, we will omit the RockyLinux minor version when constructing the os image tag - osVersion = strings.Split(osVersion, ".")[0] + // If the OS is RockyLinux or RHEL 10 & above, we will omit the minor version when constructing the os image tag + if osName == "rocky" || (osName == "rhel" && osMajorNumber >= 10) { + osVersion = osMajorVersion } osTag := fmt.Sprintf("%s%s", osName, osVersion) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 2e5ea8285..fa26dc6d3 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -1524,6 +1524,42 @@ func TestKernelFullVersion(t *testing.T) { node *corev1.Node expected map[string]string }{ + { + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + nfdOSReleaseIDLabelKey: "rhel", + nfdOSVersionIDLabelKey: "10.1", + nfdKernelLabelKey: "6.12.0-124.8.1.el10_1.x86_64", 
commonGPULabelKey: "true", + }, + }, + }, + expected: map[string]string{ + "kernelFullVersion": "6.12.0-124.8.1.el10_1.x86_64", + "imageTagSuffix": "rhel10", + "osVersion": "10", + }, + }, + { + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + nfdOSReleaseIDLabelKey: "rhel", + nfdOSVersionIDLabelKey: "9.6", + nfdKernelLabelKey: "5.14.0-570.78.1.el9_6.x86_64", + commonGPULabelKey: "true", + }, + }, + }, + expected: map[string]string{ + "kernelFullVersion": "5.14.0-570.78.1.el9_6.x86_64", + "imageTagSuffix": "rhel9.6", + "osVersion": "9.6", + }, + }, { node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -1539,7 +1575,7 @@ func TestKernelFullVersion(t *testing.T) { expected: map[string]string{ "kernelFullVersion": "5.14.0-611.5.1.el9_7.x86_64", "imageTagSuffix": "rocky9", - "osVersionMajor": "9", + "osVersion": "9", }, }, { @@ -1557,7 +1593,7 @@ func TestKernelFullVersion(t *testing.T) { expected: map[string]string{ "kernelFullVersion": "6.8.0-60-generic", "imageTagSuffix": "ubuntu24.04", - "osVersionMajor": "24.04", + "osVersion": "24.04", }, }, } @@ -1574,6 +1610,6 @@ func TestKernelFullVersion(t *testing.T) { require.Equal(t, test.expected["kernelFullVersion"], kFVersion) require.Equal(t, test.expected["imageTagSuffix"], osTag) - require.Equal(t, test.expected["osVersionMajor"], osVersion) + require.Equal(t, test.expected["osVersion"], osVersion) } } diff --git a/internal/state/driver.go b/internal/state/driver.go index b0e6484de..fe6c9fffd 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -482,7 +482,7 @@ func getDriverAppName(cr *nvidiav1alpha1.NVIDIADriver, pool nodePool) string { var hashBuilder strings.Builder - appNamePrefix := fmt.Sprintf(appNamePrefixFormat, cr.Spec.DriverType, pool.getOS()) + appNamePrefix := fmt.Sprintf(appNamePrefixFormat, cr.Spec.DriverType, pool.osTag) uid := string(cr.UID) hashBuilder.WriteString(uid) @@ -519,7 +519,7 @@ func getDefaultStartupProbe(spec 
*nvidiav1alpha1.NVIDIADriverSpec) *nvidiav1alph } func getDriverImagePath(spec *nvidiav1alpha1.NVIDIADriverSpec, nodePool nodePool) (string, error) { - os := nodePool.getOS() + os := nodePool.osTag if spec.UsePrecompiledDrivers() { return spec.GetPrecompiledImagePath(os, nodePool.kernel) @@ -547,7 +547,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS return nil, fmt.Errorf("no NVIDIADriver CR provided") } - nvidiaDriverName := getDriverName(cr, nodePool.getOS()) + nvidiaDriverName := getDriverName(cr, nodePool.osTag) nvidiaDriverAppName := getDriverAppName(cr, nodePool) spec := cr.Spec.DeepCopy() @@ -575,7 +575,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS Name: nvidiaDriverName, ImagePath: imagePath, ManagerImagePath: managerImagePath, - OSVersion: nodePool.getOS(), + OSVersion: nodePool.osTag, }, nil } @@ -585,7 +585,7 @@ func getGDSSpec(spec *nvidiav1alpha1.NVIDIADriverSpec, pool nodePool) (*gdsDrive return nil, nil } gdsSpec := spec.GPUDirectStorage - imagePath, err := gdsSpec.GetImagePath(pool.getOS()) + imagePath, err := gdsSpec.GetImagePath(pool.osTag) if err != nil { return nil, err } @@ -602,7 +602,7 @@ func getGDRCopySpec(spec *nvidiav1alpha1.NVIDIADriverSpec, pool nodePool) (*gdrc return nil, nil } gdrcopySpec := spec.GDRCopy - imagePath, err := gdrcopySpec.GetImagePath(pool.getOS()) + imagePath, err := gdrcopySpec.GetImagePath(pool.osTag) if err != nil { return nil, err } diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 75d8b04f9..d9d44fbd1 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -507,6 +507,9 @@ func TestGetDriverAppName(t *testing.T) { osRelease: "ubuntu", osVersion: "20.04", } + var err error + pool.osTag, err = getOSTag(pool.osRelease, pool.osVersion) + assert.NoError(t, err) actual := getDriverAppName(cr, pool) expected := "nvidia-gpu-driver-ubuntu20.04-67cc6dbb79" @@ -522,6 +525,8 @@ func 
TestGetDriverAppName(t *testing.T) { // Now set the osVersion to a really long string pool.osRelease = "redhatCoreOS" pool.osVersion = "4.14-414.92.202309282257" + pool.osTag, err = getOSTag(pool.osRelease, pool.osVersion) + assert.NoError(t, err) actual = getDriverAppName(cr, pool) expected = "nvidia-gpu-driver-redhatCoreOS4.14-414.92.2023092822-59b779bcc5" @@ -544,6 +549,9 @@ func TestGetDriverAppNameRHCOS(t *testing.T) { osVersion: "4.14", rhcosVersion: "414.92.202309282257", } + var err error + pool.osTag, err = getOSTag(pool.osRelease, pool.osVersion) + assert.NoError(t, err) actual := getDriverAppName(cr, pool) expected := "nvidia-gpu-driver-rhcos4.14-6f4fc4fc6" @@ -940,6 +948,10 @@ func TestGetDriverSpecMultipleNodePools(t *testing.T) { }, } + var err error + pool1.osTag, err = getOSTag(pool1.osRelease, pool1.osVersion) + require.NoError(t, err) + pool2 := nodePool{ osRelease: "ubuntu", osVersion: "20.04", @@ -950,6 +962,9 @@ func TestGetDriverSpecMultipleNodePools(t *testing.T) { }, } + pool2.osTag, err = getOSTag(pool2.osRelease, pool2.osVersion) + require.NoError(t, err) + spec1, err := getDriverSpec(cr, pool1) require.NoError(t, err) spec2, err := getDriverSpec(cr, pool2) diff --git a/internal/state/nodepool.go b/internal/state/nodepool.go index ddfc0c2e0..acd68a67a 100644 --- a/internal/state/nodepool.go +++ b/internal/state/nodepool.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "maps" + "strconv" "strings" corev1 "k8s.io/api/core/v1" @@ -38,6 +39,7 @@ type nodePool struct { name string osRelease string osVersion string + osTag string rhcosVersion string kernel string nodeSelector map[string]string @@ -94,7 +96,12 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str nodePool.nodeSelector[nfdOSVersionIDLabelKey] = osVersion nodePool.osRelease = osID nodePool.osVersion = osVersion - nodePool.name = nodePool.getOS() + + osTag, err := getOSTag(osID, osVersion) + if err != nil { + return nil, fmt.Errorf("failed to get OS info 
for node %s: %w", node.Name, err) + } + nodePool.osTag = osTag if precompiled { kernelVersion, ok := nodeLabels[nfdKernelLabelKey] @@ -132,10 +139,19 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str return nodePools, nil } -func (n nodePool) getOS() string { - if n.osRelease == "rocky" { - // If the OS is RockyLinux, we will omit the RockyLinux minor version when constructing the os image tag - n.osVersion = strings.Split(n.osVersion, ".")[0] +func getOSTag(osRelease, osVersion string) (string, error) { + osMajorVersion := strings.Split(osVersion, ".")[0] + osMajorNumber, err := strconv.Atoi(osMajorVersion) + if err != nil { + return "", fmt.Errorf("failed to parse os version: %w", err) + } + + var osTagSuffix string + // If the OS is RockyLinux or RHEL 10 & above, we will omit the minor version when constructing the os image tag + if osRelease == "rocky" || (osRelease == "rhel" && osMajorNumber >= 10) { + osTagSuffix = osMajorVersion + } else { + osTagSuffix = osVersion } - return fmt.Sprintf("%s%s", n.osRelease, n.osVersion) + return fmt.Sprintf("%s%s", osRelease, osTagSuffix), nil }