diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1f8806fcc..689977e66 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -45,7 +45,6 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/yaml" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" driverconfig "github.com/NVIDIA/gpu-operator/internal/config" @@ -161,8 +160,6 @@ const ( DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY" // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix CDIAnnotationPrefixEnvName = "CDI_ANNOTATION_PREFIX" - // KataManagerAnnotationHashKey is the annotation indicating the hash of the kata-manager configuration - KataManagerAnnotationHashKey = "nvidia.com/kata-manager.last-applied-hash" // DefaultKataArtifactsDir is the default directory to store kata artifacts on the host DefaultKataArtifactsDir = "/opt/nvidia-gpu-operator/artifacts/runtimeclasses/" // PodControllerRevisionHashLabelKey is the annotation key for pod controller revision hash value @@ -577,16 +574,6 @@ func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, } } - if obj.Name == "nvidia-kata-manager-config" { - data, err := yaml.Marshal(config.KataManager.Config) - if err != nil { - return gpuv1.NotReady, fmt.Errorf("failed to marshal kata manager config: %v", err) - } - obj.Data = map[string]string{ - "config.yaml": string(data), - } - } - if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { return gpuv1.NotReady, err } @@ -714,7 +701,6 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error "nvidia-mig-manager": TransformMIGManager, "nvidia-operator-validator": TransformValidator, "nvidia-sandbox-validator": TransformSandboxValidator, - "nvidia-kata-manager": TransformKataManager, "nvidia-cc-manager": TransformCCManager, } @@ -1392,13 +1378,7 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } // Handle the drop-in configs - // TODO: It's a bit of a hack to skip the `nvidia-kata-manager` container here. - // Ideally if the two projects are using the SAME API then this should be - // captured more rigorously. - // Note that we probably want to implement drop-in file support in the - // kata manager in any case -- in which case it will be good to use a - // similar implementation. - if dropInConfigFile != "" && container.Name != "nvidia-kata-manager" { + if dropInConfigFile != "" { sourceConfigFileName := path.Base(dropInConfigFile) sourceConfigDir := path.Dir(dropInConfigFile) containerConfigDir := DefaultRuntimeDropInConfigTargetDir @@ -2023,83 +2003,6 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return nil } -// TransformKataManager transforms Kata Manager daemonset with required config as per ClusterPolicy -func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { - // update image - image, err := gpuv1.ImagePath(&config.KataManager) - if err != nil { - return err - } - obj.Spec.Template.Spec.Containers[0].Image = image - - // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.KataManager.ImagePullPolicy) - - // set image pull secrets - if len(config.KataManager.ImagePullSecrets) > 0 { - addPullSecrets(&obj.Spec.Template.Spec, config.KataManager.ImagePullSecrets) - } - - // set resource limits - if config.KataManager.Resources != nil { - // apply resource limits to all containers - for i := range obj.Spec.Template.Spec.Containers { - obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataManager.Resources.Requests - obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataManager.Resources.Limits - } - } - - // set arguments if specified for mig-manager container - if len(config.KataManager.Args) > 0 { - obj.Spec.Template.Spec.Containers[0].Args = config.KataManager.Args - } - - // mount artifactsDir - artifactsDir := DefaultKataArtifactsDir - if config.KataManager.Config.ArtifactsDir != "" { - artifactsDir = config.KataManager.Config.ArtifactsDir - } - - // set env used by readinessProbe to determine path to kata-manager pid file. - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "KATA_ARTIFACTS_DIR", artifactsDir) - - artifactsVolMount := corev1.VolumeMount{Name: "kata-artifacts", MountPath: artifactsDir} - obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, artifactsVolMount) - - artifactsVol := corev1.Volume{Name: "kata-artifacts", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: artifactsDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} - obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, artifactsVol) - - // Compute hash of kata manager config and add an annotation with the value. - // If the kata config changes, a new revision of the daemonset will be - // created and thus the kata-manager pods will restart with the updated config. - hash := utils.GetObjectHash(config.KataManager.Config) - - if obj.Spec.Template.Annotations == nil { - obj.Spec.Template.Annotations = make(map[string]string) - } - obj.Spec.Template.Annotations[KataManagerAnnotationHashKey] = hash - - if len(config.KataManager.Env) > 0 { - for _, env := range config.KataManager.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) - } - } - - // mount containerd config and socket - // setup mounts for runtime config file - runtime := n.runtime.String() - // kata manager is the only container in this daemonset - err = transformForRuntime(obj, config, runtime, &obj.Spec.Template.Spec.Containers[0]) - if err != nil { - return fmt.Errorf("error transforming kata-manager daemonset : %w", err) - } - - // set hostNetwork for kata-manager if specified - applyHostNetworkConfig(&obj.Spec.Template.Spec, config.KataManager.HostNetwork) - - return nil -} - // TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update k8s-driver-manager initContainer @@ -5160,117 +5063,10 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) return gpuv1.Ready, nil } -func transformKataRuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) { - ctx := n.ctx - state := n.idx - config := n.singleton.Spec - - // Get all existing Kata RuntimeClasses - opts := []client.ListOption{&client.MatchingLabels{"nvidia.com/kata-runtime-class": "true"}} - list := &nodev1.RuntimeClassList{} - err := n.client.List(ctx, list, opts...) - if err != nil { - n.logger.Info("Could not get Kata RuntimeClassList", err) - return gpuv1.NotReady, fmt.Errorf("error getting kata RuntimeClassList: %v", err) - } - n.logger.V(1).Info("Kata RuntimeClasses", "Number", len(list.Items)) - - if !config.KataManager.IsEnabled() { - // Delete all Kata RuntimeClasses - n.logger.Info("Kata Manager disabled, deleting all Kata RuntimeClasses") - for _, rc := range list.Items { - rc := rc - n.logger.V(1).Info("Deleting Kata RuntimeClass", "Name", rc.Name) - err := n.client.Delete(ctx, &rc) - if err != nil { - return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err) - } - } - return gpuv1.Ready, nil - } - - // Get names of desired kata RuntimeClasses - rcNames := make(map[string]struct{}) - for _, rc := range config.KataManager.Config.RuntimeClasses { - rcNames[rc.Name] = struct{}{} - } - - // Delete any existing Kata RuntimeClasses that are no longer specified in KataManager configuration - for _, rc := range list.Items { - if _, ok := rcNames[rc.Name]; !ok { - rc := rc - n.logger.Info("Deleting Kata RuntimeClass", "Name", rc.Name) - err := n.client.Delete(ctx, &rc) - if err != nil { - return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err) - } - } - } - - // Using kata RuntimClass template, create / update RuntimeClass objects specified in KataManager configuration - template := n.resources[state].RuntimeClasses[0] - for _, rc := range config.KataManager.Config.RuntimeClasses { - logger := n.logger.WithValues("RuntimeClass", rc.Name) - - if rc.Name == config.Operator.RuntimeClass { - return gpuv1.NotReady, fmt.Errorf("error creating kata runtimeclass '%s' as it conflicts with the runtimeclass used for the gpu-operator operand pods itself", rc.Name) - } - - obj := nodev1.RuntimeClass{} - obj.Name = rc.Name - obj.Handler = rc.Name - obj.Labels = template.Labels - obj.Scheduling = &nodev1.Scheduling{} - nodeSelector := make(map[string]string) - for k, v := range template.Scheduling.NodeSelector { - nodeSelector[k] = v - } - if rc.NodeSelector != nil { - // append user provided selectors to default nodeSelector - for k, v := range rc.NodeSelector { - nodeSelector[k] = v - } - } - obj.Scheduling.NodeSelector = nodeSelector - - if err := controllerutil.SetControllerReference(n.singleton, &obj, n.scheme); err != nil { - return gpuv1.NotReady, err - } - - found := &nodev1.RuntimeClass{} - err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found) - if err != nil && apierrors.IsNotFound(err) { - logger.Info("Not found, creating...") - err = n.client.Create(ctx, &obj) - if err != nil { - logger.Info("Couldn't create", "Error", err) - return gpuv1.NotReady, err - } - continue - } else if err != nil { - return gpuv1.NotReady, err - } - - logger.Info("Found Resource, updating...") - obj.ResourceVersion = found.ResourceVersion - - err = n.client.Update(ctx, &obj) - if err != nil { - logger.Info("Couldn't update", "Error", err) - return gpuv1.NotReady, err - } - } - return gpuv1.Ready, nil -} - func RuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) { status := gpuv1.Ready state := n.idx - if n.stateNames[state] == "state-kata-manager" { - return transformKataRuntimeClasses(n) - } - nvidiaRuntimeClasses := n.resources[state].RuntimeClasses if n.stateNames[state] == "pre-requisites" && !n.isStateEnabled(n.stateNames[state]) { err := clearRuntimeClasses(n, nvidiaRuntimeClasses) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 29eef5cf2..c83655c59 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -100,7 +100,6 @@ var gpuStateLabels = map[string]map[string]string{ "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", "nvidia.com/gpu.deploy.sandbox-validator": "true", "nvidia.com/gpu.deploy.vfio-manager": "true", - "nvidia.com/gpu.deploy.kata-manager": "true", "nvidia.com/gpu.deploy.cc-manager": "true", }, gpuWorkloadConfigVMVgpu: { @@ -905,7 +904,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-vfio-manager") addState(n, "/opt/gpu-operator/state-sandbox-device-plugin") addState(n, "/opt/gpu-operator/state-kata-device-plugin") - addState(n, "/opt/gpu-operator/state-kata-manager") addState(n, "/opt/gpu-operator/state-cc-manager") } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 64af9f93d..c98c51108 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -20,7 +20,6 @@ import ( "path/filepath" "testing" - kata_v1alpha1 "github.com/NVIDIA/k8s-kata-manager/api/v1alpha1/config" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" appsv1 "k8s.io/api/apps/v1" @@ -505,31 +504,6 @@ func TestTransformForRuntime(t *testing.T) { }, }), }, - // Cover the kata-manager naming case - { - description: "containerd skips drop-in for kata manager", - runtime: gpuv1.Containerd, - input: NewDaemonset(). - WithContainer(corev1.Container{Name: "nvidia-kata-manager"}), - expectedOutput: NewDaemonset(). - WithHostPathVolume("containerd-config", filepath.Dir(DefaultContainerdConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). - WithHostPathVolume("containerd-socket", filepath.Dir(DefaultContainerdSocketFile), nil). - WithContainer(corev1.Container{ - Name: "nvidia-kata-manager", - Env: []corev1.EnvVar{ - {Name: "RUNTIME", Value: gpuv1.Containerd.String()}, - {Name: "CONTAINERD_RUNTIME_CLASS", Value: DefaultRuntimeClass}, - {Name: "RUNTIME_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultContainerdConfigFile))}, - {Name: "CONTAINERD_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultContainerdConfigFile))}, - {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, - {Name: "CONTAINERD_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "containerd-config", MountPath: DefaultRuntimeConfigTargetDir}, - {Name: "containerd-socket", MountPath: DefaultRuntimeSocketTargetDir}, - }, - }), - }, { description: "docker", runtime: gpuv1.Docker, @@ -1933,124 +1907,6 @@ func TestTransformMigManager(t *testing.T) { } } -func TestTransformKataManager(t *testing.T) { - testCases := []struct { - description string - ds Daemonset - cpSpec *gpuv1.ClusterPolicySpec - expectedDs Daemonset - }{ - { - description: "transform kata manager", - ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-kata-manager"}), - cpSpec: &gpuv1.ClusterPolicySpec{ - KataManager: gpuv1.KataManagerSpec{ - Repository: "nvcr.io/nvidia/cloud-native", - Image: "kata-manager", - Version: "v1.0.0", - ImagePullPolicy: "IfNotPresent", - ImagePullSecrets: []string{"pull-secret"}, - Args: []string{"--test-flag"}, - Config: &kata_v1alpha1.Config{ - ArtifactsDir: "/var/lib/kata", - }, - Env: []gpuv1.EnvVar{ - {Name: "foo", Value: "bar"}, - }, - }, - }, - expectedDs: NewDaemonset().WithContainer(corev1.Container{ - Name: "nvidia-kata-manager", - Image: "nvcr.io/nvidia/cloud-native/kata-manager:v1.0.0", - ImagePullPolicy: corev1.PullIfNotPresent, - Args: []string{"--test-flag"}, - Env: []corev1.EnvVar{ - {Name: "KATA_ARTIFACTS_DIR", Value: "/var/lib/kata"}, - {Name: "foo", Value: "bar"}, - {Name: "RUNTIME", Value: "containerd"}, - {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, - {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, - {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, - {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, - {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "kata-artifacts", MountPath: "/var/lib/kata"}, - {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, - {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, - }, - }).WithPullSecret("pull-secret").WithPodAnnotations(map[string]string{"nvidia.com/kata-manager.last-applied-hash": "1929911998"}).WithHostPathVolume("kata-artifacts", "/var/lib/kata", ptr.To(corev1.HostPathDirectoryOrCreate)).WithHostPathVolume("containerd-config", "/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)).WithHostPathVolume("containerd-socket", "/run/containerd", nil), - }, - { - description: "transform kata manager with custom container runtime socket", - ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-kata-manager"}), - cpSpec: &gpuv1.ClusterPolicySpec{ - KataManager: gpuv1.KataManagerSpec{ - Repository: "nvcr.io/nvidia/cloud-native", - Image: "kata-manager", - Version: "v1.0.0", - ImagePullPolicy: "IfNotPresent", - ImagePullSecrets: []string{"pull-secret"}, - Args: []string{"--test-flag"}, - Config: &kata_v1alpha1.Config{ - ArtifactsDir: "/var/lib/kata", - }, - Env: []gpuv1.EnvVar{ - { - Name: "CONTAINERD_CONFIG", Value: "/var/lib/rancher/k3s/agent/etc/containerd/config.toml", - }, - { - Name: "CONTAINERD_SOCKET", Value: "/run/k3s/containerd/containerd.sock", - }, - { - Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia", - }, - { - Name: "CONTAINERD_SET_AS_DEFAULT", Value: "true", - }, - }, - }, - }, - expectedDs: NewDaemonset().WithContainer(corev1.Container{ - Name: "nvidia-kata-manager", - Image: "nvcr.io/nvidia/cloud-native/kata-manager:v1.0.0", - ImagePullPolicy: corev1.PullIfNotPresent, - Args: []string{"--test-flag"}, - Env: []corev1.EnvVar{ - {Name: "KATA_ARTIFACTS_DIR", Value: "/var/lib/kata"}, - {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, - {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, - {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, - {Name: "CONTAINERD_SET_AS_DEFAULT", Value: "true"}, - {Name: "RUNTIME", Value: "containerd"}, - {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, - {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "kata-artifacts", MountPath: "/var/lib/kata"}, - {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, - {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, - }, - }).WithPullSecret("pull-secret"). - WithPodAnnotations(map[string]string{"nvidia.com/kata-manager.last-applied-hash": "1929911998"}). - WithHostPathVolume("kata-artifacts", "/var/lib/kata", ptr.To(corev1.HostPathDirectoryOrCreate)). - WithHostPathVolume("containerd-config", "/var/lib/rancher/k3s/agent/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)). - WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil), - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - err := TransformKataManager(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{ - runtime: gpuv1.Containerd, - logger: ctrl.Log.WithName("test"), - }) - require.NoError(t, err) - require.EqualValues(t, tc.expectedDs, tc.ds) - }) - } -} - func TestTransformVFIOManager(t *testing.T) { resources := corev1.ResourceRequirements{ Limits: corev1.ResourceList{