Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 1 addition & 205 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ import (
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/yaml"

gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
Expand Down Expand Up @@ -161,8 +160,6 @@ const (
DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
// CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
CDIAnnotationPrefixEnvName = "CDI_ANNOTATION_PREFIX"
// KataManagerAnnotationHashKey is the annotation indicating the hash of the kata-manager configuration
KataManagerAnnotationHashKey = "nvidia.com/kata-manager.last-applied-hash"
// DefaultKataArtifactsDir is the default directory to store kata artifacts on the host
DefaultKataArtifactsDir = "/opt/nvidia-gpu-operator/artifacts/runtimeclasses/"
// PodControllerRevisionHashLabelKey is the annotation key for pod controller revision hash value
Expand Down Expand Up @@ -577,16 +574,6 @@ func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State,
}
}

if obj.Name == "nvidia-kata-manager-config" {
data, err := yaml.Marshal(config.KataManager.Config)
if err != nil {
return gpuv1.NotReady, fmt.Errorf("failed to marshal kata manager config: %v", err)
}
obj.Data = map[string]string{
"config.yaml": string(data),
}
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
Expand Down Expand Up @@ -714,7 +701,6 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error
"nvidia-mig-manager": TransformMIGManager,
"nvidia-operator-validator": TransformValidator,
"nvidia-sandbox-validator": TransformSandboxValidator,
"nvidia-kata-manager": TransformKataManager,
"nvidia-cc-manager": TransformCCManager,
}

Expand Down Expand Up @@ -1392,13 +1378,7 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
}

// Handle the drop-in configs
// TODO: It's a bit of a hack to skip the `nvidia-kata-manager` container here.
// Ideally if the two projects are using the SAME API then this should be
// captured more rigorously.
// Note that we probably want to implement drop-in file support in the
// kata manager in any case -- in which case it will be good to use a
// similar implementation.
if dropInConfigFile != "" && container.Name != "nvidia-kata-manager" {
if dropInConfigFile != "" {
sourceConfigFileName := path.Base(dropInConfigFile)
sourceConfigDir := path.Dir(dropInConfigFile)
containerConfigDir := DefaultRuntimeDropInConfigTargetDir
Expand Down Expand Up @@ -2023,83 +2003,6 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
return nil
}

// TransformKataManager transforms Kata Manager daemonset with required config as per ClusterPolicy.
// It applies image/pull settings, resources, args and env from config.KataManager, wires up the
// host artifacts directory, and annotates the pod template with a config hash so pods roll on
// config changes. The kata-manager container is assumed to be Containers[0] throughout.
func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update image
	image, err := gpuv1.ImagePath(&config.KataManager)
	if err != nil {
		return err
	}
	obj.Spec.Template.Spec.Containers[0].Image = image

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.KataManager.ImagePullPolicy)

	// set image pull secrets
	if len(config.KataManager.ImagePullSecrets) > 0 {
		addPullSecrets(&obj.Spec.Template.Spec, config.KataManager.ImagePullSecrets)
	}

	// set resource limits
	if config.KataManager.Resources != nil {
		// apply resource limits to all containers
		for i := range obj.Spec.Template.Spec.Containers {
			obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataManager.Resources.Requests
			obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataManager.Resources.Limits
		}
	}

	// set arguments if specified for kata-manager container
	if len(config.KataManager.Args) > 0 {
		obj.Spec.Template.Spec.Containers[0].Args = config.KataManager.Args
	}

	// mount artifactsDir; fall back to the operator default when unset in the CR
	artifactsDir := DefaultKataArtifactsDir
	if config.KataManager.Config.ArtifactsDir != "" {
		artifactsDir = config.KataManager.Config.ArtifactsDir
	}

	// set env used by readinessProbe to determine path to kata-manager pid file.
	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "KATA_ARTIFACTS_DIR", artifactsDir)

	// Bind-mount the artifacts directory from the host (created on demand via HostPathDirectoryOrCreate).
	artifactsVolMount := corev1.VolumeMount{Name: "kata-artifacts", MountPath: artifactsDir}
	obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, artifactsVolMount)

	artifactsVol := corev1.Volume{Name: "kata-artifacts", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: artifactsDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
	obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, artifactsVol)

	// Compute hash of kata manager config and add an annotation with the value.
	// If the kata config changes, a new revision of the daemonset will be
	// created and thus the kata-manager pods will restart with the updated config.
	hash := utils.GetObjectHash(config.KataManager.Config)

	if obj.Spec.Template.Annotations == nil {
		obj.Spec.Template.Annotations = make(map[string]string)
	}
	obj.Spec.Template.Annotations[KataManagerAnnotationHashKey] = hash

	// User-provided env vars override/extend the container environment.
	if len(config.KataManager.Env) > 0 {
		for _, env := range config.KataManager.Env {
			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
		}
	}

	// mount containerd config and socket
	// setup mounts for runtime config file
	runtime := n.runtime.String()
	// kata manager is the only container in this daemonset
	err = transformForRuntime(obj, config, runtime, &obj.Spec.Template.Spec.Containers[0])
	if err != nil {
		return fmt.Errorf("error transforming kata-manager daemonset : %w", err)
	}

	// set hostNetwork for kata-manager if specified
	applyHostNetworkConfig(&obj.Spec.Template.Spec, config.KataManager.HostNetwork)

	return nil
}

// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
// update k8s-driver-manager initContainer
Expand Down Expand Up @@ -5160,117 +5063,10 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass)
return gpuv1.Ready, nil
}

// transformKataRuntimeClasses reconciles the set of Kata RuntimeClass objects
// (labeled "nvidia.com/kata-runtime-class=true") with the list declared in
// KataManager configuration: deletes all of them when KataManager is disabled,
// prunes ones no longer declared, and creates/updates the declared ones from
// the RuntimeClass template of the current state.
func transformKataRuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
	ctx := n.ctx
	state := n.idx
	config := n.singleton.Spec

	// Get all existing Kata RuntimeClasses
	opts := []client.ListOption{&client.MatchingLabels{"nvidia.com/kata-runtime-class": "true"}}
	list := &nodev1.RuntimeClassList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
		// NOTE(review): logr's Info expects alternating key/value pairs after the
		// message; passing a lone err here looks like a misuse — verify intent.
		n.logger.Info("Could not get Kata RuntimeClassList", err)
		return gpuv1.NotReady, fmt.Errorf("error getting kata RuntimeClassList: %v", err)
	}
	n.logger.V(1).Info("Kata RuntimeClasses", "Number", len(list.Items))

	if !config.KataManager.IsEnabled() {
		// Delete all Kata RuntimeClasses
		n.logger.Info("Kata Manager disabled, deleting all Kata RuntimeClasses")
		for _, rc := range list.Items {
			rc := rc // capture per-iteration copy so &rc is stable (pre-Go 1.22 loop semantics)
			n.logger.V(1).Info("Deleting Kata RuntimeClass", "Name", rc.Name)
			err := n.client.Delete(ctx, &rc)
			if err != nil {
				return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
			}
		}
		return gpuv1.Ready, nil
	}

	// Get names of desired kata RuntimeClasses
	rcNames := make(map[string]struct{})
	for _, rc := range config.KataManager.Config.RuntimeClasses {
		rcNames[rc.Name] = struct{}{}
	}

	// Delete any existing Kata RuntimeClasses that are no longer specified in KataManager configuration
	for _, rc := range list.Items {
		if _, ok := rcNames[rc.Name]; !ok {
			rc := rc // capture per-iteration copy so &rc is stable (pre-Go 1.22 loop semantics)
			n.logger.Info("Deleting Kata RuntimeClass", "Name", rc.Name)
			err := n.client.Delete(ctx, &rc)
			if err != nil {
				return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
			}
		}
	}

	// Using kata RuntimeClass template, create / update RuntimeClass objects specified in KataManager configuration
	template := n.resources[state].RuntimeClasses[0]
	for _, rc := range config.KataManager.Config.RuntimeClasses {
		logger := n.logger.WithValues("RuntimeClass", rc.Name)

		// Guard: a kata RuntimeClass must not shadow the runtimeclass the
		// operator's own operand pods run with.
		if rc.Name == config.Operator.RuntimeClass {
			return gpuv1.NotReady, fmt.Errorf("error creating kata runtimeclass '%s' as it conflicts with the runtimeclass used for the gpu-operator operand pods itself", rc.Name)
		}

		// Build the desired object: handler matches the class name; scheduling
		// merges the template's node selector with user-provided selectors.
		obj := nodev1.RuntimeClass{}
		obj.Name = rc.Name
		obj.Handler = rc.Name
		obj.Labels = template.Labels
		obj.Scheduling = &nodev1.Scheduling{}
		nodeSelector := make(map[string]string)
		for k, v := range template.Scheduling.NodeSelector {
			nodeSelector[k] = v
		}
		if rc.NodeSelector != nil {
			// append user provided selectors to default nodeSelector
			for k, v := range rc.NodeSelector {
				nodeSelector[k] = v
			}
		}
		obj.Scheduling.NodeSelector = nodeSelector

		// Owner reference ensures the RuntimeClass is garbage-collected with the ClusterPolicy.
		if err := controllerutil.SetControllerReference(n.singleton, &obj, n.scheme); err != nil {
			return gpuv1.NotReady, err
		}

		found := &nodev1.RuntimeClass{}
		err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
		if err != nil && apierrors.IsNotFound(err) {
			logger.Info("Not found, creating...")
			err = n.client.Create(ctx, &obj)
			if err != nil {
				logger.Info("Couldn't create", "Error", err)
				return gpuv1.NotReady, err
			}
			continue
		} else if err != nil {
			return gpuv1.NotReady, err
		}

		logger.Info("Found Resource, updating...")
		// Carry over ResourceVersion so the update does not fail optimistic concurrency.
		obj.ResourceVersion = found.ResourceVersion

		err = n.client.Update(ctx, &obj)
		if err != nil {
			logger.Info("Couldn't update", "Error", err)
			return gpuv1.NotReady, err
		}
	}
	return gpuv1.Ready, nil
}

func RuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
status := gpuv1.Ready
state := n.idx

if n.stateNames[state] == "state-kata-manager" {
return transformKataRuntimeClasses(n)
}

nvidiaRuntimeClasses := n.resources[state].RuntimeClasses
if n.stateNames[state] == "pre-requisites" && !n.isStateEnabled(n.stateNames[state]) {
err := clearRuntimeClasses(n, nvidiaRuntimeClasses)
Expand Down
2 changes: 0 additions & 2 deletions controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ var gpuStateLabels = map[string]map[string]string{
"nvidia.com/gpu.deploy.sandbox-device-plugin": "true",
"nvidia.com/gpu.deploy.sandbox-validator": "true",
"nvidia.com/gpu.deploy.vfio-manager": "true",
"nvidia.com/gpu.deploy.kata-manager": "true",
"nvidia.com/gpu.deploy.cc-manager": "true",
},
gpuWorkloadConfigVMVgpu: {
Expand Down Expand Up @@ -905,7 +904,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
addState(n, "/opt/gpu-operator/state-vfio-manager")
addState(n, "/opt/gpu-operator/state-sandbox-device-plugin")
addState(n, "/opt/gpu-operator/state-kata-device-plugin")
addState(n, "/opt/gpu-operator/state-kata-manager")
addState(n, "/opt/gpu-operator/state-cc-manager")
}

Expand Down
Loading
Loading